author     Linus Torvalds <torvalds@linux-foundation.org>  2019-03-06 13:31:36 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-03-06 13:31:36 -0500
commit     8dcd175bc3d50b78413c56d5b17d4bddd77412ef
tree       2c2fb25759b43f2e73830f07ef3b444d76825280
parent     afe6fe7036c6efdcb46cabc64bec9b6e4a005210
parent     fff04900ea79915939ef6a3aad78fca6511a3034
Merge branch 'akpm' (patches from Andrew)
Merge misc updates from Andrew Morton:
- a few misc things
- ocfs2 updates
- most of MM
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (159 commits)
tools/testing/selftests/proc/proc-self-syscall.c: remove duplicate include
proc: more robust bulk read test
proc: test /proc/*/maps, smaps, smaps_rollup, statm
proc: use seq_puts() everywhere
proc: read kernel cpu stat pointer once
proc: remove unused argument in proc_pid_lookup()
fs/proc/thread_self.c: code cleanup for proc_setup_thread_self()
fs/proc/self.c: code cleanup for proc_setup_self()
proc: return exit code 4 for skipped tests
mm,mremap: bail out earlier in mremap_to under map pressure
mm/sparse: fix a bad comparison
mm/memory.c: do_fault: avoid usage of stale vm_area_struct
writeback: fix inode cgroup switching comment
mm/huge_memory.c: fix "orig_pud" set but not used
mm/hotplug: fix an imbalance with DEBUG_PAGEALLOC
mm/memcontrol.c: fix bad line in comment
mm/cma.c: cma_declare_contiguous: correct err handling
mm/page_ext.c: fix an imbalance with kmemleak
mm/compaction: pass pgdat to too_many_isolated() instead of zone
mm: remove zone_lru_lock() function, access ->lru_lock directly
...
213 files changed, 4918 insertions, 2315 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 7bf3f129c68b..53d3288c328b 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1189,6 +1189,10 @@ PAGE_SIZE multiple when read back.
 Amount of cached filesystem data that was modified and
 is currently being written back to disk

+anon_thp
+Amount of memory used in anonymous mappings backed by
+transparent hugepages
+
 inactive_anon, active_anon, inactive_file, active_file, unevictable
 Amount of memory, swap-backed and filesystem-backed,
 on the internal memory management lists used by the
@@ -1248,6 +1252,18 @@ PAGE_SIZE multiple when read back.

 Amount of reclaimed lazyfree pages

+thp_fault_alloc
+
+Number of transparent hugepages which were allocated to satisfy
+a page fault, including COW faults. This counter is not present
+when CONFIG_TRANSPARENT_HUGEPAGE is not set.
+
+thp_collapse_alloc
+
+Number of transparent hugepages which were allocated to allow
+collapsing an existing range of pages. This counter is not
+present when CONFIG_TRANSPARENT_HUGEPAGE is not set.
+
 memory.swap.current
 A read-only single value file which exists on non-root
 cgroups.
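For reference, the new entries documented above surface as plain "key value" lines in each cgroup's memory.stat file. A minimal user-space reader might look like the sketch below; it is illustrative only (not part of this patch) and assumes cgroup2 is mounted at /sys/fs/cgroup and that the kernel has CONFIG_TRANSPARENT_HUGEPAGE enabled.

#include <stdio.h>
#include <string.h>

/* Print the THP-related memory.stat counters for one cgroup directory. */
static void print_thp_stats(const char *cgroup_dir)
{
    char path[4096], key[64];
    unsigned long long val;
    FILE *f;

    snprintf(path, sizeof(path), "%s/memory.stat", cgroup_dir);
    f = fopen(path, "r");
    if (!f)
        return;
    while (fscanf(f, "%63s %llu", key, &val) == 2) {
        if (!strcmp(key, "anon_thp") ||
            !strcmp(key, "thp_fault_alloc") ||
            !strcmp(key, "thp_collapse_alloc"))
            printf("%s = %llu\n", key, val);
    }
    fclose(f);
}

int main(void)
{
    print_thp_stats("/sys/fs/cgroup");  /* assumed cgroup2 mount point */
    return 0;
}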
diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst
index 3f7bade2c231..340a5aee9b80 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -75,9 +75,10 @@ number of times a page is mapped.
 20. NOPAGE
 21. KSM
 22. THP
-23. BALLOON
+23. OFFLINE
 24. ZERO_PAGE
 25. IDLE
+26. PGTABLE

 * ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the
 memory cgroup each page is charged to, indexed by PFN. Only available when
@@ -118,8 +119,8 @@ Short descriptions to the page flags
 identical memory pages dynamically shared between one or more processes
 22 - THP
 contiguous pages which construct transparent hugepages
-23 - BALLOON
-balloon compaction page
+23 - OFFLINE
+page is logically offline
 24 - ZERO_PAGE
 zero page for pfn_zero or huge_zero page
 25 - IDLE
@@ -128,6 +129,8 @@ Short descriptions to the page flags
 Note that this flag may be stale in case the page was accessed via
 a PTE. To make sure the flag is up-to-date one has to read
 ``/sys/kernel/mm/page_idle/bitmap`` first.
+26 - PGTABLE
+page is in use as a page table

 IO related page flags
 ---------------------
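As a reminder of how these bits are consumed, the sketch below reads the 64-bit flags word for one PFN from /proc/kpageflags and tests the OFFLINE (23) and PGTABLE (26) bits listed above. It is illustrative only (not part of this patch) and needs root privileges.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define KPF_OFFLINE 23          /* bit numbers from the table above */
#define KPF_PGTABLE 26

int main(int argc, char **argv)
{
    uint64_t flags, pfn = argc > 1 ? strtoull(argv[1], NULL, 0) : 0;
    FILE *f = fopen("/proc/kpageflags", "rb");

    if (!f)
        return 1;
    /* one 64-bit flags word per PFN */
    if (fseek(f, (long)(pfn * sizeof(uint64_t)), SEEK_SET) != 0 ||
        fread(&flags, sizeof(flags), 1, f) != 1) {
        fclose(f);
        return 1;
    }
    printf("pfn %llu: offline=%d pgtable=%d\n",
           (unsigned long long)pfn,
           !!(flags & (1ULL << KPF_OFFLINE)),
           !!(flags & (1ULL << KPF_PGTABLE)));
    fclose(f);
    return 0;
}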
diff --git a/Documentation/cgroup-v1/memcg_test.txt b/Documentation/cgroup-v1/memcg_test.txt
index 5c7f310f32bb..621e29ffb358 100644
--- a/Documentation/cgroup-v1/memcg_test.txt
+++ b/Documentation/cgroup-v1/memcg_test.txt
@@ -107,9 +107,9 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.

 8. LRU
 Each memcg has its own private LRU. Now, its handling is under global
-VM's control (means that it's handled under global zone_lru_lock).
+VM's control (means that it's handled under global pgdat->lru_lock).
 Almost all routines around memcg's LRU is called by global LRU's
-list management functions under zone_lru_lock().
+list management functions under pgdat->lru_lock.

 A special function is mem_cgroup_isolate_pages(). This scans
 memcg's private LRU and call __isolate_lru_page() to extract a page
diff --git a/Documentation/cgroup-v1/memory.txt b/Documentation/cgroup-v1/memory.txt
index 3682e99234c2..a347fc9293e5 100644
--- a/Documentation/cgroup-v1/memory.txt
+++ b/Documentation/cgroup-v1/memory.txt
@@ -267,11 +267,11 @@ When oom event notifier is registered, event will be delivered.
 Other lock order is following:
 PG_locked.
 mm->page_table_lock
-zone_lru_lock
+pgdat->lru_lock
 lock_page_cgroup.
 In many cases, just lock_page_cgroup() is called.
 per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
-zone_lru_lock, it has no lock of its own.
+pgdat->lru_lock, it has no lock of its own.

 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)

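Both documentation fixes above follow the code change in this series ("mm: remove zone_lru_lock() function, access ->lru_lock directly" in the commit list): LRU handling is serialized by the per-node lock rather than a per-zone wrapper. For orientation, the in-kernel idiom now looks roughly like this sketch (error handling and the actual list manipulation omitted; not taken from this diff):

/* Sketch only: take the node-level LRU lock for a page. */
static void touch_page_lru(struct page *page)
{
    pg_data_t *pgdat = page_pgdat(page);

    spin_lock_irq(&pgdat->lru_lock);
    /* ... move or inspect the page on its LRU list ... */
    spin_unlock_irq(&pgdat->lru_lock);
}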
diff --git a/MAINTAINERS b/MAINTAINERS
index bd549618aea9..c7d3e51c7064 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9835,6 +9835,14 @@ F: kernel/sched/membarrier.c
 F: include/uapi/linux/membarrier.h
 F: arch/powerpc/include/asm/membarrier.h

+MEMBLOCK
+M: Mike Rapoport <rppt@linux.ibm.com>
+L: linux-mm@kvack.org
+S: Maintained
+F: include/linux/memblock.h
+F: mm/memblock.c
+F: Documentation/core-api/boot-time-mm.rst
+
 MEMORY MANAGEMENT
 L: linux-mm@kvack.org
 W: http://www.linux-mm.org
diff --git a/arch/alpha/include/asm/topology.h b/arch/alpha/include/asm/topology.h
index e6e13a85796a..5a77a40567fa 100644
--- a/arch/alpha/include/asm/topology.h
+++ b/arch/alpha/include/asm/topology.h
@@ -4,6 +4,7 @@

 #include <linux/smp.h>
 #include <linux/threads.h>
+#include <linux/numa.h>
 #include <asm/machvec.h>

 #ifdef CONFIG_NUMA
@@ -29,7 +30,7 @@ static const struct cpumask *cpumask_of_node(int node)
 {
 int cpu;

-if (node == -1)
+if (node == NUMA_NO_NODE)
 return cpu_all_mask;

 cpumask_clear(&node_to_cpumask_map[node]);
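Many hunks in this merge, starting with this one, replace an open-coded -1 node id with NUMA_NO_NODE from <linux/numa.h>. The convention being converged on is simply the following sketch, with an invented helper name (not from any file in this diff):

#include <linux/numa.h>

/* Sketch: report "no affinity known" with the named constant, never -1. */
static int device_home_node(int firmware_nid)
{
    int nid = NUMA_NO_NODE;

    if (firmware_nid >= 0)
        nid = firmware_nid;

    return nid;         /* callers compare against NUMA_NO_NODE */
}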
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a4168d366127..cfbf307d6dc4 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1467,6 +1467,10 @@ config SYSVIPC_COMPAT
 def_bool y
 depends on COMPAT && SYSVIPC

+config ARCH_ENABLE_HUGEPAGE_MIGRATION
+def_bool y
+depends on HUGETLB_PAGE && MIGRATION
+
 menu "Power management options"

 source "kernel/power/Kconfig"
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index fb6609875455..c6a07a3b433e 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -20,6 +20,11 @@

 #include <asm/page.h>

+#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+#define arch_hugetlb_migration_supported arch_hugetlb_migration_supported
+extern bool arch_hugetlb_migration_supported(struct hstate *h);
+#endif
+
 #define __HAVE_ARCH_HUGE_PTEP_GET
 static inline pte_t huge_ptep_get(pte_t *ptep)
 {
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index 0c656850eeea..b01ef0180a03 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -80,11 +80,7 @@
 */
 #ifdef CONFIG_KASAN
 #define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - KASAN_SHADOW_SCALE_SHIFT))
-#ifdef CONFIG_KASAN_EXTRA
-#define KASAN_THREAD_SHIFT 2
-#else
 #define KASAN_THREAD_SHIFT 1
-#endif /* CONFIG_KASAN_EXTRA */
 #else
 #define KASAN_SHADOW_SIZE (0)
 #define KASAN_THREAD_SHIFT 0
diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index aa9c94113700..66b5d697d943 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -321,7 +321,7 @@ void crash_post_resume(void)
 * but does not hold any data of loaded kernel image.
 *
 * Note that all the pages in crash dump kernel memory have been initially
-* marked as Reserved in kexec_reserve_crashkres_pages().
+* marked as Reserved as memory was allocated via memblock_reserve().
 *
 * In hibernation, the pages which are Reserved and yet "nosave" are excluded
 * from the hibernation iamge. crash_is_nosave() does thich check for crash
@@ -361,7 +361,6 @@ void crash_free_reserved_phys_range(unsigned long begin, unsigned long end)

 for (addr = begin; addr < end; addr += PAGE_SIZE) {
 page = phys_to_page(addr);
-ClearPageReserved(page);
 free_reserved_page(page);
 }
 }
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 28cbc22d7e30..6b4a47b3adf4 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -27,6 +27,26 @@
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>

+#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
+bool arch_hugetlb_migration_supported(struct hstate *h)
+{
+size_t pagesize = huge_page_size(h);
+
+switch (pagesize) {
+#ifdef CONFIG_ARM64_4K_PAGES
+case PUD_SIZE:
+#endif
+case PMD_SIZE:
+case CONT_PMD_SIZE:
+case CONT_PTE_SIZE:
+return true;
+}
+pr_warn("%s: unrecognized huge page size 0x%lx\n",
+__func__, pagesize);
+return false;
+}
+#endif
+
 int pmd_huge(pmd_t pmd)
 {
 return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT);
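The new arch_hugetlb_migration_supported() above lets the architecture veto migration for huge page sizes it cannot relocate; generic hugetlb code is expected to consult it before attempting a move. The exact generic-side wiring is not part of this hunk, so the following is only a sketch of the shape of that check:

/* Sketch: ask the architecture before trying to migrate a huge page. */
static bool can_migrate_huge_page(struct hstate *h)
{
    if (!arch_hugetlb_migration_supported(h))
        return false;

    /* ... remaining generic movability checks go here ... */
    return true;
}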
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 7205a9085b4d..c38976b70069 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -118,35 +118,10 @@ static void __init reserve_crashkernel(void)
 crashk_res.start = crash_base;
 crashk_res.end = crash_base + crash_size - 1;
 }
-
-static void __init kexec_reserve_crashkres_pages(void)
-{
-#ifdef CONFIG_HIBERNATION
-phys_addr_t addr;
-struct page *page;
-
-if (!crashk_res.end)
-return;
-
-/*
-* To reduce the size of hibernation image, all the pages are
-* marked as Reserved initially.
-*/
-for (addr = crashk_res.start; addr < (crashk_res.end + 1);
-addr += PAGE_SIZE) {
-page = phys_to_page(addr);
-SetPageReserved(page);
-}
-#endif
-}
 #else
 static void __init reserve_crashkernel(void)
 {
 }
-
-static void __init kexec_reserve_crashkres_pages(void)
-{
-}
 #endif /* CONFIG_KEXEC_CORE */

 #ifdef CONFIG_CRASH_DUMP
@@ -586,8 +561,6 @@ void __init mem_init(void)
 /* this will put all unused low memory onto the freelists */
 memblock_free_all();

-kexec_reserve_crashkres_pages();
-
 mem_init_print_info(NULL);

 /*
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index ae34e3a1cef1..7a0a555b366a 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -120,7 +120,7 @@ static void __init setup_node_to_cpumask_map(void)
 }

 /* cpumask_of_node() will now work */
-pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
+pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
 }

 /*
diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c
index 92c376279c6d..1315da6c7aeb 100644
--- a/arch/ia64/kernel/numa.c
+++ b/arch/ia64/kernel/numa.c
@@ -74,7 +74,7 @@ void __init build_cpu_to_node_map(void)
 cpumask_clear(&node_to_cpu_mask[node]);

 for_each_possible_early_cpu(cpu) {
-node = -1;
+node = NUMA_NO_NODE;
 for (i = 0; i < NR_CPUS; ++i)
 if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) {
 node = node_cpuid[i].nid;
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 46bff1661836..7a969f4c3534 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -583,17 +583,6 @@ pfm_put_task(struct task_struct *task)
 if (task != current) put_task_struct(task);
 }

-static inline void
-pfm_reserve_page(unsigned long a)
-{
-SetPageReserved(vmalloc_to_page((void *)a));
-}
-static inline void
-pfm_unreserve_page(unsigned long a)
-{
-ClearPageReserved(vmalloc_to_page((void*)a));
-}
-
 static inline unsigned long
 pfm_protect_ctx_ctxsw(pfm_context_t *x)
 {
@@ -816,44 +805,6 @@ pfm_reset_msgq(pfm_context_t *ctx)
 DPRINT(("ctx=%p msgq reset\n", ctx));
 }

-static void *
-pfm_rvmalloc(unsigned long size)
-{
-void *mem;
-unsigned long addr;
-
-size = PAGE_ALIGN(size);
-mem = vzalloc(size);
-if (mem) {
-//printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem);
-addr = (unsigned long)mem;
-while (size > 0) {
-pfm_reserve_page(addr);
-addr+=PAGE_SIZE;
-size-=PAGE_SIZE;
-}
-}
-return mem;
-}
-
-static void
-pfm_rvfree(void *mem, unsigned long size)
-{
-unsigned long addr;
-
-if (mem) {
-DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size));
-addr = (unsigned long) mem;
-while ((long) size > 0) {
-pfm_unreserve_page(addr);
-addr+=PAGE_SIZE;
-size-=PAGE_SIZE;
-}
-vfree(mem);
-}
-return;
-}
-
 static pfm_context_t *
 pfm_context_alloc(int ctx_flags)
 {
@@ -1498,7 +1449,7 @@ pfm_free_smpl_buffer(pfm_context_t *ctx)
 /*
 * free the buffer
 */
-pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size);
+vfree(ctx->ctx_smpl_hdr);

 ctx->ctx_smpl_hdr = NULL;
 ctx->ctx_smpl_size = 0UL;
@@ -2137,7 +2088,7 @@ doit:
 * All memory free operations (especially for vmalloc'ed memory)
 * MUST be done with interrupts ENABLED.
 */
-if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size);
+vfree(smpl_buf_addr);

 /*
 * return the memory used by the context
@@ -2266,10 +2217,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t

 /*
 * We do the easy to undo allocations first.
-*
-* pfm_rvmalloc(), clears the buffer, so there is no leak
 */
-smpl_buf = pfm_rvmalloc(size);
+smpl_buf = vzalloc(size);
 if (smpl_buf == NULL) {
 DPRINT(("Can't allocate sampling buffer\n"));
 return -ENOMEM;
@@ -2346,7 +2295,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
 error:
 vm_area_free(vma);
 error_kmem:
-pfm_rvfree(smpl_buf, size);
+vfree(smpl_buf);

 return -ENOMEM;
 }
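The perfmon hunks above are mechanical: pfm_rvmalloc()/pfm_rvfree() existed only to walk the buffer and set or clear PageReserved on every backing page, so the sampling buffer can now be managed with plain vzalloc()/vfree(). A sketch of the simplified pattern (names are stand-ins, not from this diff):

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Sketch: zeroed, virtually contiguous buffer without the manual
 * per-page PageReserved bookkeeping. */
static void *alloc_sampling_buffer(unsigned long size)
{
    return vzalloc(PAGE_ALIGN(size));   /* zeroed, like pfm_rvmalloc() was */
}

static void free_sampling_buffer(void *buf)
{
    vfree(buf);                         /* NULL-safe, like pfm_rvfree() was */
}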
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 8a965784340c..f9c36750c6a4 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -227,7 +227,7 @@ void __init setup_per_cpu_areas(void)
 * CPUs are put into groups according to node. Walk cpu_map
 * and create new groups at node boundaries.
 */
-prev_node = -1;
+prev_node = NUMA_NO_NODE;
 ai->nr_groups = 0;
 for (unit = 0; unit < nr_units; unit++) {
 cpu = cpu_map[unit];
@@ -435,7 +435,7 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
 {
 void *ptr = NULL;
 u8 best = 0xff;
-int bestnode = -1, node, anynode = 0;
+int bestnode = NUMA_NO_NODE, node, anynode = 0;

 for_each_online_node(node) {
 if (node_isset(node, memory_less_mask))
@@ -447,7 +447,7 @@ static void __init *memory_less_node_alloc(int nid, unsigned long pernodesize)
 anynode = node;
 }

-if (bestnode == -1)
+if (bestnode == NUMA_NO_NODE)
 bestnode = anynode;

 ptr = memblock_alloc_try_nid(pernodesize, PERCPU_PAGE_SIZE,
diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c
index b86a2e21693b..227c04fe60d2 100644
--- a/arch/m68k/mm/memory.c
+++ b/arch/m68k/mm/memory.c
@@ -51,7 +51,7 @@ void __init init_pointer_table(unsigned long ptable)
 pr_debug("init_pointer_table: %lx, %x\n", ptable, PD_MARKBITS(dp));

 /* unreserve the page so it's possible to free that page */
-PD_PAGE(dp)->flags &= ~(1 << PG_reserved);
+__ClearPageReserved(PD_PAGE(dp));
 init_page_count(PD_PAGE(dp));

 return;
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index 5b0177733994..66c1e4f88d65 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -13,6 +13,10 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 unsigned long len, unsigned long pgoff,
 unsigned long flags);

+extern void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+unsigned long addr, pte_t *ptep,
+pte_t old_pte, pte_t pte);
+
 static inline int hstate_get_psize(struct hstate *hstate)
 {
 unsigned long shift;
@@ -42,4 +46,12 @@ static inline bool gigantic_page_supported(void)
 /* hugepd entry valid bit */
 #define HUGEPD_VAL_BITS (0x8000000000000000UL)

+#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
+extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
+unsigned long addr, pte_t *ptep);
+
+#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit
+extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+unsigned long addr, pte_t *ptep,
+pte_t old_pte, pte_t new_pte);
 #endif
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index d8c8d7c9df15..868fcaf56f6b 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1306,6 +1306,24 @@ static inline int pud_pfn(pud_t pud)
 BUILD_BUG();
 return 0;
 }
+#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
+pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
+void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
+pte_t *, pte_t, pte_t);
+
+/*
+* Returns true for a R -> RW upgrade of pte
+*/
+static inline bool is_pte_rw_upgrade(unsigned long old_val, unsigned long new_val)
+{
+if (!(old_val & _PAGE_READ))
+return false;
+
+if ((!(old_val & _PAGE_WRITE)) && (new_val & _PAGE_WRITE))
+return true;
+
+return false;
+}

 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
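ptep_modify_prot_start()/_commit() form a start/modify/commit transaction used by the generic protection-change path; this series changes the hooks to take the vma (instead of the bare mm) and to pass the old pte value down to commit, which is what lets the radix implementation later in this diff decide whether a TLB flush is needed. The caller-side pattern is roughly this sketch (heavily trimmed compared with the real mm/mprotect.c code):

/* Sketch of the transaction around a single pte protection change. */
static void change_one_pte(struct vm_area_struct *vma, unsigned long addr,
                           pte_t *ptep, pgprot_t newprot)
{
    pte_t oldpte, ptent;

    oldpte = ptep_modify_prot_start(vma, addr, ptep);
    ptent = pte_modify(oldpte, newprot);
    /* ... dirty/write-enable fixups happen here in the real code ... */
    ptep_modify_prot_commit(vma, addr, ptep, oldpte, ptent);
}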
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 7d1a3d1543fc..5ab134eeed20 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -127,6 +127,10 @@ extern void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep
 pte_t entry, unsigned long address,
 int psize);

+extern void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
+unsigned long addr, pte_t *ptep,
+pte_t old_pte, pte_t pte);
+
 static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr,
 unsigned long set)
 {
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index aee4fcc24990..77fc21278fa2 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -10,6 +10,7 @@
 #include <linux/pci.h>
 #include <linux/list.h>
 #include <linux/ioport.h>
+#include <linux/numa.h>

 struct device_node;

@@ -265,7 +266,7 @@ extern int pcibios_map_io_space(struct pci_bus *bus);
 #ifdef CONFIG_NUMA
 #define PHB_SET_NODE(PHB, NODE) ((PHB)->node = (NODE))
 #else
-#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = -1)
+#define PHB_SET_NODE(PHB, NODE) ((PHB)->node = NUMA_NO_NODE)
 #endif

 #endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 913bfca09c4f..b8480127793d 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -11,6 +11,7 @@
 #include <linux/export.h>
 #include <linux/memblock.h>
 #include <linux/sched/task.h>
+#include <linux/numa.h>

 #include <asm/lppaca.h>
 #include <asm/paca.h>
@@ -36,7 +37,7 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align,
 * which will put its paca in the right place.
 */
 if (cpu == boot_cpuid) {
-nid = -1;
+nid = NUMA_NO_NODE;
 memblock_set_bottom_up(true);
 } else {
 nid = early_cpu_to_node(cpu);
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 88e4f69a09e5..4538e8ddde80 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -32,6 +32,7 @@
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/vgaarb.h>
+#include <linux/numa.h>

 #include <asm/processor.h>
 #include <asm/io.h>
@@ -132,7 +133,7 @@ struct pci_controller *pcibios_alloc_controller(struct device_node *dev)
 int nid = of_node_to_nid(dev);

 if (nid < 0 || !node_online(nid))
-nid = -1;
+nid = NUMA_NO_NODE;

 PHB_SET_NODE(phb, nid);
 }
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 7725a9714736..a31b6234fcd7 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -798,7 +798,6 @@ static int __init vdso_init(void)
 BUG_ON(vdso32_pagelist == NULL);
 for (i = 0; i < vdso32_pages; i++) {
 struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
-ClearPageReserved(pg);
 get_page(pg);
 vdso32_pagelist[i] = pg;
 }
@@ -812,7 +811,6 @@ static int __init vdso_init(void)
 BUG_ON(vdso64_pagelist == NULL);
 for (i = 0; i < vdso64_pages; i++) {
 struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
-ClearPageReserved(pg);
 get_page(pg);
 vdso64_pagelist[i] = pg;
 }
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index 2e6a8f9345d3..367ce3a4a503 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -121,3 +121,28 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 *ptep = __pte(new_pte & ~H_PAGE_BUSY);
 return 0;
 }
+
+pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
+unsigned long addr, pte_t *ptep)
+{
+unsigned long pte_val;
+/*
+* Clear the _PAGE_PRESENT so that no hardware parallel update is
+* possible. Also keep the pte_present true so that we don't take
+* wrong fault.
+*/
+pte_val = pte_update(vma->vm_mm, addr, ptep,
+_PAGE_PRESENT, _PAGE_INVALID, 1);
+
+return __pte(pte_val);
+}
+
+void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+
+if (radix_enabled())
+return radix__huge_ptep_modify_prot_commit(vma, addr, ptep,
+old_pte, pte);
+set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index 2486bee0f93e..11d9ea28a816 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -90,3 +90,20 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,

 return vm_unmapped_area(&info);
 }
+
+void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
+unsigned long addr, pte_t *ptep,
+pte_t old_pte, pte_t pte)
+{
+struct mm_struct *mm = vma->vm_mm;
+
+/*
+* To avoid NMMU hang while relaxing access we need to flush the tlb before
+* we set the new value.
+*/
+if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+(atomic_read(&mm->context.copros) > 0))
+radix__flush_hugetlb_page(vma, addr);
+
+set_huge_pte_at(vma->vm_mm, addr, ptep, pte);
+}
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index a712a650a8b6..e7a9c4f6bfca 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -21,6 +21,7 @@
 #include <linux/sizes.h>
 #include <asm/mmu_context.h>
 #include <asm/pte-walk.h>
+#include <linux/mm_inline.h>

 static DEFINE_MUTEX(mem_list_mutex);

@@ -34,8 +35,18 @@ struct mm_iommu_table_group_mem_t {
 atomic64_t mapped;
 unsigned int pageshift;
 u64 ua; /* userspace address */
-u64 entries; /* number of entries in hpas[] */
-u64 *hpas; /* vmalloc'ed */
+u64 entries; /* number of entries in hpas/hpages[] */
+/*
+* in mm_iommu_get we temporarily use this to store
+* struct page address.
+*
+* We need to convert ua to hpa in real mode. Make it
+* simpler by storing physical address.
+*/
+union {
+struct page **hpages; /* vmalloc'ed */
+phys_addr_t *hpas;
+};
 #define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1)
 u64 dev_hpa; /* Device memory base address */
 };
@@ -80,64 +91,13 @@ bool mm_iommu_preregistered(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mm_iommu_preregistered);

-/*
-* Taken from alloc_migrate_target with changes to remove CMA allocations
-*/
-struct page *new_iommu_non_cma_page(struct page *page, unsigned long private)
-{
-gfp_t gfp_mask = GFP_USER;
-struct page *new_page;
-
-if (PageCompound(page))
-return NULL;
-
-if (PageHighMem(page))
-gfp_mask |= __GFP_HIGHMEM;
-
-/*
-* We don't want the allocation to force an OOM if possibe
-*/
-new_page = alloc_page(gfp_mask | __GFP_NORETRY | __GFP_NOWARN);
-return new_page;
-}
-
-static int mm_iommu_move_page_from_cma(struct page *page)
-{
-int ret = 0;
-LIST_HEAD(cma_migrate_pages);
-
-/* Ignore huge pages for now */
-if (PageCompound(page))
-return -EBUSY;
-
-lru_add_drain();
-ret = isolate_lru_page(page);
-if (ret)
-return ret;
-
-list_add(&page->lru, &cma_migrate_pages);
-put_page(page); /* Drop the gup reference */
-
-ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page,
-NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE);
-if (ret) {
-if (!list_empty(&cma_migrate_pages))
-putback_movable_pages(&cma_migrate_pages);
-}
-
-return 0;
-}
-
 static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
 unsigned long entries, unsigned long dev_hpa,
 struct mm_iommu_table_group_mem_t **pmem)
 {
 struct mm_iommu_table_group_mem_t *mem;
-long i, j, ret = 0, locked_entries = 0;
+long i, ret, locked_entries = 0;
 unsigned int pageshift;
-unsigned long flags;
-unsigned long cur_ua;
-struct page *page = NULL;

 mutex_lock(&mem_list_mutex);

@@ -187,62 +147,43 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
 goto unlock_exit;
 }

+down_read(&mm->mmap_sem);
+ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL);
+up_read(&mm->mmap_sem);
+if (ret != entries) {
+/* free the reference taken */
+for (i = 0; i < ret; i++)
+put_page(mem->hpages[i]);
+
+vfree(mem->hpas);
+kfree(mem);
+ret = -EFAULT;
+goto unlock_exit;
+}
+
+pageshift = PAGE_SHIFT;
 for (i = 0; i < entries; ++i) {
-cur_ua = ua + (i << PAGE_SHIFT);
-if (1 != get_user_pages_fast(cur_ua,
-1/* pages */, 1/* iswrite */, &page)) {
-ret = -EFAULT;
-for (j = 0; j < i; ++j)
-put_page(pfn_to_page(mem->hpas[j] >>
-PAGE_SHIFT));
-vfree(mem->hpas);
-kfree(mem);
-goto unlock_exit;
-}
+struct page *page = mem->hpages[i];
+
 /*
-* If we get a page from the CMA zone, since we are going to
-* be pinning these entries, we might as well move them out
-* of the CMA zone if possible. NOTE: faulting in + migration
-* can be expensive. Batching can be considered later
+* Allow to use larger than 64k IOMMU pages. Only do that
+* if we are backed by hugetlb.
 */
-if (is_migrate_cma_page(page)) {
-if (mm_iommu_move_page_from_cma(page))
-goto populate;
-if (1 != get_user_pages_fast(cur_ua,
-1/* pages */, 1/* iswrite */,
-&page)) {
-ret = -EFAULT;
-for (j = 0; j < i; ++j)
-put_page(pfn_to_page(mem->hpas[j] >>
-PAGE_SHIFT));
-vfree(mem->hpas);
-kfree(mem);
-goto unlock_exit;
-}
-}
-populate:
-pageshift = PAGE_SHIFT;
-if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) {
-pte_t *pte;
+if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) {
 struct page *head = compound_head(page);
-unsigned int compshift = compound_order(head);
-unsigned int pteshift;
-
-local_irq_save(flags); /* disables as well */
-pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift);
-
-/* Double check it is still the same pinned page */
-if (pte && pte_page(*pte) == head &&
-pteshift == compshift + PAGE_SHIFT)
-pageshift = max_t(unsigned int, pteshift,
-PAGE_SHIFT);
-local_irq_restore(flags);
-}
+
+pageshift = compound_order(head) + PAGE_SHIFT;
 }
 mem->pageshift = min(mem->pageshift, pageshift);
+/*
+* We don't need struct page reference any more, switch
+* to physical address.
+*/
 mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
 }

 good_exit:
+ret = 0;
 atomic64_set(&mem->mapped, 1);
 mem->used = 1;
 mem->ua = ua;
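Two points in the rework above deserve a note: the pages are now pinned in one call with get_user_pages_longterm() under mmap_sem, and the same vzalloc'ed array serves first as a struct page * array and is then overwritten in place with physical addresses through the hpages/hpas union. A toy model of that in-place conversion, not taken from this diff (it relies, as the ppc64 code does, on both element types having the same size):

/* Toy model of the hpages/hpas aliasing used above. */
union pin_array {
    struct page **hpages;   /* filled by get_user_pages_longterm() */
    phys_addr_t *hpas;      /* what the real-mode/IOMMU side wants */
};

static void pages_to_phys(union pin_array *a, unsigned long entries)
{
    unsigned long i;

    for (i = 0; i < entries; i++) {
        struct page *page = a->hpages[i];   /* read before overwriting */

        a->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
    }
}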
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 87f0dd004295..df1e11ebbabb 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -84,7 +84,7 @@ static void __init setup_node_to_cpumask_map(void)
 alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

 /* cpumask_of_node() will now work */
-dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
+dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
 }

 static int __init fake_numa_create_new_node(unsigned long end_pfn,
@@ -215,7 +215,7 @@ static void initialize_distance_lookup_table(int nid,
 */
 static int associativity_to_nid(const __be32 *associativity)
 {
-int nid = -1;
+int nid = NUMA_NO_NODE;

 if (min_common_depth == -1)
 goto out;
@@ -225,7 +225,7 @@ static int associativity_to_nid(const __be32 *associativity)

 /* POWER4 LPAR uses 0xffff as invalid node */
 if (nid == 0xffff || nid >= MAX_NUMNODES)
-nid = -1;
+nid = NUMA_NO_NODE;

 if (nid > 0 &&
 of_read_number(associativity, 1) >= distance_ref_points_depth) {
@@ -244,7 +244,7 @@ out:
 */
 static int of_node_to_nid_single(struct device_node *device)
 {
-int nid = -1;
+int nid = NUMA_NO_NODE;
 const __be32 *tmp;

 tmp = of_get_associativity(device);
@@ -256,7 +256,7 @@ static int of_node_to_nid_single(struct device_node *device)
 /* Walk the device tree upwards, looking for an associativity id */
 int of_node_to_nid(struct device_node *device)
 {
-int nid = -1;
+int nid = NUMA_NO_NODE;

 of_node_get(device);
 while (device) {
@@ -454,7 +454,7 @@ static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
 */
 static int numa_setup_cpu(unsigned long lcpu)
 {
-int nid = -1;
+int nid = NUMA_NO_NODE;
 struct device_node *cpu;

 /*
@@ -930,7 +930,7 @@ static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
 {
 struct drmem_lmb *lmb;
 unsigned long lmb_size;
-int nid = -1;
+int nid = NUMA_NO_NODE;

 lmb_size = drmem_lmb_size();

@@ -960,7 +960,7 @@ static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
 static int hot_add_node_scn_to_nid(unsigned long scn_addr)
 {
 struct device_node *memory;
-int nid = -1;
+int nid = NUMA_NO_NODE;

 for_each_node_by_type(memory, "memory") {
 unsigned long start, size;
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index ecd31569a120..e7da590c7a78 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -401,6 +401,31 @@ void arch_report_meminfo(struct seq_file *m)
 }
 #endif /* CONFIG_PROC_FS */

+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
+pte_t *ptep)
+{
+unsigned long pte_val;
+
+/*
+* Clear the _PAGE_PRESENT so that no hardware parallel update is
+* possible. Also keep the pte_present true so that we don't take
+* wrong fault.
+*/
+pte_val = pte_update(vma->vm_mm, addr, ptep, _PAGE_PRESENT, _PAGE_INVALID, 0);
+
+return __pte(pte_val);
+
+}
+
+void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+pte_t *ptep, pte_t old_pte, pte_t pte)
+{
+if (radix_enabled())
+return radix__ptep_modify_prot_commit(vma, addr,
+ptep, old_pte, pte);
+set_pte_at(vma->vm_mm, addr, ptep, pte);
+}
+
 /*
 * For hash translation mode, we use the deposited table to store hash slot
 * information and they are stored at PTRS_PER_PMD offset from related pmd
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 931156069a81..dced3cd241c2 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -1063,3 +1063,21 @@ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
 }
 /* See ptesync comment in radix__set_pte_at */
 }
+
+void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
+unsigned long addr, pte_t *ptep,
+pte_t old_pte, pte_t pte)
+{
+struct mm_struct *mm = vma->vm_mm;
+
+/*
+* To avoid NMMU hang while relaxing access we need to flush the tlb before
+* we set the new value. We need to do this only for radix, because hash
+* translation does flush when updating the linux pte.
+*/
+if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
+(atomic_read(&mm->context.copros) > 0))
+radix__flush_tlb_page(vma, addr);
+
+set_pte_at(mm, addr, ptep, pte);
+}
diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 84d038ed3882..248a38ad25c7 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/memory.h>
 #include <linux/memory_hotplug.h>
+#include <linux/numa.h>
 #include <asm/machdep.h>
 #include <asm/debugfs.h>

@@ -223,7 +224,7 @@ static int memtrace_online(void)
 ent = &memtrace_array[i];

 /* We have onlined this chunk previously */
-if (ent->nid == -1)
+if (ent->nid == NUMA_NO_NODE)
 continue;

 /* Remove from io mappings */
@@ -257,7 +258,7 @@ static int memtrace_online(void)
 */
 debugfs_remove_recursive(ent->dir);
 pr_info("Added trace memory back to node %d\n", ent->nid);
-ent->size = ent->start = ent->nid = -1;
+ent->size = ent->start = ent->nid = NUMA_NO_NODE;
 }
 if (ret)
 return ret;
diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c
index 582cb153eb24..0cd044122234 100644
--- a/arch/riscv/kernel/vdso.c
+++ b/arch/riscv/kernel/vdso.c
@@ -54,7 +54,6 @@ static int __init vdso_init(void)
 struct page *pg;

 pg = virt_to_page(vdso_start + (i << PAGE_SHIFT));
-ClearPageReserved(pg);
 vdso_pagelist[i] = pg;
 }
 vdso_pagelist[i] = virt_to_page(vdso_data);
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 063732414dfb..76dc344edb8c 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1069,8 +1069,9 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 }

 #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
-pte_t ptep_modify_prot_start(struct mm_struct *, unsigned long, pte_t *);
-void ptep_modify_prot_commit(struct mm_struct *, unsigned long, pte_t *, pte_t);
+pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *);
+void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long,
+pte_t *, pte_t, pte_t);

 #define __HAVE_ARCH_PTEP_CLEAR_FLUSH
 static inline pte_t ptep_clear_flush(struct vm_area_struct *vma,
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index 4ff354887db4..e7920a68a12e 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -291,7 +291,6 @@ static int __init vdso_init(void)
 BUG_ON(vdso32_pagelist == NULL);
 for (i = 0; i < vdso32_pages - 1; i++) {
 struct page *pg = virt_to_page(vdso32_kbase + i*PAGE_SIZE);
-ClearPageReserved(pg);
 get_page(pg);
 vdso32_pagelist[i] = pg;
 }
@@ -309,7 +308,6 @@ static int __init vdso_init(void)
 BUG_ON(vdso64_pagelist == NULL);
 for (i = 0; i < vdso64_pages - 1; i++) {
 struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE);
-ClearPageReserved(pg);
 get_page(pg);
 vdso64_pagelist[i] = pg;
 }
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 689b66f29fc6..8485d6dc2754 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -301,12 +301,13 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
 }
 EXPORT_SYMBOL(ptep_xchg_lazy);

-pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
+pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr,
 pte_t *ptep)
 {
 pgste_t pgste;
 pte_t old;
 int nodat;
+struct mm_struct *mm = vma->vm_mm;

 preempt_disable();
 pgste = ptep_xchg_start(mm, addr, ptep);
@@ -319,10 +320,11 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
 return old;
 }

-void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
-pte_t *ptep, pte_t pte)
+void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
+pte_t *ptep, pte_t old_pte, pte_t pte)
 {
 pgste_t pgste;
+struct mm_struct *mm = vma->vm_mm;

 if (!MACHINE_HAS_NX)
 pte_val(pte) &= ~_PAGE_NOEXEC;
diff --git a/arch/sh/kernel/syscalls/syscalltbl.sh b/arch/sh/kernel/syscalls/syscalltbl.sh
index 85d78d9309ad..904b8e6e625d 100644
--- a/arch/sh/kernel/syscalls/syscalltbl.sh
+++ b/arch/sh/kernel/syscalls/syscalltbl.sh
@@ -13,10 +13,10 @@ emit() {
 t_entry="$3"

 while [ $t_nxt -lt $t_nr ]; do
-printf "__SYSCALL(%s, sys_ni_syscall, )\n" "${t_nxt}"
+printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}"
 t_nxt=$((t_nxt+1))
 done
-printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}"
+printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}"
 }

 grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S index 96e9c54a07f5..bd1a9c544767 100644 --- a/arch/sh/kernel/syscalls_32.S +++ b/arch/sh/kernel/syscalls_32.S | |||
@@ -10,7 +10,7 @@ | |||
10 | #include <linux/sys.h> | 10 | #include <linux/sys.h> |
11 | #include <linux/linkage.h> | 11 | #include <linux/linkage.h> |
12 | 12 | ||
13 | #define __SYSCALL(nr, entry, nargs) .long entry | 13 | #define __SYSCALL(nr, entry) .long entry |
14 | .data | 14 | .data |
15 | ENTRY(sys_call_table) | 15 | ENTRY(sys_call_table) |
16 | #include <asm/syscall_table.h> | 16 | #include <asm/syscall_table.h> |
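
With the third __SYSCALL() argument gone, the entries generated by syscalltbl.sh match the two-argument assembler macro again. Roughly, the expansion now looks like this (syscall 0 chosen purely for illustration):

	#define __SYSCALL(nr, entry)	.long entry

	__SYSCALL(0, sys_restart_syscall)
	/* preprocesses to: */
	.long sys_restart_syscall
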
diff --git a/arch/sparc/kernel/pci_fire.c b/arch/sparc/kernel/pci_fire.c index be71ae086622..0ca08d455e80 100644 --- a/arch/sparc/kernel/pci_fire.c +++ b/arch/sparc/kernel/pci_fire.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/export.h> | 11 | #include <linux/export.h> |
12 | #include <linux/irq.h> | 12 | #include <linux/irq.h> |
13 | #include <linux/of_device.h> | 13 | #include <linux/of_device.h> |
14 | #include <linux/numa.h> | ||
14 | 15 | ||
15 | #include <asm/prom.h> | 16 | #include <asm/prom.h> |
16 | #include <asm/irq.h> | 17 | #include <asm/irq.h> |
@@ -416,7 +417,7 @@ static int pci_fire_pbm_init(struct pci_pbm_info *pbm, | |||
416 | struct device_node *dp = op->dev.of_node; | 417 | struct device_node *dp = op->dev.of_node; |
417 | int err; | 418 | int err; |
418 | 419 | ||
419 | pbm->numa_node = -1; | 420 | pbm->numa_node = NUMA_NO_NODE; |
420 | 421 | ||
421 | pbm->pci_ops = &sun4u_pci_ops; | 422 | pbm->pci_ops = &sun4u_pci_ops; |
422 | pbm->config_space_reg_bits = 12; | 423 | pbm->config_space_reg_bits = 12; |
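
This hunk, and several of the following ones (pci_schizo, psycho_common, sbus, the sparc and x86 NUMA hunks, plus the iommu and driver hunks further down), are the same mechanical conversion: include <linux/numa.h> and spell the "no node" sentinel as NUMA_NO_NODE instead of a bare -1. A minimal sketch of the pattern (the fallback shown is illustrative; numa_node_id() comes from <linux/topology.h>):

	#include <linux/numa.h>			/* NUMA_NO_NODE, defined as (-1) */

	int nid = NUMA_NO_NODE;			/* was: int nid = -1; */

	if (nid == NUMA_NO_NODE)		/* was: if (nid == -1) */
		nid = numa_node_id();		/* illustrative fallback to the local node */
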
diff --git a/arch/sparc/kernel/pci_schizo.c b/arch/sparc/kernel/pci_schizo.c index 934b97c72f7c..421aba00e6b0 100644 --- a/arch/sparc/kernel/pci_schizo.c +++ b/arch/sparc/kernel/pci_schizo.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/export.h> | 12 | #include <linux/export.h> |
13 | #include <linux/interrupt.h> | 13 | #include <linux/interrupt.h> |
14 | #include <linux/of_device.h> | 14 | #include <linux/of_device.h> |
15 | #include <linux/numa.h> | ||
15 | 16 | ||
16 | #include <asm/iommu.h> | 17 | #include <asm/iommu.h> |
17 | #include <asm/irq.h> | 18 | #include <asm/irq.h> |
@@ -1347,7 +1348,7 @@ static int schizo_pbm_init(struct pci_pbm_info *pbm, | |||
1347 | pbm->next = pci_pbm_root; | 1348 | pbm->next = pci_pbm_root; |
1348 | pci_pbm_root = pbm; | 1349 | pci_pbm_root = pbm; |
1349 | 1350 | ||
1350 | pbm->numa_node = -1; | 1351 | pbm->numa_node = NUMA_NO_NODE; |
1351 | 1352 | ||
1352 | pbm->pci_ops = &sun4u_pci_ops; | 1353 | pbm->pci_ops = &sun4u_pci_ops; |
1353 | pbm->config_space_reg_bits = 8; | 1354 | pbm->config_space_reg_bits = 8; |
diff --git a/arch/sparc/kernel/psycho_common.c b/arch/sparc/kernel/psycho_common.c index 81aa91e5c0e6..e90bcb6bad7f 100644 --- a/arch/sparc/kernel/psycho_common.c +++ b/arch/sparc/kernel/psycho_common.c | |||
@@ -5,6 +5,7 @@ | |||
5 | */ | 5 | */ |
6 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
7 | #include <linux/interrupt.h> | 7 | #include <linux/interrupt.h> |
8 | #include <linux/numa.h> | ||
8 | 9 | ||
9 | #include <asm/upa.h> | 10 | #include <asm/upa.h> |
10 | 11 | ||
@@ -454,7 +455,7 @@ void psycho_pbm_init_common(struct pci_pbm_info *pbm, struct platform_device *op | |||
454 | struct device_node *dp = op->dev.of_node; | 455 | struct device_node *dp = op->dev.of_node; |
455 | 456 | ||
456 | pbm->name = dp->full_name; | 457 | pbm->name = dp->full_name; |
457 | pbm->numa_node = -1; | 458 | pbm->numa_node = NUMA_NO_NODE; |
458 | pbm->chip_type = chip_type; | 459 | pbm->chip_type = chip_type; |
459 | pbm->chip_version = of_getintprop_default(dp, "version#", 0); | 460 | pbm->chip_version = of_getintprop_default(dp, "version#", 0); |
460 | pbm->chip_revision = of_getintprop_default(dp, "module-revision#", 0); | 461 | pbm->chip_revision = of_getintprop_default(dp, "module-revision#", 0); |
diff --git a/arch/sparc/kernel/sbus.c b/arch/sparc/kernel/sbus.c index 41c5deb581b8..32141e1006c4 100644 --- a/arch/sparc/kernel/sbus.c +++ b/arch/sparc/kernel/sbus.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/interrupt.h> | 15 | #include <linux/interrupt.h> |
16 | #include <linux/of.h> | 16 | #include <linux/of.h> |
17 | #include <linux/of_device.h> | 17 | #include <linux/of_device.h> |
18 | #include <linux/numa.h> | ||
18 | 19 | ||
19 | #include <asm/page.h> | 20 | #include <asm/page.h> |
20 | #include <asm/io.h> | 21 | #include <asm/io.h> |
@@ -561,7 +562,7 @@ static void __init sbus_iommu_init(struct platform_device *op) | |||
561 | 562 | ||
562 | op->dev.archdata.iommu = iommu; | 563 | op->dev.archdata.iommu = iommu; |
563 | op->dev.archdata.stc = strbuf; | 564 | op->dev.archdata.stc = strbuf; |
564 | op->dev.archdata.numa_node = -1; | 565 | op->dev.archdata.numa_node = NUMA_NO_NODE; |
565 | 566 | ||
566 | reg_base = regs + SYSIO_IOMMUREG_BASE; | 567 | reg_base = regs + SYSIO_IOMMUREG_BASE; |
567 | iommu->iommu_control = reg_base + IOMMU_CONTROL; | 568 | iommu->iommu_control = reg_base + IOMMU_CONTROL; |
diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index b4221d3727d0..9e6bd868ba6f 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c | |||
@@ -976,13 +976,13 @@ static u64 __init memblock_nid_range_sun4u(u64 start, u64 end, int *nid) | |||
976 | { | 976 | { |
977 | int prev_nid, new_nid; | 977 | int prev_nid, new_nid; |
978 | 978 | ||
979 | prev_nid = -1; | 979 | prev_nid = NUMA_NO_NODE; |
980 | for ( ; start < end; start += PAGE_SIZE) { | 980 | for ( ; start < end; start += PAGE_SIZE) { |
981 | for (new_nid = 0; new_nid < num_node_masks; new_nid++) { | 981 | for (new_nid = 0; new_nid < num_node_masks; new_nid++) { |
982 | struct node_mem_mask *p = &node_masks[new_nid]; | 982 | struct node_mem_mask *p = &node_masks[new_nid]; |
983 | 983 | ||
984 | if ((start & p->mask) == p->match) { | 984 | if ((start & p->mask) == p->match) { |
985 | if (prev_nid == -1) | 985 | if (prev_nid == NUMA_NO_NODE) |
986 | prev_nid = new_nid; | 986 | prev_nid = new_nid; |
987 | break; | 987 | break; |
988 | } | 988 | } |
@@ -1208,7 +1208,7 @@ int of_node_to_nid(struct device_node *dp) | |||
1208 | md = mdesc_grab(); | 1208 | md = mdesc_grab(); |
1209 | 1209 | ||
1210 | count = 0; | 1210 | count = 0; |
1211 | nid = -1; | 1211 | nid = NUMA_NO_NODE; |
1212 | mdesc_for_each_node_by_name(md, grp, "group") { | 1212 | mdesc_for_each_node_by_name(md, grp, "group") { |
1213 | if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) { | 1213 | if (!scan_arcs_for_cfg_handle(md, grp, cfg_handle)) { |
1214 | nid = count; | 1214 | nid = count; |
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a97f28d914d5..c25c38a05c1c 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h | |||
@@ -422,25 +422,26 @@ static inline pgdval_t pgd_val(pgd_t pgd) | |||
422 | } | 422 | } |
423 | 423 | ||
424 | #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION | 424 | #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION |
425 | static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, | 425 | static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, |
426 | pte_t *ptep) | 426 | pte_t *ptep) |
427 | { | 427 | { |
428 | pteval_t ret; | 428 | pteval_t ret; |
429 | 429 | ||
430 | ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, mm, addr, ptep); | 430 | ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep); |
431 | 431 | ||
432 | return (pte_t) { .pte = ret }; | 432 | return (pte_t) { .pte = ret }; |
433 | } | 433 | } |
434 | 434 | ||
435 | static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, | 435 | static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, |
436 | pte_t *ptep, pte_t pte) | 436 | pte_t *ptep, pte_t old_pte, pte_t pte) |
437 | { | 437 | { |
438 | |||
438 | if (sizeof(pteval_t) > sizeof(long)) | 439 | if (sizeof(pteval_t) > sizeof(long)) |
439 | /* 5 arg words */ | 440 | /* 5 arg words */ |
440 | pv_ops.mmu.ptep_modify_prot_commit(mm, addr, ptep, pte); | 441 | pv_ops.mmu.ptep_modify_prot_commit(vma, addr, ptep, pte); |
441 | else | 442 | else |
442 | PVOP_VCALL4(mmu.ptep_modify_prot_commit, | 443 | PVOP_VCALL4(mmu.ptep_modify_prot_commit, |
443 | mm, addr, ptep, pte.pte); | 444 | vma, addr, ptep, pte.pte); |
444 | } | 445 | } |
445 | 446 | ||
446 | static inline void set_pte(pte_t *ptep, pte_t pte) | 447 | static inline void set_pte(pte_t *ptep, pte_t pte) |
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 488c59686a73..2474e434a6f7 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h | |||
@@ -55,6 +55,7 @@ struct task_struct; | |||
55 | struct cpumask; | 55 | struct cpumask; |
56 | struct flush_tlb_info; | 56 | struct flush_tlb_info; |
57 | struct mmu_gather; | 57 | struct mmu_gather; |
58 | struct vm_area_struct; | ||
58 | 59 | ||
59 | /* | 60 | /* |
60 | * Wrapper type for pointers to code which uses the non-standard | 61 | * Wrapper type for pointers to code which uses the non-standard |
@@ -254,9 +255,9 @@ struct pv_mmu_ops { | |||
254 | pte_t *ptep, pte_t pteval); | 255 | pte_t *ptep, pte_t pteval); |
255 | void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); | 256 | void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); |
256 | 257 | ||
257 | pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, | 258 | pte_t (*ptep_modify_prot_start)(struct vm_area_struct *vma, unsigned long addr, |
258 | pte_t *ptep); | 259 | pte_t *ptep); |
259 | void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr, | 260 | void (*ptep_modify_prot_commit)(struct vm_area_struct *vma, unsigned long addr, |
260 | pte_t *ptep, pte_t pte); | 261 | pte_t *ptep, pte_t pte); |
261 | 262 | ||
262 | struct paravirt_callee_save pte_val; | 263 | struct paravirt_callee_save pte_val; |
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 662963681ea6..e662f987dfa2 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <linux/slab.h> | 7 | #include <linux/slab.h> |
8 | #include <linux/string.h> | 8 | #include <linux/string.h> |
9 | #include <linux/scatterlist.h> | 9 | #include <linux/scatterlist.h> |
10 | #include <linux/numa.h> | ||
10 | #include <asm/io.h> | 11 | #include <asm/io.h> |
11 | #include <asm/pat.h> | 12 | #include <asm/pat.h> |
12 | #include <asm/x86_init.h> | 13 | #include <asm/x86_init.h> |
@@ -141,7 +142,7 @@ cpumask_of_pcibus(const struct pci_bus *bus) | |||
141 | int node; | 142 | int node; |
142 | 143 | ||
143 | node = __pcibus_to_node(bus); | 144 | node = __pcibus_to_node(bus); |
144 | return (node == -1) ? cpu_online_mask : | 145 | return (node == NUMA_NO_NODE) ? cpu_online_mask : |
145 | cpumask_of_node(node); | 146 | cpumask_of_node(node); |
146 | } | 147 | } |
147 | #endif | 148 | #endif |
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 5e49a0acb5ee..62004d22524a 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h | |||
@@ -75,7 +75,7 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un | |||
75 | #endif | 75 | #endif |
76 | 76 | ||
77 | /** | 77 | /** |
78 | * access_ok: - Checks if a user space pointer is valid | 78 | * access_ok - Checks if a user space pointer is valid |
79 | * @addr: User space pointer to start of block to check | 79 | * @addr: User space pointer to start of block to check |
80 | * @size: Size of block to check | 80 | * @size: Size of block to check |
81 | * | 81 | * |
@@ -84,12 +84,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un | |||
84 | * | 84 | * |
85 | * Checks if a pointer to a block of memory in user space is valid. | 85 | * Checks if a pointer to a block of memory in user space is valid. |
86 | * | 86 | * |
87 | * Returns true (nonzero) if the memory block may be valid, false (zero) | ||
88 | * if it is definitely invalid. | ||
89 | * | ||
90 | * Note that, depending on architecture, this function probably just | 87 | * Note that, depending on architecture, this function probably just |
91 | * checks that the pointer is in the user space range - after calling | 88 | * checks that the pointer is in the user space range - after calling |
92 | * this function, memory access functions may still return -EFAULT. | 89 | * this function, memory access functions may still return -EFAULT. |
90 | * | ||
91 | * Return: true (nonzero) if the memory block may be valid, false (zero) | ||
92 | * if it is definitely invalid. | ||
93 | */ | 93 | */ |
94 | #define access_ok(addr, size) \ | 94 | #define access_ok(addr, size) \ |
95 | ({ \ | 95 | ({ \ |
@@ -134,7 +134,7 @@ extern int __get_user_bad(void); | |||
134 | __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) | 134 | __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) |
135 | 135 | ||
136 | /** | 136 | /** |
137 | * get_user: - Get a simple variable from user space. | 137 | * get_user - Get a simple variable from user space. |
138 | * @x: Variable to store result. | 138 | * @x: Variable to store result. |
139 | * @ptr: Source address, in user space. | 139 | * @ptr: Source address, in user space. |
140 | * | 140 | * |
@@ -148,7 +148,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) | |||
148 | * @ptr must have pointer-to-simple-variable type, and the result of | 148 | * @ptr must have pointer-to-simple-variable type, and the result of |
149 | * dereferencing @ptr must be assignable to @x without a cast. | 149 | * dereferencing @ptr must be assignable to @x without a cast. |
150 | * | 150 | * |
151 | * Returns zero on success, or -EFAULT on error. | 151 | * Return: zero on success, or -EFAULT on error. |
152 | * On error, the variable @x is set to zero. | 152 | * On error, the variable @x is set to zero. |
153 | */ | 153 | */ |
154 | /* | 154 | /* |
@@ -226,7 +226,7 @@ extern void __put_user_4(void); | |||
226 | extern void __put_user_8(void); | 226 | extern void __put_user_8(void); |
227 | 227 | ||
228 | /** | 228 | /** |
229 | * put_user: - Write a simple value into user space. | 229 | * put_user - Write a simple value into user space. |
230 | * @x: Value to copy to user space. | 230 | * @x: Value to copy to user space. |
231 | * @ptr: Destination address, in user space. | 231 | * @ptr: Destination address, in user space. |
232 | * | 232 | * |
@@ -240,7 +240,7 @@ extern void __put_user_8(void); | |||
240 | * @ptr must have pointer-to-simple-variable type, and @x must be assignable | 240 | * @ptr must have pointer-to-simple-variable type, and @x must be assignable |
241 | * to the result of dereferencing @ptr. | 241 | * to the result of dereferencing @ptr. |
242 | * | 242 | * |
243 | * Returns zero on success, or -EFAULT on error. | 243 | * Return: zero on success, or -EFAULT on error. |
244 | */ | 244 | */ |
245 | #define put_user(x, ptr) \ | 245 | #define put_user(x, ptr) \ |
246 | ({ \ | 246 | ({ \ |
@@ -502,7 +502,7 @@ struct __large_struct { unsigned long buf[100]; }; | |||
502 | } while (0) | 502 | } while (0) |
503 | 503 | ||
504 | /** | 504 | /** |
505 | * __get_user: - Get a simple variable from user space, with less checking. | 505 | * __get_user - Get a simple variable from user space, with less checking. |
506 | * @x: Variable to store result. | 506 | * @x: Variable to store result. |
507 | * @ptr: Source address, in user space. | 507 | * @ptr: Source address, in user space. |
508 | * | 508 | * |
@@ -519,7 +519,7 @@ struct __large_struct { unsigned long buf[100]; }; | |||
519 | * Caller must check the pointer with access_ok() before calling this | 519 | * Caller must check the pointer with access_ok() before calling this |
520 | * function. | 520 | * function. |
521 | * | 521 | * |
522 | * Returns zero on success, or -EFAULT on error. | 522 | * Return: zero on success, or -EFAULT on error. |
523 | * On error, the variable @x is set to zero. | 523 | * On error, the variable @x is set to zero. |
524 | */ | 524 | */ |
525 | 525 | ||
@@ -527,7 +527,7 @@ struct __large_struct { unsigned long buf[100]; }; | |||
527 | __get_user_nocheck((x), (ptr), sizeof(*(ptr))) | 527 | __get_user_nocheck((x), (ptr), sizeof(*(ptr))) |
528 | 528 | ||
529 | /** | 529 | /** |
530 | * __put_user: - Write a simple value into user space, with less checking. | 530 | * __put_user - Write a simple value into user space, with less checking. |
531 | * @x: Value to copy to user space. | 531 | * @x: Value to copy to user space. |
532 | * @ptr: Destination address, in user space. | 532 | * @ptr: Destination address, in user space. |
533 | * | 533 | * |
@@ -544,7 +544,7 @@ struct __large_struct { unsigned long buf[100]; }; | |||
544 | * Caller must check the pointer with access_ok() before calling this | 544 | * Caller must check the pointer with access_ok() before calling this |
545 | * function. | 545 | * function. |
546 | * | 546 | * |
547 | * Returns zero on success, or -EFAULT on error. | 547 | * Return: zero on success, or -EFAULT on error. |
548 | */ | 548 | */ |
549 | 549 | ||
550 | #define __put_user(x, ptr) \ | 550 | #define __put_user(x, ptr) \ |
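
The uaccess.h changes above are kernel-doc cleanups only (drop the stray colon after the function name, move the return value under a "Return:" heading); the API itself is unchanged. For reference, a hedged usage sketch of the documented contract (function name hypothetical):

	static long example_copy_int(int __user *uptr)
	{
		int val = 42;

		if (!access_ok(uptr, sizeof(*uptr)))
			return -EFAULT;
		/* put_user()/get_user() return 0 on success, -EFAULT on error */
		if (put_user(val, uptr))
			return -EFAULT;
		if (get_user(val, uptr))
			return -EFAULT;
		return 0;
	}
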
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index a555da094157..1e225528f0d7 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/crash_dump.h> | 27 | #include <linux/crash_dump.h> |
28 | #include <linux/reboot.h> | 28 | #include <linux/reboot.h> |
29 | #include <linux/memory.h> | 29 | #include <linux/memory.h> |
30 | #include <linux/numa.h> | ||
30 | 31 | ||
31 | #include <asm/uv/uv_mmrs.h> | 32 | #include <asm/uv/uv_mmrs.h> |
32 | #include <asm/uv/uv_hub.h> | 33 | #include <asm/uv/uv_hub.h> |
@@ -1390,7 +1391,7 @@ static void __init build_socket_tables(void) | |||
1390 | } | 1391 | } |
1391 | 1392 | ||
1392 | /* Set socket -> node values: */ | 1393 | /* Set socket -> node values: */ |
1393 | lnid = -1; | 1394 | lnid = NUMA_NO_NODE; |
1394 | for_each_present_cpu(cpu) { | 1395 | for_each_present_cpu(cpu) { |
1395 | int nid = cpu_to_node(cpu); | 1396 | int nid = cpu_to_node(cpu); |
1396 | int apicid, sockid; | 1397 | int apicid, sockid; |
@@ -1521,7 +1522,7 @@ static void __init uv_system_init_hub(void) | |||
1521 | new_hub->pnode = 0xffff; | 1522 | new_hub->pnode = 0xffff; |
1522 | 1523 | ||
1523 | new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); | 1524 | new_hub->numa_blade_id = uv_node_to_blade_id(nodeid); |
1524 | new_hub->memory_nid = -1; | 1525 | new_hub->memory_nid = NUMA_NO_NODE; |
1525 | new_hub->nr_possible_cpus = 0; | 1526 | new_hub->nr_possible_cpus = 0; |
1526 | new_hub->nr_online_cpus = 0; | 1527 | new_hub->nr_online_cpus = 0; |
1527 | } | 1528 | } |
@@ -1538,7 +1539,7 @@ static void __init uv_system_init_hub(void) | |||
1538 | 1539 | ||
1539 | uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); | 1540 | uv_cpu_info_per(cpu)->p_uv_hub_info = uv_hub_info_list(nodeid); |
1540 | uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++; | 1541 | uv_cpu_info_per(cpu)->blade_cpu_id = uv_cpu_hub_info(cpu)->nr_possible_cpus++; |
1541 | if (uv_cpu_hub_info(cpu)->memory_nid == -1) | 1542 | if (uv_cpu_hub_info(cpu)->memory_nid == NUMA_NO_NODE) |
1542 | uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); | 1543 | uv_cpu_hub_info(cpu)->memory_nid = cpu_to_node(cpu); |
1543 | 1544 | ||
1544 | /* Init memoryless node: */ | 1545 | /* Init memoryless node: */ |
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e8796fcd7e5a..13af08827eef 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c | |||
@@ -171,7 +171,7 @@ void __init setup_per_cpu_areas(void) | |||
171 | unsigned long delta; | 171 | unsigned long delta; |
172 | int rc; | 172 | int rc; |
173 | 173 | ||
174 | pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%d\n", | 174 | pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%u nr_node_ids:%u\n", |
175 | NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); | 175 | NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); |
176 | 176 | ||
177 | /* | 177 | /* |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ccd1f2a8e557..c91ff9f9fe8a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -56,6 +56,7 @@ | |||
56 | #include <linux/stackprotector.h> | 56 | #include <linux/stackprotector.h> |
57 | #include <linux/gfp.h> | 57 | #include <linux/gfp.h> |
58 | #include <linux/cpuidle.h> | 58 | #include <linux/cpuidle.h> |
59 | #include <linux/numa.h> | ||
59 | 60 | ||
60 | #include <asm/acpi.h> | 61 | #include <asm/acpi.h> |
61 | #include <asm/desc.h> | 62 | #include <asm/desc.h> |
@@ -841,7 +842,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) | |||
841 | /* reduce the number of lines printed when booting a large cpu count system */ | 842 | /* reduce the number of lines printed when booting a large cpu count system */ |
842 | static void announce_cpu(int cpu, int apicid) | 843 | static void announce_cpu(int cpu, int apicid) |
843 | { | 844 | { |
844 | static int current_node = -1; | 845 | static int current_node = NUMA_NO_NODE; |
845 | int node = early_cpu_to_node(cpu); | 846 | int node = early_cpu_to_node(cpu); |
846 | static int width, node_width; | 847 | static int width, node_width; |
847 | 848 | ||
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index bfd94e7812fc..7d290777246d 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c | |||
@@ -54,13 +54,13 @@ do { \ | |||
54 | } while (0) | 54 | } while (0) |
55 | 55 | ||
56 | /** | 56 | /** |
57 | * clear_user: - Zero a block of memory in user space. | 57 | * clear_user - Zero a block of memory in user space. |
58 | * @to: Destination address, in user space. | 58 | * @to: Destination address, in user space. |
59 | * @n: Number of bytes to zero. | 59 | * @n: Number of bytes to zero. |
60 | * | 60 | * |
61 | * Zero a block of memory in user space. | 61 | * Zero a block of memory in user space. |
62 | * | 62 | * |
63 | * Returns number of bytes that could not be cleared. | 63 | * Return: number of bytes that could not be cleared. |
64 | * On success, this will be zero. | 64 | * On success, this will be zero. |
65 | */ | 65 | */ |
66 | unsigned long | 66 | unsigned long |
@@ -74,14 +74,14 @@ clear_user(void __user *to, unsigned long n) | |||
74 | EXPORT_SYMBOL(clear_user); | 74 | EXPORT_SYMBOL(clear_user); |
75 | 75 | ||
76 | /** | 76 | /** |
77 | * __clear_user: - Zero a block of memory in user space, with less checking. | 77 | * __clear_user - Zero a block of memory in user space, with less checking. |
78 | * @to: Destination address, in user space. | 78 | * @to: Destination address, in user space. |
79 | * @n: Number of bytes to zero. | 79 | * @n: Number of bytes to zero. |
80 | * | 80 | * |
81 | * Zero a block of memory in user space. Caller must check | 81 | * Zero a block of memory in user space. Caller must check |
82 | * the specified block with access_ok() before calling this function. | 82 | * the specified block with access_ok() before calling this function. |
83 | * | 83 | * |
84 | * Returns number of bytes that could not be cleared. | 84 | * Return: number of bytes that could not be cleared. |
85 | * On success, this will be zero. | 85 | * On success, this will be zero. |
86 | */ | 86 | */ |
87 | unsigned long | 87 | unsigned long |
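
Same kernel-doc "Return:" cleanup as in uaccess.h. The contract is unchanged: clear_user()/__clear_user() return the number of bytes that could not be zeroed, so zero means success. Illustrative call site:

	if (clear_user(ubuf, len))	/* non-zero: some bytes were not cleared */
		return -EFAULT;
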
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 1308f5408bf7..12c1b7a83ed7 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c | |||
@@ -123,7 +123,7 @@ void __init setup_node_to_cpumask_map(void) | |||
123 | alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); | 123 | alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); |
124 | 124 | ||
125 | /* cpumask_of_node() will now work */ | 125 | /* cpumask_of_node() will now work */ |
126 | pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); | 126 | pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); |
127 | } | 127 | } |
128 | 128 | ||
129 | static int __init numa_add_memblk_to(int nid, u64 start, u64 end, | 129 | static int __init numa_add_memblk_to(int nid, u64 start, u64 end, |
@@ -866,7 +866,7 @@ const struct cpumask *cpumask_of_node(int node) | |||
866 | { | 866 | { |
867 | if (node >= nr_node_ids) { | 867 | if (node >= nr_node_ids) { |
868 | printk(KERN_WARNING | 868 | printk(KERN_WARNING |
869 | "cpumask_of_node(%d): node > nr_node_ids(%d)\n", | 869 | "cpumask_of_node(%d): node > nr_node_ids(%u)\n", |
870 | node, nr_node_ids); | 870 | node, nr_node_ids); |
871 | dump_stack(); | 871 | dump_stack(); |
872 | return cpu_none_mask; | 872 | return cpu_none_mask; |
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index a7e47cf7ec6c..6e4c6bd62203 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h | |||
@@ -17,8 +17,8 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); | |||
17 | 17 | ||
18 | void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); | 18 | void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); |
19 | 19 | ||
20 | pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); | 20 | pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); |
21 | void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, | 21 | void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, |
22 | pte_t *ptep, pte_t pte); | 22 | pte_t *ptep, pte_t pte); |
23 | 23 | ||
24 | unsigned long xen_read_cr2_direct(void); | 24 | unsigned long xen_read_cr2_direct(void); |
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 0f4fe206dcc2..856a85814f00 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c | |||
@@ -306,20 +306,20 @@ static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | |||
306 | __xen_set_pte(ptep, pteval); | 306 | __xen_set_pte(ptep, pteval); |
307 | } | 307 | } |
308 | 308 | ||
309 | pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, | 309 | pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, |
310 | unsigned long addr, pte_t *ptep) | 310 | unsigned long addr, pte_t *ptep) |
311 | { | 311 | { |
312 | /* Just return the pte as-is. We preserve the bits on commit */ | 312 | /* Just return the pte as-is. We preserve the bits on commit */ |
313 | trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); | 313 | trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep); |
314 | return *ptep; | 314 | return *ptep; |
315 | } | 315 | } |
316 | 316 | ||
317 | void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, | 317 | void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, |
318 | pte_t *ptep, pte_t pte) | 318 | pte_t *ptep, pte_t pte) |
319 | { | 319 | { |
320 | struct mmu_update u; | 320 | struct mmu_update u; |
321 | 321 | ||
322 | trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); | 322 | trace_xen_mmu_ptep_modify_prot_commit(vma->vm_mm, addr, ptep, pte); |
323 | xen_mc_batch(); | 323 | xen_mc_batch(); |
324 | 324 | ||
325 | u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; | 325 | u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; |
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 88e8440e75c3..2f3ee4d6af82 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/export.h> | 40 | #include <linux/export.h> |
41 | #include <linux/debugfs.h> | 41 | #include <linux/debugfs.h> |
42 | #include <linux/prefetch.h> | 42 | #include <linux/prefetch.h> |
43 | #include <linux/numa.h> | ||
43 | #include "mtip32xx.h" | 44 | #include "mtip32xx.h" |
44 | 45 | ||
45 | #define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) | 46 | #define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) |
@@ -4018,9 +4019,9 @@ static int get_least_used_cpu_on_node(int node) | |||
4018 | /* Helper for selecting a node in round robin mode */ | 4019 | /* Helper for selecting a node in round robin mode */ |
4019 | static inline int mtip_get_next_rr_node(void) | 4020 | static inline int mtip_get_next_rr_node(void) |
4020 | { | 4021 | { |
4021 | static int next_node = -1; | 4022 | static int next_node = NUMA_NO_NODE; |
4022 | 4023 | ||
4023 | if (next_node == -1) { | 4024 | if (next_node == NUMA_NO_NODE) { |
4024 | next_node = first_online_node; | 4025 | next_node = first_online_node; |
4025 | return next_node; | 4026 | return next_node; |
4026 | } | 4027 | } |
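
mtip32xx keeps a static "next node" cursor for round-robin placement, and it now starts out as NUMA_NO_NODE rather than -1. A hedged, simplified sketch of the round-robin walk (first_online_node/next_online_node come from <linux/nodemask.h>; this sketches the pattern, it is not the driver function verbatim):

	static int example_next_rr_node(void)
	{
		static int next_node = NUMA_NO_NODE;

		if (next_node == NUMA_NO_NODE) {	/* first call */
			next_node = first_online_node;
			return next_node;
		}

		next_node = next_online_node(next_node);
		if (next_node == MAX_NUMNODES)		/* wrapped past the last node */
			next_node = first_online_node;

		return next_node;
	}
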
diff --git a/drivers/char/agp/efficeon-agp.c b/drivers/char/agp/efficeon-agp.c index 7f88490b5479..c53f0f9ef5b0 100644 --- a/drivers/char/agp/efficeon-agp.c +++ b/drivers/char/agp/efficeon-agp.c | |||
@@ -163,7 +163,6 @@ static int efficeon_free_gatt_table(struct agp_bridge_data *bridge) | |||
163 | unsigned long page = efficeon_private.l1_table[index]; | 163 | unsigned long page = efficeon_private.l1_table[index]; |
164 | if (page) { | 164 | if (page) { |
165 | efficeon_private.l1_table[index] = 0; | 165 | efficeon_private.l1_table[index] = 0; |
166 | ClearPageReserved(virt_to_page((char *)page)); | ||
167 | free_page(page); | 166 | free_page(page); |
168 | freed++; | 167 | freed++; |
169 | } | 168 | } |
@@ -219,7 +218,6 @@ static int efficeon_create_gatt_table(struct agp_bridge_data *bridge) | |||
219 | efficeon_free_gatt_table(agp_bridge); | 218 | efficeon_free_gatt_table(agp_bridge); |
220 | return -ENOMEM; | 219 | return -ENOMEM; |
221 | } | 220 | } |
222 | SetPageReserved(virt_to_page((char *)page)); | ||
223 | 221 | ||
224 | for (offset = 0; offset < PAGE_SIZE; offset += clflush_chunk) | 222 | for (offset = 0; offset < PAGE_SIZE; offset += clflush_chunk) |
225 | clflush((char *)page+offset); | 223 | clflush((char *)page+offset); |
diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index f1a441ab395d..3a11b1092e80 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c | |||
@@ -63,6 +63,7 @@ | |||
63 | #include <linux/acpi_dma.h> | 63 | #include <linux/acpi_dma.h> |
64 | #include <linux/of_dma.h> | 64 | #include <linux/of_dma.h> |
65 | #include <linux/mempool.h> | 65 | #include <linux/mempool.h> |
66 | #include <linux/numa.h> | ||
66 | 67 | ||
67 | static DEFINE_MUTEX(dma_list_mutex); | 68 | static DEFINE_MUTEX(dma_list_mutex); |
68 | static DEFINE_IDA(dma_ida); | 69 | static DEFINE_IDA(dma_ida); |
@@ -386,7 +387,8 @@ EXPORT_SYMBOL(dma_issue_pending_all); | |||
386 | static bool dma_chan_is_local(struct dma_chan *chan, int cpu) | 387 | static bool dma_chan_is_local(struct dma_chan *chan, int cpu) |
387 | { | 388 | { |
388 | int node = dev_to_node(chan->device->dev); | 389 | int node = dev_to_node(chan->device->dev); |
389 | return node == -1 || cpumask_test_cpu(cpu, cpumask_of_node(node)); | 390 | return node == NUMA_NO_NODE || |
391 | cpumask_test_cpu(cpu, cpumask_of_node(node)); | ||
390 | } | 392 | } |
391 | 393 | ||
392 | /** | 394 | /** |
diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h index 9726df37c4c4..540e20eb032c 100644 --- a/drivers/gpu/drm/i915/i915_utils.h +++ b/drivers/gpu/drm/i915/i915_utils.h | |||
@@ -123,12 +123,6 @@ static inline u64 ptr_to_u64(const void *ptr) | |||
123 | 123 | ||
124 | #include <linux/list.h> | 124 | #include <linux/list.h> |
125 | 125 | ||
126 | static inline int list_is_first(const struct list_head *list, | ||
127 | const struct list_head *head) | ||
128 | { | ||
129 | return head->next == list; | ||
130 | } | ||
131 | |||
132 | static inline void __list_del_many(struct list_head *head, | 126 | static inline void __list_del_many(struct list_head *head, |
133 | struct list_head *first) | 127 | struct list_head *first) |
134 | { | 128 | { |
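
The i915 private copy of list_is_first() goes away because an equivalent helper now lives in <linux/list.h> (added elsewhere in this merge); callers are unchanged. Illustrative, hypothetical usage:

	#include <linux/list.h>

	/* true when item sits at the head end of the list */
	if (list_is_first(&item->node, &example_list))
		do_something(item);		/* hypothetical names */
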
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 7c6349a50ef1..dd475f3bcc8a 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c | |||
@@ -681,8 +681,13 @@ static struct notifier_block hv_memory_nb = { | |||
681 | /* Check if the particular page is backed and can be onlined and online it. */ | 681 | /* Check if the particular page is backed and can be onlined and online it. */ |
682 | static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) | 682 | static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg) |
683 | { | 683 | { |
684 | if (!has_pfn_is_backed(has, page_to_pfn(pg))) | 684 | if (!has_pfn_is_backed(has, page_to_pfn(pg))) { |
685 | if (!PageOffline(pg)) | ||
686 | __SetPageOffline(pg); | ||
685 | return; | 687 | return; |
688 | } | ||
689 | if (PageOffline(pg)) | ||
690 | __ClearPageOffline(pg); | ||
686 | 691 | ||
687 | /* This frame is currently backed; online the page. */ | 692 | /* This frame is currently backed; online the page. */ |
688 | __online_page_set_limits(pg); | 693 | __online_page_set_limits(pg); |
@@ -771,7 +776,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned long size, | |||
771 | } | 776 | } |
772 | } | 777 | } |
773 | 778 | ||
774 | static void hv_online_page(struct page *pg) | 779 | static void hv_online_page(struct page *pg, unsigned int order) |
775 | { | 780 | { |
776 | struct hv_hotadd_state *has; | 781 | struct hv_hotadd_state *has; |
777 | unsigned long flags; | 782 | unsigned long flags; |
@@ -780,10 +785,11 @@ static void hv_online_page(struct page *pg) | |||
780 | spin_lock_irqsave(&dm_device.ha_lock, flags); | 785 | spin_lock_irqsave(&dm_device.ha_lock, flags); |
781 | list_for_each_entry(has, &dm_device.ha_region_list, list) { | 786 | list_for_each_entry(has, &dm_device.ha_region_list, list) { |
782 | /* The page belongs to a different HAS. */ | 787 | /* The page belongs to a different HAS. */ |
783 | if ((pfn < has->start_pfn) || (pfn >= has->end_pfn)) | 788 | if ((pfn < has->start_pfn) || |
789 | (pfn + (1UL << order) > has->end_pfn)) | ||
784 | continue; | 790 | continue; |
785 | 791 | ||
786 | hv_page_online_one(has, pg); | 792 | hv_bring_pgs_online(has, pfn, 1UL << order); |
787 | break; | 793 | break; |
788 | } | 794 | } |
789 | spin_unlock_irqrestore(&dm_device.ha_lock, flags); | 795 | spin_unlock_irqrestore(&dm_device.ha_lock, flags); |
@@ -1201,6 +1207,7 @@ static void free_balloon_pages(struct hv_dynmem_device *dm, | |||
1201 | 1207 | ||
1202 | for (i = 0; i < num_pages; i++) { | 1208 | for (i = 0; i < num_pages; i++) { |
1203 | pg = pfn_to_page(i + start_frame); | 1209 | pg = pfn_to_page(i + start_frame); |
1210 | __ClearPageOffline(pg); | ||
1204 | __free_page(pg); | 1211 | __free_page(pg); |
1205 | dm->num_pages_ballooned--; | 1212 | dm->num_pages_ballooned--; |
1206 | } | 1213 | } |
@@ -1213,7 +1220,7 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, | |||
1213 | struct dm_balloon_response *bl_resp, | 1220 | struct dm_balloon_response *bl_resp, |
1214 | int alloc_unit) | 1221 | int alloc_unit) |
1215 | { | 1222 | { |
1216 | unsigned int i = 0; | 1223 | unsigned int i, j; |
1217 | struct page *pg; | 1224 | struct page *pg; |
1218 | 1225 | ||
1219 | if (num_pages < alloc_unit) | 1226 | if (num_pages < alloc_unit) |
@@ -1245,6 +1252,10 @@ static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm, | |||
1245 | if (alloc_unit != 1) | 1252 | if (alloc_unit != 1) |
1246 | split_page(pg, get_order(alloc_unit << PAGE_SHIFT)); | 1253 | split_page(pg, get_order(alloc_unit << PAGE_SHIFT)); |
1247 | 1254 | ||
1255 | /* mark all pages offline */ | ||
1256 | for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++) | ||
1257 | __SetPageOffline(pg + j); | ||
1258 | |||
1248 | bl_resp->range_count++; | 1259 | bl_resp->range_count++; |
1249 | bl_resp->range_array[i].finfo.start_page = | 1260 | bl_resp->range_array[i].finfo.start_page = |
1250 | page_to_pfn(pg); | 1261 | page_to_pfn(pg); |
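
Together with the xen balloon hunk further down, this reflects the memory-hotplug change in this merge where the online-page callback now receives a whole order-sized, contiguous range in one call instead of being invoked once per page. A hedged sketch of the new callback shape (helper name hypothetical):

	/* Signature after this series: the callback gets a page and an order. */
	static void example_online_page(struct page *pg, unsigned int order)
	{
		unsigned long pfn = page_to_pfn(pg);
		unsigned long i;

		for (i = 0; i < (1UL << order); i++)
			example_bring_page_online(pfn_to_page(pfn + i));	/* hypothetical */
	}
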
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index 2baf38cc1e23..4fe662c3bbc1 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c | |||
@@ -48,6 +48,7 @@ | |||
48 | #include <linux/cpumask.h> | 48 | #include <linux/cpumask.h> |
49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
50 | #include <linux/interrupt.h> | 50 | #include <linux/interrupt.h> |
51 | #include <linux/numa.h> | ||
51 | 52 | ||
52 | #include "hfi.h" | 53 | #include "hfi.h" |
53 | #include "affinity.h" | 54 | #include "affinity.h" |
@@ -777,7 +778,7 @@ void hfi1_dev_affinity_clean_up(struct hfi1_devdata *dd) | |||
777 | _dev_comp_vect_cpu_mask_clean_up(dd, entry); | 778 | _dev_comp_vect_cpu_mask_clean_up(dd, entry); |
778 | unlock: | 779 | unlock: |
779 | mutex_unlock(&node_affinity.lock); | 780 | mutex_unlock(&node_affinity.lock); |
780 | dd->node = -1; | 781 | dd->node = NUMA_NO_NODE; |
781 | } | 782 | } |
782 | 783 | ||
783 | /* | 784 | /* |
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 7835eb52e7c5..441b06e2a154 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c | |||
@@ -54,6 +54,7 @@ | |||
54 | #include <linux/printk.h> | 54 | #include <linux/printk.h> |
55 | #include <linux/hrtimer.h> | 55 | #include <linux/hrtimer.h> |
56 | #include <linux/bitmap.h> | 56 | #include <linux/bitmap.h> |
57 | #include <linux/numa.h> | ||
57 | #include <rdma/rdma_vt.h> | 58 | #include <rdma/rdma_vt.h> |
58 | 59 | ||
59 | #include "hfi.h" | 60 | #include "hfi.h" |
@@ -1303,7 +1304,7 @@ static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, | |||
1303 | dd->unit = ret; | 1304 | dd->unit = ret; |
1304 | list_add(&dd->list, &hfi1_dev_list); | 1305 | list_add(&dd->list, &hfi1_dev_list); |
1305 | } | 1306 | } |
1306 | dd->node = -1; | 1307 | dd->node = NUMA_NO_NODE; |
1307 | 1308 | ||
1308 | spin_unlock_irqrestore(&hfi1_devs_lock, flags); | 1309 | spin_unlock_irqrestore(&hfi1_devs_lock, flags); |
1309 | idr_preload_end(); | 1310 | idr_preload_end(); |
diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index 58dc70bffd5b..9c49300e9fb7 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/dmi.h> | 39 | #include <linux/dmi.h> |
40 | #include <linux/slab.h> | 40 | #include <linux/slab.h> |
41 | #include <linux/iommu.h> | 41 | #include <linux/iommu.h> |
42 | #include <linux/numa.h> | ||
42 | #include <asm/irq_remapping.h> | 43 | #include <asm/irq_remapping.h> |
43 | #include <asm/iommu_table.h> | 44 | #include <asm/iommu_table.h> |
44 | 45 | ||
@@ -477,7 +478,7 @@ static int dmar_parse_one_rhsa(struct acpi_dmar_header *header, void *arg) | |||
477 | int node = acpi_map_pxm_to_node(rhsa->proximity_domain); | 478 | int node = acpi_map_pxm_to_node(rhsa->proximity_domain); |
478 | 479 | ||
479 | if (!node_online(node)) | 480 | if (!node_online(node)) |
480 | node = -1; | 481 | node = NUMA_NO_NODE; |
481 | drhd->iommu->node = node; | 482 | drhd->iommu->node = node; |
482 | return 0; | 483 | return 0; |
483 | } | 484 | } |
@@ -1062,7 +1063,7 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) | |||
1062 | iommu->msagaw = msagaw; | 1063 | iommu->msagaw = msagaw; |
1063 | iommu->segment = drhd->segment; | 1064 | iommu->segment = drhd->segment; |
1064 | 1065 | ||
1065 | iommu->node = -1; | 1066 | iommu->node = NUMA_NO_NODE; |
1066 | 1067 | ||
1067 | ver = readl(iommu->reg + DMAR_VER_REG); | 1068 | ver = readl(iommu->reg + DMAR_VER_REG); |
1068 | pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n", | 1069 | pr_info("%s: reg_base_addr %llx ver %d:%d cap %llx ecap %llx\n", |
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 78188bf7e90d..39a33dec4d0b 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include <linux/dma-contiguous.h> | 47 | #include <linux/dma-contiguous.h> |
48 | #include <linux/dma-direct.h> | 48 | #include <linux/dma-direct.h> |
49 | #include <linux/crash_dump.h> | 49 | #include <linux/crash_dump.h> |
50 | #include <linux/numa.h> | ||
50 | #include <asm/irq_remapping.h> | 51 | #include <asm/irq_remapping.h> |
51 | #include <asm/cacheflush.h> | 52 | #include <asm/cacheflush.h> |
52 | #include <asm/iommu.h> | 53 | #include <asm/iommu.h> |
@@ -1716,7 +1717,7 @@ static struct dmar_domain *alloc_domain(int flags) | |||
1716 | return NULL; | 1717 | return NULL; |
1717 | 1718 | ||
1718 | memset(domain, 0, sizeof(*domain)); | 1719 | memset(domain, 0, sizeof(*domain)); |
1719 | domain->nid = -1; | 1720 | domain->nid = NUMA_NO_NODE; |
1720 | domain->flags = flags; | 1721 | domain->flags = flags; |
1721 | domain->has_iotlb_device = false; | 1722 | domain->has_iotlb_device = false; |
1722 | INIT_LIST_HEAD(&domain->devices); | 1723 | INIT_LIST_HEAD(&domain->devices); |
diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index 0441abe87880..9e443df44b3b 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
23 | #include <linux/err.h> | 23 | #include <linux/err.h> |
24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
25 | #include <linux/numa.h> | ||
25 | #include <asm/uv/uv_hub.h> | 26 | #include <asm/uv/uv_hub.h> |
26 | #if defined CONFIG_X86_64 | 27 | #if defined CONFIG_X86_64 |
27 | #include <asm/uv/bios.h> | 28 | #include <asm/uv/bios.h> |
@@ -61,7 +62,7 @@ static struct xpc_heartbeat_uv *xpc_heartbeat_uv; | |||
61 | XPC_NOTIFY_MSG_SIZE_UV) | 62 | XPC_NOTIFY_MSG_SIZE_UV) |
62 | #define XPC_NOTIFY_IRQ_NAME "xpc_notify" | 63 | #define XPC_NOTIFY_IRQ_NAME "xpc_notify" |
63 | 64 | ||
64 | static int xpc_mq_node = -1; | 65 | static int xpc_mq_node = NUMA_NO_NODE; |
65 | 66 | ||
66 | static struct xpc_gru_mq_uv *xpc_activate_mq_uv; | 67 | static struct xpc_gru_mq_uv *xpc_activate_mq_uv; |
67 | static struct xpc_gru_mq_uv *xpc_notify_mq_uv; | 68 | static struct xpc_gru_mq_uv *xpc_notify_mq_uv; |
diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index f8240b87df22..869ec842729e 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c | |||
@@ -557,6 +557,36 @@ vmballoon_page_in_frames(enum vmballoon_page_size_type page_size) | |||
557 | } | 557 | } |
558 | 558 | ||
559 | /** | 559 | /** |
560 | * vmballoon_mark_page_offline() - mark a page as offline | ||
561 | * @page: pointer for the page. | ||
562 | * @page_size: the size of the page. | ||
563 | */ | ||
564 | static void | ||
565 | vmballoon_mark_page_offline(struct page *page, | ||
566 | enum vmballoon_page_size_type page_size) | ||
567 | { | ||
568 | int i; | ||
569 | |||
570 | for (i = 0; i < vmballoon_page_in_frames(page_size); i++) | ||
571 | __SetPageOffline(page + i); | ||
572 | } | ||
573 | |||
574 | /** | ||
575 | * vmballoon_mark_page_online() - mark a page as online | ||
576 | * @page: pointer for the page. | ||
577 | * @page_size: the size of the page. | ||
578 | */ | ||
579 | static void | ||
580 | vmballoon_mark_page_online(struct page *page, | ||
581 | enum vmballoon_page_size_type page_size) | ||
582 | { | ||
583 | int i; | ||
584 | |||
585 | for (i = 0; i < vmballoon_page_in_frames(page_size); i++) | ||
586 | __ClearPageOffline(page + i); | ||
587 | } | ||
588 | |||
589 | /** | ||
560 | * vmballoon_send_get_target() - Retrieve desired balloon size from the host. | 590 | * vmballoon_send_get_target() - Retrieve desired balloon size from the host. |
561 | * | 591 | * |
562 | * @b: pointer to the balloon. | 592 | * @b: pointer to the balloon. |
@@ -612,6 +642,7 @@ static int vmballoon_alloc_page_list(struct vmballoon *b, | |||
612 | ctl->page_size); | 642 | ctl->page_size); |
613 | 643 | ||
614 | if (page) { | 644 | if (page) { |
645 | vmballoon_mark_page_offline(page, ctl->page_size); | ||
615 | /* Success. Add the page to the list and continue. */ | 646 | /* Success. Add the page to the list and continue. */ |
616 | list_add(&page->lru, &ctl->pages); | 647 | list_add(&page->lru, &ctl->pages); |
617 | continue; | 648 | continue; |
@@ -850,6 +881,7 @@ static void vmballoon_release_page_list(struct list_head *page_list, | |||
850 | 881 | ||
851 | list_for_each_entry_safe(page, tmp, page_list, lru) { | 882 | list_for_each_entry_safe(page, tmp, page_list, lru) { |
852 | list_del(&page->lru); | 883 | list_del(&page->lru); |
884 | vmballoon_mark_page_online(page, page_size); | ||
853 | __free_pages(page, vmballoon_page_order(page_size)); | 885 | __free_pages(page, vmballoon_page_order(page_size)); |
854 | } | 886 | } |
855 | 887 | ||
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index a4e7584a50cb..e100054a3765 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/bpf.h> | 27 | #include <linux/bpf.h> |
28 | #include <linux/bpf_trace.h> | 28 | #include <linux/bpf_trace.h> |
29 | #include <linux/atomic.h> | 29 | #include <linux/atomic.h> |
30 | #include <linux/numa.h> | ||
30 | #include <scsi/fc/fc_fcoe.h> | 31 | #include <scsi/fc/fc_fcoe.h> |
31 | #include <net/udp_tunnel.h> | 32 | #include <net/udp_tunnel.h> |
32 | #include <net/pkt_cls.h> | 33 | #include <net/pkt_cls.h> |
@@ -6418,7 +6419,7 @@ int ixgbe_setup_tx_resources(struct ixgbe_ring *tx_ring) | |||
6418 | { | 6419 | { |
6419 | struct device *dev = tx_ring->dev; | 6420 | struct device *dev = tx_ring->dev; |
6420 | int orig_node = dev_to_node(dev); | 6421 | int orig_node = dev_to_node(dev); |
6421 | int ring_node = -1; | 6422 | int ring_node = NUMA_NO_NODE; |
6422 | int size; | 6423 | int size; |
6423 | 6424 | ||
6424 | size = sizeof(struct ixgbe_tx_buffer) * tx_ring->count; | 6425 | size = sizeof(struct ixgbe_tx_buffer) * tx_ring->count; |
@@ -6512,7 +6513,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter, | |||
6512 | { | 6513 | { |
6513 | struct device *dev = rx_ring->dev; | 6514 | struct device *dev = rx_ring->dev; |
6514 | int orig_node = dev_to_node(dev); | 6515 | int orig_node = dev_to_node(dev); |
6515 | int ring_node = -1; | 6516 | int ring_node = NUMA_NO_NODE; |
6516 | int size; | 6517 | int size; |
6517 | 6518 | ||
6518 | size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count; | 6519 | size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count; |
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index ceb5048de9a7..39b229f9e256 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c | |||
@@ -369,14 +369,20 @@ static enum bp_state reserve_additional_memory(void) | |||
369 | return BP_ECANCELED; | 369 | return BP_ECANCELED; |
370 | } | 370 | } |
371 | 371 | ||
372 | static void xen_online_page(struct page *page) | 372 | static void xen_online_page(struct page *page, unsigned int order) |
373 | { | 373 | { |
374 | __online_page_set_limits(page); | 374 | unsigned long i, size = (1 << order); |
375 | unsigned long start_pfn = page_to_pfn(page); | ||
376 | struct page *p; | ||
375 | 377 | ||
378 | pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn); | ||
376 | mutex_lock(&balloon_mutex); | 379 | mutex_lock(&balloon_mutex); |
377 | 380 | for (i = 0; i < size; i++) { | |
378 | __balloon_append(page); | 381 | p = pfn_to_page(start_pfn + i); |
379 | 382 | __online_page_set_limits(p); | |
383 | __SetPageOffline(p); | ||
384 | __balloon_append(p); | ||
385 | } | ||
380 | mutex_unlock(&balloon_mutex); | 386 | mutex_unlock(&balloon_mutex); |
381 | } | 387 | } |
382 | 388 | ||
@@ -441,6 +447,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages) | |||
441 | xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]); | 447 | xenmem_reservation_va_mapping_update(1, &page, &frame_list[i]); |
442 | 448 | ||
443 | /* Relinquish the page back to the allocator. */ | 449 | /* Relinquish the page back to the allocator. */ |
450 | __ClearPageOffline(page); | ||
444 | free_reserved_page(page); | 451 | free_reserved_page(page); |
445 | } | 452 | } |
446 | 453 | ||
@@ -467,6 +474,7 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) | |||
467 | state = BP_EAGAIN; | 474 | state = BP_EAGAIN; |
468 | break; | 475 | break; |
469 | } | 476 | } |
477 | __SetPageOffline(page); | ||
470 | adjust_managed_page_count(page, -1); | 478 | adjust_managed_page_count(page, -1); |
471 | xenmem_reservation_scrub_page(page); | 479 | xenmem_reservation_scrub_page(page); |
472 | list_add(&page->lru, &pages); | 480 | list_add(&page->lru, &pages); |
diff --git a/fs/file.c b/fs/file.c --- a/fs/file.c +++ b/fs/file.c | |||
@@ -457,6 +457,7 @@ struct files_struct init_files = { | |||
457 | .full_fds_bits = init_files.full_fds_bits_init, | 457 | .full_fds_bits = init_files.full_fds_bits_init, |
458 | }, | 458 | }, |
459 | .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), | 459 | .file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock), |
460 | .resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait), | ||
460 | }; | 461 | }; |
461 | 462 | ||
462 | static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) | 463 | static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start) |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a7fa037b876b..b0eef008de67 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) | |||
530 | inode_lock(inode); | 530 | inode_lock(inode); |
531 | 531 | ||
532 | /* protected by i_mutex */ | 532 | /* protected by i_mutex */ |
533 | if (info->seals & F_SEAL_WRITE) { | 533 | if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { |
534 | inode_unlock(inode); | 534 | inode_unlock(inode); |
535 | return -EPERM; | 535 | return -EPERM; |
536 | } | 536 | } |
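
hugetlbfs_punch_hole() now also refuses when the new F_SEAL_FUTURE_WRITE seal (introduced elsewhere in this merge) is set, not only F_SEAL_WRITE. A hedged userspace illustration; it assumes a kernel with this series and reserved hugetlb pages for MFD_HUGETLB, and it defines F_SEAL_FUTURE_WRITE locally in case libc headers do not carry it yet:

	#define _GNU_SOURCE
	#include <sys/mman.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <errno.h>
	#include <stdio.h>

	#ifndef F_SEAL_FUTURE_WRITE
	#define F_SEAL_FUTURE_WRITE 0x0010	/* added in this merge window */
	#endif

	int main(void)
	{
		const long sz = 2 * 1024 * 1024;	/* one 2 MiB hugepage */
		int fd = memfd_create("seal-demo", MFD_HUGETLB | MFD_ALLOW_SEALING);

		if (fd < 0 || ftruncate(fd, sz) < 0)	/* needs reserved hugepages */
			return 1;
		if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
			return 1;

		/* With the hunk above, punching a hole is refused as well. */
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, sz) < 0)
			printf("hole punch refused: %s\n",
			       errno == EPERM ? "EPERM (sealed)" : "other error");
		close(fd);
		return 0;
	}
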
diff --git a/fs/inode.c b/fs/inode.c index 73432e64f874..e9d97add2b36 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -2093,14 +2093,8 @@ EXPORT_SYMBOL(inode_dio_wait); | |||
2093 | void inode_set_flags(struct inode *inode, unsigned int flags, | 2093 | void inode_set_flags(struct inode *inode, unsigned int flags, |
2094 | unsigned int mask) | 2094 | unsigned int mask) |
2095 | { | 2095 | { |
2096 | unsigned int old_flags, new_flags; | ||
2097 | |||
2098 | WARN_ON_ONCE(flags & ~mask); | 2096 | WARN_ON_ONCE(flags & ~mask); |
2099 | do { | 2097 | set_mask_bits(&inode->i_flags, mask, flags); |
2100 | old_flags = READ_ONCE(inode->i_flags); | ||
2101 | new_flags = (old_flags & ~mask) | flags; | ||
2102 | } while (unlikely(cmpxchg(&inode->i_flags, old_flags, | ||
2103 | new_flags) != old_flags)); | ||
2104 | } | 2098 | } |
2105 | EXPORT_SYMBOL(inode_set_flags); | 2099 | EXPORT_SYMBOL(inode_set_flags); |
2106 | 2100 | ||
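
inode_set_flags() now defers to the generic set_mask_bits() helper; semantically this is the same atomic load/cmpxchg loop that the deleted lines open-coded, roughly:

	/* Rough equivalent of set_mask_bits(&inode->i_flags, mask, flags),
	 * i.e. what the removed lines spelled out by hand. */
	unsigned int old_flags, new_flags;

	do {
		old_flags = READ_ONCE(inode->i_flags);
		new_flags = (old_flags & ~mask) | flags;
	} while (cmpxchg(&inode->i_flags, old_flags, new_flags) != old_flags);
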
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index f8d5021a652e..ae948aaa4c53 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c | |||
@@ -832,26 +832,35 @@ void kernfs_drain_open_files(struct kernfs_node *kn) | |||
832 | * to see if it supports poll (Neither 'poll' nor 'select' return | 832 | * to see if it supports poll (Neither 'poll' nor 'select' return |
833 | * an appropriate error code). When in doubt, set a suitable timeout value. | 833 | * an appropriate error code). When in doubt, set a suitable timeout value. |
834 | */ | 834 | */ |
835 | __poll_t kernfs_generic_poll(struct kernfs_open_file *of, poll_table *wait) | ||
836 | { | ||
837 | struct kernfs_node *kn = kernfs_dentry_node(of->file->f_path.dentry); | ||
838 | struct kernfs_open_node *on = kn->attr.open; | ||
839 | |||
840 | poll_wait(of->file, &on->poll, wait); | ||
841 | |||
842 | if (of->event != atomic_read(&on->event)) | ||
843 | return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; | ||
844 | |||
845 | return DEFAULT_POLLMASK; | ||
846 | } | ||
847 | |||
835 | static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) | 848 | static __poll_t kernfs_fop_poll(struct file *filp, poll_table *wait) |
836 | { | 849 | { |
837 | struct kernfs_open_file *of = kernfs_of(filp); | 850 | struct kernfs_open_file *of = kernfs_of(filp); |
838 | struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); | 851 | struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); |
839 | struct kernfs_open_node *on = kn->attr.open; | 852 | __poll_t ret; |
840 | 853 | ||
841 | if (!kernfs_get_active(kn)) | 854 | if (!kernfs_get_active(kn)) |
842 | goto trigger; | 855 | return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; |
843 | 856 | ||
844 | poll_wait(filp, &on->poll, wait); | 857 | if (kn->attr.ops->poll) |
858 | ret = kn->attr.ops->poll(of, wait); | ||
859 | else | ||
860 | ret = kernfs_generic_poll(of, wait); | ||
845 | 861 | ||
846 | kernfs_put_active(kn); | 862 | kernfs_put_active(kn); |
847 | 863 | return ret; | |
848 | if (of->event != atomic_read(&on->event)) | ||
849 | goto trigger; | ||
850 | |||
851 | return DEFAULT_POLLMASK; | ||
852 | |||
853 | trigger: | ||
854 | return DEFAULT_POLLMASK|EPOLLERR|EPOLLPRI; | ||
855 | } | 864 | } |
856 | 865 | ||
857 | static void kernfs_notify_workfn(struct work_struct *work) | 866 | static void kernfs_notify_workfn(struct work_struct *work) |
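
kernfs_fop_poll() is refactored so an attribute can supply its own ->poll() in kernfs_ops, with kernfs_generic_poll() exported as the default behaviour for such implementations to fall back on. A hedged sketch of an op layering on the generic helper (all example_* names are hypothetical):

	static DECLARE_WAIT_QUEUE_HEAD(example_wq);		/* hypothetical */

	static __poll_t example_kernfs_poll(struct kernfs_open_file *of,
					    poll_table *wait)
	{
		/* register a private waitqueue, then defer to the generic logic */
		poll_wait(of->file, &example_wq, wait);
		return kernfs_generic_poll(of, wait);
	}

	static struct kernfs_ops example_ops = {
		.seq_show	= example_seq_show,		/* hypothetical */
		.poll		= example_kernfs_poll,
	};
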
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d1cbb27808e2..6f0999015a44 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -7532,10 +7532,11 @@ static int ocfs2_trim_group(struct super_block *sb, | |||
7532 | return count; | 7532 | return count; |
7533 | } | 7533 | } |
7534 | 7534 | ||
7535 | int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) | 7535 | static |
7536 | int ocfs2_trim_mainbm(struct super_block *sb, struct fstrim_range *range) | ||
7536 | { | 7537 | { |
7537 | struct ocfs2_super *osb = OCFS2_SB(sb); | 7538 | struct ocfs2_super *osb = OCFS2_SB(sb); |
7538 | u64 start, len, trimmed, first_group, last_group, group; | 7539 | u64 start, len, trimmed = 0, first_group, last_group = 0, group = 0; |
7539 | int ret, cnt; | 7540 | int ret, cnt; |
7540 | u32 first_bit, last_bit, minlen; | 7541 | u32 first_bit, last_bit, minlen; |
7541 | struct buffer_head *main_bm_bh = NULL; | 7542 | struct buffer_head *main_bm_bh = NULL; |
@@ -7543,7 +7544,6 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
7543 | struct buffer_head *gd_bh = NULL; | 7544 | struct buffer_head *gd_bh = NULL; |
7544 | struct ocfs2_dinode *main_bm; | 7545 | struct ocfs2_dinode *main_bm; |
7545 | struct ocfs2_group_desc *gd = NULL; | 7546 | struct ocfs2_group_desc *gd = NULL; |
7546 | struct ocfs2_trim_fs_info info, *pinfo = NULL; | ||
7547 | 7547 | ||
7548 | start = range->start >> osb->s_clustersize_bits; | 7548 | start = range->start >> osb->s_clustersize_bits; |
7549 | len = range->len >> osb->s_clustersize_bits; | 7549 | len = range->len >> osb->s_clustersize_bits; |
@@ -7552,6 +7552,9 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
7552 | if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) | 7552 | if (minlen >= osb->bitmap_cpg || range->len < sb->s_blocksize) |
7553 | return -EINVAL; | 7553 | return -EINVAL; |
7554 | 7554 | ||
7555 | trace_ocfs2_trim_mainbm(start, len, minlen); | ||
7556 | |||
7557 | next_group: | ||
7555 | main_bm_inode = ocfs2_get_system_file_inode(osb, | 7558 | main_bm_inode = ocfs2_get_system_file_inode(osb, |
7556 | GLOBAL_BITMAP_SYSTEM_INODE, | 7559 | GLOBAL_BITMAP_SYSTEM_INODE, |
7557 | OCFS2_INVALID_SLOT); | 7560 | OCFS2_INVALID_SLOT); |
@@ -7570,64 +7573,34 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
7570 | } | 7573 | } |
7571 | main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; | 7574 | main_bm = (struct ocfs2_dinode *)main_bm_bh->b_data; |
7572 | 7575 | ||
7573 | if (start >= le32_to_cpu(main_bm->i_clusters)) { | 7576 | /* |
7574 | ret = -EINVAL; | 7577 | * Do some check before trim the first group. |
7575 | goto out_unlock; | 7578 | */ |
7576 | } | 7579 | if (!group) { |
7577 | 7580 | if (start >= le32_to_cpu(main_bm->i_clusters)) { | |
7578 | len = range->len >> osb->s_clustersize_bits; | 7581 | ret = -EINVAL; |
7579 | if (start + len > le32_to_cpu(main_bm->i_clusters)) | ||
7580 | len = le32_to_cpu(main_bm->i_clusters) - start; | ||
7581 | |||
7582 | trace_ocfs2_trim_fs(start, len, minlen); | ||
7583 | |||
7584 | ocfs2_trim_fs_lock_res_init(osb); | ||
7585 | ret = ocfs2_trim_fs_lock(osb, NULL, 1); | ||
7586 | if (ret < 0) { | ||
7587 | if (ret != -EAGAIN) { | ||
7588 | mlog_errno(ret); | ||
7589 | ocfs2_trim_fs_lock_res_uninit(osb); | ||
7590 | goto out_unlock; | 7582 | goto out_unlock; |
7591 | } | 7583 | } |
7592 | 7584 | ||
7593 | mlog(ML_NOTICE, "Wait for trim on device (%s) to " | 7585 | if (start + len > le32_to_cpu(main_bm->i_clusters)) |
7594 | "finish, which is running from another node.\n", | 7586 | len = le32_to_cpu(main_bm->i_clusters) - start; |
7595 | osb->dev_str); | ||
7596 | ret = ocfs2_trim_fs_lock(osb, &info, 0); | ||
7597 | if (ret < 0) { | ||
7598 | mlog_errno(ret); | ||
7599 | ocfs2_trim_fs_lock_res_uninit(osb); | ||
7600 | goto out_unlock; | ||
7601 | } | ||
7602 | 7587 | ||
7603 | if (info.tf_valid && info.tf_success && | 7588 | /* |
7604 | info.tf_start == start && info.tf_len == len && | 7589 | * Determine first and last group to examine based on |
7605 | info.tf_minlen == minlen) { | 7590 | * start and len |
7606 | /* Avoid sending duplicated trim to a shared device */ | 7591 | */ |
7607 | mlog(ML_NOTICE, "The same trim on device (%s) was " | 7592 | first_group = ocfs2_which_cluster_group(main_bm_inode, start); |
7608 | "just done from node (%u), return.\n", | 7593 | if (first_group == osb->first_cluster_group_blkno) |
7609 | osb->dev_str, info.tf_nodenum); | 7594 | first_bit = start; |
7610 | range->len = info.tf_trimlen; | 7595 | else |
7611 | goto out_trimunlock; | 7596 | first_bit = start - ocfs2_blocks_to_clusters(sb, |
7612 | } | 7597 | first_group); |
7598 | last_group = ocfs2_which_cluster_group(main_bm_inode, | ||
7599 | start + len - 1); | ||
7600 | group = first_group; | ||
7613 | } | 7601 | } |
7614 | 7602 | ||
7615 | info.tf_nodenum = osb->node_num; | 7603 | do { |
7616 | info.tf_start = start; | ||
7617 | info.tf_len = len; | ||
7618 | info.tf_minlen = minlen; | ||
7619 | |||
7620 | /* Determine first and last group to examine based on start and len */ | ||
7621 | first_group = ocfs2_which_cluster_group(main_bm_inode, start); | ||
7622 | if (first_group == osb->first_cluster_group_blkno) | ||
7623 | first_bit = start; | ||
7624 | else | ||
7625 | first_bit = start - ocfs2_blocks_to_clusters(sb, first_group); | ||
7626 | last_group = ocfs2_which_cluster_group(main_bm_inode, start + len - 1); | ||
7627 | last_bit = osb->bitmap_cpg; | ||
7628 | |||
7629 | trimmed = 0; | ||
7630 | for (group = first_group; group <= last_group;) { | ||
7631 | if (first_bit + len >= osb->bitmap_cpg) | 7604 | if (first_bit + len >= osb->bitmap_cpg) |
7632 | last_bit = osb->bitmap_cpg; | 7605 | last_bit = osb->bitmap_cpg; |
7633 | else | 7606 | else |
@@ -7659,21 +7632,81 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
7659 | group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); | 7632 | group = ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); |
7660 | else | 7633 | else |
7661 | group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); | 7634 | group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); |
7662 | } | 7635 | } while (0); |
7663 | range->len = trimmed * sb->s_blocksize; | ||
7664 | 7636 | ||
7665 | info.tf_trimlen = range->len; | ||
7666 | info.tf_success = (ret ? 0 : 1); | ||
7667 | pinfo = &info; | ||
7668 | out_trimunlock: | ||
7669 | ocfs2_trim_fs_unlock(osb, pinfo); | ||
7670 | ocfs2_trim_fs_lock_res_uninit(osb); | ||
7671 | out_unlock: | 7637 | out_unlock: |
7672 | ocfs2_inode_unlock(main_bm_inode, 0); | 7638 | ocfs2_inode_unlock(main_bm_inode, 0); |
7673 | brelse(main_bm_bh); | 7639 | brelse(main_bm_bh); |
7640 | main_bm_bh = NULL; | ||
7674 | out_mutex: | 7641 | out_mutex: |
7675 | inode_unlock(main_bm_inode); | 7642 | inode_unlock(main_bm_inode); |
7676 | iput(main_bm_inode); | 7643 | iput(main_bm_inode); |
7644 | |||
7645 | /* | ||
7646 | * The main_bm related locks were released above to avoid starving ||
7647 | * other IO. If not all groups have been trimmed yet and no error ||
7648 | * occurred, go on to trim the next group ||
7649 | */ | ||
7650 | if (ret >= 0 && group <= last_group) | ||
7651 | goto next_group; | ||
7677 | out: | 7652 | out: |
7653 | range->len = trimmed * sb->s_blocksize; | ||
7654 | return ret; | ||
7655 | } | ||
7656 | |||
7657 | int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) | ||
7658 | { | ||
7659 | int ret; | ||
7660 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
7661 | struct ocfs2_trim_fs_info info, *pinfo = NULL; | ||
7662 | |||
7663 | ocfs2_trim_fs_lock_res_init(osb); | ||
7664 | |||
7665 | trace_ocfs2_trim_fs(range->start, range->len, range->minlen); | ||
7666 | |||
7667 | ret = ocfs2_trim_fs_lock(osb, NULL, 1); | ||
7668 | if (ret < 0) { | ||
7669 | if (ret != -EAGAIN) { | ||
7670 | mlog_errno(ret); | ||
7671 | ocfs2_trim_fs_lock_res_uninit(osb); | ||
7672 | return ret; | ||
7673 | } | ||
7674 | |||
7675 | mlog(ML_NOTICE, "Wait for trim on device (%s) to " | ||
7676 | "finish, which is running from another node.\n", | ||
7677 | osb->dev_str); | ||
7678 | ret = ocfs2_trim_fs_lock(osb, &info, 0); | ||
7679 | if (ret < 0) { | ||
7680 | mlog_errno(ret); | ||
7681 | ocfs2_trim_fs_lock_res_uninit(osb); | ||
7682 | return ret; | ||
7683 | } | ||
7684 | |||
7685 | if (info.tf_valid && info.tf_success && | ||
7686 | info.tf_start == range->start && | ||
7687 | info.tf_len == range->len && | ||
7688 | info.tf_minlen == range->minlen) { | ||
7689 | /* Avoid sending duplicated trim to a shared device */ | ||
7690 | mlog(ML_NOTICE, "The same trim on device (%s) was " | ||
7691 | "just done from node (%u), return.\n", | ||
7692 | osb->dev_str, info.tf_nodenum); | ||
7693 | range->len = info.tf_trimlen; | ||
7694 | goto out; | ||
7695 | } | ||
7696 | } | ||
7697 | |||
7698 | info.tf_nodenum = osb->node_num; | ||
7699 | info.tf_start = range->start; | ||
7700 | info.tf_len = range->len; | ||
7701 | info.tf_minlen = range->minlen; | ||
7702 | |||
7703 | ret = ocfs2_trim_mainbm(sb, range); | ||
7704 | |||
7705 | info.tf_trimlen = range->len; | ||
7706 | info.tf_success = (ret < 0 ? 0 : 1); | ||
7707 | pinfo = &info; | ||
7708 | out: | ||
7709 | ocfs2_trim_fs_unlock(osb, pinfo); | ||
7710 | ocfs2_trim_fs_lock_res_uninit(osb); | ||
7678 | return ret; | 7711 | return ret; |
7679 | } | 7712 | } |
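
The ocfs2/alloc.c change above splits the old monolithic trim path: ocfs2_trim_mainbm() now takes and drops the global-bitmap inode locks around every cluster group (the next_group label), so regular I/O that needs those locks is no longer starved for the whole duration of an fstrim, while ocfs2_trim_fs() keeps only the cross-node "same trim was just done" check under the trimfs DLM lock. A minimal userspace analogue of that per-group lock/unlock pattern, using a pthread mutex and hypothetical names (bitmap_lock, trim_one_group, NR_GROUPS) as stand-ins for the ocfs2 cluster locking, might look like this sketch:

#include <pthread.h>
#include <stdio.h>

#define NR_GROUPS 8

static pthread_mutex_t bitmap_lock = PTHREAD_MUTEX_INITIALIZER;

/* stand-in for ocfs2_trim_group(): pretend each group frees 128 clusters */
static long trim_one_group(int group)
{
        (void)group;
        return 128;
}

static long trim_all_groups(void)
{
        long trimmed = 0;
        int group;

        for (group = 0; group < NR_GROUPS; group++) {
                /* hold the lock only while one group is being trimmed ... */
                pthread_mutex_lock(&bitmap_lock);
                trimmed += trim_one_group(group);
                pthread_mutex_unlock(&bitmap_lock);
                /* ... so writers blocked on bitmap_lock can run in between */
        }
        return trimmed;
}

int main(void)
{
        printf("trimmed %ld clusters\n", trim_all_groups());
        return 0;
}

The kernel version additionally re-looks up the bitmap inode and rechecks the range on every iteration, which the sketch omits.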
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 0e4166cc23a0..4ac775e32240 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c | |||
@@ -621,13 +621,15 @@ static void o2nm_node_group_drop_item(struct config_group *group, | |||
621 | struct o2nm_node *node = to_o2nm_node(item); | 621 | struct o2nm_node *node = to_o2nm_node(item); |
622 | struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); | 622 | struct o2nm_cluster *cluster = to_o2nm_cluster(group->cg_item.ci_parent); |
623 | 623 | ||
624 | o2net_disconnect_node(node); | 624 | if (cluster->cl_nodes[node->nd_num] == node) { |
625 | o2net_disconnect_node(node); | ||
625 | 626 | ||
626 | if (cluster->cl_has_local && | 627 | if (cluster->cl_has_local && |
627 | (cluster->cl_local_node == node->nd_num)) { | 628 | (cluster->cl_local_node == node->nd_num)) { |
628 | cluster->cl_has_local = 0; | 629 | cluster->cl_has_local = 0; |
629 | cluster->cl_local_node = O2NM_INVALID_NODE_NUM; | 630 | cluster->cl_local_node = O2NM_INVALID_NODE_NUM; |
630 | o2net_stop_listening(node); | 631 | o2net_stop_listening(node); |
632 | } | ||
631 | } | 633 | } |
632 | 634 | ||
633 | /* XXX call into net to stop this node from trading messages */ | 635 | /* XXX call into net to stop this node from trading messages */ |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 7c835824247e..af405586c5b1 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -686,6 +686,9 @@ void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb) | |||
686 | { | 686 | { |
687 | struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; | 687 | struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; |
688 | 688 | ||
689 | /* Only one trimfs thread is allowed to work at the same time. */ ||
690 | mutex_lock(&osb->obs_trim_fs_mutex); | ||
691 | |||
689 | ocfs2_lock_res_init_once(lockres); | 692 | ocfs2_lock_res_init_once(lockres); |
690 | ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name); | 693 | ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name); |
691 | ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS, | 694 | ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS, |
@@ -698,6 +701,8 @@ void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb) | |||
698 | 701 | ||
699 | ocfs2_simple_drop_lockres(osb, lockres); | 702 | ocfs2_simple_drop_lockres(osb, lockres); |
700 | ocfs2_lock_res_free(lockres); | 703 | ocfs2_lock_res_free(lockres); |
704 | |||
705 | mutex_unlock(&osb->obs_trim_fs_mutex); | ||
701 | } | 706 | } |
702 | 707 | ||
703 | static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, | 708 | static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 4f86ac0027b5..1f029fbe8b8d 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -407,6 +407,7 @@ struct ocfs2_super | |||
407 | struct ocfs2_lock_res osb_rename_lockres; | 407 | struct ocfs2_lock_res osb_rename_lockres; |
408 | struct ocfs2_lock_res osb_nfs_sync_lockres; | 408 | struct ocfs2_lock_res osb_nfs_sync_lockres; |
409 | struct ocfs2_lock_res osb_trim_fs_lockres; | 409 | struct ocfs2_lock_res osb_trim_fs_lockres; |
410 | struct mutex obs_trim_fs_mutex; | ||
410 | struct ocfs2_dlm_debug *osb_dlm_debug; | 411 | struct ocfs2_dlm_debug *osb_dlm_debug; |
411 | 412 | ||
412 | struct dentry *osb_debug_root; | 413 | struct dentry *osb_debug_root; |
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 2ee76a90ba8f..dc4bce1649c1 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h | |||
@@ -712,6 +712,8 @@ TRACE_EVENT(ocfs2_trim_extent, | |||
712 | 712 | ||
713 | DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); | 713 | DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_trim_group); |
714 | 714 | ||
715 | DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_mainbm); | ||
716 | |||
715 | DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); | 717 | DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_trim_fs); |
716 | 718 | ||
717 | /* End of trace events for fs/ocfs2/alloc.c. */ | 719 | /* End of trace events for fs/ocfs2/alloc.c. */ |
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index d7407994f308..ea0756d83250 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c | |||
@@ -55,7 +55,7 @@ struct ocfs2_slot_info { | |||
55 | unsigned int si_blocks; | 55 | unsigned int si_blocks; |
56 | struct buffer_head **si_bh; | 56 | struct buffer_head **si_bh; |
57 | unsigned int si_num_slots; | 57 | unsigned int si_num_slots; |
58 | struct ocfs2_slot *si_slots; | 58 | struct ocfs2_slot si_slots[]; |
59 | }; | 59 | }; |
60 | 60 | ||
61 | 61 | ||
@@ -420,9 +420,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) | |||
420 | struct inode *inode = NULL; | 420 | struct inode *inode = NULL; |
421 | struct ocfs2_slot_info *si; | 421 | struct ocfs2_slot_info *si; |
422 | 422 | ||
423 | si = kzalloc(sizeof(struct ocfs2_slot_info) + | 423 | si = kzalloc(struct_size(si, si_slots, osb->max_slots), GFP_KERNEL); |
424 | (sizeof(struct ocfs2_slot) * osb->max_slots), | ||
425 | GFP_KERNEL); | ||
426 | if (!si) { | 424 | if (!si) { |
427 | status = -ENOMEM; | 425 | status = -ENOMEM; |
428 | mlog_errno(status); | 426 | mlog_errno(status); |
@@ -431,8 +429,6 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) | |||
431 | 429 | ||
432 | si->si_extended = ocfs2_uses_extended_slot_map(osb); | 430 | si->si_extended = ocfs2_uses_extended_slot_map(osb); |
433 | si->si_num_slots = osb->max_slots; | 431 | si->si_num_slots = osb->max_slots; |
434 | si->si_slots = (struct ocfs2_slot *)((char *)si + | ||
435 | sizeof(struct ocfs2_slot_info)); | ||
436 | 432 | ||
437 | inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, | 433 | inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, |
438 | OCFS2_INVALID_SLOT); | 434 | OCFS2_INVALID_SLOT); |
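
The slot_map.c hunk above turns si_slots from a hand-placed trailing pointer into a C99 flexible array member and sizes the single allocation with struct_size(), which also guards the multiplication against overflow. A rough userspace illustration of the same pattern, with plain sizeof arithmetic standing in for the kernel's struct_size() helper and the field names invented for the example:

#include <stdio.h>
#include <stdlib.h>

struct slot {
        int assigned;
        unsigned int node;
};

struct slot_info {
        unsigned int num_slots;
        struct slot slots[];    /* flexible array member replaces a pointer */
};

static struct slot_info *slot_info_alloc(unsigned int n)
{
        /* one allocation covers the header and all n trailing slots */
        struct slot_info *si = calloc(1, sizeof(*si) + n * sizeof(si->slots[0]));

        if (si)
                si->num_slots = n;
        return si;
}

int main(void)
{
        struct slot_info *si = slot_info_alloc(4);

        if (!si)
                return 1;
        si->slots[3].assigned = 1;
        printf("%u slots, slot 3 assigned = %d\n", si->num_slots, si->slots[3].assigned);
        free(si);
        return 0;
}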
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 3415e0b09398..96ae7cedd487 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -1847,6 +1847,8 @@ static int ocfs2_mount_volume(struct super_block *sb) | |||
1847 | if (ocfs2_is_hard_readonly(osb)) | 1847 | if (ocfs2_is_hard_readonly(osb)) |
1848 | goto leave; | 1848 | goto leave; |
1849 | 1849 | ||
1850 | mutex_init(&osb->obs_trim_fs_mutex); | ||
1851 | |||
1850 | status = ocfs2_dlm_init(osb); | 1852 | status = ocfs2_dlm_init(osb); |
1851 | if (status < 0) { | 1853 | if (status < 0) { |
1852 | mlog_errno(status); | 1854 | mlog_errno(status); |
diff --git a/fs/pipe.c b/fs/pipe.c --- a/fs/pipe.c +++ b/fs/pipe.c | |||
@@ -140,8 +140,7 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, | |||
140 | struct page *page = buf->page; | 140 | struct page *page = buf->page; |
141 | 141 | ||
142 | if (page_count(page) == 1) { | 142 | if (page_count(page) == 1) { |
143 | if (memcg_kmem_enabled()) | 143 | memcg_kmem_uncharge(page, 0); |
144 | memcg_kmem_uncharge(page, 0); | ||
145 | __SetPageLocked(page); | 144 | __SetPageLocked(page); |
146 | return 0; | 145 | return 0; |
147 | } | 146 | } |
diff --git a/fs/proc/array.c b/fs/proc/array.c index 9d428d5a0ac8..2edbb657f859 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
@@ -343,28 +343,28 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) | |||
343 | #ifdef CONFIG_SECCOMP | 343 | #ifdef CONFIG_SECCOMP |
344 | seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); | 344 | seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); |
345 | #endif | 345 | #endif |
346 | seq_printf(m, "\nSpeculation_Store_Bypass:\t"); | 346 | seq_puts(m, "\nSpeculation_Store_Bypass:\t"); |
347 | switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { | 347 | switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { |
348 | case -EINVAL: | 348 | case -EINVAL: |
349 | seq_printf(m, "unknown"); | 349 | seq_puts(m, "unknown"); |
350 | break; | 350 | break; |
351 | case PR_SPEC_NOT_AFFECTED: | 351 | case PR_SPEC_NOT_AFFECTED: |
352 | seq_printf(m, "not vulnerable"); | 352 | seq_puts(m, "not vulnerable"); |
353 | break; | 353 | break; |
354 | case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: | 354 | case PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE: |
355 | seq_printf(m, "thread force mitigated"); | 355 | seq_puts(m, "thread force mitigated"); |
356 | break; | 356 | break; |
357 | case PR_SPEC_PRCTL | PR_SPEC_DISABLE: | 357 | case PR_SPEC_PRCTL | PR_SPEC_DISABLE: |
358 | seq_printf(m, "thread mitigated"); | 358 | seq_puts(m, "thread mitigated"); |
359 | break; | 359 | break; |
360 | case PR_SPEC_PRCTL | PR_SPEC_ENABLE: | 360 | case PR_SPEC_PRCTL | PR_SPEC_ENABLE: |
361 | seq_printf(m, "thread vulnerable"); | 361 | seq_puts(m, "thread vulnerable"); |
362 | break; | 362 | break; |
363 | case PR_SPEC_DISABLE: | 363 | case PR_SPEC_DISABLE: |
364 | seq_printf(m, "globally mitigated"); | 364 | seq_puts(m, "globally mitigated"); |
365 | break; | 365 | break; |
366 | default: | 366 | default: |
367 | seq_printf(m, "vulnerable"); | 367 | seq_puts(m, "vulnerable"); |
368 | break; | 368 | break; |
369 | } | 369 | } |
370 | seq_putc(m, '\n'); | 370 | seq_putc(m, '\n'); |
diff --git a/fs/proc/base.c b/fs/proc/base.c index f5ed9512d193..511b279ec69c 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -456,7 +456,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, | |||
456 | struct pid *pid, struct task_struct *task) | 456 | struct pid *pid, struct task_struct *task) |
457 | { | 457 | { |
458 | if (unlikely(!sched_info_on())) | 458 | if (unlikely(!sched_info_on())) |
459 | seq_printf(m, "0 0 0\n"); | 459 | seq_puts(m, "0 0 0\n"); |
460 | else | 460 | else |
461 | seq_printf(m, "%llu %llu %lu\n", | 461 | seq_printf(m, "%llu %llu %lu\n", |
462 | (unsigned long long)task->se.sum_exec_runtime, | 462 | (unsigned long long)task->se.sum_exec_runtime, |
@@ -3161,7 +3161,7 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry, | |||
3161 | return d_splice_alias(inode, dentry); | 3161 | return d_splice_alias(inode, dentry); |
3162 | } | 3162 | } |
3163 | 3163 | ||
3164 | struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) | 3164 | struct dentry *proc_pid_lookup(struct dentry *dentry, unsigned int flags) |
3165 | { | 3165 | { |
3166 | struct task_struct *task; | 3166 | struct task_struct *task; |
3167 | unsigned tgid; | 3167 | unsigned tgid; |
diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 95b14196f284..4fc5a9b68f76 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h | |||
@@ -162,7 +162,7 @@ extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struc | |||
162 | extern void pid_update_inode(struct task_struct *, struct inode *); | 162 | extern void pid_update_inode(struct task_struct *, struct inode *); |
163 | extern int pid_delete_dentry(const struct dentry *); | 163 | extern int pid_delete_dentry(const struct dentry *); |
164 | extern int proc_pid_readdir(struct file *, struct dir_context *); | 164 | extern int proc_pid_readdir(struct file *, struct dir_context *); |
165 | extern struct dentry *proc_pid_lookup(struct inode *, struct dentry *, unsigned int); | 165 | struct dentry *proc_pid_lookup(struct dentry *, unsigned int); |
166 | extern loff_t mem_lseek(struct file *, loff_t, int); | 166 | extern loff_t mem_lseek(struct file *, loff_t, int); |
167 | 167 | ||
168 | /* Lookups */ | 168 | /* Lookups */ |
diff --git a/fs/proc/page.c b/fs/proc/page.c index 40b05e0d4274..544d1ee15aee 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c | |||
@@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page) | |||
152 | else if (page_count(page) == 0 && is_free_buddy_page(page)) | 152 | else if (page_count(page) == 0 && is_free_buddy_page(page)) |
153 | u |= 1 << KPF_BUDDY; | 153 | u |= 1 << KPF_BUDDY; |
154 | 154 | ||
155 | if (PageBalloon(page)) | 155 | if (PageOffline(page)) |
156 | u |= 1 << KPF_BALLOON; | 156 | u |= 1 << KPF_OFFLINE; |
157 | if (PageTable(page)) | 157 | if (PageTable(page)) |
158 | u |= 1 << KPF_PGTABLE; | 158 | u |= 1 << KPF_PGTABLE; |
159 | 159 | ||
diff --git a/fs/proc/root.c b/fs/proc/root.c index f4b1a9d2eca6..621e6ec322ca 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c | |||
@@ -154,7 +154,7 @@ static int proc_root_getattr(const struct path *path, struct kstat *stat, | |||
154 | 154 | ||
155 | static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) | 155 | static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) |
156 | { | 156 | { |
157 | if (!proc_pid_lookup(dir, dentry, flags)) | 157 | if (!proc_pid_lookup(dentry, flags)) |
158 | return NULL; | 158 | return NULL; |
159 | 159 | ||
160 | return proc_lookup(dir, dentry, flags); | 160 | return proc_lookup(dir, dentry, flags); |
diff --git a/fs/proc/self.c b/fs/proc/self.c index 127265e5c55f..57c0a1047250 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c | |||
@@ -38,6 +38,7 @@ int proc_setup_self(struct super_block *s) | |||
38 | struct inode *root_inode = d_inode(s->s_root); | 38 | struct inode *root_inode = d_inode(s->s_root); |
39 | struct pid_namespace *ns = proc_pid_ns(root_inode); | 39 | struct pid_namespace *ns = proc_pid_ns(root_inode); |
40 | struct dentry *self; | 40 | struct dentry *self; |
41 | int ret = -ENOMEM; | ||
41 | 42 | ||
42 | inode_lock(root_inode); | 43 | inode_lock(root_inode); |
43 | self = d_alloc_name(s->s_root, "self"); | 44 | self = d_alloc_name(s->s_root, "self"); |
@@ -51,20 +52,19 @@ int proc_setup_self(struct super_block *s) | |||
51 | inode->i_gid = GLOBAL_ROOT_GID; | 52 | inode->i_gid = GLOBAL_ROOT_GID; |
52 | inode->i_op = &proc_self_inode_operations; | 53 | inode->i_op = &proc_self_inode_operations; |
53 | d_add(self, inode); | 54 | d_add(self, inode); |
55 | ret = 0; | ||
54 | } else { | 56 | } else { |
55 | dput(self); | 57 | dput(self); |
56 | self = ERR_PTR(-ENOMEM); | ||
57 | } | 58 | } |
58 | } else { | ||
59 | self = ERR_PTR(-ENOMEM); | ||
60 | } | 59 | } |
61 | inode_unlock(root_inode); | 60 | inode_unlock(root_inode); |
62 | if (IS_ERR(self)) { | 61 | |
62 | if (ret) | ||
63 | pr_err("proc_fill_super: can't allocate /proc/self\n"); | 63 | pr_err("proc_fill_super: can't allocate /proc/self\n"); |
64 | return PTR_ERR(self); | 64 | else |
65 | } | 65 | ns->proc_self = self; |
66 | ns->proc_self = self; | 66 | |
67 | return 0; | 67 | return ret; |
68 | } | 68 | } |
69 | 69 | ||
70 | void __init proc_self_init(void) | 70 | void __init proc_self_init(void) |
diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 76175211b304..80c305f206bb 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c | |||
@@ -23,21 +23,21 @@ | |||
23 | 23 | ||
24 | #ifdef arch_idle_time | 24 | #ifdef arch_idle_time |
25 | 25 | ||
26 | static u64 get_idle_time(int cpu) | 26 | static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) |
27 | { | 27 | { |
28 | u64 idle; | 28 | u64 idle; |
29 | 29 | ||
30 | idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; | 30 | idle = kcs->cpustat[CPUTIME_IDLE]; |
31 | if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) | 31 | if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) |
32 | idle += arch_idle_time(cpu); | 32 | idle += arch_idle_time(cpu); |
33 | return idle; | 33 | return idle; |
34 | } | 34 | } |
35 | 35 | ||
36 | static u64 get_iowait_time(int cpu) | 36 | static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) |
37 | { | 37 | { |
38 | u64 iowait; | 38 | u64 iowait; |
39 | 39 | ||
40 | iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; | 40 | iowait = kcs->cpustat[CPUTIME_IOWAIT]; |
41 | if (cpu_online(cpu) && nr_iowait_cpu(cpu)) | 41 | if (cpu_online(cpu) && nr_iowait_cpu(cpu)) |
42 | iowait += arch_idle_time(cpu); | 42 | iowait += arch_idle_time(cpu); |
43 | return iowait; | 43 | return iowait; |
@@ -45,7 +45,7 @@ static u64 get_iowait_time(int cpu) | |||
45 | 45 | ||
46 | #else | 46 | #else |
47 | 47 | ||
48 | static u64 get_idle_time(int cpu) | 48 | static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) |
49 | { | 49 | { |
50 | u64 idle, idle_usecs = -1ULL; | 50 | u64 idle, idle_usecs = -1ULL; |
51 | 51 | ||
@@ -54,14 +54,14 @@ static u64 get_idle_time(int cpu) | |||
54 | 54 | ||
55 | if (idle_usecs == -1ULL) | 55 | if (idle_usecs == -1ULL) |
56 | /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ | 56 | /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ |
57 | idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; | 57 | idle = kcs->cpustat[CPUTIME_IDLE]; |
58 | else | 58 | else |
59 | idle = idle_usecs * NSEC_PER_USEC; | 59 | idle = idle_usecs * NSEC_PER_USEC; |
60 | 60 | ||
61 | return idle; | 61 | return idle; |
62 | } | 62 | } |
63 | 63 | ||
64 | static u64 get_iowait_time(int cpu) | 64 | static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) |
65 | { | 65 | { |
66 | u64 iowait, iowait_usecs = -1ULL; | 66 | u64 iowait, iowait_usecs = -1ULL; |
67 | 67 | ||
@@ -70,7 +70,7 @@ static u64 get_iowait_time(int cpu) | |||
70 | 70 | ||
71 | if (iowait_usecs == -1ULL) | 71 | if (iowait_usecs == -1ULL) |
72 | /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ | 72 | /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ |
73 | iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; | 73 | iowait = kcs->cpustat[CPUTIME_IOWAIT]; |
74 | else | 74 | else |
75 | iowait = iowait_usecs * NSEC_PER_USEC; | 75 | iowait = iowait_usecs * NSEC_PER_USEC; |
76 | 76 | ||
@@ -120,16 +120,18 @@ static int show_stat(struct seq_file *p, void *v) | |||
120 | getboottime64(&boottime); | 120 | getboottime64(&boottime); |
121 | 121 | ||
122 | for_each_possible_cpu(i) { | 122 | for_each_possible_cpu(i) { |
123 | user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; | 123 | struct kernel_cpustat *kcs = &kcpustat_cpu(i); |
124 | nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; | 124 | |
125 | system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; | 125 | user += kcs->cpustat[CPUTIME_USER]; |
126 | idle += get_idle_time(i); | 126 | nice += kcs->cpustat[CPUTIME_NICE]; |
127 | iowait += get_iowait_time(i); | 127 | system += kcs->cpustat[CPUTIME_SYSTEM]; |
128 | irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; | 128 | idle += get_idle_time(kcs, i); |
129 | softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; | 129 | iowait += get_iowait_time(kcs, i); |
130 | steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; | 130 | irq += kcs->cpustat[CPUTIME_IRQ]; |
131 | guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; | 131 | softirq += kcs->cpustat[CPUTIME_SOFTIRQ]; |
132 | guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; | 132 | steal += kcs->cpustat[CPUTIME_STEAL]; |
133 | guest += kcs->cpustat[CPUTIME_GUEST]; | ||
134 | guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE]; | ||
133 | sum += kstat_cpu_irqs_sum(i); | 135 | sum += kstat_cpu_irqs_sum(i); |
134 | sum += arch_irq_stat_cpu(i); | 136 | sum += arch_irq_stat_cpu(i); |
135 | 137 | ||
@@ -155,17 +157,19 @@ static int show_stat(struct seq_file *p, void *v) | |||
155 | seq_putc(p, '\n'); | 157 | seq_putc(p, '\n'); |
156 | 158 | ||
157 | for_each_online_cpu(i) { | 159 | for_each_online_cpu(i) { |
160 | struct kernel_cpustat *kcs = &kcpustat_cpu(i); | ||
161 | |||
158 | /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ | 162 | /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ |
159 | user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; | 163 | user = kcs->cpustat[CPUTIME_USER]; |
160 | nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; | 164 | nice = kcs->cpustat[CPUTIME_NICE]; |
161 | system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; | 165 | system = kcs->cpustat[CPUTIME_SYSTEM]; |
162 | idle = get_idle_time(i); | 166 | idle = get_idle_time(kcs, i); |
163 | iowait = get_iowait_time(i); | 167 | iowait = get_iowait_time(kcs, i); |
164 | irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; | 168 | irq = kcs->cpustat[CPUTIME_IRQ]; |
165 | softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; | 169 | softirq = kcs->cpustat[CPUTIME_SOFTIRQ]; |
166 | steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; | 170 | steal = kcs->cpustat[CPUTIME_STEAL]; |
167 | guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; | 171 | guest = kcs->cpustat[CPUTIME_GUEST]; |
168 | guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; | 172 | guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE]; |
169 | seq_printf(p, "cpu%d", i); | 173 | seq_printf(p, "cpu%d", i); |
170 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); | 174 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); |
171 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); | 175 | seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 85b0ef890b28..beccb0b1d57c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -948,10 +948,12 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, | |||
948 | pte_t ptent = *pte; | 948 | pte_t ptent = *pte; |
949 | 949 | ||
950 | if (pte_present(ptent)) { | 950 | if (pte_present(ptent)) { |
951 | ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte); | 951 | pte_t old_pte; |
952 | ptent = pte_wrprotect(ptent); | 952 | |
953 | old_pte = ptep_modify_prot_start(vma, addr, pte); | ||
954 | ptent = pte_wrprotect(old_pte); | ||
953 | ptent = pte_clear_soft_dirty(ptent); | 955 | ptent = pte_clear_soft_dirty(ptent); |
954 | ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent); | 956 | ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); |
955 | } else if (is_swap_pte(ptent)) { | 957 | } else if (is_swap_pte(ptent)) { |
956 | ptent = pte_swp_clear_soft_dirty(ptent); | 958 | ptent = pte_swp_clear_soft_dirty(ptent); |
957 | set_pte_at(vma->vm_mm, addr, pte, ptent); | 959 | set_pte_at(vma->vm_mm, addr, pte, ptent); |
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index f912872fbf91..36bf0f2e102e 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c | |||
@@ -178,7 +178,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) | |||
178 | seq_file_path(m, file, ""); | 178 | seq_file_path(m, file, ""); |
179 | } else if (mm && is_stack(vma)) { | 179 | } else if (mm && is_stack(vma)) { |
180 | seq_pad(m, ' '); | 180 | seq_pad(m, ' '); |
181 | seq_printf(m, "[stack]"); | 181 | seq_puts(m, "[stack]"); |
182 | } | 182 | } |
183 | 183 | ||
184 | seq_putc(m, '\n'); | 184 | seq_putc(m, '\n'); |
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index b905010ca9eb..f61ae53533f5 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c | |||
@@ -38,6 +38,7 @@ int proc_setup_thread_self(struct super_block *s) | |||
38 | struct inode *root_inode = d_inode(s->s_root); | 38 | struct inode *root_inode = d_inode(s->s_root); |
39 | struct pid_namespace *ns = proc_pid_ns(root_inode); | 39 | struct pid_namespace *ns = proc_pid_ns(root_inode); |
40 | struct dentry *thread_self; | 40 | struct dentry *thread_self; |
41 | int ret = -ENOMEM; | ||
41 | 42 | ||
42 | inode_lock(root_inode); | 43 | inode_lock(root_inode); |
43 | thread_self = d_alloc_name(s->s_root, "thread-self"); | 44 | thread_self = d_alloc_name(s->s_root, "thread-self"); |
@@ -51,20 +52,19 @@ int proc_setup_thread_self(struct super_block *s) | |||
51 | inode->i_gid = GLOBAL_ROOT_GID; | 52 | inode->i_gid = GLOBAL_ROOT_GID; |
52 | inode->i_op = &proc_thread_self_inode_operations; | 53 | inode->i_op = &proc_thread_self_inode_operations; |
53 | d_add(thread_self, inode); | 54 | d_add(thread_self, inode); |
55 | ret = 0; | ||
54 | } else { | 56 | } else { |
55 | dput(thread_self); | 57 | dput(thread_self); |
56 | thread_self = ERR_PTR(-ENOMEM); | ||
57 | } | 58 | } |
58 | } else { | ||
59 | thread_self = ERR_PTR(-ENOMEM); | ||
60 | } | 59 | } |
61 | inode_unlock(root_inode); | 60 | inode_unlock(root_inode); |
62 | if (IS_ERR(thread_self)) { | 61 | |
62 | if (ret) | ||
63 | pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); | 63 | pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); |
64 | return PTR_ERR(thread_self); | 64 | else |
65 | } | 65 | ns->proc_thread_self = thread_self; |
66 | ns->proc_thread_self = thread_self; | 66 | |
67 | return 0; | 67 | return ret; |
68 | } | 68 | } |
69 | 69 | ||
70 | void __init proc_thread_self_init(void) | 70 | void __init proc_thread_self_init(void) |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 05e61e6c843f..fa782fba51ee 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -606,7 +606,7 @@ static inline int pmd_none_or_clear_bad(pmd_t *pmd) | |||
606 | return 0; | 606 | return 0; |
607 | } | 607 | } |
608 | 608 | ||
609 | static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm, | 609 | static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, |
610 | unsigned long addr, | 610 | unsigned long addr, |
611 | pte_t *ptep) | 611 | pte_t *ptep) |
612 | { | 612 | { |
@@ -615,10 +615,10 @@ static inline pte_t __ptep_modify_prot_start(struct mm_struct *mm, | |||
615 | * non-present, preventing the hardware from asynchronously | 615 | * non-present, preventing the hardware from asynchronously |
616 | * updating it. | 616 | * updating it. |
617 | */ | 617 | */ |
618 | return ptep_get_and_clear(mm, addr, ptep); | 618 | return ptep_get_and_clear(vma->vm_mm, addr, ptep); |
619 | } | 619 | } |
620 | 620 | ||
621 | static inline void __ptep_modify_prot_commit(struct mm_struct *mm, | 621 | static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, |
622 | unsigned long addr, | 622 | unsigned long addr, |
623 | pte_t *ptep, pte_t pte) | 623 | pte_t *ptep, pte_t pte) |
624 | { | 624 | { |
@@ -626,7 +626,7 @@ static inline void __ptep_modify_prot_commit(struct mm_struct *mm, | |||
626 | * The pte is non-present, so there's no hardware state to | 626 | * The pte is non-present, so there's no hardware state to |
627 | * preserve. | 627 | * preserve. |
628 | */ | 628 | */ |
629 | set_pte_at(mm, addr, ptep, pte); | 629 | set_pte_at(vma->vm_mm, addr, ptep, pte); |
630 | } | 630 | } |
631 | 631 | ||
632 | #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION | 632 | #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION |
@@ -644,22 +644,22 @@ static inline void __ptep_modify_prot_commit(struct mm_struct *mm, | |||
644 | * queue the update to be done at some later time. The update must be | 644 | * queue the update to be done at some later time. The update must be |
645 | * actually committed before the pte lock is released, however. | 645 | * actually committed before the pte lock is released, however. |
646 | */ | 646 | */ |
647 | static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, | 647 | static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, |
648 | unsigned long addr, | 648 | unsigned long addr, |
649 | pte_t *ptep) | 649 | pte_t *ptep) |
650 | { | 650 | { |
651 | return __ptep_modify_prot_start(mm, addr, ptep); | 651 | return __ptep_modify_prot_start(vma, addr, ptep); |
652 | } | 652 | } |
653 | 653 | ||
654 | /* | 654 | /* |
655 | * Commit an update to a pte, leaving any hardware-controlled bits in | 655 | * Commit an update to a pte, leaving any hardware-controlled bits in |
656 | * the PTE unmodified. | 656 | * the PTE unmodified. |
657 | */ | 657 | */ |
658 | static inline void ptep_modify_prot_commit(struct mm_struct *mm, | 658 | static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, |
659 | unsigned long addr, | 659 | unsigned long addr, |
660 | pte_t *ptep, pte_t pte) | 660 | pte_t *ptep, pte_t old_pte, pte_t pte) |
661 | { | 661 | { |
662 | __ptep_modify_prot_commit(mm, addr, ptep, pte); | 662 | __ptep_modify_prot_commit(vma, addr, ptep, pte); |
663 | } | 663 | } |
664 | #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ | 664 | #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ |
665 | #endif /* CONFIG_MMU */ | 665 | #endif /* CONFIG_MMU */ |
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c28a47cbe355..f9b029180241 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h | |||
@@ -365,7 +365,7 @@ unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) | |||
365 | rcu_read_lock(); | 365 | rcu_read_lock(); |
366 | 366 | ||
367 | /* | 367 | /* |
368 | * Paired with store_release in inode_switch_wb_work_fn() and | 368 | * Paired with store_release in inode_switch_wbs_work_fn() and |
369 | * ensures that we see the new wb if we see cleared I_WB_SWITCH. | 369 | * ensures that we see the new wb if we see cleared I_WB_SWITCH. |
370 | */ | 370 | */ |
371 | cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; | 371 | cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; |
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 53051f3d8f25..f111c780ef1d 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h | |||
@@ -4,15 +4,18 @@ | |||
4 | * | 4 | * |
5 | * Common interface definitions for making balloon pages movable by compaction. | 5 | * Common interface definitions for making balloon pages movable by compaction. |
6 | * | 6 | * |
7 | * Despite being perfectly possible to perform ballooned pages migration, they | 7 | * Balloon page migration makes use of the general non-lru movable page |
8 | * make a special corner case to compaction scans because balloon pages are not | 8 | * feature. |
9 | * enlisted at any LRU list like the other pages we do compact / migrate. | 9 | * |
10 | * page->private is used to reference the responsible balloon device. | ||
11 | * page->mapping is used in context of non-lru page migration to reference | ||
12 | * the address space operations for page isolation/migration/compaction. | ||
10 | * | 13 | * |
11 | * As the page isolation scanning step a compaction thread does is a lockless | 14 | * As the page isolation scanning step a compaction thread does is a lockless |
12 | * procedure (from a page standpoint), it might bring some racy situations while | 15 | * procedure (from a page standpoint), it might bring some racy situations while |
13 | * performing balloon page compaction. In order to sort out these racy scenarios | 16 | * performing balloon page compaction. In order to sort out these racy scenarios |
14 | * and safely perform balloon's page compaction and migration we must, always, | 17 | * and safely perform balloon's page compaction and migration we must, always, |
15 | * ensure following these three simple rules: | 18 | * ensure following these simple rules: |
16 | * | 19 | * |
17 | * i. when updating a balloon's page ->mapping element, strictly do it under | 20 | * i. when updating a balloon's page ->mapping element, strictly do it under |
18 | * the following lock order, independently of the far superior | 21 | * the following lock order, independently of the far superior |
@@ -21,19 +24,8 @@ | |||
21 | * +--spin_lock_irq(&b_dev_info->pages_lock); | 24 | * +--spin_lock_irq(&b_dev_info->pages_lock); |
22 | * ... page->mapping updates here ... | 25 | * ... page->mapping updates here ... |
23 | * | 26 | * |
24 | * ii. before isolating or dequeueing a balloon page from the balloon device | 27 | * ii. isolation or dequeueing procedure must remove the page from balloon |
25 | * pages list, the page reference counter must be raised by one and the | 28 | * device page list under b_dev_info->pages_lock. |
26 | * extra refcount must be dropped when the page is enqueued back into | ||
27 | * the balloon device page list, thus a balloon page keeps its reference | ||
28 | * counter raised only while it is under our special handling; | ||
29 | * | ||
30 | * iii. after the lockless scan step have selected a potential balloon page for | ||
31 | * isolation, re-test the PageBalloon mark and the PagePrivate flag | ||
32 | * under the proper page lock, to ensure isolating a valid balloon page | ||
33 | * (not yet isolated, nor under release procedure) | ||
34 | * | ||
35 | * iv. isolation or dequeueing procedure must clear PagePrivate flag under | ||
36 | * page lock together with removing page from balloon device page list. | ||
37 | * | 29 | * |
38 | * The functions provided by this interface are placed to help on coping with | 30 | * The functions provided by this interface are placed to help on coping with |
39 | * the aforementioned balloon page corner case, as well as to ensure the simple | 31 | * the aforementioned balloon page corner case, as well as to ensure the simple |
@@ -103,7 +95,7 @@ extern int balloon_page_migrate(struct address_space *mapping, | |||
103 | static inline void balloon_page_insert(struct balloon_dev_info *balloon, | 95 | static inline void balloon_page_insert(struct balloon_dev_info *balloon, |
104 | struct page *page) | 96 | struct page *page) |
105 | { | 97 | { |
106 | __SetPageBalloon(page); | 98 | __SetPageOffline(page); |
107 | __SetPageMovable(page, balloon->inode->i_mapping); | 99 | __SetPageMovable(page, balloon->inode->i_mapping); |
108 | set_page_private(page, (unsigned long)balloon); | 100 | set_page_private(page, (unsigned long)balloon); |
109 | list_add(&page->lru, &balloon->pages); | 101 | list_add(&page->lru, &balloon->pages); |
@@ -119,7 +111,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, | |||
119 | */ | 111 | */ |
120 | static inline void balloon_page_delete(struct page *page) | 112 | static inline void balloon_page_delete(struct page *page) |
121 | { | 113 | { |
122 | __ClearPageBalloon(page); | 114 | __ClearPageOffline(page); |
123 | __ClearPageMovable(page); | 115 | __ClearPageMovable(page); |
124 | set_page_private(page, 0); | 116 | set_page_private(page, 0); |
125 | /* | 117 | /* |
@@ -149,13 +141,13 @@ static inline gfp_t balloon_mapping_gfp_mask(void) | |||
149 | static inline void balloon_page_insert(struct balloon_dev_info *balloon, | 141 | static inline void balloon_page_insert(struct balloon_dev_info *balloon, |
150 | struct page *page) | 142 | struct page *page) |
151 | { | 143 | { |
152 | __SetPageBalloon(page); | 144 | __SetPageOffline(page); |
153 | list_add(&page->lru, &balloon->pages); | 145 | list_add(&page->lru, &balloon->pages); |
154 | } | 146 | } |
155 | 147 | ||
156 | static inline void balloon_page_delete(struct page *page) | 148 | static inline void balloon_page_delete(struct page *page) |
157 | { | 149 | { |
158 | __ClearPageBalloon(page); | 150 | __ClearPageOffline(page); |
159 | list_del(&page->lru); | 151 | list_del(&page->lru); |
160 | } | 152 | } |
161 | 153 | ||
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8fcbae1b8db0..aad3babef007 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h | |||
@@ -32,6 +32,7 @@ struct kernfs_node; | |||
32 | struct kernfs_ops; | 32 | struct kernfs_ops; |
33 | struct kernfs_open_file; | 33 | struct kernfs_open_file; |
34 | struct seq_file; | 34 | struct seq_file; |
35 | struct poll_table_struct; | ||
35 | 36 | ||
36 | #define MAX_CGROUP_TYPE_NAMELEN 32 | 37 | #define MAX_CGROUP_TYPE_NAMELEN 32 |
37 | #define MAX_CGROUP_ROOT_NAMELEN 64 | 38 | #define MAX_CGROUP_ROOT_NAMELEN 64 |
@@ -574,6 +575,9 @@ struct cftype { | |||
574 | ssize_t (*write)(struct kernfs_open_file *of, | 575 | ssize_t (*write)(struct kernfs_open_file *of, |
575 | char *buf, size_t nbytes, loff_t off); | 576 | char *buf, size_t nbytes, loff_t off); |
576 | 577 | ||
578 | __poll_t (*poll)(struct kernfs_open_file *of, | ||
579 | struct poll_table_struct *pt); | ||
580 | |||
577 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 581 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
578 | struct lock_class_key lockdep_key; | 582 | struct lock_class_key lockdep_key; |
579 | #endif | 583 | #endif |
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 68250a57aace..9569e7c786d3 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
@@ -88,14 +88,13 @@ extern int sysctl_compact_memory; | |||
88 | extern int sysctl_compaction_handler(struct ctl_table *table, int write, | 88 | extern int sysctl_compaction_handler(struct ctl_table *table, int write, |
89 | void __user *buffer, size_t *length, loff_t *ppos); | 89 | void __user *buffer, size_t *length, loff_t *ppos); |
90 | extern int sysctl_extfrag_threshold; | 90 | extern int sysctl_extfrag_threshold; |
91 | extern int sysctl_extfrag_handler(struct ctl_table *table, int write, | ||
92 | void __user *buffer, size_t *length, loff_t *ppos); | ||
93 | extern int sysctl_compact_unevictable_allowed; | 91 | extern int sysctl_compact_unevictable_allowed; |
94 | 92 | ||
95 | extern int fragmentation_index(struct zone *zone, unsigned int order); | 93 | extern int fragmentation_index(struct zone *zone, unsigned int order); |
96 | extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, | 94 | extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, |
97 | unsigned int order, unsigned int alloc_flags, | 95 | unsigned int order, unsigned int alloc_flags, |
98 | const struct alloc_context *ac, enum compact_priority prio); | 96 | const struct alloc_context *ac, enum compact_priority prio, |
97 | struct page **page); | ||
99 | extern void reset_isolation_suitable(pg_data_t *pgdat); | 98 | extern void reset_isolation_suitable(pg_data_t *pgdat); |
100 | extern enum compact_result compaction_suitable(struct zone *zone, int order, | 99 | extern enum compact_result compaction_suitable(struct zone *zone, int order, |
101 | unsigned int alloc_flags, int classzone_idx); | 100 | unsigned int alloc_flags, int classzone_idx); |
@@ -227,8 +226,8 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_i | |||
227 | 226 | ||
228 | #endif /* CONFIG_COMPACTION */ | 227 | #endif /* CONFIG_COMPACTION */ |
229 | 228 | ||
230 | #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) | ||
231 | struct node; | 229 | struct node; |
230 | #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) | ||
232 | extern int compaction_register_node(struct node *node); | 231 | extern int compaction_register_node(struct node *node); |
233 | extern void compaction_unregister_node(struct node *node); | 232 | extern void compaction_unregister_node(struct node *node); |
234 | 233 | ||
diff --git a/include/linux/device.h b/include/linux/device.h index 6cb4640b6160..4d2f13e8c540 100644 --- a/include/linux/device.h +++ b/include/linux/device.h | |||
@@ -1095,7 +1095,7 @@ static inline void set_dev_node(struct device *dev, int node) | |||
1095 | #else | 1095 | #else |
1096 | static inline int dev_to_node(struct device *dev) | 1096 | static inline int dev_to_node(struct device *dev) |
1097 | { | 1097 | { |
1098 | return -1; | 1098 | return NUMA_NO_NODE; |
1099 | } | 1099 | } |
1100 | static inline void set_dev_node(struct device *dev, int node) | 1100 | static inline void set_dev_node(struct device *dev, int node) |
1101 | { | 1101 | { |
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 011965c08b93..6d775984905b 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h | |||
@@ -7,6 +7,13 @@ | |||
7 | #include <linux/bitops.h> | 7 | #include <linux/bitops.h> |
8 | #include <linux/jump_label.h> | 8 | #include <linux/jump_label.h> |
9 | 9 | ||
10 | /* | ||
11 | * Return code to denote that requested number of | ||
12 | * frontswap pages are unused (moved to page cache). ||
13 | * Used in shmem_unuse and try_to_unuse. ||
14 | */ | ||
15 | #define FRONTSWAP_PAGES_UNUSED 2 | ||
16 | |||
10 | struct frontswap_ops { | 17 | struct frontswap_ops { |
11 | void (*init)(unsigned); /* this swap type was just swapon'ed */ | 18 | void (*init)(unsigned); /* this swap type was just swapon'ed */ |
12 | int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ | 19 | int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ |
diff --git a/include/linux/fs.h b/include/linux/fs.h index fd423fec8d83..08f26046233e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -2091,7 +2091,7 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) | |||
2091 | * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to | 2091 | * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to |
2092 | * synchronize competing switching instances and to tell | 2092 | * synchronize competing switching instances and to tell |
2093 | * wb stat updates to grab the i_pages lock. See | 2093 | * wb stat updates to grab the i_pages lock. See |
2094 | * inode_switch_wb_work_fn() for details. | 2094 | * inode_switch_wbs_work_fn() for details. |
2095 | * | 2095 | * |
2096 | * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper | 2096 | * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper |
2097 | * and work dirs among overlayfs mounts. | 2097 | * and work dirs among overlayfs mounts. |
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5f5e25fd6149..fdab7de7490d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
@@ -24,21 +24,21 @@ struct vm_area_struct; | |||
24 | #define ___GFP_HIGH 0x20u | 24 | #define ___GFP_HIGH 0x20u |
25 | #define ___GFP_IO 0x40u | 25 | #define ___GFP_IO 0x40u |
26 | #define ___GFP_FS 0x80u | 26 | #define ___GFP_FS 0x80u |
27 | #define ___GFP_WRITE 0x100u | 27 | #define ___GFP_ZERO 0x100u |
28 | #define ___GFP_NOWARN 0x200u | 28 | #define ___GFP_ATOMIC 0x200u |
29 | #define ___GFP_RETRY_MAYFAIL 0x400u | 29 | #define ___GFP_DIRECT_RECLAIM 0x400u |
30 | #define ___GFP_NOFAIL 0x800u | 30 | #define ___GFP_KSWAPD_RECLAIM 0x800u |
31 | #define ___GFP_NORETRY 0x1000u | 31 | #define ___GFP_WRITE 0x1000u |
32 | #define ___GFP_MEMALLOC 0x2000u | 32 | #define ___GFP_NOWARN 0x2000u |
33 | #define ___GFP_COMP 0x4000u | 33 | #define ___GFP_RETRY_MAYFAIL 0x4000u |
34 | #define ___GFP_ZERO 0x8000u | 34 | #define ___GFP_NOFAIL 0x8000u |
35 | #define ___GFP_NOMEMALLOC 0x10000u | 35 | #define ___GFP_NORETRY 0x10000u |
36 | #define ___GFP_HARDWALL 0x20000u | 36 | #define ___GFP_MEMALLOC 0x20000u |
37 | #define ___GFP_THISNODE 0x40000u | 37 | #define ___GFP_COMP 0x40000u |
38 | #define ___GFP_ATOMIC 0x80000u | 38 | #define ___GFP_NOMEMALLOC 0x80000u |
39 | #define ___GFP_ACCOUNT 0x100000u | 39 | #define ___GFP_HARDWALL 0x100000u |
40 | #define ___GFP_DIRECT_RECLAIM 0x200000u | 40 | #define ___GFP_THISNODE 0x200000u |
41 | #define ___GFP_KSWAPD_RECLAIM 0x400000u | 41 | #define ___GFP_ACCOUNT 0x400000u |
42 | #ifdef CONFIG_LOCKDEP | 42 | #ifdef CONFIG_LOCKDEP |
43 | #define ___GFP_NOLOCKDEP 0x800000u | 43 | #define ___GFP_NOLOCKDEP 0x800000u |
44 | #else | 44 | #else |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 087fd5f48c91..ea35263eb76b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -371,6 +371,8 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, | |||
371 | nodemask_t *nmask); | 371 | nodemask_t *nmask); |
372 | struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, | 372 | struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, |
373 | unsigned long address); | 373 | unsigned long address); |
374 | struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, | ||
375 | int nid, nodemask_t *nmask); | ||
374 | int huge_add_to_page_cache(struct page *page, struct address_space *mapping, | 376 | int huge_add_to_page_cache(struct page *page, struct address_space *mapping, |
375 | pgoff_t idx); | 377 | pgoff_t idx); |
376 | 378 | ||
@@ -493,17 +495,54 @@ static inline pgoff_t basepage_index(struct page *page) | |||
493 | extern int dissolve_free_huge_page(struct page *page); | 495 | extern int dissolve_free_huge_page(struct page *page); |
494 | extern int dissolve_free_huge_pages(unsigned long start_pfn, | 496 | extern int dissolve_free_huge_pages(unsigned long start_pfn, |
495 | unsigned long end_pfn); | 497 | unsigned long end_pfn); |
496 | static inline bool hugepage_migration_supported(struct hstate *h) | 498 | |
497 | { | ||
498 | #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION | 499 | #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION |
500 | #ifndef arch_hugetlb_migration_supported | ||
501 | static inline bool arch_hugetlb_migration_supported(struct hstate *h) | ||
502 | { | ||
499 | if ((huge_page_shift(h) == PMD_SHIFT) || | 503 | if ((huge_page_shift(h) == PMD_SHIFT) || |
500 | (huge_page_shift(h) == PGDIR_SHIFT)) | 504 | (huge_page_shift(h) == PUD_SHIFT) || |
505 | (huge_page_shift(h) == PGDIR_SHIFT)) | ||
501 | return true; | 506 | return true; |
502 | else | 507 | else |
503 | return false; | 508 | return false; |
509 | } | ||
510 | #endif | ||
504 | #else | 511 | #else |
512 | static inline bool arch_hugetlb_migration_supported(struct hstate *h) | ||
513 | { | ||
505 | return false; | 514 | return false; |
515 | } | ||
506 | #endif | 516 | #endif |
517 | |||
518 | static inline bool hugepage_migration_supported(struct hstate *h) | ||
519 | { | ||
520 | return arch_hugetlb_migration_supported(h); | ||
521 | } | ||
522 | |||
523 | /* | ||
524 | * The movability check is different from the migration check. ||
525 | * It determines whether or not a huge page should be placed ||
526 | * in a movable zone. Movability of any huge page should be ||
527 | * required only if huge page size is supported for migration. | ||
528 | * There won't be any reason for the huge page to be movable if ||
529 | * it is not migratable to start with. Also the size of the huge ||
530 | * page should be large enough to be placed in a movable zone ||
531 | * and still feasible enough to be migratable. Just the presence ||
532 | * in a movable zone does not make the migration feasible. ||
533 | * | ||
534 | * So even though large huge page sizes like the gigantic ones | ||
535 | * are migratable, they should not be movable because it's not ||
536 | * feasible to migrate them from a movable zone. ||
537 | */ | ||
538 | static inline bool hugepage_movable_supported(struct hstate *h) | ||
539 | { | ||
540 | if (!hugepage_migration_supported(h)) | ||
541 | return false; | ||
542 | |||
543 | if (hstate_is_gigantic(h)) | ||
544 | return false; | ||
545 | return true; | ||
507 | } | 546 | } |
508 | 547 | ||
509 | static inline spinlock_t *huge_pte_lockptr(struct hstate *h, | 548 | static inline spinlock_t *huge_pte_lockptr(struct hstate *h, |
@@ -543,6 +582,26 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr | |||
543 | set_huge_pte_at(mm, addr, ptep, pte); | 582 | set_huge_pte_at(mm, addr, ptep, pte); |
544 | } | 583 | } |
545 | #endif | 584 | #endif |
585 | |||
586 | #ifndef huge_ptep_modify_prot_start | ||
587 | #define huge_ptep_modify_prot_start huge_ptep_modify_prot_start | ||
588 | static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, | ||
589 | unsigned long addr, pte_t *ptep) | ||
590 | { | ||
591 | return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); | ||
592 | } | ||
593 | #endif | ||
594 | |||
595 | #ifndef huge_ptep_modify_prot_commit | ||
596 | #define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit | ||
597 | static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, | ||
598 | unsigned long addr, pte_t *ptep, | ||
599 | pte_t old_pte, pte_t pte) | ||
600 | { | ||
601 | set_huge_pte_at(vma->vm_mm, addr, ptep, pte); | ||
602 | } | ||
603 | #endif | ||
604 | |||
546 | #else /* CONFIG_HUGETLB_PAGE */ | 605 | #else /* CONFIG_HUGETLB_PAGE */ |
547 | struct hstate {}; | 606 | struct hstate {}; |
548 | #define alloc_huge_page(v, a, r) NULL | 607 | #define alloc_huge_page(v, a, r) NULL |
@@ -602,6 +661,11 @@ static inline bool hugepage_migration_supported(struct hstate *h) | |||
602 | return false; | 661 | return false; |
603 | } | 662 | } |
604 | 663 | ||
664 | static inline bool hugepage_movable_supported(struct hstate *h) | ||
665 | { | ||
666 | return false; | ||
667 | } | ||
668 | |||
605 | static inline spinlock_t *huge_pte_lockptr(struct hstate *h, | 669 | static inline spinlock_t *huge_pte_lockptr(struct hstate *h, |
606 | struct mm_struct *mm, pte_t *pte) | 670 | struct mm_struct *mm, pte_t *pte) |
607 | { | 671 | { |
diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h index d314150658a4..a61dc075e2ce 100644 --- a/include/linux/kasan-checks.h +++ b/include/linux/kasan-checks.h | |||
@@ -2,7 +2,7 @@ | |||
2 | #ifndef _LINUX_KASAN_CHECKS_H | 2 | #ifndef _LINUX_KASAN_CHECKS_H |
3 | #define _LINUX_KASAN_CHECKS_H | 3 | #define _LINUX_KASAN_CHECKS_H |
4 | 4 | ||
5 | #ifdef CONFIG_KASAN | 5 | #if defined(__SANITIZE_ADDRESS__) || defined(__KASAN_INTERNAL) |
6 | void kasan_check_read(const volatile void *p, unsigned int size); | 6 | void kasan_check_read(const volatile void *p, unsigned int size); |
7 | void kasan_check_write(const volatile void *p, unsigned int size); | 7 | void kasan_check_write(const volatile void *p, unsigned int size); |
8 | #else | 8 | #else |
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5b36b1287a5a..0cac1207bb00 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h | |||
@@ -25,6 +25,7 @@ struct seq_file; | |||
25 | struct vm_area_struct; | 25 | struct vm_area_struct; |
26 | struct super_block; | 26 | struct super_block; |
27 | struct file_system_type; | 27 | struct file_system_type; |
28 | struct poll_table_struct; | ||
28 | 29 | ||
29 | struct kernfs_open_node; | 30 | struct kernfs_open_node; |
30 | struct kernfs_iattrs; | 31 | struct kernfs_iattrs; |
@@ -261,6 +262,9 @@ struct kernfs_ops { | |||
261 | ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, | 262 | ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, |
262 | loff_t off); | 263 | loff_t off); |
263 | 264 | ||
265 | __poll_t (*poll)(struct kernfs_open_file *of, | ||
266 | struct poll_table_struct *pt); | ||
267 | |||
264 | int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); | 268 | int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); |
265 | 269 | ||
266 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 270 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
@@ -350,6 +354,8 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, | |||
350 | int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, | 354 | int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, |
351 | const char *new_name, const void *new_ns); | 355 | const char *new_name, const void *new_ns); |
352 | int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); | 356 | int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); |
357 | __poll_t kernfs_generic_poll(struct kernfs_open_file *of, | ||
358 | struct poll_table_struct *pt); | ||
353 | void kernfs_notify(struct kernfs_node *kn); | 359 | void kernfs_notify(struct kernfs_node *kn); |
354 | 360 | ||
355 | const void *kernfs_super_ns(struct super_block *sb); | 361 | const void *kernfs_super_ns(struct super_block *sb); |
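A hedged sketch of how a kernfs user might wire up the new callback; the ops structure and the trivial seq_show routine below are hypothetical, only the .poll hook and kernfs_generic_poll() come from this change:

#include <linux/kernfs.h>
#include <linux/poll.h>
#include <linux/seq_file.h>

static int example_seq_show(struct seq_file *sf, void *v)
{
        seq_puts(sf, "example\n");
        return 0;
}

static __poll_t example_poll(struct kernfs_open_file *of,
                             struct poll_table_struct *pt)
{
        /* Keep the historic behaviour: wake pollers on kernfs_notify(). */
        return kernfs_generic_poll(of, pt);
}

static struct kernfs_ops example_kf_ops = {
        .seq_show = example_seq_show,
        .poll     = example_poll,
};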
diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 161e8164abcf..e48b1e453ff5 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h | |||
@@ -53,6 +53,8 @@ struct page *ksm_might_need_to_copy(struct page *page, | |||
53 | 53 | ||
54 | void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); | 54 | void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); |
55 | void ksm_migrate_page(struct page *newpage, struct page *oldpage); | 55 | void ksm_migrate_page(struct page *newpage, struct page *oldpage); |
56 | bool reuse_ksm_page(struct page *page, | ||
57 | struct vm_area_struct *vma, unsigned long address); | ||
56 | 58 | ||
57 | #else /* !CONFIG_KSM */ | 59 | #else /* !CONFIG_KSM */ |
58 | 60 | ||
@@ -86,6 +88,11 @@ static inline void rmap_walk_ksm(struct page *page, | |||
86 | static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) | 88 | static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) |
87 | { | 89 | { |
88 | } | 90 | } |
91 | static inline bool reuse_ksm_page(struct page *page, | ||
92 | struct vm_area_struct *vma, unsigned long address) | ||
93 | { | ||
94 | return false; | ||
95 | } | ||
89 | #endif /* CONFIG_MMU */ | 96 | #endif /* CONFIG_MMU */ |
90 | #endif /* !CONFIG_KSM */ | 97 | #endif /* !CONFIG_KSM */ |
91 | 98 | ||
diff --git a/include/linux/list.h b/include/linux/list.h index edb7628e46ed..79626b5ab36c 100644 --- a/include/linux/list.h +++ b/include/linux/list.h | |||
@@ -207,6 +207,17 @@ static inline void list_bulk_move_tail(struct list_head *head, | |||
207 | } | 207 | } |
208 | 208 | ||
209 | /** | 209 | /** |
210 | * list_is_first -- tests whether @list is the first entry in list @head | ||
211 | * @list: the entry to test | ||
212 | * @head: the head of the list | ||
213 | */ | ||
214 | static inline int list_is_first(const struct list_head *list, | ||
215 | const struct list_head *head) | ||
216 | { | ||
217 | return list->prev == head; | ||
218 | } | ||
219 | |||
220 | /** | ||
210 | * list_is_last - tests whether @list is the last entry in list @head | 221 | * list_is_last - tests whether @list is the last entry in list @head |
211 | * @list: the entry to test | 222 | * @list: the entry to test |
212 | * @head: the head of the list | 223 | * @head: the head of the list |
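A quick illustration of the new helper; the item structure and queue below are made up for the example:

#include <linux/list.h>
#include <linux/types.h>

struct example_item {
        struct list_head node;
        int val;
};

/* True when no other item precedes @it on @queue. */
static inline bool example_is_head_of_queue(struct example_item *it,
                                            struct list_head *queue)
{
        return list_is_first(&it->node, queue);
}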
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 83ae11cbd12c..1f3d880b7ca1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -429,6 +429,11 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) | |||
429 | } | 429 | } |
430 | struct mem_cgroup *mem_cgroup_from_id(unsigned short id); | 430 | struct mem_cgroup *mem_cgroup_from_id(unsigned short id); |
431 | 431 | ||
432 | static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) | ||
433 | { | ||
434 | return mem_cgroup_from_css(seq_css(m)); | ||
435 | } | ||
436 | |||
432 | static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) | 437 | static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) |
433 | { | 438 | { |
434 | struct mem_cgroup_per_node *mz; | 439 | struct mem_cgroup_per_node *mz; |
@@ -937,6 +942,11 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) | |||
937 | return NULL; | 942 | return NULL; |
938 | } | 943 | } |
939 | 944 | ||
945 | static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) | ||
946 | { | ||
947 | return NULL; | ||
948 | } | ||
949 | |||
940 | static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) | 950 | static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) |
941 | { | 951 | { |
942 | return NULL; | 952 | return NULL; |
@@ -1273,12 +1283,12 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) | |||
1273 | 1283 | ||
1274 | struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); | 1284 | struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); |
1275 | void memcg_kmem_put_cache(struct kmem_cache *cachep); | 1285 | void memcg_kmem_put_cache(struct kmem_cache *cachep); |
1276 | int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, | ||
1277 | struct mem_cgroup *memcg); | ||
1278 | 1286 | ||
1279 | #ifdef CONFIG_MEMCG_KMEM | 1287 | #ifdef CONFIG_MEMCG_KMEM |
1280 | int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); | 1288 | int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); |
1281 | void memcg_kmem_uncharge(struct page *page, int order); | 1289 | void __memcg_kmem_uncharge(struct page *page, int order); |
1290 | int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, | ||
1291 | struct mem_cgroup *memcg); | ||
1282 | 1292 | ||
1283 | extern struct static_key_false memcg_kmem_enabled_key; | 1293 | extern struct static_key_false memcg_kmem_enabled_key; |
1284 | extern struct workqueue_struct *memcg_kmem_cache_wq; | 1294 | extern struct workqueue_struct *memcg_kmem_cache_wq; |
@@ -1300,6 +1310,26 @@ static inline bool memcg_kmem_enabled(void) | |||
1300 | return static_branch_unlikely(&memcg_kmem_enabled_key); | 1310 | return static_branch_unlikely(&memcg_kmem_enabled_key); |
1301 | } | 1311 | } |
1302 | 1312 | ||
1313 | static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) | ||
1314 | { | ||
1315 | if (memcg_kmem_enabled()) | ||
1316 | return __memcg_kmem_charge(page, gfp, order); | ||
1317 | return 0; | ||
1318 | } | ||
1319 | |||
1320 | static inline void memcg_kmem_uncharge(struct page *page, int order) | ||
1321 | { | ||
1322 | if (memcg_kmem_enabled()) | ||
1323 | __memcg_kmem_uncharge(page, order); | ||
1324 | } | ||
1325 | |||
1326 | static inline int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, | ||
1327 | int order, struct mem_cgroup *memcg) | ||
1328 | { | ||
1329 | if (memcg_kmem_enabled()) | ||
1330 | return __memcg_kmem_charge_memcg(page, gfp, order, memcg); | ||
1331 | return 0; | ||
1332 | } | ||
1303 | /* | 1333 | /* |
1304 | * helper for accessing a memcg's index. It will be used as an index in the | 1334 | * helper for accessing a memcg's index. It will be used as an index in the |
1305 | * child cache array in kmem_cache, and also to derive its name. This function | 1335 | * child cache array in kmem_cache, and also to derive its name. This function |
@@ -1325,6 +1355,15 @@ static inline void memcg_kmem_uncharge(struct page *page, int order) | |||
1325 | { | 1355 | { |
1326 | } | 1356 | } |
1327 | 1357 | ||
1358 | static inline int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) | ||
1359 | { | ||
1360 | return 0; | ||
1361 | } | ||
1362 | |||
1363 | static inline void __memcg_kmem_uncharge(struct page *page, int order) | ||
1364 | { | ||
1365 | } | ||
1366 | |||
1328 | #define for_each_memcg_cache_index(_idx) \ | 1367 | #define for_each_memcg_cache_index(_idx) \ |
1329 | for (; NULL; ) | 1368 | for (; NULL; ) |
1330 | 1369 | ||
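The net effect is that callers keep using memcg_kmem_charge()/memcg_kmem_uncharge() while the memcg_kmem_enabled() static-branch test now happens in the inline wrapper instead of inside the callee. A hedged sketch of a caller; the function names and the absence of __GFP_ACCOUNT filtering are simplifications, not from this patch:

#include <linux/gfp.h>
#include <linux/memcontrol.h>

/* Allocate a kernel page and charge it to the current memcg, if any. */
static struct page *example_alloc_accounted_page(gfp_t gfp, int order)
{
        struct page *page = alloc_pages(gfp, order);

        if (page && memcg_kmem_charge(page, gfp, order)) {
                __free_pages(page, order);
                return NULL;
        }
        return page;
}

/* Freeing side: uncharge before handing the page back to the allocator. */
static void example_free_accounted_page(struct page *page, int order)
{
        memcg_kmem_uncharge(page, order);
        __free_pages(page, order);
}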
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 368267c1b71b..52869d6d38b3 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h | |||
@@ -89,7 +89,7 @@ extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, | |||
89 | unsigned long *valid_start, unsigned long *valid_end); | 89 | unsigned long *valid_start, unsigned long *valid_end); |
90 | extern void __offline_isolated_pages(unsigned long, unsigned long); | 90 | extern void __offline_isolated_pages(unsigned long, unsigned long); |
91 | 91 | ||
92 | typedef void (*online_page_callback_t)(struct page *page); | 92 | typedef void (*online_page_callback_t)(struct page *page, unsigned int order); |
93 | 93 | ||
94 | extern int set_online_page_callback(online_page_callback_t callback); | 94 | extern int set_online_page_callback(online_page_callback_t callback); |
95 | extern int restore_online_page_callback(online_page_callback_t callback); | 95 | extern int restore_online_page_callback(online_page_callback_t callback); |
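Existing users of set_online_page_callback() (the Hyper-V and Xen balloon drivers) now receive a whole 2^order range per call instead of a single page. A minimal, mostly-commented sketch of the new callback shape; the name is hypothetical:

#include <linux/memory_hotplug.h>
#include <linux/mm.h>

/* Hypothetical driver hook, invoked for each 2^order chunk being onlined. */
static void example_online_page(struct page *page, unsigned int order)
{
        /*
         * A driver can now decide per chunk whether to hand the whole
         * range back to the page allocator or keep it in a private pool,
         * where the old callback only ever saw one page at a time.
         */
}

/* Registered/unregistered as before:
 *      set_online_page_callback(example_online_page);
 *      restore_online_page_callback(example_online_page);
 */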
diff --git a/include/linux/mm.h b/include/linux/mm.h index 80bb6408fe73..20ec56f8e2bb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -1536,7 +1536,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, | |||
1536 | unsigned int gup_flags, struct page **pages, int *locked); | 1536 | unsigned int gup_flags, struct page **pages, int *locked); |
1537 | long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, | 1537 | long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, |
1538 | struct page **pages, unsigned int gup_flags); | 1538 | struct page **pages, unsigned int gup_flags); |
1539 | #ifdef CONFIG_FS_DAX | 1539 | |
1540 | #if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA) | ||
1540 | long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, | 1541 | long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, |
1541 | unsigned int gup_flags, struct page **pages, | 1542 | unsigned int gup_flags, struct page **pages, |
1542 | struct vm_area_struct **vmas); | 1543 | struct vm_area_struct **vmas); |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0a36a22228e7..ab9b48420200 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -80,7 +80,7 @@ struct page { | |||
80 | struct { /* Page cache and anonymous pages */ | 80 | struct { /* Page cache and anonymous pages */ |
81 | /** | 81 | /** |
82 | * @lru: Pageout list, eg. active_list protected by | 82 | * @lru: Pageout list, eg. active_list protected by |
83 | * zone_lru_lock. Sometimes used as a generic list | 83 | * pgdat->lru_lock. Sometimes used as a generic list |
84 | * by the page owner. | 84 | * by the page owner. |
85 | */ | 85 | */ |
86 | struct list_head lru; | 86 | struct list_head lru; |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 842f9189537b..fba7741533be 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -480,6 +480,8 @@ struct zone { | |||
480 | unsigned long compact_cached_free_pfn; | 480 | unsigned long compact_cached_free_pfn; |
481 | /* pfn where async and sync compaction migration scanner should start */ | 481 | /* pfn where async and sync compaction migration scanner should start */ |
482 | unsigned long compact_cached_migrate_pfn[2]; | 482 | unsigned long compact_cached_migrate_pfn[2]; |
483 | unsigned long compact_init_migrate_pfn; | ||
484 | unsigned long compact_init_free_pfn; | ||
483 | #endif | 485 | #endif |
484 | 486 | ||
485 | #ifdef CONFIG_COMPACTION | 487 | #ifdef CONFIG_COMPACTION |
@@ -728,10 +730,6 @@ typedef struct pglist_data { | |||
728 | 730 | ||
729 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) | 731 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) |
730 | #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) | 732 | #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) |
731 | static inline spinlock_t *zone_lru_lock(struct zone *zone) | ||
732 | { | ||
733 | return &zone->zone_pgdat->lru_lock; | ||
734 | } | ||
735 | 733 | ||
736 | static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) | 734 | static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) |
737 | { | 735 | { |
@@ -1299,7 +1297,7 @@ void memory_present(int nid, unsigned long start, unsigned long end); | |||
1299 | 1297 | ||
1300 | /* | 1298 | /* |
1301 | * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we | 1299 | * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we |
1302 | * need to check pfn validility within that MAX_ORDER_NR_PAGES block. | 1300 | * need to check pfn validity within that MAX_ORDER_NR_PAGES block. |
1303 | * pfn_valid_within() should be used in this case; we optimise this away | 1301 | * pfn_valid_within() should be used in this case; we optimise this away |
1304 | * when we have no holes within a MAX_ORDER_NR_PAGES block. | 1302 | * when we have no holes within a MAX_ORDER_NR_PAGES block. |
1305 | */ | 1303 | */ |
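With the zone_lru_lock() wrapper gone, code that used to take it now locks the node's lru_lock directly; a minimal sketch of the new pattern (the function is illustrative, not from this patch):

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/spinlock.h>

static void example_touch_lru(struct page *page)
{
        pg_data_t *pgdat = page_pgdat(page);

        spin_lock_irq(&pgdat->lru_lock);
        /* ... manipulate page->lru under the per-node LRU lock ... */
        spin_unlock_irq(&pgdat->lru_lock);
}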
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 5a30ad594ccc..27e7fa36f707 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h | |||
@@ -444,8 +444,8 @@ static inline int next_memory_node(int nid) | |||
444 | return next_node(nid, node_states[N_MEMORY]); | 444 | return next_node(nid, node_states[N_MEMORY]); |
445 | } | 445 | } |
446 | 446 | ||
447 | extern int nr_node_ids; | 447 | extern unsigned int nr_node_ids; |
448 | extern int nr_online_nodes; | 448 | extern unsigned int nr_online_nodes; |
449 | 449 | ||
450 | static inline void node_set_online(int nid) | 450 | static inline void node_set_online(int nid) |
451 | { | 451 | { |
@@ -485,8 +485,8 @@ static inline int num_node_state(enum node_states state) | |||
485 | #define first_online_node 0 | 485 | #define first_online_node 0 |
486 | #define first_memory_node 0 | 486 | #define first_memory_node 0 |
487 | #define next_online_node(nid) (MAX_NUMNODES) | 487 | #define next_online_node(nid) (MAX_NUMNODES) |
488 | #define nr_node_ids 1 | 488 | #define nr_node_ids 1U |
489 | #define nr_online_nodes 1 | 489 | #define nr_online_nodes 1U |
490 | 490 | ||
491 | #define node_set_online(node) node_set_state((node), N_ONLINE) | 491 | #define node_set_online(node) node_set_state((node), N_ONLINE) |
492 | #define node_set_offline(node) node_clear_state((node), N_ONLINE) | 492 | #define node_set_offline(node) node_clear_state((node), N_ONLINE) |
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 39b4494e29f1..9f8712a4b1a5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h | |||
@@ -17,8 +17,37 @@ | |||
17 | /* | 17 | /* |
18 | * Various page->flags bits: | 18 | * Various page->flags bits: |
19 | * | 19 | * |
20 | * PG_reserved is set for special pages, which can never be swapped out. Some | 20 | * PG_reserved is set for special pages. The "struct page" of such a page |
21 | * of them might not even exist... | 21 | * should in general not be touched (e.g. set dirty) except by its owner. |
22 | * Pages marked as PG_reserved include: | ||
23 | * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS, | ||
24 | * initrd, HW tables) | ||
25 | * - Pages reserved or allocated early during boot (before the page allocator | ||
26 | * was initialized). This includes (depending on the architecture) the | ||
27 | * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much | ||
28 | * much more. Once (if ever) freed, PG_reserved is cleared and they will | ||
29 | * be given to the page allocator. | ||
30 | * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying | ||
31 | * to read/write these pages might end badly. Don't touch! | ||
32 | * - The zero page(s) | ||
33 | * - Pages not added to the page allocator when onlining a section because | ||
34 | * they were excluded via the online_page_callback() or because they are | ||
35 | * PG_hwpoison. | ||
36 | * - Pages allocated in the context of kexec/kdump (loaded kernel image, | ||
37 | * control pages, vmcoreinfo) | ||
38 | * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are | ||
39 | * not marked PG_reserved (as they might be in use by somebody else who does | ||
40 | * not respect the caching strategy). | ||
41 | * - Pages part of an offline section (struct pages of offline sections should | ||
42 | * not be trusted as they will be initialized when first onlined). | ||
43 | * - MCA pages on ia64 | ||
44 | * - Pages holding CPU notes for POWER Firmware Assisted Dump | ||
45 | * - Device memory (e.g. PMEM, DAX, HMM) | ||
46 | * Some PG_reserved pages will be excluded from the hibernation image. | ||
47 | * PG_reserved does in general not hinder anybody from dumping or swapping | ||
48 | * and is no longer required for remap_pfn_range(). ioremap might require it. | ||
49 | * Consequently, PG_reserved for a page mapped into user space can indicate | ||
50 | * the zero page, the vDSO, MMIO pages or device memory. | ||
22 | * | 51 | * |
23 | * The PG_private bitflag is set on pagecache pages if they contain filesystem | 52 | * The PG_private bitflag is set on pagecache pages if they contain filesystem |
24 | * specific data (which is normally at page->private). It can be used by | 53 | * specific data (which is normally at page->private). It can be used by |
@@ -671,7 +700,7 @@ PAGEFLAG_FALSE(DoubleMap) | |||
671 | /* Reserve 0x0000007f to catch underflows of page_mapcount */ | 700 | /* Reserve 0x0000007f to catch underflows of page_mapcount */ |
672 | #define PAGE_MAPCOUNT_RESERVE -128 | 701 | #define PAGE_MAPCOUNT_RESERVE -128 |
673 | #define PG_buddy 0x00000080 | 702 | #define PG_buddy 0x00000080 |
674 | #define PG_balloon 0x00000100 | 703 | #define PG_offline 0x00000100 |
675 | #define PG_kmemcg 0x00000200 | 704 | #define PG_kmemcg 0x00000200 |
676 | #define PG_table 0x00000400 | 705 | #define PG_table 0x00000400 |
677 | 706 | ||
@@ -706,10 +735,13 @@ static __always_inline void __ClearPage##uname(struct page *page) \ | |||
706 | PAGE_TYPE_OPS(Buddy, buddy) | 735 | PAGE_TYPE_OPS(Buddy, buddy) |
707 | 736 | ||
708 | /* | 737 | /* |
709 | * PageBalloon() is true for pages that are on the balloon page list | 738 | * PageOffline() indicates that the page is logically offline although the |
710 | * (see mm/balloon_compaction.c). | 739 | * containing section is online. (e.g. inflated in a balloon driver or |
740 | * not onlined when onlining the section). | ||
741 | * The content of these pages is effectively stale. Such pages should not | ||
742 | * be touched (read/write/dump/save) except by their owner. | ||
711 | */ | 743 | */ |
712 | PAGE_TYPE_OPS(Balloon, balloon) | 744 | PAGE_TYPE_OPS(Offline, offline) |
713 | 745 | ||
714 | /* | 746 | /* |
715 | * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on | 747 | * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on |
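A sketch of how a balloon-style driver is expected to use the renamed page type; the driver functions are hypothetical, only the PageOffline accessors come from this change:

#include <linux/mm.h>
#include <linux/page-flags.h>

/* Mark an inflated page so dump and hibernation code treat it as stale. */
static void example_balloon_inflate_one(struct page *page)
{
        __SetPageOffline(page);
}

static void example_balloon_deflate_one(struct page *page)
{
        __ClearPageOffline(page);
}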
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e2d7039af6a3..b477a70cc2e4 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h | |||
@@ -164,7 +164,7 @@ void release_pages(struct page **pages, int nr); | |||
164 | * will find the page or it will not. Likewise, the old find_get_page could run | 164 | * will find the page or it will not. Likewise, the old find_get_page could run |
165 | * either before the insertion or afterwards, depending on timing. | 165 | * either before the insertion or afterwards, depending on timing. |
166 | */ | 166 | */ |
167 | static inline int page_cache_get_speculative(struct page *page) | 167 | static inline int __page_cache_add_speculative(struct page *page, int count) |
168 | { | 168 | { |
169 | #ifdef CONFIG_TINY_RCU | 169 | #ifdef CONFIG_TINY_RCU |
170 | # ifdef CONFIG_PREEMPT_COUNT | 170 | # ifdef CONFIG_PREEMPT_COUNT |
@@ -180,10 +180,10 @@ static inline int page_cache_get_speculative(struct page *page) | |||
180 | * SMP requires. | 180 | * SMP requires. |
181 | */ | 181 | */ |
182 | VM_BUG_ON_PAGE(page_count(page) == 0, page); | 182 | VM_BUG_ON_PAGE(page_count(page) == 0, page); |
183 | page_ref_inc(page); | 183 | page_ref_add(page, count); |
184 | 184 | ||
185 | #else | 185 | #else |
186 | if (unlikely(!get_page_unless_zero(page))) { | 186 | if (unlikely(!page_ref_add_unless(page, count, 0))) { |
187 | /* | 187 | /* |
188 | * Either the page has been freed, or will be freed. | 188 | * Either the page has been freed, or will be freed. |
189 | * In either case, retry here and the caller should | 189 | * In either case, retry here and the caller should |
@@ -197,27 +197,14 @@ static inline int page_cache_get_speculative(struct page *page) | |||
197 | return 1; | 197 | return 1; |
198 | } | 198 | } |
199 | 199 | ||
200 | /* | 200 | static inline int page_cache_get_speculative(struct page *page) |
201 | * Same as above, but add instead of inc (could just be merged) | ||
202 | */ | ||
203 | static inline int page_cache_add_speculative(struct page *page, int count) | ||
204 | { | 201 | { |
205 | VM_BUG_ON(in_interrupt()); | 202 | return __page_cache_add_speculative(page, 1); |
206 | 203 | } | |
207 | #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) | ||
208 | # ifdef CONFIG_PREEMPT_COUNT | ||
209 | VM_BUG_ON(!in_atomic() && !irqs_disabled()); | ||
210 | # endif | ||
211 | VM_BUG_ON_PAGE(page_count(page) == 0, page); | ||
212 | page_ref_add(page, count); | ||
213 | |||
214 | #else | ||
215 | if (unlikely(!page_ref_add_unless(page, count, 0))) | ||
216 | return 0; | ||
217 | #endif | ||
218 | VM_BUG_ON_PAGE(PageCompound(page) && page != compound_head(page), page); | ||
219 | 204 | ||
220 | return 1; | 205 | static inline int page_cache_add_speculative(struct page *page, int count) |
206 | { | ||
207 | return __page_cache_add_speculative(page, count); | ||
221 | } | 208 | } |
222 | 209 | ||
223 | #ifdef CONFIG_NUMA | 210 | #ifdef CONFIG_NUMA |
diff --git a/include/linux/poison.h b/include/linux/poison.h index 15927ebc22f2..5046bad0c1c5 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h | |||
@@ -30,7 +30,7 @@ | |||
30 | */ | 30 | */ |
31 | #define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) | 31 | #define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) |
32 | 32 | ||
33 | /********** mm/debug-pagealloc.c **********/ | 33 | /********** mm/page_poison.c **********/ |
34 | #ifdef CONFIG_PAGE_POISONING_ZERO | 34 | #ifdef CONFIG_PAGE_POISONING_ZERO |
35 | #define PAGE_POISON 0x00 | 35 | #define PAGE_POISON 0x00 |
36 | #else | 36 | #else |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 903ef29b62c3..f073bd59df32 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -48,6 +48,7 @@ struct pid_namespace; | |||
48 | struct pipe_inode_info; | 48 | struct pipe_inode_info; |
49 | struct rcu_node; | 49 | struct rcu_node; |
50 | struct reclaim_state; | 50 | struct reclaim_state; |
51 | struct capture_control; | ||
51 | struct robust_list_head; | 52 | struct robust_list_head; |
52 | struct sched_attr; | 53 | struct sched_attr; |
53 | struct sched_param; | 54 | struct sched_param; |
@@ -950,6 +951,9 @@ struct task_struct { | |||
950 | 951 | ||
951 | struct io_context *io_context; | 952 | struct io_context *io_context; |
952 | 953 | ||
954 | #ifdef CONFIG_COMPACTION | ||
955 | struct capture_control *capture_control; | ||
956 | #endif | ||
953 | /* Ptrace state: */ | 957 | /* Ptrace state: */ |
954 | unsigned long ptrace_message; | 958 | unsigned long ptrace_message; |
955 | kernel_siginfo_t *last_siginfo; | 959 | kernel_siginfo_t *last_siginfo; |
@@ -1395,6 +1399,7 @@ extern struct pid *cad_pid; | |||
1395 | #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ | 1399 | #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ |
1396 | #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ | 1400 | #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ |
1397 | #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ | 1401 | #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ |
1402 | #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ | ||
1398 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ | 1403 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ |
1399 | #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ | 1404 | #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ |
1400 | 1405 | ||
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3bfa6a0cbba4..0cd9f10423fb 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h | |||
@@ -148,17 +148,25 @@ static inline bool in_vfork(struct task_struct *tsk) | |||
148 | * Applies per-task gfp context to the given allocation flags. | 148 | * Applies per-task gfp context to the given allocation flags. |
149 | * PF_MEMALLOC_NOIO implies GFP_NOIO | 149 | * PF_MEMALLOC_NOIO implies GFP_NOIO |
150 | * PF_MEMALLOC_NOFS implies GFP_NOFS | 150 | * PF_MEMALLOC_NOFS implies GFP_NOFS |
151 | * PF_MEMALLOC_NOCMA implies no allocation from CMA region. | ||
151 | */ | 152 | */ |
152 | static inline gfp_t current_gfp_context(gfp_t flags) | 153 | static inline gfp_t current_gfp_context(gfp_t flags) |
153 | { | 154 | { |
154 | /* | 155 | if (unlikely(current->flags & |
155 | * NOIO implies both NOIO and NOFS and it is a weaker context | 156 | (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_NOCMA))) { |
156 | * so always make sure it makes precedence | 157 | /* |
157 | */ | 158 | * NOIO implies both NOIO and NOFS and it is a weaker context |
158 | if (unlikely(current->flags & PF_MEMALLOC_NOIO)) | 159 | * so always make sure it makes precedence |
159 | flags &= ~(__GFP_IO | __GFP_FS); | 160 | */ |
160 | else if (unlikely(current->flags & PF_MEMALLOC_NOFS)) | 161 | if (current->flags & PF_MEMALLOC_NOIO) |
161 | flags &= ~__GFP_FS; | 162 | flags &= ~(__GFP_IO | __GFP_FS); |
163 | else if (current->flags & PF_MEMALLOC_NOFS) | ||
164 | flags &= ~__GFP_FS; | ||
165 | #ifdef CONFIG_CMA | ||
166 | if (current->flags & PF_MEMALLOC_NOCMA) | ||
167 | flags &= ~__GFP_MOVABLE; | ||
168 | #endif | ||
169 | } | ||
162 | return flags; | 170 | return flags; |
163 | } | 171 | } |
164 | 172 | ||
@@ -248,6 +256,30 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) | |||
248 | current->flags = (current->flags & ~PF_MEMALLOC) | flags; | 256 | current->flags = (current->flags & ~PF_MEMALLOC) | flags; |
249 | } | 257 | } |
250 | 258 | ||
259 | #ifdef CONFIG_CMA | ||
260 | static inline unsigned int memalloc_nocma_save(void) | ||
261 | { | ||
262 | unsigned int flags = current->flags & PF_MEMALLOC_NOCMA; | ||
263 | |||
264 | current->flags |= PF_MEMALLOC_NOCMA; | ||
265 | return flags; | ||
266 | } | ||
267 | |||
268 | static inline void memalloc_nocma_restore(unsigned int flags) | ||
269 | { | ||
270 | current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags; | ||
271 | } | ||
272 | #else | ||
273 | static inline unsigned int memalloc_nocma_save(void) | ||
274 | { | ||
275 | return 0; | ||
276 | } | ||
277 | |||
278 | static inline void memalloc_nocma_restore(unsigned int flags) | ||
279 | { | ||
280 | } | ||
281 | #endif | ||
282 | |||
251 | #ifdef CONFIG_MEMCG | 283 | #ifdef CONFIG_MEMCG |
252 | /** | 284 | /** |
253 | * memalloc_use_memcg - Starts the remote memcg charging scope. | 285 | * memalloc_use_memcg - Starts the remote memcg charging scope. |
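The new pair follows the same scope pattern as memalloc_noio_save()/memalloc_noio_restore(). A hedged sketch of a caller (the allocation in the middle is only an example; the patch's real user is the long-term GUP path):

#include <linux/gfp.h>
#include <linux/sched/mm.h>

static struct page *example_alloc_outside_cma(void)
{
        unsigned int nocma_flags = memalloc_nocma_save();
        struct page *page;

        /*
         * Within this scope the allocator is asked (via current_gfp_context()
         * clearing __GFP_MOVABLE) to keep the allocation out of CMA areas.
         */
        page = alloc_page(GFP_HIGHUSER_MOVABLE);

        memalloc_nocma_restore(nocma_flags);
        return page;
}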
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f155dc607112..f3fb1edb3526 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h | |||
@@ -72,7 +72,8 @@ extern void shmem_unlock_mapping(struct address_space *mapping); | |||
72 | extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, | 72 | extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, |
73 | pgoff_t index, gfp_t gfp_mask); | 73 | pgoff_t index, gfp_t gfp_mask); |
74 | extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); | 74 | extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); |
75 | extern int shmem_unuse(swp_entry_t entry, struct page *page); | 75 | extern int shmem_unuse(unsigned int type, bool frontswap, |
76 | unsigned long *fs_pages_to_unuse); | ||
76 | 77 | ||
77 | extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); | 78 | extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); |
78 | extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, | 79 | extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, |
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 3a1a1dbc6f49..d2153789bd9f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h | |||
@@ -81,12 +81,12 @@ struct kmem_cache_order_objects { | |||
81 | */ | 81 | */ |
82 | struct kmem_cache { | 82 | struct kmem_cache { |
83 | struct kmem_cache_cpu __percpu *cpu_slab; | 83 | struct kmem_cache_cpu __percpu *cpu_slab; |
84 | /* Used for retriving partial slabs etc */ | 84 | /* Used for retrieving partial slabs, etc. */ |
85 | slab_flags_t flags; | 85 | slab_flags_t flags; |
86 | unsigned long min_partial; | 86 | unsigned long min_partial; |
87 | unsigned int size; /* The size of an object including meta data */ | 87 | unsigned int size; /* The size of an object including metadata */ |
88 | unsigned int object_size;/* The size of an object without meta data */ | 88 | unsigned int object_size;/* The size of an object without metadata */ |
89 | unsigned int offset; /* Free pointer offset. */ | 89 | unsigned int offset; /* Free pointer offset */ |
90 | #ifdef CONFIG_SLUB_CPU_PARTIAL | 90 | #ifdef CONFIG_SLUB_CPU_PARTIAL |
91 | /* Number of per cpu partial objects to keep around */ | 91 | /* Number of per cpu partial objects to keep around */ |
92 | unsigned int cpu_partial; | 92 | unsigned int cpu_partial; |
@@ -110,7 +110,7 @@ struct kmem_cache { | |||
110 | #endif | 110 | #endif |
111 | #ifdef CONFIG_MEMCG | 111 | #ifdef CONFIG_MEMCG |
112 | struct memcg_cache_params memcg_params; | 112 | struct memcg_cache_params memcg_params; |
113 | /* for propagation, maximum size of a stored attr */ | 113 | /* For propagation, maximum size of a stored attr */ |
114 | unsigned int max_attr_size; | 114 | unsigned int max_attr_size; |
115 | #ifdef CONFIG_SYSFS | 115 | #ifdef CONFIG_SYSFS |
116 | struct kset *memcg_kset; | 116 | struct kset *memcg_kset; |
@@ -151,7 +151,7 @@ struct kmem_cache { | |||
151 | #else | 151 | #else |
152 | #define slub_cpu_partial(s) (0) | 152 | #define slub_cpu_partial(s) (0) |
153 | #define slub_set_cpu_partial(s, n) | 153 | #define slub_set_cpu_partial(s, n) |
154 | #endif // CONFIG_SLUB_CPU_PARTIAL | 154 | #endif /* CONFIG_SLUB_CPU_PARTIAL */ |
155 | 155 | ||
156 | #ifdef CONFIG_SYSFS | 156 | #ifdef CONFIG_SYSFS |
157 | #define SLAB_SUPPORTS_SYSFS | 157 | #define SLAB_SUPPORTS_SYSFS |
diff --git a/include/linux/swap.h b/include/linux/swap.h index 622025ac1461..fc50e21b3b88 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -307,7 +307,7 @@ struct vma_swap_readahead { | |||
307 | }; | 307 | }; |
308 | 308 | ||
309 | /* linux/mm/workingset.c */ | 309 | /* linux/mm/workingset.c */ |
310 | void *workingset_eviction(struct address_space *mapping, struct page *page); | 310 | void *workingset_eviction(struct page *page); |
311 | void workingset_refault(struct page *page, void *shadow); | 311 | void workingset_refault(struct page *page, void *shadow); |
312 | void workingset_activation(struct page *page); | 312 | void workingset_activation(struct page *page); |
313 | 313 | ||
@@ -625,7 +625,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) | |||
625 | return vm_swappiness; | 625 | return vm_swappiness; |
626 | 626 | ||
627 | /* root ? */ | 627 | /* root ? */ |
628 | if (mem_cgroup_disabled() || !memcg->css.parent) | 628 | if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg)) |
629 | return vm_swappiness; | 629 | return vm_swappiness; |
630 | 630 | ||
631 | return memcg->swappiness; | 631 | return memcg->swappiness; |
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 6448cdd9a350..a2f8658f1c55 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h | |||
@@ -41,6 +41,7 @@ | |||
41 | #define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ | 41 | #define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ |
42 | #define F_SEAL_GROW 0x0004 /* prevent file from growing */ | 42 | #define F_SEAL_GROW 0x0004 /* prevent file from growing */ |
43 | #define F_SEAL_WRITE 0x0008 /* prevent writes */ | 43 | #define F_SEAL_WRITE 0x0008 /* prevent writes */ |
44 | #define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ | ||
44 | /* (1U << 31) is reserved for signed error codes */ | 45 | /* (1U << 31) is reserved for signed error codes */ |
45 | 46 | ||
46 | /* | 47 | /* |
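The new seal is meant to be applied by the creator of a memfd before handing the fd out: write() and new writable mappings then fail for every holder of the fd, while mappings that already exist keep working. A hedged userspace sketch (it assumes a libc that exposes memfd_create(); the fallback #define simply mirrors the value added above):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef F_SEAL_FUTURE_WRITE
#define F_SEAL_FUTURE_WRITE 0x0010      /* matches the uapi value above */
#endif

int main(void)
{
        int fd = memfd_create("example", MFD_CLOEXEC | MFD_ALLOW_SEALING);

        if (fd < 0 || ftruncate(fd, 4096) < 0)
                return 1;

        /* From here on, new write attempts through this fd are refused. */
        if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0) {
                perror("F_ADD_SEALS");
                return 1;
        }
        return 0;
}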
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 21b9113c69da..6f2f2720f3ac 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h | |||
@@ -32,7 +32,7 @@ | |||
32 | 32 | ||
33 | #define KPF_KSM 21 | 33 | #define KPF_KSM 21 |
34 | #define KPF_THP 22 | 34 | #define KPF_THP 22 |
35 | #define KPF_BALLOON 23 | 35 | #define KPF_OFFLINE 23 |
36 | #define KPF_ZERO_PAGE 24 | 36 | #define KPF_ZERO_PAGE 24 |
37 | #define KPF_IDLE 25 | 37 | #define KPF_IDLE 25 |
38 | #define KPF_PGTABLE 26 | 38 | #define KPF_PGTABLE 26 |
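Userspace that decodes /proc/kpageflags (for example tools/vm/page-types.c) now sees bit 23 as "offline" rather than "balloon". A small hedged sketch of testing that bit for a given pfn:

#include <stdint.h>
#include <stdio.h>

#define KPF_OFFLINE 23          /* bit index from the uapi header above */

/* Returns 1 if @pfn is reported as offline, 0 if not, -1 on error. */
int pfn_is_offline(uint64_t pfn)
{
        uint64_t flags;
        FILE *f = fopen("/proc/kpageflags", "rb");

        if (!f)
                return -1;
        if (fseek(f, (long)(pfn * sizeof(flags)), SEEK_SET) != 0 ||
            fread(&flags, sizeof(flags), 1, f) != 1) {
                fclose(f);
                return -1;
        }
        fclose(f);
        return !!(flags & (1ULL << KPF_OFFLINE));
}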
diff --git a/init/init_task.c b/init/init_task.c index 46dbf546264d..df0257c5928c 100644 --- a/init/init_task.c +++ b/init/init_task.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/fs.h> | 10 | #include <linux/fs.h> |
11 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
12 | #include <linux/audit.h> | 12 | #include <linux/audit.h> |
13 | #include <linux/numa.h> | ||
13 | 14 | ||
14 | #include <asm/pgtable.h> | 15 | #include <asm/pgtable.h> |
15 | #include <linux/uaccess.h> | 16 | #include <linux/uaccess.h> |
@@ -154,7 +155,7 @@ struct task_struct init_task | |||
154 | .vtime.state = VTIME_SYS, | 155 | .vtime.state = VTIME_SYS, |
155 | #endif | 156 | #endif |
156 | #ifdef CONFIG_NUMA_BALANCING | 157 | #ifdef CONFIG_NUMA_BALANCING |
157 | .numa_preferred_nid = -1, | 158 | .numa_preferred_nid = NUMA_NO_NODE, |
158 | .numa_group = NULL, | 159 | .numa_group = NULL, |
159 | .numa_faults = NULL, | 160 | .numa_faults = NULL, |
160 | #endif | 161 | #endif |
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cef98502b124..17828333f7c3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c | |||
@@ -3534,6 +3534,16 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, | |||
3534 | return ret ?: nbytes; | 3534 | return ret ?: nbytes; |
3535 | } | 3535 | } |
3536 | 3536 | ||
3537 | static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) | ||
3538 | { | ||
3539 | struct cftype *cft = of->kn->priv; | ||
3540 | |||
3541 | if (cft->poll) | ||
3542 | return cft->poll(of, pt); | ||
3543 | |||
3544 | return kernfs_generic_poll(of, pt); | ||
3545 | } | ||
3546 | |||
3537 | static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) | 3547 | static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) |
3538 | { | 3548 | { |
3539 | return seq_cft(seq)->seq_start(seq, ppos); | 3549 | return seq_cft(seq)->seq_start(seq, ppos); |
@@ -3572,6 +3582,7 @@ static struct kernfs_ops cgroup_kf_single_ops = { | |||
3572 | .open = cgroup_file_open, | 3582 | .open = cgroup_file_open, |
3573 | .release = cgroup_file_release, | 3583 | .release = cgroup_file_release, |
3574 | .write = cgroup_file_write, | 3584 | .write = cgroup_file_write, |
3585 | .poll = cgroup_file_poll, | ||
3575 | .seq_show = cgroup_seqfile_show, | 3586 | .seq_show = cgroup_seqfile_show, |
3576 | }; | 3587 | }; |
3577 | 3588 | ||
@@ -3580,6 +3591,7 @@ static struct kernfs_ops cgroup_kf_ops = { | |||
3580 | .open = cgroup_file_open, | 3591 | .open = cgroup_file_open, |
3581 | .release = cgroup_file_release, | 3592 | .release = cgroup_file_release, |
3582 | .write = cgroup_file_write, | 3593 | .write = cgroup_file_write, |
3594 | .poll = cgroup_file_poll, | ||
3583 | .seq_start = cgroup_seqfile_start, | 3595 | .seq_start = cgroup_seqfile_start, |
3584 | .seq_next = cgroup_seqfile_next, | 3596 | .seq_next = cgroup_seqfile_next, |
3585 | .seq_stop = cgroup_seqfile_stop, | 3597 | .seq_stop = cgroup_seqfile_stop, |
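With the hook in place a cftype can supply its own poll semantics, while everything else keeps the historic kernfs behaviour of waking pollers with POLLPRI|POLLERR when the file is notified. A hedged userspace sketch of waiting for such a notification (the cgroup path is just an example):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/sys/fs/cgroup/mygroup/cgroup.events", O_RDONLY);
        struct pollfd pfd = { .fd = fd, .events = POLLPRI };

        if (fd < 0)
                return 1;
        if (poll(&pfd, 1, -1) == 1 && (pfd.revents & POLLPRI))
                printf("cgroup.events changed\n");
        close(fd);
        return 0;
}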
diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 933cb3e45b98..093c9f917ed0 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c | |||
@@ -464,6 +464,8 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
464 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); | 464 | VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); |
465 | #ifdef CONFIG_HUGETLB_PAGE | 465 | #ifdef CONFIG_HUGETLB_PAGE |
466 | VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); | 466 | VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); |
467 | #define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) | ||
468 | VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); | ||
467 | #endif | 469 | #endif |
468 | 470 | ||
469 | arch_crash_save_vmcoreinfo(); | 471 | arch_crash_save_vmcoreinfo(); |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 9cf20cc5ebe3..5942eeafb9ac 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/freezer.h> | 20 | #include <linux/freezer.h> |
21 | #include <linux/ptrace.h> | 21 | #include <linux/ptrace.h> |
22 | #include <linux/uaccess.h> | 22 | #include <linux/uaccess.h> |
23 | #include <linux/numa.h> | ||
23 | #include <trace/events/sched.h> | 24 | #include <trace/events/sched.h> |
24 | 25 | ||
25 | static DEFINE_SPINLOCK(kthread_create_lock); | 26 | static DEFINE_SPINLOCK(kthread_create_lock); |
@@ -681,7 +682,7 @@ __kthread_create_worker(int cpu, unsigned int flags, | |||
681 | { | 682 | { |
682 | struct kthread_worker *worker; | 683 | struct kthread_worker *worker; |
683 | struct task_struct *task; | 684 | struct task_struct *task; |
684 | int node = -1; | 685 | int node = NUMA_NO_NODE; |
685 | 686 | ||
686 | worker = kzalloc(sizeof(*worker), GFP_KERNEL); | 687 | worker = kzalloc(sizeof(*worker), GFP_KERNEL); |
687 | if (!worker) | 688 | if (!worker) |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 640b2034edd6..4802b039b89f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -1215,14 +1215,16 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) | |||
1215 | if (!pfn_valid(pfn)) | 1215 | if (!pfn_valid(pfn)) |
1216 | return NULL; | 1216 | return NULL; |
1217 | 1217 | ||
1218 | page = pfn_to_page(pfn); | 1218 | page = pfn_to_online_page(pfn); |
1219 | if (page_zone(page) != zone) | 1219 | if (!page || page_zone(page) != zone) |
1220 | return NULL; | 1220 | return NULL; |
1221 | 1221 | ||
1222 | BUG_ON(!PageHighMem(page)); | 1222 | BUG_ON(!PageHighMem(page)); |
1223 | 1223 | ||
1224 | if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || | 1224 | if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) |
1225 | PageReserved(page)) | 1225 | return NULL; |
1226 | |||
1227 | if (PageReserved(page) || PageOffline(page)) | ||
1226 | return NULL; | 1228 | return NULL; |
1227 | 1229 | ||
1228 | if (page_is_guard(page)) | 1230 | if (page_is_guard(page)) |
@@ -1277,8 +1279,8 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) | |||
1277 | if (!pfn_valid(pfn)) | 1279 | if (!pfn_valid(pfn)) |
1278 | return NULL; | 1280 | return NULL; |
1279 | 1281 | ||
1280 | page = pfn_to_page(pfn); | 1282 | page = pfn_to_online_page(pfn); |
1281 | if (page_zone(page) != zone) | 1283 | if (!page || page_zone(page) != zone) |
1282 | return NULL; | 1284 | return NULL; |
1283 | 1285 | ||
1284 | BUG_ON(PageHighMem(page)); | 1286 | BUG_ON(PageHighMem(page)); |
@@ -1286,6 +1288,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) | |||
1286 | if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) | 1288 | if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) |
1287 | return NULL; | 1289 | return NULL; |
1288 | 1290 | ||
1291 | if (PageOffline(page)) | ||
1292 | return NULL; | ||
1293 | |||
1289 | if (PageReserved(page) | 1294 | if (PageReserved(page) |
1290 | && (!kernel_page_present(page) || pfn_is_nosave(pfn))) | 1295 | && (!kernel_page_present(page) || pfn_is_nosave(pfn))) |
1291 | return NULL; | 1296 | return NULL; |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f3901b84d217..ead464a0f2e5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -2220,6 +2220,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2220 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 2220 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
2221 | #endif | 2221 | #endif |
2222 | 2222 | ||
2223 | #ifdef CONFIG_COMPACTION | ||
2224 | p->capture_control = NULL; | ||
2225 | #endif | ||
2223 | init_numa_balancing(clone_flags, p); | 2226 | init_numa_balancing(clone_flags, p); |
2224 | } | 2227 | } |
2225 | 2228 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8213ff6e365d..ea74d43924b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -1173,7 +1173,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) | |||
1173 | 1173 | ||
1174 | /* New address space, reset the preferred nid */ | 1174 | /* New address space, reset the preferred nid */ |
1175 | if (!(clone_flags & CLONE_VM)) { | 1175 | if (!(clone_flags & CLONE_VM)) { |
1176 | p->numa_preferred_nid = -1; | 1176 | p->numa_preferred_nid = NUMA_NO_NODE; |
1177 | return; | 1177 | return; |
1178 | } | 1178 | } |
1179 | 1179 | ||
@@ -1193,13 +1193,13 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) | |||
1193 | 1193 | ||
1194 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) | 1194 | static void account_numa_enqueue(struct rq *rq, struct task_struct *p) |
1195 | { | 1195 | { |
1196 | rq->nr_numa_running += (p->numa_preferred_nid != -1); | 1196 | rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); |
1197 | rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); | 1197 | rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p)); |
1198 | } | 1198 | } |
1199 | 1199 | ||
1200 | static void account_numa_dequeue(struct rq *rq, struct task_struct *p) | 1200 | static void account_numa_dequeue(struct rq *rq, struct task_struct *p) |
1201 | { | 1201 | { |
1202 | rq->nr_numa_running -= (p->numa_preferred_nid != -1); | 1202 | rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE); |
1203 | rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); | 1203 | rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p)); |
1204 | } | 1204 | } |
1205 | 1205 | ||
@@ -1413,7 +1413,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, | |||
1413 | * two full passes of the "multi-stage node selection" test that is | 1413 | * two full passes of the "multi-stage node selection" test that is |
1414 | * executed below. | 1414 | * executed below. |
1415 | */ | 1415 | */ |
1416 | if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) && | 1416 | if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) && |
1417 | (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) | 1417 | (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid))) |
1418 | return true; | 1418 | return true; |
1419 | 1419 | ||
@@ -1861,7 +1861,7 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
1861 | unsigned long interval = HZ; | 1861 | unsigned long interval = HZ; |
1862 | 1862 | ||
1863 | /* This task has no NUMA fault statistics yet */ | 1863 | /* This task has no NUMA fault statistics yet */ |
1864 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) | 1864 | if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults)) |
1865 | return; | 1865 | return; |
1866 | 1866 | ||
1867 | /* Periodically retry migrating the task to the preferred node */ | 1867 | /* Periodically retry migrating the task to the preferred node */ |
@@ -2108,7 +2108,7 @@ static int preferred_group_nid(struct task_struct *p, int nid) | |||
2108 | 2108 | ||
2109 | static void task_numa_placement(struct task_struct *p) | 2109 | static void task_numa_placement(struct task_struct *p) |
2110 | { | 2110 | { |
2111 | int seq, nid, max_nid = -1; | 2111 | int seq, nid, max_nid = NUMA_NO_NODE; |
2112 | unsigned long max_faults = 0; | 2112 | unsigned long max_faults = 0; |
2113 | unsigned long fault_types[2] = { 0, 0 }; | 2113 | unsigned long fault_types[2] = { 0, 0 }; |
2114 | unsigned long total_faults; | 2114 | unsigned long total_faults; |
@@ -2651,7 +2651,8 @@ static void update_scan_period(struct task_struct *p, int new_cpu) | |||
2651 | * the preferred node. | 2651 | * the preferred node. |
2652 | */ | 2652 | */ |
2653 | if (dst_nid == p->numa_preferred_nid || | 2653 | if (dst_nid == p->numa_preferred_nid || |
2654 | (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid)) | 2654 | (p->numa_preferred_nid != NUMA_NO_NODE && |
2655 | src_nid != p->numa_preferred_nid)) | ||
2655 | return; | 2656 | return; |
2656 | } | 2657 | } |
2657 | 2658 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7c2b9bc88ee8..14f30b4a1b64 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1471,7 +1471,7 @@ static struct ctl_table vm_table[] = { | |||
1471 | .data = &sysctl_extfrag_threshold, | 1471 | .data = &sysctl_extfrag_threshold, |
1472 | .maxlen = sizeof(int), | 1472 | .maxlen = sizeof(int), |
1473 | .mode = 0644, | 1473 | .mode = 0644, |
1474 | .proc_handler = sysctl_extfrag_handler, | 1474 | .proc_handler = proc_dointvec_minmax, |
1475 | .extra1 = &min_extfrag_threshold, | 1475 | .extra1 = &min_extfrag_threshold, |
1476 | .extra2 = &max_extfrag_threshold, | 1476 | .extra2 = &max_extfrag_threshold, |
1477 | }, | 1477 | }, |
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d4df5b24d75e..e6a7b01932e6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
@@ -222,7 +222,6 @@ config ENABLE_MUST_CHECK | |||
222 | config FRAME_WARN | 222 | config FRAME_WARN |
223 | int "Warn for stack frames larger than (needs gcc 4.4)" | 223 | int "Warn for stack frames larger than (needs gcc 4.4)" |
224 | range 0 8192 | 224 | range 0 8192 |
225 | default 3072 if KASAN_EXTRA | ||
226 | default 2048 if GCC_PLUGIN_LATENT_ENTROPY | 225 | default 2048 if GCC_PLUGIN_LATENT_ENTROPY |
227 | default 1280 if (!64BIT && PARISC) | 226 | default 1280 if (!64BIT && PARISC) |
228 | default 1024 if (!64BIT && !PARISC) | 227 | default 1024 if (!64BIT && !PARISC) |
@@ -266,23 +265,6 @@ config UNUSED_SYMBOLS | |||
266 | you really need it, and what the merge plan to the mainline kernel for | 265 | you really need it, and what the merge plan to the mainline kernel for |
267 | your module is. | 266 | your module is. |
268 | 267 | ||
269 | config PAGE_OWNER | ||
270 | bool "Track page owner" | ||
271 | depends on DEBUG_KERNEL && STACKTRACE_SUPPORT | ||
272 | select DEBUG_FS | ||
273 | select STACKTRACE | ||
274 | select STACKDEPOT | ||
275 | select PAGE_EXTENSION | ||
276 | help | ||
277 | This keeps track of what call chain is the owner of a page, may | ||
278 | help to find bare alloc_page(s) leaks. Even if you include this | ||
279 | feature on your build, it is disabled in default. You should pass | ||
280 | "page_owner=on" to boot parameter in order to enable it. Eats | ||
281 | a fair amount of memory if enabled. See tools/vm/page_owner_sort.c | ||
282 | for user-space helper. | ||
283 | |||
284 | If unsure, say N. | ||
285 | |||
286 | config DEBUG_FS | 268 | config DEBUG_FS |
287 | bool "Debug Filesystem" | 269 | bool "Debug Filesystem" |
288 | help | 270 | help |
@@ -1876,6 +1858,19 @@ config TEST_LKM | |||
1876 | 1858 | ||
1877 | If unsure, say N. | 1859 | If unsure, say N. |
1878 | 1860 | ||
1861 | config TEST_VMALLOC | ||
1862 | tristate "Test module for stress/performance analysis of vmalloc allocator" | ||
1863 | default n | ||
1864 | depends on MMU | ||
1865 | depends on m | ||
1866 | help | ||
1867 | This builds the "test_vmalloc" module, which should be used for | ||
1868 | stress and performance analysis, so that any new change to the | ||
1869 | vmalloc subsystem can be evaluated from a performance and | ||
1870 | stability point of view. | ||
1871 | |||
1872 | If unsure, say N. | ||
1873 | |||
1879 | config TEST_USER_COPY | 1874 | config TEST_USER_COPY |
1880 | tristate "Test user/kernel boundary protections" | 1875 | tristate "Test user/kernel boundary protections" |
1881 | depends on m | 1876 | depends on m |
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 9737059ec58b..9950b660e62d 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan | |||
@@ -78,16 +78,6 @@ config KASAN_SW_TAGS | |||
78 | 78 | ||
79 | endchoice | 79 | endchoice |
80 | 80 | ||
81 | config KASAN_EXTRA | ||
82 | bool "KASAN: extra checks" | ||
83 | depends on KASAN_GENERIC && DEBUG_KERNEL && !COMPILE_TEST | ||
84 | help | ||
85 | This enables further checks in generic KASAN, for now it only | ||
86 | includes the address-use-after-scope check that can lead to | ||
87 | excessive kernel stack usage, frame size warnings and longer | ||
88 | compile time. | ||
89 | See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81715 | ||
90 | |||
91 | choice | 81 | choice |
92 | prompt "Instrumentation type" | 82 | prompt "Instrumentation type" |
93 | depends on KASAN | 83 | depends on KASAN |
diff --git a/lib/Makefile b/lib/Makefile index e1b59da71418..cbfacd55aeca 100644 --- a/lib/Makefile +++ b/lib/Makefile | |||
@@ -60,6 +60,7 @@ UBSAN_SANITIZE_test_ubsan.o := y | |||
60 | obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o | 60 | obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o |
61 | obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o | 61 | obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o |
62 | obj-$(CONFIG_TEST_LKM) += test_module.o | 62 | obj-$(CONFIG_TEST_LKM) += test_module.o |
63 | obj-$(CONFIG_TEST_VMALLOC) += test_vmalloc.o | ||
63 | obj-$(CONFIG_TEST_OVERFLOW) += test_overflow.o | 64 | obj-$(CONFIG_TEST_OVERFLOW) += test_overflow.o |
64 | obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o | 65 | obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o |
65 | obj-$(CONFIG_TEST_SORT) += test_sort.o | 66 | obj-$(CONFIG_TEST_SORT) += test_sort.o |
diff --git a/lib/cpumask.c b/lib/cpumask.c index 8d666ab84b5c..087a3e9a0202 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c | |||
@@ -5,6 +5,7 @@ | |||
5 | #include <linux/cpumask.h> | 5 | #include <linux/cpumask.h> |
6 | #include <linux/export.h> | 6 | #include <linux/export.h> |
7 | #include <linux/memblock.h> | 7 | #include <linux/memblock.h> |
8 | #include <linux/numa.h> | ||
8 | 9 | ||
9 | /** | 10 | /** |
10 | * cpumask_next - get the next cpu in a cpumask | 11 | * cpumask_next - get the next cpu in a cpumask |
@@ -206,7 +207,7 @@ unsigned int cpumask_local_spread(unsigned int i, int node) | |||
206 | /* Wrap: we always want a cpu. */ | 207 | /* Wrap: we always want a cpu. */ |
207 | i %= num_online_cpus(); | 208 | i %= num_online_cpus(); |
208 | 209 | ||
209 | if (node == -1) { | 210 | if (node == NUMA_NO_NODE) { |
210 | for_each_cpu(cpu, cpu_online_mask) | 211 | for_each_cpu(cpu, cpu_online_mask) |
211 | if (i-- == 0) | 212 | if (i-- == 0) |
212 | return cpu; | 213 | return cpu; |
diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 51b78405bf24..7de2702621dc 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c | |||
@@ -480,29 +480,6 @@ static noinline void __init copy_user_test(void) | |||
480 | kfree(kmem); | 480 | kfree(kmem); |
481 | } | 481 | } |
482 | 482 | ||
483 | static noinline void __init use_after_scope_test(void) | ||
484 | { | ||
485 | volatile char *volatile p; | ||
486 | |||
487 | pr_info("use-after-scope on int\n"); | ||
488 | { | ||
489 | int local = 0; | ||
490 | |||
491 | p = (char *)&local; | ||
492 | } | ||
493 | p[0] = 1; | ||
494 | p[3] = 1; | ||
495 | |||
496 | pr_info("use-after-scope on array\n"); | ||
497 | { | ||
498 | char local[1024] = {0}; | ||
499 | |||
500 | p = local; | ||
501 | } | ||
502 | p[0] = 1; | ||
503 | p[1023] = 1; | ||
504 | } | ||
505 | |||
506 | static noinline void __init kasan_alloca_oob_left(void) | 483 | static noinline void __init kasan_alloca_oob_left(void) |
507 | { | 484 | { |
508 | volatile int i = 10; | 485 | volatile int i = 10; |
@@ -682,7 +659,6 @@ static int __init kmalloc_tests_init(void) | |||
682 | kasan_alloca_oob_right(); | 659 | kasan_alloca_oob_right(); |
683 | ksize_unpoisons_memory(); | 660 | ksize_unpoisons_memory(); |
684 | copy_user_test(); | 661 | copy_user_test(); |
685 | use_after_scope_test(); | ||
686 | kmem_cache_double_free(); | 662 | kmem_cache_double_free(); |
687 | kmem_cache_invalid_free(); | 663 | kmem_cache_invalid_free(); |
688 | kasan_memchr(); | 664 | kasan_memchr(); |
diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c new file mode 100644 index 000000000000..83cdcaa82bf6 --- /dev/null +++ b/lib/test_vmalloc.c | |||
@@ -0,0 +1,551 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | |||
3 | /* | ||
4 | * Test module to stress and analyze performance of the vmalloc allocator. | ||
5 | * (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com> | ||
6 | */ | ||
7 | #include <linux/init.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/vmalloc.h> | ||
11 | #include <linux/random.h> | ||
12 | #include <linux/kthread.h> | ||
13 | #include <linux/moduleparam.h> | ||
14 | #include <linux/completion.h> | ||
15 | #include <linux/delay.h> | ||
16 | #include <linux/rwsem.h> | ||
17 | #include <linux/mm.h> | ||
18 | |||
19 | #define __param(type, name, init, msg) \ | ||
20 | static type name = init; \ | ||
21 | module_param(name, type, 0444); \ | ||
22 | MODULE_PARM_DESC(name, msg) \ | ||
23 | |||
24 | __param(bool, single_cpu_test, false, | ||
25 | "Use single first online CPU to run tests"); | ||
26 | |||
27 | __param(bool, sequential_test_order, false, | ||
28 | "Use sequential stress tests order"); | ||
29 | |||
30 | __param(int, test_repeat_count, 1, | ||
31 | "Set test repeat counter"); | ||
32 | |||
33 | __param(int, test_loop_count, 1000000, | ||
34 | "Set test loop counter"); | ||
35 | |||
36 | __param(int, run_test_mask, INT_MAX, | ||
37 | "Set tests specified in the mask.\n\n" | ||
38 | "\t\tid: 1, name: fix_size_alloc_test\n" | ||
39 | "\t\tid: 2, name: full_fit_alloc_test\n" | ||
40 | "\t\tid: 4, name: long_busy_list_alloc_test\n" | ||
41 | "\t\tid: 8, name: random_size_alloc_test\n" | ||
42 | "\t\tid: 16, name: fix_align_alloc_test\n" | ||
43 | "\t\tid: 32, name: random_size_align_alloc_test\n" | ||
44 | "\t\tid: 64, name: align_shift_alloc_test\n" | ||
45 | "\t\tid: 128, name: pcpu_alloc_test\n" | ||
46 | /* Add a new test case description here. */ | ||
47 | ); | ||
48 | |||
49 | /* | ||
50 | * Depends on single_cpu_test parameter. If it is true, then | ||
51 | * use first online CPU to trigger a test on, otherwise go with | ||
52 | * all online CPUs. | ||
53 | */ | ||
54 | static cpumask_t cpus_run_test_mask = CPU_MASK_NONE; | ||
55 | |||
56 | /* | ||
57 | * Read write semaphore for synchronization of setup | ||
58 | * phase that is done in main thread and workers. | ||
59 | */ | ||
60 | static DECLARE_RWSEM(prepare_for_test_rwsem); | ||
61 | |||
62 | /* | ||
63 | * Completion tracking for worker threads. | ||
64 | */ | ||
65 | static DECLARE_COMPLETION(test_all_done_comp); | ||
66 | static atomic_t test_n_undone = ATOMIC_INIT(0); | ||
67 | |||
68 | static inline void | ||
69 | test_report_one_done(void) | ||
70 | { | ||
71 | if (atomic_dec_and_test(&test_n_undone)) | ||
72 | complete(&test_all_done_comp); | ||
73 | } | ||
74 | |||
75 | static int random_size_align_alloc_test(void) | ||
76 | { | ||
77 | unsigned long size, align, rnd; | ||
78 | void *ptr; | ||
79 | int i; | ||
80 | |||
81 | for (i = 0; i < test_loop_count; i++) { | ||
82 | get_random_bytes(&rnd, sizeof(rnd)); | ||
83 | |||
84 | /* | ||
85 | * Maximum 1024 pages, if PAGE_SIZE is 4096. | ||
86 | */ | ||
87 | align = 1 << (rnd % 23); | ||
88 | |||
89 | /* | ||
90 | * Maximum 10 pages. | ||
91 | */ | ||
92 | size = ((rnd % 10) + 1) * PAGE_SIZE; | ||
93 | |||
94 | ptr = __vmalloc_node_range(size, align, | ||
95 | VMALLOC_START, VMALLOC_END, | ||
96 | GFP_KERNEL | __GFP_ZERO, | ||
97 | PAGE_KERNEL, | ||
98 | 0, 0, __builtin_return_address(0)); | ||
99 | |||
100 | if (!ptr) | ||
101 | return -1; | ||
102 | |||
103 | vfree(ptr); | ||
104 | } | ||
105 | |||
106 | return 0; | ||
107 | } | ||
108 | |||
109 | /* | ||
110 | * This test case is supposed to be failed. | ||
111 | */ | ||
112 | static int align_shift_alloc_test(void) | ||
113 | { | ||
114 | unsigned long align; | ||
115 | void *ptr; | ||
116 | int i; | ||
117 | |||
118 | for (i = 0; i < BITS_PER_LONG; i++) { | ||
119 | align = ((unsigned long) 1) << i; | ||
120 | |||
121 | ptr = __vmalloc_node_range(PAGE_SIZE, align, | ||
122 | VMALLOC_START, VMALLOC_END, | ||
123 | GFP_KERNEL | __GFP_ZERO, | ||
124 | PAGE_KERNEL, | ||
125 | 0, 0, __builtin_return_address(0)); | ||
126 | |||
127 | if (!ptr) | ||
128 | return -1; | ||
129 | |||
130 | vfree(ptr); | ||
131 | } | ||
132 | |||
133 | return 0; | ||
134 | } | ||
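/*
 * Note: the "supposed to fail" remark above refers to the largest
 * alignments in this loop (up to 1UL << (BITS_PER_LONG - 1)), which
 * cannot be satisfied within the VMALLOC_START..VMALLOC_END range,
 * so __vmalloc_node_range() eventually returns NULL and the test
 * reports -1.
 */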
135 | |||
136 | static int fix_align_alloc_test(void) | ||
137 | { | ||
138 | void *ptr; | ||
139 | int i; | ||
140 | |||
141 | for (i = 0; i < test_loop_count; i++) { | ||
142 | ptr = __vmalloc_node_range(5 * PAGE_SIZE, | ||
143 | THREAD_ALIGN << 1, | ||
144 | VMALLOC_START, VMALLOC_END, | ||
145 | GFP_KERNEL | __GFP_ZERO, | ||
146 | PAGE_KERNEL, | ||
147 | 0, 0, __builtin_return_address(0)); | ||
148 | |||
149 | if (!ptr) | ||
150 | return -1; | ||
151 | |||
152 | vfree(ptr); | ||
153 | } | ||
154 | |||
155 | return 0; | ||
156 | } | ||
157 | |||
158 | static int random_size_alloc_test(void) | ||
159 | { | ||
160 | unsigned int n; | ||
161 | void *p; | ||
162 | int i; | ||
163 | |||
164 | for (i = 0; i < test_loop_count; i++) { | ||
165 | get_random_bytes(&n, sizeof(n)); | ||
166 | n = (n % 100) + 1; | ||
167 | |||
168 | p = vmalloc(n * PAGE_SIZE); | ||
169 | |||
170 | if (!p) | ||
171 | return -1; | ||
172 | |||
173 | *((__u8 *)p) = 1; | ||
174 | vfree(p); | ||
175 | } | ||
176 | |||
177 | return 0; | ||
178 | } | ||
179 | |||
180 | static int long_busy_list_alloc_test(void) | ||
181 | { | ||
182 | void *ptr_1, *ptr_2; | ||
183 | void **ptr; | ||
184 | int rv = -1; | ||
185 | int i; | ||
186 | |||
187 | ptr = vmalloc(sizeof(void *) * 15000); | ||
188 | if (!ptr) | ||
189 | return rv; | ||
190 | |||
191 | for (i = 0; i < 15000; i++) | ||
192 | ptr[i] = vmalloc(1 * PAGE_SIZE); | ||
193 | |||
194 | for (i = 0; i < test_loop_count; i++) { | ||
195 | ptr_1 = vmalloc(100 * PAGE_SIZE); | ||
196 | if (!ptr_1) | ||
197 | goto leave; | ||
198 | |||
199 | ptr_2 = vmalloc(1 * PAGE_SIZE); | ||
200 | if (!ptr_2) { | ||
201 | vfree(ptr_1); | ||
202 | goto leave; | ||
203 | } | ||
204 | |||
205 | *((__u8 *)ptr_1) = 0; | ||
206 | *((__u8 *)ptr_2) = 1; | ||
207 | |||
208 | vfree(ptr_1); | ||
209 | vfree(ptr_2); | ||
210 | } | ||
211 | |||
212 | /* Success */ | ||
213 | rv = 0; | ||
214 | |||
215 | leave: | ||
216 | for (i = 0; i < 15000; i++) | ||
217 | vfree(ptr[i]); | ||
218 | |||
219 | vfree(ptr); | ||
220 | return rv; | ||
221 | } | ||
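/*
 * Note: vfree(NULL) is a no-op, so the cleanup loop above is safe
 * even if some of the 15000 small allocations failed and left NULL
 * entries in ptr[].
 */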
222 | |||
223 | static int full_fit_alloc_test(void) | ||
224 | { | ||
225 | void **ptr, **junk_ptr, *tmp; | ||
226 | int junk_length; | ||
227 | int rv = -1; | ||
228 | int i; | ||
229 | |||
230 | junk_length = fls(num_online_cpus()); | ||
231 | junk_length *= (32 * 1024 * 1024 / PAGE_SIZE); | ||
232 | |||
233 | ptr = vmalloc(sizeof(void *) * junk_length); | ||
234 | if (!ptr) | ||
235 | return rv; | ||
236 | |||
237 | junk_ptr = vmalloc(sizeof(void *) * junk_length); | ||
238 | if (!junk_ptr) { | ||
239 | vfree(ptr); | ||
240 | return rv; | ||
241 | } | ||
242 | |||
243 | for (i = 0; i < junk_length; i++) { | ||
244 | ptr[i] = vmalloc(1 * PAGE_SIZE); | ||
245 | junk_ptr[i] = vmalloc(1 * PAGE_SIZE); | ||
246 | } | ||
247 | |||
248 | for (i = 0; i < junk_length; i++) | ||
249 | vfree(junk_ptr[i]); | ||
250 | |||
251 | for (i = 0; i < test_loop_count; i++) { | ||
252 | tmp = vmalloc(1 * PAGE_SIZE); | ||
253 | |||
254 | if (!tmp) | ||
255 | goto error; | ||
256 | |||
257 | *((__u8 *)tmp) = 1; | ||
258 | vfree(tmp); | ||
259 | } | ||
260 | |||
261 | /* Success */ | ||
262 | rv = 0; | ||
263 | |||
264 | error: | ||
265 | for (i = 0; i < junk_length; i++) | ||
266 | vfree(ptr[i]); | ||
267 | |||
268 | vfree(ptr); | ||
269 | vfree(junk_ptr); | ||
270 | |||
271 | return rv; | ||
272 | } | ||
273 | |||
274 | static int fix_size_alloc_test(void) | ||
275 | { | ||
276 | void *ptr; | ||
277 | int i; | ||
278 | |||
279 | for (i = 0; i < test_loop_count; i++) { | ||
280 | ptr = vmalloc(3 * PAGE_SIZE); | ||
281 | |||
282 | if (!ptr) | ||
283 | return -1; | ||
284 | |||
285 | *((__u8 *)ptr) = 0; | ||
286 | |||
287 | vfree(ptr); | ||
288 | } | ||
289 | |||
290 | return 0; | ||
291 | } | ||
292 | |||
293 | static int | ||
294 | pcpu_alloc_test(void) | ||
295 | { | ||
296 | int rv = 0; | ||
297 | #ifndef CONFIG_NEED_PER_CPU_KM | ||
298 | void __percpu **pcpu; | ||
299 | size_t size, align; | ||
300 | int i; | ||
301 | |||
302 | pcpu = vmalloc(sizeof(void __percpu *) * 35000); | ||
303 | if (!pcpu) | ||
304 | return -1; | ||
305 | |||
306 | for (i = 0; i < 35000; i++) { | ||
307 | unsigned int r; | ||
308 | |||
309 | get_random_bytes(&r, sizeof(r)); | ||
310 | size = (r % (PAGE_SIZE / 4)) + 1; | ||
311 | |||
312 | /* | ||
313 | * Maximum PAGE_SIZE | ||
314 | */ | ||
315 | get_random_bytes(&r, sizeof(r)); | ||
316 | align = 1 << ((r % 11) + 1); | ||
317 | |||
318 | pcpu[i] = __alloc_percpu(size, align); | ||
319 | if (!pcpu[i]) | ||
320 | rv = -1; | ||
321 | } | ||
322 | |||
323 | for (i = 0; i < 35000; i++) | ||
324 | free_percpu(pcpu[i]); | ||
325 | |||
326 | vfree(pcpu); | ||
327 | #endif | ||
328 | return rv; | ||
329 | } | ||
330 | |||
331 | struct test_case_desc { | ||
332 | const char *test_name; | ||
333 | int (*test_func)(void); | ||
334 | }; | ||
335 | |||
336 | static struct test_case_desc test_case_array[] = { | ||
337 | { "fix_size_alloc_test", fix_size_alloc_test }, | ||
338 | { "full_fit_alloc_test", full_fit_alloc_test }, | ||
339 | { "long_busy_list_alloc_test", long_busy_list_alloc_test }, | ||
340 | { "random_size_alloc_test", random_size_alloc_test }, | ||
341 | { "fix_align_alloc_test", fix_align_alloc_test }, | ||
342 | { "random_size_align_alloc_test", random_size_align_alloc_test }, | ||
343 | { "align_shift_alloc_test", align_shift_alloc_test }, | ||
344 | { "pcpu_alloc_test", pcpu_alloc_test }, | ||
345 | /* Add a new test case here. */ | ||
346 | }; | ||
347 | |||
348 | struct test_case_data { | ||
349 | int test_failed; | ||
350 | int test_passed; | ||
351 | u64 time; | ||
352 | }; | ||
353 | |||
354 | /* Split it to get rid of: WARNING: line over 80 characters */ | ||
355 | static struct test_case_data | ||
356 | per_cpu_test_data[NR_CPUS][ARRAY_SIZE(test_case_array)]; | ||
357 | |||
358 | static struct test_driver { | ||
359 | struct task_struct *task; | ||
360 | unsigned long start; | ||
361 | unsigned long stop; | ||
362 | int cpu; | ||
363 | } per_cpu_test_driver[NR_CPUS]; | ||
364 | |||
365 | static void shuffle_array(int *arr, int n) | ||
366 | { | ||
367 | unsigned int rnd; | ||
368 | int i, j, x; | ||
369 | |||
370 | for (i = n - 1; i > 0; i--) { | ||
371 | get_random_bytes(&rnd, sizeof(rnd)); | ||
372 | |||
373 | /* Cut the range. */ | ||
374 | j = rnd % i; | ||
375 | |||
376 | /* Swap indexes. */ | ||
377 | x = arr[i]; | ||
378 | arr[i] = arr[j]; | ||
379 | arr[j] = x; | ||
380 | } | ||
381 | } | ||
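/*
 * Note: this is a Fisher-Yates style shuffle driven by
 * get_random_bytes(). Because j is taken modulo i rather than i + 1,
 * the element at position i always moves, which is acceptable here
 * since the goal is only to randomise the test order.
 */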
382 | |||
383 | static int test_func(void *private) | ||
384 | { | ||
385 | struct test_driver *t = private; | ||
386 | cpumask_t newmask = CPU_MASK_NONE; | ||
387 | int random_array[ARRAY_SIZE(test_case_array)]; | ||
388 | int index, i, j, ret; | ||
389 | ktime_t kt; | ||
390 | u64 delta; | ||
391 | |||
392 | cpumask_set_cpu(t->cpu, &newmask); | ||
393 | set_cpus_allowed_ptr(current, &newmask); | ||
394 | |||
395 | for (i = 0; i < ARRAY_SIZE(test_case_array); i++) | ||
396 | random_array[i] = i; | ||
397 | |||
398 | if (!sequential_test_order) | ||
399 | shuffle_array(random_array, ARRAY_SIZE(test_case_array)); | ||
400 | |||
401 | /* | ||
402 | * Block until initialization is done. | ||
403 | */ | ||
404 | down_read(&prepare_for_test_rwsem); | ||
405 | |||
406 | t->start = get_cycles(); | ||
407 | for (i = 0; i < ARRAY_SIZE(test_case_array); i++) { | ||
408 | index = random_array[i]; | ||
409 | |||
410 | /* | ||
411 | * Skip tests that are not selected in run_test_mask. | ||
412 | */ | ||
413 | if (!((run_test_mask & (1 << index)) >> index)) | ||
414 | continue; | ||
415 | |||
416 | kt = ktime_get(); | ||
417 | for (j = 0; j < test_repeat_count; j++) { | ||
418 | ret = test_case_array[index].test_func(); | ||
419 | if (!ret) | ||
420 | per_cpu_test_data[t->cpu][index].test_passed++; | ||
421 | else | ||
422 | per_cpu_test_data[t->cpu][index].test_failed++; | ||
423 | } | ||
424 | |||
425 | /* | ||
426 | * Take the average time the test took. | ||
427 | */ | ||
428 | delta = (u64) ktime_us_delta(ktime_get(), kt); | ||
429 | do_div(delta, (u32) test_repeat_count); | ||
430 | |||
431 | per_cpu_test_data[t->cpu][index].time = delta; | ||
432 | } | ||
433 | t->stop = get_cycles(); | ||
434 | |||
435 | up_read(&prepare_for_test_rwsem); | ||
436 | test_report_one_done(); | ||
437 | |||
438 | /* | ||
439 | * Wait for the kthread_stop() call. | ||
440 | */ | ||
441 | while (!kthread_should_stop()) | ||
442 | msleep(10); | ||
443 | |||
444 | return 0; | ||
445 | } | ||
446 | |||
447 | static void | ||
448 | init_test_configuration(void) | ||
449 | { | ||
450 | /* | ||
451 | * Reset all data of all CPUs. | ||
452 | */ | ||
453 | memset(per_cpu_test_data, 0, sizeof(per_cpu_test_data)); | ||
454 | |||
455 | if (single_cpu_test) | ||
456 | cpumask_set_cpu(cpumask_first(cpu_online_mask), | ||
457 | &cpus_run_test_mask); | ||
458 | else | ||
459 | cpumask_and(&cpus_run_test_mask, cpu_online_mask, | ||
460 | cpu_online_mask); | ||
461 | |||
462 | if (test_repeat_count <= 0) | ||
463 | test_repeat_count = 1; | ||
464 | |||
465 | if (test_loop_count <= 0) | ||
466 | test_loop_count = 1; | ||
467 | } | ||
468 | |||
469 | static void do_concurrent_test(void) | ||
470 | { | ||
471 | int cpu, ret; | ||
472 | |||
473 | /* | ||
474 | * Set the basic configuration and sanity-check the parameters. | ||
475 | */ | ||
476 | init_test_configuration(); | ||
477 | |||
478 | /* | ||
479 | * Hold all workers until the setup phase is done. | ||
480 | */ | ||
481 | down_write(&prepare_for_test_rwsem); | ||
482 | |||
483 | for_each_cpu(cpu, &cpus_run_test_mask) { | ||
484 | struct test_driver *t = &per_cpu_test_driver[cpu]; | ||
485 | |||
486 | t->cpu = cpu; | ||
487 | t->task = kthread_run(test_func, t, "vmalloc_test/%d", cpu); | ||
488 | |||
489 | if (!IS_ERR(t->task)) | ||
490 | /* Success. */ | ||
491 | atomic_inc(&test_n_undone); | ||
492 | else | ||
493 | pr_err("Failed to start kthread for %d CPU\n", cpu); | ||
494 | } | ||
495 | |||
496 | /* | ||
497 | * Now let the workers do their job. | ||
498 | */ | ||
499 | up_write(&prepare_for_test_rwsem); | ||
500 | |||
501 | /* | ||
502 | * Sleep quietly, rechecking for completion once per second. | ||
503 | * Since the tests can take a long time, an unbounded wait | ||
504 | * could trigger a hung-task stack trace. That is why we use | ||
505 | * wait_for_completion_timeout() with an HZ (one second) timeout. | ||
506 | */ | ||
507 | do { | ||
508 | ret = wait_for_completion_timeout(&test_all_done_comp, HZ); | ||
509 | } while (!ret); | ||
510 | |||
511 | for_each_cpu(cpu, &cpus_run_test_mask) { | ||
512 | struct test_driver *t = &per_cpu_test_driver[cpu]; | ||
513 | int i; | ||
514 | |||
515 | if (!IS_ERR(t->task)) | ||
516 | kthread_stop(t->task); | ||
517 | |||
518 | for (i = 0; i < ARRAY_SIZE(test_case_array); i++) { | ||
519 | if (!((run_test_mask & (1 << i)) >> i)) | ||
520 | continue; | ||
521 | |||
522 | pr_info( | ||
523 | "Summary: %s passed: %d failed: %d repeat: %d loops: %d avg: %llu usec\n", | ||
524 | test_case_array[i].test_name, | ||
525 | per_cpu_test_data[cpu][i].test_passed, | ||
526 | per_cpu_test_data[cpu][i].test_failed, | ||
527 | test_repeat_count, test_loop_count, | ||
528 | per_cpu_test_data[cpu][i].time); | ||
529 | } | ||
530 | |||
531 | pr_info("All test took CPU%d=%lu cycles\n", | ||
532 | cpu, t->stop - t->start); | ||
533 | } | ||
534 | } | ||
535 | |||
536 | static int vmalloc_test_init(void) | ||
537 | { | ||
538 | do_concurrent_test(); | ||
539 | return -EAGAIN; /* Failing here directly unloads the module */ | ||
540 | } | ||
541 | |||
542 | static void vmalloc_test_exit(void) | ||
543 | { | ||
544 | } | ||
545 | |||
546 | module_init(vmalloc_test_init) | ||
547 | module_exit(vmalloc_test_exit) | ||
548 | |||
549 | MODULE_LICENSE("GPL"); | ||
550 | MODULE_AUTHOR("Uladzislau Rezki"); | ||
551 | MODULE_DESCRIPTION("vmalloc test module"); | ||
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 9a7b8b049d04..e3df921208c0 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug | |||
@@ -39,6 +39,23 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT | |||
39 | Enable debug page memory allocations by default? This value | 39 | Enable debug page memory allocations by default? This value |
40 | can be overridden by debug_pagealloc=off|on. | 40 | can be overridden by debug_pagealloc=off|on. |
41 | 41 | ||
42 | config PAGE_OWNER | ||
43 | bool "Track page owner" | ||
44 | depends on DEBUG_KERNEL && STACKTRACE_SUPPORT | ||
45 | select DEBUG_FS | ||
46 | select STACKTRACE | ||
47 | select STACKDEPOT | ||
48 | select PAGE_EXTENSION | ||
49 | help | ||
50 | This keeps track of which call chain allocated a page and may | ||
51 | help to find bare alloc_page(s) leaks. Even if you include this | ||
52 | feature in your build, it is disabled by default. You should | ||
53 | pass the "page_owner=on" boot parameter to enable it. It eats | ||
54 | a fair amount of memory when enabled. See tools/vm/page_owner_sort.c | ||
55 | for a user-space helper. | ||
56 | |||
57 | If unsure, say N. | ||
58 | |||
42 | config PAGE_POISONING | 59 | config PAGE_POISONING |
43 | bool "Poison pages after freeing" | 60 | bool "Poison pages after freeing" |
44 | select PAGE_POISONING_NO_SANITY if HIBERNATION | 61 | select PAGE_POISONING_NO_SANITY if HIBERNATION |
@@ -353,12 +353,14 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
353 | 353 | ||
354 | ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); | 354 | ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); |
355 | if (ret) | 355 | if (ret) |
356 | goto err; | 356 | goto free_mem; |
357 | 357 | ||
358 | pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, | 358 | pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, |
359 | &base); | 359 | &base); |
360 | return 0; | 360 | return 0; |
361 | 361 | ||
362 | free_mem: | ||
363 | memblock_free(base, size); | ||
362 | err: | 364 | err: |
363 | pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); | 365 | pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); |
364 | return ret; | 366 | return ret; |
diff --git a/mm/cma_debug.c b/mm/cma_debug.c index ad6723e9d110..8d7b2fd52225 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c | |||
@@ -21,8 +21,6 @@ struct cma_mem { | |||
21 | unsigned long n; | 21 | unsigned long n; |
22 | }; | 22 | }; |
23 | 23 | ||
24 | static struct dentry *cma_debugfs_root; | ||
25 | |||
26 | static int cma_debugfs_get(void *data, u64 *val) | 24 | static int cma_debugfs_get(void *data, u64 *val) |
27 | { | 25 | { |
28 | unsigned long *p = data; | 26 | unsigned long *p = data; |
@@ -162,7 +160,7 @@ static int cma_alloc_write(void *data, u64 val) | |||
162 | } | 160 | } |
163 | DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); | 161 | DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); |
164 | 162 | ||
165 | static void cma_debugfs_add_one(struct cma *cma, int idx) | 163 | static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) |
166 | { | 164 | { |
167 | struct dentry *tmp; | 165 | struct dentry *tmp; |
168 | char name[16]; | 166 | char name[16]; |
@@ -170,7 +168,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) | |||
170 | 168 | ||
171 | scnprintf(name, sizeof(name), "cma-%s", cma->name); | 169 | scnprintf(name, sizeof(name), "cma-%s", cma->name); |
172 | 170 | ||
173 | tmp = debugfs_create_dir(name, cma_debugfs_root); | 171 | tmp = debugfs_create_dir(name, root_dentry); |
174 | 172 | ||
175 | debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops); | 173 | debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops); |
176 | debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops); | 174 | debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops); |
@@ -188,14 +186,13 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) | |||
188 | 186 | ||
189 | static int __init cma_debugfs_init(void) | 187 | static int __init cma_debugfs_init(void) |
190 | { | 188 | { |
189 | struct dentry *cma_debugfs_root; | ||
191 | int i; | 190 | int i; |
192 | 191 | ||
193 | cma_debugfs_root = debugfs_create_dir("cma", NULL); | 192 | cma_debugfs_root = debugfs_create_dir("cma", NULL); |
194 | if (!cma_debugfs_root) | ||
195 | return -ENOMEM; | ||
196 | 193 | ||
197 | for (i = 0; i < cma_area_count; i++) | 194 | for (i = 0; i < cma_area_count; i++) |
198 | cma_debugfs_add_one(&cma_areas[i], i); | 195 | cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root); |
199 | 196 | ||
200 | return 0; | 197 | return 0; |
201 | } | 198 | } |
diff --git a/mm/compaction.c b/mm/compaction.c index ef29490b0f46..f171a83707ce 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -66,7 +66,7 @@ static unsigned long release_freepages(struct list_head *freelist) | |||
66 | return high_pfn; | 66 | return high_pfn; |
67 | } | 67 | } |
68 | 68 | ||
69 | static void map_pages(struct list_head *list) | 69 | static void split_map_pages(struct list_head *list) |
70 | { | 70 | { |
71 | unsigned int i, order, nr_pages; | 71 | unsigned int i, order, nr_pages; |
72 | struct page *page, *next; | 72 | struct page *page, *next; |
@@ -237,6 +237,70 @@ static bool pageblock_skip_persistent(struct page *page) | |||
237 | return false; | 237 | return false; |
238 | } | 238 | } |
239 | 239 | ||
240 | static bool | ||
241 | __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, | ||
242 | bool check_target) | ||
243 | { | ||
244 | struct page *page = pfn_to_online_page(pfn); | ||
245 | struct page *end_page; | ||
246 | unsigned long block_pfn; | ||
247 | |||
248 | if (!page) | ||
249 | return false; | ||
250 | if (zone != page_zone(page)) | ||
251 | return false; | ||
252 | if (pageblock_skip_persistent(page)) | ||
253 | return false; | ||
254 | |||
255 | /* | ||
256 | * If skip is already cleared do no further checking once the | ||
257 | * restart points have been set. | ||
258 | */ | ||
259 | if (check_source && check_target && !get_pageblock_skip(page)) | ||
260 | return true; | ||
261 | |||
262 | /* | ||
263 | * If clearing skip for the target scanner, do not select a | ||
264 | * non-movable pageblock as the starting point. | ||
265 | */ | ||
266 | if (!check_source && check_target && | ||
267 | get_pageblock_migratetype(page) != MIGRATE_MOVABLE) | ||
268 | return false; | ||
269 | |||
270 | /* | ||
271 | * Only clear the hint if a sample indicates there is either a | ||
272 | * free page or an LRU page in the block. One or other condition | ||
273 | * is necessary for the block to be a migration source/target. | ||
274 | */ | ||
275 | block_pfn = pageblock_start_pfn(pfn); | ||
276 | pfn = max(block_pfn, zone->zone_start_pfn); | ||
277 | page = pfn_to_page(pfn); | ||
278 | if (zone != page_zone(page)) | ||
279 | return false; | ||
280 | pfn = block_pfn + pageblock_nr_pages; | ||
281 | pfn = min(pfn, zone_end_pfn(zone)); | ||
282 | end_page = pfn_to_page(pfn); | ||
283 | |||
284 | do { | ||
285 | if (pfn_valid_within(pfn)) { | ||
286 | if (check_source && PageLRU(page)) { | ||
287 | clear_pageblock_skip(page); | ||
288 | return true; | ||
289 | } | ||
290 | |||
291 | if (check_target && PageBuddy(page)) { | ||
292 | clear_pageblock_skip(page); | ||
293 | return true; | ||
294 | } | ||
295 | } | ||
296 | |||
297 | page += (1 << PAGE_ALLOC_COSTLY_ORDER); | ||
298 | pfn += (1 << PAGE_ALLOC_COSTLY_ORDER); | ||
299 | } while (page < end_page); | ||
300 | |||
301 | return false; | ||
302 | } | ||
303 | |||
240 | /* | 304 | /* |
241 | * This function is called to clear all cached information on pageblocks that | 305 | * This function is called to clear all cached information on pageblocks that |
242 | * should be skipped for page isolation when the migrate and free page scanner | 306 | * should be skipped for page isolation when the migrate and free page scanner |
@@ -244,30 +308,54 @@ static bool pageblock_skip_persistent(struct page *page) | |||
244 | */ | 308 | */ |
245 | static void __reset_isolation_suitable(struct zone *zone) | 309 | static void __reset_isolation_suitable(struct zone *zone) |
246 | { | 310 | { |
247 | unsigned long start_pfn = zone->zone_start_pfn; | 311 | unsigned long migrate_pfn = zone->zone_start_pfn; |
248 | unsigned long end_pfn = zone_end_pfn(zone); | 312 | unsigned long free_pfn = zone_end_pfn(zone); |
249 | unsigned long pfn; | 313 | unsigned long reset_migrate = free_pfn; |
314 | unsigned long reset_free = migrate_pfn; | ||
315 | bool source_set = false; | ||
316 | bool free_set = false; | ||
317 | |||
318 | if (!zone->compact_blockskip_flush) | ||
319 | return; | ||
250 | 320 | ||
251 | zone->compact_blockskip_flush = false; | 321 | zone->compact_blockskip_flush = false; |
252 | 322 | ||
253 | /* Walk the zone and mark every pageblock as suitable for isolation */ | 323 | /* |
254 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 324 | * Walk the zone and update pageblock skip information. Source looks |
255 | struct page *page; | 325 | * for PageLRU while target looks for PageBuddy. When the scanner |
256 | 326 | * is found, both PageBuddy and PageLRU are checked as the pageblock | |
327 | * is suitable as both source and target. | ||
328 | */ | ||
329 | for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages, | ||
330 | free_pfn -= pageblock_nr_pages) { | ||
257 | cond_resched(); | 331 | cond_resched(); |
258 | 332 | ||
259 | page = pfn_to_online_page(pfn); | 333 | /* Update the migrate PFN */ |
260 | if (!page) | 334 | if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) && |
261 | continue; | 335 | migrate_pfn < reset_migrate) { |
262 | if (zone != page_zone(page)) | 336 | source_set = true; |
263 | continue; | 337 | reset_migrate = migrate_pfn; |
264 | if (pageblock_skip_persistent(page)) | 338 | zone->compact_init_migrate_pfn = reset_migrate; |
265 | continue; | 339 | zone->compact_cached_migrate_pfn[0] = reset_migrate; |
340 | zone->compact_cached_migrate_pfn[1] = reset_migrate; | ||
341 | } | ||
266 | 342 | ||
267 | clear_pageblock_skip(page); | 343 | /* Update the free PFN */ |
344 | if (__reset_isolation_pfn(zone, free_pfn, free_set, true) && | ||
345 | free_pfn > reset_free) { | ||
346 | free_set = true; | ||
347 | reset_free = free_pfn; | ||
348 | zone->compact_init_free_pfn = reset_free; | ||
349 | zone->compact_cached_free_pfn = reset_free; | ||
350 | } | ||
268 | } | 351 | } |
269 | 352 | ||
270 | reset_cached_positions(zone); | 353 | /* Leave no distance if no suitable block was reset */ |
354 | if (reset_migrate >= reset_free) { | ||
355 | zone->compact_cached_migrate_pfn[0] = migrate_pfn; | ||
356 | zone->compact_cached_migrate_pfn[1] = migrate_pfn; | ||
357 | zone->compact_cached_free_pfn = free_pfn; | ||
358 | } | ||
271 | } | 359 | } |
272 | 360 | ||
273 | void reset_isolation_suitable(pg_data_t *pgdat) | 361 | void reset_isolation_suitable(pg_data_t *pgdat) |
@@ -286,15 +374,53 @@ void reset_isolation_suitable(pg_data_t *pgdat) | |||
286 | } | 374 | } |
287 | 375 | ||
288 | /* | 376 | /* |
377 | * Sets the pageblock skip bit if it was clear. Note that this is a hint as | ||
378 | * locks are not required for read/writers. Returns true if it was already set. | ||
379 | */ | ||
380 | static bool test_and_set_skip(struct compact_control *cc, struct page *page, | ||
381 | unsigned long pfn) | ||
382 | { | ||
383 | bool skip; | ||
384 | |||
385 | /* Do not update if skip hint is being ignored */ | ||
386 | if (cc->ignore_skip_hint) | ||
387 | return false; | ||
388 | |||
389 | if (!IS_ALIGNED(pfn, pageblock_nr_pages)) | ||
390 | return false; | ||
391 | |||
392 | skip = get_pageblock_skip(page); | ||
393 | if (!skip && !cc->no_set_skip_hint) | ||
394 | set_pageblock_skip(page); | ||
395 | |||
396 | return skip; | ||
397 | } | ||
398 | |||
399 | static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) | ||
400 | { | ||
401 | struct zone *zone = cc->zone; | ||
402 | |||
403 | pfn = pageblock_end_pfn(pfn); | ||
404 | |||
405 | /* Set for isolation rather than compaction */ | ||
406 | if (cc->no_set_skip_hint) | ||
407 | return; | ||
408 | |||
409 | if (pfn > zone->compact_cached_migrate_pfn[0]) | ||
410 | zone->compact_cached_migrate_pfn[0] = pfn; | ||
411 | if (cc->mode != MIGRATE_ASYNC && | ||
412 | pfn > zone->compact_cached_migrate_pfn[1]) | ||
413 | zone->compact_cached_migrate_pfn[1] = pfn; | ||
414 | } | ||
415 | |||
416 | /* | ||
289 | * If no pages were isolated then mark this pageblock to be skipped in the | 417 | * If no pages were isolated then mark this pageblock to be skipped in the |
290 | * future. The information is later cleared by __reset_isolation_suitable(). | 418 | * future. The information is later cleared by __reset_isolation_suitable(). |
291 | */ | 419 | */ |
292 | static void update_pageblock_skip(struct compact_control *cc, | 420 | static void update_pageblock_skip(struct compact_control *cc, |
293 | struct page *page, unsigned long nr_isolated, | 421 | struct page *page, unsigned long pfn) |
294 | bool migrate_scanner) | ||
295 | { | 422 | { |
296 | struct zone *zone = cc->zone; | 423 | struct zone *zone = cc->zone; |
297 | unsigned long pfn; | ||
298 | 424 | ||
299 | if (cc->no_set_skip_hint) | 425 | if (cc->no_set_skip_hint) |
300 | return; | 426 | return; |
@@ -302,24 +428,11 @@ static void update_pageblock_skip(struct compact_control *cc, | |||
302 | if (!page) | 428 | if (!page) |
303 | return; | 429 | return; |
304 | 430 | ||
305 | if (nr_isolated) | ||
306 | return; | ||
307 | |||
308 | set_pageblock_skip(page); | 431 | set_pageblock_skip(page); |
309 | 432 | ||
310 | pfn = page_to_pfn(page); | ||
311 | |||
312 | /* Update where async and sync compaction should restart */ | 433 | /* Update where async and sync compaction should restart */ |
313 | if (migrate_scanner) { | 434 | if (pfn < zone->compact_cached_free_pfn) |
314 | if (pfn > zone->compact_cached_migrate_pfn[0]) | 435 | zone->compact_cached_free_pfn = pfn; |
315 | zone->compact_cached_migrate_pfn[0] = pfn; | ||
316 | if (cc->mode != MIGRATE_ASYNC && | ||
317 | pfn > zone->compact_cached_migrate_pfn[1]) | ||
318 | zone->compact_cached_migrate_pfn[1] = pfn; | ||
319 | } else { | ||
320 | if (pfn < zone->compact_cached_free_pfn) | ||
321 | zone->compact_cached_free_pfn = pfn; | ||
322 | } | ||
323 | } | 436 | } |
324 | #else | 437 | #else |
325 | static inline bool isolation_suitable(struct compact_control *cc, | 438 | static inline bool isolation_suitable(struct compact_control *cc, |
@@ -334,32 +447,42 @@ static inline bool pageblock_skip_persistent(struct page *page) | |||
334 | } | 447 | } |
335 | 448 | ||
336 | static inline void update_pageblock_skip(struct compact_control *cc, | 449 | static inline void update_pageblock_skip(struct compact_control *cc, |
337 | struct page *page, unsigned long nr_isolated, | 450 | struct page *page, unsigned long pfn) |
338 | bool migrate_scanner) | 451 | { |
452 | } | ||
453 | |||
454 | static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) | ||
455 | { | ||
456 | } | ||
457 | |||
458 | static bool test_and_set_skip(struct compact_control *cc, struct page *page, | ||
459 | unsigned long pfn) | ||
339 | { | 460 | { |
461 | return false; | ||
340 | } | 462 | } |
341 | #endif /* CONFIG_COMPACTION */ | 463 | #endif /* CONFIG_COMPACTION */ |
342 | 464 | ||
343 | /* | 465 | /* |
344 | * Compaction requires the taking of some coarse locks that are potentially | 466 | * Compaction requires the taking of some coarse locks that are potentially |
345 | * very heavily contended. For async compaction, back out if the lock cannot | 467 | * very heavily contended. For async compaction, trylock and record if the |
346 | * be taken immediately. For sync compaction, spin on the lock if needed. | 468 | * lock is contended. The lock will still be acquired but compaction will |
469 | * abort when the current block is finished regardless of success rate. | ||
470 | * Sync compaction acquires the lock. | ||
347 | * | 471 | * |
348 | * Returns true if the lock is held | 472 | * Always returns true which makes it easier to track lock state in callers. |
349 | * Returns false if the lock is not held and compaction should abort | ||
350 | */ | 473 | */ |
351 | static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags, | 474 | static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, |
352 | struct compact_control *cc) | 475 | struct compact_control *cc) |
353 | { | 476 | { |
354 | if (cc->mode == MIGRATE_ASYNC) { | 477 | /* Track if the lock is contended in async mode */ |
355 | if (!spin_trylock_irqsave(lock, *flags)) { | 478 | if (cc->mode == MIGRATE_ASYNC && !cc->contended) { |
356 | cc->contended = true; | 479 | if (spin_trylock_irqsave(lock, *flags)) |
357 | return false; | 480 | return true; |
358 | } | 481 | |
359 | } else { | 482 | cc->contended = true; |
360 | spin_lock_irqsave(lock, *flags); | ||
361 | } | 483 | } |
362 | 484 | ||
485 | spin_lock_irqsave(lock, *flags); | ||
363 | return true; | 486 | return true; |
364 | } | 487 | } |
365 | 488 | ||
@@ -391,37 +514,7 @@ static bool compact_unlock_should_abort(spinlock_t *lock, | |||
391 | return true; | 514 | return true; |
392 | } | 515 | } |
393 | 516 | ||
394 | if (need_resched()) { | 517 | cond_resched(); |
395 | if (cc->mode == MIGRATE_ASYNC) { | ||
396 | cc->contended = true; | ||
397 | return true; | ||
398 | } | ||
399 | cond_resched(); | ||
400 | } | ||
401 | |||
402 | return false; | ||
403 | } | ||
404 | |||
405 | /* | ||
406 | * Aside from avoiding lock contention, compaction also periodically checks | ||
407 | * need_resched() and either schedules in sync compaction or aborts async | ||
408 | * compaction. This is similar to what compact_unlock_should_abort() does, but | ||
409 | * is used where no lock is concerned. | ||
410 | * | ||
411 | * Returns false when no scheduling was needed, or sync compaction scheduled. | ||
412 | * Returns true when async compaction should abort. | ||
413 | */ | ||
414 | static inline bool compact_should_abort(struct compact_control *cc) | ||
415 | { | ||
416 | /* async compaction aborts if contended */ | ||
417 | if (need_resched()) { | ||
418 | if (cc->mode == MIGRATE_ASYNC) { | ||
419 | cc->contended = true; | ||
420 | return true; | ||
421 | } | ||
422 | |||
423 | cond_resched(); | ||
424 | } | ||
425 | 518 | ||
426 | return false; | 519 | return false; |
427 | } | 520 | } |
@@ -435,19 +528,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
435 | unsigned long *start_pfn, | 528 | unsigned long *start_pfn, |
436 | unsigned long end_pfn, | 529 | unsigned long end_pfn, |
437 | struct list_head *freelist, | 530 | struct list_head *freelist, |
531 | unsigned int stride, | ||
438 | bool strict) | 532 | bool strict) |
439 | { | 533 | { |
440 | int nr_scanned = 0, total_isolated = 0; | 534 | int nr_scanned = 0, total_isolated = 0; |
441 | struct page *cursor, *valid_page = NULL; | 535 | struct page *cursor; |
442 | unsigned long flags = 0; | 536 | unsigned long flags = 0; |
443 | bool locked = false; | 537 | bool locked = false; |
444 | unsigned long blockpfn = *start_pfn; | 538 | unsigned long blockpfn = *start_pfn; |
445 | unsigned int order; | 539 | unsigned int order; |
446 | 540 | ||
541 | /* Strict mode is for isolation, speed is secondary */ | ||
542 | if (strict) | ||
543 | stride = 1; | ||
544 | |||
447 | cursor = pfn_to_page(blockpfn); | 545 | cursor = pfn_to_page(blockpfn); |
448 | 546 | ||
449 | /* Isolate free pages. */ | 547 | /* Isolate free pages. */ |
450 | for (; blockpfn < end_pfn; blockpfn++, cursor++) { | 548 | for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) { |
451 | int isolated; | 549 | int isolated; |
452 | struct page *page = cursor; | 550 | struct page *page = cursor; |
453 | 551 | ||
@@ -465,9 +563,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
465 | if (!pfn_valid_within(blockpfn)) | 563 | if (!pfn_valid_within(blockpfn)) |
466 | goto isolate_fail; | 564 | goto isolate_fail; |
467 | 565 | ||
468 | if (!valid_page) | ||
469 | valid_page = page; | ||
470 | |||
471 | /* | 566 | /* |
472 | * For compound pages such as THP and hugetlbfs, we can save | 567 | * For compound pages such as THP and hugetlbfs, we can save |
473 | * potentially a lot of iterations if we skip them at once. | 568 | * potentially a lot of iterations if we skip them at once. |
@@ -495,18 +590,8 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
495 | * recheck as well. | 590 | * recheck as well. |
496 | */ | 591 | */ |
497 | if (!locked) { | 592 | if (!locked) { |
498 | /* | 593 | locked = compact_lock_irqsave(&cc->zone->lock, |
499 | * The zone lock must be held to isolate freepages. | ||
500 | * Unfortunately this is a very coarse lock and can be | ||
501 | * heavily contended if there are parallel allocations | ||
502 | * or parallel compactions. For async compaction do not | ||
503 | * spin on the lock and we acquire the lock as late as | ||
504 | * possible. | ||
505 | */ | ||
506 | locked = compact_trylock_irqsave(&cc->zone->lock, | ||
507 | &flags, cc); | 594 | &flags, cc); |
508 | if (!locked) | ||
509 | break; | ||
510 | 595 | ||
511 | /* Recheck this is a buddy page under lock */ | 596 | /* Recheck this is a buddy page under lock */ |
512 | if (!PageBuddy(page)) | 597 | if (!PageBuddy(page)) |
@@ -565,10 +650,6 @@ isolate_fail: | |||
565 | if (strict && blockpfn < end_pfn) | 650 | if (strict && blockpfn < end_pfn) |
566 | total_isolated = 0; | 651 | total_isolated = 0; |
567 | 652 | ||
568 | /* Update the pageblock-skip if the whole pageblock was scanned */ | ||
569 | if (blockpfn == end_pfn) | ||
570 | update_pageblock_skip(cc, valid_page, total_isolated, false); | ||
571 | |||
572 | cc->total_free_scanned += nr_scanned; | 653 | cc->total_free_scanned += nr_scanned; |
573 | if (total_isolated) | 654 | if (total_isolated) |
574 | count_compact_events(COMPACTISOLATED, total_isolated); | 655 | count_compact_events(COMPACTISOLATED, total_isolated); |
@@ -626,7 +707,7 @@ isolate_freepages_range(struct compact_control *cc, | |||
626 | break; | 707 | break; |
627 | 708 | ||
628 | isolated = isolate_freepages_block(cc, &isolate_start_pfn, | 709 | isolated = isolate_freepages_block(cc, &isolate_start_pfn, |
629 | block_end_pfn, &freelist, true); | 710 | block_end_pfn, &freelist, 0, true); |
630 | 711 | ||
631 | /* | 712 | /* |
632 | * In strict mode, isolate_freepages_block() returns 0 if | 713 | * In strict mode, isolate_freepages_block() returns 0 if |
@@ -644,7 +725,7 @@ isolate_freepages_range(struct compact_control *cc, | |||
644 | } | 725 | } |
645 | 726 | ||
646 | /* __isolate_free_page() does not map the pages */ | 727 | /* __isolate_free_page() does not map the pages */ |
647 | map_pages(&freelist); | 728 | split_map_pages(&freelist); |
648 | 729 | ||
649 | if (pfn < end_pfn) { | 730 | if (pfn < end_pfn) { |
650 | /* Loop terminated early, cleanup. */ | 731 | /* Loop terminated early, cleanup. */ |
@@ -657,16 +738,16 @@ isolate_freepages_range(struct compact_control *cc, | |||
657 | } | 738 | } |
658 | 739 | ||
659 | /* Similar to reclaim, but different enough that they don't share logic */ | 740 | /* Similar to reclaim, but different enough that they don't share logic */ |
660 | static bool too_many_isolated(struct zone *zone) | 741 | static bool too_many_isolated(pg_data_t *pgdat) |
661 | { | 742 | { |
662 | unsigned long active, inactive, isolated; | 743 | unsigned long active, inactive, isolated; |
663 | 744 | ||
664 | inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) + | 745 | inactive = node_page_state(pgdat, NR_INACTIVE_FILE) + |
665 | node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON); | 746 | node_page_state(pgdat, NR_INACTIVE_ANON); |
666 | active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) + | 747 | active = node_page_state(pgdat, NR_ACTIVE_FILE) + |
667 | node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON); | 748 | node_page_state(pgdat, NR_ACTIVE_ANON); |
668 | isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) + | 749 | isolated = node_page_state(pgdat, NR_ISOLATED_FILE) + |
669 | node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON); | 750 | node_page_state(pgdat, NR_ISOLATED_ANON); |
670 | 751 | ||
671 | return isolated > (inactive + active) / 2; | 752 | return isolated > (inactive + active) / 2; |
672 | } | 753 | } |
@@ -693,7 +774,7 @@ static unsigned long | |||
693 | isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | 774 | isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, |
694 | unsigned long end_pfn, isolate_mode_t isolate_mode) | 775 | unsigned long end_pfn, isolate_mode_t isolate_mode) |
695 | { | 776 | { |
696 | struct zone *zone = cc->zone; | 777 | pg_data_t *pgdat = cc->zone->zone_pgdat; |
697 | unsigned long nr_scanned = 0, nr_isolated = 0; | 778 | unsigned long nr_scanned = 0, nr_isolated = 0; |
698 | struct lruvec *lruvec; | 779 | struct lruvec *lruvec; |
699 | unsigned long flags = 0; | 780 | unsigned long flags = 0; |
@@ -702,13 +783,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
702 | unsigned long start_pfn = low_pfn; | 783 | unsigned long start_pfn = low_pfn; |
703 | bool skip_on_failure = false; | 784 | bool skip_on_failure = false; |
704 | unsigned long next_skip_pfn = 0; | 785 | unsigned long next_skip_pfn = 0; |
786 | bool skip_updated = false; | ||
705 | 787 | ||
706 | /* | 788 | /* |
707 | * Ensure that there are not too many pages isolated from the LRU | 789 | * Ensure that there are not too many pages isolated from the LRU |
708 | * list by either parallel reclaimers or compaction. If there are, | 790 | * list by either parallel reclaimers or compaction. If there are, |
709 | * delay for some time until fewer pages are isolated | 791 | * delay for some time until fewer pages are isolated |
710 | */ | 792 | */ |
711 | while (unlikely(too_many_isolated(zone))) { | 793 | while (unlikely(too_many_isolated(pgdat))) { |
712 | /* async migration should just abort */ | 794 | /* async migration should just abort */ |
713 | if (cc->mode == MIGRATE_ASYNC) | 795 | if (cc->mode == MIGRATE_ASYNC) |
714 | return 0; | 796 | return 0; |
@@ -719,8 +801,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
719 | return 0; | 801 | return 0; |
720 | } | 802 | } |
721 | 803 | ||
722 | if (compact_should_abort(cc)) | 804 | cond_resched(); |
723 | return 0; | ||
724 | 805 | ||
725 | if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) { | 806 | if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) { |
726 | skip_on_failure = true; | 807 | skip_on_failure = true; |
@@ -758,8 +839,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
758 | * if contended. | 839 | * if contended. |
759 | */ | 840 | */ |
760 | if (!(low_pfn % SWAP_CLUSTER_MAX) | 841 | if (!(low_pfn % SWAP_CLUSTER_MAX) |
761 | && compact_unlock_should_abort(zone_lru_lock(zone), flags, | 842 | && compact_unlock_should_abort(&pgdat->lru_lock, |
762 | &locked, cc)) | 843 | flags, &locked, cc)) |
763 | break; | 844 | break; |
764 | 845 | ||
765 | if (!pfn_valid_within(low_pfn)) | 846 | if (!pfn_valid_within(low_pfn)) |
@@ -768,8 +849,19 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
768 | 849 | ||
769 | page = pfn_to_page(low_pfn); | 850 | page = pfn_to_page(low_pfn); |
770 | 851 | ||
771 | if (!valid_page) | 852 | /* |
853 | * Check if the pageblock has already been marked skipped. | ||
854 | * Only the aligned PFN is checked as the caller isolates | ||
855 | * COMPACT_CLUSTER_MAX at a time so the second call must | ||
856 | * not falsely conclude that the block should be skipped. | ||
857 | */ | ||
858 | if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) { | ||
859 | if (!cc->ignore_skip_hint && get_pageblock_skip(page)) { | ||
860 | low_pfn = end_pfn; | ||
861 | goto isolate_abort; | ||
862 | } | ||
772 | valid_page = page; | 863 | valid_page = page; |
864 | } | ||
773 | 865 | ||
774 | /* | 866 | /* |
775 | * Skip if free. We read page order here without zone lock | 867 | * Skip if free. We read page order here without zone lock |
@@ -818,7 +910,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
818 | if (unlikely(__PageMovable(page)) && | 910 | if (unlikely(__PageMovable(page)) && |
819 | !PageIsolated(page)) { | 911 | !PageIsolated(page)) { |
820 | if (locked) { | 912 | if (locked) { |
821 | spin_unlock_irqrestore(zone_lru_lock(zone), | 913 | spin_unlock_irqrestore(&pgdat->lru_lock, |
822 | flags); | 914 | flags); |
823 | locked = false; | 915 | locked = false; |
824 | } | 916 | } |
@@ -848,10 +940,15 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
848 | 940 | ||
849 | /* If we already hold the lock, we can skip some rechecking */ | 941 | /* If we already hold the lock, we can skip some rechecking */ |
850 | if (!locked) { | 942 | if (!locked) { |
851 | locked = compact_trylock_irqsave(zone_lru_lock(zone), | 943 | locked = compact_lock_irqsave(&pgdat->lru_lock, |
852 | &flags, cc); | 944 | &flags, cc); |
853 | if (!locked) | 945 | |
854 | break; | 946 | /* Try get exclusive access under lock */ |
947 | if (!skip_updated) { | ||
948 | skip_updated = true; | ||
949 | if (test_and_set_skip(cc, page, low_pfn)) | ||
950 | goto isolate_abort; | ||
951 | } | ||
855 | 952 | ||
856 | /* Recheck PageLRU and PageCompound under lock */ | 953 | /* Recheck PageLRU and PageCompound under lock */ |
857 | if (!PageLRU(page)) | 954 | if (!PageLRU(page)) |
@@ -868,7 +965,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
868 | } | 965 | } |
869 | } | 966 | } |
870 | 967 | ||
871 | lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); | 968 | lruvec = mem_cgroup_page_lruvec(page, pgdat); |
872 | 969 | ||
873 | /* Try isolate the page */ | 970 | /* Try isolate the page */ |
874 | if (__isolate_lru_page(page, isolate_mode) != 0) | 971 | if (__isolate_lru_page(page, isolate_mode) != 0) |
@@ -887,16 +984,13 @@ isolate_success: | |||
887 | nr_isolated++; | 984 | nr_isolated++; |
888 | 985 | ||
889 | /* | 986 | /* |
890 | * Record where we could have freed pages by migration and not | 987 | * Avoid isolating too much unless this block is being |
891 | * yet flushed them to buddy allocator. | 988 | * rescanned (e.g. dirty/writeback pages, parallel allocation) |
892 | * - this is the lowest page that was isolated and likely be | 989 | * or a lock is contended. For contention, isolate quickly to |
893 | * then freed by migration. | 990 | * potentially remove one source of contention. |
894 | */ | 991 | */ |
895 | if (!cc->last_migrated_pfn) | 992 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX && |
896 | cc->last_migrated_pfn = low_pfn; | 993 | !cc->rescan && !cc->contended) { |
897 | |||
898 | /* Avoid isolating too much */ | ||
899 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { | ||
900 | ++low_pfn; | 994 | ++low_pfn; |
901 | break; | 995 | break; |
902 | } | 996 | } |
@@ -913,12 +1007,11 @@ isolate_fail: | |||
913 | */ | 1007 | */ |
914 | if (nr_isolated) { | 1008 | if (nr_isolated) { |
915 | if (locked) { | 1009 | if (locked) { |
916 | spin_unlock_irqrestore(zone_lru_lock(zone), flags); | 1010 | spin_unlock_irqrestore(&pgdat->lru_lock, flags); |
917 | locked = false; | 1011 | locked = false; |
918 | } | 1012 | } |
919 | putback_movable_pages(&cc->migratepages); | 1013 | putback_movable_pages(&cc->migratepages); |
920 | cc->nr_migratepages = 0; | 1014 | cc->nr_migratepages = 0; |
921 | cc->last_migrated_pfn = 0; | ||
922 | nr_isolated = 0; | 1015 | nr_isolated = 0; |
923 | } | 1016 | } |
924 | 1017 | ||
@@ -939,15 +1032,23 @@ isolate_fail: | |||
939 | if (unlikely(low_pfn > end_pfn)) | 1032 | if (unlikely(low_pfn > end_pfn)) |
940 | low_pfn = end_pfn; | 1033 | low_pfn = end_pfn; |
941 | 1034 | ||
1035 | isolate_abort: | ||
942 | if (locked) | 1036 | if (locked) |
943 | spin_unlock_irqrestore(zone_lru_lock(zone), flags); | 1037 | spin_unlock_irqrestore(&pgdat->lru_lock, flags); |
944 | 1038 | ||
945 | /* | 1039 | /* |
946 | * Update the pageblock-skip information and cached scanner pfn, | 1040 | * Update the cached scanner pfn once the pageblock has been scanned. |
947 | * if the whole pageblock was scanned without isolating any page. | 1041 | * Pages will either be migrated in which case there is no point |
1042 | * scanning in the near future or migration failed in which case the | ||
1043 | * failure reason may persist. The block is marked for skipping if | ||
1044 | * there were no pages isolated in the block or if the block is | ||
1045 | * rescanned twice in a row. | ||
948 | */ | 1046 | */ |
949 | if (low_pfn == end_pfn) | 1047 | if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) { |
950 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | 1048 | if (valid_page && !skip_updated) |
1049 | set_pageblock_skip(valid_page); | ||
1050 | update_cached_migrate(cc, low_pfn); | ||
1051 | } | ||
951 | 1052 | ||
952 | trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, | 1053 | trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, |
953 | nr_scanned, nr_isolated); | 1054 | nr_scanned, nr_isolated); |
@@ -1013,6 +1114,9 @@ static bool suitable_migration_source(struct compact_control *cc, | |||
1013 | { | 1114 | { |
1014 | int block_mt; | 1115 | int block_mt; |
1015 | 1116 | ||
1117 | if (pageblock_skip_persistent(page)) | ||
1118 | return false; | ||
1119 | |||
1016 | if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) | 1120 | if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) |
1017 | return true; | 1121 | return true; |
1018 | 1122 | ||
@@ -1050,6 +1154,12 @@ static bool suitable_migration_target(struct compact_control *cc, | |||
1050 | return false; | 1154 | return false; |
1051 | } | 1155 | } |
1052 | 1156 | ||
1157 | static inline unsigned int | ||
1158 | freelist_scan_limit(struct compact_control *cc) | ||
1159 | { | ||
1160 | return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1; | ||
1161 | } | ||
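/*
 * Note: with COMPACT_CLUSTER_MAX equal to SWAP_CLUSTER_MAX (32 by
 * default), the limit decays with each fast_search_fail as
 * 33, 17, 9, 5, 3, 2, 1, 1, ... so repeated fast-search failures
 * quickly shrink the freelist scanning budget used below.
 */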
1162 | |||
1053 | /* | 1163 | /* |
1054 | * Test whether the free scanner has reached the same or lower pageblock than | 1164 | * Test whether the free scanner has reached the same or lower pageblock than |
1055 | * the migration scanner, and compaction should thus terminate. | 1165 | * the migration scanner, and compaction should thus terminate. |
@@ -1061,6 +1171,248 @@ static inline bool compact_scanners_met(struct compact_control *cc) | |||
1061 | } | 1171 | } |
1062 | 1172 | ||
1063 | /* | 1173 | /* |
1174 | * Used when scanning for a suitable migration target which scans freelists | ||
1175 | * in reverse. Reorders the list such that the unscanned pages are scanned | ||
1176 | * first on the next iteration of the free scanner. | ||
1177 | */ | ||
1178 | static void | ||
1179 | move_freelist_head(struct list_head *freelist, struct page *freepage) | ||
1180 | { | ||
1181 | LIST_HEAD(sublist); | ||
1182 | |||
1183 | if (!list_is_last(freelist, &freepage->lru)) { | ||
1184 | list_cut_before(&sublist, freelist, &freepage->lru); | ||
1185 | if (!list_empty(&sublist)) | ||
1186 | list_splice_tail(&sublist, freelist); | ||
1187 | } | ||
1188 | } | ||
1189 | |||
1190 | /* | ||
1191 | * Similar to move_freelist_head except used by the migration scanner | ||
1192 | * when scanning forward. It's possible for these list operations to | ||
1193 | * move against each other if they search the free list exactly in | ||
1194 | * lockstep. | ||
1195 | */ | ||
1196 | static void | ||
1197 | move_freelist_tail(struct list_head *freelist, struct page *freepage) | ||
1198 | { | ||
1199 | LIST_HEAD(sublist); | ||
1200 | |||
1201 | if (!list_is_first(freelist, &freepage->lru)) { | ||
1202 | list_cut_position(&sublist, freelist, &freepage->lru); | ||
1203 | if (!list_empty(&sublist)) | ||
1204 | list_splice_tail(&sublist, freelist); | ||
1205 | } | ||
1206 | } | ||
1207 | |||
1208 | static void | ||
1209 | fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated) | ||
1210 | { | ||
1211 | unsigned long start_pfn, end_pfn; | ||
1212 | struct page *page = pfn_to_page(pfn); | ||
1213 | |||
1214 | /* Do not search around if there are enough pages already */ | ||
1215 | if (cc->nr_freepages >= cc->nr_migratepages) | ||
1216 | return; | ||
1217 | |||
1218 | /* Minimise scanning during async compaction */ | ||
1219 | if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC) | ||
1220 | return; | ||
1221 | |||
1222 | /* Pageblock boundaries */ | ||
1223 | start_pfn = pageblock_start_pfn(pfn); | ||
1224 | end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone)); | ||
1225 | |||
1226 | /* Scan before */ | ||
1227 | if (start_pfn != pfn) { | ||
1228 | isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false); | ||
1229 | if (cc->nr_freepages >= cc->nr_migratepages) | ||
1230 | return; | ||
1231 | } | ||
1232 | |||
1233 | /* Scan after */ | ||
1234 | start_pfn = pfn + nr_isolated; | ||
1235 | if (start_pfn != end_pfn) | ||
1236 | isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); | ||
1237 | |||
1238 | /* Skip this pageblock in the future as it's full or nearly full */ | ||
1239 | if (cc->nr_freepages < cc->nr_migratepages) | ||
1240 | set_pageblock_skip(page); | ||
1241 | } | ||
1242 | |||
1243 | /* Search orders in round-robin fashion */ | ||
1244 | static int next_search_order(struct compact_control *cc, int order) | ||
1245 | { | ||
1246 | order--; | ||
1247 | if (order < 0) | ||
1248 | order = cc->order - 1; | ||
1249 | |||
1250 | /* Search wrapped around? */ | ||
1251 | if (order == cc->search_order) { | ||
1252 | cc->search_order--; | ||
1253 | if (cc->search_order < 0) | ||
1254 | cc->search_order = cc->order - 1; | ||
1255 | return -1; | ||
1256 | } | ||
1257 | |||
1258 | return order; | ||
1259 | } | ||
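/*
 * Note (worked example): with cc->order == 4 and cc->search_order == 2,
 * the loop in fast_isolate_freepages() visits orders 2, 1, 0, 3; the
 * next call then detects the wrap, decrements cc->search_order and
 * returns -1 to end the search.
 */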
1260 | |||
1261 | static unsigned long | ||
1262 | fast_isolate_freepages(struct compact_control *cc) | ||
1263 | { | ||
1264 | unsigned int limit = max(1U, freelist_scan_limit(cc) >> 1); | ||
1265 | unsigned int nr_scanned = 0; | ||
1266 | unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0; | ||
1267 | unsigned long nr_isolated = 0; | ||
1268 | unsigned long distance; | ||
1269 | struct page *page = NULL; | ||
1270 | bool scan_start = false; | ||
1271 | int order; | ||
1272 | |||
1273 | /* Full compaction passes in a negative order */ | ||
1274 | if (cc->order <= 0) | ||
1275 | return cc->free_pfn; | ||
1276 | |||
1277 | /* | ||
1278 | * If starting the scan, use a deeper search and use the highest | ||
1279 | * PFN found if a suitable one is not found. | ||
1280 | */ | ||
1281 | if (cc->free_pfn >= cc->zone->compact_init_free_pfn) { | ||
1282 | limit = pageblock_nr_pages >> 1; | ||
1283 | scan_start = true; | ||
1284 | } | ||
1285 | |||
1286 | /* | ||
1287 | * Preferred point is in the top quarter of the scan space but take | ||
1288 | * a pfn from the top half if the search is problematic. | ||
1289 | */ | ||
1290 | distance = (cc->free_pfn - cc->migrate_pfn); | ||
1291 | low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2)); | ||
1292 | min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1)); | ||
1293 | |||
1294 | if (WARN_ON_ONCE(min_pfn > low_pfn)) | ||
1295 | low_pfn = min_pfn; | ||
1296 | |||
1297 | /* | ||
1298 | * Search starts from the last successful isolation order or the next | ||
1299 | * order to search after a previous failure | ||
1300 | */ | ||
1301 | cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order); | ||
1302 | |||
1303 | for (order = cc->search_order; | ||
1304 | !page && order >= 0; | ||
1305 | order = next_search_order(cc, order)) { | ||
1306 | struct free_area *area = &cc->zone->free_area[order]; | ||
1307 | struct list_head *freelist; | ||
1308 | struct page *freepage; | ||
1309 | unsigned long flags; | ||
1310 | unsigned int order_scanned = 0; | ||
1311 | |||
1312 | if (!area->nr_free) | ||
1313 | continue; | ||
1314 | |||
1315 | spin_lock_irqsave(&cc->zone->lock, flags); | ||
1316 | freelist = &area->free_list[MIGRATE_MOVABLE]; | ||
1317 | list_for_each_entry_reverse(freepage, freelist, lru) { | ||
1318 | unsigned long pfn; | ||
1319 | |||
1320 | order_scanned++; | ||
1321 | nr_scanned++; | ||
1322 | pfn = page_to_pfn(freepage); | ||
1323 | |||
1324 | if (pfn >= highest) | ||
1325 | highest = pageblock_start_pfn(pfn); | ||
1326 | |||
1327 | if (pfn >= low_pfn) { | ||
1328 | cc->fast_search_fail = 0; | ||
1329 | cc->search_order = order; | ||
1330 | page = freepage; | ||
1331 | break; | ||
1332 | } | ||
1333 | |||
1334 | if (pfn >= min_pfn && pfn > high_pfn) { | ||
1335 | high_pfn = pfn; | ||
1336 | |||
1337 | /* Shorten the scan if a candidate is found */ | ||
1338 | limit >>= 1; | ||
1339 | } | ||
1340 | |||
1341 | if (order_scanned >= limit) | ||
1342 | break; | ||
1343 | } | ||
1344 | |||
1345 | /* Use a minimum pfn if a preferred one was not found */ | ||
1346 | if (!page && high_pfn) { | ||
1347 | page = pfn_to_page(high_pfn); | ||
1348 | |||
1349 | /* Update freepage for the list reorder below */ | ||
1350 | freepage = page; | ||
1351 | } | ||
1352 | |||
1353 | /* Reorder so that a future search skips recently scanned pages */ | ||
1354 | move_freelist_head(freelist, freepage); | ||
1355 | |||
1356 | /* Isolate the page if available */ | ||
1357 | if (page) { | ||
1358 | if (__isolate_free_page(page, order)) { | ||
1359 | set_page_private(page, order); | ||
1360 | nr_isolated = 1 << order; | ||
1361 | cc->nr_freepages += nr_isolated; | ||
1362 | list_add_tail(&page->lru, &cc->freepages); | ||
1363 | count_compact_events(COMPACTISOLATED, nr_isolated); | ||
1364 | } else { | ||
1365 | /* If isolation fails, abort the search */ | ||
1366 | order = -1; | ||
1367 | page = NULL; | ||
1368 | } | ||
1369 | } | ||
1370 | |||
1371 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
1372 | |||
1373 | /* | ||
1374 | * Smaller scan on the next order so the total scan is related | ||
1375 | * to freelist_scan_limit. | ||
1376 | */ | ||
1377 | if (order_scanned >= limit) | ||
1378 | limit = max(1U, limit >> 1); | ||
1379 | } | ||
1380 | |||
1381 | if (!page) { | ||
1382 | cc->fast_search_fail++; | ||
1383 | if (scan_start) { | ||
1384 | /* | ||
1385 | * Use the highest PFN found above min. If one was | ||
1386 | * not found, be pessimistic for direct compaction | ||
1387 | * and use the min mark. | ||
1388 | */ | ||
1389 | if (highest) { | ||
1390 | page = pfn_to_page(highest); | ||
1391 | cc->free_pfn = highest; | ||
1392 | } else { | ||
1393 | if (cc->direct_compaction) { | ||
1394 | page = pfn_to_page(min_pfn); | ||
1395 | cc->free_pfn = min_pfn; | ||
1396 | } | ||
1397 | } | ||
1398 | } | ||
1399 | } | ||
1400 | |||
1401 | if (highest && highest >= cc->zone->compact_cached_free_pfn) { | ||
1402 | highest -= pageblock_nr_pages; | ||
1403 | cc->zone->compact_cached_free_pfn = highest; | ||
1404 | } | ||
1405 | |||
1406 | cc->total_free_scanned += nr_scanned; | ||
1407 | if (!page) | ||
1408 | return cc->free_pfn; | ||
1409 | |||
1410 | low_pfn = page_to_pfn(page); | ||
1411 | fast_isolate_around(cc, low_pfn, nr_isolated); | ||
1412 | return low_pfn; | ||
1413 | } | ||
1414 | |||
1415 | /* | ||
1064 | * Based on information in the current compact_control, find blocks | 1416 | * Based on information in the current compact_control, find blocks |
1065 | * suitable for isolating free pages from and then isolate them. | 1417 | * suitable for isolating free pages from and then isolate them. |
1066 | */ | 1418 | */ |
@@ -1073,6 +1425,12 @@ static void isolate_freepages(struct compact_control *cc) | |||
1073 | unsigned long block_end_pfn; /* end of current pageblock */ | 1425 | unsigned long block_end_pfn; /* end of current pageblock */ |
1074 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ | 1426 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ |
1075 | struct list_head *freelist = &cc->freepages; | 1427 | struct list_head *freelist = &cc->freepages; |
1428 | unsigned int stride; | ||
1429 | |||
1430 | /* Try a small search of the free lists for a candidate */ | ||
1431 | isolate_start_pfn = fast_isolate_freepages(cc); | ||
1432 | if (cc->nr_freepages) | ||
1433 | goto splitmap; | ||
1076 | 1434 | ||
1077 | /* | 1435 | /* |
1078 | * Initialise the free scanner. The starting point is where we last | 1436 | * Initialise the free scanner. The starting point is where we last |
@@ -1086,10 +1444,11 @@ static void isolate_freepages(struct compact_control *cc) | |||
1086 | * is using. | 1444 | * is using. |
1087 | */ | 1445 | */ |
1088 | isolate_start_pfn = cc->free_pfn; | 1446 | isolate_start_pfn = cc->free_pfn; |
1089 | block_start_pfn = pageblock_start_pfn(cc->free_pfn); | 1447 | block_start_pfn = pageblock_start_pfn(isolate_start_pfn); |
1090 | block_end_pfn = min(block_start_pfn + pageblock_nr_pages, | 1448 | block_end_pfn = min(block_start_pfn + pageblock_nr_pages, |
1091 | zone_end_pfn(zone)); | 1449 | zone_end_pfn(zone)); |
1092 | low_pfn = pageblock_end_pfn(cc->migrate_pfn); | 1450 | low_pfn = pageblock_end_pfn(cc->migrate_pfn); |
1451 | stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1; | ||
1093 | 1452 | ||
1094 | /* | 1453 | /* |
1095 | * Isolate free pages until enough are available to migrate the | 1454 | * Isolate free pages until enough are available to migrate the |
@@ -1100,14 +1459,14 @@ static void isolate_freepages(struct compact_control *cc) | |||
1100 | block_end_pfn = block_start_pfn, | 1459 | block_end_pfn = block_start_pfn, |
1101 | block_start_pfn -= pageblock_nr_pages, | 1460 | block_start_pfn -= pageblock_nr_pages, |
1102 | isolate_start_pfn = block_start_pfn) { | 1461 | isolate_start_pfn = block_start_pfn) { |
1462 | unsigned long nr_isolated; | ||
1463 | |||
1103 | /* | 1464 | /* |
1104 | * This can iterate a massively long zone without finding any | 1465 | * This can iterate a massively long zone without finding any |
1105 | * suitable migration targets, so periodically check if we need | 1466 | * suitable migration targets, so periodically check resched. |
1106 | * to schedule, or even abort async compaction. | ||
1107 | */ | 1467 | */ |
1108 | if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) | 1468 | if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) |
1109 | && compact_should_abort(cc)) | 1469 | cond_resched(); |
1110 | break; | ||
1111 | 1470 | ||
1112 | page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, | 1471 | page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, |
1113 | zone); | 1472 | zone); |
@@ -1123,15 +1482,15 @@ static void isolate_freepages(struct compact_control *cc) | |||
1123 | continue; | 1482 | continue; |
1124 | 1483 | ||
1125 | /* Found a block suitable for isolating free pages from. */ | 1484 | /* Found a block suitable for isolating free pages from. */ |
1126 | isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn, | 1485 | nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn, |
1127 | freelist, false); | 1486 | block_end_pfn, freelist, stride, false); |
1128 | 1487 | ||
1129 | /* | 1488 | /* Update the skip hint if the full pageblock was scanned */ |
1130 | * If we isolated enough freepages, or aborted due to lock | 1489 | if (isolate_start_pfn == block_end_pfn) |
1131 | * contention, terminate. | 1490 | update_pageblock_skip(cc, page, block_start_pfn); |
1132 | */ | 1491 | |
1133 | if ((cc->nr_freepages >= cc->nr_migratepages) | 1492 | /* Are enough freepages isolated? */ |
1134 | || cc->contended) { | 1493 | if (cc->nr_freepages >= cc->nr_migratepages) { |
1135 | if (isolate_start_pfn >= block_end_pfn) { | 1494 | if (isolate_start_pfn >= block_end_pfn) { |
1136 | /* | 1495 | /* |
1137 | * Restart at previous pageblock if more | 1496 | * Restart at previous pageblock if more |
@@ -1148,10 +1507,14 @@ static void isolate_freepages(struct compact_control *cc) | |||
1148 | */ | 1507 | */ |
1149 | break; | 1508 | break; |
1150 | } | 1509 | } |
1151 | } | ||
1152 | 1510 | ||
1153 | /* __isolate_free_page() does not map the pages */ | 1511 | /* Adjust stride depending on isolation */ |
1154 | map_pages(freelist); | 1512 | if (nr_isolated) { |
1513 | stride = 1; | ||
1514 | continue; | ||
1515 | } | ||
1516 | stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1); | ||
1517 | } | ||
1155 | 1518 | ||
1156 | /* | 1519 | /* |
1157 | * Record where the free scanner will restart next time. Either we | 1520 | * Record where the free scanner will restart next time. Either we |
@@ -1160,6 +1523,10 @@ static void isolate_freepages(struct compact_control *cc) | |||
1160 | * and the loop terminated due to isolate_start_pfn < low_pfn | 1523 | * and the loop terminated due to isolate_start_pfn < low_pfn |
1161 | */ | 1524 | */ |
1162 | cc->free_pfn = isolate_start_pfn; | 1525 | cc->free_pfn = isolate_start_pfn; |
1526 | |||
1527 | splitmap: | ||
1528 | /* __isolate_free_page() does not map the pages */ | ||
1529 | split_map_pages(freelist); | ||
1163 | } | 1530 | } |
1164 | 1531 | ||
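The isolate_freepages() loop above adapts its stride: async compaction starts at COMPACT_CLUSTER_MAX, drops to a dense scan once a pageblock yields isolations, and doubles the stride (capped) over unproductive blocks. A hedged standalone model of just that adaptation, with illustrative names:

    /* Standalone sketch of the stride adaptation: scan densely where free pages
     * are found, stride faster over empty pageblocks. COMPACT_CLUSTER_MAX mirrors
     * the kernel constant; the rest is illustrative. */
    #include <stdio.h>

    #define COMPACT_CLUSTER_MAX 32

    static unsigned int next_stride(unsigned int stride, unsigned long nr_isolated)
    {
        if (nr_isolated)
            return 1;                 /* productive block: scan densely */
        stride <<= 1;                 /* empty block: skip ahead faster */
        return stride > COMPACT_CLUSTER_MAX ? COMPACT_CLUSTER_MAX : stride;
    }

    int main(void)
    {
        unsigned int stride = COMPACT_CLUSTER_MAX; /* async starting stride */

        stride = next_stride(stride, 0);  /* stays capped at 32 */
        stride = next_stride(stride, 8);  /* drops to 1 after isolating */
        stride = next_stride(stride, 0);  /* doubles back up: 2 */
        printf("final stride: %u\n", stride);
        return 0;
    }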
1165 | /* | 1532 | /* |
@@ -1172,13 +1539,8 @@ static struct page *compaction_alloc(struct page *migratepage, | |||
1172 | struct compact_control *cc = (struct compact_control *)data; | 1539 | struct compact_control *cc = (struct compact_control *)data; |
1173 | struct page *freepage; | 1540 | struct page *freepage; |
1174 | 1541 | ||
1175 | /* | ||
1176 | * Isolate free pages if necessary, and if we are not aborting due to | ||
1177 | * contention. | ||
1178 | */ | ||
1179 | if (list_empty(&cc->freepages)) { | 1542 | if (list_empty(&cc->freepages)) { |
1180 | if (!cc->contended) | 1543 | isolate_freepages(cc); |
1181 | isolate_freepages(cc); | ||
1182 | 1544 | ||
1183 | if (list_empty(&cc->freepages)) | 1545 | if (list_empty(&cc->freepages)) |
1184 | return NULL; | 1546 | return NULL; |
@@ -1217,6 +1579,147 @@ typedef enum { | |||
1217 | */ | 1579 | */ |
1218 | int sysctl_compact_unevictable_allowed __read_mostly = 1; | 1580 | int sysctl_compact_unevictable_allowed __read_mostly = 1; |
1219 | 1581 | ||
1582 | static inline void | ||
1583 | update_fast_start_pfn(struct compact_control *cc, unsigned long pfn) | ||
1584 | { | ||
1585 | if (cc->fast_start_pfn == ULONG_MAX) | ||
1586 | return; | ||
1587 | |||
1588 | if (!cc->fast_start_pfn) | ||
1589 | cc->fast_start_pfn = pfn; | ||
1590 | |||
1591 | cc->fast_start_pfn = min(cc->fast_start_pfn, pfn); | ||
1592 | } | ||
1593 | |||
1594 | static inline unsigned long | ||
1595 | reinit_migrate_pfn(struct compact_control *cc) | ||
1596 | { | ||
1597 | if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX) | ||
1598 | return cc->migrate_pfn; | ||
1599 | |||
1600 | cc->migrate_pfn = cc->fast_start_pfn; | ||
1601 | cc->fast_start_pfn = ULONG_MAX; | ||
1602 | |||
1603 | return cc->migrate_pfn; | ||
1604 | } | ||
1605 | |||
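update_fast_start_pfn() and reinit_migrate_pfn() above remember the lowest PFN the fast search ever placed the migrate scanner at, and hand it back once as a linear-scan starting point when fast searching stops paying off. A standalone model of that bookkeeping, under assumed names and plain types:

    /* Standalone model of the fast_start_pfn bookkeeping: keep the lowest
     * recorded starting PFN, consume it once, then disable further updates.
     * Names are illustrative. */
    #include <limits.h>
    #include <stdio.h>

    struct scan_state {
        unsigned long fast_start_pfn; /* 0 = unset, ULONG_MAX = consumed */
        unsigned long migrate_pfn;
    };

    static void note_fast_start(struct scan_state *s, unsigned long pfn)
    {
        if (s->fast_start_pfn == ULONG_MAX)
            return;                              /* already consumed once */
        if (!s->fast_start_pfn || pfn < s->fast_start_pfn)
            s->fast_start_pfn = pfn;             /* keep the lowest starting point */
    }

    static unsigned long reinit_from_fast_start(struct scan_state *s)
    {
        if (!s->fast_start_pfn || s->fast_start_pfn == ULONG_MAX)
            return s->migrate_pfn;               /* nothing recorded: keep position */
        s->migrate_pfn = s->fast_start_pfn;
        s->fast_start_pfn = ULONG_MAX;           /* one-shot fallback */
        return s->migrate_pfn;
    }

    int main(void)
    {
        struct scan_state s = { .fast_start_pfn = 0, .migrate_pfn = 8192 };

        note_fast_start(&s, 4096);
        note_fast_start(&s, 2048);
        printf("restart at %lu\n", reinit_from_fast_start(&s)); /* 2048 */
        return 0;
    }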
1606 | /* | ||
1607 | * Briefly search the free lists for a migration source that already has | ||
1608 | * some free pages to reduce the number of pages that need migration | ||
1609 | * before a pageblock is free. | ||
1610 | */ | ||
1611 | static unsigned long fast_find_migrateblock(struct compact_control *cc) | ||
1612 | { | ||
1613 | unsigned int limit = freelist_scan_limit(cc); | ||
1614 | unsigned int nr_scanned = 0; | ||
1615 | unsigned long distance; | ||
1616 | unsigned long pfn = cc->migrate_pfn; | ||
1617 | unsigned long high_pfn; | ||
1618 | int order; | ||
1619 | |||
1620 | /* Skip hints are relied on to avoid repeats on the fast search */ | ||
1621 | if (cc->ignore_skip_hint) | ||
1622 | return pfn; | ||
1623 | |||
1624 | /* | ||
1625 | * If the migrate_pfn is not at the start of a zone or the start | ||
1626 | * of a pageblock then assume this is a continuation of a previous | ||
1627 | * scan restarted due to COMPACT_CLUSTER_MAX. | ||
1628 | */ | ||
1629 | if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn)) | ||
1630 | return pfn; | ||
1631 | |||
1632 | /* | ||
1633 | * For smaller orders, just linearly scan as the number of pages | ||
1634 | * to migrate should be relatively small and does not necessarily | ||
1635 | * justify freeing up a large block for a small allocation. | ||
1636 | */ | ||
1637 | if (cc->order <= PAGE_ALLOC_COSTLY_ORDER) | ||
1638 | return pfn; | ||
1639 | |||
1640 | /* | ||
1641 | * Only allow kcompactd and direct requests for movable pages to | ||
1642 | * quickly clear out a MOVABLE pageblock for allocation. This | ||
1643 | * reduces the risk that a large movable pageblock is freed for | ||
1644 | * an unmovable/reclaimable small allocation. | ||
1645 | */ | ||
1646 | if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE) | ||
1647 | return pfn; | ||
1648 | |||
1649 | /* | ||
1650 | * When starting the migration scanner, pick any pageblock within the | ||
1651 | * first half of the search space. Otherwise try and pick a pageblock | ||
1652 | * within the first eighth to reduce the chances that a migration | ||
1653 | * target later becomes a source. | ||
1654 | */ | ||
1655 | distance = (cc->free_pfn - cc->migrate_pfn) >> 1; | ||
1656 | if (cc->migrate_pfn != cc->zone->zone_start_pfn) | ||
1657 | distance >>= 2; | ||
1658 | high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance); | ||
1659 | |||
1660 | for (order = cc->order - 1; | ||
1661 | order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit; | ||
1662 | order--) { | ||
1663 | struct free_area *area = &cc->zone->free_area[order]; | ||
1664 | struct list_head *freelist; | ||
1665 | unsigned long flags; | ||
1666 | struct page *freepage; | ||
1667 | |||
1668 | if (!area->nr_free) | ||
1669 | continue; | ||
1670 | |||
1671 | spin_lock_irqsave(&cc->zone->lock, flags); | ||
1672 | freelist = &area->free_list[MIGRATE_MOVABLE]; | ||
1673 | list_for_each_entry(freepage, freelist, lru) { | ||
1674 | unsigned long free_pfn; | ||
1675 | |||
1676 | nr_scanned++; | ||
1677 | free_pfn = page_to_pfn(freepage); | ||
1678 | if (free_pfn < high_pfn) { | ||
1679 | /* | ||
1680 | * Avoid if skipped recently. Ideally it would | ||
1681 | * move to the tail but even safe iteration of | ||
1682 | * the list assumes an entry is deleted, not | ||
1683 | * reordered. | ||
1684 | */ | ||
1685 | if (get_pageblock_skip(freepage)) { | ||
1686 | if (list_is_last(freelist, &freepage->lru)) | ||
1687 | break; | ||
1688 | |||
1689 | continue; | ||
1690 | } | ||
1691 | |||
1692 | /* Reorder so a future search skips recent pages */ | ||
1693 | move_freelist_tail(freelist, freepage); | ||
1694 | |||
1695 | update_fast_start_pfn(cc, free_pfn); | ||
1696 | pfn = pageblock_start_pfn(free_pfn); | ||
1697 | cc->fast_search_fail = 0; | ||
1698 | set_pageblock_skip(freepage); | ||
1699 | break; | ||
1700 | } | ||
1701 | |||
1702 | if (nr_scanned >= limit) { | ||
1703 | cc->fast_search_fail++; | ||
1704 | move_freelist_tail(freelist, freepage); | ||
1705 | break; | ||
1706 | } | ||
1707 | } | ||
1708 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
1709 | } | ||
1710 | |||
1711 | cc->total_migrate_scanned += nr_scanned; | ||
1712 | |||
1713 | /* | ||
1714 | * If fast scanning failed then use a cached entry for a page block | ||
1715 | * that had free pages as the basis for starting a linear scan. | ||
1716 | */ | ||
1717 | if (pfn == cc->migrate_pfn) | ||
1718 | pfn = reinit_migrate_pfn(cc); | ||
1719 | |||
1720 | return pfn; | ||
1721 | } | ||
1722 | |||
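The window computed above limits how far ahead of the migrate scanner the fast search may pick a block: half of the gap to the free scanner on the first pass, an eighth afterwards, so migration targets are less likely to later become sources. A standalone sketch of that arithmetic, with an assumed pageblock size:

    /* Standalone sketch of the fast_find_migrateblock() search window.
     * The pageblock size is an illustrative assumption. */
    #include <stdio.h>

    #define PAGEBLOCK_NR_PAGES 512UL  /* assumed pageblock size in pages */

    static unsigned long search_window_end(unsigned long migrate_pfn,
                                           unsigned long free_pfn,
                                           unsigned long zone_start_pfn)
    {
        unsigned long distance = (free_pfn - migrate_pfn) >> 1; /* first half */

        if (migrate_pfn != zone_start_pfn)
            distance >>= 2;                                     /* first eighth */
        return (migrate_pfn + distance) & ~(PAGEBLOCK_NR_PAGES - 1);
    }

    int main(void)
    {
        /* First pass: scanner still at the zone start, window is half the gap. */
        printf("%lu\n", search_window_end(0, 1UL << 20, 0));    /* 524288 */
        /* Later pass: scanner has moved, window shrinks to an eighth. */
        printf("%lu\n", search_window_end(4096, 1UL << 20, 0)); /* 134656 */
        return 0;
    }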
1220 | /* | 1723 | /* |
1221 | * Isolate all pages that can be migrated from the first suitable block, | 1724 | * Isolate all pages that can be migrated from the first suitable block, |
1222 | * starting at the block pointed to by the migrate scanner pfn within | 1725 | * starting at the block pointed to by the migrate scanner pfn within |
@@ -1232,16 +1735,25 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1232 | const isolate_mode_t isolate_mode = | 1735 | const isolate_mode_t isolate_mode = |
1233 | (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | | 1736 | (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | |
1234 | (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0); | 1737 | (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0); |
1738 | bool fast_find_block; | ||
1235 | 1739 | ||
1236 | /* | 1740 | /* |
1237 | * Start at where we last stopped, or beginning of the zone as | 1741 | * Start at where we last stopped, or beginning of the zone as |
1238 | * initialized by compact_zone() | 1742 | * initialized by compact_zone(). The first failure will use |
1743 | * the lowest PFN as the starting point for linear scanning. | ||
1239 | */ | 1744 | */ |
1240 | low_pfn = cc->migrate_pfn; | 1745 | low_pfn = fast_find_migrateblock(cc); |
1241 | block_start_pfn = pageblock_start_pfn(low_pfn); | 1746 | block_start_pfn = pageblock_start_pfn(low_pfn); |
1242 | if (block_start_pfn < zone->zone_start_pfn) | 1747 | if (block_start_pfn < zone->zone_start_pfn) |
1243 | block_start_pfn = zone->zone_start_pfn; | 1748 | block_start_pfn = zone->zone_start_pfn; |
1244 | 1749 | ||
1750 | /* | ||
1751 | * fast_find_migrateblock marks a pageblock skipped so to avoid | ||
1752 | * the isolation_suitable check below, check whether the fast | ||
1753 | * search was successful. | ||
1754 | */ | ||
1755 | fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail; | ||
1756 | |||
1245 | /* Only scan within a pageblock boundary */ | 1757 | /* Only scan within a pageblock boundary */ |
1246 | block_end_pfn = pageblock_end_pfn(low_pfn); | 1758 | block_end_pfn = pageblock_end_pfn(low_pfn); |
1247 | 1759 | ||
@@ -1250,6 +1762,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1250 | * Do not cross the free scanner. | 1762 | * Do not cross the free scanner. |
1251 | */ | 1763 | */ |
1252 | for (; block_end_pfn <= cc->free_pfn; | 1764 | for (; block_end_pfn <= cc->free_pfn; |
1765 | fast_find_block = false, | ||
1253 | low_pfn = block_end_pfn, | 1766 | low_pfn = block_end_pfn, |
1254 | block_start_pfn = block_end_pfn, | 1767 | block_start_pfn = block_end_pfn, |
1255 | block_end_pfn += pageblock_nr_pages) { | 1768 | block_end_pfn += pageblock_nr_pages) { |
@@ -1257,34 +1770,45 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1257 | /* | 1770 | /* |
1258 | * This can potentially iterate a massively long zone with | 1771 | * This can potentially iterate a massively long zone with |
1259 | * many pageblocks unsuitable, so periodically check if we | 1772 | * many pageblocks unsuitable, so periodically check if we |
1260 | * need to schedule, or even abort async compaction. | 1773 | * need to schedule. |
1261 | */ | 1774 | */ |
1262 | if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) | 1775 | if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) |
1263 | && compact_should_abort(cc)) | 1776 | cond_resched(); |
1264 | break; | ||
1265 | 1777 | ||
1266 | page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, | 1778 | page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, |
1267 | zone); | 1779 | zone); |
1268 | if (!page) | 1780 | if (!page) |
1269 | continue; | 1781 | continue; |
1270 | 1782 | ||
1271 | /* If isolation recently failed, do not retry */ | 1783 | /* |
1272 | if (!isolation_suitable(cc, page)) | 1784 | * If isolation recently failed, do not retry. Only check the |
1785 | * pageblock once. COMPACT_CLUSTER_MAX causes a pageblock | ||
1786 | * to be visited multiple times. Assume skip was checked | ||
1787 | * before making it "skip" so other compaction instances do | ||
1788 | * not scan the same block. | ||
1789 | */ | ||
1790 | if (IS_ALIGNED(low_pfn, pageblock_nr_pages) && | ||
1791 | !fast_find_block && !isolation_suitable(cc, page)) | ||
1273 | continue; | 1792 | continue; |
1274 | 1793 | ||
1275 | /* | 1794 | /* |
1276 | * For async compaction, also only scan in MOVABLE blocks. | 1795 | * For async compaction, also only scan in MOVABLE blocks |
1277 | * Async compaction is optimistic to see if the minimum amount | 1796 | * without huge pages. Async compaction is optimistic to see |
1278 | * of work satisfies the allocation. | 1797 | * if the minimum amount of work satisfies the allocation. |
1798 | * The cached PFN is updated as it's possible that all | ||
1799 | * remaining blocks between source and target are unsuitable | ||
1800 | * and the compaction scanners fail to meet. | ||
1279 | */ | 1801 | */ |
1280 | if (!suitable_migration_source(cc, page)) | 1802 | if (!suitable_migration_source(cc, page)) { |
1803 | update_cached_migrate(cc, block_end_pfn); | ||
1281 | continue; | 1804 | continue; |
1805 | } | ||
1282 | 1806 | ||
1283 | /* Perform the isolation */ | 1807 | /* Perform the isolation */ |
1284 | low_pfn = isolate_migratepages_block(cc, low_pfn, | 1808 | low_pfn = isolate_migratepages_block(cc, low_pfn, |
1285 | block_end_pfn, isolate_mode); | 1809 | block_end_pfn, isolate_mode); |
1286 | 1810 | ||
1287 | if (!low_pfn || cc->contended) | 1811 | if (!low_pfn) |
1288 | return ISOLATE_ABORT; | 1812 | return ISOLATE_ABORT; |
1289 | 1813 | ||
1290 | /* | 1814 | /* |
@@ -1310,19 +1834,16 @@ static inline bool is_via_compact_memory(int order) | |||
1310 | return order == -1; | 1834 | return order == -1; |
1311 | } | 1835 | } |
1312 | 1836 | ||
1313 | static enum compact_result __compact_finished(struct zone *zone, | 1837 | static enum compact_result __compact_finished(struct compact_control *cc) |
1314 | struct compact_control *cc) | ||
1315 | { | 1838 | { |
1316 | unsigned int order; | 1839 | unsigned int order; |
1317 | const int migratetype = cc->migratetype; | 1840 | const int migratetype = cc->migratetype; |
1318 | 1841 | int ret; | |
1319 | if (cc->contended || fatal_signal_pending(current)) | ||
1320 | return COMPACT_CONTENDED; | ||
1321 | 1842 | ||
1322 | /* Compaction run completes if the migrate and free scanner meet */ | 1843 | /* Compaction run completes if the migrate and free scanner meet */ |
1323 | if (compact_scanners_met(cc)) { | 1844 | if (compact_scanners_met(cc)) { |
1324 | /* Let the next compaction start anew. */ | 1845 | /* Let the next compaction start anew. */ |
1325 | reset_cached_positions(zone); | 1846 | reset_cached_positions(cc->zone); |
1326 | 1847 | ||
1327 | /* | 1848 | /* |
1328 | * Mark that the PG_migrate_skip information should be cleared | 1849 | * Mark that the PG_migrate_skip information should be cleared |
@@ -1331,7 +1852,7 @@ static enum compact_result __compact_finished(struct zone *zone, | |||
1331 | * based on an allocation request. | 1852 | * based on an allocation request. |
1332 | */ | 1853 | */ |
1333 | if (cc->direct_compaction) | 1854 | if (cc->direct_compaction) |
1334 | zone->compact_blockskip_flush = true; | 1855 | cc->zone->compact_blockskip_flush = true; |
1335 | 1856 | ||
1336 | if (cc->whole_zone) | 1857 | if (cc->whole_zone) |
1337 | return COMPACT_COMPLETE; | 1858 | return COMPACT_COMPLETE; |
@@ -1342,20 +1863,19 @@ static enum compact_result __compact_finished(struct zone *zone, | |||
1342 | if (is_via_compact_memory(cc->order)) | 1863 | if (is_via_compact_memory(cc->order)) |
1343 | return COMPACT_CONTINUE; | 1864 | return COMPACT_CONTINUE; |
1344 | 1865 | ||
1345 | if (cc->finishing_block) { | 1866 | /* |
1346 | /* | 1867 | * Always finish scanning a pageblock to reduce the possibility of |
1347 | * We have finished the pageblock, but better check again that | 1868 | * fallbacks in the future. This is particularly important when |
1348 | * we really succeeded. | 1869 | * migration source is unmovable/reclaimable but it's not worth |
1349 | */ | 1870 | * special casing. |
1350 | if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) | 1871 | */ |
1351 | cc->finishing_block = false; | 1872 | if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) |
1352 | else | 1873 | return COMPACT_CONTINUE; |
1353 | return COMPACT_CONTINUE; | ||
1354 | } | ||
1355 | 1874 | ||
1356 | /* Direct compactor: Is a suitable page free? */ | 1875 | /* Direct compactor: Is a suitable page free? */ |
1876 | ret = COMPACT_NO_SUITABLE_PAGE; | ||
1357 | for (order = cc->order; order < MAX_ORDER; order++) { | 1877 | for (order = cc->order; order < MAX_ORDER; order++) { |
1358 | struct free_area *area = &zone->free_area[order]; | 1878 | struct free_area *area = &cc->zone->free_area[order]; |
1359 | bool can_steal; | 1879 | bool can_steal; |
1360 | 1880 | ||
1361 | /* Job done if page is free of the right migratetype */ | 1881 | /* Job done if page is free of the right migratetype */ |
@@ -1393,21 +1913,23 @@ static enum compact_result __compact_finished(struct zone *zone, | |||
1393 | return COMPACT_SUCCESS; | 1913 | return COMPACT_SUCCESS; |
1394 | } | 1914 | } |
1395 | 1915 | ||
1396 | cc->finishing_block = true; | 1916 | ret = COMPACT_CONTINUE; |
1397 | return COMPACT_CONTINUE; | 1917 | break; |
1398 | } | 1918 | } |
1399 | } | 1919 | } |
1400 | 1920 | ||
1401 | return COMPACT_NO_SUITABLE_PAGE; | 1921 | if (cc->contended || fatal_signal_pending(current)) |
1922 | ret = COMPACT_CONTENDED; | ||
1923 | |||
1924 | return ret; | ||
1402 | } | 1925 | } |
1403 | 1926 | ||
1404 | static enum compact_result compact_finished(struct zone *zone, | 1927 | static enum compact_result compact_finished(struct compact_control *cc) |
1405 | struct compact_control *cc) | ||
1406 | { | 1928 | { |
1407 | int ret; | 1929 | int ret; |
1408 | 1930 | ||
1409 | ret = __compact_finished(zone, cc); | 1931 | ret = __compact_finished(cc); |
1410 | trace_mm_compaction_finished(zone, cc->order, ret); | 1932 | trace_mm_compaction_finished(cc->zone, cc->order, ret); |
1411 | if (ret == COMPACT_NO_SUITABLE_PAGE) | 1933 | if (ret == COMPACT_NO_SUITABLE_PAGE) |
1412 | ret = COMPACT_CONTINUE; | 1934 | ret = COMPACT_CONTINUE; |
1413 | 1935 | ||
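The alignment test above keeps compaction going until the migrate scanner reaches a pageblock boundary, so a partially scanned block is always finished before success is reported. A trivial standalone equivalent of that check, with an assumed pageblock size:

    /* Standalone sketch: "are we mid-pageblock?" check used before allowing
     * compact_finished() to report success. Pageblock size is assumed. */
    #include <stdbool.h>
    #include <stdio.h>

    #define PAGEBLOCK_NR_PAGES 512UL

    static bool mid_pageblock(unsigned long migrate_pfn)
    {
        /* Equivalent to !IS_ALIGNED(migrate_pfn, PAGEBLOCK_NR_PAGES). */
        return migrate_pfn & (PAGEBLOCK_NR_PAGES - 1);
    }

    int main(void)
    {
        printf("%d\n", mid_pageblock(1024)); /* 0: on a boundary, may finish */
        printf("%d\n", mid_pageblock(1030)); /* 1: keep compacting this block */
        return 0;
    }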
@@ -1534,15 +2056,18 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, | |||
1534 | return false; | 2056 | return false; |
1535 | } | 2057 | } |
1536 | 2058 | ||
1537 | static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) | 2059 | static enum compact_result |
2060 | compact_zone(struct compact_control *cc, struct capture_control *capc) | ||
1538 | { | 2061 | { |
1539 | enum compact_result ret; | 2062 | enum compact_result ret; |
1540 | unsigned long start_pfn = zone->zone_start_pfn; | 2063 | unsigned long start_pfn = cc->zone->zone_start_pfn; |
1541 | unsigned long end_pfn = zone_end_pfn(zone); | 2064 | unsigned long end_pfn = zone_end_pfn(cc->zone); |
2065 | unsigned long last_migrated_pfn; | ||
1542 | const bool sync = cc->mode != MIGRATE_ASYNC; | 2066 | const bool sync = cc->mode != MIGRATE_ASYNC; |
2067 | bool update_cached; | ||
1543 | 2068 | ||
1544 | cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); | 2069 | cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); |
1545 | ret = compaction_suitable(zone, cc->order, cc->alloc_flags, | 2070 | ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, |
1546 | cc->classzone_idx); | 2071 | cc->classzone_idx); |
1547 | /* Compaction is likely to fail */ | 2072 | /* Compaction is likely to fail */ |
1548 | if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) | 2073 | if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED) |
@@ -1555,8 +2080,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro | |||
1555 | * Clear pageblock skip if there were failures recently and compaction | 2080 | * Clear pageblock skip if there were failures recently and compaction |
1556 | * is about to be retried after being deferred. | 2081 | * is about to be retried after being deferred. |
1557 | */ | 2082 | */ |
1558 | if (compaction_restarting(zone, cc->order)) | 2083 | if (compaction_restarting(cc->zone, cc->order)) |
1559 | __reset_isolation_suitable(zone); | 2084 | __reset_isolation_suitable(cc->zone); |
1560 | 2085 | ||
1561 | /* | 2086 | /* |
1562 | * Setup to move all movable pages to the end of the zone. Used cached | 2087 | * Setup to move all movable pages to the end of the zone. Used cached |
@@ -1564,43 +2089,76 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro | |||
1564 | * want to compact the whole zone), but check that it is initialised | 2089 | * want to compact the whole zone), but check that it is initialised |
1565 | * by ensuring the values are within zone boundaries. | 2090 | * by ensuring the values are within zone boundaries. |
1566 | */ | 2091 | */ |
2092 | cc->fast_start_pfn = 0; | ||
1567 | if (cc->whole_zone) { | 2093 | if (cc->whole_zone) { |
1568 | cc->migrate_pfn = start_pfn; | 2094 | cc->migrate_pfn = start_pfn; |
1569 | cc->free_pfn = pageblock_start_pfn(end_pfn - 1); | 2095 | cc->free_pfn = pageblock_start_pfn(end_pfn - 1); |
1570 | } else { | 2096 | } else { |
1571 | cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; | 2097 | cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync]; |
1572 | cc->free_pfn = zone->compact_cached_free_pfn; | 2098 | cc->free_pfn = cc->zone->compact_cached_free_pfn; |
1573 | if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { | 2099 | if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { |
1574 | cc->free_pfn = pageblock_start_pfn(end_pfn - 1); | 2100 | cc->free_pfn = pageblock_start_pfn(end_pfn - 1); |
1575 | zone->compact_cached_free_pfn = cc->free_pfn; | 2101 | cc->zone->compact_cached_free_pfn = cc->free_pfn; |
1576 | } | 2102 | } |
1577 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { | 2103 | if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { |
1578 | cc->migrate_pfn = start_pfn; | 2104 | cc->migrate_pfn = start_pfn; |
1579 | zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; | 2105 | cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; |
1580 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; | 2106 | cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; |
1581 | } | 2107 | } |
1582 | 2108 | ||
1583 | if (cc->migrate_pfn == start_pfn) | 2109 | if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn) |
1584 | cc->whole_zone = true; | 2110 | cc->whole_zone = true; |
1585 | } | 2111 | } |
1586 | 2112 | ||
1587 | cc->last_migrated_pfn = 0; | 2113 | last_migrated_pfn = 0; |
2114 | |||
2115 | /* | ||
2116 | * Migrate has separate cached PFNs for ASYNC and SYNC* migration on | ||
2117 | * the basis that some migrations will fail in ASYNC mode. However, | ||
2118 | * if the cached PFNs match and pageblocks are skipped due to having | ||
2119 | * no isolation candidates, then the sync state does not matter. | ||
2120 | * Until a pageblock with isolation candidates is found, keep the | ||
2121 | * cached PFNs in sync to avoid revisiting the same blocks. | ||
2122 | */ | ||
2123 | update_cached = !sync && | ||
2124 | cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1]; | ||
1588 | 2125 | ||
1589 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, | 2126 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, |
1590 | cc->free_pfn, end_pfn, sync); | 2127 | cc->free_pfn, end_pfn, sync); |
1591 | 2128 | ||
1592 | migrate_prep_local(); | 2129 | migrate_prep_local(); |
1593 | 2130 | ||
1594 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { | 2131 | while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) { |
1595 | int err; | 2132 | int err; |
2133 | unsigned long start_pfn = cc->migrate_pfn; | ||
2134 | |||
2135 | /* | ||
2136 | * Avoid multiple rescans which can happen if a page cannot be | ||
2137 | * isolated (dirty/writeback in async mode) or if the migrated | ||
2138 | * pages are being allocated before the pageblock is cleared. | ||
2139 | * The first rescan will capture the entire pageblock for | ||
2140 | * migration. If it fails, it'll be marked skip and scanning | ||
2141 | * will proceed as normal. | ||
2142 | */ | ||
2143 | cc->rescan = false; | ||
2144 | if (pageblock_start_pfn(last_migrated_pfn) == | ||
2145 | pageblock_start_pfn(start_pfn)) { | ||
2146 | cc->rescan = true; | ||
2147 | } | ||
1596 | 2148 | ||
1597 | switch (isolate_migratepages(zone, cc)) { | 2149 | switch (isolate_migratepages(cc->zone, cc)) { |
1598 | case ISOLATE_ABORT: | 2150 | case ISOLATE_ABORT: |
1599 | ret = COMPACT_CONTENDED; | 2151 | ret = COMPACT_CONTENDED; |
1600 | putback_movable_pages(&cc->migratepages); | 2152 | putback_movable_pages(&cc->migratepages); |
1601 | cc->nr_migratepages = 0; | 2153 | cc->nr_migratepages = 0; |
2154 | last_migrated_pfn = 0; | ||
1602 | goto out; | 2155 | goto out; |
1603 | case ISOLATE_NONE: | 2156 | case ISOLATE_NONE: |
2157 | if (update_cached) { | ||
2158 | cc->zone->compact_cached_migrate_pfn[1] = | ||
2159 | cc->zone->compact_cached_migrate_pfn[0]; | ||
2160 | } | ||
2161 | |||
1604 | /* | 2162 | /* |
1605 | * We haven't isolated and migrated anything, but | 2163 | * We haven't isolated and migrated anything, but |
1606 | * there might still be unflushed migrations from | 2164 | * there might still be unflushed migrations from |
@@ -1608,6 +2166,8 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro | |||
1608 | */ | 2166 | */ |
1609 | goto check_drain; | 2167 | goto check_drain; |
1610 | case ISOLATE_SUCCESS: | 2168 | case ISOLATE_SUCCESS: |
2169 | update_cached = false; | ||
2170 | last_migrated_pfn = start_pfn; | ||
1611 | ; | 2171 | ; |
1612 | } | 2172 | } |
1613 | 2173 | ||
@@ -1639,8 +2199,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro | |||
1639 | cc->migrate_pfn = block_end_pfn( | 2199 | cc->migrate_pfn = block_end_pfn( |
1640 | cc->migrate_pfn - 1, cc->order); | 2200 | cc->migrate_pfn - 1, cc->order); |
1641 | /* Draining pcplists is useless in this case */ | 2201 | /* Draining pcplists is useless in this case */ |
1642 | cc->last_migrated_pfn = 0; | 2202 | last_migrated_pfn = 0; |
1643 | |||
1644 | } | 2203 | } |
1645 | } | 2204 | } |
1646 | 2205 | ||
@@ -1652,21 +2211,26 @@ check_drain: | |||
1652 | * compact_finished() can detect immediately if allocation | 2211 | * compact_finished() can detect immediately if allocation |
1653 | * would succeed. | 2212 | * would succeed. |
1654 | */ | 2213 | */ |
1655 | if (cc->order > 0 && cc->last_migrated_pfn) { | 2214 | if (cc->order > 0 && last_migrated_pfn) { |
1656 | int cpu; | 2215 | int cpu; |
1657 | unsigned long current_block_start = | 2216 | unsigned long current_block_start = |
1658 | block_start_pfn(cc->migrate_pfn, cc->order); | 2217 | block_start_pfn(cc->migrate_pfn, cc->order); |
1659 | 2218 | ||
1660 | if (cc->last_migrated_pfn < current_block_start) { | 2219 | if (last_migrated_pfn < current_block_start) { |
1661 | cpu = get_cpu(); | 2220 | cpu = get_cpu(); |
1662 | lru_add_drain_cpu(cpu); | 2221 | lru_add_drain_cpu(cpu); |
1663 | drain_local_pages(zone); | 2222 | drain_local_pages(cc->zone); |
1664 | put_cpu(); | 2223 | put_cpu(); |
1665 | /* No more flushing until we migrate again */ | 2224 | /* No more flushing until we migrate again */ |
1666 | cc->last_migrated_pfn = 0; | 2225 | last_migrated_pfn = 0; |
1667 | } | 2226 | } |
1668 | } | 2227 | } |
1669 | 2228 | ||
2229 | /* Stop if a page has been captured */ | ||
2230 | if (capc && capc->page) { | ||
2231 | ret = COMPACT_SUCCESS; | ||
2232 | break; | ||
2233 | } | ||
1670 | } | 2234 | } |
1671 | 2235 | ||
1672 | out: | 2236 | out: |
@@ -1685,8 +2249,8 @@ out: | |||
1685 | * Only go back, not forward. The cached pfn might have been | 2249 | * Only go back, not forward. The cached pfn might have been |
1686 | * already reset to zone end in compact_finished() | 2250 | * already reset to zone end in compact_finished() |
1687 | */ | 2251 | */ |
1688 | if (free_pfn > zone->compact_cached_free_pfn) | 2252 | if (free_pfn > cc->zone->compact_cached_free_pfn) |
1689 | zone->compact_cached_free_pfn = free_pfn; | 2253 | cc->zone->compact_cached_free_pfn = free_pfn; |
1690 | } | 2254 | } |
1691 | 2255 | ||
1692 | count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned); | 2256 | count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned); |
@@ -1700,7 +2264,8 @@ out: | |||
1700 | 2264 | ||
1701 | static enum compact_result compact_zone_order(struct zone *zone, int order, | 2265 | static enum compact_result compact_zone_order(struct zone *zone, int order, |
1702 | gfp_t gfp_mask, enum compact_priority prio, | 2266 | gfp_t gfp_mask, enum compact_priority prio, |
1703 | unsigned int alloc_flags, int classzone_idx) | 2267 | unsigned int alloc_flags, int classzone_idx, |
2268 | struct page **capture) | ||
1704 | { | 2269 | { |
1705 | enum compact_result ret; | 2270 | enum compact_result ret; |
1706 | struct compact_control cc = { | 2271 | struct compact_control cc = { |
@@ -1709,6 +2274,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, | |||
1709 | .total_migrate_scanned = 0, | 2274 | .total_migrate_scanned = 0, |
1710 | .total_free_scanned = 0, | 2275 | .total_free_scanned = 0, |
1711 | .order = order, | 2276 | .order = order, |
2277 | .search_order = order, | ||
1712 | .gfp_mask = gfp_mask, | 2278 | .gfp_mask = gfp_mask, |
1713 | .zone = zone, | 2279 | .zone = zone, |
1714 | .mode = (prio == COMPACT_PRIO_ASYNC) ? | 2280 | .mode = (prio == COMPACT_PRIO_ASYNC) ? |
@@ -1720,14 +2286,24 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, | |||
1720 | .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), | 2286 | .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY), |
1721 | .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY) | 2287 | .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY) |
1722 | }; | 2288 | }; |
2289 | struct capture_control capc = { | ||
2290 | .cc = &cc, | ||
2291 | .page = NULL, | ||
2292 | }; | ||
2293 | |||
2294 | if (capture) | ||
2295 | current->capture_control = &capc; | ||
1723 | INIT_LIST_HEAD(&cc.freepages); | 2296 | INIT_LIST_HEAD(&cc.freepages); |
1724 | INIT_LIST_HEAD(&cc.migratepages); | 2297 | INIT_LIST_HEAD(&cc.migratepages); |
1725 | 2298 | ||
1726 | ret = compact_zone(zone, &cc); | 2299 | ret = compact_zone(&cc, &capc); |
1727 | 2300 | ||
1728 | VM_BUG_ON(!list_empty(&cc.freepages)); | 2301 | VM_BUG_ON(!list_empty(&cc.freepages)); |
1729 | VM_BUG_ON(!list_empty(&cc.migratepages)); | 2302 | VM_BUG_ON(!list_empty(&cc.migratepages)); |
1730 | 2303 | ||
2304 | *capture = capc.page; | ||
2305 | current->capture_control = NULL; | ||
2306 | |||
1731 | return ret; | 2307 | return ret; |
1732 | } | 2308 | } |
1733 | 2309 | ||
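compact_zone_order() above publishes a capture_control through current->capture_control, lets compaction run, and then reads any captured page back through *capture. A standalone model of that handshake (the structs and helpers here are illustrative, not the kernel's):

    /* Standalone model of the page-capture handshake: the caller publishes a
     * capture slot, compaction may fill it, and the caller reads it back. */
    #include <stddef.h>
    #include <stdio.h>

    struct fake_page { unsigned long pfn; };

    struct capture_slot {
        struct fake_page *page;   /* filled if a suitable page frees up */
    };

    /* Stands in for the free path noticing a suitable page during compaction. */
    static void maybe_capture(struct capture_slot *capc, struct fake_page *page)
    {
        if (capc && !capc->page)
            capc->page = page;
    }

    /* Stands in for compact_zone(): stop early once a page was captured. */
    static int model_compact(struct capture_slot *capc)
    {
        static struct fake_page freed = { .pfn = 12345 };

        maybe_capture(capc, &freed);
        return capc && capc->page ? 1 : 0;  /* 1 ~ COMPACT_SUCCESS */
    }

    int main(void)
    {
        struct capture_slot capc = { .page = NULL };
        int ret = model_compact(&capc);

        printf("ret=%d captured pfn=%lu\n", ret, capc.page ? capc.page->pfn : 0);
        return 0;
    }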
@@ -1745,7 +2321,7 @@ int sysctl_extfrag_threshold = 500; | |||
1745 | */ | 2321 | */ |
1746 | enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, | 2322 | enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, |
1747 | unsigned int alloc_flags, const struct alloc_context *ac, | 2323 | unsigned int alloc_flags, const struct alloc_context *ac, |
1748 | enum compact_priority prio) | 2324 | enum compact_priority prio, struct page **capture) |
1749 | { | 2325 | { |
1750 | int may_perform_io = gfp_mask & __GFP_IO; | 2326 | int may_perform_io = gfp_mask & __GFP_IO; |
1751 | struct zoneref *z; | 2327 | struct zoneref *z; |
@@ -1773,7 +2349,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, | |||
1773 | } | 2349 | } |
1774 | 2350 | ||
1775 | status = compact_zone_order(zone, order, gfp_mask, prio, | 2351 | status = compact_zone_order(zone, order, gfp_mask, prio, |
1776 | alloc_flags, ac_classzone_idx(ac)); | 2352 | alloc_flags, ac_classzone_idx(ac), capture); |
1777 | rc = max(status, rc); | 2353 | rc = max(status, rc); |
1778 | 2354 | ||
1779 | /* The allocation should succeed, stop compacting */ | 2355 | /* The allocation should succeed, stop compacting */ |
@@ -1841,7 +2417,7 @@ static void compact_node(int nid) | |||
1841 | INIT_LIST_HEAD(&cc.freepages); | 2417 | INIT_LIST_HEAD(&cc.freepages); |
1842 | INIT_LIST_HEAD(&cc.migratepages); | 2418 | INIT_LIST_HEAD(&cc.migratepages); |
1843 | 2419 | ||
1844 | compact_zone(zone, &cc); | 2420 | compact_zone(&cc, NULL); |
1845 | 2421 | ||
1846 | VM_BUG_ON(!list_empty(&cc.freepages)); | 2422 | VM_BUG_ON(!list_empty(&cc.freepages)); |
1847 | VM_BUG_ON(!list_empty(&cc.migratepages)); | 2423 | VM_BUG_ON(!list_empty(&cc.migratepages)); |
@@ -1876,14 +2452,6 @@ int sysctl_compaction_handler(struct ctl_table *table, int write, | |||
1876 | return 0; | 2452 | return 0; |
1877 | } | 2453 | } |
1878 | 2454 | ||
1879 | int sysctl_extfrag_handler(struct ctl_table *table, int write, | ||
1880 | void __user *buffer, size_t *length, loff_t *ppos) | ||
1881 | { | ||
1882 | proc_dointvec_minmax(table, write, buffer, length, ppos); | ||
1883 | |||
1884 | return 0; | ||
1885 | } | ||
1886 | |||
1887 | #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) | 2455 | #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) |
1888 | static ssize_t sysfs_compact_node(struct device *dev, | 2456 | static ssize_t sysfs_compact_node(struct device *dev, |
1889 | struct device_attribute *attr, | 2457 | struct device_attribute *attr, |
@@ -1948,6 +2516,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) | |||
1948 | struct zone *zone; | 2516 | struct zone *zone; |
1949 | struct compact_control cc = { | 2517 | struct compact_control cc = { |
1950 | .order = pgdat->kcompactd_max_order, | 2518 | .order = pgdat->kcompactd_max_order, |
2519 | .search_order = pgdat->kcompactd_max_order, | ||
1951 | .total_migrate_scanned = 0, | 2520 | .total_migrate_scanned = 0, |
1952 | .total_free_scanned = 0, | 2521 | .total_free_scanned = 0, |
1953 | .classzone_idx = pgdat->kcompactd_classzone_idx, | 2522 | .classzone_idx = pgdat->kcompactd_classzone_idx, |
@@ -1983,7 +2552,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) | |||
1983 | 2552 | ||
1984 | if (kthread_should_stop()) | 2553 | if (kthread_should_stop()) |
1985 | return; | 2554 | return; |
1986 | status = compact_zone(zone, &cc); | 2555 | status = compact_zone(&cc, NULL); |
1987 | 2556 | ||
1988 | if (status == COMPACT_SUCCESS) { | 2557 | if (status == COMPACT_SUCCESS) { |
1989 | compaction_defer_reset(zone, cc.order, false); | 2558 | compaction_defer_reset(zone, cc.order, false); |
diff --git a/mm/dmapool.c b/mm/dmapool.c index 6d4b97e7e9e9..76a160083506 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c | |||
@@ -114,10 +114,9 @@ static DEVICE_ATTR(pools, 0444, show_pools, NULL); | |||
114 | * @size: size of the blocks in this pool. | 114 | * @size: size of the blocks in this pool. |
115 | * @align: alignment requirement for blocks; must be a power of two | 115 | * @align: alignment requirement for blocks; must be a power of two |
116 | * @boundary: returned blocks won't cross this power of two boundary | 116 | * @boundary: returned blocks won't cross this power of two boundary |
117 | * Context: !in_interrupt() | 117 | * Context: not in_interrupt() |
118 | * | 118 | * |
119 | * Returns a dma allocation pool with the requested characteristics, or | 119 | * Given one of these pools, dma_pool_alloc() |
120 | * null if one can't be created. Given one of these pools, dma_pool_alloc() | ||
121 | * may be used to allocate memory. Such memory will all have "consistent" | 120 | * may be used to allocate memory. Such memory will all have "consistent" |
122 | * DMA mappings, accessible by the device and its driver without using | 121 | * DMA mappings, accessible by the device and its driver without using |
123 | * cache flushing primitives. The actual size of blocks allocated may be | 122 | * cache flushing primitives. The actual size of blocks allocated may be |
@@ -127,6 +126,9 @@ static DEVICE_ATTR(pools, 0444, show_pools, NULL); | |||
127 | * cross that size boundary. This is useful for devices which have | 126 | * cross that size boundary. This is useful for devices which have |
128 | * addressing restrictions on individual DMA transfers, such as not crossing | 127 | * addressing restrictions on individual DMA transfers, such as not crossing |
129 | * boundaries of 4KBytes. | 128 | * boundaries of 4KBytes. |
129 | * | ||
130 | * Return: a dma allocation pool with the requested characteristics, or | ||
131 | * %NULL if one can't be created. | ||
130 | */ | 132 | */ |
131 | struct dma_pool *dma_pool_create(const char *name, struct device *dev, | 133 | struct dma_pool *dma_pool_create(const char *name, struct device *dev, |
132 | size_t size, size_t align, size_t boundary) | 134 | size_t size, size_t align, size_t boundary) |
@@ -313,7 +315,7 @@ EXPORT_SYMBOL(dma_pool_destroy); | |||
313 | * @mem_flags: GFP_* bitmask | 315 | * @mem_flags: GFP_* bitmask |
314 | * @handle: pointer to dma address of block | 316 | * @handle: pointer to dma address of block |
315 | * | 317 | * |
316 | * This returns the kernel virtual address of a currently unused block, | 318 | * Return: the kernel virtual address of a currently unused block, |
317 | * and reports its dma address through the handle. | 319 | * and reports its dma address through the handle. |
318 | * If such a memory block can't be allocated, %NULL is returned. | 320 | * If such a memory block can't be allocated, %NULL is returned. |
319 | */ | 321 | */ |
@@ -498,6 +500,9 @@ static int dmam_pool_match(struct device *dev, void *res, void *match_data) | |||
498 | * | 500 | * |
499 | * Managed dma_pool_create(). DMA pool created with this function is | 501 | * Managed dma_pool_create(). DMA pool created with this function is |
500 | * automatically destroyed on driver detach. | 502 | * automatically destroyed on driver detach. |
503 | * | ||
504 | * Return: a managed dma allocation pool with the requested | ||
505 | * characteristics, or %NULL if one can't be created. | ||
501 | */ | 506 | */ |
502 | struct dma_pool *dmam_pool_create(const char *name, struct device *dev, | 507 | struct dma_pool *dmam_pool_create(const char *name, struct device *dev, |
503 | size_t size, size_t align, size_t allocation) | 508 | size_t size, size_t align, size_t allocation) |
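As a hedged usage sketch of the dma_pool API whose kerneldoc is touched above (the device pointer, sizes and error handling are illustrative; only the dma_pool_* calls are the documented interface):

    /*
     * Hypothetical driver-side sketch of the dma_pool API: create a pool of
     * small, boundary-constrained DMA blocks, allocate one, and tear down.
     */
    #include <linux/dmapool.h>
    #include <linux/device.h>
    #include <linux/errno.h>
    #include <linux/gfp.h>

    static int example_setup_descriptors(struct device *dev)
    {
        struct dma_pool *pool;
        dma_addr_t handle;
        void *desc;

        /* 64-byte blocks, 16-byte aligned, never crossing a 4KiB boundary. */
        pool = dma_pool_create("example-desc", dev, 64, 16, 4096);
        if (!pool)
            return -ENOMEM;

        desc = dma_pool_alloc(pool, GFP_KERNEL, &handle);
        if (!desc) {
            dma_pool_destroy(pool);
            return -ENOMEM;
        }

        /* ... program the device with "handle", use "desc" from the CPU ... */

        dma_pool_free(pool, desc, handle);
        dma_pool_destroy(pool);
        return 0;
    }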
diff --git a/mm/failslab.c b/mm/failslab.c index b135ebb88b6f..ec5aad211c5b 100644 --- a/mm/failslab.c +++ b/mm/failslab.c | |||
@@ -48,18 +48,12 @@ static int __init failslab_debugfs_init(void) | |||
48 | if (IS_ERR(dir)) | 48 | if (IS_ERR(dir)) |
49 | return PTR_ERR(dir); | 49 | return PTR_ERR(dir); |
50 | 50 | ||
51 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, | 51 | debugfs_create_bool("ignore-gfp-wait", mode, dir, |
52 | &failslab.ignore_gfp_reclaim)) | 52 | &failslab.ignore_gfp_reclaim); |
53 | goto fail; | 53 | debugfs_create_bool("cache-filter", mode, dir, |
54 | if (!debugfs_create_bool("cache-filter", mode, dir, | 54 | &failslab.cache_filter); |
55 | &failslab.cache_filter)) | ||
56 | goto fail; | ||
57 | 55 | ||
58 | return 0; | 56 | return 0; |
59 | fail: | ||
60 | debugfs_remove_recursive(dir); | ||
61 | |||
62 | return -ENOMEM; | ||
63 | } | 57 | } |
64 | 58 | ||
65 | late_initcall(failslab_debugfs_init); | 59 | late_initcall(failslab_debugfs_init); |
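The failslab change above switches to creating the debugfs knobs without checking their return values. A hypothetical module-side sketch of the same pattern (the directory name and backing variable are assumptions):

    /*
     * Hypothetical sketch: create debugfs knobs and ignore the return values,
     * since a missing debugfs entry is not worth failing initialisation for.
     */
    #include <linux/debugfs.h>
    #include <linux/err.h>
    #include <linux/init.h>

    static bool example_ignore_gfp_wait;

    static int __init example_debugfs_init(void)
    {
        struct dentry *dir;

        dir = debugfs_create_dir("example-fault", NULL);
        if (IS_ERR(dir))
            return PTR_ERR(dir);

        /* No error checking: debugfs failures are non-fatal by design. */
        debugfs_create_bool("ignore-gfp-wait", 0600, dir,
                            &example_ignore_gfp_wait);
        return 0;
    }
    late_initcall(example_debugfs_init);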
diff --git a/mm/filemap.c b/mm/filemap.c index 9f5e323e883e..a3b4021c448f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -98,8 +98,8 @@ | |||
98 | * ->swap_lock (try_to_unmap_one) | 98 | * ->swap_lock (try_to_unmap_one) |
99 | * ->private_lock (try_to_unmap_one) | 99 | * ->private_lock (try_to_unmap_one) |
100 | * ->i_pages lock (try_to_unmap_one) | 100 | * ->i_pages lock (try_to_unmap_one) |
101 | * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) | 101 | * ->pgdat->lru_lock (follow_page->mark_page_accessed) |
102 | * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) | 102 | * ->pgdat->lru_lock (check_pte_range->isolate_lru_page) |
103 | * ->private_lock (page_remove_rmap->set_page_dirty) | 103 | * ->private_lock (page_remove_rmap->set_page_dirty) |
104 | * ->i_pages lock (page_remove_rmap->set_page_dirty) | 104 | * ->i_pages lock (page_remove_rmap->set_page_dirty) |
105 | * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) | 105 | * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) |
@@ -392,6 +392,8 @@ static int filemap_check_and_keep_errors(struct address_space *mapping) | |||
392 | * opposed to a regular memory cleansing writeback. The difference between | 392 | * opposed to a regular memory cleansing writeback. The difference between |
393 | * these two operations is that if a dirty page/buffer is encountered, it must | 393 | * these two operations is that if a dirty page/buffer is encountered, it must |
394 | * be waited upon, and not just skipped over. | 394 | * be waited upon, and not just skipped over. |
395 | * | ||
396 | * Return: %0 on success, negative error code otherwise. | ||
395 | */ | 397 | */ |
396 | int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | 398 | int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, |
397 | loff_t end, int sync_mode) | 399 | loff_t end, int sync_mode) |
@@ -438,6 +440,8 @@ EXPORT_SYMBOL(filemap_fdatawrite_range); | |||
438 | * | 440 | * |
439 | * This is a mostly non-blocking flush. Not suitable for data-integrity | 441 | * This is a mostly non-blocking flush. Not suitable for data-integrity |
440 | * purposes - I/O may not be started against all dirty pages. | 442 | * purposes - I/O may not be started against all dirty pages. |
443 | * | ||
444 | * Return: %0 on success, negative error code otherwise. | ||
441 | */ | 445 | */ |
442 | int filemap_flush(struct address_space *mapping) | 446 | int filemap_flush(struct address_space *mapping) |
443 | { | 447 | { |
@@ -453,6 +457,9 @@ EXPORT_SYMBOL(filemap_flush); | |||
453 | * | 457 | * |
454 | * Find at least one page in the range supplied, usually used to check if | 458 | * Find at least one page in the range supplied, usually used to check if |
455 | * direct writing in this range will trigger a writeback. | 459 | * direct writing in this range will trigger a writeback. |
460 | * | ||
461 | * Return: %true if at least one page exists in the specified range, | ||
462 | * %false otherwise. | ||
456 | */ | 463 | */ |
457 | bool filemap_range_has_page(struct address_space *mapping, | 464 | bool filemap_range_has_page(struct address_space *mapping, |
458 | loff_t start_byte, loff_t end_byte) | 465 | loff_t start_byte, loff_t end_byte) |
@@ -529,6 +536,8 @@ static void __filemap_fdatawait_range(struct address_space *mapping, | |||
529 | * Since the error status of the address space is cleared by this function, | 536 | * Since the error status of the address space is cleared by this function, |
530 | * callers are responsible for checking the return value and handling and/or | 537 | * callers are responsible for checking the return value and handling and/or |
531 | * reporting the error. | 538 | * reporting the error. |
539 | * | ||
540 | * Return: error status of the address space. | ||
532 | */ | 541 | */ |
533 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, | 542 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, |
534 | loff_t end_byte) | 543 | loff_t end_byte) |
@@ -551,6 +560,8 @@ EXPORT_SYMBOL(filemap_fdatawait_range); | |||
551 | * Since the error status of the file is advanced by this function, | 560 | * Since the error status of the file is advanced by this function, |
552 | * callers are responsible for checking the return value and handling and/or | 561 | * callers are responsible for checking the return value and handling and/or |
553 | * reporting the error. | 562 | * reporting the error. |
563 | * | ||
564 | * Return: error status of the address space vs. the file->f_wb_err cursor. | ||
554 | */ | 565 | */ |
555 | int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) | 566 | int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte) |
556 | { | 567 | { |
@@ -572,6 +583,8 @@ EXPORT_SYMBOL(file_fdatawait_range); | |||
572 | * Use this function if callers don't handle errors themselves. Expected | 583 | * Use this function if callers don't handle errors themselves. Expected |
573 | * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), | 584 | * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2), |
574 | * fsfreeze(8) | 585 | * fsfreeze(8) |
586 | * | ||
587 | * Return: error status of the address space. | ||
575 | */ | 588 | */ |
576 | int filemap_fdatawait_keep_errors(struct address_space *mapping) | 589 | int filemap_fdatawait_keep_errors(struct address_space *mapping) |
577 | { | 590 | { |
@@ -623,6 +636,8 @@ EXPORT_SYMBOL(filemap_write_and_wait); | |||
623 | * | 636 | * |
624 | * Note that @lend is inclusive (describes the last byte to be written) so | 637 | * Note that @lend is inclusive (describes the last byte to be written) so |
625 | * that this function can be used to write to the very end-of-file (end = -1). | 638 | * that this function can be used to write to the very end-of-file (end = -1). |
639 | * | ||
640 | * Return: error status of the address space. | ||
626 | */ | 641 | */ |
627 | int filemap_write_and_wait_range(struct address_space *mapping, | 642 | int filemap_write_and_wait_range(struct address_space *mapping, |
628 | loff_t lstart, loff_t lend) | 643 | loff_t lstart, loff_t lend) |
@@ -678,6 +693,8 @@ EXPORT_SYMBOL(__filemap_set_wb_err); | |||
678 | * While we handle mapping->wb_err with atomic operations, the f_wb_err | 693 | * While we handle mapping->wb_err with atomic operations, the f_wb_err |
679 | * value is protected by the f_lock since we must ensure that it reflects | 694 | * value is protected by the f_lock since we must ensure that it reflects |
680 | * the latest value swapped in for this file descriptor. | 695 | * the latest value swapped in for this file descriptor. |
696 | * | ||
697 | * Return: %0 on success, negative error code otherwise. | ||
681 | */ | 698 | */ |
682 | int file_check_and_advance_wb_err(struct file *file) | 699 | int file_check_and_advance_wb_err(struct file *file) |
683 | { | 700 | { |
@@ -720,6 +737,8 @@ EXPORT_SYMBOL(file_check_and_advance_wb_err); | |||
720 | * | 737 | * |
721 | * After writing out and waiting on the data, we check and advance the | 738 | * After writing out and waiting on the data, we check and advance the |
722 | * f_wb_err cursor to the latest value, and return any errors detected there. | 739 | * f_wb_err cursor to the latest value, and return any errors detected there. |
740 | * | ||
741 | * Return: %0 on success, negative error code otherwise. | ||
723 | */ | 742 | */ |
724 | int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) | 743 | int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) |
725 | { | 744 | { |
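A hypothetical ->fsync() sketch showing how the writeback helpers documented above are typically combined: flush and wait on the data range, then handle filesystem metadata separately. The inode locking and metadata step are placeholders.

    /*
     * Hypothetical ->fsync() sketch: write out and wait on data, reporting any
     * prior writeback error via the f_wb_err cursor, then do metadata work.
     */
    #include <linux/fs.h>

    static int example_fsync(struct file *file, loff_t start, loff_t end,
                             int datasync)
    {
        struct inode *inode = file_inode(file);
        int err;

        /* Writes out dirty data and reports any prior writeback error. */
        err = file_write_and_wait_range(file, start, end);
        if (err)
            return err;

        inode_lock(inode);
        /* ... write the filesystem's metadata / commit its journal here ... */
        inode_unlock(inode);

        return 0;
    }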
@@ -753,6 +772,8 @@ EXPORT_SYMBOL(file_write_and_wait_range); | |||
753 | * caller must do that. | 772 | * caller must do that. |
754 | * | 773 | * |
755 | * The remove + add is atomic. This function cannot fail. | 774 | * The remove + add is atomic. This function cannot fail. |
775 | * | ||
776 | * Return: %0 | ||
756 | */ | 777 | */ |
757 | int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) | 778 | int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) |
758 | { | 779 | { |
@@ -867,6 +888,8 @@ error: | |||
867 | * | 888 | * |
868 | * This function is used to add a page to the pagecache. It must be locked. | 889 | * This function is used to add a page to the pagecache. It must be locked. |
869 | * This function does not add the page to the LRU. The caller must do that. | 890 | * This function does not add the page to the LRU. The caller must do that. |
891 | * | ||
892 | * Return: %0 on success, negative error code otherwise. | ||
870 | */ | 893 | */ |
871 | int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | 894 | int add_to_page_cache_locked(struct page *page, struct address_space *mapping, |
872 | pgoff_t offset, gfp_t gfp_mask) | 895 | pgoff_t offset, gfp_t gfp_mask) |
@@ -1463,7 +1486,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); | |||
1463 | * If the slot holds a shadow entry of a previously evicted page, or a | 1486 | * If the slot holds a shadow entry of a previously evicted page, or a |
1464 | * swap entry from shmem/tmpfs, it is returned. | 1487 | * swap entry from shmem/tmpfs, it is returned. |
1465 | * | 1488 | * |
1466 | * Otherwise, %NULL is returned. | 1489 | * Return: the found page or shadow entry, %NULL if nothing is found. |
1467 | */ | 1490 | */ |
1468 | struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) | 1491 | struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) |
1469 | { | 1492 | { |
@@ -1521,9 +1544,9 @@ EXPORT_SYMBOL(find_get_entry); | |||
1521 | * If the slot holds a shadow entry of a previously evicted page, or a | 1544 | * If the slot holds a shadow entry of a previously evicted page, or a |
1522 | * swap entry from shmem/tmpfs, it is returned. | 1545 | * swap entry from shmem/tmpfs, it is returned. |
1523 | * | 1546 | * |
1524 | * Otherwise, %NULL is returned. | ||
1525 | * | ||
1526 | * find_lock_entry() may sleep. | 1547 | * find_lock_entry() may sleep. |
1548 | * | ||
1549 | * Return: the found page or shadow entry, %NULL if nothing is found. | ||
1527 | */ | 1550 | */ |
1528 | struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) | 1551 | struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) |
1529 | { | 1552 | { |
@@ -1563,12 +1586,14 @@ EXPORT_SYMBOL(find_lock_entry); | |||
1563 | * - FGP_CREAT: If page is not present then a new page is allocated using | 1586 | * - FGP_CREAT: If page is not present then a new page is allocated using |
1564 | * @gfp_mask and added to the page cache and the VM's LRU | 1587 | * @gfp_mask and added to the page cache and the VM's LRU |
1565 | * list. The page is returned locked and with an increased | 1588 | * list. The page is returned locked and with an increased |
1566 | * refcount. Otherwise, NULL is returned. | 1589 | * refcount. |
1567 | * | 1590 | * |
1568 | * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even | 1591 | * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even |
1569 | * if the GFP flags specified for FGP_CREAT are atomic. | 1592 | * if the GFP flags specified for FGP_CREAT are atomic. |
1570 | * | 1593 | * |
1571 | * If there is a page cache page, it is returned with an increased refcount. | 1594 | * If there is a page cache page, it is returned with an increased refcount. |
1595 | * | ||
1596 | * Return: the found page or %NULL otherwise. | ||
1572 | */ | 1597 | */ |
1573 | struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, | 1598 | struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, |
1574 | int fgp_flags, gfp_t gfp_mask) | 1599 | int fgp_flags, gfp_t gfp_mask) |
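A hedged sketch of the pagecache_get_page() contract documented above: with FGP_LOCK | FGP_CREAT the page comes back locked with an elevated refcount (or NULL on allocation failure), so the caller is responsible for unlocking and dropping it. The mapping, index and GFP choice are assumptions.

    /*
     * Hypothetical caller of pagecache_get_page(): find-or-create a locked
     * page, touch it, then release lock and reference.
     */
    #include <linux/pagemap.h>
    #include <linux/mm.h>
    #include <linux/errno.h>

    static int example_touch_page(struct address_space *mapping, pgoff_t index)
    {
        struct page *page;

        page = pagecache_get_page(mapping, index,
                                  FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                  mapping_gfp_mask(mapping));
        if (!page)
            return -ENOMEM;

        /* ... initialise or modify the locked page here ... */

        unlock_page(page);
        put_page(page);
        return 0;
    }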
@@ -1656,8 +1681,7 @@ EXPORT_SYMBOL(pagecache_get_page); | |||
1656 | * Any shadow entries of evicted pages, or swap entries from | 1681 | * Any shadow entries of evicted pages, or swap entries from |
1657 | * shmem/tmpfs, are included in the returned array. | 1682 | * shmem/tmpfs, are included in the returned array. |
1658 | * | 1683 | * |
1659 | * find_get_entries() returns the number of pages and shadow entries | 1684 | * Return: the number of pages and shadow entries which were found. |
1660 | * which were found. | ||
1661 | */ | 1685 | */ |
1662 | unsigned find_get_entries(struct address_space *mapping, | 1686 | unsigned find_get_entries(struct address_space *mapping, |
1663 | pgoff_t start, unsigned int nr_entries, | 1687 | pgoff_t start, unsigned int nr_entries, |
@@ -1727,8 +1751,8 @@ retry: | |||
1727 | * indexes. There may be holes in the indices due to not-present pages. | 1751 | * indexes. There may be holes in the indices due to not-present pages. |
1728 | * We also update @start to index the next page for the traversal. | 1752 | * We also update @start to index the next page for the traversal. |
1729 | * | 1753 | * |
1730 | * find_get_pages_range() returns the number of pages which were found. If this | 1754 | * Return: the number of pages which were found. If this number is |
1731 | * number is smaller than @nr_pages, the end of specified range has been | 1755 | * smaller than @nr_pages, the end of specified range has been |
1732 | * reached. | 1756 | * reached. |
1733 | */ | 1757 | */ |
1734 | unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, | 1758 | unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, |
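A hedged sketch of batched iteration with find_get_pages_range() as documented above: each returned page carries a reference and @start is advanced by the callee, so the caller simply loops until a short batch and drops each reference. The per-page work is a placeholder.

    /*
     * Hypothetical range walk: fetch pages in batches, process each, and drop
     * the reference taken by find_get_pages_range().
     */
    #include <linux/pagemap.h>
    #include <linux/pagevec.h>
    #include <linux/mm.h>

    static void example_walk_range(struct address_space *mapping,
                                   pgoff_t start, pgoff_t end)
    {
        struct page *pages[PAGEVEC_SIZE];
        unsigned int i, nr;

        do {
            nr = find_get_pages_range(mapping, &start, end,
                                      PAGEVEC_SIZE, pages);
            for (i = 0; i < nr; i++) {
                /* ... inspect pages[i] here ... */
                put_page(pages[i]);
            }
        } while (nr == PAGEVEC_SIZE);
    }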
@@ -1765,7 +1789,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, | |||
1765 | 1789 | ||
1766 | pages[ret] = page; | 1790 | pages[ret] = page; |
1767 | if (++ret == nr_pages) { | 1791 | if (++ret == nr_pages) { |
1768 | *start = page->index + 1; | 1792 | *start = xas.xa_index + 1; |
1769 | goto out; | 1793 | goto out; |
1770 | } | 1794 | } |
1771 | continue; | 1795 | continue; |
@@ -1801,7 +1825,7 @@ out: | |||
1801 | * find_get_pages_contig() works exactly like find_get_pages(), except | 1825 | * find_get_pages_contig() works exactly like find_get_pages(), except |
1802 | * that the returned number of pages are guaranteed to be contiguous. | 1826 | * that the returned number of pages are guaranteed to be contiguous. |
1803 | * | 1827 | * |
1804 | * find_get_pages_contig() returns the number of pages which were found. | 1828 | * Return: the number of pages which were found. |
1805 | */ | 1829 | */ |
1806 | unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | 1830 | unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, |
1807 | unsigned int nr_pages, struct page **pages) | 1831 | unsigned int nr_pages, struct page **pages) |
@@ -1837,16 +1861,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
1837 | if (unlikely(page != xas_reload(&xas))) | 1861 | if (unlikely(page != xas_reload(&xas))) |
1838 | goto put_page; | 1862 | goto put_page; |
1839 | 1863 | ||
1840 | /* | ||
1841 | * must check mapping and index after taking the ref. | ||
1842 | * otherwise we can get both false positives and false | ||
1843 | * negatives, which is just confusing to the caller. | ||
1844 | */ | ||
1845 | if (!page->mapping || page_to_pgoff(page) != xas.xa_index) { | ||
1846 | put_page(page); | ||
1847 | break; | ||
1848 | } | ||
1849 | |||
1850 | pages[ret] = page; | 1864 | pages[ret] = page; |
1851 | if (++ret == nr_pages) | 1865 | if (++ret == nr_pages) |
1852 | break; | 1866 | break; |
@@ -1872,6 +1886,8 @@ EXPORT_SYMBOL(find_get_pages_contig); | |||
1872 | * | 1886 | * |
1873 | * Like find_get_pages, except we only return pages which are tagged with | 1887 | * Like find_get_pages, except we only return pages which are tagged with |
1874 | * @tag. We update @index to index the next page for the traversal. | 1888 | * @tag. We update @index to index the next page for the traversal. |
1889 | * | ||
1890 | * Return: the number of pages which were found. | ||
1875 | */ | 1891 | */ |
1876 | unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, | 1892 | unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, |
1877 | pgoff_t end, xa_mark_t tag, unsigned int nr_pages, | 1893 | pgoff_t end, xa_mark_t tag, unsigned int nr_pages, |
@@ -1911,7 +1927,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, | |||
1911 | 1927 | ||
1912 | pages[ret] = page; | 1928 | pages[ret] = page; |
1913 | if (++ret == nr_pages) { | 1929 | if (++ret == nr_pages) { |
1914 | *index = page->index + 1; | 1930 | *index = xas.xa_index + 1; |
1915 | goto out; | 1931 | goto out; |
1916 | } | 1932 | } |
1917 | continue; | 1933 | continue; |
@@ -1949,6 +1965,8 @@ EXPORT_SYMBOL(find_get_pages_range_tag); | |||
1949 | * | 1965 | * |
1950 | * Like find_get_entries, except we only return entries which are tagged with | 1966 | * Like find_get_entries, except we only return entries which are tagged with |
1951 | * @tag. | 1967 | * @tag. |
1968 | * | ||
1969 | * Return: the number of entries which were found. | ||
1952 | */ | 1970 | */ |
1953 | unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, | 1971 | unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, |
1954 | xa_mark_t tag, unsigned int nr_entries, | 1972 | xa_mark_t tag, unsigned int nr_entries, |
@@ -2034,6 +2052,10 @@ static void shrink_readahead_size_eio(struct file *filp, | |||
2034 | * | 2052 | * |
2035 | * This is really ugly. But the goto's actually try to clarify some | 2053 | * This is really ugly. But the goto's actually try to clarify some |
2036 | * of the logic when it comes to error handling etc. | 2054 | * of the logic when it comes to error handling etc. |
2055 | * | ||
2056 | * Return: | ||
2057 | * * total number of bytes copied, including those that were already @written | ||
2058 | * * negative error code if nothing was copied | ||
2037 | */ | 2059 | */ |
2038 | static ssize_t generic_file_buffered_read(struct kiocb *iocb, | 2060 | static ssize_t generic_file_buffered_read(struct kiocb *iocb, |
2039 | struct iov_iter *iter, ssize_t written) | 2061 | struct iov_iter *iter, ssize_t written) |
@@ -2295,6 +2317,9 @@ out: | |||
2295 | * | 2317 | * |
2296 | * This is the "read_iter()" routine for all filesystems | 2318 | * This is the "read_iter()" routine for all filesystems |
2297 | * that can use the page cache directly. | 2319 | * that can use the page cache directly. |
2320 | * Return: | ||
2321 | * * number of bytes copied, even for partial reads | ||
2322 | * * negative error code if nothing was read | ||
2298 | */ | 2323 | */ |
2299 | ssize_t | 2324 | ssize_t |
2300 | generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) | 2325 | generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) |
@@ -2362,6 +2387,8 @@ EXPORT_SYMBOL(generic_file_read_iter); | |||
2362 | * | 2387 | * |
2363 | * This adds the requested page to the page cache if it isn't already there, | 2388 | * This adds the requested page to the page cache if it isn't already there, |
2364 | * and schedules an I/O to read in its contents from disk. | 2389 | * and schedules an I/O to read in its contents from disk. |
2390 | * | ||
2391 | * Return: %0 on success, negative error code otherwise. | ||
2365 | */ | 2392 | */ |
2366 | static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) | 2393 | static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) |
2367 | { | 2394 | { |
@@ -2476,6 +2503,8 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, | |||
2476 | * has not been released. | 2503 | * has not been released. |
2477 | * | 2504 | * |
2478 | * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. | 2505 | * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. |
2506 | * | ||
2507 | * Return: bitwise-OR of %VM_FAULT_ codes. | ||
2479 | */ | 2508 | */ |
2480 | vm_fault_t filemap_fault(struct vm_fault *vmf) | 2509 | vm_fault_t filemap_fault(struct vm_fault *vmf) |
2481 | { | 2510 | { |
@@ -2861,6 +2890,8 @@ out: | |||
2861 | * not set, try to fill the page and wait for it to become unlocked. | 2890 | * not set, try to fill the page and wait for it to become unlocked. |
2862 | * | 2891 | * |
2863 | * If the page does not get brought uptodate, return -EIO. | 2892 | * If the page does not get brought uptodate, return -EIO. |
2893 | * | ||
2894 | * Return: up to date page on success, ERR_PTR() on failure. | ||
2864 | */ | 2895 | */ |
2865 | struct page *read_cache_page(struct address_space *mapping, | 2896 | struct page *read_cache_page(struct address_space *mapping, |
2866 | pgoff_t index, | 2897 | pgoff_t index, |
@@ -2881,6 +2912,8 @@ EXPORT_SYMBOL(read_cache_page); | |||
2881 | * any new page allocations done using the specified allocation flags. | 2912 | * any new page allocations done using the specified allocation flags. |
2882 | * | 2913 | * |
2883 | * If the page does not get brought uptodate, return -EIO. | 2914 | * If the page does not get brought uptodate, return -EIO. |
2915 | * | ||
2916 | * Return: up to date page on success, ERR_PTR() on failure. | ||
2884 | */ | 2917 | */ |
2885 | struct page *read_cache_page_gfp(struct address_space *mapping, | 2918 | struct page *read_cache_page_gfp(struct address_space *mapping, |
2886 | pgoff_t index, | 2919 | pgoff_t index, |
@@ -3081,7 +3114,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) | |||
3081 | if (iocb->ki_flags & IOCB_NOWAIT) { | 3114 | if (iocb->ki_flags & IOCB_NOWAIT) { |
3082 | /* If there are pages to writeback, return */ | 3115 | /* If there are pages to writeback, return */ |
3083 | if (filemap_range_has_page(inode->i_mapping, pos, | 3116 | if (filemap_range_has_page(inode->i_mapping, pos, |
3084 | pos + write_len)) | 3117 | pos + write_len - 1)) |
3085 | return -EAGAIN; | 3118 | return -EAGAIN; |
3086 | } else { | 3119 | } else { |
3087 | written = filemap_write_and_wait_range(mapping, pos, | 3120 | written = filemap_write_and_wait_range(mapping, pos, |
@@ -3264,6 +3297,10 @@ EXPORT_SYMBOL(generic_perform_write); | |||
3264 | * This function does *not* take care of syncing data in case of O_SYNC write. | 3297 | * This function does *not* take care of syncing data in case of O_SYNC write. |
3265 | * A caller has to handle it. This is mainly due to the fact that we want to | 3298 | * A caller has to handle it. This is mainly due to the fact that we want to |
3266 | * avoid syncing under i_mutex. | 3299 | * avoid syncing under i_mutex. |
3300 | * | ||
3301 | * Return: | ||
3302 | * * number of bytes written, even for truncated writes | ||
3303 | * * negative error code if no data has been written at all | ||
3267 | */ | 3304 | */ |
3268 | ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | 3305 | ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) |
3269 | { | 3306 | { |
@@ -3348,6 +3385,10 @@ EXPORT_SYMBOL(__generic_file_write_iter); | |||
3348 | * This is a wrapper around __generic_file_write_iter() to be used by most | 3385 | * This is a wrapper around __generic_file_write_iter() to be used by most |
3349 | * filesystems. It takes care of syncing the file in case of O_SYNC file | 3386 | * filesystems. It takes care of syncing the file in case of O_SYNC file |
3350 | * and acquires i_mutex as needed. | 3387 | * and acquires i_mutex as needed. |
3388 | * Return: | ||
3389 | * * negative error code if no data has been written at all or | ||
3390 | * vfs_fsync_range() failed for a synchronous write | ||
3391 | * * number of bytes written, even for truncated writes | ||
3351 | */ | 3392 | */ |
3352 | ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | 3393 | ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) |
3353 | { | 3394 | { |
@@ -3374,8 +3415,7 @@ EXPORT_SYMBOL(generic_file_write_iter); | |||
3374 | * @gfp_mask: memory allocation flags (and I/O mode) | 3415 | * @gfp_mask: memory allocation flags (and I/O mode) |
3375 | * | 3416 | * |
3376 | * The address_space is to try to release any data against the page | 3417 | * The address_space is to try to release any data against the page |
3377 | * (presumably at page->private). If the release was successful, return '1'. | 3418 | * (presumably at page->private). |
3378 | * Otherwise return zero. | ||
3379 | * | 3419 | * |
3380 | * This may also be called if PG_fscache is set on a page, indicating that the | 3420 | * This may also be called if PG_fscache is set on a page, indicating that the |
3381 | * page is known to the local caching routines. | 3421 | * page is known to the local caching routines. |
@@ -3383,6 +3423,7 @@ EXPORT_SYMBOL(generic_file_write_iter); | |||
3383 | * The @gfp_mask argument specifies whether I/O may be performed to release | 3423 | * The @gfp_mask argument specifies whether I/O may be performed to release |
3384 | * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). | 3424 | * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS). |
3385 | * | 3425 | * |
3426 | * Return: %1 if the release was successful, otherwise zero. | ||
3386 | */ | 3427 | */ |
3387 | int try_to_release_page(struct page *page, gfp_t gfp_mask) | 3428 | int try_to_release_page(struct page *page, gfp_t gfp_mask) |
3388 | { | 3429 | { |
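
The filemap.c hunks above mostly convert free-form return-value prose into the structured kernel-doc "Return:" section, which scripts/kernel-doc can extract into the generated documentation. A minimal illustration of the target format, using a made-up function rather than anything from this patch:

/**
 * sample_get_entry - fetch a cached entry (illustrative only)
 * @idx: index of the entry to fetch
 *
 * Return: the entry with an elevated refcount on success, %NULL if
 * nothing is cached at @idx.
 */
struct page *sample_get_entry(unsigned long idx);
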
@@ -13,6 +13,9 @@ | |||
13 | #include <linux/sched/signal.h> | 13 | #include <linux/sched/signal.h> |
14 | #include <linux/rwsem.h> | 14 | #include <linux/rwsem.h> |
15 | #include <linux/hugetlb.h> | 15 | #include <linux/hugetlb.h> |
16 | #include <linux/migrate.h> | ||
17 | #include <linux/mm_inline.h> | ||
18 | #include <linux/sched/mm.h> | ||
16 | 19 | ||
17 | #include <asm/mmu_context.h> | 20 | #include <asm/mmu_context.h> |
18 | #include <asm/pgtable.h> | 21 | #include <asm/pgtable.h> |
@@ -1126,7 +1129,167 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, | |||
1126 | } | 1129 | } |
1127 | EXPORT_SYMBOL(get_user_pages); | 1130 | EXPORT_SYMBOL(get_user_pages); |
1128 | 1131 | ||
1132 | #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) | ||
1133 | |||
1129 | #ifdef CONFIG_FS_DAX | 1134 | #ifdef CONFIG_FS_DAX |
1135 | static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) | ||
1136 | { | ||
1137 | long i; | ||
1138 | struct vm_area_struct *vma_prev = NULL; | ||
1139 | |||
1140 | for (i = 0; i < nr_pages; i++) { | ||
1141 | struct vm_area_struct *vma = vmas[i]; | ||
1142 | |||
1143 | if (vma == vma_prev) | ||
1144 | continue; | ||
1145 | |||
1146 | vma_prev = vma; | ||
1147 | |||
1148 | if (vma_is_fsdax(vma)) | ||
1149 | return true; | ||
1150 | } | ||
1151 | return false; | ||
1152 | } | ||
1153 | #else | ||
1154 | static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) | ||
1155 | { | ||
1156 | return false; | ||
1157 | } | ||
1158 | #endif | ||
1159 | |||
1160 | #ifdef CONFIG_CMA | ||
1161 | static struct page *new_non_cma_page(struct page *page, unsigned long private) | ||
1162 | { | ||
1163 | /* | ||
1164 | * We want to make sure we allocate the new page from the same node | ||
1165 | * as the source page. | ||
1166 | */ | ||
1167 | int nid = page_to_nid(page); | ||
1168 | /* | ||
1169 | * Trying to allocate a page for migration. Ignore allocation | ||
1170 | * failure warnings. We don't force __GFP_THISNODE here because | ||
1171 | * this node is where the CMA reservation lives, and in some | ||
1172 | * cases such nodes have very little non-movable memory | ||
1173 | * available for allocation. | ||
1174 | */ | ||
1175 | gfp_t gfp_mask = GFP_USER | __GFP_NOWARN; | ||
1176 | |||
1177 | if (PageHighMem(page)) | ||
1178 | gfp_mask |= __GFP_HIGHMEM; | ||
1179 | |||
1180 | #ifdef CONFIG_HUGETLB_PAGE | ||
1181 | if (PageHuge(page)) { | ||
1182 | struct hstate *h = page_hstate(page); | ||
1183 | /* | ||
1184 | * We don't want to dequeue from the pool because pool pages will | ||
1185 | * mostly be from the CMA region. | ||
1186 | */ | ||
1187 | return alloc_migrate_huge_page(h, gfp_mask, nid, NULL); | ||
1188 | } | ||
1189 | #endif | ||
1190 | if (PageTransHuge(page)) { | ||
1191 | struct page *thp; | ||
1192 | /* | ||
1193 | * ignore allocation failure warnings | ||
1194 | */ | ||
1195 | gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN; | ||
1196 | |||
1197 | /* | ||
1198 | * Remove the movable mask so that we don't allocate from | ||
1199 | * the CMA area again. | ||
1200 | */ | ||
1201 | thp_gfpmask &= ~__GFP_MOVABLE; | ||
1202 | thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER); | ||
1203 | if (!thp) | ||
1204 | return NULL; | ||
1205 | prep_transhuge_page(thp); | ||
1206 | return thp; | ||
1207 | } | ||
1208 | |||
1209 | return __alloc_pages_node(nid, gfp_mask, 0); | ||
1210 | } | ||
1211 | |||
1212 | static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, | ||
1213 | unsigned int gup_flags, | ||
1214 | struct page **pages, | ||
1215 | struct vm_area_struct **vmas) | ||
1216 | { | ||
1217 | long i; | ||
1218 | bool drain_allow = true; | ||
1219 | bool migrate_allow = true; | ||
1220 | LIST_HEAD(cma_page_list); | ||
1221 | |||
1222 | check_again: | ||
1223 | for (i = 0; i < nr_pages; i++) { | ||
1224 | /* | ||
1225 | * If we get a page from the CMA zone, since we are going to | ||
1226 | * be pinning these entries, we might as well move them out | ||
1227 | * of the CMA zone if possible. | ||
1228 | */ | ||
1229 | if (is_migrate_cma_page(pages[i])) { | ||
1230 | |||
1231 | struct page *head = compound_head(pages[i]); | ||
1232 | |||
1233 | if (PageHuge(head)) { | ||
1234 | isolate_huge_page(head, &cma_page_list); | ||
1235 | } else { | ||
1236 | if (!PageLRU(head) && drain_allow) { | ||
1237 | lru_add_drain_all(); | ||
1238 | drain_allow = false; | ||
1239 | } | ||
1240 | |||
1241 | if (!isolate_lru_page(head)) { | ||
1242 | list_add_tail(&head->lru, &cma_page_list); | ||
1243 | mod_node_page_state(page_pgdat(head), | ||
1244 | NR_ISOLATED_ANON + | ||
1245 | page_is_file_cache(head), | ||
1246 | hpage_nr_pages(head)); | ||
1247 | } | ||
1248 | } | ||
1249 | } | ||
1250 | } | ||
1251 | |||
1252 | if (!list_empty(&cma_page_list)) { | ||
1253 | /* | ||
1254 | * drop the above get_user_pages reference. | ||
1255 | */ | ||
1256 | for (i = 0; i < nr_pages; i++) | ||
1257 | put_page(pages[i]); | ||
1258 | |||
1259 | if (migrate_pages(&cma_page_list, new_non_cma_page, | ||
1260 | NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) { | ||
1261 | /* | ||
1262 | * some of the pages failed migration. Do get_user_pages | ||
1263 | * without migration. | ||
1264 | */ | ||
1265 | migrate_allow = false; | ||
1266 | |||
1267 | if (!list_empty(&cma_page_list)) | ||
1268 | putback_movable_pages(&cma_page_list); | ||
1269 | } | ||
1270 | /* | ||
1271 | * We did migrate all the pages. Try to get the page references again, | ||
1272 | * migrating any new CMA pages which we failed to isolate earlier. | ||
1273 | */ | ||
1274 | nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); | ||
1275 | if ((nr_pages > 0) && migrate_allow) { | ||
1276 | drain_allow = true; | ||
1277 | goto check_again; | ||
1278 | } | ||
1279 | } | ||
1280 | |||
1281 | return nr_pages; | ||
1282 | } | ||
1283 | #else | ||
1284 | static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, | ||
1285 | unsigned int gup_flags, | ||
1286 | struct page **pages, | ||
1287 | struct vm_area_struct **vmas) | ||
1288 | { | ||
1289 | return nr_pages; | ||
1290 | } | ||
1291 | #endif | ||
1292 | |||
1130 | /* | 1293 | /* |
1131 | * This is the same as get_user_pages() in that it assumes we are | 1294 | * This is the same as get_user_pages() in that it assumes we are |
1132 | * operating on the current task's mm, but it goes further to validate | 1295 | * operating on the current task's mm, but it goes further to validate |
@@ -1140,11 +1303,11 @@ EXPORT_SYMBOL(get_user_pages); | |||
1140 | * Contrast this to iov_iter_get_pages() usages which are transient. | 1303 | * Contrast this to iov_iter_get_pages() usages which are transient. |
1141 | */ | 1304 | */ |
1142 | long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, | 1305 | long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, |
1143 | unsigned int gup_flags, struct page **pages, | 1306 | unsigned int gup_flags, struct page **pages, |
1144 | struct vm_area_struct **vmas_arg) | 1307 | struct vm_area_struct **vmas_arg) |
1145 | { | 1308 | { |
1146 | struct vm_area_struct **vmas = vmas_arg; | 1309 | struct vm_area_struct **vmas = vmas_arg; |
1147 | struct vm_area_struct *vma_prev = NULL; | 1310 | unsigned long flags; |
1148 | long rc, i; | 1311 | long rc, i; |
1149 | 1312 | ||
1150 | if (!pages) | 1313 | if (!pages) |
@@ -1157,31 +1320,20 @@ long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, | |||
1157 | return -ENOMEM; | 1320 | return -ENOMEM; |
1158 | } | 1321 | } |
1159 | 1322 | ||
1323 | flags = memalloc_nocma_save(); | ||
1160 | rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); | 1324 | rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); |
1325 | memalloc_nocma_restore(flags); | ||
1326 | if (rc < 0) | ||
1327 | goto out; | ||
1161 | 1328 | ||
1162 | for (i = 0; i < rc; i++) { | 1329 | if (check_dax_vmas(vmas, rc)) { |
1163 | struct vm_area_struct *vma = vmas[i]; | 1330 | for (i = 0; i < rc; i++) |
1164 | 1331 | put_page(pages[i]); | |
1165 | if (vma == vma_prev) | 1332 | rc = -EOPNOTSUPP; |
1166 | continue; | ||
1167 | |||
1168 | vma_prev = vma; | ||
1169 | |||
1170 | if (vma_is_fsdax(vma)) | ||
1171 | break; | ||
1172 | } | ||
1173 | |||
1174 | /* | ||
1175 | * Either get_user_pages() failed, or the vma validation | ||
1176 | * succeeded, in either case we don't need to put_page() before | ||
1177 | * returning. | ||
1178 | */ | ||
1179 | if (i >= rc) | ||
1180 | goto out; | 1333 | goto out; |
1334 | } | ||
1181 | 1335 | ||
1182 | for (i = 0; i < rc; i++) | 1336 | rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas); |
1183 | put_page(pages[i]); | ||
1184 | rc = -EOPNOTSUPP; | ||
1185 | out: | 1337 | out: |
1186 | if (vmas != vmas_arg) | 1338 | if (vmas != vmas_arg) |
1187 | kfree(vmas); | 1339 | kfree(vmas); |
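
The gup.c hunks above teach get_user_pages_longterm() to keep long-term pins out of CMA regions: allocations made while the pages are being faulted in are steered away from CMA via memalloc_nocma_save()/memalloc_nocma_restore(), and any CMA pages that were still picked up are isolated and migrated before the pin is returned. Below is a rough userspace sketch of the save/restore idiom only; the names are invented and nothing here is kernel API:

#include <stdio.h>

static _Thread_local unsigned int alloc_flags;

static unsigned int nocma_save(void)
{
	unsigned int old = alloc_flags;

	alloc_flags |= 1u;	/* "avoid CMA" bit for this thread */
	return old;
}

static void nocma_restore(unsigned int old)
{
	alloc_flags = old;	/* restore, so nested scopes compose */
}

static void pin_pages(void)
{
	printf("allocating with flags %#x\n", alloc_flags);
}

int main(void)
{
	unsigned int flags = nocma_save();

	pin_pages();		/* allocations in this scope avoid CMA */
	nocma_restore(flags);
	return 0;
}

The save/restore shape, rather than a plain set/clear, is what lets the scope nest safely inside callers that may already have set the flag.
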
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 5b42d3d4b60a..6c0279e70cc4 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c | |||
@@ -122,12 +122,8 @@ static const struct file_operations gup_benchmark_fops = { | |||
122 | 122 | ||
123 | static int gup_benchmark_init(void) | 123 | static int gup_benchmark_init(void) |
124 | { | 124 | { |
125 | void *ret; | 125 | debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL, |
126 | 126 | &gup_benchmark_fops); | |
127 | ret = debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL, | ||
128 | &gup_benchmark_fops); | ||
129 | if (!ret) | ||
130 | pr_warn("Failed to create gup_benchmark in debugfs"); | ||
131 | 127 | ||
132 | return 0; | 128 | return 0; |
133 | } | 129 | } |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index faf357eaf0ce..404acdcd0455 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/page_idle.h> | 33 | #include <linux/page_idle.h> |
34 | #include <linux/shmem_fs.h> | 34 | #include <linux/shmem_fs.h> |
35 | #include <linux/oom.h> | 35 | #include <linux/oom.h> |
36 | #include <linux/numa.h> | ||
36 | 37 | ||
37 | #include <asm/tlb.h> | 38 | #include <asm/tlb.h> |
38 | #include <asm/pgalloc.h> | 39 | #include <asm/pgalloc.h> |
@@ -616,6 +617,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, | |||
616 | mm_inc_nr_ptes(vma->vm_mm); | 617 | mm_inc_nr_ptes(vma->vm_mm); |
617 | spin_unlock(vmf->ptl); | 618 | spin_unlock(vmf->ptl); |
618 | count_vm_event(THP_FAULT_ALLOC); | 619 | count_vm_event(THP_FAULT_ALLOC); |
620 | count_memcg_events(memcg, THP_FAULT_ALLOC, 1); | ||
619 | } | 621 | } |
620 | 622 | ||
621 | return 0; | 623 | return 0; |
@@ -1337,6 +1339,7 @@ alloc: | |||
1337 | } | 1339 | } |
1338 | 1340 | ||
1339 | count_vm_event(THP_FAULT_ALLOC); | 1341 | count_vm_event(THP_FAULT_ALLOC); |
1342 | count_memcg_events(memcg, THP_FAULT_ALLOC, 1); | ||
1340 | 1343 | ||
1341 | if (!page) | 1344 | if (!page) |
1342 | clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); | 1345 | clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR); |
@@ -1475,7 +1478,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) | |||
1475 | struct anon_vma *anon_vma = NULL; | 1478 | struct anon_vma *anon_vma = NULL; |
1476 | struct page *page; | 1479 | struct page *page; |
1477 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; | 1480 | unsigned long haddr = vmf->address & HPAGE_PMD_MASK; |
1478 | int page_nid = -1, this_nid = numa_node_id(); | 1481 | int page_nid = NUMA_NO_NODE, this_nid = numa_node_id(); |
1479 | int target_nid, last_cpupid = -1; | 1482 | int target_nid, last_cpupid = -1; |
1480 | bool page_locked; | 1483 | bool page_locked; |
1481 | bool migrated = false; | 1484 | bool migrated = false; |
@@ -1520,7 +1523,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) | |||
1520 | */ | 1523 | */ |
1521 | page_locked = trylock_page(page); | 1524 | page_locked = trylock_page(page); |
1522 | target_nid = mpol_misplaced(page, vma, haddr); | 1525 | target_nid = mpol_misplaced(page, vma, haddr); |
1523 | if (target_nid == -1) { | 1526 | if (target_nid == NUMA_NO_NODE) { |
1524 | /* If the page was locked, there are no parallel migrations */ | 1527 | /* If the page was locked, there are no parallel migrations */ |
1525 | if (page_locked) | 1528 | if (page_locked) |
1526 | goto clear_pmdnuma; | 1529 | goto clear_pmdnuma; |
@@ -1528,7 +1531,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) | |||
1528 | 1531 | ||
1529 | /* Migration could have started since the pmd_trans_migrating check */ | 1532 | /* Migration could have started since the pmd_trans_migrating check */ |
1530 | if (!page_locked) { | 1533 | if (!page_locked) { |
1531 | page_nid = -1; | 1534 | page_nid = NUMA_NO_NODE; |
1532 | if (!get_page_unless_zero(page)) | 1535 | if (!get_page_unless_zero(page)) |
1533 | goto out_unlock; | 1536 | goto out_unlock; |
1534 | spin_unlock(vmf->ptl); | 1537 | spin_unlock(vmf->ptl); |
@@ -1549,14 +1552,14 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) | |||
1549 | if (unlikely(!pmd_same(pmd, *vmf->pmd))) { | 1552 | if (unlikely(!pmd_same(pmd, *vmf->pmd))) { |
1550 | unlock_page(page); | 1553 | unlock_page(page); |
1551 | put_page(page); | 1554 | put_page(page); |
1552 | page_nid = -1; | 1555 | page_nid = NUMA_NO_NODE; |
1553 | goto out_unlock; | 1556 | goto out_unlock; |
1554 | } | 1557 | } |
1555 | 1558 | ||
1556 | /* Bail if we fail to protect against THP splits for any reason */ | 1559 | /* Bail if we fail to protect against THP splits for any reason */ |
1557 | if (unlikely(!anon_vma)) { | 1560 | if (unlikely(!anon_vma)) { |
1558 | put_page(page); | 1561 | put_page(page); |
1559 | page_nid = -1; | 1562 | page_nid = NUMA_NO_NODE; |
1560 | goto clear_pmdnuma; | 1563 | goto clear_pmdnuma; |
1561 | } | 1564 | } |
1562 | 1565 | ||
@@ -1618,7 +1621,7 @@ out: | |||
1618 | if (anon_vma) | 1621 | if (anon_vma) |
1619 | page_unlock_anon_vma_read(anon_vma); | 1622 | page_unlock_anon_vma_read(anon_vma); |
1620 | 1623 | ||
1621 | if (page_nid != -1) | 1624 | if (page_nid != NUMA_NO_NODE) |
1622 | task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, | 1625 | task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, |
1623 | flags); | 1626 | flags); |
1624 | 1627 | ||
@@ -1979,7 +1982,6 @@ spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) | |||
1979 | int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1982 | int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, |
1980 | pud_t *pud, unsigned long addr) | 1983 | pud_t *pud, unsigned long addr) |
1981 | { | 1984 | { |
1982 | pud_t orig_pud; | ||
1983 | spinlock_t *ptl; | 1985 | spinlock_t *ptl; |
1984 | 1986 | ||
1985 | ptl = __pud_trans_huge_lock(pud, vma); | 1987 | ptl = __pud_trans_huge_lock(pud, vma); |
@@ -1991,8 +1993,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1991 | * pgtable_trans_huge_withdraw after finishing pudp related | 1993 | * pgtable_trans_huge_withdraw after finishing pudp related |
1992 | * operations. | 1994 | * operations. |
1993 | */ | 1995 | */ |
1994 | orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud, | 1996 | pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); |
1995 | tlb->fullmm); | ||
1996 | tlb_remove_pud_tlb_entry(tlb, pud, addr); | 1997 | tlb_remove_pud_tlb_entry(tlb, pud, addr); |
1997 | if (vma_is_dax(vma)) { | 1998 | if (vma_is_dax(vma)) { |
1998 | spin_unlock(ptl); | 1999 | spin_unlock(ptl); |
@@ -2437,11 +2438,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, | |||
2437 | pgoff_t end, unsigned long flags) | 2438 | pgoff_t end, unsigned long flags) |
2438 | { | 2439 | { |
2439 | struct page *head = compound_head(page); | 2440 | struct page *head = compound_head(page); |
2440 | struct zone *zone = page_zone(head); | 2441 | pg_data_t *pgdat = page_pgdat(head); |
2441 | struct lruvec *lruvec; | 2442 | struct lruvec *lruvec; |
2442 | int i; | 2443 | int i; |
2443 | 2444 | ||
2444 | lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat); | 2445 | lruvec = mem_cgroup_page_lruvec(head, pgdat); |
2445 | 2446 | ||
2446 | /* complete memcg works before add pages to LRU */ | 2447 | /* complete memcg works before add pages to LRU */ |
2447 | mem_cgroup_split_huge_fixup(head); | 2448 | mem_cgroup_split_huge_fixup(head); |
@@ -2472,7 +2473,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, | |||
2472 | xa_unlock(&head->mapping->i_pages); | 2473 | xa_unlock(&head->mapping->i_pages); |
2473 | } | 2474 | } |
2474 | 2475 | ||
2475 | spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); | 2476 | spin_unlock_irqrestore(&pgdat->lru_lock, flags); |
2476 | 2477 | ||
2477 | remap_page(head); | 2478 | remap_page(head); |
2478 | 2479 | ||
@@ -2683,7 +2684,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |||
2683 | lru_add_drain(); | 2684 | lru_add_drain(); |
2684 | 2685 | ||
2685 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | 2686 | /* prevent PageLRU to go away from under us, and freeze lru stats */ |
2686 | spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags); | 2687 | spin_lock_irqsave(&pgdata->lru_lock, flags); |
2687 | 2688 | ||
2688 | if (mapping) { | 2689 | if (mapping) { |
2689 | XA_STATE(xas, &mapping->i_pages, page_index(head)); | 2690 | XA_STATE(xas, &mapping->i_pages, page_index(head)); |
@@ -2728,7 +2729,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |||
2728 | spin_unlock(&pgdata->split_queue_lock); | 2729 | spin_unlock(&pgdata->split_queue_lock); |
2729 | fail: if (mapping) | 2730 | fail: if (mapping) |
2730 | xa_unlock(&mapping->i_pages); | 2731 | xa_unlock(&mapping->i_pages); |
2731 | spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); | 2732 | spin_unlock_irqrestore(&pgdata->lru_lock, flags); |
2732 | remap_page(head); | 2733 | remap_page(head); |
2733 | ret = -EBUSY; | 2734 | ret = -EBUSY; |
2734 | } | 2735 | } |
@@ -2886,12 +2887,8 @@ DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, | |||
2886 | 2887 | ||
2887 | static int __init split_huge_pages_debugfs(void) | 2888 | static int __init split_huge_pages_debugfs(void) |
2888 | { | 2889 | { |
2889 | void *ret; | 2890 | debugfs_create_file("split_huge_pages", 0200, NULL, NULL, |
2890 | 2891 | &split_huge_pages_fops); | |
2891 | ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL, | ||
2892 | &split_huge_pages_fops); | ||
2893 | if (!ret) | ||
2894 | pr_warn("Failed to create split_huge_pages in debugfs"); | ||
2895 | return 0; | 2892 | return 0; |
2896 | } | 2893 | } |
2897 | late_initcall(split_huge_pages_debugfs); | 2894 | late_initcall(split_huge_pages_debugfs); |
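
Several conversions in huge_memory.c (and in hugetlb.c and ksm.c below) replace a bare -1 node id with NUMA_NO_NODE. The sentinel has the same value, since include/linux/numa.h defines it as (-1), so behaviour is unchanged; the gain is that intent is spelled out at every use. A trivial standalone illustration with an invented helper:

#include <stdio.h>

#define NUMA_NO_NODE	(-1)

static int preferred_node(int requested)
{
	/* fall back to "no preference" instead of a magic literal */
	return requested >= 0 ? requested : NUMA_NO_NODE;
}

int main(void)
{
	printf("%d %d\n", preferred_node(2), preferred_node(-5));
	return 0;
}
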
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8dfdffc34a99..97b1e0290c66 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/swap.h> | 25 | #include <linux/swap.h> |
26 | #include <linux/swapops.h> | 26 | #include <linux/swapops.h> |
27 | #include <linux/jhash.h> | 27 | #include <linux/jhash.h> |
28 | #include <linux/numa.h> | ||
28 | 29 | ||
29 | #include <asm/page.h> | 30 | #include <asm/page.h> |
30 | #include <asm/pgtable.h> | 31 | #include <asm/pgtable.h> |
@@ -887,7 +888,7 @@ static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, | |||
887 | struct zonelist *zonelist; | 888 | struct zonelist *zonelist; |
888 | struct zone *zone; | 889 | struct zone *zone; |
889 | struct zoneref *z; | 890 | struct zoneref *z; |
890 | int node = -1; | 891 | int node = NUMA_NO_NODE; |
891 | 892 | ||
892 | zonelist = node_zonelist(nid, gfp_mask); | 893 | zonelist = node_zonelist(nid, gfp_mask); |
893 | 894 | ||
@@ -919,7 +920,7 @@ retry_cpuset: | |||
919 | /* Movability of hugepages depends on migration support. */ | 920 | /* Movability of hugepages depends on migration support. */ |
920 | static inline gfp_t htlb_alloc_mask(struct hstate *h) | 921 | static inline gfp_t htlb_alloc_mask(struct hstate *h) |
921 | { | 922 | { |
922 | if (hugepage_migration_supported(h)) | 923 | if (hugepage_movable_supported(h)) |
923 | return GFP_HIGHUSER_MOVABLE; | 924 | return GFP_HIGHUSER_MOVABLE; |
924 | else | 925 | else |
925 | return GFP_HIGHUSER; | 926 | return GFP_HIGHUSER; |
@@ -1586,8 +1587,8 @@ out_unlock: | |||
1586 | return page; | 1587 | return page; |
1587 | } | 1588 | } |
1588 | 1589 | ||
1589 | static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, | 1590 | struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, |
1590 | int nid, nodemask_t *nmask) | 1591 | int nid, nodemask_t *nmask) |
1591 | { | 1592 | { |
1592 | struct page *page; | 1593 | struct page *page; |
1593 | 1594 | ||
@@ -4398,10 +4399,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
4398 | continue; | 4399 | continue; |
4399 | } | 4400 | } |
4400 | if (!huge_pte_none(pte)) { | 4401 | if (!huge_pte_none(pte)) { |
4401 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 4402 | pte_t old_pte; |
4402 | pte = pte_mkhuge(huge_pte_modify(pte, newprot)); | 4403 | |
4404 | old_pte = huge_ptep_modify_prot_start(vma, address, ptep); | ||
4405 | pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); | ||
4403 | pte = arch_make_huge_pte(pte, vma, NULL, 0); | 4406 | pte = arch_make_huge_pte(pte, vma, NULL, 0); |
4404 | set_huge_pte_at(mm, address, ptep, pte); | 4407 | huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); |
4405 | pages++; | 4408 | pages++; |
4406 | } | 4409 | } |
4407 | spin_unlock(ptl); | 4410 | spin_unlock(ptl); |
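
The hugetlb_change_protection() hunk above replaces the open-coded huge_ptep_get_and_clear()/set_huge_pte_at() pair with huge_ptep_modify_prot_start() and huge_ptep_modify_prot_commit(), so architectures can provide an optimized implementation of the whole update. A rough userspace sketch of the start/modify/commit shape; the names and the atomic slot below are illustrative, not kernel code:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long pte_slot = 0x42;

static unsigned long modify_prot_start(void)
{
	/* fetch the old entry and leave a cleared slot behind */
	return atomic_exchange(&pte_slot, 0);
}

static void modify_prot_commit(unsigned long old, unsigned long newpte)
{
	(void)old;	/* an architecture hook may want the old value too */
	atomic_store(&pte_slot, newpte);
}

int main(void)
{
	unsigned long old = modify_prot_start();
	unsigned long newpte = old | 0x4;	/* derive the new protection */

	modify_prot_commit(old, newpte);
	printf("%#lx\n", (unsigned long)atomic_load(&pte_slot));
	return 0;
}
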
diff --git a/mm/internal.h b/mm/internal.h index f4a7bb02decf..9eeaf2b95166 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -163,6 +163,7 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, | |||
163 | extern int __isolate_free_page(struct page *page, unsigned int order); | 163 | extern int __isolate_free_page(struct page *page, unsigned int order); |
164 | extern void memblock_free_pages(struct page *page, unsigned long pfn, | 164 | extern void memblock_free_pages(struct page *page, unsigned long pfn, |
165 | unsigned int order); | 165 | unsigned int order); |
166 | extern void __free_pages_core(struct page *page, unsigned int order); | ||
166 | extern void prep_compound_page(struct page *page, unsigned int order); | 167 | extern void prep_compound_page(struct page *page, unsigned int order); |
167 | extern void post_alloc_hook(struct page *page, unsigned int order, | 168 | extern void post_alloc_hook(struct page *page, unsigned int order, |
168 | gfp_t gfp_flags); | 169 | gfp_t gfp_flags); |
@@ -183,14 +184,16 @@ extern int user_min_free_kbytes; | |||
183 | struct compact_control { | 184 | struct compact_control { |
184 | struct list_head freepages; /* List of free pages to migrate to */ | 185 | struct list_head freepages; /* List of free pages to migrate to */ |
185 | struct list_head migratepages; /* List of pages being migrated */ | 186 | struct list_head migratepages; /* List of pages being migrated */ |
187 | unsigned int nr_freepages; /* Number of isolated free pages */ | ||
188 | unsigned int nr_migratepages; /* Number of pages to migrate */ | ||
189 | unsigned long free_pfn; /* isolate_freepages search base */ | ||
190 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | ||
191 | unsigned long fast_start_pfn; /* a pfn to start linear scan from */ | ||
186 | struct zone *zone; | 192 | struct zone *zone; |
187 | unsigned long nr_freepages; /* Number of isolated free pages */ | ||
188 | unsigned long nr_migratepages; /* Number of pages to migrate */ | ||
189 | unsigned long total_migrate_scanned; | 193 | unsigned long total_migrate_scanned; |
190 | unsigned long total_free_scanned; | 194 | unsigned long total_free_scanned; |
191 | unsigned long free_pfn; /* isolate_freepages search base */ | 195 | unsigned short fast_search_fail;/* failures to use free list searches */ |
192 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 196 | short search_order; /* order to start a fast search at */ |
193 | unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ | ||
194 | const gfp_t gfp_mask; /* gfp mask of a direct compactor */ | 197 | const gfp_t gfp_mask; /* gfp mask of a direct compactor */ |
195 | int order; /* order a direct compactor needs */ | 198 | int order; /* order a direct compactor needs */ |
196 | int migratetype; /* migratetype of direct compactor */ | 199 | int migratetype; /* migratetype of direct compactor */ |
@@ -203,7 +206,16 @@ struct compact_control { | |||
203 | bool direct_compaction; /* False from kcompactd or /proc/... */ | 206 | bool direct_compaction; /* False from kcompactd or /proc/... */ |
204 | bool whole_zone; /* Whole zone should/has been scanned */ | 207 | bool whole_zone; /* Whole zone should/has been scanned */ |
205 | bool contended; /* Signal lock or sched contention */ | 208 | bool contended; /* Signal lock or sched contention */ |
206 | bool finishing_block; /* Finishing current pageblock */ | 209 | bool rescan; /* Rescanning the same pageblock */ |
210 | }; | ||
211 | |||
212 | /* | ||
213 | * Used in direct compaction when a page should be taken from the freelists | ||
214 | * immediately when one is created during the free path. | ||
215 | */ | ||
216 | struct capture_control { | ||
217 | struct compact_control *cc; | ||
218 | struct page *page; | ||
207 | }; | 219 | }; |
208 | 220 | ||
209 | unsigned long | 221 | unsigned long |
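
The compact_control changes above narrow the scanner counters from unsigned long to unsigned int, add the fast_start_pfn/fast_search_fail/search_order fields used by the new free-list search, and introduce capture_control so a direct compactor can take a page straight from the free path. One side effect of narrowing and grouping the small fields is tighter struct packing; a standalone illustration with invented field names:

#include <stdio.h>

struct wide   { unsigned long a; unsigned long b; unsigned long c; };
struct narrow { unsigned int a; unsigned int b; unsigned short c; short d; };

int main(void)
{
	/* on LP64 this prints 24 and 12 */
	printf("%zu %zu\n", sizeof(struct wide), sizeof(struct narrow));
	return 0;
}
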
diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 09b534fbba17..80bbe62b16cd 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c | |||
@@ -14,6 +14,8 @@ | |||
14 | * | 14 | * |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #define __KASAN_INTERNAL | ||
18 | |||
17 | #include <linux/export.h> | 19 | #include <linux/export.h> |
18 | #include <linux/interrupt.h> | 20 | #include <linux/interrupt.h> |
19 | #include <linux/init.h> | 21 | #include <linux/init.h> |
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index ccb6207276e3..504c79363a34 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c | |||
@@ -275,25 +275,6 @@ EXPORT_SYMBOL(__asan_storeN_noabort); | |||
275 | void __asan_handle_no_return(void) {} | 275 | void __asan_handle_no_return(void) {} |
276 | EXPORT_SYMBOL(__asan_handle_no_return); | 276 | EXPORT_SYMBOL(__asan_handle_no_return); |
277 | 277 | ||
278 | /* Emitted by compiler to poison large objects when they go out of scope. */ | ||
279 | void __asan_poison_stack_memory(const void *addr, size_t size) | ||
280 | { | ||
281 | /* | ||
282 | * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded | ||
283 | * by redzones, so we simply round up size to simplify logic. | ||
284 | */ | ||
285 | kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE), | ||
286 | KASAN_USE_AFTER_SCOPE); | ||
287 | } | ||
288 | EXPORT_SYMBOL(__asan_poison_stack_memory); | ||
289 | |||
290 | /* Emitted by compiler to unpoison large objects when they go into scope. */ | ||
291 | void __asan_unpoison_stack_memory(const void *addr, size_t size) | ||
292 | { | ||
293 | kasan_unpoison_shadow(addr, size); | ||
294 | } | ||
295 | EXPORT_SYMBOL(__asan_unpoison_stack_memory); | ||
296 | |||
297 | /* Emitted by compiler to poison alloca()ed objects. */ | 278 | /* Emitted by compiler to poison alloca()ed objects. */ |
298 | void __asan_alloca_poison(unsigned long addr, size_t size) | 279 | void __asan_alloca_poison(unsigned long addr, size_t size) |
299 | { | 280 | { |
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c index 5e12035888f2..36c645939bc9 100644 --- a/mm/kasan/generic_report.c +++ b/mm/kasan/generic_report.c | |||
@@ -82,9 +82,6 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info) | |||
82 | case KASAN_KMALLOC_FREE: | 82 | case KASAN_KMALLOC_FREE: |
83 | bug_type = "use-after-free"; | 83 | bug_type = "use-after-free"; |
84 | break; | 84 | break; |
85 | case KASAN_USE_AFTER_SCOPE: | ||
86 | bug_type = "use-after-scope"; | ||
87 | break; | ||
88 | case KASAN_ALLOCA_LEFT: | 85 | case KASAN_ALLOCA_LEFT: |
89 | case KASAN_ALLOCA_RIGHT: | 86 | case KASAN_ALLOCA_RIGHT: |
90 | bug_type = "alloca-out-of-bounds"; | 87 | bug_type = "alloca-out-of-bounds"; |
diff --git a/mm/kasan/init.c b/mm/kasan/init.c index 45a1b5e38e1e..fcaa1ca03175 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c | |||
@@ -42,7 +42,7 @@ static inline bool kasan_p4d_table(pgd_t pgd) | |||
42 | #else | 42 | #else |
43 | static inline bool kasan_p4d_table(pgd_t pgd) | 43 | static inline bool kasan_p4d_table(pgd_t pgd) |
44 | { | 44 | { |
45 | return 0; | 45 | return false; |
46 | } | 46 | } |
47 | #endif | 47 | #endif |
48 | #if CONFIG_PGTABLE_LEVELS > 3 | 48 | #if CONFIG_PGTABLE_LEVELS > 3 |
@@ -54,7 +54,7 @@ static inline bool kasan_pud_table(p4d_t p4d) | |||
54 | #else | 54 | #else |
55 | static inline bool kasan_pud_table(p4d_t p4d) | 55 | static inline bool kasan_pud_table(p4d_t p4d) |
56 | { | 56 | { |
57 | return 0; | 57 | return false; |
58 | } | 58 | } |
59 | #endif | 59 | #endif |
60 | #if CONFIG_PGTABLE_LEVELS > 2 | 60 | #if CONFIG_PGTABLE_LEVELS > 2 |
@@ -66,7 +66,7 @@ static inline bool kasan_pmd_table(pud_t pud) | |||
66 | #else | 66 | #else |
67 | static inline bool kasan_pmd_table(pud_t pud) | 67 | static inline bool kasan_pmd_table(pud_t pud) |
68 | { | 68 | { |
69 | return 0; | 69 | return false; |
70 | } | 70 | } |
71 | #endif | 71 | #endif |
72 | pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss; | 72 | pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss; |
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index ea51b2d898ec..3e0c11f7d7a1 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h | |||
@@ -34,7 +34,6 @@ | |||
34 | #define KASAN_STACK_MID 0xF2 | 34 | #define KASAN_STACK_MID 0xF2 |
35 | #define KASAN_STACK_RIGHT 0xF3 | 35 | #define KASAN_STACK_RIGHT 0xF3 |
36 | #define KASAN_STACK_PARTIAL 0xF4 | 36 | #define KASAN_STACK_PARTIAL 0xF4 |
37 | #define KASAN_USE_AFTER_SCOPE 0xF8 | ||
38 | 37 | ||
39 | /* | 38 | /* |
40 | * alloca redzone shadow values | 39 | * alloca redzone shadow values |
@@ -187,8 +186,6 @@ void __asan_unregister_globals(struct kasan_global *globals, size_t size); | |||
187 | void __asan_loadN(unsigned long addr, size_t size); | 186 | void __asan_loadN(unsigned long addr, size_t size); |
188 | void __asan_storeN(unsigned long addr, size_t size); | 187 | void __asan_storeN(unsigned long addr, size_t size); |
189 | void __asan_handle_no_return(void); | 188 | void __asan_handle_no_return(void); |
190 | void __asan_poison_stack_memory(const void *addr, size_t size); | ||
191 | void __asan_unpoison_stack_memory(const void *addr, size_t size); | ||
192 | void __asan_alloca_poison(unsigned long addr, size_t size); | 189 | void __asan_alloca_poison(unsigned long addr, size_t size); |
193 | void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); | 190 | void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom); |
194 | 191 | ||
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4f017339ddb2..449044378782 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c | |||
@@ -1074,6 +1074,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
1074 | BUG_ON(!pmd_none(*pmd)); | 1074 | BUG_ON(!pmd_none(*pmd)); |
1075 | page_add_new_anon_rmap(new_page, vma, address, true); | 1075 | page_add_new_anon_rmap(new_page, vma, address, true); |
1076 | mem_cgroup_commit_charge(new_page, memcg, false, true); | 1076 | mem_cgroup_commit_charge(new_page, memcg, false, true); |
1077 | count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); | ||
1077 | lru_cache_add_active_or_unevictable(new_page, vma); | 1078 | lru_cache_add_active_or_unevictable(new_page, vma); |
1078 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 1079 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
1079 | set_pmd_at(mm, address, pmd, _pmd); | 1080 | set_pmd_at(mm, address, pmd, _pmd); |
@@ -1502,6 +1503,7 @@ xa_unlocked: | |||
1502 | page_ref_add(new_page, HPAGE_PMD_NR - 1); | 1503 | page_ref_add(new_page, HPAGE_PMD_NR - 1); |
1503 | set_page_dirty(new_page); | 1504 | set_page_dirty(new_page); |
1504 | mem_cgroup_commit_charge(new_page, memcg, false, true); | 1505 | mem_cgroup_commit_charge(new_page, memcg, false, true); |
1506 | count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); | ||
1505 | lru_cache_add_anon(new_page); | 1507 | lru_cache_add_anon(new_page); |
1506 | 1508 | ||
1507 | /* | 1509 | /* |
@@ -598,7 +598,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, | |||
598 | chain->chain_prune_time = jiffies; | 598 | chain->chain_prune_time = jiffies; |
599 | chain->rmap_hlist_len = STABLE_NODE_CHAIN; | 599 | chain->rmap_hlist_len = STABLE_NODE_CHAIN; |
600 | #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) | 600 | #if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA) |
601 | chain->nid = -1; /* debug */ | 601 | chain->nid = NUMA_NO_NODE; /* debug */ |
602 | #endif | 602 | #endif |
603 | ksm_stable_node_chains++; | 603 | ksm_stable_node_chains++; |
604 | 604 | ||
@@ -667,6 +667,12 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) | |||
667 | free_stable_node(stable_node); | 667 | free_stable_node(stable_node); |
668 | } | 668 | } |
669 | 669 | ||
670 | enum get_ksm_page_flags { | ||
671 | GET_KSM_PAGE_NOLOCK, | ||
672 | GET_KSM_PAGE_LOCK, | ||
673 | GET_KSM_PAGE_TRYLOCK | ||
674 | }; | ||
675 | |||
670 | /* | 676 | /* |
671 | * get_ksm_page: checks if the page indicated by the stable node | 677 | * get_ksm_page: checks if the page indicated by the stable node |
672 | * is still its ksm page, despite having held no reference to it. | 678 | * is still its ksm page, despite having held no reference to it. |
@@ -686,7 +692,8 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node) | |||
686 | * a page to put something that might look like our key in page->mapping. | 692 | * a page to put something that might look like our key in page->mapping. |
687 | * is on its way to being freed; but it is an anomaly to bear in mind. | 693 | * is on its way to being freed; but it is an anomaly to bear in mind. |
688 | */ | 694 | */ |
689 | static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) | 695 | static struct page *get_ksm_page(struct stable_node *stable_node, |
696 | enum get_ksm_page_flags flags) | ||
690 | { | 697 | { |
691 | struct page *page; | 698 | struct page *page; |
692 | void *expected_mapping; | 699 | void *expected_mapping; |
@@ -706,8 +713,9 @@ again: | |||
706 | * case this node is no longer referenced, and should be freed; | 713 | * case this node is no longer referenced, and should be freed; |
707 | * however, it might mean that the page is under page_ref_freeze(). | 714 | * however, it might mean that the page is under page_ref_freeze(). |
708 | * The __remove_mapping() case is easy, again the node is now stale; | 715 | * The __remove_mapping() case is easy, again the node is now stale; |
709 | * but if page is swapcache in migrate_page_move_mapping(), it might | 716 | * the same holds for the reuse_ksm_page() case; but if page is swapcache |
710 | * still be our page, in which case it's essential to keep the node. | 717 | * in migrate_page_move_mapping(), it might still be our page, |
718 | * in which case it's essential to keep the node. | ||
711 | */ | 719 | */ |
712 | while (!get_page_unless_zero(page)) { | 720 | while (!get_page_unless_zero(page)) { |
713 | /* | 721 | /* |
@@ -728,8 +736,15 @@ again: | |||
728 | goto stale; | 736 | goto stale; |
729 | } | 737 | } |
730 | 738 | ||
731 | if (lock_it) { | 739 | if (flags == GET_KSM_PAGE_TRYLOCK) { |
740 | if (!trylock_page(page)) { | ||
741 | put_page(page); | ||
742 | return ERR_PTR(-EBUSY); | ||
743 | } | ||
744 | } else if (flags == GET_KSM_PAGE_LOCK) | ||
732 | lock_page(page); | 745 | lock_page(page); |
746 | |||
747 | if (flags != GET_KSM_PAGE_NOLOCK) { | ||
733 | if (READ_ONCE(page->mapping) != expected_mapping) { | 748 | if (READ_ONCE(page->mapping) != expected_mapping) { |
734 | unlock_page(page); | 749 | unlock_page(page); |
735 | put_page(page); | 750 | put_page(page); |
@@ -763,7 +778,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |||
763 | struct page *page; | 778 | struct page *page; |
764 | 779 | ||
765 | stable_node = rmap_item->head; | 780 | stable_node = rmap_item->head; |
766 | page = get_ksm_page(stable_node, true); | 781 | page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); |
767 | if (!page) | 782 | if (!page) |
768 | goto out; | 783 | goto out; |
769 | 784 | ||
@@ -863,7 +878,7 @@ static int remove_stable_node(struct stable_node *stable_node) | |||
863 | struct page *page; | 878 | struct page *page; |
864 | int err; | 879 | int err; |
865 | 880 | ||
866 | page = get_ksm_page(stable_node, true); | 881 | page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); |
867 | if (!page) { | 882 | if (!page) { |
868 | /* | 883 | /* |
869 | * get_ksm_page did remove_node_from_stable_tree itself. | 884 | * get_ksm_page did remove_node_from_stable_tree itself. |
@@ -1385,7 +1400,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup, | |||
1385 | * stable_node parameter itself will be freed from | 1400 | * stable_node parameter itself will be freed from |
1386 | * under us if it returns NULL. | 1401 | * under us if it returns NULL. |
1387 | */ | 1402 | */ |
1388 | _tree_page = get_ksm_page(dup, false); | 1403 | _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK); |
1389 | if (!_tree_page) | 1404 | if (!_tree_page) |
1390 | continue; | 1405 | continue; |
1391 | nr += 1; | 1406 | nr += 1; |
@@ -1508,7 +1523,7 @@ static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, | |||
1508 | if (!is_stable_node_chain(stable_node)) { | 1523 | if (!is_stable_node_chain(stable_node)) { |
1509 | if (is_page_sharing_candidate(stable_node)) { | 1524 | if (is_page_sharing_candidate(stable_node)) { |
1510 | *_stable_node_dup = stable_node; | 1525 | *_stable_node_dup = stable_node; |
1511 | return get_ksm_page(stable_node, false); | 1526 | return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); |
1512 | } | 1527 | } |
1513 | /* | 1528 | /* |
1514 | * _stable_node_dup set to NULL means the stable_node | 1529 | * _stable_node_dup set to NULL means the stable_node |
@@ -1613,7 +1628,8 @@ again: | |||
1613 | * wrprotected at all times. Any will work | 1628 | * wrprotected at all times. Any will work |
1614 | * fine to continue the walk. | 1629 | * fine to continue the walk. |
1615 | */ | 1630 | */ |
1616 | tree_page = get_ksm_page(stable_node_any, false); | 1631 | tree_page = get_ksm_page(stable_node_any, |
1632 | GET_KSM_PAGE_NOLOCK); | ||
1617 | } | 1633 | } |
1618 | VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); | 1634 | VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); |
1619 | if (!tree_page) { | 1635 | if (!tree_page) { |
@@ -1673,7 +1689,12 @@ again: | |||
1673 | * It would be more elegant to return stable_node | 1689 | * It would be more elegant to return stable_node |
1674 | * than kpage, but that involves more changes. | 1690 | * than kpage, but that involves more changes. |
1675 | */ | 1691 | */ |
1676 | tree_page = get_ksm_page(stable_node_dup, true); | 1692 | tree_page = get_ksm_page(stable_node_dup, |
1693 | GET_KSM_PAGE_TRYLOCK); | ||
1694 | |||
1695 | if (PTR_ERR(tree_page) == -EBUSY) | ||
1696 | return ERR_PTR(-EBUSY); | ||
1697 | |||
1677 | if (unlikely(!tree_page)) | 1698 | if (unlikely(!tree_page)) |
1678 | /* | 1699 | /* |
1679 | * The tree may have been rebalanced, | 1700 | * The tree may have been rebalanced, |
@@ -1842,7 +1863,8 @@ again: | |||
1842 | * wrprotected at all times. Any will work | 1863 | * wrprotected at all times. Any will work |
1843 | * fine to continue the walk. | 1864 | * fine to continue the walk. |
1844 | */ | 1865 | */ |
1845 | tree_page = get_ksm_page(stable_node_any, false); | 1866 | tree_page = get_ksm_page(stable_node_any, |
1867 | GET_KSM_PAGE_NOLOCK); | ||
1846 | } | 1868 | } |
1847 | VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); | 1869 | VM_BUG_ON(!stable_node_dup ^ !!stable_node_any); |
1848 | if (!tree_page) { | 1870 | if (!tree_page) { |
@@ -2068,6 +2090,9 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
2068 | remove_rmap_item_from_tree(rmap_item); | 2090 | remove_rmap_item_from_tree(rmap_item); |
2069 | 2091 | ||
2070 | if (kpage) { | 2092 | if (kpage) { |
2093 | if (PTR_ERR(kpage) == -EBUSY) | ||
2094 | return; | ||
2095 | |||
2071 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); | 2096 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); |
2072 | if (!err) { | 2097 | if (!err) { |
2073 | /* | 2098 | /* |
@@ -2242,7 +2267,8 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
2242 | 2267 | ||
2243 | list_for_each_entry_safe(stable_node, next, | 2268 | list_for_each_entry_safe(stable_node, next, |
2244 | &migrate_nodes, list) { | 2269 | &migrate_nodes, list) { |
2245 | page = get_ksm_page(stable_node, false); | 2270 | page = get_ksm_page(stable_node, |
2271 | GET_KSM_PAGE_NOLOCK); | ||
2246 | if (page) | 2272 | if (page) |
2247 | put_page(page); | 2273 | put_page(page); |
2248 | cond_resched(); | 2274 | cond_resched(); |
@@ -2642,6 +2668,31 @@ again: | |||
2642 | goto again; | 2668 | goto again; |
2643 | } | 2669 | } |
2644 | 2670 | ||
2671 | bool reuse_ksm_page(struct page *page, | ||
2672 | struct vm_area_struct *vma, | ||
2673 | unsigned long address) | ||
2674 | { | ||
2675 | #ifdef CONFIG_DEBUG_VM | ||
2676 | if (WARN_ON(is_zero_pfn(page_to_pfn(page))) || | ||
2677 | WARN_ON(!page_mapped(page)) || | ||
2678 | WARN_ON(!PageLocked(page))) { | ||
2679 | dump_page(page, "reuse_ksm_page"); | ||
2680 | return false; | ||
2681 | } | ||
2682 | #endif | ||
2683 | |||
2684 | if (PageSwapCache(page) || !page_stable_node(page)) | ||
2685 | return false; | ||
2686 | /* Prohibit parallel get_ksm_page() */ | ||
2687 | if (!page_ref_freeze(page, 1)) | ||
2688 | return false; | ||
2689 | |||
2690 | page_move_anon_rmap(page, vma); | ||
2691 | page->index = linear_page_index(vma, address); | ||
2692 | page_ref_unfreeze(page, 1); | ||
2693 | |||
2694 | return true; | ||
2695 | } | ||
2645 | #ifdef CONFIG_MIGRATION | 2696 | #ifdef CONFIG_MIGRATION |
2646 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) | 2697 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) |
2647 | { | 2698 | { |
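
The ksm.c changes above replace get_ksm_page()'s bool lock argument with a three-value enum (GET_KSM_PAGE_NOLOCK, GET_KSM_PAGE_LOCK, GET_KSM_PAGE_TRYLOCK) so that the new trylock mode can return -EBUSY instead of blocking. A small standalone sketch of the bool-to-enum refactor shape; everything below is invented for illustration:

#include <stdbool.h>
#include <stdio.h>

enum lock_mode { LOCK_NONE, LOCK_BLOCKING, LOCK_TRY };

static bool acquire(enum lock_mode mode)
{
	switch (mode) {
	case LOCK_NONE:
		return true;		/* caller handles locking itself */
	case LOCK_BLOCKING:
		/* lock(); */
		return true;
	case LOCK_TRY:
		/* return trylock(); pretend it is contended here */
		return false;
	}
	return false;
}

int main(void)
{
	/* call sites now document the locking mode they expect */
	printf("%d %d\n", acquire(LOCK_BLOCKING), acquire(LOCK_TRY));
	return 0;
}

A two-state bool could not have expressed the third mode, and the enum makes the existing call sites self-describing.
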
diff --git a/mm/list_lru.c b/mm/list_lru.c index 5b30625fd365..0730bf8ff39f 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c | |||
@@ -601,7 +601,6 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, | |||
601 | struct lock_class_key *key, struct shrinker *shrinker) | 601 | struct lock_class_key *key, struct shrinker *shrinker) |
602 | { | 602 | { |
603 | int i; | 603 | int i; |
604 | size_t size = sizeof(*lru->node) * nr_node_ids; | ||
605 | int err = -ENOMEM; | 604 | int err = -ENOMEM; |
606 | 605 | ||
607 | #ifdef CONFIG_MEMCG_KMEM | 606 | #ifdef CONFIG_MEMCG_KMEM |
@@ -612,7 +611,7 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, | |||
612 | #endif | 611 | #endif |
613 | memcg_get_cache_ids(); | 612 | memcg_get_cache_ids(); |
614 | 613 | ||
615 | lru->node = kzalloc(size, GFP_KERNEL); | 614 | lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); |
616 | if (!lru->node) | 615 | if (!lru->node) |
617 | goto out; | 616 | goto out; |
618 | 617 | ||
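
The list_lru.c hunk swaps an open-coded kzalloc(sizeof(*lru->node) * nr_node_ids, ...) for kcalloc(nr_node_ids, sizeof(*lru->node), ...), moving the multiplication into the allocator where it is overflow-checked. The userspace analogue is preferring calloc() over malloc(n * size):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	size_t n = 4;
	int *a = calloc(n, sizeof(*a));	/* zeroed, multiplication checked */

	if (!a)
		return 1;
	printf("%d\n", a[0]);
	free(a);
	return 0;
}
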
diff --git a/mm/memblock.c b/mm/memblock.c index ea31045ba704..470601115892 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -2005,8 +2005,7 @@ DEFINE_SHOW_ATTRIBUTE(memblock_debug); | |||
2005 | static int __init memblock_init_debugfs(void) | 2005 | static int __init memblock_init_debugfs(void) |
2006 | { | 2006 | { |
2007 | struct dentry *root = debugfs_create_dir("memblock", NULL); | 2007 | struct dentry *root = debugfs_create_dir("memblock", NULL); |
2008 | if (!root) | 2008 | |
2009 | return -ENXIO; | ||
2010 | debugfs_create_file("memory", 0444, root, | 2009 | debugfs_create_file("memory", 0444, root, |
2011 | &memblock.memory, &memblock_debug_fops); | 2010 | &memblock.memory, &memblock_debug_fops); |
2012 | debugfs_create_file("reserved", 0444, root, | 2011 | debugfs_create_file("reserved", 0444, root, |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index af7f18b32389..532e0e2a4817 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/shmem_fs.h> | 39 | #include <linux/shmem_fs.h> |
40 | #include <linux/hugetlb.h> | 40 | #include <linux/hugetlb.h> |
41 | #include <linux/pagemap.h> | 41 | #include <linux/pagemap.h> |
42 | #include <linux/vm_event_item.h> | ||
42 | #include <linux/smp.h> | 43 | #include <linux/smp.h> |
43 | #include <linux/page-flags.h> | 44 | #include <linux/page-flags.h> |
44 | #include <linux/backing-dev.h> | 45 | #include <linux/backing-dev.h> |
@@ -248,6 +249,12 @@ enum res_type { | |||
248 | iter != NULL; \ | 249 | iter != NULL; \ |
249 | iter = mem_cgroup_iter(NULL, iter, NULL)) | 250 | iter = mem_cgroup_iter(NULL, iter, NULL)) |
250 | 251 | ||
252 | static inline bool should_force_charge(void) | ||
253 | { | ||
254 | return tsk_is_oom_victim(current) || fatal_signal_pending(current) || | ||
255 | (current->flags & PF_EXITING); | ||
256 | } | ||
257 | |||
251 | /* Some nice accessors for the vmpressure. */ | 258 | /* Some nice accessors for the vmpressure. */ |
252 | struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) | 259 | struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) |
253 | { | 260 | { |
@@ -1389,8 +1396,13 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1389 | }; | 1396 | }; |
1390 | bool ret; | 1397 | bool ret; |
1391 | 1398 | ||
1392 | mutex_lock(&oom_lock); | 1399 | if (mutex_lock_killable(&oom_lock)) |
1393 | ret = out_of_memory(&oc); | 1400 | return true; |
1401 | /* | ||
1402 | * A few threads which were not waiting at mutex_lock_killable() can | ||
1403 | * fail to bail out. Therefore, check again after holding oom_lock. | ||
1404 | */ | ||
1405 | ret = should_force_charge() || out_of_memory(&oc); | ||
1394 | mutex_unlock(&oom_lock); | 1406 | mutex_unlock(&oom_lock); |
1395 | return ret; | 1407 | return ret; |
1396 | } | 1408 | } |
@@ -2209,9 +2221,7 @@ retry: | |||
2209 | * bypass the last charges so that they can exit quickly and | 2221 | * bypass the last charges so that they can exit quickly and |
2210 | * free their memory. | 2222 | * free their memory. |
2211 | */ | 2223 | */ |
2212 | if (unlikely(tsk_is_oom_victim(current) || | 2224 | if (unlikely(should_force_charge())) |
2213 | fatal_signal_pending(current) || | ||
2214 | current->flags & PF_EXITING)) | ||
2215 | goto force; | 2225 | goto force; |
2216 | 2226 | ||
2217 | /* | 2227 | /* |
@@ -2352,13 +2362,13 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) | |||
2352 | 2362 | ||
2353 | static void lock_page_lru(struct page *page, int *isolated) | 2363 | static void lock_page_lru(struct page *page, int *isolated) |
2354 | { | 2364 | { |
2355 | struct zone *zone = page_zone(page); | 2365 | pg_data_t *pgdat = page_pgdat(page); |
2356 | 2366 | ||
2357 | spin_lock_irq(zone_lru_lock(zone)); | 2367 | spin_lock_irq(&pgdat->lru_lock); |
2358 | if (PageLRU(page)) { | 2368 | if (PageLRU(page)) { |
2359 | struct lruvec *lruvec; | 2369 | struct lruvec *lruvec; |
2360 | 2370 | ||
2361 | lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); | 2371 | lruvec = mem_cgroup_page_lruvec(page, pgdat); |
2362 | ClearPageLRU(page); | 2372 | ClearPageLRU(page); |
2363 | del_page_from_lru_list(page, lruvec, page_lru(page)); | 2373 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
2364 | *isolated = 1; | 2374 | *isolated = 1; |
@@ -2368,17 +2378,17 @@ static void lock_page_lru(struct page *page, int *isolated) | |||
2368 | 2378 | ||
2369 | static void unlock_page_lru(struct page *page, int isolated) | 2379 | static void unlock_page_lru(struct page *page, int isolated) |
2370 | { | 2380 | { |
2371 | struct zone *zone = page_zone(page); | 2381 | pg_data_t *pgdat = page_pgdat(page); |
2372 | 2382 | ||
2373 | if (isolated) { | 2383 | if (isolated) { |
2374 | struct lruvec *lruvec; | 2384 | struct lruvec *lruvec; |
2375 | 2385 | ||
2376 | lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); | 2386 | lruvec = mem_cgroup_page_lruvec(page, pgdat); |
2377 | VM_BUG_ON_PAGE(PageLRU(page), page); | 2387 | VM_BUG_ON_PAGE(PageLRU(page), page); |
2378 | SetPageLRU(page); | 2388 | SetPageLRU(page); |
2379 | add_page_to_lru_list(page, lruvec, page_lru(page)); | 2389 | add_page_to_lru_list(page, lruvec, page_lru(page)); |
2380 | } | 2390 | } |
2381 | spin_unlock_irq(zone_lru_lock(zone)); | 2391 | spin_unlock_irq(&pgdat->lru_lock); |
2382 | } | 2392 | } |
2383 | 2393 | ||
2384 | static void commit_charge(struct page *page, struct mem_cgroup *memcg, | 2394 | static void commit_charge(struct page *page, struct mem_cgroup *memcg, |
@@ -2573,7 +2583,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep) | |||
2573 | } | 2583 | } |
2574 | 2584 | ||
2575 | /** | 2585 | /** |
2576 | * memcg_kmem_charge_memcg: charge a kmem page | 2586 | * __memcg_kmem_charge_memcg: charge a kmem page |
2577 | * @page: page to charge | 2587 | * @page: page to charge |
2578 | * @gfp: reclaim mode | 2588 | * @gfp: reclaim mode |
2579 | * @order: allocation order | 2589 | * @order: allocation order |
@@ -2581,7 +2591,7 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep) | |||
2581 | * | 2591 | * |
2582 | * Returns 0 on success, an error code on failure. | 2592 | * Returns 0 on success, an error code on failure. |
2583 | */ | 2593 | */ |
2584 | int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, | 2594 | int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, |
2585 | struct mem_cgroup *memcg) | 2595 | struct mem_cgroup *memcg) |
2586 | { | 2596 | { |
2587 | unsigned int nr_pages = 1 << order; | 2597 | unsigned int nr_pages = 1 << order; |
@@ -2604,24 +2614,24 @@ int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, | |||
2604 | } | 2614 | } |
2605 | 2615 | ||
2606 | /** | 2616 | /** |
2607 | * memcg_kmem_charge: charge a kmem page to the current memory cgroup | 2617 | * __memcg_kmem_charge: charge a kmem page to the current memory cgroup |
2608 | * @page: page to charge | 2618 | * @page: page to charge |
2609 | * @gfp: reclaim mode | 2619 | * @gfp: reclaim mode |
2610 | * @order: allocation order | 2620 | * @order: allocation order |
2611 | * | 2621 | * |
2612 | * Returns 0 on success, an error code on failure. | 2622 | * Returns 0 on success, an error code on failure. |
2613 | */ | 2623 | */ |
2614 | int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) | 2624 | int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) |
2615 | { | 2625 | { |
2616 | struct mem_cgroup *memcg; | 2626 | struct mem_cgroup *memcg; |
2617 | int ret = 0; | 2627 | int ret = 0; |
2618 | 2628 | ||
2619 | if (mem_cgroup_disabled() || memcg_kmem_bypass()) | 2629 | if (memcg_kmem_bypass()) |
2620 | return 0; | 2630 | return 0; |
2621 | 2631 | ||
2622 | memcg = get_mem_cgroup_from_current(); | 2632 | memcg = get_mem_cgroup_from_current(); |
2623 | if (!mem_cgroup_is_root(memcg)) { | 2633 | if (!mem_cgroup_is_root(memcg)) { |
2624 | ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); | 2634 | ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); |
2625 | if (!ret) | 2635 | if (!ret) |
2626 | __SetPageKmemcg(page); | 2636 | __SetPageKmemcg(page); |
2627 | } | 2637 | } |
@@ -2629,11 +2639,11 @@ int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) | |||
2629 | return ret; | 2639 | return ret; |
2630 | } | 2640 | } |
2631 | /** | 2641 | /** |
2632 | * memcg_kmem_uncharge: uncharge a kmem page | 2642 | * __memcg_kmem_uncharge: uncharge a kmem page |
2633 | * @page: page to uncharge | 2643 | * @page: page to uncharge |
2634 | * @order: allocation order | 2644 | * @order: allocation order |
2635 | */ | 2645 | */ |
2636 | void memcg_kmem_uncharge(struct page *page, int order) | 2646 | void __memcg_kmem_uncharge(struct page *page, int order) |
2637 | { | 2647 | { |
2638 | struct mem_cgroup *memcg = page->mem_cgroup; | 2648 | struct mem_cgroup *memcg = page->mem_cgroup; |
2639 | unsigned int nr_pages = 1 << order; | 2649 | unsigned int nr_pages = 1 << order; |
@@ -2664,7 +2674,7 @@ void memcg_kmem_uncharge(struct page *page, int order) | |||
2664 | 2674 | ||
2665 | /* | 2675 | /* |
2666 | * Because tail pages are not marked as "used", set it. We're under | 2676 | * Because tail pages are not marked as "used", set it. We're under |
2667 | * zone_lru_lock and migration entries setup in all page mappings. | 2677 | * pgdat->lru_lock and migration entries setup in all page mappings. |
2668 | */ | 2678 | */ |
2669 | void mem_cgroup_split_huge_fixup(struct page *head) | 2679 | void mem_cgroup_split_huge_fixup(struct page *head) |
2670 | { | 2680 | { |
@@ -3337,7 +3347,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) | |||
3337 | const struct numa_stat *stat; | 3347 | const struct numa_stat *stat; |
3338 | int nid; | 3348 | int nid; |
3339 | unsigned long nr; | 3349 | unsigned long nr; |
3340 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 3350 | struct mem_cgroup *memcg = mem_cgroup_from_seq(m); |
3341 | 3351 | ||
3342 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { | 3352 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { |
3343 | nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); | 3353 | nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); |
@@ -3388,7 +3398,7 @@ static const char *const memcg1_event_names[] = { | |||
3388 | 3398 | ||
3389 | static int memcg_stat_show(struct seq_file *m, void *v) | 3399 | static int memcg_stat_show(struct seq_file *m, void *v) |
3390 | { | 3400 | { |
3391 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 3401 | struct mem_cgroup *memcg = mem_cgroup_from_seq(m); |
3392 | unsigned long memory, memsw; | 3402 | unsigned long memory, memsw; |
3393 | struct mem_cgroup *mi; | 3403 | struct mem_cgroup *mi; |
3394 | unsigned int i; | 3404 | unsigned int i; |
@@ -3626,8 +3636,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | |||
3626 | size = thresholds->primary ? thresholds->primary->size + 1 : 1; | 3636 | size = thresholds->primary ? thresholds->primary->size + 1 : 1; |
3627 | 3637 | ||
3628 | /* Allocate memory for new array of thresholds */ | 3638 | /* Allocate memory for new array of thresholds */ |
3629 | new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), | 3639 | new = kmalloc(struct_size(new, entries, size), GFP_KERNEL); |
3630 | GFP_KERNEL); | ||
3631 | if (!new) { | 3640 | if (!new) { |
3632 | ret = -ENOMEM; | 3641 | ret = -ENOMEM; |
3633 | goto unlock; | 3642 | goto unlock; |
@@ -3821,7 +3830,7 @@ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, | |||
3821 | 3830 | ||
3822 | static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) | 3831 | static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) |
3823 | { | 3832 | { |
3824 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); | 3833 | struct mem_cgroup *memcg = mem_cgroup_from_seq(sf); |
3825 | 3834 | ||
3826 | seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); | 3835 | seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); |
3827 | seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); | 3836 | seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); |
@@ -4420,7 +4429,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) | |||
4420 | static struct mem_cgroup *mem_cgroup_alloc(void) | 4429 | static struct mem_cgroup *mem_cgroup_alloc(void) |
4421 | { | 4430 | { |
4422 | struct mem_cgroup *memcg; | 4431 | struct mem_cgroup *memcg; |
4423 | size_t size; | 4432 | unsigned int size; |
4424 | int node; | 4433 | int node; |
4425 | 4434 | ||
4426 | size = sizeof(struct mem_cgroup); | 4435 | size = sizeof(struct mem_cgroup); |
@@ -5354,6 +5363,16 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) | |||
5354 | root_mem_cgroup->use_hierarchy = false; | 5363 | root_mem_cgroup->use_hierarchy = false; |
5355 | } | 5364 | } |
5356 | 5365 | ||
5366 | static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) | ||
5367 | { | ||
5368 | if (value == PAGE_COUNTER_MAX) | ||
5369 | seq_puts(m, "max\n"); | ||
5370 | else | ||
5371 | seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE); | ||
5372 | |||
5373 | return 0; | ||
5374 | } | ||
5375 | |||
5357 | static u64 memory_current_read(struct cgroup_subsys_state *css, | 5376 | static u64 memory_current_read(struct cgroup_subsys_state *css, |
5358 | struct cftype *cft) | 5377 | struct cftype *cft) |
5359 | { | 5378 | { |
@@ -5364,15 +5383,8 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, | |||
5364 | 5383 | ||
5365 | static int memory_min_show(struct seq_file *m, void *v) | 5384 | static int memory_min_show(struct seq_file *m, void *v) |
5366 | { | 5385 | { |
5367 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5386 | return seq_puts_memcg_tunable(m, |
5368 | unsigned long min = READ_ONCE(memcg->memory.min); | 5387 | READ_ONCE(mem_cgroup_from_seq(m)->memory.min)); |
5369 | |||
5370 | if (min == PAGE_COUNTER_MAX) | ||
5371 | seq_puts(m, "max\n"); | ||
5372 | else | ||
5373 | seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE); | ||
5374 | |||
5375 | return 0; | ||
5376 | } | 5388 | } |
5377 | 5389 | ||
5378 | static ssize_t memory_min_write(struct kernfs_open_file *of, | 5390 | static ssize_t memory_min_write(struct kernfs_open_file *of, |
@@ -5394,15 +5406,8 @@ static ssize_t memory_min_write(struct kernfs_open_file *of, | |||
5394 | 5406 | ||
5395 | static int memory_low_show(struct seq_file *m, void *v) | 5407 | static int memory_low_show(struct seq_file *m, void *v) |
5396 | { | 5408 | { |
5397 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5409 | return seq_puts_memcg_tunable(m, |
5398 | unsigned long low = READ_ONCE(memcg->memory.low); | 5410 | READ_ONCE(mem_cgroup_from_seq(m)->memory.low)); |
5399 | |||
5400 | if (low == PAGE_COUNTER_MAX) | ||
5401 | seq_puts(m, "max\n"); | ||
5402 | else | ||
5403 | seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); | ||
5404 | |||
5405 | return 0; | ||
5406 | } | 5411 | } |
5407 | 5412 | ||
5408 | static ssize_t memory_low_write(struct kernfs_open_file *of, | 5413 | static ssize_t memory_low_write(struct kernfs_open_file *of, |
@@ -5424,15 +5429,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, | |||
5424 | 5429 | ||
5425 | static int memory_high_show(struct seq_file *m, void *v) | 5430 | static int memory_high_show(struct seq_file *m, void *v) |
5426 | { | 5431 | { |
5427 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5432 | return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high)); |
5428 | unsigned long high = READ_ONCE(memcg->high); | ||
5429 | |||
5430 | if (high == PAGE_COUNTER_MAX) | ||
5431 | seq_puts(m, "max\n"); | ||
5432 | else | ||
5433 | seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); | ||
5434 | |||
5435 | return 0; | ||
5436 | } | 5433 | } |
5437 | 5434 | ||
5438 | static ssize_t memory_high_write(struct kernfs_open_file *of, | 5435 | static ssize_t memory_high_write(struct kernfs_open_file *of, |
@@ -5461,15 +5458,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, | |||
5461 | 5458 | ||
5462 | static int memory_max_show(struct seq_file *m, void *v) | 5459 | static int memory_max_show(struct seq_file *m, void *v) |
5463 | { | 5460 | { |
5464 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5461 | return seq_puts_memcg_tunable(m, |
5465 | unsigned long max = READ_ONCE(memcg->memory.max); | 5462 | READ_ONCE(mem_cgroup_from_seq(m)->memory.max)); |
5466 | |||
5467 | if (max == PAGE_COUNTER_MAX) | ||
5468 | seq_puts(m, "max\n"); | ||
5469 | else | ||
5470 | seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); | ||
5471 | |||
5472 | return 0; | ||
5473 | } | 5463 | } |
5474 | 5464 | ||
5475 | static ssize_t memory_max_write(struct kernfs_open_file *of, | 5465 | static ssize_t memory_max_write(struct kernfs_open_file *of, |
@@ -5523,7 +5513,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, | |||
5523 | 5513 | ||
5524 | static int memory_events_show(struct seq_file *m, void *v) | 5514 | static int memory_events_show(struct seq_file *m, void *v) |
5525 | { | 5515 | { |
5526 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5516 | struct mem_cgroup *memcg = mem_cgroup_from_seq(m); |
5527 | 5517 | ||
5528 | seq_printf(m, "low %lu\n", | 5518 | seq_printf(m, "low %lu\n", |
5529 | atomic_long_read(&memcg->memory_events[MEMCG_LOW])); | 5519 | atomic_long_read(&memcg->memory_events[MEMCG_LOW])); |
@@ -5541,7 +5531,7 @@ static int memory_events_show(struct seq_file *m, void *v) | |||
5541 | 5531 | ||
5542 | static int memory_stat_show(struct seq_file *m, void *v) | 5532 | static int memory_stat_show(struct seq_file *m, void *v) |
5543 | { | 5533 | { |
5544 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5534 | struct mem_cgroup *memcg = mem_cgroup_from_seq(m); |
5545 | struct accumulated_stats acc; | 5535 | struct accumulated_stats acc; |
5546 | int i; | 5536 | int i; |
5547 | 5537 | ||
@@ -5582,6 +5572,15 @@ static int memory_stat_show(struct seq_file *m, void *v) | |||
5582 | seq_printf(m, "file_writeback %llu\n", | 5572 | seq_printf(m, "file_writeback %llu\n", |
5583 | (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); | 5573 | (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); |
5584 | 5574 | ||
5575 | /* | ||
5576 | * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter | ||
5577 | * with the NR_ANON_THP vm counter, but right now it's a pain in the | ||
5578 | * arse because it requires migrating the work out of rmap to a place | ||
5579 | * where the page->mem_cgroup is set up and stable. | ||
5580 | */ | ||
5581 | seq_printf(m, "anon_thp %llu\n", | ||
5582 | (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE); | ||
5583 | |||
5585 | for (i = 0; i < NR_LRU_LISTS; i++) | 5584 | for (i = 0; i < NR_LRU_LISTS; i++) |
5586 | seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], | 5585 | seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], |
5587 | (u64)acc.lru_pages[i] * PAGE_SIZE); | 5586 | (u64)acc.lru_pages[i] * PAGE_SIZE); |
@@ -5613,12 +5612,18 @@ static int memory_stat_show(struct seq_file *m, void *v) | |||
5613 | seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); | 5612 | seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); |
5614 | seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); | 5613 | seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); |
5615 | 5614 | ||
5615 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
5616 | seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]); | ||
5617 | seq_printf(m, "thp_collapse_alloc %lu\n", | ||
5618 | acc.events[THP_COLLAPSE_ALLOC]); | ||
5619 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
5620 | |||
5616 | return 0; | 5621 | return 0; |
5617 | } | 5622 | } |
5618 | 5623 | ||
5619 | static int memory_oom_group_show(struct seq_file *m, void *v) | 5624 | static int memory_oom_group_show(struct seq_file *m, void *v) |
5620 | { | 5625 | { |
5621 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5626 | struct mem_cgroup *memcg = mem_cgroup_from_seq(m); |
5622 | 5627 | ||
5623 | seq_printf(m, "%d\n", memcg->oom_group); | 5628 | seq_printf(m, "%d\n", memcg->oom_group); |
5624 | 5629 | ||
@@ -5747,7 +5752,7 @@ struct cgroup_subsys memory_cgrp_subsys = { | |||
5747 | * | 5752 | * |
5748 | * | memory.current, if memory.current < memory.low | 5753 | * | memory.current, if memory.current < memory.low |
5749 | * low_usage = | | 5754 | * low_usage = | |
5750 | | 0, otherwise. | 5755 | * | 0, otherwise. |
5751 | * | 5756 | * |
5752 | * | 5757 | * |
5753 | * Such definition of the effective memory.low provides the expected | 5758 | * Such definition of the effective memory.low provides the expected |
@@ -6601,15 +6606,8 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, | |||
6601 | 6606 | ||
6602 | static int swap_max_show(struct seq_file *m, void *v) | 6607 | static int swap_max_show(struct seq_file *m, void *v) |
6603 | { | 6608 | { |
6604 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 6609 | return seq_puts_memcg_tunable(m, |
6605 | unsigned long max = READ_ONCE(memcg->swap.max); | 6610 | READ_ONCE(mem_cgroup_from_seq(m)->swap.max)); |
6606 | |||
6607 | if (max == PAGE_COUNTER_MAX) | ||
6608 | seq_puts(m, "max\n"); | ||
6609 | else | ||
6610 | seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); | ||
6611 | |||
6612 | return 0; | ||
6613 | } | 6611 | } |
6614 | 6612 | ||
6615 | static ssize_t swap_max_write(struct kernfs_open_file *of, | 6613 | static ssize_t swap_max_write(struct kernfs_open_file *of, |
@@ -6631,7 +6629,7 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, | |||
6631 | 6629 | ||
6632 | static int swap_events_show(struct seq_file *m, void *v) | 6630 | static int swap_events_show(struct seq_file *m, void *v) |
6633 | { | 6631 | { |
6634 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 6632 | struct mem_cgroup *memcg = mem_cgroup_from_seq(m); |
6635 | 6633 | ||
6636 | seq_printf(m, "max %lu\n", | 6634 | seq_printf(m, "max %lu\n", |
6637 | atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); | 6635 | atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); |
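The mem_cgroup_from_seq() + seq_puts_memcg_tunable() pair introduced in this file reduces every "max or bytes" cgroup2 show handler to a single statement. A purely illustrative extra tunable would now look like the sketch below; the swap.high field does not exist in this series and is used only to show the shape of the helper call:

	static int swap_high_show(struct seq_file *m, void *v)
	{
		/* Hypothetical field, shown only to illustrate the helper. */
		return seq_puts_memcg_tunable(m,
			READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
	}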
diff --git a/mm/memfd.c b/mm/memfd.c index 97264c79d2cd..650e65a46b9c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c | |||
@@ -131,7 +131,8 @@ static unsigned int *memfd_file_seals_ptr(struct file *file) | |||
131 | #define F_ALL_SEALS (F_SEAL_SEAL | \ | 131 | #define F_ALL_SEALS (F_SEAL_SEAL | \ |
132 | F_SEAL_SHRINK | \ | 132 | F_SEAL_SHRINK | \ |
133 | F_SEAL_GROW | \ | 133 | F_SEAL_GROW | \ |
134 | F_SEAL_WRITE) | 134 | F_SEAL_WRITE | \ |
135 | F_SEAL_FUTURE_WRITE) | ||
135 | 136 | ||
136 | static int memfd_add_seals(struct file *file, unsigned int seals) | 137 | static int memfd_add_seals(struct file *file, unsigned int seals) |
137 | { | 138 | { |
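F_SEAL_FUTURE_WRITE blocks write() and new writable mappings while leaving already-existing writable mappings usable. A userspace sketch, assuming headers new enough to define the seal (error handling trimmed):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int make_future_sealed_buf(size_t len)
	{
		int fd = memfd_create("buf", MFD_ALLOW_SEALING);

		if (fd < 0)
			return -1;
		/* Existing writable mappings keep working; new PROT_WRITE
		 * mappings and write() calls are refused from here on. */
		if (ftruncate(fd, len) < 0 ||
		    fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0) {
			close(fd);
			return -1;
		}
		return fd;
	}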
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 831be5ff5f4d..fc8b51744579 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -1825,19 +1825,17 @@ static int soft_offline_in_use_page(struct page *page, int flags) | |||
1825 | struct page *hpage = compound_head(page); | 1825 | struct page *hpage = compound_head(page); |
1826 | 1826 | ||
1827 | if (!PageHuge(page) && PageTransHuge(hpage)) { | 1827 | if (!PageHuge(page) && PageTransHuge(hpage)) { |
1828 | lock_page(hpage); | 1828 | lock_page(page); |
1829 | if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { | 1829 | if (!PageAnon(page) || unlikely(split_huge_page(page))) { |
1830 | unlock_page(hpage); | 1830 | unlock_page(page); |
1831 | if (!PageAnon(hpage)) | 1831 | if (!PageAnon(page)) |
1832 | pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); | 1832 | pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); |
1833 | else | 1833 | else |
1834 | pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); | 1834 | pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); |
1835 | put_hwpoison_page(hpage); | 1835 | put_hwpoison_page(page); |
1836 | return -EBUSY; | 1836 | return -EBUSY; |
1837 | } | 1837 | } |
1838 | unlock_page(hpage); | 1838 | unlock_page(page); |
1839 | get_hwpoison_page(page); | ||
1840 | put_hwpoison_page(hpage); | ||
1841 | } | 1839 | } |
1842 | 1840 | ||
1843 | /* | 1841 | /* |
diff --git a/mm/memory.c b/mm/memory.c index e11ca9dd823f..47fe250307c7 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -69,6 +69,7 @@ | |||
69 | #include <linux/userfaultfd_k.h> | 69 | #include <linux/userfaultfd_k.h> |
70 | #include <linux/dax.h> | 70 | #include <linux/dax.h> |
71 | #include <linux/oom.h> | 71 | #include <linux/oom.h> |
72 | #include <linux/numa.h> | ||
72 | 73 | ||
73 | #include <asm/io.h> | 74 | #include <asm/io.h> |
74 | #include <asm/mmu_context.h> | 75 | #include <asm/mmu_context.h> |
@@ -1451,7 +1452,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
1451 | spinlock_t *ptl; | 1452 | spinlock_t *ptl; |
1452 | 1453 | ||
1453 | retval = -EINVAL; | 1454 | retval = -EINVAL; |
1454 | if (PageAnon(page)) | 1455 | if (PageAnon(page) || PageSlab(page) || page_has_type(page)) |
1455 | goto out; | 1456 | goto out; |
1456 | retval = -ENOMEM; | 1457 | retval = -ENOMEM; |
1457 | flush_dcache_page(page); | 1458 | flush_dcache_page(page); |
@@ -1503,6 +1504,8 @@ out: | |||
1503 | * under mm->mmap_sem write-lock, so it can change vma->vm_flags. | 1504 | * under mm->mmap_sem write-lock, so it can change vma->vm_flags. |
1504 | * Caller must set VM_MIXEDMAP on vma if it wants to call this | 1505 | * Caller must set VM_MIXEDMAP on vma if it wants to call this |
1505 | * function from other places, for example from page-fault handler. | 1506 | * function from other places, for example from page-fault handler. |
1507 | * | ||
1508 | * Return: %0 on success, negative error code otherwise. | ||
1506 | */ | 1509 | */ |
1507 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | 1510 | int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, |
1508 | struct page *page) | 1511 | struct page *page) |
@@ -1830,7 +1833,9 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, | |||
1830 | * @size: size of map area | 1833 | * @size: size of map area |
1831 | * @prot: page protection flags for this mapping | 1834 | * @prot: page protection flags for this mapping |
1832 | * | 1835 | * |
1833 | * Note: this is only safe if the mm semaphore is held when called. | 1836 | * Note: this is only safe if the mm semaphore is held when called. |
1837 | * | ||
1838 | * Return: %0 on success, negative error code otherwise. | ||
1834 | */ | 1839 | */ |
1835 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, | 1840 | int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, |
1836 | unsigned long pfn, unsigned long size, pgprot_t prot) | 1841 | unsigned long pfn, unsigned long size, pgprot_t prot) |
@@ -1903,6 +1908,8 @@ EXPORT_SYMBOL(remap_pfn_range); | |||
1903 | * | 1908 | * |
1904 | * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get | 1909 | * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get |
1905 | * whatever write-combining details or similar. | 1910 | * whatever write-combining details or similar. |
1911 | * | ||
1912 | * Return: %0 on success, negative error code otherwise. | ||
1906 | */ | 1913 | */ |
1907 | int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) | 1914 | int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) |
1908 | { | 1915 | { |
@@ -2381,12 +2388,13 @@ oom: | |||
2381 | * | 2388 | * |
2382 | * This function handles all that is needed to finish a write page fault in a | 2389 | * This function handles all that is needed to finish a write page fault in a |
2383 | * shared mapping due to PTE being read-only once the mapped page is prepared. | 2390 | * shared mapping due to PTE being read-only once the mapped page is prepared. |
2384 | * It handles locking of PTE and modifying it. The function returns | 2391 | * It handles locking of PTE and modifying it. |
2385 | * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE | ||
2386 | * lock. | ||
2387 | * | 2392 | * |
2388 | * The function expects the page to be locked or other protection against | 2393 | * The function expects the page to be locked or other protection against |
2389 | * concurrent faults / writeback (such as DAX radix tree locks). | 2394 | * concurrent faults / writeback (such as DAX radix tree locks). |
2395 | * | ||
2396 | * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before | ||
2397 | * we acquired PTE lock. | ||
2390 | */ | 2398 | */ |
2391 | vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) | 2399 | vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) |
2392 | { | 2400 | { |
@@ -2504,8 +2512,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) | |||
2504 | * Take out anonymous pages first, anonymous shared vmas are | 2512 | * Take out anonymous pages first, anonymous shared vmas are |
2505 | * not dirty accountable. | 2513 | * not dirty accountable. |
2506 | */ | 2514 | */ |
2507 | if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { | 2515 | if (PageAnon(vmf->page)) { |
2508 | int total_map_swapcount; | 2516 | int total_map_swapcount; |
2517 | if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) || | ||
2518 | page_count(vmf->page) != 1)) | ||
2519 | goto copy; | ||
2509 | if (!trylock_page(vmf->page)) { | 2520 | if (!trylock_page(vmf->page)) { |
2510 | get_page(vmf->page); | 2521 | get_page(vmf->page); |
2511 | pte_unmap_unlock(vmf->pte, vmf->ptl); | 2522 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
@@ -2520,6 +2531,15 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) | |||
2520 | } | 2531 | } |
2521 | put_page(vmf->page); | 2532 | put_page(vmf->page); |
2522 | } | 2533 | } |
2534 | if (PageKsm(vmf->page)) { | ||
2535 | bool reused = reuse_ksm_page(vmf->page, vmf->vma, | ||
2536 | vmf->address); | ||
2537 | unlock_page(vmf->page); | ||
2538 | if (!reused) | ||
2539 | goto copy; | ||
2540 | wp_page_reuse(vmf); | ||
2541 | return VM_FAULT_WRITE; | ||
2542 | } | ||
2523 | if (reuse_swap_page(vmf->page, &total_map_swapcount)) { | 2543 | if (reuse_swap_page(vmf->page, &total_map_swapcount)) { |
2524 | if (total_map_swapcount == 1) { | 2544 | if (total_map_swapcount == 1) { |
2525 | /* | 2545 | /* |
@@ -2540,7 +2560,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) | |||
2540 | (VM_WRITE|VM_SHARED))) { | 2560 | (VM_WRITE|VM_SHARED))) { |
2541 | return wp_page_shared(vmf); | 2561 | return wp_page_shared(vmf); |
2542 | } | 2562 | } |
2543 | 2563 | copy: | |
2544 | /* | 2564 | /* |
2545 | * Ok, we need to copy. Oh, well.. | 2565 | * Ok, we need to copy. Oh, well.. |
2546 | */ | 2566 | */ |
@@ -3201,6 +3221,8 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) | |||
3201 | * | 3221 | * |
3202 | * Target users are page handler itself and implementations of | 3222 | * Target users are page handler itself and implementations of |
3203 | * vm_ops->map_pages. | 3223 | * vm_ops->map_pages. |
3224 | * | ||
3225 | * Return: %0 on success, %VM_FAULT_ code in case of error. | ||
3204 | */ | 3226 | */ |
3205 | vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, | 3227 | vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, |
3206 | struct page *page) | 3228 | struct page *page) |
@@ -3261,11 +3283,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, | |||
3261 | * This function handles all that is needed to finish a page fault once the | 3283 | * This function handles all that is needed to finish a page fault once the |
3262 | * page to fault in is prepared. It handles locking of PTEs, inserts PTE for | 3284 | * page to fault in is prepared. It handles locking of PTEs, inserts PTE for |
3263 | * given page, adds reverse page mapping, handles memcg charges and LRU | 3285 | * given page, adds reverse page mapping, handles memcg charges and LRU |
3264 | * addition. The function returns 0 on success, VM_FAULT_ code in case of | 3286 | * addition. |
3265 | * error. | ||
3266 | * | 3287 | * |
3267 | * The function expects the page to be locked and on success it consumes a | 3288 | * The function expects the page to be locked and on success it consumes a |
3268 | * reference of a page being mapped (for the PTE which maps it). | 3289 | * reference of a page being mapped (for the PTE which maps it). |
3290 | * | ||
3291 | * Return: %0 on success, %VM_FAULT_ code in case of error. | ||
3269 | */ | 3292 | */ |
3270 | vm_fault_t finish_fault(struct vm_fault *vmf) | 3293 | vm_fault_t finish_fault(struct vm_fault *vmf) |
3271 | { | 3294 | { |
@@ -3321,12 +3344,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, | |||
3321 | 3344 | ||
3322 | static int __init fault_around_debugfs(void) | 3345 | static int __init fault_around_debugfs(void) |
3323 | { | 3346 | { |
3324 | void *ret; | 3347 | debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, |
3325 | 3348 | &fault_around_bytes_fops); | |
3326 | ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, | ||
3327 | &fault_around_bytes_fops); | ||
3328 | if (!ret) | ||
3329 | pr_warn("Failed to create fault_around_bytes in debugfs"); | ||
3330 | return 0; | 3349 | return 0; |
3331 | } | 3350 | } |
3332 | late_initcall(fault_around_debugfs); | 3351 | late_initcall(fault_around_debugfs); |
@@ -3517,10 +3536,13 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) | |||
3517 | * but allow concurrent faults). | 3536 | * but allow concurrent faults). |
3518 | * The mmap_sem may have been released depending on flags and our | 3537 | * The mmap_sem may have been released depending on flags and our |
3519 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3538 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3539 | * If mmap_sem is released, vma may become invalid (for example | ||
3540 | * by other thread calling munmap()). | ||
3520 | */ | 3541 | */ |
3521 | static vm_fault_t do_fault(struct vm_fault *vmf) | 3542 | static vm_fault_t do_fault(struct vm_fault *vmf) |
3522 | { | 3543 | { |
3523 | struct vm_area_struct *vma = vmf->vma; | 3544 | struct vm_area_struct *vma = vmf->vma; |
3545 | struct mm_struct *vm_mm = vma->vm_mm; | ||
3524 | vm_fault_t ret; | 3546 | vm_fault_t ret; |
3525 | 3547 | ||
3526 | /* | 3548 | /* |
@@ -3561,7 +3583,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf) | |||
3561 | 3583 | ||
3562 | /* preallocated pagetable is unused: free it */ | 3584 | /* preallocated pagetable is unused: free it */ |
3563 | if (vmf->prealloc_pte) { | 3585 | if (vmf->prealloc_pte) { |
3564 | pte_free(vma->vm_mm, vmf->prealloc_pte); | 3586 | pte_free(vm_mm, vmf->prealloc_pte); |
3565 | vmf->prealloc_pte = NULL; | 3587 | vmf->prealloc_pte = NULL; |
3566 | } | 3588 | } |
3567 | return ret; | 3589 | return ret; |
@@ -3586,11 +3608,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) | |||
3586 | { | 3608 | { |
3587 | struct vm_area_struct *vma = vmf->vma; | 3609 | struct vm_area_struct *vma = vmf->vma; |
3588 | struct page *page = NULL; | 3610 | struct page *page = NULL; |
3589 | int page_nid = -1; | 3611 | int page_nid = NUMA_NO_NODE; |
3590 | int last_cpupid; | 3612 | int last_cpupid; |
3591 | int target_nid; | 3613 | int target_nid; |
3592 | bool migrated = false; | 3614 | bool migrated = false; |
3593 | pte_t pte; | 3615 | pte_t pte, old_pte; |
3594 | bool was_writable = pte_savedwrite(vmf->orig_pte); | 3616 | bool was_writable = pte_savedwrite(vmf->orig_pte); |
3595 | int flags = 0; | 3617 | int flags = 0; |
3596 | 3618 | ||
@@ -3610,12 +3632,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) | |||
3610 | * Make it present again, Depending on how arch implementes non | 3632 | * Make it present again, Depending on how arch implementes non |
3611 | * accessible ptes, some can allow access by kernel mode. | 3633 | * accessible ptes, some can allow access by kernel mode. |
3612 | */ | 3634 | */ |
3613 | pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte); | 3635 | old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); |
3614 | pte = pte_modify(pte, vma->vm_page_prot); | 3636 | pte = pte_modify(old_pte, vma->vm_page_prot); |
3615 | pte = pte_mkyoung(pte); | 3637 | pte = pte_mkyoung(pte); |
3616 | if (was_writable) | 3638 | if (was_writable) |
3617 | pte = pte_mkwrite(pte); | 3639 | pte = pte_mkwrite(pte); |
3618 | ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte); | 3640 | ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); |
3619 | update_mmu_cache(vma, vmf->address, vmf->pte); | 3641 | update_mmu_cache(vma, vmf->address, vmf->pte); |
3620 | 3642 | ||
3621 | page = vm_normal_page(vma, vmf->address, pte); | 3643 | page = vm_normal_page(vma, vmf->address, pte); |
@@ -3653,7 +3675,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) | |||
3653 | target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, | 3675 | target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, |
3654 | &flags); | 3676 | &flags); |
3655 | pte_unmap_unlock(vmf->pte, vmf->ptl); | 3677 | pte_unmap_unlock(vmf->pte, vmf->ptl); |
3656 | if (target_nid == -1) { | 3678 | if (target_nid == NUMA_NO_NODE) { |
3657 | put_page(page); | 3679 | put_page(page); |
3658 | goto out; | 3680 | goto out; |
3659 | } | 3681 | } |
@@ -3667,7 +3689,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) | |||
3667 | flags |= TNF_MIGRATE_FAIL; | 3689 | flags |= TNF_MIGRATE_FAIL; |
3668 | 3690 | ||
3669 | out: | 3691 | out: |
3670 | if (page_nid != -1) | 3692 | if (page_nid != NUMA_NO_NODE) |
3671 | task_numa_fault(last_cpupid, page_nid, 1, flags); | 3693 | task_numa_fault(last_cpupid, page_nid, 1, flags); |
3672 | return 0; | 3694 | return 0; |
3673 | } | 3695 | } |
@@ -4150,7 +4172,7 @@ EXPORT_SYMBOL(follow_pte_pmd); | |||
4150 | * | 4172 | * |
4151 | * Only IO mappings and raw PFN mappings are allowed. | 4173 | * Only IO mappings and raw PFN mappings are allowed. |
4152 | * | 4174 | * |
4153 | * Returns zero and the pfn at @pfn on success, -ve otherwise. | 4175 | * Return: zero and the pfn at @pfn on success, -ve otherwise. |
4154 | */ | 4176 | */ |
4155 | int follow_pfn(struct vm_area_struct *vma, unsigned long address, | 4177 | int follow_pfn(struct vm_area_struct *vma, unsigned long address, |
4156 | unsigned long *pfn) | 4178 | unsigned long *pfn) |
@@ -4300,6 +4322,8 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | |||
4300 | * @gup_flags: flags modifying lookup behaviour | 4322 | * @gup_flags: flags modifying lookup behaviour |
4301 | * | 4323 | * |
4302 | * The caller must hold a reference on @mm. | 4324 | * The caller must hold a reference on @mm. |
4325 | * | ||
4326 | * Return: number of bytes copied from source to destination. | ||
4303 | */ | 4327 | */ |
4304 | int access_remote_vm(struct mm_struct *mm, unsigned long addr, | 4328 | int access_remote_vm(struct mm_struct *mm, unsigned long addr, |
4305 | void *buf, int len, unsigned int gup_flags) | 4329 | void *buf, int len, unsigned int gup_flags) |
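For context on the tightened PageAnon/PageSlab/page_has_type() check in insert_page(): vm_insert_page() is meant for order-0 kernel pages the driver owns outright. A minimal sketch of that use, assuming a single-page mapping and hypothetical mydrv_* naming:

	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
	{
		struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		int ret;

		if (!page)
			return -ENOMEM;
		ret = vm_insert_page(vma, vma->vm_start, page);
		/* vm_insert_page() took its own reference; dropping ours lets
		 * the page be freed when the mapping is torn down. */
		put_page(page);
		return ret;
	}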
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1ad28323fb9f..6b05576fb4ec 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -47,7 +47,7 @@ | |||
47 | * and restore_online_page_callback() for generic callback restore. | 47 | * and restore_online_page_callback() for generic callback restore. |
48 | */ | 48 | */ |
49 | 49 | ||
50 | static void generic_online_page(struct page *page); | 50 | static void generic_online_page(struct page *page, unsigned int order); |
51 | 51 | ||
52 | static online_page_callback_t online_page_callback = generic_online_page; | 52 | static online_page_callback_t online_page_callback = generic_online_page; |
53 | static DEFINE_MUTEX(online_page_callback_lock); | 53 | static DEFINE_MUTEX(online_page_callback_lock); |
@@ -656,26 +656,40 @@ void __online_page_free(struct page *page) | |||
656 | } | 656 | } |
657 | EXPORT_SYMBOL_GPL(__online_page_free); | 657 | EXPORT_SYMBOL_GPL(__online_page_free); |
658 | 658 | ||
659 | static void generic_online_page(struct page *page) | 659 | static void generic_online_page(struct page *page, unsigned int order) |
660 | { | 660 | { |
661 | __online_page_set_limits(page); | 661 | kernel_map_pages(page, 1 << order, 1); |
662 | __online_page_increment_counters(page); | 662 | __free_pages_core(page, order); |
663 | __online_page_free(page); | 663 | totalram_pages_add(1UL << order); |
664 | #ifdef CONFIG_HIGHMEM | ||
665 | if (PageHighMem(page)) | ||
666 | totalhigh_pages_add(1UL << order); | ||
667 | #endif | ||
668 | } | ||
669 | |||
670 | static int online_pages_blocks(unsigned long start, unsigned long nr_pages) | ||
671 | { | ||
672 | unsigned long end = start + nr_pages; | ||
673 | int order, onlined_pages = 0; | ||
674 | |||
675 | while (start < end) { | ||
676 | order = min(MAX_ORDER - 1, | ||
677 | get_order(PFN_PHYS(end) - PFN_PHYS(start))); | ||
678 | (*online_page_callback)(pfn_to_page(start), order); | ||
679 | |||
680 | onlined_pages += (1UL << order); | ||
681 | start += (1UL << order); | ||
682 | } | ||
683 | return onlined_pages; | ||
664 | } | 684 | } |
665 | 685 | ||
666 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | 686 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, |
667 | void *arg) | 687 | void *arg) |
668 | { | 688 | { |
669 | unsigned long i; | ||
670 | unsigned long onlined_pages = *(unsigned long *)arg; | 689 | unsigned long onlined_pages = *(unsigned long *)arg; |
671 | struct page *page; | ||
672 | 690 | ||
673 | if (PageReserved(pfn_to_page(start_pfn))) | 691 | if (PageReserved(pfn_to_page(start_pfn))) |
674 | for (i = 0; i < nr_pages; i++) { | 692 | onlined_pages += online_pages_blocks(start_pfn, nr_pages); |
675 | page = pfn_to_page(start_pfn + i); | ||
676 | (*online_page_callback)(page); | ||
677 | onlined_pages++; | ||
678 | } | ||
679 | 693 | ||
680 | online_mem_sections(start_pfn, start_pfn + nr_pages); | 694 | online_mem_sections(start_pfn, start_pfn + nr_pages); |
681 | 695 | ||
@@ -689,9 +703,9 @@ static void node_states_check_changes_online(unsigned long nr_pages, | |||
689 | { | 703 | { |
690 | int nid = zone_to_nid(zone); | 704 | int nid = zone_to_nid(zone); |
691 | 705 | ||
692 | arg->status_change_nid = -1; | 706 | arg->status_change_nid = NUMA_NO_NODE; |
693 | arg->status_change_nid_normal = -1; | 707 | arg->status_change_nid_normal = NUMA_NO_NODE; |
694 | arg->status_change_nid_high = -1; | 708 | arg->status_change_nid_high = NUMA_NO_NODE; |
695 | 709 | ||
696 | if (!node_state(nid, N_MEMORY)) | 710 | if (!node_state(nid, N_MEMORY)) |
697 | arg->status_change_nid = nid; | 711 | arg->status_change_nid = nid; |
@@ -1365,12 +1379,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1365 | 1379 | ||
1366 | if (PageHuge(page)) { | 1380 | if (PageHuge(page)) { |
1367 | struct page *head = compound_head(page); | 1381 | struct page *head = compound_head(page); |
1368 | pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; | ||
1369 | if (compound_order(head) > PFN_SECTION_SHIFT) { | 1382 | if (compound_order(head) > PFN_SECTION_SHIFT) { |
1370 | ret = -EBUSY; | 1383 | ret = -EBUSY; |
1371 | break; | 1384 | break; |
1372 | } | 1385 | } |
1373 | isolate_huge_page(page, &source); | 1386 | pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; |
1387 | isolate_huge_page(head, &source); | ||
1374 | continue; | 1388 | continue; |
1375 | } else if (PageTransHuge(page)) | 1389 | } else if (PageTransHuge(page)) |
1376 | pfn = page_to_pfn(compound_head(page)) | 1390 | pfn = page_to_pfn(compound_head(page)) |
@@ -1496,9 +1510,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages, | |||
1496 | unsigned long present_pages = 0; | 1510 | unsigned long present_pages = 0; |
1497 | enum zone_type zt; | 1511 | enum zone_type zt; |
1498 | 1512 | ||
1499 | arg->status_change_nid = -1; | 1513 | arg->status_change_nid = NUMA_NO_NODE; |
1500 | arg->status_change_nid_normal = -1; | 1514 | arg->status_change_nid_normal = NUMA_NO_NODE; |
1501 | arg->status_change_nid_high = -1; | 1515 | arg->status_change_nid_high = NUMA_NO_NODE; |
1502 | 1516 | ||
1503 | /* | 1517 | /* |
1504 | * Check whether node_states[N_NORMAL_MEMORY] will be changed. | 1518 | * Check whether node_states[N_NORMAL_MEMORY] will be changed. |
@@ -1612,7 +1626,6 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
1612 | 1626 | ||
1613 | cond_resched(); | 1627 | cond_resched(); |
1614 | lru_add_drain_all(); | 1628 | lru_add_drain_all(); |
1615 | drain_all_pages(zone); | ||
1616 | 1629 | ||
1617 | pfn = scan_movable_pages(pfn, end_pfn); | 1630 | pfn = scan_movable_pages(pfn, end_pfn); |
1618 | if (pfn) { | 1631 | if (pfn) { |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ee2bce59d2bf..af171ccb56a2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -350,7 +350,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) | |||
350 | { | 350 | { |
351 | if (!pol) | 351 | if (!pol) |
352 | return; | 352 | return; |
353 | if (!mpol_store_user_nodemask(pol) && | 353 | if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) && |
354 | nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) | 354 | nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) |
355 | return; | 355 | return; |
356 | 356 | ||
@@ -2304,7 +2304,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long | |||
2304 | unsigned long pgoff; | 2304 | unsigned long pgoff; |
2305 | int thiscpu = raw_smp_processor_id(); | 2305 | int thiscpu = raw_smp_processor_id(); |
2306 | int thisnid = cpu_to_node(thiscpu); | 2306 | int thisnid = cpu_to_node(thiscpu); |
2307 | int polnid = -1; | 2307 | int polnid = NUMA_NO_NODE; |
2308 | int ret = -1; | 2308 | int ret = -1; |
2309 | 2309 | ||
2310 | pol = get_vma_policy(vma, addr); | 2310 | pol = get_vma_policy(vma, addr); |
diff --git a/mm/mempool.c b/mm/mempool.c index 0ef8cc8d1602..85efab3da720 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
@@ -222,6 +222,8 @@ EXPORT_SYMBOL(mempool_init_node); | |||
222 | * | 222 | * |
223 | * Like mempool_create(), but initializes the pool in (i.e. embedded in another | 223 | * Like mempool_create(), but initializes the pool in (i.e. embedded in another |
224 | * structure). | 224 | * structure). |
225 | * | ||
226 | * Return: %0 on success, negative error code otherwise. | ||
225 | */ | 227 | */ |
226 | int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, | 228 | int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, |
227 | mempool_free_t *free_fn, void *pool_data) | 229 | mempool_free_t *free_fn, void *pool_data) |
@@ -245,6 +247,8 @@ EXPORT_SYMBOL(mempool_init); | |||
245 | * functions. This function might sleep. Both the alloc_fn() and the free_fn() | 247 | * functions. This function might sleep. Both the alloc_fn() and the free_fn() |
246 | * functions might sleep - as long as the mempool_alloc() function is not called | 248 | * functions might sleep - as long as the mempool_alloc() function is not called |
247 | * from IRQ contexts. | 249 | * from IRQ contexts. |
250 | * | ||
251 | * Return: pointer to the created memory pool object or %NULL on error. | ||
248 | */ | 252 | */ |
249 | mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, | 253 | mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, |
250 | mempool_free_t *free_fn, void *pool_data) | 254 | mempool_free_t *free_fn, void *pool_data) |
@@ -289,6 +293,8 @@ EXPORT_SYMBOL(mempool_create_node); | |||
289 | * Note, the caller must guarantee that no mempool_destroy is called | 293 | * Note, the caller must guarantee that no mempool_destroy is called |
290 | * while this function is running. mempool_alloc() & mempool_free() | 294 | * while this function is running. mempool_alloc() & mempool_free() |
291 | * might be called (eg. from IRQ contexts) while this function executes. | 295 | * might be called (eg. from IRQ contexts) while this function executes. |
296 | * | ||
297 | * Return: %0 on success, negative error code otherwise. | ||
292 | */ | 298 | */ |
293 | int mempool_resize(mempool_t *pool, int new_min_nr) | 299 | int mempool_resize(mempool_t *pool, int new_min_nr) |
294 | { | 300 | { |
@@ -363,6 +369,8 @@ EXPORT_SYMBOL(mempool_resize); | |||
363 | * *never* fails when called from process contexts. (it might | 369 | * *never* fails when called from process contexts. (it might |
364 | * fail if called from an IRQ context.) | 370 | * fail if called from an IRQ context.) |
365 | * Note: using __GFP_ZERO is not supported. | 371 | * Note: using __GFP_ZERO is not supported. |
372 | * | ||
373 | * Return: pointer to the allocated element or %NULL on error. | ||
366 | */ | 374 | */ |
367 | void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) | 375 | void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) |
368 | { | 376 | { |
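The Return: lines documented above match the long-standing usage pattern. A short sketch of a pool built over an existing slab cache with the stock slab helpers; my_cache and MY_MIN_RESERVE are placeholders:

	#include <linux/mempool.h>
	#include <linux/slab.h>

	#define MY_MIN_RESERVE 4	/* elements kept in reserve for low-memory use */

	static mempool_t *my_pool;

	static int my_pool_init(struct kmem_cache *my_cache)
	{
		my_pool = mempool_create(MY_MIN_RESERVE, mempool_alloc_slab,
					 mempool_free_slab, my_cache);
		return my_pool ? 0 : -ENOMEM;	/* NULL on error, as documented */
	}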
diff --git a/mm/migrate.c b/mm/migrate.c index 181f5d2718a9..ac6f4939bb59 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -100,7 +100,7 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) | |||
100 | /* | 100 | /* |
101 | * Check PageMovable before holding a PG_lock because page's owner | 101 | * Check PageMovable before holding a PG_lock because page's owner |
102 | * assumes anybody doesn't touch PG_lock of newly allocated page | 102 | * assumes anybody doesn't touch PG_lock of newly allocated page |
103 | * so unconditionally grapping the lock ruins page's owner side. | 103 | * so unconditionally grabbing the lock ruins page's owner side. |
104 | */ | 104 | */ |
105 | if (unlikely(!__PageMovable(page))) | 105 | if (unlikely(!__PageMovable(page))) |
106 | goto out_putpage; | 106 | goto out_putpage; |
@@ -374,7 +374,7 @@ unlock: | |||
374 | } | 374 | } |
375 | #endif | 375 | #endif |
376 | 376 | ||
377 | static int expected_page_refs(struct page *page) | 377 | static int expected_page_refs(struct address_space *mapping, struct page *page) |
378 | { | 378 | { |
379 | int expected_count = 1; | 379 | int expected_count = 1; |
380 | 380 | ||
@@ -384,7 +384,7 @@ static int expected_page_refs(struct page *page) | |||
384 | */ | 384 | */ |
385 | expected_count += is_device_private_page(page); | 385 | expected_count += is_device_private_page(page); |
386 | expected_count += is_device_public_page(page); | 386 | expected_count += is_device_public_page(page); |
387 | if (page_mapping(page)) | 387 | if (mapping) |
388 | expected_count += hpage_nr_pages(page) + page_has_private(page); | 388 | expected_count += hpage_nr_pages(page) + page_has_private(page); |
389 | 389 | ||
390 | return expected_count; | 390 | return expected_count; |
@@ -405,7 +405,7 @@ int migrate_page_move_mapping(struct address_space *mapping, | |||
405 | XA_STATE(xas, &mapping->i_pages, page_index(page)); | 405 | XA_STATE(xas, &mapping->i_pages, page_index(page)); |
406 | struct zone *oldzone, *newzone; | 406 | struct zone *oldzone, *newzone; |
407 | int dirty; | 407 | int dirty; |
408 | int expected_count = expected_page_refs(page) + extra_count; | 408 | int expected_count = expected_page_refs(mapping, page) + extra_count; |
409 | 409 | ||
410 | if (!mapping) { | 410 | if (!mapping) { |
411 | /* Anonymous page without mapping */ | 411 | /* Anonymous page without mapping */ |
@@ -750,7 +750,7 @@ static int __buffer_migrate_page(struct address_space *mapping, | |||
750 | return migrate_page(mapping, newpage, page, mode); | 750 | return migrate_page(mapping, newpage, page, mode); |
751 | 751 | ||
752 | /* Check whether page does not have extra refs before we do more work */ | 752 | /* Check whether page does not have extra refs before we do more work */ |
753 | expected_count = expected_page_refs(page); | 753 | expected_count = expected_page_refs(mapping, page); |
754 | if (page_count(page) != expected_count) | 754 | if (page_count(page) != expected_count) |
755 | return -EAGAIN; | 755 | return -EAGAIN; |
756 | 756 | ||
@@ -911,7 +911,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
911 | */ | 911 | */ |
912 | if (page_has_private(page) && | 912 | if (page_has_private(page) && |
913 | !try_to_release_page(page, GFP_KERNEL)) | 913 | !try_to_release_page(page, GFP_KERNEL)) |
914 | return -EAGAIN; | 914 | return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; |
915 | 915 | ||
916 | return migrate_page(mapping, newpage, page, mode); | 916 | return migrate_page(mapping, newpage, page, mode); |
917 | } | 917 | } |
@@ -1287,7 +1287,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
1287 | struct anon_vma *anon_vma = NULL; | 1287 | struct anon_vma *anon_vma = NULL; |
1288 | 1288 | ||
1289 | /* | 1289 | /* |
1290 | * Movability of hugepages depends on architectures and hugepage size. | 1290 | * Migratability of hugepages depends on architectures and their size. |
1291 | * This check is necessary because some callers of hugepage migration | 1291 | * This check is necessary because some callers of hugepage migration |
1292 | * like soft offline and memory hotremove don't walk through page | 1292 | * like soft offline and memory hotremove don't walk through page |
1293 | * tables or check whether the hugepage is pmd-based or not before | 1293 | * tables or check whether the hugepage is pmd-based or not before |
diff --git a/mm/mlock.c b/mm/mlock.c index 41cc47e28ad6..080f3b36415b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -182,7 +182,7 @@ static void __munlock_isolation_failed(struct page *page) | |||
182 | unsigned int munlock_vma_page(struct page *page) | 182 | unsigned int munlock_vma_page(struct page *page) |
183 | { | 183 | { |
184 | int nr_pages; | 184 | int nr_pages; |
185 | struct zone *zone = page_zone(page); | 185 | pg_data_t *pgdat = page_pgdat(page); |
186 | 186 | ||
187 | /* For try_to_munlock() and to serialize with page migration */ | 187 | /* For try_to_munlock() and to serialize with page migration */ |
188 | BUG_ON(!PageLocked(page)); | 188 | BUG_ON(!PageLocked(page)); |
@@ -194,7 +194,7 @@ unsigned int munlock_vma_page(struct page *page) | |||
194 | * might otherwise copy PageMlocked to part of the tail pages before | 194 | * might otherwise copy PageMlocked to part of the tail pages before |
195 | * we clear it in the head page. It also stabilizes hpage_nr_pages(). | 195 | * we clear it in the head page. It also stabilizes hpage_nr_pages(). |
196 | */ | 196 | */ |
197 | spin_lock_irq(zone_lru_lock(zone)); | 197 | spin_lock_irq(&pgdat->lru_lock); |
198 | 198 | ||
199 | if (!TestClearPageMlocked(page)) { | 199 | if (!TestClearPageMlocked(page)) { |
200 | /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ | 200 | /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ |
@@ -203,17 +203,17 @@ unsigned int munlock_vma_page(struct page *page) | |||
203 | } | 203 | } |
204 | 204 | ||
205 | nr_pages = hpage_nr_pages(page); | 205 | nr_pages = hpage_nr_pages(page); |
206 | __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); | 206 | __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); |
207 | 207 | ||
208 | if (__munlock_isolate_lru_page(page, true)) { | 208 | if (__munlock_isolate_lru_page(page, true)) { |
209 | spin_unlock_irq(zone_lru_lock(zone)); | 209 | spin_unlock_irq(&pgdat->lru_lock); |
210 | __munlock_isolated_page(page); | 210 | __munlock_isolated_page(page); |
211 | goto out; | 211 | goto out; |
212 | } | 212 | } |
213 | __munlock_isolation_failed(page); | 213 | __munlock_isolation_failed(page); |
214 | 214 | ||
215 | unlock_out: | 215 | unlock_out: |
216 | spin_unlock_irq(zone_lru_lock(zone)); | 216 | spin_unlock_irq(&pgdat->lru_lock); |
217 | 217 | ||
218 | out: | 218 | out: |
219 | return nr_pages - 1; | 219 | return nr_pages - 1; |
@@ -298,7 +298,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) | |||
298 | pagevec_init(&pvec_putback); | 298 | pagevec_init(&pvec_putback); |
299 | 299 | ||
300 | /* Phase 1: page isolation */ | 300 | /* Phase 1: page isolation */ |
301 | spin_lock_irq(zone_lru_lock(zone)); | 301 | spin_lock_irq(&zone->zone_pgdat->lru_lock); |
302 | for (i = 0; i < nr; i++) { | 302 | for (i = 0; i < nr; i++) { |
303 | struct page *page = pvec->pages[i]; | 303 | struct page *page = pvec->pages[i]; |
304 | 304 | ||
@@ -325,7 +325,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) | |||
325 | pvec->pages[i] = NULL; | 325 | pvec->pages[i] = NULL; |
326 | } | 326 | } |
327 | __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); | 327 | __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); |
328 | spin_unlock_irq(zone_lru_lock(zone)); | 328 | spin_unlock_irq(&zone->zone_pgdat->lru_lock); |
329 | 329 | ||
330 | /* Now we can release pins of pages that we are not munlocking */ | 330 | /* Now we can release pins of pages that we are not munlocking */ |
331 | pagevec_release(&pvec_putback); | 331 | pagevec_release(&pvec_putback); |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -438,7 +438,7 @@ static void vma_gap_update(struct vm_area_struct *vma) | |||
438 | { | 438 | { |
439 | /* | 439 | /* |
440 | * As it turns out, RB_DECLARE_CALLBACKS() already created a callback | 440 | * As it turns out, RB_DECLARE_CALLBACKS() already created a callback |
441 | * function that does exacltly what we want. | 441 | * function that does exactly what we want. |
442 | */ | 442 | */ |
443 | vma_gap_callbacks_propagate(&vma->vm_rb, NULL); | 443 | vma_gap_callbacks_propagate(&vma->vm_rb, NULL); |
444 | } | 444 | } |
@@ -1012,7 +1012,7 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, | |||
1012 | * VM_SOFTDIRTY should not prevent from VMA merging, if we | 1012 | * VM_SOFTDIRTY should not prevent from VMA merging, if we |
1013 | * match the flags but dirty bit -- the caller should mark | 1013 | * match the flags but dirty bit -- the caller should mark |
1014 | * merged VMA as dirty. If dirty bit won't be excluded from | 1014 | * merged VMA as dirty. If dirty bit won't be excluded from |
1015 | * comparison, we increase pressue on the memory system forcing | 1015 | * comparison, we increase pressure on the memory system forcing |
1016 | * the kernel to generate new VMAs when old one could be | 1016 | * the kernel to generate new VMAs when old one could be |
1017 | * extended instead. | 1017 | * extended instead. |
1018 | */ | 1018 | */ |
@@ -1115,7 +1115,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, | |||
1115 | * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN | 1115 | * PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN |
1116 | * might become case 1 below case 2 below case 3 below | 1116 | * might become case 1 below case 2 below case 3 below |
1117 | * | 1117 | * |
1118 | * It is important for case 8 that the the vma NNNN overlapping the | 1118 | * It is important for case 8 that the vma NNNN overlapping the |
1119 | * region AAAA is never going to extended over XXXX. Instead XXXX must | 1119 | * region AAAA is never going to extended over XXXX. Instead XXXX must |
1120 | * be extended in region AAAA and NNNN must be removed. This way in | 1120 | * be extended in region AAAA and NNNN must be removed. This way in |
1121 | * all cases where vma_merge succeeds, the moment vma_adjust drops the | 1121 | * all cases where vma_merge succeeds, the moment vma_adjust drops the |
@@ -1645,7 +1645,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) | |||
1645 | #endif /* __ARCH_WANT_SYS_OLD_MMAP */ | 1645 | #endif /* __ARCH_WANT_SYS_OLD_MMAP */ |
1646 | 1646 | ||
1647 | /* | 1647 | /* |
1648 | * Some shared mappigns will want the pages marked read-only | 1648 | * Some shared mappings will want the pages marked read-only |
1649 | * to track write events. If so, we'll downgrade vm_page_prot | 1649 | * to track write events. If so, we'll downgrade vm_page_prot |
1650 | * to the private version (using protection_map[] without the | 1650 | * to the private version (using protection_map[] without the |
1651 | * VM_SHARED bit). | 1651 | * VM_SHARED bit). |
@@ -2126,13 +2126,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
2126 | */ | 2126 | */ |
2127 | #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN | 2127 | #ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN |
2128 | unsigned long | 2128 | unsigned long |
2129 | arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | 2129 | arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, |
2130 | const unsigned long len, const unsigned long pgoff, | 2130 | unsigned long len, unsigned long pgoff, |
2131 | const unsigned long flags) | 2131 | unsigned long flags) |
2132 | { | 2132 | { |
2133 | struct vm_area_struct *vma, *prev; | 2133 | struct vm_area_struct *vma, *prev; |
2134 | struct mm_struct *mm = current->mm; | 2134 | struct mm_struct *mm = current->mm; |
2135 | unsigned long addr = addr0; | ||
2136 | struct vm_unmapped_area_info info; | 2135 | struct vm_unmapped_area_info info; |
2137 | const unsigned long mmap_end = arch_get_mmap_end(addr); | 2136 | const unsigned long mmap_end = arch_get_mmap_end(addr); |
2138 | 2137 | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c index 36cb358db170..028c724dcb1a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -110,8 +110,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
110 | continue; | 110 | continue; |
111 | } | 111 | } |
112 | 112 | ||
113 | ptent = ptep_modify_prot_start(mm, addr, pte); | 113 | oldpte = ptep_modify_prot_start(vma, addr, pte); |
114 | ptent = pte_modify(ptent, newprot); | 114 | ptent = pte_modify(oldpte, newprot); |
115 | if (preserve_write) | 115 | if (preserve_write) |
116 | ptent = pte_mk_savedwrite(ptent); | 116 | ptent = pte_mk_savedwrite(ptent); |
117 | 117 | ||
@@ -121,7 +121,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
121 | !(vma->vm_flags & VM_SOFTDIRTY))) { | 121 | !(vma->vm_flags & VM_SOFTDIRTY))) { |
122 | ptent = pte_mkwrite(ptent); | 122 | ptent = pte_mkwrite(ptent); |
123 | } | 123 | } |
124 | ptep_modify_prot_commit(mm, addr, pte, ptent); | 124 | ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); |
125 | pages++; | 125 | pages++; |
126 | } else if (IS_ENABLED(CONFIG_MIGRATION)) { | 126 | } else if (IS_ENABLED(CONFIG_MIGRATION)) { |
127 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 127 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
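Editor's note: the hunk above moves change_pte_range() to the reworked ptep_modify_prot_start()/ptep_modify_prot_commit() pair, which now takes the vma instead of the mm and hands the old pte value to the commit. A minimal sketch of the calling pattern only, not the full loop; vma, addr, pte and newprot are assumed to be set up exactly as in the function above:

    /* Illustrative fragment of the new transaction. */
    pte_t oldpte, ptent;

    oldpte = ptep_modify_prot_start(vma, addr, pte);        /* returns the old pte value   */
    ptent = pte_modify(oldpte, newprot);                    /* build the new pte contents  */
    ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); /* install; the old value lets
                                                             * architectures optimize this */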
diff --git a/mm/mremap.c b/mm/mremap.c index 3320616ed93f..e3edef6b7a12 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -516,6 +516,23 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, | |||
516 | if (addr + old_len > new_addr && new_addr + new_len > addr) | 516 | if (addr + old_len > new_addr && new_addr + new_len > addr) |
517 | goto out; | 517 | goto out; |
518 | 518 | ||
519 | /* | ||
520 | * move_vma() needs us to stay 4 maps below the threshold, otherwise | ||
521 | * it will bail out at the very beginning. | ||
522 | * That is a problem if we have already unmapped the regions here | ||
523 | * (new_addr and old_addr), because userspace will not know the | ||
524 | * state of the vmas after it gets -ENOMEM. | ||
525 | * So, to avoid such a scenario, we pre-compute whether the whole | ||
526 | * operation has a high chance of succeeding map-wise. | ||
527 | * The worst case is when both vmas (new_addr and old_addr) get | ||
528 | * split in 3 before unmapping them. | ||
529 | * That means 2 more maps (1 for each) on top of the ones we already hold. | ||
530 | * Check whether the current map count plus 2 still leaves us 4 maps below | ||
531 | * the threshold; otherwise return -ENOMEM here to be safe. | ||
532 | */ | ||
533 | if ((mm->map_count + 2) >= sysctl_max_map_count - 3) | ||
534 | return -ENOMEM; | ||
535 | |||
519 | ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); | 536 | ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); |
520 | if (ret) | 537 | if (ret) |
521 | goto out; | 538 | goto out; |
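Editor's note: the comment added above boils down to a small arithmetic guard. A minimal userspace model of that check; the helper name and the sample numbers are made up for illustration, only the inequality is taken from the hunk:

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Model of the guard added in mremap_to(): in the worst case the old and
     * new ranges each split a vma in three, adding two vmas, and move_vma()
     * later insists on staying 4 maps below the limit.
     */
    static bool mremap_map_count_ok(int map_count, int max_map_count)
    {
        return (map_count + 2) < max_map_count - 3;
    }

    int main(void)
    {
        int max_map_count = 65530;      /* default vm.max_map_count */

        printf("65524 maps -> %s\n", mremap_map_count_ok(65524, max_map_count) ? "ok" : "-ENOMEM");
        printf("65525 maps -> %s\n", mremap_map_count_ok(65525, max_map_count) ? "ok" : "-ENOMEM");
        return 0;
    }

With the default limit of 65530 the model prints "ok" for 65524 existing maps and "-ENOMEM" for 65525, which is where the kernel check starts refusing.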
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 26ea8636758f..3a2484884cfd 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -843,7 +843,7 @@ static bool task_will_free_mem(struct task_struct *task) | |||
843 | return ret; | 843 | return ret; |
844 | } | 844 | } |
845 | 845 | ||
846 | static void __oom_kill_process(struct task_struct *victim) | 846 | static void __oom_kill_process(struct task_struct *victim, const char *message) |
847 | { | 847 | { |
848 | struct task_struct *p; | 848 | struct task_struct *p; |
849 | struct mm_struct *mm; | 849 | struct mm_struct *mm; |
@@ -874,8 +874,9 @@ static void __oom_kill_process(struct task_struct *victim) | |||
874 | */ | 874 | */ |
875 | do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); | 875 | do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); |
876 | mark_oom_victim(victim); | 876 | mark_oom_victim(victim); |
877 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", | 877 | pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", |
878 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), | 878 | message, task_pid_nr(victim), victim->comm, |
879 | K(victim->mm->total_vm), | ||
879 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), | 880 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), |
880 | K(get_mm_counter(victim->mm, MM_FILEPAGES)), | 881 | K(get_mm_counter(victim->mm, MM_FILEPAGES)), |
881 | K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); | 882 | K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); |
@@ -926,24 +927,20 @@ static void __oom_kill_process(struct task_struct *victim) | |||
926 | * Kill provided task unless it's secured by setting | 927 | * Kill provided task unless it's secured by setting |
927 | * oom_score_adj to OOM_SCORE_ADJ_MIN. | 928 | * oom_score_adj to OOM_SCORE_ADJ_MIN. |
928 | */ | 929 | */ |
929 | static int oom_kill_memcg_member(struct task_struct *task, void *unused) | 930 | static int oom_kill_memcg_member(struct task_struct *task, void *message) |
930 | { | 931 | { |
931 | if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { | 932 | if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN && |
933 | !is_global_init(task)) { | ||
932 | get_task_struct(task); | 934 | get_task_struct(task); |
933 | __oom_kill_process(task); | 935 | __oom_kill_process(task, message); |
934 | } | 936 | } |
935 | return 0; | 937 | return 0; |
936 | } | 938 | } |
937 | 939 | ||
938 | static void oom_kill_process(struct oom_control *oc, const char *message) | 940 | static void oom_kill_process(struct oom_control *oc, const char *message) |
939 | { | 941 | { |
940 | struct task_struct *p = oc->chosen; | 942 | struct task_struct *victim = oc->chosen; |
941 | unsigned int points = oc->chosen_points; | ||
942 | struct task_struct *victim = p; | ||
943 | struct task_struct *child; | ||
944 | struct task_struct *t; | ||
945 | struct mem_cgroup *oom_group; | 943 | struct mem_cgroup *oom_group; |
946 | unsigned int victim_points = 0; | ||
947 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, | 944 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, |
948 | DEFAULT_RATELIMIT_BURST); | 945 | DEFAULT_RATELIMIT_BURST); |
949 | 946 | ||
@@ -952,57 +949,18 @@ static void oom_kill_process(struct oom_control *oc, const char *message) | |||
952 | * its children or threads, just give it access to memory reserves | 949 | * its children or threads, just give it access to memory reserves |
953 | * so it can die quickly | 950 | * so it can die quickly |
954 | */ | 951 | */ |
955 | task_lock(p); | 952 | task_lock(victim); |
956 | if (task_will_free_mem(p)) { | 953 | if (task_will_free_mem(victim)) { |
957 | mark_oom_victim(p); | 954 | mark_oom_victim(victim); |
958 | wake_oom_reaper(p); | 955 | wake_oom_reaper(victim); |
959 | task_unlock(p); | 956 | task_unlock(victim); |
960 | put_task_struct(p); | 957 | put_task_struct(victim); |
961 | return; | 958 | return; |
962 | } | 959 | } |
963 | task_unlock(p); | 960 | task_unlock(victim); |
964 | 961 | ||
965 | if (__ratelimit(&oom_rs)) | 962 | if (__ratelimit(&oom_rs)) |
966 | dump_header(oc, p); | 963 | dump_header(oc, victim); |
967 | |||
968 | pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", | ||
969 | message, task_pid_nr(p), p->comm, points); | ||
970 | |||
971 | /* | ||
972 | * If any of p's children has a different mm and is eligible for kill, | ||
973 | * the one with the highest oom_badness() score is sacrificed for its | ||
974 | * parent. This attempts to lose the minimal amount of work done while | ||
975 | * still freeing memory. | ||
976 | */ | ||
977 | read_lock(&tasklist_lock); | ||
978 | |||
979 | /* | ||
980 | * The task 'p' might have already exited before reaching here. The | ||
981 | * put_task_struct() will free task_struct 'p' while the loop still try | ||
982 | * to access the field of 'p', so, get an extra reference. | ||
983 | */ | ||
984 | get_task_struct(p); | ||
985 | for_each_thread(p, t) { | ||
986 | list_for_each_entry(child, &t->children, sibling) { | ||
987 | unsigned int child_points; | ||
988 | |||
989 | if (process_shares_mm(child, p->mm)) | ||
990 | continue; | ||
991 | /* | ||
992 | * oom_badness() returns 0 if the thread is unkillable | ||
993 | */ | ||
994 | child_points = oom_badness(child, | ||
995 | oc->memcg, oc->nodemask, oc->totalpages); | ||
996 | if (child_points > victim_points) { | ||
997 | put_task_struct(victim); | ||
998 | victim = child; | ||
999 | victim_points = child_points; | ||
1000 | get_task_struct(victim); | ||
1001 | } | ||
1002 | } | ||
1003 | } | ||
1004 | put_task_struct(p); | ||
1005 | read_unlock(&tasklist_lock); | ||
1006 | 964 | ||
1007 | /* | 965 | /* |
1008 | * Do we need to kill the entire memory cgroup? | 966 | * Do we need to kill the entire memory cgroup? |
@@ -1011,14 +969,15 @@ static void oom_kill_process(struct oom_control *oc, const char *message) | |||
1011 | */ | 969 | */ |
1012 | oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); | 970 | oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); |
1013 | 971 | ||
1014 | __oom_kill_process(victim); | 972 | __oom_kill_process(victim, message); |
1015 | 973 | ||
1016 | /* | 974 | /* |
1017 | * If necessary, kill all tasks in the selected memory cgroup. | 975 | * If necessary, kill all tasks in the selected memory cgroup. |
1018 | */ | 976 | */ |
1019 | if (oom_group) { | 977 | if (oom_group) { |
1020 | mem_cgroup_print_oom_group(oom_group); | 978 | mem_cgroup_print_oom_group(oom_group); |
1021 | mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL); | 979 | mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, |
980 | (void*)message); | ||
1022 | mem_cgroup_put(oom_group); | 981 | mem_cgroup_put(oom_group); |
1023 | } | 982 | } |
1024 | } | 983 | } |
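Editor's note: oom_kill_memcg_member() now receives the killer's banner through the opaque void * context of mem_cgroup_scan_tasks() and casts it back before printing, instead of each call site logging its own line. A self-contained userspace model of that callback-with-context pattern; the task structure and scanner here are simplified stand-ins, not the kernel API:

    #include <stdio.h>

    struct task { const char *comm; int pid; };

    /* Simplified stand-in for mem_cgroup_scan_tasks(): hand each task plus the
     * caller-supplied context pointer to the callback, stop on non-zero. */
    static void scan_tasks(struct task *tasks, int n,
                           int (*fn)(struct task *, void *), void *arg)
    {
        for (int i = 0; i < n; i++)
            if (fn(&tasks[i], arg))
                break;
    }

    /* Counterpart of oom_kill_memcg_member(): recover the message string. */
    static int kill_member(struct task *t, void *message)
    {
        printf("%s: Killed process %d (%s)\n",
               (const char *)message, t->pid, t->comm);
        return 0;
    }

    int main(void)
    {
        struct task tasks[] = { { "worker", 101 }, { "helper", 102 } };

        scan_tasks(tasks, 2, kill_member, (void *)"Memory cgroup out of memory");
        return 0;
    }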
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7d1010453fb9..9f61dfec6a1f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -270,7 +270,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, | |||
270 | * node_dirtyable_memory - number of dirtyable pages in a node | 270 | * node_dirtyable_memory - number of dirtyable pages in a node |
271 | * @pgdat: the node | 271 | * @pgdat: the node |
272 | * | 272 | * |
273 | * Returns the node's number of pages potentially available for dirty | 273 | * Return: the node's number of pages potentially available for dirty |
274 | * page cache. This is the base value for the per-node dirty limits. | 274 | * page cache. This is the base value for the per-node dirty limits. |
275 | */ | 275 | */ |
276 | static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) | 276 | static unsigned long node_dirtyable_memory(struct pglist_data *pgdat) |
@@ -355,7 +355,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
355 | /** | 355 | /** |
356 | * global_dirtyable_memory - number of globally dirtyable pages | 356 | * global_dirtyable_memory - number of globally dirtyable pages |
357 | * | 357 | * |
358 | * Returns the global number of pages potentially available for dirty | 358 | * Return: the global number of pages potentially available for dirty |
359 | * page cache. This is the base value for the global dirty limits. | 359 | * page cache. This is the base value for the global dirty limits. |
360 | */ | 360 | */ |
361 | static unsigned long global_dirtyable_memory(void) | 361 | static unsigned long global_dirtyable_memory(void) |
@@ -470,7 +470,7 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) | |||
470 | * node_dirty_limit - maximum number of dirty pages allowed in a node | 470 | * node_dirty_limit - maximum number of dirty pages allowed in a node |
471 | * @pgdat: the node | 471 | * @pgdat: the node |
472 | * | 472 | * |
473 | * Returns the maximum number of dirty pages allowed in a node, based | 473 | * Return: the maximum number of dirty pages allowed in a node, based |
474 | * on the node's dirtyable memory. | 474 | * on the node's dirtyable memory. |
475 | */ | 475 | */ |
476 | static unsigned long node_dirty_limit(struct pglist_data *pgdat) | 476 | static unsigned long node_dirty_limit(struct pglist_data *pgdat) |
@@ -495,7 +495,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) | |||
495 | * node_dirty_ok - tells whether a node is within its dirty limits | 495 | * node_dirty_ok - tells whether a node is within its dirty limits |
496 | * @pgdat: the node to check | 496 | * @pgdat: the node to check |
497 | * | 497 | * |
498 | * Returns %true when the dirty pages in @pgdat are within the node's | 498 | * Return: %true when the dirty pages in @pgdat are within the node's |
499 | * dirty limit, %false if the limit is exceeded. | 499 | * dirty limit, %false if the limit is exceeded. |
500 | */ | 500 | */ |
501 | bool node_dirty_ok(struct pglist_data *pgdat) | 501 | bool node_dirty_ok(struct pglist_data *pgdat) |
@@ -743,9 +743,6 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, | |||
743 | * __wb_calc_thresh - @wb's share of dirty throttling threshold | 743 | * __wb_calc_thresh - @wb's share of dirty throttling threshold |
744 | * @dtc: dirty_throttle_context of interest | 744 | * @dtc: dirty_throttle_context of interest |
745 | * | 745 | * |
746 | * Returns @wb's dirty limit in pages. The term "dirty" in the context of | ||
747 | * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. | ||
748 | * | ||
749 | * Note that balance_dirty_pages() will only seriously take it as a hard limit | 746 | * Note that balance_dirty_pages() will only seriously take it as a hard limit |
750 | * when sleeping max_pause per page is not enough to keep the dirty pages under | 747 | * when sleeping max_pause per page is not enough to keep the dirty pages under |
751 | * control. For example, when the device is completely stalled due to some error | 748 | * control. For example, when the device is completely stalled due to some error |
@@ -759,6 +756,9 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, | |||
759 | * | 756 | * |
760 | * The wb's share of dirty limit will be adapting to its throughput and | 757 | * The wb's share of dirty limit will be adapting to its throughput and |
761 | * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. | 758 | * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. |
759 | * | ||
760 | * Return: @wb's dirty limit in pages. The term "dirty" in the context of | ||
761 | * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. | ||
762 | */ | 762 | */ |
763 | static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) | 763 | static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) |
764 | { | 764 | { |
@@ -1918,7 +1918,9 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited); | |||
1918 | * @wb: bdi_writeback of interest | 1918 | * @wb: bdi_writeback of interest |
1919 | * | 1919 | * |
1920 | * Determines whether background writeback should keep writing @wb or it's | 1920 | * Determines whether background writeback should keep writing @wb or it's |
1921 | * clean enough. Returns %true if writeback should continue. | 1921 | * clean enough. |
1922 | * | ||
1923 | * Return: %true if writeback should continue. | ||
1922 | */ | 1924 | */ |
1923 | bool wb_over_bg_thresh(struct bdi_writeback *wb) | 1925 | bool wb_over_bg_thresh(struct bdi_writeback *wb) |
1924 | { | 1926 | { |
@@ -2147,6 +2149,8 @@ EXPORT_SYMBOL(tag_pages_for_writeback); | |||
2147 | * lock/page writeback access order inversion - we should only ever lock | 2149 | * lock/page writeback access order inversion - we should only ever lock |
2148 | * multiple pages in ascending page->index order, and looping back to the start | 2150 | * multiple pages in ascending page->index order, and looping back to the start |
2149 | * of the file violates that rule and causes deadlocks. | 2151 | * of the file violates that rule and causes deadlocks. |
2152 | * | ||
2153 | * Return: %0 on success, negative error code otherwise | ||
2150 | */ | 2154 | */ |
2151 | int write_cache_pages(struct address_space *mapping, | 2155 | int write_cache_pages(struct address_space *mapping, |
2152 | struct writeback_control *wbc, writepage_t writepage, | 2156 | struct writeback_control *wbc, writepage_t writepage, |
@@ -2305,6 +2309,8 @@ static int __writepage(struct page *page, struct writeback_control *wbc, | |||
2305 | * | 2309 | * |
2306 | * This is a library function, which implements the writepages() | 2310 | * This is a library function, which implements the writepages() |
2307 | * address_space_operation. | 2311 | * address_space_operation. |
2312 | * | ||
2313 | * Return: %0 on success, negative error code otherwise | ||
2308 | */ | 2314 | */ |
2309 | int generic_writepages(struct address_space *mapping, | 2315 | int generic_writepages(struct address_space *mapping, |
2310 | struct writeback_control *wbc) | 2316 | struct writeback_control *wbc) |
@@ -2351,6 +2357,8 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) | |||
2351 | * | 2357 | * |
2352 | * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this | 2358 | * Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this |
2353 | * function returns. | 2359 | * function returns. |
2360 | * | ||
2361 | * Return: %0 on success, negative error code otherwise | ||
2354 | */ | 2362 | */ |
2355 | int write_one_page(struct page *page) | 2363 | int write_one_page(struct page *page) |
2356 | { | 2364 | { |
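Editor's note: most of the page-writeback.c changes above are documentation-only: free-form "Returns ..." prose becomes a structured "Return:" section, which kernel-doc renders as its own section and which this series places at the end of the comment (as the __wb_calc_thresh hunk does). A sketch of the resulting layout on a made-up helper:

    /**
     * example_count_free - count free widgets in a pool
     * @pool_id: index of the pool to inspect
     *
     * Free-form description of what the helper does and when to call it
     * goes here, before the structured sections.
     *
     * Return: the number of free widgets, or a negative error code.
     */
    long example_count_free(int pool_id);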
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0b9f577b1a2a..3eb01dedfb50 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -289,8 +289,8 @@ EXPORT_SYMBOL(movable_zone); | |||
289 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 289 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
290 | 290 | ||
291 | #if MAX_NUMNODES > 1 | 291 | #if MAX_NUMNODES > 1 |
292 | int nr_node_ids __read_mostly = MAX_NUMNODES; | 292 | unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; |
293 | int nr_online_nodes __read_mostly = 1; | 293 | unsigned int nr_online_nodes __read_mostly = 1; |
294 | EXPORT_SYMBOL(nr_node_ids); | 294 | EXPORT_SYMBOL(nr_node_ids); |
295 | EXPORT_SYMBOL(nr_online_nodes); | 295 | EXPORT_SYMBOL(nr_online_nodes); |
296 | #endif | 296 | #endif |
@@ -789,6 +789,57 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
789 | return 0; | 789 | return 0; |
790 | } | 790 | } |
791 | 791 | ||
792 | #ifdef CONFIG_COMPACTION | ||
793 | static inline struct capture_control *task_capc(struct zone *zone) | ||
794 | { | ||
795 | struct capture_control *capc = current->capture_control; | ||
796 | |||
797 | return capc && | ||
798 | !(current->flags & PF_KTHREAD) && | ||
799 | !capc->page && | ||
800 | capc->cc->zone == zone && | ||
801 | capc->cc->direct_compaction ? capc : NULL; | ||
802 | } | ||
803 | |||
804 | static inline bool | ||
805 | compaction_capture(struct capture_control *capc, struct page *page, | ||
806 | int order, int migratetype) | ||
807 | { | ||
808 | if (!capc || order != capc->cc->order) | ||
809 | return false; | ||
810 | |||
811 | /* Do not accidentally pollute CMA or isolated regions */ | ||
812 | if (is_migrate_cma(migratetype) || | ||
813 | is_migrate_isolate(migratetype)) | ||
814 | return false; | ||
815 | |||
816 | /* | ||
817 | * Do not let lower order allocations pollute a movable pageblock. | ||
818 | * This might let an unmovable request use a reclaimable pageblock | ||
819 | * and vice-versa but no more than normal fallback logic which can | ||
820 | * have trouble finding a high-order free page. | ||
821 | */ | ||
822 | if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) | ||
823 | return false; | ||
824 | |||
825 | capc->page = page; | ||
826 | return true; | ||
827 | } | ||
828 | |||
829 | #else | ||
830 | static inline struct capture_control *task_capc(struct zone *zone) | ||
831 | { | ||
832 | return NULL; | ||
833 | } | ||
834 | |||
835 | static inline bool | ||
836 | compaction_capture(struct capture_control *capc, struct page *page, | ||
837 | int order, int migratetype) | ||
838 | { | ||
839 | return false; | ||
840 | } | ||
841 | #endif /* CONFIG_COMPACTION */ | ||
842 | |||
792 | /* | 843 | /* |
793 | * Freeing function for a buddy system allocator. | 844 | * Freeing function for a buddy system allocator. |
794 | * | 845 | * |
@@ -822,6 +873,7 @@ static inline void __free_one_page(struct page *page, | |||
822 | unsigned long uninitialized_var(buddy_pfn); | 873 | unsigned long uninitialized_var(buddy_pfn); |
823 | struct page *buddy; | 874 | struct page *buddy; |
824 | unsigned int max_order; | 875 | unsigned int max_order; |
876 | struct capture_control *capc = task_capc(zone); | ||
825 | 877 | ||
826 | max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); | 878 | max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); |
827 | 879 | ||
@@ -837,6 +889,11 @@ static inline void __free_one_page(struct page *page, | |||
837 | 889 | ||
838 | continue_merging: | 890 | continue_merging: |
839 | while (order < max_order - 1) { | 891 | while (order < max_order - 1) { |
892 | if (compaction_capture(capc, page, order, migratetype)) { | ||
893 | __mod_zone_freepage_state(zone, -(1 << order), | ||
894 | migratetype); | ||
895 | return; | ||
896 | } | ||
840 | buddy_pfn = __find_buddy_pfn(pfn, order); | 897 | buddy_pfn = __find_buddy_pfn(pfn, order); |
841 | buddy = page + (buddy_pfn - pfn); | 898 | buddy = page + (buddy_pfn - pfn); |
842 | 899 | ||
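Editor's note: the two hunks above add the compaction capture path: while a task is direct-compacting for a given order, __free_one_page() may hand a page that reaches that order during merging straight to it through current->capture_control, rather than returning it to the free lists where a parallel allocation could win the race. A small userspace model of the capture decision only; the !capc->page test from task_capc() is folded in, and the types and pageblock order are simplified assumptions:

    #include <stdbool.h>
    #include <stdio.h>

    enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE,
                       MIGRATE_CMA, MIGRATE_ISOLATE };

    struct capture_control {        /* stand-in for the kernel structure */
        int order;                  /* order the compacting task wants   */
        void *page;                 /* captured page, NULL until set     */
    };

    #define PAGEBLOCK_ORDER 9       /* typical value, e.g. x86-64        */

    /* Mirrors the checks in compaction_capture(): take the page only if it is
     * exactly the wanted order, not CMA/isolated, and not a small movable
     * chunk that would pollute a movable pageblock. */
    static bool try_capture(struct capture_control *capc, void *page,
                            int order, enum migratetype mt)
    {
        if (!capc || capc->page || order != capc->order)
            return false;
        if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
            return false;
        if (order < PAGEBLOCK_ORDER && mt == MIGRATE_MOVABLE)
            return false;
        capc->page = page;
        return true;
    }

    int main(void)
    {
        struct capture_control capc = { .order = 4, .page = NULL };
        int dummy;

        printf("order 4, unmovable: %d\n", try_capture(&capc, &dummy, 4, MIGRATE_UNMOVABLE));
        printf("order 4, movable:   %d\n", try_capture(&capc, &dummy, 4, MIGRATE_MOVABLE));
        return 0;
    }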
@@ -1056,7 +1113,7 @@ static __always_inline bool free_pages_prepare(struct page *page, | |||
1056 | if (PageMappingFlags(page)) | 1113 | if (PageMappingFlags(page)) |
1057 | page->mapping = NULL; | 1114 | page->mapping = NULL; |
1058 | if (memcg_kmem_enabled() && PageKmemcg(page)) | 1115 | if (memcg_kmem_enabled() && PageKmemcg(page)) |
1059 | memcg_kmem_uncharge(page, order); | 1116 | __memcg_kmem_uncharge(page, order); |
1060 | if (check_free) | 1117 | if (check_free) |
1061 | bad += free_pages_check(page); | 1118 | bad += free_pages_check(page); |
1062 | if (bad) | 1119 | if (bad) |
@@ -1303,7 +1360,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
1303 | local_irq_restore(flags); | 1360 | local_irq_restore(flags); |
1304 | } | 1361 | } |
1305 | 1362 | ||
1306 | static void __init __free_pages_boot_core(struct page *page, unsigned int order) | 1363 | void __free_pages_core(struct page *page, unsigned int order) |
1307 | { | 1364 | { |
1308 | unsigned int nr_pages = 1 << order; | 1365 | unsigned int nr_pages = 1 << order; |
1309 | struct page *p = page; | 1366 | struct page *p = page; |
@@ -1382,7 +1439,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, | |||
1382 | { | 1439 | { |
1383 | if (early_page_uninitialised(pfn)) | 1440 | if (early_page_uninitialised(pfn)) |
1384 | return; | 1441 | return; |
1385 | return __free_pages_boot_core(page, order); | 1442 | __free_pages_core(page, order); |
1386 | } | 1443 | } |
1387 | 1444 | ||
1388 | /* | 1445 | /* |
@@ -1472,14 +1529,14 @@ static void __init deferred_free_range(unsigned long pfn, | |||
1472 | if (nr_pages == pageblock_nr_pages && | 1529 | if (nr_pages == pageblock_nr_pages && |
1473 | (pfn & (pageblock_nr_pages - 1)) == 0) { | 1530 | (pfn & (pageblock_nr_pages - 1)) == 0) { |
1474 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 1531 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
1475 | __free_pages_boot_core(page, pageblock_order); | 1532 | __free_pages_core(page, pageblock_order); |
1476 | return; | 1533 | return; |
1477 | } | 1534 | } |
1478 | 1535 | ||
1479 | for (i = 0; i < nr_pages; i++, page++, pfn++) { | 1536 | for (i = 0; i < nr_pages; i++, page++, pfn++) { |
1480 | if ((pfn & (pageblock_nr_pages - 1)) == 0) | 1537 | if ((pfn & (pageblock_nr_pages - 1)) == 0) |
1481 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 1538 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
1482 | __free_pages_boot_core(page, 0); | 1539 | __free_pages_core(page, 0); |
1483 | } | 1540 | } |
1484 | } | 1541 | } |
1485 | 1542 | ||
@@ -1945,8 +2002,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, | |||
1945 | 2002 | ||
1946 | arch_alloc_page(page, order); | 2003 | arch_alloc_page(page, order); |
1947 | kernel_map_pages(page, 1 << order, 1); | 2004 | kernel_map_pages(page, 1 << order, 1); |
1948 | kernel_poison_pages(page, 1 << order, 1); | ||
1949 | kasan_alloc_pages(page, order); | 2005 | kasan_alloc_pages(page, order); |
2006 | kernel_poison_pages(page, 1 << order, 1); | ||
1950 | set_page_owner(page, order, gfp_flags); | 2007 | set_page_owner(page, order, gfp_flags); |
1951 | } | 2008 | } |
1952 | 2009 | ||
@@ -2962,7 +3019,7 @@ int __isolate_free_page(struct page *page, unsigned int order) | |||
2962 | * watermark, because we already know our high-order page | 3019 | * watermark, because we already know our high-order page |
2963 | * exists. | 3020 | * exists. |
2964 | */ | 3021 | */ |
2965 | watermark = min_wmark_pages(zone) + (1UL << order); | 3022 | watermark = zone->_watermark[WMARK_MIN] + (1UL << order); |
2966 | if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) | 3023 | if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) |
2967 | return 0; | 3024 | return 0; |
2968 | 3025 | ||
@@ -3173,24 +3230,14 @@ static int __init fail_page_alloc_debugfs(void) | |||
3173 | 3230 | ||
3174 | dir = fault_create_debugfs_attr("fail_page_alloc", NULL, | 3231 | dir = fault_create_debugfs_attr("fail_page_alloc", NULL, |
3175 | &fail_page_alloc.attr); | 3232 | &fail_page_alloc.attr); |
3176 | if (IS_ERR(dir)) | ||
3177 | return PTR_ERR(dir); | ||
3178 | |||
3179 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, | ||
3180 | &fail_page_alloc.ignore_gfp_reclaim)) | ||
3181 | goto fail; | ||
3182 | if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, | ||
3183 | &fail_page_alloc.ignore_gfp_highmem)) | ||
3184 | goto fail; | ||
3185 | if (!debugfs_create_u32("min-order", mode, dir, | ||
3186 | &fail_page_alloc.min_order)) | ||
3187 | goto fail; | ||
3188 | 3233 | ||
3189 | return 0; | 3234 | debugfs_create_bool("ignore-gfp-wait", mode, dir, |
3190 | fail: | 3235 | &fail_page_alloc.ignore_gfp_reclaim); |
3191 | debugfs_remove_recursive(dir); | 3236 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, |
3237 | &fail_page_alloc.ignore_gfp_highmem); | ||
3238 | debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order); | ||
3192 | 3239 | ||
3193 | return -ENOMEM; | 3240 | return 0; |
3194 | } | 3241 | } |
3195 | 3242 | ||
3196 | late_initcall(fail_page_alloc_debugfs); | 3243 | late_initcall(fail_page_alloc_debugfs); |
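Editor's note: this hunk follows the wider debugfs cleanup in which creation failures are no longer treated as fatal, so the return values are simply ignored and the error unwinding goes away. A hedged sketch of the resulting style; the init function and variable names are hypothetical, only the debugfs_create_bool()/debugfs_create_u32() calls mirror the hunk:

    #include <linux/debugfs.h>
    #include <linux/init.h>

    static bool example_ignore_gfp_wait;
    static u32 example_min_order;

    /* Hypothetical init function in the new style: create the directory and
     * its attributes, ignore the returned dentries, never unwind on error. */
    static int __init example_debugfs_init(void)
    {
        struct dentry *dir = debugfs_create_dir("example_fault", NULL);

        debugfs_create_bool("ignore-gfp-wait", 0600, dir, &example_ignore_gfp_wait);
        debugfs_create_u32("min-order", 0600, dir, &example_min_order);
        return 0;
    }
    late_initcall(example_debugfs_init);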
@@ -3710,7 +3757,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
3710 | unsigned int alloc_flags, const struct alloc_context *ac, | 3757 | unsigned int alloc_flags, const struct alloc_context *ac, |
3711 | enum compact_priority prio, enum compact_result *compact_result) | 3758 | enum compact_priority prio, enum compact_result *compact_result) |
3712 | { | 3759 | { |
3713 | struct page *page; | 3760 | struct page *page = NULL; |
3714 | unsigned long pflags; | 3761 | unsigned long pflags; |
3715 | unsigned int noreclaim_flag; | 3762 | unsigned int noreclaim_flag; |
3716 | 3763 | ||
@@ -3721,13 +3768,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
3721 | noreclaim_flag = memalloc_noreclaim_save(); | 3768 | noreclaim_flag = memalloc_noreclaim_save(); |
3722 | 3769 | ||
3723 | *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, | 3770 | *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, |
3724 | prio); | 3771 | prio, &page); |
3725 | 3772 | ||
3726 | memalloc_noreclaim_restore(noreclaim_flag); | 3773 | memalloc_noreclaim_restore(noreclaim_flag); |
3727 | psi_memstall_leave(&pflags); | 3774 | psi_memstall_leave(&pflags); |
3728 | 3775 | ||
3729 | if (*compact_result <= COMPACT_INACTIVE) | 3776 | if (*compact_result <= COMPACT_INACTIVE) { |
3777 | WARN_ON_ONCE(page); | ||
3730 | return NULL; | 3778 | return NULL; |
3779 | } | ||
3731 | 3780 | ||
3732 | /* | 3781 | /* |
3733 | * At least in one zone compaction wasn't deferred or skipped, so let's | 3782 | * At least in one zone compaction wasn't deferred or skipped, so let's |
@@ -3735,7 +3784,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
3735 | */ | 3784 | */ |
3736 | count_vm_event(COMPACTSTALL); | 3785 | count_vm_event(COMPACTSTALL); |
3737 | 3786 | ||
3738 | page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); | 3787 | /* Prep a captured page if available */ |
3788 | if (page) | ||
3789 | prep_new_page(page, order, gfp_mask, alloc_flags); | ||
3790 | |||
3791 | /* Try get a page from the freelist if available */ | ||
3792 | if (!page) | ||
3793 | page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); | ||
3739 | 3794 | ||
3740 | if (page) { | 3795 | if (page) { |
3741 | struct zone *zone = page_zone(page); | 3796 | struct zone *zone = page_zone(page); |
@@ -4568,7 +4623,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, | |||
4568 | 4623 | ||
4569 | out: | 4624 | out: |
4570 | if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && | 4625 | if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page && |
4571 | unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) { | 4626 | unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) { |
4572 | __free_pages(page, order); | 4627 | __free_pages(page, order); |
4573 | page = NULL; | 4628 | page = NULL; |
4574 | } | 4629 | } |
@@ -4761,6 +4816,8 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, | |||
4761 | * This function is also limited by MAX_ORDER. | 4816 | * This function is also limited by MAX_ORDER. |
4762 | * | 4817 | * |
4763 | * Memory allocated by this function must be released by free_pages_exact(). | 4818 | * Memory allocated by this function must be released by free_pages_exact(). |
4819 | * | ||
4820 | * Return: pointer to the allocated area or %NULL in case of error. | ||
4764 | */ | 4821 | */ |
4765 | void *alloc_pages_exact(size_t size, gfp_t gfp_mask) | 4822 | void *alloc_pages_exact(size_t size, gfp_t gfp_mask) |
4766 | { | 4823 | { |
@@ -4781,6 +4838,8 @@ EXPORT_SYMBOL(alloc_pages_exact); | |||
4781 | * | 4838 | * |
4782 | * Like alloc_pages_exact(), but try to allocate on node nid first before falling | 4839 | * Like alloc_pages_exact(), but try to allocate on node nid first before falling |
4783 | * back. | 4840 | * back. |
4841 | * | ||
4842 | * Return: pointer to the allocated area or %NULL in case of error. | ||
4784 | */ | 4843 | */ |
4785 | void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) | 4844 | void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) |
4786 | { | 4845 | { |
@@ -4814,11 +4873,13 @@ EXPORT_SYMBOL(free_pages_exact); | |||
4814 | * nr_free_zone_pages - count number of pages beyond high watermark | 4873 | * nr_free_zone_pages - count number of pages beyond high watermark |
4815 | * @offset: The zone index of the highest zone | 4874 | * @offset: The zone index of the highest zone |
4816 | * | 4875 | * |
4817 | * nr_free_zone_pages() counts the number of counts pages which are beyond the | 4876 | * nr_free_zone_pages() counts the number of pages which are beyond the |
4818 | * high watermark within all zones at or below a given zone index. For each | 4877 | * high watermark within all zones at or below a given zone index. For each |
4819 | * zone, the number of pages is calculated as: | 4878 | * zone, the number of pages is calculated as: |
4820 | * | 4879 | * |
4821 | * nr_free_zone_pages = managed_pages - high_pages | 4880 | * nr_free_zone_pages = managed_pages - high_pages |
4881 | * | ||
4882 | * Return: number of pages beyond high watermark. | ||
4822 | */ | 4883 | */ |
4823 | static unsigned long nr_free_zone_pages(int offset) | 4884 | static unsigned long nr_free_zone_pages(int offset) |
4824 | { | 4885 | { |
@@ -4845,6 +4906,9 @@ static unsigned long nr_free_zone_pages(int offset) | |||
4845 | * | 4906 | * |
4846 | * nr_free_buffer_pages() counts the number of pages which are beyond the high | 4907 | * nr_free_buffer_pages() counts the number of pages which are beyond the high |
4847 | * watermark within ZONE_DMA and ZONE_NORMAL. | 4908 | * watermark within ZONE_DMA and ZONE_NORMAL. |
4909 | * | ||
4910 | * Return: number of pages beyond high watermark within ZONE_DMA and | ||
4911 | * ZONE_NORMAL. | ||
4848 | */ | 4912 | */ |
4849 | unsigned long nr_free_buffer_pages(void) | 4913 | unsigned long nr_free_buffer_pages(void) |
4850 | { | 4914 | { |
@@ -4857,6 +4921,8 @@ EXPORT_SYMBOL_GPL(nr_free_buffer_pages); | |||
4857 | * | 4921 | * |
4858 | * nr_free_pagecache_pages() counts the number of pages which are beyond the | 4922 | * nr_free_pagecache_pages() counts the number of pages which are beyond the |
4859 | * high watermark within all zones. | 4923 | * high watermark within all zones. |
4924 | * | ||
4925 | * Return: number of pages beyond high watermark within all zones. | ||
4860 | */ | 4926 | */ |
4861 | unsigned long nr_free_pagecache_pages(void) | 4927 | unsigned long nr_free_pagecache_pages(void) |
4862 | { | 4928 | { |
@@ -5303,7 +5369,8 @@ static int node_load[MAX_NUMNODES]; | |||
5303 | * from each node to each node in the system), and should also prefer nodes | 5369 | * from each node to each node in the system), and should also prefer nodes |
5304 | * with no CPUs, since presumably they'll have very little allocation pressure | 5370 | * with no CPUs, since presumably they'll have very little allocation pressure |
5305 | * on them otherwise. | 5371 | * on them otherwise. |
5306 | * It returns -1 if no node is found. | 5372 | * |
5373 | * Return: node id of the found node or %NUMA_NO_NODE if no node is found. | ||
5307 | */ | 5374 | */ |
5308 | static int find_next_best_node(int node, nodemask_t *used_node_mask) | 5375 | static int find_next_best_node(int node, nodemask_t *used_node_mask) |
5309 | { | 5376 | { |
@@ -5609,7 +5676,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat) | |||
5609 | else | 5676 | else |
5610 | page_group_by_mobility_disabled = 0; | 5677 | page_group_by_mobility_disabled = 0; |
5611 | 5678 | ||
5612 | pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n", | 5679 | pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", |
5613 | nr_online_nodes, | 5680 | nr_online_nodes, |
5614 | page_group_by_mobility_disabled ? "off" : "on", | 5681 | page_group_by_mobility_disabled ? "off" : "on", |
5615 | vm_total_pages); | 5682 | vm_total_pages); |
@@ -6016,7 +6083,7 @@ int __meminit __early_pfn_to_nid(unsigned long pfn, | |||
6016 | return state->last_nid; | 6083 | return state->last_nid; |
6017 | 6084 | ||
6018 | nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); | 6085 | nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); |
6019 | if (nid != -1) { | 6086 | if (nid != NUMA_NO_NODE) { |
6020 | state->last_start = start_pfn; | 6087 | state->last_start = start_pfn; |
6021 | state->last_end = end_pfn; | 6088 | state->last_end = end_pfn; |
6022 | state->last_nid = nid; | 6089 | state->last_nid = nid; |
@@ -6214,7 +6281,7 @@ unsigned long __init __absent_pages_in_range(int nid, | |||
6214 | * @start_pfn: The start PFN to start searching for holes | 6281 | * @start_pfn: The start PFN to start searching for holes |
6215 | * @end_pfn: The end PFN to stop searching for holes | 6282 | * @end_pfn: The end PFN to stop searching for holes |
6216 | * | 6283 | * |
6217 | * It returns the number of pages frames in memory holes within a range. | 6284 | * Return: the number of pages frames in memory holes within a range. |
6218 | */ | 6285 | */ |
6219 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, | 6286 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, |
6220 | unsigned long end_pfn) | 6287 | unsigned long end_pfn) |
@@ -6376,10 +6443,14 @@ static void __ref setup_usemap(struct pglist_data *pgdat, | |||
6376 | { | 6443 | { |
6377 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); | 6444 | unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); |
6378 | zone->pageblock_flags = NULL; | 6445 | zone->pageblock_flags = NULL; |
6379 | if (usemapsize) | 6446 | if (usemapsize) { |
6380 | zone->pageblock_flags = | 6447 | zone->pageblock_flags = |
6381 | memblock_alloc_node_nopanic(usemapsize, | 6448 | memblock_alloc_node_nopanic(usemapsize, |
6382 | pgdat->node_id); | 6449 | pgdat->node_id); |
6450 | if (!zone->pageblock_flags) | ||
6451 | panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", | ||
6452 | usemapsize, zone->name, pgdat->node_id); | ||
6453 | } | ||
6383 | } | 6454 | } |
6384 | #else | 6455 | #else |
6385 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, | 6456 | static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, |
@@ -6609,6 +6680,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) | |||
6609 | end = ALIGN(end, MAX_ORDER_NR_PAGES); | 6680 | end = ALIGN(end, MAX_ORDER_NR_PAGES); |
6610 | size = (end - start) * sizeof(struct page); | 6681 | size = (end - start) * sizeof(struct page); |
6611 | map = memblock_alloc_node_nopanic(size, pgdat->node_id); | 6682 | map = memblock_alloc_node_nopanic(size, pgdat->node_id); |
6683 | if (!map) | ||
6684 | panic("Failed to allocate %ld bytes for node %d memory map\n", | ||
6685 | size, pgdat->node_id); | ||
6612 | pgdat->node_mem_map = map + offset; | 6686 | pgdat->node_mem_map = map + offset; |
6613 | } | 6687 | } |
6614 | pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", | 6688 | pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", |
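Editor's note: both allocation hunks above follow the same pattern: memblock_alloc_node_nopanic() just returns NULL on failure, so callers that cannot continue without the memory add their own panic() naming exactly what could not be allocated. A hedged fragment of that pattern with hypothetical variables (size is an unsigned long, so %lu is the strictly matching specifier):

    /* Illustrative fragment: the _nopanic allocator leaves the failure policy
     * to the caller, which decides that this allocation is fatal. */
    ptr = memblock_alloc_node_nopanic(size, nid);
    if (!ptr)
        panic("Failed to allocate %lu bytes for node %d\n", size, nid);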
@@ -6764,14 +6838,14 @@ void __init setup_nr_node_ids(void) | |||
6764 | * model has fine enough granularity to avoid incorrect mapping for the | 6838 | * model has fine enough granularity to avoid incorrect mapping for the |
6765 | * populated node map. | 6839 | * populated node map. |
6766 | * | 6840 | * |
6767 | * Returns the determined alignment in pfn's. 0 if there is no alignment | 6841 | * Return: the determined alignment in pfn's. 0 if there is no alignment |
6768 | * requirement (single node). | 6842 | * requirement (single node). |
6769 | */ | 6843 | */ |
6770 | unsigned long __init node_map_pfn_alignment(void) | 6844 | unsigned long __init node_map_pfn_alignment(void) |
6771 | { | 6845 | { |
6772 | unsigned long accl_mask = 0, last_end = 0; | 6846 | unsigned long accl_mask = 0, last_end = 0; |
6773 | unsigned long start, end, mask; | 6847 | unsigned long start, end, mask; |
6774 | int last_nid = -1; | 6848 | int last_nid = NUMA_NO_NODE; |
6775 | int i, nid; | 6849 | int i, nid; |
6776 | 6850 | ||
6777 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { | 6851 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { |
@@ -6819,7 +6893,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) | |||
6819 | /** | 6893 | /** |
6820 | * find_min_pfn_with_active_regions - Find the minimum PFN registered | 6894 | * find_min_pfn_with_active_regions - Find the minimum PFN registered |
6821 | * | 6895 | * |
6822 | * It returns the minimum PFN based on information provided via | 6896 | * Return: the minimum PFN based on information provided via |
6823 | * memblock_set_node(). | 6897 | * memblock_set_node(). |
6824 | */ | 6898 | */ |
6825 | unsigned long __init find_min_pfn_with_active_regions(void) | 6899 | unsigned long __init find_min_pfn_with_active_regions(void) |
@@ -7267,7 +7341,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char | |||
7267 | 7341 | ||
7268 | return pages; | 7342 | return pages; |
7269 | } | 7343 | } |
7270 | EXPORT_SYMBOL(free_reserved_area); | ||
7271 | 7344 | ||
7272 | #ifdef CONFIG_HIGHMEM | 7345 | #ifdef CONFIG_HIGHMEM |
7273 | void free_highmem_page(struct page *page) | 7346 | void free_highmem_page(struct page *page) |
@@ -7496,7 +7569,7 @@ static void __setup_per_zone_wmarks(void) | |||
7496 | * value here. | 7569 | * value here. |
7497 | * | 7570 | * |
7498 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) | 7571 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) |
7499 | * deltas control asynch page reclaim, and so should | 7572 | * deltas control async page reclaim, and so should |
7500 | * not be capped for highmem. | 7573 | * not be capped for highmem. |
7501 | */ | 7574 | */ |
7502 | unsigned long min_pages; | 7575 | unsigned long min_pages; |
@@ -7973,7 +8046,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | |||
7973 | 8046 | ||
7974 | /* | 8047 | /* |
7975 | * Hugepages are not in LRU lists, but they're movable. | 8048 | * Hugepages are not in LRU lists, but they're movable. |
7976 | * We need not scan over tail pages bacause we don't | 8049 | * We need not scan over tail pages because we don't |
7977 | * handle each tail page individually in migration. | 8050 | * handle each tail page individually in migration. |
7978 | */ | 8051 | */ |
7979 | if (PageHuge(page)) { | 8052 | if (PageHuge(page)) { |
@@ -8112,7 +8185,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
8112 | * pageblocks in the range. Once isolated, the pageblocks should not | 8185 | * pageblocks in the range. Once isolated, the pageblocks should not |
8113 | * be modified by others. | 8186 | * be modified by others. |
8114 | * | 8187 | * |
8115 | * Returns zero on success or negative error code. On success all | 8188 | * Return: zero on success or negative error code. On success all |
8116 | * pages which PFN is in [start, end) are allocated for the caller and | 8189 | * pages which PFN is in [start, end) are allocated for the caller and |
8117 | * need to be freed with free_contig_range(). | 8190 | * need to be freed with free_contig_range(). |
8118 | */ | 8191 | */ |
@@ -8196,7 +8269,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
8196 | */ | 8269 | */ |
8197 | 8270 | ||
8198 | lru_add_drain_all(); | 8271 | lru_add_drain_all(); |
8199 | drain_all_pages(cc.zone); | ||
8200 | 8272 | ||
8201 | order = 0; | 8273 | order = 0; |
8202 | outer_start = start; | 8274 | outer_start = start; |
diff --git a/mm/page_ext.c b/mm/page_ext.c index 8c78b8d45117..ab4244920e0f 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c | |||
@@ -273,6 +273,7 @@ static void free_page_ext(void *addr) | |||
273 | table_size = get_entry_size() * PAGES_PER_SECTION; | 273 | table_size = get_entry_size() * PAGES_PER_SECTION; |
274 | 274 | ||
275 | BUG_ON(PageReserved(page)); | 275 | BUG_ON(PageReserved(page)); |
276 | kmemleak_free(addr); | ||
276 | free_pages_exact(addr, table_size); | 277 | free_pages_exact(addr, table_size); |
277 | } | 278 | } |
278 | } | 279 | } |
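Editor's note: the one added line keeps kmemleak balanced: the table's kmemleak record is dropped before free_pages_exact() returns the backing pages, so kmemleak is not left holding a stale reference. A hedged sketch of the pairing on a hypothetical table, using the same two calls:

    #include <linux/gfp.h>
    #include <linux/kmemleak.h>

    /* Hypothetical helper: stop kmemleak from tracking the object before the
     * backing pages go back, mirroring the kmemleak_free() added above. */
    static void example_free_table(void *addr, size_t table_size)
    {
        kmemleak_free(addr);                 /* drop the kmemleak record first */
        free_pages_exact(addr, table_size);  /* then free the pages            */
    }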
@@ -300,7 +301,7 @@ static int __meminit online_page_ext(unsigned long start_pfn, | |||
300 | start = SECTION_ALIGN_DOWN(start_pfn); | 301 | start = SECTION_ALIGN_DOWN(start_pfn); |
301 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); | 302 | end = SECTION_ALIGN_UP(start_pfn + nr_pages); |
302 | 303 | ||
303 | if (nid == -1) { | 304 | if (nid == NUMA_NO_NODE) { |
304 | /* | 305 | /* |
305 | * In this case, "nid" already exists and contains valid memory. | 306 | * In this case, "nid" already exists and contains valid memory. |
306 | * "start_pfn" passed to us is a pfn which is an arg for | 307 | * "start_pfn" passed to us is a pfn which is an arg for |
diff --git a/mm/page_idle.c b/mm/page_idle.c index b9e4b42b33ab..0b39ec0c945c 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c | |||
@@ -31,7 +31,7 @@ | |||
31 | static struct page *page_idle_get_page(unsigned long pfn) | 31 | static struct page *page_idle_get_page(unsigned long pfn) |
32 | { | 32 | { |
33 | struct page *page; | 33 | struct page *page; |
34 | struct zone *zone; | 34 | pg_data_t *pgdat; |
35 | 35 | ||
36 | if (!pfn_valid(pfn)) | 36 | if (!pfn_valid(pfn)) |
37 | return NULL; | 37 | return NULL; |
@@ -41,13 +41,13 @@ static struct page *page_idle_get_page(unsigned long pfn) | |||
41 | !get_page_unless_zero(page)) | 41 | !get_page_unless_zero(page)) |
42 | return NULL; | 42 | return NULL; |
43 | 43 | ||
44 | zone = page_zone(page); | 44 | pgdat = page_pgdat(page); |
45 | spin_lock_irq(zone_lru_lock(zone)); | 45 | spin_lock_irq(&pgdat->lru_lock); |
46 | if (unlikely(!PageLRU(page))) { | 46 | if (unlikely(!PageLRU(page))) { |
47 | put_page(page); | 47 | put_page(page); |
48 | page = NULL; | 48 | page = NULL; |
49 | } | 49 | } |
50 | spin_unlock_irq(zone_lru_lock(zone)); | 50 | spin_unlock_irq(&pgdat->lru_lock); |
51 | return page; | 51 | return page; |
52 | } | 52 | } |
53 | 53 | ||
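Editor's note: this is the same conversion as in the mlock.c hunks above and the lock-ordering comment hunk further down: the LRU lock now lives in the node, so callers derive it from page_pgdat() rather than zone_lru_lock(). A minimal fragment of the resulting pattern, with the surrounding page reference handling assumed:

    /* Illustrative fragment: derive and take the per-node LRU lock. */
    pg_data_t *pgdat = page_pgdat(page);

    spin_lock_irq(&pgdat->lru_lock);
    /* ... inspect or move the page on its LRU list ... */
    spin_unlock_irq(&pgdat->lru_lock);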
diff --git a/mm/page_owner.c b/mm/page_owner.c index 28b06524939f..925b6f44a444 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c | |||
@@ -625,16 +625,14 @@ static const struct file_operations proc_page_owner_operations = { | |||
625 | 625 | ||
626 | static int __init pageowner_init(void) | 626 | static int __init pageowner_init(void) |
627 | { | 627 | { |
628 | struct dentry *dentry; | ||
629 | |||
630 | if (!static_branch_unlikely(&page_owner_inited)) { | 628 | if (!static_branch_unlikely(&page_owner_inited)) { |
631 | pr_info("page_owner is disabled\n"); | 629 | pr_info("page_owner is disabled\n"); |
632 | return 0; | 630 | return 0; |
633 | } | 631 | } |
634 | 632 | ||
635 | dentry = debugfs_create_file("page_owner", 0400, NULL, | 633 | debugfs_create_file("page_owner", 0400, NULL, NULL, |
636 | NULL, &proc_page_owner_operations); | 634 | &proc_page_owner_operations); |
637 | 635 | ||
638 | return PTR_ERR_OR_ZERO(dentry); | 636 | return 0; |
639 | } | 637 | } |
640 | late_initcall(pageowner_init) | 638 | late_initcall(pageowner_init) |
diff --git a/mm/page_poison.c b/mm/page_poison.c index f0c15e9017c0..21d4f97cb49b 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c | |||
@@ -6,6 +6,7 @@ | |||
6 | #include <linux/page_ext.h> | 6 | #include <linux/page_ext.h> |
7 | #include <linux/poison.h> | 7 | #include <linux/poison.h> |
8 | #include <linux/ratelimit.h> | 8 | #include <linux/ratelimit.h> |
9 | #include <linux/kasan.h> | ||
9 | 10 | ||
10 | static bool want_page_poisoning __read_mostly; | 11 | static bool want_page_poisoning __read_mostly; |
11 | 12 | ||
@@ -40,7 +41,10 @@ static void poison_page(struct page *page) | |||
40 | { | 41 | { |
41 | void *addr = kmap_atomic(page); | 42 | void *addr = kmap_atomic(page); |
42 | 43 | ||
44 | /* KASAN still thinks the page is in use, so skip its checks. */ | ||
45 | kasan_disable_current(); | ||
43 | memset(addr, PAGE_POISON, PAGE_SIZE); | 46 | memset(addr, PAGE_POISON, PAGE_SIZE); |
47 | kasan_enable_current(); | ||
44 | kunmap_atomic(addr); | 48 | kunmap_atomic(addr); |
45 | } | 49 | } |
46 | 50 | ||
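Editor's note: the added kasan_disable_current()/kasan_enable_current() pair keeps KASAN from reporting the poison memset (per the in-line comment, KASAN still considers the page in use at this point; see also the reordered hooks in the post_alloc_hook hunk above). A hedged sketch of the same bracket on a hypothetical helper:

    #include <linux/kasan.h>
    #include <linux/string.h>

    /* Hypothetical helper: perform a deliberate write that KASAN would
     * otherwise complain about, with reporting disabled for this task. */
    static void example_scrub(void *addr, size_t len, int pattern)
    {
        kasan_disable_current();     /* suppress KASAN reports from this task */
        memset(addr, pattern, len);
        kasan_enable_current();      /* re-enable reporting */
    }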
diff --git a/mm/readahead.c b/mm/readahead.c index 1ae16522412a..a4593654a26c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -81,6 +81,8 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping, | |||
81 | * @data: private data for the callback routine. | 81 | * @data: private data for the callback routine. |
82 | * | 82 | * |
83 | * Hides the details of the LRU cache etc from the filesystems. | 83 | * Hides the details of the LRU cache etc from the filesystems. |
84 | * | ||
85 | * Return: %0 on success, or the error returned by @filler. | ||
84 | */ | 86 | */ |
85 | int read_cache_pages(struct address_space *mapping, struct list_head *pages, | 87 | int read_cache_pages(struct address_space *mapping, struct list_head *pages, |
86 | int (*filler)(void *, struct page *), void *data) | 88 | int (*filler)(void *, struct page *), void *data) |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -27,7 +27,7 @@ | |||
27 | * mapping->i_mmap_rwsem | 27 | * mapping->i_mmap_rwsem |
28 | * anon_vma->rwsem | 28 | * anon_vma->rwsem |
29 | * mm->page_table_lock or pte_lock | 29 | * mm->page_table_lock or pte_lock |
30 | * zone_lru_lock (in mark_page_accessed, isolate_lru_page) | 30 | * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page) |
31 | * swap_lock (in swap_duplicate, swap_info_get) | 31 | * swap_lock (in swap_duplicate, swap_info_get) |
32 | * mmlist_lock (in mmput, drain_mmlist and others) | 32 | * mmlist_lock (in mmput, drain_mmlist and others) |
33 | * mapping->private_lock (in __set_page_dirty_buffers) | 33 | * mapping->private_lock (in __set_page_dirty_buffers) |
diff --git a/mm/shmem.c b/mm/shmem.c index 2c012eee133d..b3db3779a30a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <linux/uio.h> | 36 | #include <linux/uio.h> |
37 | #include <linux/khugepaged.h> | 37 | #include <linux/khugepaged.h> |
38 | #include <linux/hugetlb.h> | 38 | #include <linux/hugetlb.h> |
39 | #include <linux/frontswap.h> | ||
39 | 40 | ||
40 | #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ | 41 | #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ |
41 | 42 | ||
@@ -123,6 +124,10 @@ static unsigned long shmem_default_max_inodes(void) | |||
123 | static bool shmem_should_replace_page(struct page *page, gfp_t gfp); | 124 | static bool shmem_should_replace_page(struct page *page, gfp_t gfp); |
124 | static int shmem_replace_page(struct page **pagep, gfp_t gfp, | 125 | static int shmem_replace_page(struct page **pagep, gfp_t gfp, |
125 | struct shmem_inode_info *info, pgoff_t index); | 126 | struct shmem_inode_info *info, pgoff_t index); |
127 | static int shmem_swapin_page(struct inode *inode, pgoff_t index, | ||
128 | struct page **pagep, enum sgp_type sgp, | ||
129 | gfp_t gfp, struct vm_area_struct *vma, | ||
130 | vm_fault_t *fault_type); | ||
126 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | 131 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
127 | struct page **pagep, enum sgp_type sgp, | 132 | struct page **pagep, enum sgp_type sgp, |
128 | gfp_t gfp, struct vm_area_struct *vma, | 133 | gfp_t gfp, struct vm_area_struct *vma, |
@@ -1089,159 +1094,184 @@ static void shmem_evict_inode(struct inode *inode) | |||
1089 | clear_inode(inode); | 1094 | clear_inode(inode); |
1090 | } | 1095 | } |
1091 | 1096 | ||
1092 | static unsigned long find_swap_entry(struct xarray *xa, void *item) | 1097 | extern struct swap_info_struct *swap_info[]; |
1098 | |||
1099 | static int shmem_find_swap_entries(struct address_space *mapping, | ||
1100 | pgoff_t start, unsigned int nr_entries, | ||
1101 | struct page **entries, pgoff_t *indices, | ||
1102 | bool frontswap) | ||
1093 | { | 1103 | { |
1094 | XA_STATE(xas, xa, 0); | 1104 | XA_STATE(xas, &mapping->i_pages, start); |
1095 | unsigned int checked = 0; | 1105 | struct page *page; |
1096 | void *entry; | 1106 | unsigned int ret = 0; |
1107 | |||
1108 | if (!nr_entries) | ||
1109 | return 0; | ||
1097 | 1110 | ||
1098 | rcu_read_lock(); | 1111 | rcu_read_lock(); |
1099 | xas_for_each(&xas, entry, ULONG_MAX) { | 1112 | xas_for_each(&xas, page, ULONG_MAX) { |
1100 | if (xas_retry(&xas, entry)) | 1113 | if (xas_retry(&xas, page)) |
1101 | continue; | 1114 | continue; |
1102 | if (entry == item) | 1115 | |
1103 | break; | 1116 | if (!xa_is_value(page)) |
1104 | checked++; | ||
1105 | if ((checked % XA_CHECK_SCHED) != 0) | ||
1106 | continue; | 1117 | continue; |
1107 | xas_pause(&xas); | 1118 | |
1108 | cond_resched_rcu(); | 1119 | if (frontswap) { |
1120 | swp_entry_t entry = radix_to_swp_entry(page); | ||
1121 | |||
1122 | if (!frontswap_test(swap_info[swp_type(entry)], | ||
1123 | swp_offset(entry))) | ||
1124 | continue; | ||
1125 | } | ||
1126 | |||
1127 | indices[ret] = xas.xa_index; | ||
1128 | entries[ret] = page; | ||
1129 | |||
1130 | if (need_resched()) { | ||
1131 | xas_pause(&xas); | ||
1132 | cond_resched_rcu(); | ||
1133 | } | ||
1134 | if (++ret == nr_entries) | ||
1135 | break; | ||
1109 | } | 1136 | } |
1110 | rcu_read_unlock(); | 1137 | rcu_read_unlock(); |
1111 | 1138 | ||
1112 | return entry ? xas.xa_index : -1; | 1139 | return ret; |
1113 | } | 1140 | } |
1114 | 1141 | ||
1115 | /* | 1142 | /* |
1116 | * If swap found in inode, free it and move page from swapcache to filecache. | 1143 | * Move the swapped pages for an inode to page cache. Returns the count |
1144 | * of pages swapped in, or the error in case of failure. | ||
1117 | */ | 1145 | */ |
1118 | static int shmem_unuse_inode(struct shmem_inode_info *info, | 1146 | static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec, |
1119 | swp_entry_t swap, struct page **pagep) | 1147 | pgoff_t *indices) |
1120 | { | 1148 | { |
1121 | struct address_space *mapping = info->vfs_inode.i_mapping; | 1149 | int i = 0; |
1122 | void *radswap; | 1150 | int ret = 0; |
1123 | pgoff_t index; | ||
1124 | gfp_t gfp; | ||
1125 | int error = 0; | 1151 | int error = 0; |
1152 | struct address_space *mapping = inode->i_mapping; | ||
1126 | 1153 | ||
1127 | radswap = swp_to_radix_entry(swap); | 1154 | for (i = 0; i < pvec.nr; i++) { |
1128 | index = find_swap_entry(&mapping->i_pages, radswap); | 1155 | struct page *page = pvec.pages[i]; |
1129 | if (index == -1) | ||
1130 | return -EAGAIN; /* tell shmem_unuse we found nothing */ | ||
1131 | 1156 | ||
1132 | /* | 1157 | if (!xa_is_value(page)) |
1133 | * Move _head_ to start search for next from here. | 1158 | continue; |
1134 | * But be careful: shmem_evict_inode checks list_empty without taking | 1159 | error = shmem_swapin_page(inode, indices[i], |
1135 | * mutex, and there's an instant in list_move_tail when info->swaplist | 1160 | &page, SGP_CACHE, |
1136 | * would appear empty, if it were the only one on shmem_swaplist. | 1161 | mapping_gfp_mask(mapping), |
1137 | */ | 1162 | NULL, NULL); |
1138 | if (shmem_swaplist.next != &info->swaplist) | 1163 | if (error == 0) { |
1139 | list_move_tail(&shmem_swaplist, &info->swaplist); | 1164 | unlock_page(page); |
1140 | 1165 | put_page(page); | |
1141 | gfp = mapping_gfp_mask(mapping); | 1166 | ret++; |
1142 | if (shmem_should_replace_page(*pagep, gfp)) { | 1167 | } |
1143 | mutex_unlock(&shmem_swaplist_mutex); | 1168 | if (error == -ENOMEM) |
1144 | error = shmem_replace_page(pagep, gfp, info, index); | 1169 | break; |
1145 | mutex_lock(&shmem_swaplist_mutex); | 1170 | error = 0; |
1146 | /* | ||
1147 | * We needed to drop mutex to make that restrictive page | ||
1148 | * allocation, but the inode might have been freed while we | ||
1149 | * dropped it: although a racing shmem_evict_inode() cannot | ||
1150 | * complete without emptying the page cache, our page lock | ||
1151 | * on this swapcache page is not enough to prevent that - | ||
1152 | * free_swap_and_cache() of our swap entry will only | ||
1153 | * trylock_page(), removing swap from page cache whatever. | ||
1154 | * | ||
1155 | * We must not proceed to shmem_add_to_page_cache() if the | ||
1156 | * inode has been freed, but of course we cannot rely on | ||
1157 | * inode or mapping or info to check that. However, we can | ||
1158 | * safely check if our swap entry is still in use (and here | ||
1159 | * it can't have got reused for another page): if it's still | ||
1160 | * in use, then the inode cannot have been freed yet, and we | ||
1161 | * can safely proceed (if it's no longer in use, that tells | ||
1162 | * nothing about the inode, but we don't need to unuse swap). | ||
1163 | */ | ||
1164 | if (!page_swapcount(*pagep)) | ||
1165 | error = -ENOENT; | ||
1166 | } | 1171 | } |
1172 | return error ? error : ret; | ||
1173 | } | ||
1167 | 1174 | ||
1168 | /* | 1175 | /* |
1169 | * We rely on shmem_swaplist_mutex, not only to protect the swaplist, | 1176 | * If swap found in inode, free it and move page from swapcache to filecache. |
1170 | * but also to hold up shmem_evict_inode(): so inode cannot be freed | 1177 | */ |
1171 | * beneath us (pagelock doesn't help until the page is in pagecache). | 1178 | static int shmem_unuse_inode(struct inode *inode, unsigned int type, |
1172 | */ | 1179 | bool frontswap, unsigned long *fs_pages_to_unuse) |
1173 | if (!error) | 1180 | { |
1174 | error = shmem_add_to_page_cache(*pagep, mapping, index, | 1181 | struct address_space *mapping = inode->i_mapping; |
1175 | radswap, gfp); | 1182 | pgoff_t start = 0; |
1176 | if (error != -ENOMEM) { | 1183 | struct pagevec pvec; |
1177 | /* | 1184 | pgoff_t indices[PAGEVEC_SIZE]; |
1178 | * Truncation and eviction use free_swap_and_cache(), which | 1185 | bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0); |
1179 | * only does trylock page: if we raced, best clean up here. | 1186 | int ret = 0; |
1180 | */ | 1187 | |
1181 | delete_from_swap_cache(*pagep); | 1188 | pagevec_init(&pvec); |
1182 | set_page_dirty(*pagep); | 1189 | do { |
1183 | if (!error) { | 1190 | unsigned int nr_entries = PAGEVEC_SIZE; |
1184 | spin_lock_irq(&info->lock); | 1191 | |
1185 | info->swapped--; | 1192 | if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE) |
1186 | spin_unlock_irq(&info->lock); | 1193 | nr_entries = *fs_pages_to_unuse; |
1187 | swap_free(swap); | 1194 | |
1195 | pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries, | ||
1196 | pvec.pages, indices, | ||
1197 | frontswap); | ||
1198 | if (pvec.nr == 0) { | ||
1199 | ret = 0; | ||
1200 | break; | ||
1188 | } | 1201 | } |
1189 | } | 1202 | |
1190 | return error; | 1203 | ret = shmem_unuse_swap_entries(inode, pvec, indices); |
1204 | if (ret < 0) | ||
1205 | break; | ||
1206 | |||
1207 | if (frontswap_partial) { | ||
1208 | *fs_pages_to_unuse -= ret; | ||
1209 | if (*fs_pages_to_unuse == 0) { | ||
1210 | ret = FRONTSWAP_PAGES_UNUSED; | ||
1211 | break; | ||
1212 | } | ||
1213 | } | ||
1214 | |||
1215 | start = indices[pvec.nr - 1]; | ||
1216 | } while (true); | ||
1217 | |||
1218 | return ret; | ||
1191 | } | 1219 | } |
1192 | 1220 | ||
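Editor's note: shmem_unuse_inode() above drains those entries in PAGEVEC_SIZE batches, optionally capping a batch at the remaining frontswap quota and restarting the scan from the last index it saw. A condensed skeleton of that loop (helper names are the ones from the hunk; the quota bookkeeping and the FRONTSWAP_PAGES_UNUSED early exit are elided):

    	pgoff_t start = 0;
    	pgoff_t indices[PAGEVEC_SIZE];
    	struct pagevec pvec;
    	int ret;

    	pagevec_init(&pvec);
    	do {
    		unsigned int nr_entries = PAGEVEC_SIZE;

    		if (frontswap && *fs_pages_to_unuse &&
    		    *fs_pages_to_unuse < PAGEVEC_SIZE)
    			nr_entries = *fs_pages_to_unuse;	/* stay inside the frontswap quota */

    		pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
    						  pvec.pages, indices, frontswap);
    		if (pvec.nr == 0)
    			break;					/* nothing swapped out any more */

    		ret = shmem_unuse_swap_entries(inode, pvec, indices);
    		if (ret < 0)
    			break;					/* -ENOMEM from swapin */

    		start = indices[pvec.nr - 1];			/* resume from the last index seen */
    	} while (true);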
1193 | /* | 1221 | /* |
1194 | * Search through swapped inodes to find and replace swap by page. | 1222 | * Read all the shared memory data that resides in the swap |
1223 | * device 'type' back into memory, so the swap device can be | ||
1224 | * unused. | ||
1195 | */ | 1225 | */ |
1196 | int shmem_unuse(swp_entry_t swap, struct page *page) | 1226 | int shmem_unuse(unsigned int type, bool frontswap, |
1227 | unsigned long *fs_pages_to_unuse) | ||
1197 | { | 1228 | { |
1198 | struct list_head *this, *next; | 1229 | struct shmem_inode_info *info, *next; |
1199 | struct shmem_inode_info *info; | 1230 | struct inode *inode; |
1200 | struct mem_cgroup *memcg; | 1231 | struct inode *prev_inode = NULL; |
1201 | int error = 0; | 1232 | int error = 0; |
1202 | 1233 | ||
1203 | /* | 1234 | if (list_empty(&shmem_swaplist)) |
1204 | * There's a faint possibility that swap page was replaced before | 1235 | return 0; |
1205 | * caller locked it: caller will come back later with the right page. | 1236 | |
1206 | */ | 1237 | mutex_lock(&shmem_swaplist_mutex); |
1207 | if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) | ||
1208 | goto out; | ||
1209 | 1238 | ||
1210 | /* | 1239 | /* |
1211 | * Charge page using GFP_KERNEL while we can wait, before taking | 1240 | * The extra refcount on the inode is necessary to safely dereference |
1212 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). | 1241 | * p->next after re-acquiring the lock. New shmem inodes with swap |
1213 | * Charged back to the user (not to caller) when swap account is used. | 1242 | * get added to the end of the list and we will scan them all. |
1214 | */ | 1243 | */ |
1215 | error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL, | 1244 | list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { |
1216 | &memcg, false); | 1245 | if (!info->swapped) { |
1217 | if (error) | ||
1218 | goto out; | ||
1219 | /* No memory allocation: swap entry occupies the slot for the page */ | ||
1220 | error = -EAGAIN; | ||
1221 | |||
1222 | mutex_lock(&shmem_swaplist_mutex); | ||
1223 | list_for_each_safe(this, next, &shmem_swaplist) { | ||
1224 | info = list_entry(this, struct shmem_inode_info, swaplist); | ||
1225 | if (info->swapped) | ||
1226 | error = shmem_unuse_inode(info, swap, &page); | ||
1227 | else | ||
1228 | list_del_init(&info->swaplist); | 1246 | list_del_init(&info->swaplist); |
1247 | continue; | ||
1248 | } | ||
1249 | |||
1250 | inode = igrab(&info->vfs_inode); | ||
1251 | if (!inode) | ||
1252 | continue; | ||
1253 | |||
1254 | mutex_unlock(&shmem_swaplist_mutex); | ||
1255 | if (prev_inode) | ||
1256 | iput(prev_inode); | ||
1257 | prev_inode = inode; | ||
1258 | |||
1259 | error = shmem_unuse_inode(inode, type, frontswap, | ||
1260 | fs_pages_to_unuse); | ||
1229 | cond_resched(); | 1261 | cond_resched(); |
1230 | if (error != -EAGAIN) | 1262 | |
1263 | mutex_lock(&shmem_swaplist_mutex); | ||
1264 | next = list_next_entry(info, swaplist); | ||
1265 | if (!info->swapped) | ||
1266 | list_del_init(&info->swaplist); | ||
1267 | if (error) | ||
1231 | break; | 1268 | break; |
1232 | /* found nothing in this: move on to search the next */ | ||
1233 | } | 1269 | } |
1234 | mutex_unlock(&shmem_swaplist_mutex); | 1270 | mutex_unlock(&shmem_swaplist_mutex); |
1235 | 1271 | ||
1236 | if (error) { | 1272 | if (prev_inode) |
1237 | if (error != -ENOMEM) | 1273 | iput(prev_inode); |
1238 | error = 0; | 1274 | |
1239 | mem_cgroup_cancel_charge(page, memcg, false); | ||
1240 | } else | ||
1241 | mem_cgroup_commit_charge(page, memcg, true, false); | ||
1242 | out: | ||
1243 | unlock_page(page); | ||
1244 | put_page(page); | ||
1245 | return error; | 1275 | return error; |
1246 | } | 1276 | } |
1247 | 1277 | ||
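Editor's note: shmem_unuse() above walks shmem_swaplist without holding shmem_swaplist_mutex across the heavy per-inode work. Each inode is pinned with igrab() before the mutex is dropped, and the previous inode is only released after the mutex has been dropped again, since iput() can end up in shmem_evict_inode(), which also takes this mutex. A stripped-down sketch of the pattern, with the per-inode work behind a hypothetical do_per_inode_work():

    	struct shmem_inode_info *info, *next;
    	struct inode *inode, *prev_inode = NULL;
    	int error = 0;

    	mutex_lock(&shmem_swaplist_mutex);
    	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
    		if (!info->swapped) {
    			list_del_init(&info->swaplist);
    			continue;
    		}
    		inode = igrab(&info->vfs_inode);	/* fails if the inode is going away */
    		if (!inode)
    			continue;

    		mutex_unlock(&shmem_swaplist_mutex);
    		if (prev_inode)
    			iput(prev_inode);		/* never under the mutex */
    		prev_inode = inode;

    		error = do_per_inode_work(inode);	/* stands in for shmem_unuse_inode() */
    		cond_resched();

    		mutex_lock(&shmem_swaplist_mutex);
    		next = list_next_entry(info, swaplist);	/* list may have changed while unlocked */
    		if (!info->swapped)
    			list_del_init(&info->swaplist);
    		if (error)
    			break;
    	}
    	mutex_unlock(&shmem_swaplist_mutex);

    	if (prev_inode)
    		iput(prev_inode);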
@@ -1325,7 +1355,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
1325 | */ | 1355 | */ |
1326 | mutex_lock(&shmem_swaplist_mutex); | 1356 | mutex_lock(&shmem_swaplist_mutex); |
1327 | if (list_empty(&info->swaplist)) | 1357 | if (list_empty(&info->swaplist)) |
1328 | list_add_tail(&info->swaplist, &shmem_swaplist); | 1358 | list_add(&info->swaplist, &shmem_swaplist); |
1329 | 1359 | ||
1330 | if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { | 1360 | if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { |
1331 | spin_lock_irq(&info->lock); | 1361 | spin_lock_irq(&info->lock); |
@@ -1576,6 +1606,116 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
1576 | } | 1606 | } |
1577 | 1607 | ||
1578 | /* | 1608 | /* |
1609 | * Swap in the page pointed to by *pagep. | ||
1610 | * Caller has to make sure that *pagep contains a valid swapped page. | ||
1612 | * Returns 0 and the page in *pagep on success. On failure, returns | ||
1612 | * the error code and NULL in *pagep. | ||
1613 | */ | ||
1614 | static int shmem_swapin_page(struct inode *inode, pgoff_t index, | ||
1615 | struct page **pagep, enum sgp_type sgp, | ||
1616 | gfp_t gfp, struct vm_area_struct *vma, | ||
1617 | vm_fault_t *fault_type) | ||
1618 | { | ||
1619 | struct address_space *mapping = inode->i_mapping; | ||
1620 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
1621 | struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm; | ||
1622 | struct mem_cgroup *memcg; | ||
1623 | struct page *page; | ||
1624 | swp_entry_t swap; | ||
1625 | int error; | ||
1626 | |||
1627 | VM_BUG_ON(!*pagep || !xa_is_value(*pagep)); | ||
1628 | swap = radix_to_swp_entry(*pagep); | ||
1629 | *pagep = NULL; | ||
1630 | |||
1631 | /* Look it up and read it in.. */ | ||
1632 | page = lookup_swap_cache(swap, NULL, 0); | ||
1633 | if (!page) { | ||
1634 | /* Or update major stats only when swapin succeeds?? */ | ||
1635 | if (fault_type) { | ||
1636 | *fault_type |= VM_FAULT_MAJOR; | ||
1637 | count_vm_event(PGMAJFAULT); | ||
1638 | count_memcg_event_mm(charge_mm, PGMAJFAULT); | ||
1639 | } | ||
1640 | /* Here we actually start the io */ | ||
1641 | page = shmem_swapin(swap, gfp, info, index); | ||
1642 | if (!page) { | ||
1643 | error = -ENOMEM; | ||
1644 | goto failed; | ||
1645 | } | ||
1646 | } | ||
1647 | |||
1648 | /* We have to do this with page locked to prevent races */ | ||
1649 | lock_page(page); | ||
1650 | if (!PageSwapCache(page) || page_private(page) != swap.val || | ||
1651 | !shmem_confirm_swap(mapping, index, swap)) { | ||
1652 | error = -EEXIST; | ||
1653 | goto unlock; | ||
1654 | } | ||
1655 | if (!PageUptodate(page)) { | ||
1656 | error = -EIO; | ||
1657 | goto failed; | ||
1658 | } | ||
1659 | wait_on_page_writeback(page); | ||
1660 | |||
1661 | if (shmem_should_replace_page(page, gfp)) { | ||
1662 | error = shmem_replace_page(&page, gfp, info, index); | ||
1663 | if (error) | ||
1664 | goto failed; | ||
1665 | } | ||
1666 | |||
1667 | error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, | ||
1668 | false); | ||
1669 | if (!error) { | ||
1670 | error = shmem_add_to_page_cache(page, mapping, index, | ||
1671 | swp_to_radix_entry(swap), gfp); | ||
1672 | /* | ||
1673 | * We already confirmed swap under page lock, and make | ||
1674 | * no memory allocation here, so usually no possibility | ||
1675 | * of error; but free_swap_and_cache() only trylocks a | ||
1676 | * page, so it is just possible that the entry has been | ||
1677 | * truncated or holepunched since swap was confirmed. | ||
1678 | * shmem_undo_range() will have done some of the | ||
1679 | * unaccounting, now delete_from_swap_cache() will do | ||
1680 | * the rest. | ||
1681 | */ | ||
1682 | if (error) { | ||
1683 | mem_cgroup_cancel_charge(page, memcg, false); | ||
1684 | delete_from_swap_cache(page); | ||
1685 | } | ||
1686 | } | ||
1687 | if (error) | ||
1688 | goto failed; | ||
1689 | |||
1690 | mem_cgroup_commit_charge(page, memcg, true, false); | ||
1691 | |||
1692 | spin_lock_irq(&info->lock); | ||
1693 | info->swapped--; | ||
1694 | shmem_recalc_inode(inode); | ||
1695 | spin_unlock_irq(&info->lock); | ||
1696 | |||
1697 | if (sgp == SGP_WRITE) | ||
1698 | mark_page_accessed(page); | ||
1699 | |||
1700 | delete_from_swap_cache(page); | ||
1701 | set_page_dirty(page); | ||
1702 | swap_free(swap); | ||
1703 | |||
1704 | *pagep = page; | ||
1705 | return 0; | ||
1706 | failed: | ||
1707 | if (!shmem_confirm_swap(mapping, index, swap)) | ||
1708 | error = -EEXIST; | ||
1709 | unlock: | ||
1710 | if (page) { | ||
1711 | unlock_page(page); | ||
1712 | put_page(page); | ||
1713 | } | ||
1714 | |||
1715 | return error; | ||
1716 | } | ||
1717 | |||
1718 | /* | ||
1579 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate | 1719 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate |
1580 | * | 1720 | * |
1581 | * If we allocate a new one we do not mark it dirty. That's up to the | 1721 | * If we allocate a new one we do not mark it dirty. That's up to the |
@@ -1596,7 +1736,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | |||
1596 | struct mm_struct *charge_mm; | 1736 | struct mm_struct *charge_mm; |
1597 | struct mem_cgroup *memcg; | 1737 | struct mem_cgroup *memcg; |
1598 | struct page *page; | 1738 | struct page *page; |
1599 | swp_entry_t swap; | ||
1600 | enum sgp_type sgp_huge = sgp; | 1739 | enum sgp_type sgp_huge = sgp; |
1601 | pgoff_t hindex = index; | 1740 | pgoff_t hindex = index; |
1602 | int error; | 1741 | int error; |
@@ -1608,17 +1747,23 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | |||
1608 | if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) | 1747 | if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) |
1609 | sgp = SGP_CACHE; | 1748 | sgp = SGP_CACHE; |
1610 | repeat: | 1749 | repeat: |
1611 | swap.val = 0; | 1750 | if (sgp <= SGP_CACHE && |
1751 | ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { | ||
1752 | return -EINVAL; | ||
1753 | } | ||
1754 | |||
1755 | sbinfo = SHMEM_SB(inode->i_sb); | ||
1756 | charge_mm = vma ? vma->vm_mm : current->mm; | ||
1757 | |||
1612 | page = find_lock_entry(mapping, index); | 1758 | page = find_lock_entry(mapping, index); |
1613 | if (xa_is_value(page)) { | 1759 | if (xa_is_value(page)) { |
1614 | swap = radix_to_swp_entry(page); | 1760 | error = shmem_swapin_page(inode, index, &page, |
1615 | page = NULL; | 1761 | sgp, gfp, vma, fault_type); |
1616 | } | 1762 | if (error == -EEXIST) |
1763 | goto repeat; | ||
1617 | 1764 | ||
1618 | if (sgp <= SGP_CACHE && | 1765 | *pagep = page; |
1619 | ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { | 1766 | return error; |
1620 | error = -EINVAL; | ||
1621 | goto unlock; | ||
1622 | } | 1767 | } |
1623 | 1768 | ||
1624 | if (page && sgp == SGP_WRITE) | 1769 | if (page && sgp == SGP_WRITE) |
@@ -1632,7 +1777,7 @@ repeat: | |||
1632 | put_page(page); | 1777 | put_page(page); |
1633 | page = NULL; | 1778 | page = NULL; |
1634 | } | 1779 | } |
1635 | if (page || (sgp == SGP_READ && !swap.val)) { | 1780 | if (page || sgp == SGP_READ) { |
1636 | *pagep = page; | 1781 | *pagep = page; |
1637 | return 0; | 1782 | return 0; |
1638 | } | 1783 | } |
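Editor's note: with the swap path factored out, the caller treats -EEXIST from shmem_swapin_page() as "the entry changed under us, look it up again", which is what the goto repeat above implements. The contract, reduced to its core (illustrative fragment):

    repeat:
    	page = find_lock_entry(mapping, index);
    	if (xa_is_value(page)) {		/* still a swap entry, not an uptodate page */
    		error = shmem_swapin_page(inode, index, &page, sgp, gfp,
    					  vma, fault_type);
    		if (error == -EEXIST)		/* raced with truncate or another swapin: retry */
    			goto repeat;

    		*pagep = page;			/* NULL plus an error code, or the locked page */
    		return error;
    	}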
@@ -1641,215 +1786,138 @@ repeat: | |||
1641 | * Fast cache lookup did not find it: | 1786 | * Fast cache lookup did not find it: |
1642 | * bring it back from swap or allocate. | 1787 | * bring it back from swap or allocate. |
1643 | */ | 1788 | */ |
1644 | sbinfo = SHMEM_SB(inode->i_sb); | ||
1645 | charge_mm = vma ? vma->vm_mm : current->mm; | ||
1646 | |||
1647 | if (swap.val) { | ||
1648 | /* Look it up and read it in.. */ | ||
1649 | page = lookup_swap_cache(swap, NULL, 0); | ||
1650 | if (!page) { | ||
1651 | /* Or update major stats only when swapin succeeds?? */ | ||
1652 | if (fault_type) { | ||
1653 | *fault_type |= VM_FAULT_MAJOR; | ||
1654 | count_vm_event(PGMAJFAULT); | ||
1655 | count_memcg_event_mm(charge_mm, PGMAJFAULT); | ||
1656 | } | ||
1657 | /* Here we actually start the io */ | ||
1658 | page = shmem_swapin(swap, gfp, info, index); | ||
1659 | if (!page) { | ||
1660 | error = -ENOMEM; | ||
1661 | goto failed; | ||
1662 | } | ||
1663 | } | ||
1664 | |||
1665 | /* We have to do this with page locked to prevent races */ | ||
1666 | lock_page(page); | ||
1667 | if (!PageSwapCache(page) || page_private(page) != swap.val || | ||
1668 | !shmem_confirm_swap(mapping, index, swap)) { | ||
1669 | error = -EEXIST; /* try again */ | ||
1670 | goto unlock; | ||
1671 | } | ||
1672 | if (!PageUptodate(page)) { | ||
1673 | error = -EIO; | ||
1674 | goto failed; | ||
1675 | } | ||
1676 | wait_on_page_writeback(page); | ||
1677 | |||
1678 | if (shmem_should_replace_page(page, gfp)) { | ||
1679 | error = shmem_replace_page(&page, gfp, info, index); | ||
1680 | if (error) | ||
1681 | goto failed; | ||
1682 | } | ||
1683 | |||
1684 | error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, | ||
1685 | false); | ||
1686 | if (!error) { | ||
1687 | error = shmem_add_to_page_cache(page, mapping, index, | ||
1688 | swp_to_radix_entry(swap), gfp); | ||
1689 | /* | ||
1690 | * We already confirmed swap under page lock, and make | ||
1691 | * no memory allocation here, so usually no possibility | ||
1692 | * of error; but free_swap_and_cache() only trylocks a | ||
1693 | * page, so it is just possible that the entry has been | ||
1694 | * truncated or holepunched since swap was confirmed. | ||
1695 | * shmem_undo_range() will have done some of the | ||
1696 | * unaccounting, now delete_from_swap_cache() will do | ||
1697 | * the rest. | ||
1698 | * Reset swap.val? No, leave it so "failed" goes back to | ||
1699 | * "repeat": reading a hole and writing should succeed. | ||
1700 | */ | ||
1701 | if (error) { | ||
1702 | mem_cgroup_cancel_charge(page, memcg, false); | ||
1703 | delete_from_swap_cache(page); | ||
1704 | } | ||
1705 | } | ||
1706 | if (error) | ||
1707 | goto failed; | ||
1708 | |||
1709 | mem_cgroup_commit_charge(page, memcg, true, false); | ||
1710 | |||
1711 | spin_lock_irq(&info->lock); | ||
1712 | info->swapped--; | ||
1713 | shmem_recalc_inode(inode); | ||
1714 | spin_unlock_irq(&info->lock); | ||
1715 | |||
1716 | if (sgp == SGP_WRITE) | ||
1717 | mark_page_accessed(page); | ||
1718 | 1789 | ||
1719 | delete_from_swap_cache(page); | 1790 | if (vma && userfaultfd_missing(vma)) { |
1720 | set_page_dirty(page); | 1791 | *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); |
1721 | swap_free(swap); | 1792 | return 0; |
1722 | 1793 | } | |
1723 | } else { | ||
1724 | if (vma && userfaultfd_missing(vma)) { | ||
1725 | *fault_type = handle_userfault(vmf, VM_UFFD_MISSING); | ||
1726 | return 0; | ||
1727 | } | ||
1728 | 1794 | ||
1729 | /* shmem_symlink() */ | 1795 | /* shmem_symlink() */ |
1730 | if (mapping->a_ops != &shmem_aops) | 1796 | if (mapping->a_ops != &shmem_aops) |
1731 | goto alloc_nohuge; | 1797 | goto alloc_nohuge; |
1732 | if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) | 1798 | if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) |
1733 | goto alloc_nohuge; | 1799 | goto alloc_nohuge; |
1734 | if (shmem_huge == SHMEM_HUGE_FORCE) | 1800 | if (shmem_huge == SHMEM_HUGE_FORCE) |
1801 | goto alloc_huge; | ||
1802 | switch (sbinfo->huge) { | ||
1803 | loff_t i_size; | ||
1804 | pgoff_t off; | ||
1805 | case SHMEM_HUGE_NEVER: | ||
1806 | goto alloc_nohuge; | ||
1807 | case SHMEM_HUGE_WITHIN_SIZE: | ||
1808 | off = round_up(index, HPAGE_PMD_NR); | ||
1809 | i_size = round_up(i_size_read(inode), PAGE_SIZE); | ||
1810 | if (i_size >= HPAGE_PMD_SIZE && | ||
1811 | i_size >> PAGE_SHIFT >= off) | ||
1735 | goto alloc_huge; | 1812 | goto alloc_huge; |
1736 | switch (sbinfo->huge) { | 1813 | /* fallthrough */ |
1737 | loff_t i_size; | 1814 | case SHMEM_HUGE_ADVISE: |
1738 | pgoff_t off; | 1815 | if (sgp_huge == SGP_HUGE) |
1739 | case SHMEM_HUGE_NEVER: | 1816 | goto alloc_huge; |
1740 | goto alloc_nohuge; | 1817 | /* TODO: implement fadvise() hints */ |
1741 | case SHMEM_HUGE_WITHIN_SIZE: | 1818 | goto alloc_nohuge; |
1742 | off = round_up(index, HPAGE_PMD_NR); | 1819 | } |
1743 | i_size = round_up(i_size_read(inode), PAGE_SIZE); | ||
1744 | if (i_size >= HPAGE_PMD_SIZE && | ||
1745 | i_size >> PAGE_SHIFT >= off) | ||
1746 | goto alloc_huge; | ||
1747 | /* fallthrough */ | ||
1748 | case SHMEM_HUGE_ADVISE: | ||
1749 | if (sgp_huge == SGP_HUGE) | ||
1750 | goto alloc_huge; | ||
1751 | /* TODO: implement fadvise() hints */ | ||
1752 | goto alloc_nohuge; | ||
1753 | } | ||
1754 | 1820 | ||
1755 | alloc_huge: | 1821 | alloc_huge: |
1756 | page = shmem_alloc_and_acct_page(gfp, inode, index, true); | 1822 | page = shmem_alloc_and_acct_page(gfp, inode, index, true); |
1757 | if (IS_ERR(page)) { | 1823 | if (IS_ERR(page)) { |
1758 | alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, | 1824 | alloc_nohuge: |
1759 | index, false); | 1825 | page = shmem_alloc_and_acct_page(gfp, inode, |
1760 | } | 1826 | index, false); |
1761 | if (IS_ERR(page)) { | 1827 | } |
1762 | int retry = 5; | 1828 | if (IS_ERR(page)) { |
1763 | error = PTR_ERR(page); | 1829 | int retry = 5; |
1764 | page = NULL; | ||
1765 | if (error != -ENOSPC) | ||
1766 | goto failed; | ||
1767 | /* | ||
1768 | * Try to reclaim some spece by splitting a huge page | ||
1769 | * beyond i_size on the filesystem. | ||
1770 | */ | ||
1771 | while (retry--) { | ||
1772 | int ret; | ||
1773 | ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); | ||
1774 | if (ret == SHRINK_STOP) | ||
1775 | break; | ||
1776 | if (ret) | ||
1777 | goto alloc_nohuge; | ||
1778 | } | ||
1779 | goto failed; | ||
1780 | } | ||
1781 | |||
1782 | if (PageTransHuge(page)) | ||
1783 | hindex = round_down(index, HPAGE_PMD_NR); | ||
1784 | else | ||
1785 | hindex = index; | ||
1786 | 1830 | ||
1787 | if (sgp == SGP_WRITE) | 1831 | error = PTR_ERR(page); |
1788 | __SetPageReferenced(page); | 1832 | page = NULL; |
1833 | if (error != -ENOSPC) | ||
1834 | goto unlock; | ||
1835 | /* | ||
1836 | * Try to reclaim some space by splitting a huge page | ||
1837 | * beyond i_size on the filesystem. | ||
1838 | */ | ||
1839 | while (retry--) { | ||
1840 | int ret; | ||
1789 | 1841 | ||
1790 | error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, | 1842 | ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); |
1791 | PageTransHuge(page)); | 1843 | if (ret == SHRINK_STOP) |
1792 | if (error) | 1844 | break; |
1793 | goto unacct; | 1845 | if (ret) |
1794 | error = shmem_add_to_page_cache(page, mapping, hindex, | 1846 | goto alloc_nohuge; |
1795 | NULL, gfp & GFP_RECLAIM_MASK); | ||
1796 | if (error) { | ||
1797 | mem_cgroup_cancel_charge(page, memcg, | ||
1798 | PageTransHuge(page)); | ||
1799 | goto unacct; | ||
1800 | } | 1847 | } |
1801 | mem_cgroup_commit_charge(page, memcg, false, | 1848 | goto unlock; |
1802 | PageTransHuge(page)); | 1849 | } |
1803 | lru_cache_add_anon(page); | ||
1804 | 1850 | ||
1805 | spin_lock_irq(&info->lock); | 1851 | if (PageTransHuge(page)) |
1806 | info->alloced += 1 << compound_order(page); | 1852 | hindex = round_down(index, HPAGE_PMD_NR); |
1807 | inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); | 1853 | else |
1808 | shmem_recalc_inode(inode); | 1854 | hindex = index; |
1809 | spin_unlock_irq(&info->lock); | ||
1810 | alloced = true; | ||
1811 | 1855 | ||
1812 | if (PageTransHuge(page) && | 1856 | if (sgp == SGP_WRITE) |
1813 | DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < | 1857 | __SetPageReferenced(page); |
1814 | hindex + HPAGE_PMD_NR - 1) { | 1858 | |
1815 | /* | 1859 | error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, |
1816 | * Part of the huge page is beyond i_size: subject | 1860 | PageTransHuge(page)); |
1817 | * to shrink under memory pressure. | 1861 | if (error) |
1818 | */ | 1862 | goto unacct; |
1819 | spin_lock(&sbinfo->shrinklist_lock); | 1863 | error = shmem_add_to_page_cache(page, mapping, hindex, |
1820 | /* | 1864 | NULL, gfp & GFP_RECLAIM_MASK); |
1821 | * _careful to defend against unlocked access to | 1865 | if (error) { |
1822 | * ->shrink_list in shmem_unused_huge_shrink() | 1866 | mem_cgroup_cancel_charge(page, memcg, |
1823 | */ | 1867 | PageTransHuge(page)); |
1824 | if (list_empty_careful(&info->shrinklist)) { | 1868 | goto unacct; |
1825 | list_add_tail(&info->shrinklist, | 1869 | } |
1826 | &sbinfo->shrinklist); | 1870 | mem_cgroup_commit_charge(page, memcg, false, |
1827 | sbinfo->shrinklist_len++; | 1871 | PageTransHuge(page)); |
1828 | } | 1872 | lru_cache_add_anon(page); |
1829 | spin_unlock(&sbinfo->shrinklist_lock); | ||
1830 | } | ||
1831 | 1873 | ||
1874 | spin_lock_irq(&info->lock); | ||
1875 | info->alloced += 1 << compound_order(page); | ||
1876 | inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); | ||
1877 | shmem_recalc_inode(inode); | ||
1878 | spin_unlock_irq(&info->lock); | ||
1879 | alloced = true; | ||
1880 | |||
1881 | if (PageTransHuge(page) && | ||
1882 | DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < | ||
1883 | hindex + HPAGE_PMD_NR - 1) { | ||
1832 | /* | 1884 | /* |
1833 | * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. | 1885 | * Part of the huge page is beyond i_size: subject |
1886 | * to shrink under memory pressure. | ||
1834 | */ | 1887 | */ |
1835 | if (sgp == SGP_FALLOC) | 1888 | spin_lock(&sbinfo->shrinklist_lock); |
1836 | sgp = SGP_WRITE; | ||
1837 | clear: | ||
1838 | /* | 1889 | /* |
1839 | * Let SGP_WRITE caller clear ends if write does not fill page; | 1890 | * _careful to defend against unlocked access to |
1840 | * but SGP_FALLOC on a page fallocated earlier must initialize | 1891 | * ->shrink_list in shmem_unused_huge_shrink() |
1841 | * it now, lest undo on failure cancel our earlier guarantee. | ||
1842 | */ | 1892 | */ |
1843 | if (sgp != SGP_WRITE && !PageUptodate(page)) { | 1893 | if (list_empty_careful(&info->shrinklist)) { |
1844 | struct page *head = compound_head(page); | 1894 | list_add_tail(&info->shrinklist, |
1845 | int i; | 1895 | &sbinfo->shrinklist); |
1896 | sbinfo->shrinklist_len++; | ||
1897 | } | ||
1898 | spin_unlock(&sbinfo->shrinklist_lock); | ||
1899 | } | ||
1846 | 1900 | ||
1847 | for (i = 0; i < (1 << compound_order(head)); i++) { | 1901 | /* |
1848 | clear_highpage(head + i); | 1902 | * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. |
1849 | flush_dcache_page(head + i); | 1903 | */ |
1850 | } | 1904 | if (sgp == SGP_FALLOC) |
1851 | SetPageUptodate(head); | 1905 | sgp = SGP_WRITE; |
1906 | clear: | ||
1907 | /* | ||
1908 | * Let SGP_WRITE caller clear ends if write does not fill page; | ||
1909 | * but SGP_FALLOC on a page fallocated earlier must initialize | ||
1910 | * it now, lest undo on failure cancel our earlier guarantee. | ||
1911 | */ | ||
1912 | if (sgp != SGP_WRITE && !PageUptodate(page)) { | ||
1913 | struct page *head = compound_head(page); | ||
1914 | int i; | ||
1915 | |||
1916 | for (i = 0; i < (1 << compound_order(head)); i++) { | ||
1917 | clear_highpage(head + i); | ||
1918 | flush_dcache_page(head + i); | ||
1852 | } | 1919 | } |
1920 | SetPageUptodate(head); | ||
1853 | } | 1921 | } |
1854 | 1922 | ||
1855 | /* Perhaps the file has been truncated since we checked */ | 1923 | /* Perhaps the file has been truncated since we checked */ |
@@ -1879,9 +1947,6 @@ unacct: | |||
1879 | put_page(page); | 1947 | put_page(page); |
1880 | goto alloc_nohuge; | 1948 | goto alloc_nohuge; |
1881 | } | 1949 | } |
1882 | failed: | ||
1883 | if (swap.val && !shmem_confirm_swap(mapping, index, swap)) | ||
1884 | error = -EEXIST; | ||
1885 | unlock: | 1950 | unlock: |
1886 | if (page) { | 1951 | if (page) { |
1887 | unlock_page(page); | 1952 | unlock_page(page); |
@@ -2125,6 +2190,24 @@ out_nomem: | |||
2125 | 2190 | ||
2126 | static int shmem_mmap(struct file *file, struct vm_area_struct *vma) | 2191 | static int shmem_mmap(struct file *file, struct vm_area_struct *vma) |
2127 | { | 2192 | { |
2193 | struct shmem_inode_info *info = SHMEM_I(file_inode(file)); | ||
2194 | |||
2195 | if (info->seals & F_SEAL_FUTURE_WRITE) { | ||
2196 | /* | ||
2197 | * New PROT_WRITE and MAP_SHARED mmaps are not allowed when | ||
2198 | * "future write" seal active. | ||
2199 | */ | ||
2200 | if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) | ||
2201 | return -EPERM; | ||
2202 | |||
2203 | /* | ||
2204 | * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED | ||
2205 | * read-only mapping, take care to not allow mprotect to revert | ||
2206 | * protections. | ||
2207 | */ | ||
2208 | vma->vm_flags &= ~(VM_MAYWRITE); | ||
2209 | } | ||
2210 | |||
2128 | file_accessed(file); | 2211 | file_accessed(file); |
2129 | vma->vm_ops = &shmem_vm_ops; | 2212 | vma->vm_ops = &shmem_vm_ops; |
2130 | if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && | 2213 | if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && |
@@ -2375,8 +2458,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, | |||
2375 | pgoff_t index = pos >> PAGE_SHIFT; | 2458 | pgoff_t index = pos >> PAGE_SHIFT; |
2376 | 2459 | ||
2377 | /* i_mutex is held by caller */ | 2460 | /* i_mutex is held by caller */ |
2378 | if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { | 2461 | if (unlikely(info->seals & (F_SEAL_GROW | |
2379 | if (info->seals & F_SEAL_WRITE) | 2462 | F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { |
2463 | if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) | ||
2380 | return -EPERM; | 2464 | return -EPERM; |
2381 | if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) | 2465 | if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) |
2382 | return -EPERM; | 2466 | return -EPERM; |
@@ -2639,7 +2723,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, | |||
2639 | DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); | 2723 | DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); |
2640 | 2724 | ||
2641 | /* protected by i_mutex */ | 2725 | /* protected by i_mutex */ |
2642 | if (info->seals & F_SEAL_WRITE) { | 2726 | if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { |
2643 | error = -EPERM; | 2727 | error = -EPERM; |
2644 | goto out; | 2728 | goto out; |
2645 | } | 2729 | } |
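Editor's note: the three shmem hunks above (mmap, write_begin and fallocate) are what give F_SEAL_FUTURE_WRITE its semantics. Once the seal is set, write(), fallocate() and any new shared writable mapping fail with EPERM, while writable mappings created before sealing keep working, and a read-only shared mapping loses VM_MAYWRITE so it cannot be mprotect()ed back to writable. A small userspace demonstration, assuming a libc new enough to expose memfd_create() and the sealing constants (the fallback define uses the uapi value this series introduces):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #ifndef F_SEAL_FUTURE_WRITE
    #define F_SEAL_FUTURE_WRITE 0x0010	/* uapi value introduced with this seal */
    #endif

    int main(void)
    {
    	int fd = memfd_create("demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);

    	if (fd < 0 || ftruncate(fd, 4096) < 0)
    		return 1;

    	if (write(fd, "before seal", 11) < 0)
    		perror("write before seal");		/* not expected to fail */

    	if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
    		perror("F_ADD_SEALS");			/* EINVAL on kernels without the seal */

    	if (write(fd, "after seal", 10) < 0)
    		perror("write after seal");		/* EPERM: shmem_write_begin() check */

    	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    	if (p == MAP_FAILED)
    		perror("writable MAP_SHARED");		/* EPERM: shmem_mmap() check */

    	p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);
    	if (p != MAP_FAILED &&
    	    mprotect(p, 4096, PROT_READ | PROT_WRITE) < 0)
    		perror("mprotect to writable");		/* EACCES: VM_MAYWRITE was cleared */

    	close(fd);
    	return 0;
    }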
@@ -3847,7 +3931,8 @@ int __init shmem_init(void) | |||
3847 | return 0; | 3931 | return 0; |
3848 | } | 3932 | } |
3849 | 3933 | ||
3850 | int shmem_unuse(swp_entry_t swap, struct page *page) | 3934 | int shmem_unuse(unsigned int type, bool frontswap, |
3935 | unsigned long *fs_pages_to_unuse) | ||
3851 | { | 3936 | { |
3852 | return 0; | 3937 | return 0; |
3853 | } | 3938 | } |
@@ -550,14 +550,6 @@ static void start_cpu_timer(int cpu) | |||
550 | 550 | ||
551 | static void init_arraycache(struct array_cache *ac, int limit, int batch) | 551 | static void init_arraycache(struct array_cache *ac, int limit, int batch) |
552 | { | 552 | { |
553 | /* | ||
554 | * The array_cache structures contain pointers to free object. | ||
555 | * However, when such objects are allocated or transferred to another | ||
556 | * cache the pointers are not cleared and they could be counted as | ||
557 | * valid references during a kmemleak scan. Therefore, kmemleak must | ||
558 | * not scan such objects. | ||
559 | */ | ||
560 | kmemleak_no_scan(ac); | ||
561 | if (ac) { | 553 | if (ac) { |
562 | ac->avail = 0; | 554 | ac->avail = 0; |
563 | ac->limit = limit; | 555 | ac->limit = limit; |
@@ -573,6 +565,14 @@ static struct array_cache *alloc_arraycache(int node, int entries, | |||
573 | struct array_cache *ac = NULL; | 565 | struct array_cache *ac = NULL; |
574 | 566 | ||
575 | ac = kmalloc_node(memsize, gfp, node); | 567 | ac = kmalloc_node(memsize, gfp, node); |
568 | /* | ||
569 | * The array_cache structures contain pointers to free object. | ||
570 | * However, when such objects are allocated or transferred to another | ||
571 | * cache the pointers are not cleared and they could be counted as | ||
572 | * valid references during a kmemleak scan. Therefore, kmemleak must | ||
573 | * not scan such objects. | ||
574 | */ | ||
575 | kmemleak_no_scan(ac); | ||
576 | init_arraycache(ac, entries, batchcount); | 576 | init_arraycache(ac, entries, batchcount); |
577 | return ac; | 577 | return ac; |
578 | } | 578 | } |
@@ -667,6 +667,7 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, | |||
667 | 667 | ||
668 | alc = kmalloc_node(memsize, gfp, node); | 668 | alc = kmalloc_node(memsize, gfp, node); |
669 | if (alc) { | 669 | if (alc) { |
670 | kmemleak_no_scan(alc); | ||
670 | init_arraycache(&alc->ac, entries, batch); | 671 | init_arraycache(&alc->ac, entries, batch); |
671 | spin_lock_init(&alc->lock); | 672 | spin_lock_init(&alc->lock); |
672 | } | 673 | } |
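Editor's note: the slab hunks above relocate kmemleak_no_scan(). init_arraycache() used to apply it to whatever array_cache pointer it was given, which for alien caches is the embedded &alc->ac rather than the kmalloc'ed object, so the annotation now sits directly at the two allocation sites. The resulting shape, as a fragment with added comments:

    	/* Plain per-CPU array cache: annotate the kmalloc'ed buffer itself.
    	 * These buffers hold pointers to free objects, which kmemleak would
    	 * otherwise count as live references during a scan. */
    	ac = kmalloc_node(memsize, gfp, node);
    	kmemleak_no_scan(ac);			/* a NULL pointer is tolerated */
    	init_arraycache(ac, entries, batchcount);

    	/* Alien cache: annotate the enclosing allocation, not the embedded ac. */
    	alc = kmalloc_node(memsize, gfp, node);
    	if (alc) {
    		kmemleak_no_scan(alc);
    		init_arraycache(&alc->ac, entries, batch);
    		spin_lock_init(&alc->lock);
    	}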
@@ -676,12 +677,11 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries, | |||
676 | static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) | 677 | static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) |
677 | { | 678 | { |
678 | struct alien_cache **alc_ptr; | 679 | struct alien_cache **alc_ptr; |
679 | size_t memsize = sizeof(void *) * nr_node_ids; | ||
680 | int i; | 680 | int i; |
681 | 681 | ||
682 | if (limit > 1) | 682 | if (limit > 1) |
683 | limit = 12; | 683 | limit = 12; |
684 | alc_ptr = kzalloc_node(memsize, gfp, node); | 684 | alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node); |
685 | if (!alc_ptr) | 685 | if (!alc_ptr) |
686 | return NULL; | 686 | return NULL; |
687 | 687 | ||
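Editor's note: alloc_alien_cache() above also switches from an open-coded size multiplication to kcalloc_node(), which zeroes the array just like kzalloc_node() but lets the allocator check nr_node_ids * sizeof(void *) for overflow instead of trusting the caller's arithmetic. Side by side:

    	/* old: the caller multiplies, nothing checks the product for overflow */
    	alc_ptr = kzalloc_node(sizeof(void *) * nr_node_ids, gfp, node);

    	/* new: count and element size are passed separately and overflow-checked */
    	alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node);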
@@ -1727,6 +1727,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) | |||
1727 | * This could be made much more intelligent. For now, try to avoid using | 1727 | * This could be made much more intelligent. For now, try to avoid using |
1728 | * high order pages for slabs. When the gfp() functions are more friendly | 1728 | * high order pages for slabs. When the gfp() functions are more friendly |
1729 | * towards high-order requests, this should be changed. | 1729 | * towards high-order requests, this should be changed. |
1730 | * | ||
1731 | * Return: number of left-over bytes in a slab | ||
1730 | */ | 1732 | */ |
1731 | static size_t calculate_slab_order(struct kmem_cache *cachep, | 1733 | static size_t calculate_slab_order(struct kmem_cache *cachep, |
1732 | size_t size, slab_flags_t flags) | 1734 | size_t size, slab_flags_t flags) |
@@ -1975,6 +1977,8 @@ static bool set_on_slab_cache(struct kmem_cache *cachep, | |||
1975 | * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware | 1977 | * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware |
1976 | * cacheline. This can be beneficial if you're counting cycles as closely | 1978 | * cacheline. This can be beneficial if you're counting cycles as closely |
1977 | * as davem. | 1979 | * as davem. |
1980 | * | ||
1981 | * Return: a pointer to the created cache or %NULL in case of error | ||
1978 | */ | 1982 | */ |
1979 | int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) | 1983 | int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) |
1980 | { | 1984 | { |
@@ -3542,6 +3546,8 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, | |||
3542 | * | 3546 | * |
3543 | * Allocate an object from this cache. The flags are only relevant | 3547 | * Allocate an object from this cache. The flags are only relevant |
3544 | * if the cache has no available objects. | 3548 | * if the cache has no available objects. |
3549 | * | ||
3550 | * Return: pointer to the new object or %NULL in case of error | ||
3545 | */ | 3551 | */ |
3546 | void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3552 | void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
3547 | { | 3553 | { |
@@ -3631,6 +3637,8 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); | |||
3631 | * node, which can improve the performance for cpu bound structures. | 3637 | * node, which can improve the performance for cpu bound structures. |
3632 | * | 3638 | * |
3633 | * Fallback to other node is possible if __GFP_THISNODE is not set. | 3639 | * Fallback to other node is possible if __GFP_THISNODE is not set. |
3640 | * | ||
3641 | * Return: pointer to the new object or %NULL in case of error | ||
3634 | */ | 3642 | */ |
3635 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 3643 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) |
3636 | { | 3644 | { |
@@ -3699,6 +3707,8 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller); | |||
3699 | * @size: how many bytes of memory are required. | 3707 | * @size: how many bytes of memory are required. |
3700 | * @flags: the type of memory to allocate (see kmalloc). | 3708 | * @flags: the type of memory to allocate (see kmalloc). |
3701 | * @caller: function caller for debug tracking of the caller | 3709 | * @caller: function caller for debug tracking of the caller |
3710 | * | ||
3711 | * Return: pointer to the allocated memory or %NULL in case of error | ||
3702 | */ | 3712 | */ |
3703 | static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | 3713 | static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, |
3704 | unsigned long caller) | 3714 | unsigned long caller) |
@@ -4164,6 +4174,8 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) | |||
4164 | * @buffer: user buffer | 4174 | * @buffer: user buffer |
4165 | * @count: data length | 4175 | * @count: data length |
4166 | * @ppos: unused | 4176 | * @ppos: unused |
4177 | * | ||
4178 | * Return: %0 on success, negative error code otherwise. | ||
4167 | */ | 4179 | */ |
4168 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, | 4180 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, |
4169 | size_t count, loff_t *ppos) | 4181 | size_t count, loff_t *ppos) |
@@ -4457,6 +4469,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, | |||
4457 | * The caller must guarantee that objp points to a valid object previously | 4469 | * The caller must guarantee that objp points to a valid object previously |
4458 | * allocated with either kmalloc() or kmem_cache_alloc(). The object | 4470 | * allocated with either kmalloc() or kmem_cache_alloc(). The object |
4459 | * must not be freed during the duration of the call. | 4471 | * must not be freed during the duration of the call. |
4472 | * | ||
4473 | * Return: size of the actual memory used by @objp in bytes | ||
4460 | */ | 4474 | */ |
4461 | size_t ksize(const void *objp) | 4475 | size_t ksize(const void *objp) |
4462 | { | 4476 | { |
@@ -276,8 +276,6 @@ static __always_inline int memcg_charge_slab(struct page *page, | |||
276 | gfp_t gfp, int order, | 276 | gfp_t gfp, int order, |
277 | struct kmem_cache *s) | 277 | struct kmem_cache *s) |
278 | { | 278 | { |
279 | if (!memcg_kmem_enabled()) | ||
280 | return 0; | ||
281 | if (is_root_cache(s)) | 279 | if (is_root_cache(s)) |
282 | return 0; | 280 | return 0; |
283 | return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); | 281 | return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); |
@@ -286,8 +284,6 @@ static __always_inline int memcg_charge_slab(struct page *page, | |||
286 | static __always_inline void memcg_uncharge_slab(struct page *page, int order, | 284 | static __always_inline void memcg_uncharge_slab(struct page *page, int order, |
287 | struct kmem_cache *s) | 285 | struct kmem_cache *s) |
288 | { | 286 | { |
289 | if (!memcg_kmem_enabled()) | ||
290 | return; | ||
291 | memcg_kmem_uncharge(page, order); | 287 | memcg_kmem_uncharge(page, order); |
292 | } | 288 | } |
293 | 289 | ||
diff --git a/mm/slab_common.c b/mm/slab_common.c index f9d89c1b5977..03eeb8b7b4b1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -939,6 +939,8 @@ EXPORT_SYMBOL(kmem_cache_destroy); | |||
939 | * | 939 | * |
940 | * Releases as many slabs as possible for a cache. | 940 | * Releases as many slabs as possible for a cache. |
941 | * To help debugging, a zero exit status indicates all slabs were released. | 941 | * To help debugging, a zero exit status indicates all slabs were released. |
942 | * | ||
943 | * Return: %0 if all slabs were released, non-zero otherwise | ||
942 | */ | 944 | */ |
943 | int kmem_cache_shrink(struct kmem_cache *cachep) | 945 | int kmem_cache_shrink(struct kmem_cache *cachep) |
944 | { | 946 | { |
@@ -1425,7 +1427,7 @@ void dump_unreclaimable_slab(void) | |||
1425 | #if defined(CONFIG_MEMCG) | 1427 | #if defined(CONFIG_MEMCG) |
1426 | void *memcg_slab_start(struct seq_file *m, loff_t *pos) | 1428 | void *memcg_slab_start(struct seq_file *m, loff_t *pos) |
1427 | { | 1429 | { |
1428 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 1430 | struct mem_cgroup *memcg = mem_cgroup_from_seq(m); |
1429 | 1431 | ||
1430 | mutex_lock(&slab_mutex); | 1432 | mutex_lock(&slab_mutex); |
1431 | return seq_list_start(&memcg->kmem_caches, *pos); | 1433 | return seq_list_start(&memcg->kmem_caches, *pos); |
@@ -1433,7 +1435,7 @@ void *memcg_slab_start(struct seq_file *m, loff_t *pos) | |||
1433 | 1435 | ||
1434 | void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos) | 1436 | void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos) |
1435 | { | 1437 | { |
1436 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 1438 | struct mem_cgroup *memcg = mem_cgroup_from_seq(m); |
1437 | 1439 | ||
1438 | return seq_list_next(p, &memcg->kmem_caches, pos); | 1440 | return seq_list_next(p, &memcg->kmem_caches, pos); |
1439 | } | 1441 | } |
@@ -1447,7 +1449,7 @@ int memcg_slab_show(struct seq_file *m, void *p) | |||
1447 | { | 1449 | { |
1448 | struct kmem_cache *s = list_entry(p, struct kmem_cache, | 1450 | struct kmem_cache *s = list_entry(p, struct kmem_cache, |
1449 | memcg_params.kmem_caches_node); | 1451 | memcg_params.kmem_caches_node); |
1450 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 1452 | struct mem_cgroup *memcg = mem_cgroup_from_seq(m); |
1451 | 1453 | ||
1452 | if (p == memcg->kmem_caches.next) | 1454 | if (p == memcg->kmem_caches.next) |
1453 | print_slabinfo_header(m); | 1455 | print_slabinfo_header(m); |
@@ -1528,6 +1530,8 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, | |||
1528 | * This function is like krealloc() except it never frees the originally | 1530 | * This function is like krealloc() except it never frees the originally |
1529 | * allocated buffer. Use this if you don't want to free the buffer immediately | 1531 | * allocated buffer. Use this if you don't want to free the buffer immediately |
1530 | * like, for example, with RCU. | 1532 | * like, for example, with RCU. |
1533 | * | ||
1534 | * Return: pointer to the allocated memory or %NULL in case of error | ||
1531 | */ | 1535 | */ |
1532 | void *__krealloc(const void *p, size_t new_size, gfp_t flags) | 1536 | void *__krealloc(const void *p, size_t new_size, gfp_t flags) |
1533 | { | 1537 | { |
@@ -1549,6 +1553,8 @@ EXPORT_SYMBOL(__krealloc); | |||
1549 | * lesser of the new and old sizes. If @p is %NULL, krealloc() | 1553 | * lesser of the new and old sizes. If @p is %NULL, krealloc() |
1550 | * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a | 1554 | * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a |
1551 | * %NULL pointer, the object pointed to is freed. | 1555 | * %NULL pointer, the object pointed to is freed. |
1556 | * | ||
1557 | * Return: pointer to the allocated memory or %NULL in case of error | ||
1552 | */ | 1558 | */ |
1553 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | 1559 | void *krealloc(const void *p, size_t new_size, gfp_t flags) |
1554 | { | 1560 | { |
@@ -1093,8 +1093,7 @@ static void setup_page_debug(struct kmem_cache *s, void *addr, int order) | |||
1093 | } | 1093 | } |
1094 | 1094 | ||
1095 | static inline int alloc_consistency_checks(struct kmem_cache *s, | 1095 | static inline int alloc_consistency_checks(struct kmem_cache *s, |
1096 | struct page *page, | 1096 | struct page *page, void *object) |
1097 | void *object, unsigned long addr) | ||
1098 | { | 1097 | { |
1099 | if (!check_slab(s, page)) | 1098 | if (!check_slab(s, page)) |
1100 | return 0; | 1099 | return 0; |
@@ -1115,7 +1114,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, | |||
1115 | void *object, unsigned long addr) | 1114 | void *object, unsigned long addr) |
1116 | { | 1115 | { |
1117 | if (s->flags & SLAB_CONSISTENCY_CHECKS) { | 1116 | if (s->flags & SLAB_CONSISTENCY_CHECKS) { |
1118 | if (!alloc_consistency_checks(s, page, object, addr)) | 1117 | if (!alloc_consistency_checks(s, page, object)) |
1119 | goto bad; | 1118 | goto bad; |
1120 | } | 1119 | } |
1121 | 1120 | ||
@@ -2130,7 +2129,7 @@ redo: | |||
2130 | if (!lock) { | 2129 | if (!lock) { |
2131 | lock = 1; | 2130 | lock = 1; |
2132 | /* | 2131 | /* |
2133 | * Taking the spinlock removes the possiblity | 2132 | * Taking the spinlock removes the possibility |
2134 | * that acquire_slab() will see a slab page that | 2133 | * that acquire_slab() will see a slab page that |
2135 | * is frozen | 2134 | * is frozen |
2136 | */ | 2135 | */ |
@@ -2254,8 +2253,8 @@ static void unfreeze_partials(struct kmem_cache *s, | |||
2254 | } | 2253 | } |
2255 | 2254 | ||
2256 | /* | 2255 | /* |
2257 | * Put a page that was just frozen (in __slab_free) into a partial page | 2256 | * Put a page that was just frozen (in __slab_free|get_partial_node) into a |
2258 | * slot if available. | 2257 | * partial page slot if available. |
2259 | * | 2258 | * |
2260 | * If we did not find a slot then simply move all the partials to the | 2259 | * If we did not find a slot then simply move all the partials to the |
2261 | * per node partial list. | 2260 | * per node partial list. |
@@ -2482,8 +2481,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | |||
2482 | stat(s, ALLOC_SLAB); | 2481 | stat(s, ALLOC_SLAB); |
2483 | c->page = page; | 2482 | c->page = page; |
2484 | *pc = c; | 2483 | *pc = c; |
2485 | } else | 2484 | } |
2486 | freelist = NULL; | ||
2487 | 2485 | ||
2488 | return freelist; | 2486 | return freelist; |
2489 | } | 2487 | } |
@@ -4264,7 +4262,7 @@ void __init kmem_cache_init(void) | |||
4264 | cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, | 4262 | cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, |
4265 | slub_cpu_dead); | 4263 | slub_cpu_dead); |
4266 | 4264 | ||
4267 | pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n", | 4265 | pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n", |
4268 | cache_line_size(), | 4266 | cache_line_size(), |
4269 | slub_min_order, slub_max_order, slub_min_objects, | 4267 | slub_min_order, slub_max_order, slub_min_objects, |
4270 | nr_cpu_ids, nr_node_ids); | 4268 | nr_cpu_ids, nr_node_ids); |
diff --git a/mm/sparse.c b/mm/sparse.c index 7ea5dc6c6b19..77a0554fa5bd 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -197,7 +197,7 @@ static inline int next_present_section_nr(int section_nr) | |||
197 | } | 197 | } |
198 | #define for_each_present_section_nr(start, section_nr) \ | 198 | #define for_each_present_section_nr(start, section_nr) \ |
199 | for (section_nr = next_present_section_nr(start-1); \ | 199 | for (section_nr = next_present_section_nr(start-1); \ |
200 | ((section_nr >= 0) && \ | 200 | ((section_nr != -1) && \ |
201 | (section_nr <= __highest_present_section_nr)); \ | 201 | (section_nr <= __highest_present_section_nr)); \ |
202 | section_nr = next_present_section_nr(section_nr)) | 202 | section_nr = next_present_section_nr(section_nr)) |
203 | 203 | ||
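Editor's note: the sparse.c change above matters because the iterator used with for_each_present_section_nr() is unsigned, so the `section_nr >= 0` test is always true (and typically draws a compiler warning) instead of catching the -1 sentinel that next_present_section_nr() returns; `section_nr != -1` compares against the wrapped-around value and does the job explicitly. A standalone illustration of the pitfall (plain C, hypothetical variable):

    #include <stdio.h>

    int main(void)
    {
    	/* -1 assigned to an unsigned iterator wraps to ULONG_MAX */
    	unsigned long section_nr = (unsigned long)-1;

    	if (section_nr >= 0)			/* always true for an unsigned value */
    		printf("'>= 0' cannot detect the -1 sentinel\n");

    	if (section_nr != -1)			/* -1 is converted to ULONG_MAX here too */
    		printf("not reached\n");
    	else
    		printf("'!= -1' still matches after the wrap-around\n");

    	return 0;
    }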
@@ -58,16 +58,16 @@ static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); | |||
58 | static void __page_cache_release(struct page *page) | 58 | static void __page_cache_release(struct page *page) |
59 | { | 59 | { |
60 | if (PageLRU(page)) { | 60 | if (PageLRU(page)) { |
61 | struct zone *zone = page_zone(page); | 61 | pg_data_t *pgdat = page_pgdat(page); |
62 | struct lruvec *lruvec; | 62 | struct lruvec *lruvec; |
63 | unsigned long flags; | 63 | unsigned long flags; |
64 | 64 | ||
65 | spin_lock_irqsave(zone_lru_lock(zone), flags); | 65 | spin_lock_irqsave(&pgdat->lru_lock, flags); |
66 | lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); | 66 | lruvec = mem_cgroup_page_lruvec(page, pgdat); |
67 | VM_BUG_ON_PAGE(!PageLRU(page), page); | 67 | VM_BUG_ON_PAGE(!PageLRU(page), page); |
68 | __ClearPageLRU(page); | 68 | __ClearPageLRU(page); |
69 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); | 69 | del_page_from_lru_list(page, lruvec, page_off_lru(page)); |
70 | spin_unlock_irqrestore(zone_lru_lock(zone), flags); | 70 | spin_unlock_irqrestore(&pgdat->lru_lock, flags); |
71 | } | 71 | } |
72 | __ClearPageWaiters(page); | 72 | __ClearPageWaiters(page); |
73 | mem_cgroup_uncharge(page); | 73 | mem_cgroup_uncharge(page); |
@@ -322,12 +322,12 @@ static inline void activate_page_drain(int cpu) | |||
322 | 322 | ||
323 | void activate_page(struct page *page) | 323 | void activate_page(struct page *page) |
324 | { | 324 | { |
325 | struct zone *zone = page_zone(page); | 325 | pg_data_t *pgdat = page_pgdat(page); |
326 | 326 | ||
327 | page = compound_head(page); | 327 | page = compound_head(page); |
328 | spin_lock_irq(zone_lru_lock(zone)); | 328 | spin_lock_irq(&pgdat->lru_lock); |
329 | __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL); | 329 | __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL); |
330 | spin_unlock_irq(zone_lru_lock(zone)); | 330 | spin_unlock_irq(&pgdat->lru_lock); |
331 | } | 331 | } |
332 | #endif | 332 | #endif |
333 | 333 | ||
diff --git a/mm/swap_state.c b/mm/swap_state.c index fd2f21e1c60a..85245fdec8d9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -523,7 +523,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) | |||
523 | * This has been extended to use the NUMA policies from the mm triggering | 523 | * This has been extended to use the NUMA policies from the mm triggering |
524 | * the readahead. | 524 | * the readahead. |
525 | * | 525 | * |
526 | * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL. | 526 | * Caller must hold read mmap_sem if vmf->vma is not NULL. |
527 | */ | 527 | */ |
528 | struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, | 528 | struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, |
529 | struct vm_fault *vmf) | 529 | struct vm_fault *vmf) |
@@ -543,6 +543,13 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
543 | if (!mask) | 543 | if (!mask) |
544 | goto skip; | 544 | goto skip; |
545 | 545 | ||
546 | /* Test swap type to make sure the dereference is safe */ | ||
547 | if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) { | ||
548 | struct inode *inode = si->swap_file->f_mapping->host; | ||
549 | if (inode_read_congested(inode)) | ||
550 | goto skip; | ||
551 | } | ||
552 | |||
546 | do_poll = false; | 553 | do_poll = false; |
547 | /* Read a page_cluster sized and aligned cluster around offset. */ | 554 | /* Read a page_cluster sized and aligned cluster around offset. */ |
548 | start_offset = offset & ~mask; | 555 | start_offset = offset & ~mask; |
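Editor's note: the new check in swap_cluster_readahead() above skips the speculative part of swap readahead when the backing inode is already read-congested, so only the page actually being faulted gets read instead of queueing more optional I/O on a saturated device; the flags test guards the si->swap_file dereference. Condensed fragment:

    	if (si->flags & (SWP_BLKDEV | SWP_FS)) {
    		struct inode *inode = si->swap_file->f_mapping->host;

    		if (inode_read_congested(inode))
    			goto skip;	/* read_swap_cache_async() of just the faulting page */
    	}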
@@ -691,6 +698,20 @@ static void swap_ra_info(struct vm_fault *vmf, | |||
691 | pte_unmap(orig_pte); | 698 | pte_unmap(orig_pte); |
692 | } | 699 | } |
693 | 700 | ||
701 | /** | ||
702 | * swap_vma_readahead - swap in pages in hope we need them soon | ||
703 | * @entry: swap entry of this memory | ||
704 | * @gfp_mask: memory allocation flags | ||
705 | * @vmf: fault information | ||
706 | * | ||
707 | * Returns the struct page for entry and addr, after queueing swapin. | ||
708 | * | ||
710 | * Primitive swap readahead code. We simply read in a few pages whose | ||
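(continued below)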
710 | * virtual addresses are around the fault address in the same vma. | ||
711 | * | ||
712 | * Caller must hold read mmap_sem if vmf->vma is not NULL. | ||
713 | * | ||
714 | */ | ||
694 | static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, | 715 | static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, |
695 | struct vm_fault *vmf) | 716 | struct vm_fault *vmf) |
696 | { | 717 | { |
diff --git a/mm/swapfile.c b/mm/swapfile.c index dbac1d49469d..2b8d9c3fbb47 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -98,6 +98,15 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0); | |||
98 | 98 | ||
99 | atomic_t nr_rotate_swap = ATOMIC_INIT(0); | 99 | atomic_t nr_rotate_swap = ATOMIC_INIT(0); |
100 | 100 | ||
101 | static struct swap_info_struct *swap_type_to_swap_info(int type) | ||
102 | { | ||
103 | if (type >= READ_ONCE(nr_swapfiles)) | ||
104 | return NULL; | ||
105 | |||
106 | smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */ | ||
107 | return READ_ONCE(swap_info[type]); | ||
108 | } | ||
109 | |||
101 | static inline unsigned char swap_count(unsigned char ent) | 110 | static inline unsigned char swap_count(unsigned char ent) |
102 | { | 111 | { |
103 | return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ | 112 | return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */ |
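Editor's note: swap_type_to_swap_info() above becomes the single bounds-checked way to reach swap_info[type] without holding swap_lock. For its READ_ONCE()/smp_rmb() to be sufficient, the side that publishes a new entry has to store the pointer before it raises the count, with a write barrier in between; the helper's comment points at alloc_swap_info() for that. Illustrative publisher half, simplified:

    	/* Writer side (sketch): publish the entry before the index range
    	 * that makes it reachable, so a reader that observes
    	 * type < nr_swapfiles is guaranteed to observe the pointer too. */
    	swap_info[type] = p;
    	smp_wmb();	/* pairs with smp_rmb() in swap_type_to_swap_info() */
    	WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);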
@@ -1044,12 +1053,14 @@ noswap: | |||
1044 | /* The only caller of this function is now suspend routine */ | 1053 | /* The only caller of this function is now suspend routine */ |
1045 | swp_entry_t get_swap_page_of_type(int type) | 1054 | swp_entry_t get_swap_page_of_type(int type) |
1046 | { | 1055 | { |
1047 | struct swap_info_struct *si; | 1056 | struct swap_info_struct *si = swap_type_to_swap_info(type); |
1048 | pgoff_t offset; | 1057 | pgoff_t offset; |
1049 | 1058 | ||
1050 | si = swap_info[type]; | 1059 | if (!si) |
1060 | goto fail; | ||
1061 | |||
1051 | spin_lock(&si->lock); | 1062 | spin_lock(&si->lock); |
1052 | if (si && (si->flags & SWP_WRITEOK)) { | 1063 | if (si->flags & SWP_WRITEOK) { |
1053 | atomic_long_dec(&nr_swap_pages); | 1064 | atomic_long_dec(&nr_swap_pages); |
1054 | /* This is called for allocating swap entry, not cache */ | 1065 | /* This is called for allocating swap entry, not cache */ |
1055 | offset = scan_swap_map(si, 1); | 1066 | offset = scan_swap_map(si, 1); |
@@ -1060,6 +1071,7 @@ swp_entry_t get_swap_page_of_type(int type) | |||
1060 | atomic_long_inc(&nr_swap_pages); | 1071 | atomic_long_inc(&nr_swap_pages); |
1061 | } | 1072 | } |
1062 | spin_unlock(&si->lock); | 1073 | spin_unlock(&si->lock); |
1074 | fail: | ||
1063 | return (swp_entry_t) {0}; | 1075 | return (swp_entry_t) {0}; |
1064 | } | 1076 | } |
1065 | 1077 | ||
@@ -1071,9 +1083,9 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry) | |||
1071 | if (!entry.val) | 1083 | if (!entry.val) |
1072 | goto out; | 1084 | goto out; |
1073 | type = swp_type(entry); | 1085 | type = swp_type(entry); |
1074 | if (type >= nr_swapfiles) | 1086 | p = swap_type_to_swap_info(type); |
1087 | if (!p) | ||
1075 | goto bad_nofile; | 1088 | goto bad_nofile; |
1076 | p = swap_info[type]; | ||
1077 | if (!(p->flags & SWP_USED)) | 1089 | if (!(p->flags & SWP_USED)) |
1078 | goto bad_device; | 1090 | goto bad_device; |
1079 | offset = swp_offset(entry); | 1091 | offset = swp_offset(entry); |
@@ -1697,10 +1709,9 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
1697 | sector_t swapdev_block(int type, pgoff_t offset) | 1709 | sector_t swapdev_block(int type, pgoff_t offset) |
1698 | { | 1710 | { |
1699 | struct block_device *bdev; | 1711 | struct block_device *bdev; |
1712 | struct swap_info_struct *si = swap_type_to_swap_info(type); | ||
1700 | 1713 | ||
1701 | if ((unsigned int)type >= nr_swapfiles) | 1714 | if (!si || !(si->flags & SWP_WRITEOK)) |
1702 | return 0; | ||
1703 | if (!(swap_info[type]->flags & SWP_WRITEOK)) | ||
1704 | return 0; | 1715 | return 0; |
1705 | return map_swap_entry(swp_entry(type, offset), &bdev); | 1716 | return map_swap_entry(swp_entry(type, offset), &bdev); |
1706 | } | 1717 | } |
@@ -1799,44 +1810,77 @@ out_nolock: | |||
1799 | } | 1810 | } |
1800 | 1811 | ||
1801 | static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 1812 | static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
1802 | unsigned long addr, unsigned long end, | 1813 | unsigned long addr, unsigned long end, |
1803 | swp_entry_t entry, struct page *page) | 1814 | unsigned int type, bool frontswap, |
1815 | unsigned long *fs_pages_to_unuse) | ||
1804 | { | 1816 | { |
1805 | pte_t swp_pte = swp_entry_to_pte(entry); | 1817 | struct page *page; |
1818 | swp_entry_t entry; | ||
1806 | pte_t *pte; | 1819 | pte_t *pte; |
1820 | struct swap_info_struct *si; | ||
1821 | unsigned long offset; | ||
1807 | int ret = 0; | 1822 | int ret = 0; |
1823 | volatile unsigned char *swap_map; | ||
1808 | 1824 | ||
1809 | /* | 1825 | si = swap_info[type]; |
1810 | * We don't actually need pte lock while scanning for swp_pte: since | ||
1811 | * we hold page lock and mmap_sem, swp_pte cannot be inserted into the | ||
1812 | * page table while we're scanning; though it could get zapped, and on | ||
1813 | * some architectures (e.g. x86_32 with PAE) we might catch a glimpse | ||
1814 | * of unmatched parts which look like swp_pte, so unuse_pte must | ||
1815 | * recheck under pte lock. Scanning without pte lock lets it be | ||
1816 | * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. | ||
1817 | */ | ||
1818 | pte = pte_offset_map(pmd, addr); | 1826 | pte = pte_offset_map(pmd, addr); |
1819 | do { | 1827 | do { |
1820 | /* | 1828 | struct vm_fault vmf; |
1821 | * swapoff spends a _lot_ of time in this loop! | 1829 | |
1822 | * Test inline before going to call unuse_pte. | 1830 | if (!is_swap_pte(*pte)) |
1823 | */ | 1831 | continue; |
1824 | if (unlikely(pte_same_as_swp(*pte, swp_pte))) { | 1832 | |
1825 | pte_unmap(pte); | 1833 | entry = pte_to_swp_entry(*pte); |
1826 | ret = unuse_pte(vma, pmd, addr, entry, page); | 1834 | if (swp_type(entry) != type) |
1827 | if (ret) | 1835 | continue; |
1828 | goto out; | 1836 | |
1829 | pte = pte_offset_map(pmd, addr); | 1837 | offset = swp_offset(entry); |
1838 | if (frontswap && !frontswap_test(si, offset)) | ||
1839 | continue; | ||
1840 | |||
1841 | pte_unmap(pte); | ||
1842 | swap_map = &si->swap_map[offset]; | ||
1843 | vmf.vma = vma; | ||
1844 | vmf.address = addr; | ||
1845 | vmf.pmd = pmd; | ||
1846 | page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); | ||
1847 | if (!page) { | ||
1848 | if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) | ||
1849 | goto try_next; | ||
1850 | return -ENOMEM; | ||
1851 | } | ||
1852 | |||
1853 | lock_page(page); | ||
1854 | wait_on_page_writeback(page); | ||
1855 | ret = unuse_pte(vma, pmd, addr, entry, page); | ||
1856 | if (ret < 0) { | ||
1857 | unlock_page(page); | ||
1858 | put_page(page); | ||
1859 | goto out; | ||
1830 | } | 1860 | } |
1861 | |||
1862 | try_to_free_swap(page); | ||
1863 | unlock_page(page); | ||
1864 | put_page(page); | ||
1865 | |||
1866 | if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) { | ||
1867 | ret = FRONTSWAP_PAGES_UNUSED; | ||
1868 | goto out; | ||
1869 | } | ||
1870 | try_next: | ||
1871 | pte = pte_offset_map(pmd, addr); | ||
1831 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1872 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1832 | pte_unmap(pte - 1); | 1873 | pte_unmap(pte - 1); |
1874 | |||
1875 | ret = 0; | ||
1833 | out: | 1876 | out: |
1834 | return ret; | 1877 | return ret; |
1835 | } | 1878 | } |
1836 | 1879 | ||
1837 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 1880 | static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
1838 | unsigned long addr, unsigned long end, | 1881 | unsigned long addr, unsigned long end, |
1839 | swp_entry_t entry, struct page *page) | 1882 | unsigned int type, bool frontswap, |
1883 | unsigned long *fs_pages_to_unuse) | ||
1840 | { | 1884 | { |
1841 | pmd_t *pmd; | 1885 | pmd_t *pmd; |
1842 | unsigned long next; | 1886 | unsigned long next; |
@@ -1848,7 +1892,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
1848 | next = pmd_addr_end(addr, end); | 1892 | next = pmd_addr_end(addr, end); |
1849 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 1893 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
1850 | continue; | 1894 | continue; |
1851 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); | 1895 | ret = unuse_pte_range(vma, pmd, addr, next, type, |
1896 | frontswap, fs_pages_to_unuse); | ||
1852 | if (ret) | 1897 | if (ret) |
1853 | return ret; | 1898 | return ret; |
1854 | } while (pmd++, addr = next, addr != end); | 1899 | } while (pmd++, addr = next, addr != end); |
@@ -1857,7 +1902,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
1857 | 1902 | ||
1858 | static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, | 1903 | static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, |
1859 | unsigned long addr, unsigned long end, | 1904 | unsigned long addr, unsigned long end, |
1860 | swp_entry_t entry, struct page *page) | 1905 | unsigned int type, bool frontswap, |
1906 | unsigned long *fs_pages_to_unuse) | ||
1861 | { | 1907 | { |
1862 | pud_t *pud; | 1908 | pud_t *pud; |
1863 | unsigned long next; | 1909 | unsigned long next; |
@@ -1868,7 +1914,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, | |||
1868 | next = pud_addr_end(addr, end); | 1914 | next = pud_addr_end(addr, end); |
1869 | if (pud_none_or_clear_bad(pud)) | 1915 | if (pud_none_or_clear_bad(pud)) |
1870 | continue; | 1916 | continue; |
1871 | ret = unuse_pmd_range(vma, pud, addr, next, entry, page); | 1917 | ret = unuse_pmd_range(vma, pud, addr, next, type, |
1918 | frontswap, fs_pages_to_unuse); | ||
1872 | if (ret) | 1919 | if (ret) |
1873 | return ret; | 1920 | return ret; |
1874 | } while (pud++, addr = next, addr != end); | 1921 | } while (pud++, addr = next, addr != end); |
@@ -1877,7 +1924,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d, | |||
1877 | 1924 | ||
1878 | static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, | 1925 | static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, |
1879 | unsigned long addr, unsigned long end, | 1926 | unsigned long addr, unsigned long end, |
1880 | swp_entry_t entry, struct page *page) | 1927 | unsigned int type, bool frontswap, |
1928 | unsigned long *fs_pages_to_unuse) | ||
1881 | { | 1929 | { |
1882 | p4d_t *p4d; | 1930 | p4d_t *p4d; |
1883 | unsigned long next; | 1931 | unsigned long next; |
@@ -1888,78 +1936,66 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, | |||
1888 | next = p4d_addr_end(addr, end); | 1936 | next = p4d_addr_end(addr, end); |
1889 | if (p4d_none_or_clear_bad(p4d)) | 1937 | if (p4d_none_or_clear_bad(p4d)) |
1890 | continue; | 1938 | continue; |
1891 | ret = unuse_pud_range(vma, p4d, addr, next, entry, page); | 1939 | ret = unuse_pud_range(vma, p4d, addr, next, type, |
1940 | frontswap, fs_pages_to_unuse); | ||
1892 | if (ret) | 1941 | if (ret) |
1893 | return ret; | 1942 | return ret; |
1894 | } while (p4d++, addr = next, addr != end); | 1943 | } while (p4d++, addr = next, addr != end); |
1895 | return 0; | 1944 | return 0; |
1896 | } | 1945 | } |
1897 | 1946 | ||
1898 | static int unuse_vma(struct vm_area_struct *vma, | 1947 | static int unuse_vma(struct vm_area_struct *vma, unsigned int type, |
1899 | swp_entry_t entry, struct page *page) | 1948 | bool frontswap, unsigned long *fs_pages_to_unuse) |
1900 | { | 1949 | { |
1901 | pgd_t *pgd; | 1950 | pgd_t *pgd; |
1902 | unsigned long addr, end, next; | 1951 | unsigned long addr, end, next; |
1903 | int ret; | 1952 | int ret; |
1904 | 1953 | ||
1905 | if (page_anon_vma(page)) { | 1954 | addr = vma->vm_start; |
1906 | addr = page_address_in_vma(page, vma); | 1955 | end = vma->vm_end; |
1907 | if (addr == -EFAULT) | ||
1908 | return 0; | ||
1909 | else | ||
1910 | end = addr + PAGE_SIZE; | ||
1911 | } else { | ||
1912 | addr = vma->vm_start; | ||
1913 | end = vma->vm_end; | ||
1914 | } | ||
1915 | 1956 | ||
1916 | pgd = pgd_offset(vma->vm_mm, addr); | 1957 | pgd = pgd_offset(vma->vm_mm, addr); |
1917 | do { | 1958 | do { |
1918 | next = pgd_addr_end(addr, end); | 1959 | next = pgd_addr_end(addr, end); |
1919 | if (pgd_none_or_clear_bad(pgd)) | 1960 | if (pgd_none_or_clear_bad(pgd)) |
1920 | continue; | 1961 | continue; |
1921 | ret = unuse_p4d_range(vma, pgd, addr, next, entry, page); | 1962 | ret = unuse_p4d_range(vma, pgd, addr, next, type, |
1963 | frontswap, fs_pages_to_unuse); | ||
1922 | if (ret) | 1964 | if (ret) |
1923 | return ret; | 1965 | return ret; |
1924 | } while (pgd++, addr = next, addr != end); | 1966 | } while (pgd++, addr = next, addr != end); |
1925 | return 0; | 1967 | return 0; |
1926 | } | 1968 | } |
1927 | 1969 | ||
1928 | static int unuse_mm(struct mm_struct *mm, | 1970 | static int unuse_mm(struct mm_struct *mm, unsigned int type, |
1929 | swp_entry_t entry, struct page *page) | 1971 | bool frontswap, unsigned long *fs_pages_to_unuse) |
1930 | { | 1972 | { |
1931 | struct vm_area_struct *vma; | 1973 | struct vm_area_struct *vma; |
1932 | int ret = 0; | 1974 | int ret = 0; |
1933 | 1975 | ||
1934 | if (!down_read_trylock(&mm->mmap_sem)) { | 1976 | down_read(&mm->mmap_sem); |
1935 | /* | ||
1936 | * Activate page so shrink_inactive_list is unlikely to unmap | ||
1937 | * its ptes while lock is dropped, so swapoff can make progress. | ||
1938 | */ | ||
1939 | activate_page(page); | ||
1940 | unlock_page(page); | ||
1941 | down_read(&mm->mmap_sem); | ||
1942 | lock_page(page); | ||
1943 | } | ||
1944 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 1977 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
1945 | if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) | 1978 | if (vma->anon_vma) { |
1946 | break; | 1979 | ret = unuse_vma(vma, type, frontswap, |
1980 | fs_pages_to_unuse); | ||
1981 | if (ret) | ||
1982 | break; | ||
1983 | } | ||
1947 | cond_resched(); | 1984 | cond_resched(); |
1948 | } | 1985 | } |
1949 | up_read(&mm->mmap_sem); | 1986 | up_read(&mm->mmap_sem); |
1950 | return (ret < 0)? ret: 0; | 1987 | return ret; |
1951 | } | 1988 | } |
1952 | 1989 | ||
1953 | /* | 1990 | /* |
1954 | * Scan swap_map (or frontswap_map if frontswap parameter is true) | 1991 | * Scan swap_map (or frontswap_map if frontswap parameter is true) |
1955 | * from current position to next entry still in use. | 1992 | * from current position to next entry still in use. Return 0 |
1956 | * Recycle to start on reaching the end, returning 0 when empty. | 1993 | * if there are no in-use entries after prev till the end of the map. |
1957 | */ | 1994 | */ |
1958 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, | 1995 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, |
1959 | unsigned int prev, bool frontswap) | 1996 | unsigned int prev, bool frontswap) |
1960 | { | 1997 | { |
1961 | unsigned int max = si->max; | 1998 | unsigned int i; |
1962 | unsigned int i = prev; | ||
1963 | unsigned char count; | 1999 | unsigned char count; |
1964 | 2000 | ||
1965 | /* | 2001 | /* |
@@ -1968,20 +2004,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1968 | * hits are okay, and sys_swapoff() has already prevented new | 2004 | * hits are okay, and sys_swapoff() has already prevented new |
1969 | * allocations from this area (while holding swap_lock). | 2005 | * allocations from this area (while holding swap_lock). |
1970 | */ | 2006 | */ |
1971 | for (;;) { | 2007 | for (i = prev + 1; i < si->max; i++) { |
1972 | if (++i >= max) { | ||
1973 | if (!prev) { | ||
1974 | i = 0; | ||
1975 | break; | ||
1976 | } | ||
1977 | /* | ||
1978 | * No entries in use at top of swap_map, | ||
1979 | * loop back to start and recheck there. | ||
1980 | */ | ||
1981 | max = prev + 1; | ||
1982 | prev = 0; | ||
1983 | i = 1; | ||
1984 | } | ||
1985 | count = READ_ONCE(si->swap_map[i]); | 2008 | count = READ_ONCE(si->swap_map[i]); |
1986 | if (count && swap_count(count) != SWAP_MAP_BAD) | 2009 | if (count && swap_count(count) != SWAP_MAP_BAD) |
1987 | if (!frontswap || frontswap_test(si, i)) | 2010 | if (!frontswap || frontswap_test(si, i)) |
@@ -1989,240 +2012,121 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1989 | if ((i % LATENCY_LIMIT) == 0) | 2012 | if ((i % LATENCY_LIMIT) == 0) |
1990 | cond_resched(); | 2013 | cond_resched(); |
1991 | } | 2014 | } |
2015 | |||
2016 | if (i == si->max) | ||
2017 | i = 0; | ||
2018 | |||
1992 | return i; | 2019 | return i; |
1993 | } | 2020 | } |
1994 | 2021 | ||
1995 | /* | 2022 | /* |
1996 | * We completely avoid races by reading each swap page in advance, | 2023 | * If the boolean frontswap is true, only unuse pages_to_unuse pages; |
1997 | * and then search for the process using it. All the necessary | ||
1998 | * page table adjustments can then be made atomically. | ||
1999 | * | ||
2000 | * if the boolean frontswap is true, only unuse pages_to_unuse pages; | ||
2001 | * pages_to_unuse==0 means all pages; ignored if frontswap is false | 2024 | * pages_to_unuse==0 means all pages; ignored if frontswap is false |
2002 | */ | 2025 | */ |
2026 | #define SWAP_UNUSE_MAX_TRIES 3 | ||
2003 | int try_to_unuse(unsigned int type, bool frontswap, | 2027 | int try_to_unuse(unsigned int type, bool frontswap, |
2004 | unsigned long pages_to_unuse) | 2028 | unsigned long pages_to_unuse) |
2005 | { | 2029 | { |
2030 | struct mm_struct *prev_mm; | ||
2031 | struct mm_struct *mm; | ||
2032 | struct list_head *p; | ||
2033 | int retval = 0; | ||
2006 | struct swap_info_struct *si = swap_info[type]; | 2034 | struct swap_info_struct *si = swap_info[type]; |
2007 | struct mm_struct *start_mm; | ||
2008 | volatile unsigned char *swap_map; /* swap_map is accessed without | ||
2009 | * locking. Mark it as volatile | ||
2010 | * to prevent compiler doing | ||
2011 | * something odd. | ||
2012 | */ | ||
2013 | unsigned char swcount; | ||
2014 | struct page *page; | 2035 | struct page *page; |
2015 | swp_entry_t entry; | 2036 | swp_entry_t entry; |
2016 | unsigned int i = 0; | 2037 | unsigned int i; |
2017 | int retval = 0; | 2038 | int retries = 0; |
2018 | 2039 | ||
2019 | /* | 2040 | if (!si->inuse_pages) |
2020 | * When searching mms for an entry, a good strategy is to | 2041 | return 0; |
2021 | * start at the first mm we freed the previous entry from | ||
2022 | * (though actually we don't notice whether we or coincidence | ||
2023 | * freed the entry). Initialize this start_mm with a hold. | ||
2024 | * | ||
2025 | * A simpler strategy would be to start at the last mm we | ||
2026 | * freed the previous entry from; but that would take less | ||
2027 | * advantage of mmlist ordering, which clusters forked mms | ||
2028 | * together, child after parent. If we race with dup_mmap(), we | ||
2029 | * prefer to resolve parent before child, lest we miss entries | ||
2030 | * duplicated after we scanned child: using last mm would invert | ||
2031 | * that. | ||
2032 | */ | ||
2033 | start_mm = &init_mm; | ||
2034 | mmget(&init_mm); | ||
2035 | 2042 | ||
2036 | /* | 2043 | if (!frontswap) |
2037 | * Keep on scanning until all entries have gone. Usually, | 2044 | pages_to_unuse = 0; |
2038 | * one pass through swap_map is enough, but not necessarily: | 2045 | |
2039 | * there are races when an instance of an entry might be missed. | 2046 | retry: |
2040 | */ | 2047 | retval = shmem_unuse(type, frontswap, &pages_to_unuse); |
2041 | while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { | 2048 | if (retval) |
2049 | goto out; | ||
2050 | |||
2051 | prev_mm = &init_mm; | ||
2052 | mmget(prev_mm); | ||
2053 | |||
2054 | spin_lock(&mmlist_lock); | ||
2055 | p = &init_mm.mmlist; | ||
2056 | while ((p = p->next) != &init_mm.mmlist) { | ||
2042 | if (signal_pending(current)) { | 2057 | if (signal_pending(current)) { |
2043 | retval = -EINTR; | 2058 | retval = -EINTR; |
2044 | break; | 2059 | break; |
2045 | } | 2060 | } |
2046 | 2061 | ||
2047 | /* | 2062 | mm = list_entry(p, struct mm_struct, mmlist); |
2048 | * Get a page for the entry, using the existing swap | 2063 | if (!mmget_not_zero(mm)) |
2049 | * cache page if there is one. Otherwise, get a clean | 2064 | continue; |
2050 | * page and read the swap into it. | 2065 | spin_unlock(&mmlist_lock); |
2051 | */ | 2066 | mmput(prev_mm); |
2052 | swap_map = &si->swap_map[i]; | 2067 | prev_mm = mm; |
2053 | entry = swp_entry(type, i); | 2068 | retval = unuse_mm(mm, type, frontswap, &pages_to_unuse); |
2054 | page = read_swap_cache_async(entry, | ||
2055 | GFP_HIGHUSER_MOVABLE, NULL, 0, false); | ||
2056 | if (!page) { | ||
2057 | /* | ||
2058 | * Either swap_duplicate() failed because entry | ||
2059 | * has been freed independently, and will not be | ||
2060 | * reused since sys_swapoff() already disabled | ||
2061 | * allocation from here, or alloc_page() failed. | ||
2062 | */ | ||
2063 | swcount = *swap_map; | ||
2064 | /* | ||
2065 | * We don't hold lock here, so the swap entry could be | ||
2066 | * SWAP_MAP_BAD (when the cluster is discarding). | ||
2067 | * Instead of fail out, We can just skip the swap | ||
2068 | * entry because swapoff will wait for discarding | ||
2069 | * finish anyway. | ||
2070 | */ | ||
2071 | if (!swcount || swcount == SWAP_MAP_BAD) | ||
2072 | continue; | ||
2073 | retval = -ENOMEM; | ||
2074 | break; | ||
2075 | } | ||
2076 | 2069 | ||
2077 | /* | 2070 | if (retval) { |
2078 | * Don't hold on to start_mm if it looks like exiting. | 2071 | mmput(prev_mm); |
2079 | */ | 2072 | goto out; |
2080 | if (atomic_read(&start_mm->mm_users) == 1) { | ||
2081 | mmput(start_mm); | ||
2082 | start_mm = &init_mm; | ||
2083 | mmget(&init_mm); | ||
2084 | } | 2073 | } |
2085 | 2074 | ||
2086 | /* | 2075 | /* |
2087 | * Wait for and lock page. When do_swap_page races with | 2076 | * Make sure that we aren't completely killing |
2088 | * try_to_unuse, do_swap_page can handle the fault much | 2077 | * interactive performance. |
2089 | * faster than try_to_unuse can locate the entry. This | ||
2090 | * apparently redundant "wait_on_page_locked" lets try_to_unuse | ||
2091 | * defer to do_swap_page in such a case - in some tests, | ||
2092 | * do_swap_page and try_to_unuse repeatedly compete. | ||
2093 | */ | ||
2094 | wait_on_page_locked(page); | ||
2095 | wait_on_page_writeback(page); | ||
2096 | lock_page(page); | ||
2097 | wait_on_page_writeback(page); | ||
2098 | |||
2099 | /* | ||
2100 | * Remove all references to entry. | ||
2101 | */ | 2078 | */ |
2102 | swcount = *swap_map; | 2079 | cond_resched(); |
2103 | if (swap_count(swcount) == SWAP_MAP_SHMEM) { | 2080 | spin_lock(&mmlist_lock); |
2104 | retval = shmem_unuse(entry, page); | 2081 | } |
2105 | /* page has already been unlocked and released */ | 2082 | spin_unlock(&mmlist_lock); |
2106 | if (retval < 0) | ||
2107 | break; | ||
2108 | continue; | ||
2109 | } | ||
2110 | if (swap_count(swcount) && start_mm != &init_mm) | ||
2111 | retval = unuse_mm(start_mm, entry, page); | ||
2112 | |||
2113 | if (swap_count(*swap_map)) { | ||
2114 | int set_start_mm = (*swap_map >= swcount); | ||
2115 | struct list_head *p = &start_mm->mmlist; | ||
2116 | struct mm_struct *new_start_mm = start_mm; | ||
2117 | struct mm_struct *prev_mm = start_mm; | ||
2118 | struct mm_struct *mm; | ||
2119 | |||
2120 | mmget(new_start_mm); | ||
2121 | mmget(prev_mm); | ||
2122 | spin_lock(&mmlist_lock); | ||
2123 | while (swap_count(*swap_map) && !retval && | ||
2124 | (p = p->next) != &start_mm->mmlist) { | ||
2125 | mm = list_entry(p, struct mm_struct, mmlist); | ||
2126 | if (!mmget_not_zero(mm)) | ||
2127 | continue; | ||
2128 | spin_unlock(&mmlist_lock); | ||
2129 | mmput(prev_mm); | ||
2130 | prev_mm = mm; | ||
2131 | 2083 | ||
2132 | cond_resched(); | 2084 | mmput(prev_mm); |
2133 | 2085 | ||
2134 | swcount = *swap_map; | 2086 | i = 0; |
2135 | if (!swap_count(swcount)) /* any usage ? */ | 2087 | while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { |
2136 | ; | ||
2137 | else if (mm == &init_mm) | ||
2138 | set_start_mm = 1; | ||
2139 | else | ||
2140 | retval = unuse_mm(mm, entry, page); | ||
2141 | |||
2142 | if (set_start_mm && *swap_map < swcount) { | ||
2143 | mmput(new_start_mm); | ||
2144 | mmget(mm); | ||
2145 | new_start_mm = mm; | ||
2146 | set_start_mm = 0; | ||
2147 | } | ||
2148 | spin_lock(&mmlist_lock); | ||
2149 | } | ||
2150 | spin_unlock(&mmlist_lock); | ||
2151 | mmput(prev_mm); | ||
2152 | mmput(start_mm); | ||
2153 | start_mm = new_start_mm; | ||
2154 | } | ||
2155 | if (retval) { | ||
2156 | unlock_page(page); | ||
2157 | put_page(page); | ||
2158 | break; | ||
2159 | } | ||
2160 | 2088 | ||
2161 | /* | 2089 | entry = swp_entry(type, i); |
2162 | * If a reference remains (rare), we would like to leave | 2090 | page = find_get_page(swap_address_space(entry), i); |
2163 | * the page in the swap cache; but try_to_unmap could | 2091 | if (!page) |
2164 | * then re-duplicate the entry once we drop page lock, | 2092 | continue; |
2165 | * so we might loop indefinitely; also, that page could | ||
2166 | * not be swapped out to other storage meanwhile. So: | ||
2167 | * delete from cache even if there's another reference, | ||
2168 | * after ensuring that the data has been saved to disk - | ||
2169 | * since if the reference remains (rarer), it will be | ||
2170 | * read from disk into another page. Splitting into two | ||
2171 | * pages would be incorrect if swap supported "shared | ||
2172 | * private" pages, but they are handled by tmpfs files. | ||
2173 | * | ||
2174 | * Given how unuse_vma() targets one particular offset | ||
2175 | * in an anon_vma, once the anon_vma has been determined, | ||
2176 | * this splitting happens to be just what is needed to | ||
2177 | * handle where KSM pages have been swapped out: re-reading | ||
2178 | * is unnecessarily slow, but we can fix that later on. | ||
2179 | */ | ||
2180 | if (swap_count(*swap_map) && | ||
2181 | PageDirty(page) && PageSwapCache(page)) { | ||
2182 | struct writeback_control wbc = { | ||
2183 | .sync_mode = WB_SYNC_NONE, | ||
2184 | }; | ||
2185 | |||
2186 | swap_writepage(compound_head(page), &wbc); | ||
2187 | lock_page(page); | ||
2188 | wait_on_page_writeback(page); | ||
2189 | } | ||
2190 | 2093 | ||
2191 | /* | 2094 | /* |
2192 | * It is conceivable that a racing task removed this page from | 2095 | * It is conceivable that a racing task removed this page from |
2193 | * swap cache just before we acquired the page lock at the top, | 2096 | * swap cache just before we acquired the page lock. The page |
2194 | * or while we dropped it in unuse_mm(). The page might even | 2097 | * might even be back in swap cache on another swap area. But |
2195 | * be back in swap cache on another swap area: that we must not | 2098 | * that is okay, try_to_free_swap() only removes stale pages. |
2196 | * delete, since it may not have been written out to swap yet. | ||
2197 | */ | ||
2198 | if (PageSwapCache(page) && | ||
2199 | likely(page_private(page) == entry.val) && | ||
2200 | (!PageTransCompound(page) || | ||
2201 | !swap_page_trans_huge_swapped(si, entry))) | ||
2202 | delete_from_swap_cache(compound_head(page)); | ||
2203 | |||
2204 | /* | ||
2205 | * So we could skip searching mms once swap count went | ||
2206 | * to 1, we did not mark any present ptes as dirty: must | ||
2207 | * mark page dirty so shrink_page_list will preserve it. | ||
2208 | */ | 2099 | */ |
2209 | SetPageDirty(page); | 2100 | lock_page(page); |
2101 | wait_on_page_writeback(page); | ||
2102 | try_to_free_swap(page); | ||
2210 | unlock_page(page); | 2103 | unlock_page(page); |
2211 | put_page(page); | 2104 | put_page(page); |
2212 | 2105 | ||
2213 | /* | 2106 | /* |
2214 | * Make sure that we aren't completely killing | 2107 | * For frontswap, we just need to unuse pages_to_unuse, if |
2215 | * interactive performance. | 2108 | * it was specified. Need not check frontswap again here as |
2109 | * we already zeroed out pages_to_unuse if not frontswap. | ||
2216 | */ | 2110 | */ |
2217 | cond_resched(); | 2111 | if (pages_to_unuse && --pages_to_unuse == 0) |
2218 | if (frontswap && pages_to_unuse > 0) { | 2112 | goto out; |
2219 | if (!--pages_to_unuse) | ||
2220 | break; | ||
2221 | } | ||
2222 | } | 2113 | } |
2223 | 2114 | ||
2224 | mmput(start_mm); | 2115 | /* |
2225 | return retval; | 2116 | * Let's check again to see if there are still swap entries in the map. |
2117 | * If yes, we need to retry the unuse logic again. | ||
2118 | * Under global memory pressure, swap entries can be reinserted back | ||
2119 | * into process space after the mmlist loop above passes over them. | ||
2120 | * It's not worth continuously retrying to unuse the swap in this case. | ||
2121 | * So we try SWAP_UNUSE_MAX_TRIES times. | ||
2122 | */ | ||
2123 | if (++retries >= SWAP_UNUSE_MAX_TRIES) | ||
2124 | retval = -EBUSY; | ||
2125 | else if (si->inuse_pages) | ||
2126 | goto retry; | ||
2127 | |||
2128 | out: | ||
2129 | return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval; | ||
2226 | } | 2130 | } |
2227 | 2131 | ||
2228 | /* | 2132 | /* |
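The removed start_mm machinery is interleaved with the new code above, which makes the rewritten control flow hard to read from the hunk alone. A simplified outline of the new try_to_unuse() follows, with locking, reference counting and error handling elided and the mmlist iteration shown as pseudocode:

/* Simplified outline only, not the verbatim function. */
retry:
	shmem_unuse(type, frontswap, &pages_to_unuse);	/* shmem/tmpfs mappings first */

	for_each_mm_on_mmlist(mm)			/* pseudo-iterator over init_mm.mmlist */
		unuse_mm(mm, type, frontswap, &pages_to_unuse);

	/* Drain whatever is still sitting in the swap cache. */
	i = 0;
	while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
		page = find_get_page(swap_address_space(swp_entry(type, i)), i);
		if (page)
			try_to_free_swap(page);	/* after lock_page()/wait_on_page_writeback() */
	}

	if (si->inuse_pages && ++retries < SWAP_UNUSE_MAX_TRIES)
		goto retry;	/* entries can be re-added under memory pressure */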
@@ -2258,7 +2162,7 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) | |||
2258 | struct swap_extent *se; | 2162 | struct swap_extent *se; |
2259 | pgoff_t offset; | 2163 | pgoff_t offset; |
2260 | 2164 | ||
2261 | sis = swap_info[swp_type(entry)]; | 2165 | sis = swp_swap_info(entry); |
2262 | *bdev = sis->bdev; | 2166 | *bdev = sis->bdev; |
2263 | 2167 | ||
2264 | offset = swp_offset(entry); | 2168 | offset = swp_offset(entry); |
@@ -2700,9 +2604,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
2700 | if (!l) | 2604 | if (!l) |
2701 | return SEQ_START_TOKEN; | 2605 | return SEQ_START_TOKEN; |
2702 | 2606 | ||
2703 | for (type = 0; type < nr_swapfiles; type++) { | 2607 | for (type = 0; (si = swap_type_to_swap_info(type)); type++) { |
2704 | smp_rmb(); /* read nr_swapfiles before swap_info[type] */ | ||
2705 | si = swap_info[type]; | ||
2706 | if (!(si->flags & SWP_USED) || !si->swap_map) | 2608 | if (!(si->flags & SWP_USED) || !si->swap_map) |
2707 | continue; | 2609 | continue; |
2708 | if (!--l) | 2610 | if (!--l) |
@@ -2722,9 +2624,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) | |||
2722 | else | 2624 | else |
2723 | type = si->type + 1; | 2625 | type = si->type + 1; |
2724 | 2626 | ||
2725 | for (; type < nr_swapfiles; type++) { | 2627 | for (; (si = swap_type_to_swap_info(type)); type++) { |
2726 | smp_rmb(); /* read nr_swapfiles before swap_info[type] */ | ||
2727 | si = swap_info[type]; | ||
2728 | if (!(si->flags & SWP_USED) || !si->swap_map) | 2628 | if (!(si->flags & SWP_USED) || !si->swap_map) |
2729 | continue; | 2629 | continue; |
2730 | ++*pos; | 2630 | ++*pos; |
@@ -2813,9 +2713,8 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
2813 | struct swap_info_struct *p; | 2713 | struct swap_info_struct *p; |
2814 | unsigned int type; | 2714 | unsigned int type; |
2815 | int i; | 2715 | int i; |
2816 | int size = sizeof(*p) + nr_node_ids * sizeof(struct plist_node); | ||
2817 | 2716 | ||
2818 | p = kvzalloc(size, GFP_KERNEL); | 2717 | p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL); |
2819 | if (!p) | 2718 | if (!p) |
2820 | return ERR_PTR(-ENOMEM); | 2719 | return ERR_PTR(-ENOMEM); |
2821 | 2720 | ||
@@ -2831,14 +2730,14 @@ static struct swap_info_struct *alloc_swap_info(void) | |||
2831 | } | 2730 | } |
2832 | if (type >= nr_swapfiles) { | 2731 | if (type >= nr_swapfiles) { |
2833 | p->type = type; | 2732 | p->type = type; |
2834 | swap_info[type] = p; | 2733 | WRITE_ONCE(swap_info[type], p); |
2835 | /* | 2734 | /* |
2836 | * Write swap_info[type] before nr_swapfiles, in case a | 2735 | * Write swap_info[type] before nr_swapfiles, in case a |
2837 | * racing procfs swap_start() or swap_next() is reading them. | 2736 | * racing procfs swap_start() or swap_next() is reading them. |
2838 | * (We never shrink nr_swapfiles, we never free this entry.) | 2737 | * (We never shrink nr_swapfiles, we never free this entry.) |
2839 | */ | 2738 | */ |
2840 | smp_wmb(); | 2739 | smp_wmb(); |
2841 | nr_swapfiles++; | 2740 | WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1); |
2842 | } else { | 2741 | } else { |
2843 | kvfree(p); | 2742 | kvfree(p); |
2844 | p = swap_info[type]; | 2743 | p = swap_info[type]; |
@@ -3358,7 +3257,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) | |||
3358 | { | 3257 | { |
3359 | struct swap_info_struct *p; | 3258 | struct swap_info_struct *p; |
3360 | struct swap_cluster_info *ci; | 3259 | struct swap_cluster_info *ci; |
3361 | unsigned long offset, type; | 3260 | unsigned long offset; |
3362 | unsigned char count; | 3261 | unsigned char count; |
3363 | unsigned char has_cache; | 3262 | unsigned char has_cache; |
3364 | int err = -EINVAL; | 3263 | int err = -EINVAL; |
@@ -3366,10 +3265,10 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) | |||
3366 | if (non_swap_entry(entry)) | 3265 | if (non_swap_entry(entry)) |
3367 | goto out; | 3266 | goto out; |
3368 | 3267 | ||
3369 | type = swp_type(entry); | 3268 | p = swp_swap_info(entry); |
3370 | if (type >= nr_swapfiles) | 3269 | if (!p) |
3371 | goto bad_file; | 3270 | goto bad_file; |
3372 | p = swap_info[type]; | 3271 | |
3373 | offset = swp_offset(entry); | 3272 | offset = swp_offset(entry); |
3374 | if (unlikely(offset >= p->max)) | 3273 | if (unlikely(offset >= p->max)) |
3375 | goto out; | 3274 | goto out; |
@@ -3466,7 +3365,7 @@ int swapcache_prepare(swp_entry_t entry) | |||
3466 | 3365 | ||
3467 | struct swap_info_struct *swp_swap_info(swp_entry_t entry) | 3366 | struct swap_info_struct *swp_swap_info(swp_entry_t entry) |
3468 | { | 3367 | { |
3469 | return swap_info[swp_type(entry)]; | 3368 | return swap_type_to_swap_info(swp_type(entry)); |
3470 | } | 3369 | } |
3471 | 3370 | ||
3472 | struct swap_info_struct *page_swap_info(struct page *page) | 3371 | struct swap_info_struct *page_swap_info(struct page *page) |
diff --git a/mm/truncate.c b/mm/truncate.c index 798e7ccfb030..b7d3c99f00c9 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -539,6 +539,8 @@ EXPORT_SYMBOL(truncate_inode_pages_final); | |||
539 | * invalidate_mapping_pages() will not block on IO activity. It will not | 539 | * invalidate_mapping_pages() will not block on IO activity. It will not |
540 | * invalidate pages which are dirty, locked, under writeback or mapped into | 540 | * invalidate pages which are dirty, locked, under writeback or mapped into |
541 | * pagetables. | 541 | * pagetables. |
542 | * | ||
543 | * Return: the number of pages that were invalidated | ||
542 | */ | 544 | */ |
543 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | 545 | unsigned long invalidate_mapping_pages(struct address_space *mapping, |
544 | pgoff_t start, pgoff_t end) | 546 | pgoff_t start, pgoff_t end) |
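A hypothetical caller making use of the newly documented return value (the inode pointer is illustrative only):

/* Illustrative only: drop clean, unmapped page cache for an inode. */
unsigned long invalidated;

invalidated = invalidate_mapping_pages(inode->i_mapping, 0, -1);
pr_debug("invalidated %lu page cache pages\n", invalidated);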
@@ -664,7 +666,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) | |||
664 | * Any pages which are found to be mapped into pagetables are unmapped prior to | 666 | * Any pages which are found to be mapped into pagetables are unmapped prior to |
665 | * invalidation. | 667 | * invalidation. |
666 | * | 668 | * |
667 | * Returns -EBUSY if any pages could not be invalidated. | 669 | * Return: -EBUSY if any pages could not be invalidated. |
668 | */ | 670 | */ |
669 | int invalidate_inode_pages2_range(struct address_space *mapping, | 671 | int invalidate_inode_pages2_range(struct address_space *mapping, |
670 | pgoff_t start, pgoff_t end) | 672 | pgoff_t start, pgoff_t end) |
@@ -761,7 +763,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); | |||
761 | * Any pages which are found to be mapped into pagetables are unmapped prior to | 763 | * Any pages which are found to be mapped into pagetables are unmapped prior to |
762 | * invalidation. | 764 | * invalidation. |
763 | * | 765 | * |
764 | * Returns -EBUSY if any pages could not be invalidated. | 766 | * Return: -EBUSY if any pages could not be invalidated. |
765 | */ | 767 | */ |
766 | int invalidate_inode_pages2(struct address_space *mapping) | 768 | int invalidate_inode_pages2(struct address_space *mapping) |
767 | { | 769 | { |
@@ -36,6 +36,8 @@ EXPORT_SYMBOL(kfree_const); | |||
36 | * kstrdup - allocate space for and copy an existing string | 36 | * kstrdup - allocate space for and copy an existing string |
37 | * @s: the string to duplicate | 37 | * @s: the string to duplicate |
38 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory | 38 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory |
39 | * | ||
40 | * Return: newly allocated copy of @s or %NULL in case of error | ||
39 | */ | 41 | */ |
40 | char *kstrdup(const char *s, gfp_t gfp) | 42 | char *kstrdup(const char *s, gfp_t gfp) |
41 | { | 43 | { |
@@ -58,9 +60,10 @@ EXPORT_SYMBOL(kstrdup); | |||
58 | * @s: the string to duplicate | 60 | * @s: the string to duplicate |
59 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory | 61 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory |
60 | * | 62 | * |
61 | * Function returns source string if it is in .rodata section otherwise it | 63 | * Note: Strings allocated by kstrdup_const should be freed by kfree_const. |
62 | * fallbacks to kstrdup. | 64 | * |
63 | * Strings allocated by kstrdup_const should be freed by kfree_const. | 65 | * Return: source string if it is in .rodata section otherwise |
66 | * fall back to kstrdup. | ||
64 | */ | 67 | */ |
65 | const char *kstrdup_const(const char *s, gfp_t gfp) | 68 | const char *kstrdup_const(const char *s, gfp_t gfp) |
66 | { | 69 | { |
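As the new Note: line says, kstrdup_const() pairs with kfree_const(); a short usage sketch, where src_name is a hypothetical caller-supplied string:

/* Illustrative only: duplicate a name that may already live in .rodata. */
const char *name = kstrdup_const(src_name, GFP_KERNEL);
if (!name)
	return -ENOMEM;
/* ... use name ... */
kfree_const(name);	/* skips the kfree() if the string was in .rodata */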
@@ -78,6 +81,8 @@ EXPORT_SYMBOL(kstrdup_const); | |||
78 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory | 81 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory |
79 | * | 82 | * |
80 | * Note: Use kmemdup_nul() instead if the size is known exactly. | 83 | * Note: Use kmemdup_nul() instead if the size is known exactly. |
84 | * | ||
85 | * Return: newly allocated copy of @s or %NULL in case of error | ||
81 | */ | 86 | */ |
82 | char *kstrndup(const char *s, size_t max, gfp_t gfp) | 87 | char *kstrndup(const char *s, size_t max, gfp_t gfp) |
83 | { | 88 | { |
@@ -103,6 +108,8 @@ EXPORT_SYMBOL(kstrndup); | |||
103 | * @src: memory region to duplicate | 108 | * @src: memory region to duplicate |
104 | * @len: memory region length | 109 | * @len: memory region length |
105 | * @gfp: GFP mask to use | 110 | * @gfp: GFP mask to use |
111 | * | ||
112 | * Return: newly allocated copy of @src or %NULL in case of error | ||
106 | */ | 113 | */ |
107 | void *kmemdup(const void *src, size_t len, gfp_t gfp) | 114 | void *kmemdup(const void *src, size_t len, gfp_t gfp) |
108 | { | 115 | { |
@@ -120,6 +127,9 @@ EXPORT_SYMBOL(kmemdup); | |||
120 | * @s: The data to stringify | 127 | * @s: The data to stringify |
121 | * @len: The size of the data | 128 | * @len: The size of the data |
122 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory | 129 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory |
130 | * | ||
131 | * Return: newly allocated copy of @s with NUL-termination or %NULL in | ||
132 | * case of error | ||
123 | */ | 133 | */ |
124 | char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) | 134 | char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) |
125 | { | 135 | { |
@@ -143,7 +153,7 @@ EXPORT_SYMBOL(kmemdup_nul); | |||
143 | * @src: source address in user space | 153 | * @src: source address in user space |
144 | * @len: number of bytes to copy | 154 | * @len: number of bytes to copy |
145 | * | 155 | * |
146 | * Returns an ERR_PTR() on failure. Result is physically | 156 | * Return: an ERR_PTR() on failure. Result is physically |
147 | * contiguous, to be freed by kfree(). | 157 | * contiguous, to be freed by kfree(). |
148 | */ | 158 | */ |
149 | void *memdup_user(const void __user *src, size_t len) | 159 | void *memdup_user(const void __user *src, size_t len) |
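Since the reworded kernel-doc stresses that memdup_user() returns an ERR_PTR() on failure and a physically contiguous buffer on success, a hedged usage sketch (ubuf and len are hypothetical caller arguments):

/* Illustrative only: copy a user-space buffer into the kernel. */
void *buf = memdup_user(ubuf, len);
if (IS_ERR(buf))
	return PTR_ERR(buf);	/* typically -EFAULT or -ENOMEM */
/* ... use buf ... */
kfree(buf);			/* physically contiguous, so plain kfree() */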
@@ -169,7 +179,7 @@ EXPORT_SYMBOL(memdup_user); | |||
169 | * @src: source address in user space | 179 | * @src: source address in user space |
170 | * @len: number of bytes to copy | 180 | * @len: number of bytes to copy |
171 | * | 181 | * |
172 | * Returns an ERR_PTR() on failure. Result may be not | 182 | * Return: an ERR_PTR() on failure. Result may be not |
173 | * physically contiguous. Use kvfree() to free. | 183 | * physically contiguous. Use kvfree() to free. |
174 | */ | 184 | */ |
175 | void *vmemdup_user(const void __user *src, size_t len) | 185 | void *vmemdup_user(const void __user *src, size_t len) |
@@ -193,6 +203,8 @@ EXPORT_SYMBOL(vmemdup_user); | |||
193 | * strndup_user - duplicate an existing string from user space | 203 | * strndup_user - duplicate an existing string from user space |
194 | * @s: The string to duplicate | 204 | * @s: The string to duplicate |
195 | * @n: Maximum number of bytes to copy, including the trailing NUL. | 205 | * @n: Maximum number of bytes to copy, including the trailing NUL. |
206 | * | ||
207 | * Return: newly allocated copy of @s or %NULL in case of error | ||
196 | */ | 208 | */ |
197 | char *strndup_user(const char __user *s, long n) | 209 | char *strndup_user(const char __user *s, long n) |
198 | { | 210 | { |
@@ -224,7 +236,7 @@ EXPORT_SYMBOL(strndup_user); | |||
224 | * @src: source address in user space | 236 | * @src: source address in user space |
225 | * @len: number of bytes to copy | 237 | * @len: number of bytes to copy |
226 | * | 238 | * |
227 | * Returns an ERR_PTR() on failure. | 239 | * Return: an ERR_PTR() on failure. |
228 | */ | 240 | */ |
229 | void *memdup_user_nul(const void __user *src, size_t len) | 241 | void *memdup_user_nul(const void __user *src, size_t len) |
230 | { | 242 | { |
@@ -310,10 +322,6 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); | |||
310 | * @pages: array that receives pointers to the pages pinned. | 322 | * @pages: array that receives pointers to the pages pinned. |
311 | * Should be at least nr_pages long. | 323 | * Should be at least nr_pages long. |
312 | * | 324 | * |
313 | * Returns number of pages pinned. This may be fewer than the number | ||
314 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
315 | * were pinned, returns -errno. | ||
316 | * | ||
317 | * get_user_pages_fast provides equivalent functionality to get_user_pages, | 325 | * get_user_pages_fast provides equivalent functionality to get_user_pages, |
318 | * operating on current and current->mm, with force=0 and vma=NULL. However | 326 | * operating on current and current->mm, with force=0 and vma=NULL. However |
319 | * unlike get_user_pages, it must be called without mmap_sem held. | 327 | * unlike get_user_pages, it must be called without mmap_sem held. |
@@ -325,6 +333,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); | |||
325 | * pages have to be faulted in, it may turn out to be slightly slower so | 333 | * pages have to be faulted in, it may turn out to be slightly slower so |
326 | * callers need to carefully consider what to use. On many architectures, | 334 | * callers need to carefully consider what to use. On many architectures, |
327 | * get_user_pages_fast simply falls back to get_user_pages. | 335 | * get_user_pages_fast simply falls back to get_user_pages. |
336 | * | ||
337 | * Return: number of pages pinned. This may be fewer than the number | ||
338 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
339 | * were pinned, returns -errno. | ||
328 | */ | 340 | */ |
329 | int __weak get_user_pages_fast(unsigned long start, | 341 | int __weak get_user_pages_fast(unsigned long start, |
330 | int nr_pages, int write, struct page **pages) | 342 | int nr_pages, int write, struct page **pages) |
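The relocated Return: paragraph describes the partial-pinning semantics; a hedged sketch of a caller honouring them with the 4-argument signature shown above (uaddr is a hypothetical user address):

/* Illustrative only: pin up to 16 user pages for writing, then release them. */
struct page *pages[16];
int i, pinned;

pinned = get_user_pages_fast(uaddr, 16, 1 /* write */, pages);
if (pinned < 0)
	return pinned;		/* no pages pinned: -errno */

/* ... may be fewer than 16; use pages[0..pinned-1] ... */

for (i = 0; i < pinned; i++)
	put_page(pages[i]);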
@@ -386,6 +398,8 @@ EXPORT_SYMBOL(vm_mmap); | |||
386 | * | 398 | * |
387 | * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not | 399 | * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not |
388 | * fall back to vmalloc. | 400 | * fall back to vmalloc. |
401 | * | ||
402 | * Return: pointer to the allocated memory or %NULL in case of failure |
389 | */ | 403 | */ |
390 | void *kvmalloc_node(size_t size, gfp_t flags, int node) | 404 | void *kvmalloc_node(size_t size, gfp_t flags, int node) |
391 | { | 405 | { |
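kvmalloc_node() and its wrappers return %NULL on failure and may be backed by either kmalloc or vmalloc; a hedged usage sketch, where struct entry and nr_entries are hypothetical:

/* Illustrative only: a table that may be large enough to need vmalloc. */
struct entry *table;

table = kvmalloc_array(nr_entries, sizeof(*table), GFP_KERNEL | __GFP_ZERO);
if (!table)
	return -ENOMEM;
/* ... */
kvfree(table);		/* correct for both kmalloc- and vmalloc-backed memory */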
@@ -729,7 +743,8 @@ error: | |||
729 | * @buffer: the buffer to copy to. | 743 | * @buffer: the buffer to copy to. |
730 | * @buflen: the length of the buffer. Larger cmdline values are truncated | 744 | * @buflen: the length of the buffer. Larger cmdline values are truncated |
731 | * to this length. | 745 | * to this length. |
732 | * Returns the size of the cmdline field copied. Note that the copy does | 746 | * |
747 | * Return: the size of the cmdline field copied. Note that the copy does | ||
733 | * not guarantee an ending NULL byte. | 748 | * not guarantee an ending NULL byte. |
734 | */ | 749 | */ |
735 | int get_cmdline(struct task_struct *task, char *buffer, int buflen) | 750 | int get_cmdline(struct task_struct *task, char *buffer, int buflen) |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 871e41c55e23..e86ba6e74b50 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -498,7 +498,11 @@ nocache: | |||
498 | } | 498 | } |
499 | 499 | ||
500 | found: | 500 | found: |
501 | if (addr + size > vend) | 501 | /* |
502 | * Also check the calculated address against vstart, | ||
503 | * because it can be 0 due to a big align request. | ||
504 | */ | ||
505 | if (addr + size > vend || addr < vstart) | ||
502 | goto overflow; | 506 | goto overflow; |
503 | 507 | ||
504 | va->va_start = addr; | 508 | va->va_start = addr; |
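The new addr < vstart test catches the case the added comment describes: with a sufficiently large @align the candidate address computation can wrap around the top of the address space and come back as 0, or at least below @vstart. A hypothetical 64-bit illustration of the wrap, using made-up values:

/* Hypothetical values, for illustration only. */
unsigned long base  = 0xfffffffffffff000UL;	/* candidate near the top of the space */
unsigned long align = 1UL << 32;		/* unusually large alignment request */
unsigned long addr  = ALIGN(base, align);	/* wraps: result is 0, i.e. addr < vstart */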
@@ -840,7 +844,7 @@ static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) | |||
840 | * @order: how many 2^order pages should be occupied in newly allocated block | 844 | * @order: how many 2^order pages should be occupied in newly allocated block |
841 | * @gfp_mask: flags for the page level allocator | 845 | * @gfp_mask: flags for the page level allocator |
842 | * | 846 | * |
843 | * Returns: virtual address in a newly allocated block or ERR_PTR(-errno) | 847 | * Return: virtual address in a newly allocated block or ERR_PTR(-errno) |
844 | */ | 848 | */ |
845 | static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) | 849 | static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) |
846 | { | 850 | { |
@@ -1187,6 +1191,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro | |||
1187 | EXPORT_SYMBOL(vm_map_ram); | 1191 | EXPORT_SYMBOL(vm_map_ram); |
1188 | 1192 | ||
1189 | static struct vm_struct *vmlist __initdata; | 1193 | static struct vm_struct *vmlist __initdata; |
1194 | |||
1190 | /** | 1195 | /** |
1191 | * vm_area_add_early - add vmap area early during boot | 1196 | * vm_area_add_early - add vmap area early during boot |
1192 | * @vm: vm_struct to add | 1197 | * @vm: vm_struct to add |
@@ -1421,13 +1426,15 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | |||
1421 | } | 1426 | } |
1422 | 1427 | ||
1423 | /** | 1428 | /** |
1424 | * get_vm_area - reserve a contiguous kernel virtual area | 1429 | * get_vm_area - reserve a contiguous kernel virtual area |
1425 | * @size: size of the area | 1430 | * @size: size of the area |
1426 | * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC | 1431 | * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC |
1427 | * | 1432 | * |
1428 | * Search an area of @size in the kernel virtual mapping area, | 1433 | * Search an area of @size in the kernel virtual mapping area, |
1429 | * and reserved it for out purposes. Returns the area descriptor | 1434 | * and reserved it for out purposes. Returns the area descriptor |
1430 | * on success or %NULL on failure. | 1435 | * on success or %NULL on failure. |
1436 | * | ||
1437 | * Return: the area descriptor on success or %NULL on failure. | ||
1431 | */ | 1438 | */ |
1432 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) | 1439 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) |
1433 | { | 1440 | { |
@@ -1444,12 +1451,14 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, | |||
1444 | } | 1451 | } |
1445 | 1452 | ||
1446 | /** | 1453 | /** |
1447 | * find_vm_area - find a continuous kernel virtual area | 1454 | * find_vm_area - find a continuous kernel virtual area |
1448 | * @addr: base address | 1455 | * @addr: base address |
1456 | * | ||
1457 | * Search for the kernel VM area starting at @addr, and return it. | ||
1458 | * It is up to the caller to do all required locking to keep the returned | ||
1459 | * pointer valid. | ||
1449 | * | 1460 | * |
1450 | * Search for the kernel VM area starting at @addr, and return it. | 1461 | * Return: pointer to the found area or %NULL on failure |
1451 | * It is up to the caller to do all required locking to keep the returned | ||
1452 | * pointer valid. | ||
1453 | */ | 1462 | */ |
1454 | struct vm_struct *find_vm_area(const void *addr) | 1463 | struct vm_struct *find_vm_area(const void *addr) |
1455 | { | 1464 | { |
@@ -1463,12 +1472,14 @@ struct vm_struct *find_vm_area(const void *addr) | |||
1463 | } | 1472 | } |
1464 | 1473 | ||
1465 | /** | 1474 | /** |
1466 | * remove_vm_area - find and remove a continuous kernel virtual area | 1475 | * remove_vm_area - find and remove a continuous kernel virtual area |
1467 | * @addr: base address | 1476 | * @addr: base address |
1468 | * | 1477 | * |
1469 | * Search for the kernel VM area starting at @addr, and remove it. | 1478 | * Search for the kernel VM area starting at @addr, and remove it. |
1470 | * This function returns the found VM area, but using it is NOT safe | 1479 | * This function returns the found VM area, but using it is NOT safe |
1471 | * on SMP machines, except for its size or flags. | 1480 | * on SMP machines, except for its size or flags. |
1481 | * | ||
1482 | * Return: pointer to the found area or %NULL on failure | ||
1472 | */ | 1483 | */ |
1473 | struct vm_struct *remove_vm_area(const void *addr) | 1484 | struct vm_struct *remove_vm_area(const void *addr) |
1474 | { | 1485 | { |
@@ -1505,7 +1516,7 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
1505 | addr)) | 1516 | addr)) |
1506 | return; | 1517 | return; |
1507 | 1518 | ||
1508 | area = find_vmap_area((unsigned long)addr)->vm; | 1519 | area = find_vm_area(addr); |
1509 | if (unlikely(!area)) { | 1520 | if (unlikely(!area)) { |
1510 | WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", | 1521 | WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", |
1511 | addr); | 1522 | addr); |
@@ -1548,11 +1559,11 @@ static inline void __vfree_deferred(const void *addr) | |||
1548 | } | 1559 | } |
1549 | 1560 | ||
1550 | /** | 1561 | /** |
1551 | * vfree_atomic - release memory allocated by vmalloc() | 1562 | * vfree_atomic - release memory allocated by vmalloc() |
1552 | * @addr: memory base address | 1563 | * @addr: memory base address |
1553 | * | 1564 | * |
1554 | * This one is just like vfree() but can be called in any atomic context | 1565 | * This one is just like vfree() but can be called in any atomic context |
1555 | * except NMIs. | 1566 | * except NMIs. |
1556 | */ | 1567 | */ |
1557 | void vfree_atomic(const void *addr) | 1568 | void vfree_atomic(const void *addr) |
1558 | { | 1569 | { |
@@ -1565,21 +1576,29 @@ void vfree_atomic(const void *addr) | |||
1565 | __vfree_deferred(addr); | 1576 | __vfree_deferred(addr); |
1566 | } | 1577 | } |
1567 | 1578 | ||
1579 | static void __vfree(const void *addr) | ||
1580 | { | ||
1581 | if (unlikely(in_interrupt())) | ||
1582 | __vfree_deferred(addr); | ||
1583 | else | ||
1584 | __vunmap(addr, 1); | ||
1585 | } | ||
1586 | |||
1568 | /** | 1587 | /** |
1569 | * vfree - release memory allocated by vmalloc() | 1588 | * vfree - release memory allocated by vmalloc() |
1570 | * @addr: memory base address | 1589 | * @addr: memory base address |
1571 | * | 1590 | * |
1572 | * Free the virtually continuous memory area starting at @addr, as | 1591 | * Free the virtually continuous memory area starting at @addr, as |
1573 | * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is | 1592 | * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is |
1574 | * NULL, no operation is performed. | 1593 | * NULL, no operation is performed. |
1575 | * | 1594 | * |
1576 | * Must not be called in NMI context (strictly speaking, only if we don't | 1595 | * Must not be called in NMI context (strictly speaking, only if we don't |
1577 | * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling | 1596 | * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling |
1578 | * conventions for vfree() arch-dependent would be a really bad idea) | 1597 | * conventions for vfree() arch-dependent would be a really bad idea) |
1579 | * | 1598 | * |
1580 | * May sleep if called *not* from interrupt context. | 1599 | * May sleep if called *not* from interrupt context. |
1581 | * | 1600 | * |
1582 | * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) | 1601 | * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) |
1583 | */ | 1602 | */ |
1584 | void vfree(const void *addr) | 1603 | void vfree(const void *addr) |
1585 | { | 1604 | { |
@@ -1591,21 +1610,19 @@ void vfree(const void *addr) | |||
1591 | 1610 | ||
1592 | if (!addr) | 1611 | if (!addr) |
1593 | return; | 1612 | return; |
1594 | if (unlikely(in_interrupt())) | 1613 | |
1595 | __vfree_deferred(addr); | 1614 | __vfree(addr); |
1596 | else | ||
1597 | __vunmap(addr, 1); | ||
1598 | } | 1615 | } |
1599 | EXPORT_SYMBOL(vfree); | 1616 | EXPORT_SYMBOL(vfree); |
1600 | 1617 | ||
1601 | /** | 1618 | /** |
1602 | * vunmap - release virtual mapping obtained by vmap() | 1619 | * vunmap - release virtual mapping obtained by vmap() |
1603 | * @addr: memory base address | 1620 | * @addr: memory base address |
1604 | * | 1621 | * |
1605 | * Free the virtually contiguous memory area starting at @addr, | 1622 | * Free the virtually contiguous memory area starting at @addr, |
1606 | * which was created from the page array passed to vmap(). | 1623 | * which was created from the page array passed to vmap(). |
1607 | * | 1624 | * |
1608 | * Must not be called in interrupt context. | 1625 | * Must not be called in interrupt context. |
1609 | */ | 1626 | */ |
1610 | void vunmap(const void *addr) | 1627 | void vunmap(const void *addr) |
1611 | { | 1628 | { |
@@ -1617,17 +1634,19 @@ void vunmap(const void *addr) | |||
1617 | EXPORT_SYMBOL(vunmap); | 1634 | EXPORT_SYMBOL(vunmap); |
1618 | 1635 | ||
1619 | /** | 1636 | /** |
1620 | * vmap - map an array of pages into virtually contiguous space | 1637 | * vmap - map an array of pages into virtually contiguous space |
1621 | * @pages: array of page pointers | 1638 | * @pages: array of page pointers |
1622 | * @count: number of pages to map | 1639 | * @count: number of pages to map |
1623 | * @flags: vm_area->flags | 1640 | * @flags: vm_area->flags |
1624 | * @prot: page protection for the mapping | 1641 | * @prot: page protection for the mapping |
1625 | * | 1642 | * |
1626 | * Maps @count pages from @pages into contiguous kernel virtual | 1643 | * Maps @count pages from @pages into contiguous kernel virtual |
1627 | * space. | 1644 | * space. |
1645 | * | ||
1646 | * Return: the address of the area or %NULL on failure | ||
1628 | */ | 1647 | */ |
1629 | void *vmap(struct page **pages, unsigned int count, | 1648 | void *vmap(struct page **pages, unsigned int count, |
1630 | unsigned long flags, pgprot_t prot) | 1649 | unsigned long flags, pgprot_t prot) |
1631 | { | 1650 | { |
1632 | struct vm_struct *area; | 1651 | struct vm_struct *area; |
1633 | unsigned long size; /* In bytes */ | 1652 | unsigned long size; /* In bytes */ |
@@ -1709,25 +1728,27 @@ fail: | |||
1709 | warn_alloc(gfp_mask, NULL, | 1728 | warn_alloc(gfp_mask, NULL, |
1710 | "vmalloc: allocation failure, allocated %ld of %ld bytes", | 1729 | "vmalloc: allocation failure, allocated %ld of %ld bytes", |
1711 | (area->nr_pages*PAGE_SIZE), area->size); | 1730 | (area->nr_pages*PAGE_SIZE), area->size); |
1712 | vfree(area->addr); | 1731 | __vfree(area->addr); |
1713 | return NULL; | 1732 | return NULL; |
1714 | } | 1733 | } |
1715 | 1734 | ||
1716 | /** | 1735 | /** |
1717 | * __vmalloc_node_range - allocate virtually contiguous memory | 1736 | * __vmalloc_node_range - allocate virtually contiguous memory |
1718 | * @size: allocation size | 1737 | * @size: allocation size |
1719 | * @align: desired alignment | 1738 | * @align: desired alignment |
1720 | * @start: vm area range start | 1739 | * @start: vm area range start |
1721 | * @end: vm area range end | 1740 | * @end: vm area range end |
1722 | * @gfp_mask: flags for the page level allocator | 1741 | * @gfp_mask: flags for the page level allocator |
1723 | * @prot: protection mask for the allocated pages | 1742 | * @prot: protection mask for the allocated pages |
1724 | * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) | 1743 | * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) |
1725 | * @node: node to use for allocation or NUMA_NO_NODE | 1744 | * @node: node to use for allocation or NUMA_NO_NODE |
1726 | * @caller: caller's return address | 1745 | * @caller: caller's return address |
1727 | * | 1746 | * |
1728 | * Allocate enough pages to cover @size from the page level | 1747 | * Allocate enough pages to cover @size from the page level |
1729 | * allocator with @gfp_mask flags. Map them into contiguous | 1748 | * allocator with @gfp_mask flags. Map them into contiguous |
1730 | * kernel virtual space, using a pagetable protection of @prot. | 1749 | * kernel virtual space, using a pagetable protection of @prot. |
1750 | * | ||
1751 | * Return: the address of the area or %NULL on failure | ||
1731 | */ | 1752 | */ |
1732 | void *__vmalloc_node_range(unsigned long size, unsigned long align, | 1753 | void *__vmalloc_node_range(unsigned long size, unsigned long align, |
1733 | unsigned long start, unsigned long end, gfp_t gfp_mask, | 1754 | unsigned long start, unsigned long end, gfp_t gfp_mask, |
@@ -1768,25 +1789,35 @@ fail: | |||
1768 | return NULL; | 1789 | return NULL; |
1769 | } | 1790 | } |
1770 | 1791 | ||
1792 | /* | ||
1793 | * This is only for performance analysis and stress testing of vmalloc. | ||
1794 | * It is required by the vmalloc test module; do not use it for | ||
1795 | * anything else. | ||
1796 | */ | ||
1797 | #ifdef CONFIG_TEST_VMALLOC_MODULE | ||
1798 | EXPORT_SYMBOL_GPL(__vmalloc_node_range); | ||
1799 | #endif | ||
1800 | |||
1771 | /** | 1801 | /** |
1772 | * __vmalloc_node - allocate virtually contiguous memory | 1802 | * __vmalloc_node - allocate virtually contiguous memory |
1773 | * @size: allocation size | 1803 | * @size: allocation size |
1774 | * @align: desired alignment | 1804 | * @align: desired alignment |
1775 | * @gfp_mask: flags for the page level allocator | 1805 | * @gfp_mask: flags for the page level allocator |
1776 | * @prot: protection mask for the allocated pages | 1806 | * @prot: protection mask for the allocated pages |
1777 | * @node: node to use for allocation or NUMA_NO_NODE | 1807 | * @node: node to use for allocation or NUMA_NO_NODE |
1778 | * @caller: caller's return address | 1808 | * @caller: caller's return address |
1779 | * | 1809 | * |
1780 | * Allocate enough pages to cover @size from the page level | 1810 | * Allocate enough pages to cover @size from the page level |
1781 | * allocator with @gfp_mask flags. Map them into contiguous | 1811 | * allocator with @gfp_mask flags. Map them into contiguous |
1782 | * kernel virtual space, using a pagetable protection of @prot. | 1812 | * kernel virtual space, using a pagetable protection of @prot. |
1783 | * | 1813 | * |
1784 | * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL | 1814 | * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL |
1785 | * and __GFP_NOFAIL are not supported | 1815 | * and __GFP_NOFAIL are not supported |
1786 | * | 1816 | * |
1787 | * Any use of gfp flags outside of GFP_KERNEL should be consulted | 1817 | * Any use of gfp flags outside of GFP_KERNEL should be consulted |
1788 | * with mm people. | 1818 | * with mm people. |
1789 | * | 1819 | * |
1820 | * Return: pointer to the allocated memory or %NULL on error | ||
1790 | */ | 1821 | */ |
1791 | static void *__vmalloc_node(unsigned long size, unsigned long align, | 1822 | static void *__vmalloc_node(unsigned long size, unsigned long align, |
1792 | gfp_t gfp_mask, pgprot_t prot, | 1823 | gfp_t gfp_mask, pgprot_t prot, |
@@ -1818,13 +1849,16 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, | |||
1818 | } | 1849 | } |
1819 | 1850 | ||
1820 | /** | 1851 | /** |
1821 | * vmalloc - allocate virtually contiguous memory | 1852 | * vmalloc - allocate virtually contiguous memory |
1822 | * @size: allocation size | 1853 | * @size: allocation size |
1823 | * Allocate enough pages to cover @size from the page level | 1854 | * |
1824 | * allocator and map them into contiguous kernel virtual space. | 1855 | * Allocate enough pages to cover @size from the page level |
1856 | * allocator and map them into contiguous kernel virtual space. | ||
1857 | * | ||
1858 | * For tight control over page level allocator and protection flags | ||
1859 | * use __vmalloc() instead. | ||
1825 | * | 1860 | * |
1826 | * For tight control over page level allocator and protection flags | 1861 | * Return: pointer to the allocated memory or %NULL on error |
1827 | * use __vmalloc() instead. | ||
1828 | */ | 1862 | */ |
1829 | void *vmalloc(unsigned long size) | 1863 | void *vmalloc(unsigned long size) |
1830 | { | 1864 | { |
@@ -1834,14 +1868,17 @@ void *vmalloc(unsigned long size) | |||
1834 | EXPORT_SYMBOL(vmalloc); | 1868 | EXPORT_SYMBOL(vmalloc); |
1835 | 1869 | ||
1836 | /** | 1870 | /** |
1837 | * vzalloc - allocate virtually contiguous memory with zero fill | 1871 | * vzalloc - allocate virtually contiguous memory with zero fill |
1838 | * @size: allocation size | 1872 | * @size: allocation size |
1839 | * Allocate enough pages to cover @size from the page level | 1873 | * |
1840 | * allocator and map them into contiguous kernel virtual space. | 1874 | * Allocate enough pages to cover @size from the page level |
1841 | * The memory allocated is set to zero. | 1875 | * allocator and map them into contiguous kernel virtual space. |
1842 | * | 1876 | * The memory allocated is set to zero. |
1843 | * For tight control over page level allocator and protection flags | 1877 | * |
1844 | * use __vmalloc() instead. | 1878 | * For tight control over page level allocator and protection flags |
1879 | * use __vmalloc() instead. | ||
1880 | * | ||
1881 | * Return: pointer to the allocated memory or %NULL on error | ||
1845 | */ | 1882 | */ |
1846 | void *vzalloc(unsigned long size) | 1883 | void *vzalloc(unsigned long size) |
1847 | { | 1884 | { |
@@ -1856,34 +1893,30 @@ EXPORT_SYMBOL(vzalloc); | |||
1856 | * | 1893 | * |
1857 | * The resulting memory area is zeroed so it can be mapped to userspace | 1894 | * The resulting memory area is zeroed so it can be mapped to userspace |
1858 | * without leaking data. | 1895 | * without leaking data. |
1896 | * | ||
1897 | * Return: pointer to the allocated memory or %NULL on error | ||
1859 | */ | 1898 | */ |
1860 | void *vmalloc_user(unsigned long size) | 1899 | void *vmalloc_user(unsigned long size) |
1861 | { | 1900 | { |
1862 | struct vm_struct *area; | 1901 | return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, |
1863 | void *ret; | 1902 | GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, |
1864 | 1903 | VM_USERMAP, NUMA_NO_NODE, | |
1865 | ret = __vmalloc_node(size, SHMLBA, | 1904 | __builtin_return_address(0)); |
1866 | GFP_KERNEL | __GFP_ZERO, | ||
1867 | PAGE_KERNEL, NUMA_NO_NODE, | ||
1868 | __builtin_return_address(0)); | ||
1869 | if (ret) { | ||
1870 | area = find_vm_area(ret); | ||
1871 | area->flags |= VM_USERMAP; | ||
1872 | } | ||
1873 | return ret; | ||
1874 | } | 1905 | } |
1875 | EXPORT_SYMBOL(vmalloc_user); | 1906 | EXPORT_SYMBOL(vmalloc_user); |
1876 | 1907 | ||
1877 | /** | 1908 | /** |
1878 | * vmalloc_node - allocate memory on a specific node | 1909 | * vmalloc_node - allocate memory on a specific node |
1879 | * @size: allocation size | 1910 | * @size: allocation size |
1880 | * @node: numa node | 1911 | * @node: numa node |
1912 | * | ||
1913 | * Allocate enough pages to cover @size from the page level | ||
1914 | * allocator and map them into contiguous kernel virtual space. | ||
1881 | * | 1915 | * |
1882 | * Allocate enough pages to cover @size from the page level | 1916 | * For tight control over page level allocator and protection flags |
1883 | * allocator and map them into contiguous kernel virtual space. | 1917 | * use __vmalloc() instead. |
1884 | * | 1918 | * |
1885 | * For tight control over page level allocator and protection flags | 1919 | * Return: pointer to the allocated memory or %NULL on error |
1886 | * use __vmalloc() instead. | ||
1887 | */ | 1920 | */ |
1888 | void *vmalloc_node(unsigned long size, int node) | 1921 | void *vmalloc_node(unsigned long size, int node) |
1889 | { | 1922 | { |
@@ -1903,6 +1936,8 @@ EXPORT_SYMBOL(vmalloc_node); | |||
1903 | * | 1936 | * |
1904 | * For tight control over page level allocator and protection flags | 1937 | * For tight control over page level allocator and protection flags |
1905 | * use __vmalloc_node() instead. | 1938 | * use __vmalloc_node() instead. |
1939 | * | ||
1940 | * Return: pointer to the allocated memory or %NULL on error | ||
1906 | */ | 1941 | */ |
1907 | void *vzalloc_node(unsigned long size, int node) | 1942 | void *vzalloc_node(unsigned long size, int node) |
1908 | { | 1943 | { |
@@ -1912,17 +1947,18 @@ void *vzalloc_node(unsigned long size, int node) | |||
1912 | EXPORT_SYMBOL(vzalloc_node); | 1947 | EXPORT_SYMBOL(vzalloc_node); |
1913 | 1948 | ||
1914 | /** | 1949 | /** |
1915 | * vmalloc_exec - allocate virtually contiguous, executable memory | 1950 | * vmalloc_exec - allocate virtually contiguous, executable memory |
1916 | * @size: allocation size | 1951 | * @size: allocation size |
1917 | * | 1952 | * |
1918 | * Kernel-internal function to allocate enough pages to cover @size | 1953 | * Kernel-internal function to allocate enough pages to cover @size |
1919 | * the page level allocator and map them into contiguous and | 1954 | * the page level allocator and map them into contiguous and |
1920 | * executable kernel virtual space. | 1955 | * executable kernel virtual space. |
1956 | * | ||
1957 | * For tight control over page level allocator and protection flags | ||
1958 | * use __vmalloc() instead. | ||
1921 | * | 1959 | * |
1922 | * For tight control over page level allocator and protection flags | 1960 | * Return: pointer to the allocated memory or %NULL on error |
1923 | * use __vmalloc() instead. | ||
1924 | */ | 1961 | */ |
1925 | |||
1926 | void *vmalloc_exec(unsigned long size) | 1962 | void *vmalloc_exec(unsigned long size) |
1927 | { | 1963 | { |
1928 | return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, | 1964 | return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, |
@@ -1942,11 +1978,13 @@ void *vmalloc_exec(unsigned long size) | |||
1942 | #endif | 1978 | #endif |
1943 | 1979 | ||
1944 | /** | 1980 | /** |
1945 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) | 1981 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) |
1946 | * @size: allocation size | 1982 | * @size: allocation size |
1947 | * | 1983 | * |
1948 | * Allocate enough 32bit PA addressable pages to cover @size from the | 1984 | * Allocate enough 32bit PA addressable pages to cover @size from the |
1949 | * page level allocator and map them into contiguous kernel virtual space. | 1985 | * page level allocator and map them into contiguous kernel virtual space. |
1986 | * | ||
1987 | * Return: pointer to the allocated memory or %NULL on error | ||
1950 | */ | 1988 | */ |
1951 | void *vmalloc_32(unsigned long size) | 1989 | void *vmalloc_32(unsigned long size) |
1952 | { | 1990 | { |
@@ -1957,23 +1995,19 @@ EXPORT_SYMBOL(vmalloc_32); | |||
1957 | 1995 | ||
1958 | /** | 1996 | /** |
1959 | * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory | 1997 | * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory |
1960 | * @size: allocation size | 1998 | * @size: allocation size |
1961 | * | 1999 | * |
1962 | * The resulting memory area is 32bit addressable and zeroed so it can be | 2000 | * The resulting memory area is 32bit addressable and zeroed so it can be |
1963 | * mapped to userspace without leaking data. | 2001 | * mapped to userspace without leaking data. |
2002 | * | ||
2003 | * Return: pointer to the allocated memory or %NULL on error | ||
1964 | */ | 2004 | */ |
1965 | void *vmalloc_32_user(unsigned long size) | 2005 | void *vmalloc_32_user(unsigned long size) |
1966 | { | 2006 | { |
1967 | struct vm_struct *area; | 2007 | return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, |
1968 | void *ret; | 2008 | GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, |
1969 | 2009 | VM_USERMAP, NUMA_NO_NODE, | |
1970 | ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, | 2010 | __builtin_return_address(0)); |
1971 | NUMA_NO_NODE, __builtin_return_address(0)); | ||
1972 | if (ret) { | ||
1973 | area = find_vm_area(ret); | ||
1974 | area->flags |= VM_USERMAP; | ||
1975 | } | ||
1976 | return ret; | ||
1977 | } | 2011 | } |
1978 | EXPORT_SYMBOL(vmalloc_32_user); | 2012 | EXPORT_SYMBOL(vmalloc_32_user); |
1979 | 2013 | ||
@@ -2059,31 +2093,29 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) | |||
2059 | } | 2093 | } |
2060 | 2094 | ||
2061 | /** | 2095 | /** |
2062 | * vread() - read vmalloc area in a safe way. | 2096 | * vread() - read vmalloc area in a safe way. |
2063 | * @buf: buffer for reading data | 2097 | * @buf: buffer for reading data |
2064 | * @addr: vm address. | 2098 | * @addr: vm address. |
2065 | * @count: number of bytes to be read. | 2099 | * @count: number of bytes to be read. |
2066 | * | 2100 | * |
2067 | * Returns # of bytes which addr and buf should be increased. | 2101 | * This function checks that addr is a valid vmalloc'ed area, and |
2068 | * (same number to @count). Returns 0 if [addr...addr+count) doesn't | 2102 | * copy data from that area to a given buffer. If the given memory range |
2069 | * includes any intersect with alive vmalloc area. | 2103 | * of [addr...addr+count) includes some valid address, data is copied to |
2070 | * | 2104 | * proper area of @buf. If there are memory holes, they'll be zero-filled. |
2071 | * This function checks that addr is a valid vmalloc'ed area, and | 2105 | * IOREMAP area is treated as memory hole and no copy is done. |
2072 | * copy data from that area to a given buffer. If the given memory range | 2106 | * |
2073 | * of [addr...addr+count) includes some valid address, data is copied to | 2107 | * If [addr...addr+count) doesn't include any intersection with alive |
2074 | * proper area of @buf. If there are memory holes, they'll be zero-filled. | 2108 | * vm_struct area, returns 0. @buf should be kernel's buffer. |
2075 | * IOREMAP area is treated as memory hole and no copy is done. | 2109 | * |
2076 | * | 2110 | * Note: In usual ops, vread() is never necessary because the caller |
2077 | * If [addr...addr+count) doesn't includes any intersects with alive | 2111 | * should know vmalloc() area is valid and can use memcpy(). |
2078 | * vm_struct area, returns 0. @buf should be kernel's buffer. | 2112 | * This is for routines which have to access vmalloc area without |
2079 | * | 2113 | * any information, as /dev/kmem. |
2080 | * Note: In usual ops, vread() is never necessary because the caller | 2114 | * |
2081 | * should know vmalloc() area is valid and can use memcpy(). | 2115 | * Return: number of bytes for which addr and buf should be increased |
2082 | * This is for routines which have to access vmalloc area without | 2116 | * (same number as @count) or %0 if [addr...addr+count) doesn't |
2083 | * any informaion, as /dev/kmem. | 2117 | * include any intersection with valid vmalloc area |
2084 | * | ||
2085 | */ | 2118 | */ |
2086 | |||
2087 | long vread(char *buf, char *addr, unsigned long count) | 2119 | long vread(char *buf, char *addr, unsigned long count) |
2088 | { | 2120 | { |
2089 | struct vmap_area *va; | 2121 | struct vmap_area *va; |
@@ -2140,31 +2172,29 @@ finished: | |||
2140 | } | 2172 | } |
2141 | 2173 | ||
2142 | /** | 2174 | /** |
2143 | * vwrite() - write vmalloc area in a safe way. | 2175 | * vwrite() - write vmalloc area in a safe way. |
2144 | * @buf: buffer for source data | 2176 | * @buf: buffer for source data |
2145 | * @addr: vm address. | 2177 | * @addr: vm address. |
2146 | * @count: number of bytes to be read. | 2178 | * @count: number of bytes to be read. |
2147 | * | 2179 | * |
2148 | * Returns # of bytes which addr and buf should be incresed. | 2180 | * This function checks that addr is a valid vmalloc'ed area, and |
2149 | * (same number to @count). | 2181 | * copy data from a buffer to the given addr. If specified range of |
2150 | * If [addr...addr+count) doesn't includes any intersect with valid | 2182 | * [addr...addr+count) includes some valid address, data is copied from |
2151 | * vmalloc area, returns 0. | 2183 | * proper area of @buf. If there are memory holes, no copy to hole. |
2152 | * | 2184 | * IOREMAP area is treated as memory hole and no copy is done. |
2153 | * This function checks that addr is a valid vmalloc'ed area, and | 2185 | * |
2154 | * copy data from a buffer to the given addr. If specified range of | 2186 | * If [addr...addr+count) doesn't includes any intersects with alive |
2155 | * [addr...addr+count) includes some valid address, data is copied from | 2187 | * vm_struct area, returns 0. @buf should be kernel's buffer. |
2156 | * proper area of @buf. If there are memory holes, no copy to hole. | 2188 | * |
2157 | * IOREMAP area is treated as memory hole and no copy is done. | 2189 | * Note: In usual ops, vwrite() is never necessary because the caller |
2158 | * | 2190 | * should know vmalloc() area is valid and can use memcpy(). |
2159 | * If [addr...addr+count) doesn't includes any intersects with alive | 2191 | * This is for routines which have to access vmalloc area without |
2160 | * vm_struct area, returns 0. @buf should be kernel's buffer. | 2192 | * any informaion, as /dev/kmem. |
2161 | * | 2193 | * |
2162 | * Note: In usual ops, vwrite() is never necessary because the caller | 2194 | * Return: number of bytes for which addr and buf should be |
2163 | * should know vmalloc() area is valid and can use memcpy(). | 2195 | * increased (same number as @count) or %0 if [addr...addr+count) |
2164 | * This is for routines which have to access vmalloc area without | 2196 | * doesn't include any intersection with valid vmalloc area |
2165 | * any informaion, as /dev/kmem. | ||
2166 | */ | 2197 | */ |
2167 | |||
2168 | long vwrite(char *buf, char *addr, unsigned long count) | 2198 | long vwrite(char *buf, char *addr, unsigned long count) |
2169 | { | 2199 | { |
2170 | struct vmap_area *va; | 2200 | struct vmap_area *va; |
@@ -2216,20 +2246,20 @@ finished: | |||
2216 | } | 2246 | } |
2217 | 2247 | ||
2218 | /** | 2248 | /** |
2219 | * remap_vmalloc_range_partial - map vmalloc pages to userspace | 2249 | * remap_vmalloc_range_partial - map vmalloc pages to userspace |
2220 | * @vma: vma to cover | 2250 | * @vma: vma to cover |
2221 | * @uaddr: target user address to start at | 2251 | * @uaddr: target user address to start at |
2222 | * @kaddr: virtual address of vmalloc kernel memory | 2252 | * @kaddr: virtual address of vmalloc kernel memory |
2223 | * @size: size of map area | 2253 | * @size: size of map area |
2224 | * | 2254 | * |
2225 | * Returns: 0 for success, -Exxx on failure | 2255 | * Returns: 0 for success, -Exxx on failure |
2226 | * | 2256 | * |
2227 | * This function checks that @kaddr is a valid vmalloc'ed area, | 2257 | * This function checks that @kaddr is a valid vmalloc'ed area, |
2228 | * and that it is big enough to cover the range starting at | 2258 | * and that it is big enough to cover the range starting at |
2229 | * @uaddr in @vma. Will return failure if that criteria isn't | 2259 | * @uaddr in @vma. Will return failure if that criteria isn't |
2230 | * met. | 2260 | * met. |
2231 | * | 2261 | * |
2232 | * Similar to remap_pfn_range() (see mm/memory.c) | 2262 | * Similar to remap_pfn_range() (see mm/memory.c) |
2233 | */ | 2263 | */ |
2234 | int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, | 2264 | int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, |
2235 | void *kaddr, unsigned long size) | 2265 | void *kaddr, unsigned long size) |
@@ -2248,7 +2278,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, | |||
2248 | if (!(area->flags & VM_USERMAP)) | 2278 | if (!(area->flags & VM_USERMAP)) |
2249 | return -EINVAL; | 2279 | return -EINVAL; |
2250 | 2280 | ||
2251 | if (kaddr + size > area->addr + area->size) | 2281 | if (kaddr + size > area->addr + get_vm_area_size(area)) |
2252 | return -EINVAL; | 2282 | return -EINVAL; |
2253 | 2283 | ||
2254 | do { | 2284 | do { |
@@ -2271,18 +2301,18 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, | |||
2271 | EXPORT_SYMBOL(remap_vmalloc_range_partial); | 2301 | EXPORT_SYMBOL(remap_vmalloc_range_partial); |
2272 | 2302 | ||
2273 | /** | 2303 | /** |
2274 | * remap_vmalloc_range - map vmalloc pages to userspace | 2304 | * remap_vmalloc_range - map vmalloc pages to userspace |
2275 | * @vma: vma to cover (map full range of vma) | 2305 | * @vma: vma to cover (map full range of vma) |
2276 | * @addr: vmalloc memory | 2306 | * @addr: vmalloc memory |
2277 | * @pgoff: number of pages into addr before first page to map | 2307 | * @pgoff: number of pages into addr before first page to map |
2278 | * | 2308 | * |
2279 | * Returns: 0 for success, -Exxx on failure | 2309 | * Returns: 0 for success, -Exxx on failure |
2280 | * | 2310 | * |
2281 | * This function checks that addr is a valid vmalloc'ed area, and | 2311 | * This function checks that addr is a valid vmalloc'ed area, and |
2282 | * that it is big enough to cover the vma. Will return failure if | 2312 | * that it is big enough to cover the vma. Will return failure if |
2283 | * that criteria isn't met. | 2313 | * that criteria isn't met. |
2284 | * | 2314 | * |
2285 | * Similar to remap_pfn_range() (see mm/memory.c) | 2315 | * Similar to remap_pfn_range() (see mm/memory.c) |
2286 | */ | 2316 | */ |
2287 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | 2317 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, |
2288 | unsigned long pgoff) | 2318 | unsigned long pgoff) |
@@ -2314,18 +2344,18 @@ static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data) | |||
2314 | } | 2344 | } |
2315 | 2345 | ||
2316 | /** | 2346 | /** |
2317 | * alloc_vm_area - allocate a range of kernel address space | 2347 | * alloc_vm_area - allocate a range of kernel address space |
2318 | * @size: size of the area | 2348 | * @size: size of the area |
2319 | * @ptes: returns the PTEs for the address space | 2349 | * @ptes: returns the PTEs for the address space |
2320 | * | 2350 | * |
2321 | * Returns: NULL on failure, vm_struct on success | 2351 | * Returns: NULL on failure, vm_struct on success |
2322 | * | 2352 | * |
2323 | * This function reserves a range of kernel address space, and | 2353 | * This function reserves a range of kernel address space, and |
2324 | * allocates pagetables to map that range. No actual mappings | 2354 | * allocates pagetables to map that range. No actual mappings |
2325 | * are created. | 2355 | * are created. |
2326 | * | 2356 | * |
2327 | * If @ptes is non-NULL, pointers to the PTEs (in init_mm) | 2357 | * If @ptes is non-NULL, pointers to the PTEs (in init_mm) |
2328 | * allocated for the VM area are returned. | 2358 | * allocated for the VM area are returned. |
2329 | */ | 2359 | */ |
2330 | struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) | 2360 | struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) |
2331 | { | 2361 | { |
@@ -2751,4 +2781,3 @@ static int __init proc_vmalloc_init(void) | |||
2751 | module_init(proc_vmalloc_init); | 2781 | module_init(proc_vmalloc_init); |
2752 | 2782 | ||
2753 | #endif | 2783 | #endif |
2754 | |||
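The vmalloc.c hunks above fold user-mappability into the allocation itself: vmalloc_user() and vmalloc_32_user() now go through __vmalloc_node_range() with VM_USERMAP set up front instead of allocating first and patching area->flags afterwards, and remap_vmalloc_range_partial() bounds-checks against get_vm_area_size() so the guard page no longer counts as mappable space. A minimal sketch of how a driver pairs these APIs — the device plumbing is omitted and all demo_* names are hypothetical; only vmalloc_user(), remap_vmalloc_range() and vfree() are the real calls documented above:

	#include <linux/module.h>
	#include <linux/fs.h>
	#include <linux/mm.h>
	#include <linux/vmalloc.h>

	#define DEMO_BUF_SIZE	(64 * 1024)	/* hypothetical buffer size */

	static void *demo_buf;	/* zeroed, user-mappable, from vmalloc_user() */

	static int demo_mmap(struct file *file, struct vm_area_struct *vma)
	{
		/* Only works because vmalloc_user() marked the area VM_USERMAP. */
		return remap_vmalloc_range(vma, demo_buf, 0);
	}

	static const struct file_operations demo_fops = {
		.owner	= THIS_MODULE,
		.mmap	= demo_mmap,
	};

	static int __init demo_init(void)
	{
		demo_buf = vmalloc_user(DEMO_BUF_SIZE);
		return demo_buf ? 0 : -ENOMEM;	/* char device registration omitted */
	}

	static void __exit demo_exit(void)
	{
		vfree(demo_buf);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");

With VM_USERMAP set at allocation time there is no window in which find_vm_area() has to be consulted just to flag the area, which is exactly the duplication the hunks remove.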
diff --git a/mm/vmscan.c b/mm/vmscan.c index e979705bbf32..a5ad0b35ab8e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -374,7 +374,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone | |||
374 | */ | 374 | */ |
375 | int prealloc_shrinker(struct shrinker *shrinker) | 375 | int prealloc_shrinker(struct shrinker *shrinker) |
376 | { | 376 | { |
377 | size_t size = sizeof(*shrinker->nr_deferred); | 377 | unsigned int size = sizeof(*shrinker->nr_deferred); |
378 | 378 | ||
379 | if (shrinker->flags & SHRINKER_NUMA_AWARE) | 379 | if (shrinker->flags & SHRINKER_NUMA_AWARE) |
380 | size *= nr_node_ids; | 380 | size *= nr_node_ids; |
@@ -952,7 +952,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, | |||
952 | */ | 952 | */ |
953 | if (reclaimed && page_is_file_cache(page) && | 953 | if (reclaimed && page_is_file_cache(page) && |
954 | !mapping_exiting(mapping) && !dax_mapping(mapping)) | 954 | !mapping_exiting(mapping) && !dax_mapping(mapping)) |
955 | shadow = workingset_eviction(mapping, page); | 955 | shadow = workingset_eviction(page); |
956 | __delete_from_page_cache(page, shadow); | 956 | __delete_from_page_cache(page, shadow); |
957 | xa_unlock_irqrestore(&mapping->i_pages, flags); | 957 | xa_unlock_irqrestore(&mapping->i_pages, flags); |
958 | 958 | ||
@@ -1106,16 +1106,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1106 | { | 1106 | { |
1107 | LIST_HEAD(ret_pages); | 1107 | LIST_HEAD(ret_pages); |
1108 | LIST_HEAD(free_pages); | 1108 | LIST_HEAD(free_pages); |
1109 | int pgactivate = 0; | ||
1110 | unsigned nr_unqueued_dirty = 0; | ||
1111 | unsigned nr_dirty = 0; | ||
1112 | unsigned nr_congested = 0; | ||
1113 | unsigned nr_reclaimed = 0; | 1109 | unsigned nr_reclaimed = 0; |
1114 | unsigned nr_writeback = 0; | ||
1115 | unsigned nr_immediate = 0; | ||
1116 | unsigned nr_ref_keep = 0; | ||
1117 | unsigned nr_unmap_fail = 0; | ||
1118 | 1110 | ||
1111 | memset(stat, 0, sizeof(*stat)); | ||
1119 | cond_resched(); | 1112 | cond_resched(); |
1120 | 1113 | ||
1121 | while (!list_empty(page_list)) { | 1114 | while (!list_empty(page_list)) { |
@@ -1159,10 +1152,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1159 | */ | 1152 | */ |
1160 | page_check_dirty_writeback(page, &dirty, &writeback); | 1153 | page_check_dirty_writeback(page, &dirty, &writeback); |
1161 | if (dirty || writeback) | 1154 | if (dirty || writeback) |
1162 | nr_dirty++; | 1155 | stat->nr_dirty++; |
1163 | 1156 | ||
1164 | if (dirty && !writeback) | 1157 | if (dirty && !writeback) |
1165 | nr_unqueued_dirty++; | 1158 | stat->nr_unqueued_dirty++; |
1166 | 1159 | ||
1167 | /* | 1160 | /* |
1168 | * Treat this page as congested if the underlying BDI is or if | 1161 | * Treat this page as congested if the underlying BDI is or if |
@@ -1174,7 +1167,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1174 | if (((dirty || writeback) && mapping && | 1167 | if (((dirty || writeback) && mapping && |
1175 | inode_write_congested(mapping->host)) || | 1168 | inode_write_congested(mapping->host)) || |
1176 | (writeback && PageReclaim(page))) | 1169 | (writeback && PageReclaim(page))) |
1177 | nr_congested++; | 1170 | stat->nr_congested++; |
1178 | 1171 | ||
1179 | /* | 1172 | /* |
1180 | * If a page at the tail of the LRU is under writeback, there | 1173 | * If a page at the tail of the LRU is under writeback, there |
@@ -1223,7 +1216,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1223 | if (current_is_kswapd() && | 1216 | if (current_is_kswapd() && |
1224 | PageReclaim(page) && | 1217 | PageReclaim(page) && |
1225 | test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { | 1218 | test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { |
1226 | nr_immediate++; | 1219 | stat->nr_immediate++; |
1227 | goto activate_locked; | 1220 | goto activate_locked; |
1228 | 1221 | ||
1229 | /* Case 2 above */ | 1222 | /* Case 2 above */ |
@@ -1241,7 +1234,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1241 | * and it's also appropriate in global reclaim. | 1234 | * and it's also appropriate in global reclaim. |
1242 | */ | 1235 | */ |
1243 | SetPageReclaim(page); | 1236 | SetPageReclaim(page); |
1244 | nr_writeback++; | 1237 | stat->nr_writeback++; |
1245 | goto activate_locked; | 1238 | goto activate_locked; |
1246 | 1239 | ||
1247 | /* Case 3 above */ | 1240 | /* Case 3 above */ |
@@ -1261,7 +1254,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1261 | case PAGEREF_ACTIVATE: | 1254 | case PAGEREF_ACTIVATE: |
1262 | goto activate_locked; | 1255 | goto activate_locked; |
1263 | case PAGEREF_KEEP: | 1256 | case PAGEREF_KEEP: |
1264 | nr_ref_keep++; | 1257 | stat->nr_ref_keep++; |
1265 | goto keep_locked; | 1258 | goto keep_locked; |
1266 | case PAGEREF_RECLAIM: | 1259 | case PAGEREF_RECLAIM: |
1267 | case PAGEREF_RECLAIM_CLEAN: | 1260 | case PAGEREF_RECLAIM_CLEAN: |
@@ -1326,7 +1319,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1326 | if (unlikely(PageTransHuge(page))) | 1319 | if (unlikely(PageTransHuge(page))) |
1327 | flags |= TTU_SPLIT_HUGE_PMD; | 1320 | flags |= TTU_SPLIT_HUGE_PMD; |
1328 | if (!try_to_unmap(page, flags)) { | 1321 | if (!try_to_unmap(page, flags)) { |
1329 | nr_unmap_fail++; | 1322 | stat->nr_unmap_fail++; |
1330 | goto activate_locked; | 1323 | goto activate_locked; |
1331 | } | 1324 | } |
1332 | } | 1325 | } |
@@ -1474,7 +1467,7 @@ activate_locked: | |||
1474 | VM_BUG_ON_PAGE(PageActive(page), page); | 1467 | VM_BUG_ON_PAGE(PageActive(page), page); |
1475 | if (!PageMlocked(page)) { | 1468 | if (!PageMlocked(page)) { |
1476 | SetPageActive(page); | 1469 | SetPageActive(page); |
1477 | pgactivate++; | 1470 | stat->nr_activate++; |
1478 | count_memcg_page_event(page, PGACTIVATE); | 1471 | count_memcg_page_event(page, PGACTIVATE); |
1479 | } | 1472 | } |
1480 | keep_locked: | 1473 | keep_locked: |
@@ -1489,18 +1482,8 @@ keep: | |||
1489 | free_unref_page_list(&free_pages); | 1482 | free_unref_page_list(&free_pages); |
1490 | 1483 | ||
1491 | list_splice(&ret_pages, page_list); | 1484 | list_splice(&ret_pages, page_list); |
1492 | count_vm_events(PGACTIVATE, pgactivate); | 1485 | count_vm_events(PGACTIVATE, stat->nr_activate); |
1493 | 1486 | ||
1494 | if (stat) { | ||
1495 | stat->nr_dirty = nr_dirty; | ||
1496 | stat->nr_congested = nr_congested; | ||
1497 | stat->nr_unqueued_dirty = nr_unqueued_dirty; | ||
1498 | stat->nr_writeback = nr_writeback; | ||
1499 | stat->nr_immediate = nr_immediate; | ||
1500 | stat->nr_activate = pgactivate; | ||
1501 | stat->nr_ref_keep = nr_ref_keep; | ||
1502 | stat->nr_unmap_fail = nr_unmap_fail; | ||
1503 | } | ||
1504 | return nr_reclaimed; | 1487 | return nr_reclaimed; |
1505 | } | 1488 | } |
1506 | 1489 | ||
@@ -1512,6 +1495,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
1512 | .priority = DEF_PRIORITY, | 1495 | .priority = DEF_PRIORITY, |
1513 | .may_unmap = 1, | 1496 | .may_unmap = 1, |
1514 | }; | 1497 | }; |
1498 | struct reclaim_stat dummy_stat; | ||
1515 | unsigned long ret; | 1499 | unsigned long ret; |
1516 | struct page *page, *next; | 1500 | struct page *page, *next; |
1517 | LIST_HEAD(clean_pages); | 1501 | LIST_HEAD(clean_pages); |
@@ -1525,7 +1509,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
1525 | } | 1509 | } |
1526 | 1510 | ||
1527 | ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, | 1511 | ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, |
1528 | TTU_IGNORE_ACCESS, NULL, true); | 1512 | TTU_IGNORE_ACCESS, &dummy_stat, true); |
1529 | list_splice(&clean_pages, page_list); | 1513 | list_splice(&clean_pages, page_list); |
1530 | mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); | 1514 | mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); |
1531 | return ret; | 1515 | return ret; |
@@ -1630,8 +1614,8 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, | |||
1630 | 1614 | ||
1631 | } | 1615 | } |
1632 | 1616 | ||
1633 | /* | 1617 | /** |
1634 | * zone_lru_lock is heavily contended. Some of the functions that | 1618 | * pgdat->lru_lock is heavily contended. Some of the functions that |
1635 | * shrink the lists perform better by taking out a batch of pages | 1619 | * shrink the lists perform better by taking out a batch of pages |
1636 | * and working on them outside the LRU lock. | 1620 | * and working on them outside the LRU lock. |
1637 | * | 1621 | * |
@@ -1653,7 +1637,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, | |||
1653 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 1637 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
1654 | struct lruvec *lruvec, struct list_head *dst, | 1638 | struct lruvec *lruvec, struct list_head *dst, |
1655 | unsigned long *nr_scanned, struct scan_control *sc, | 1639 | unsigned long *nr_scanned, struct scan_control *sc, |
1656 | isolate_mode_t mode, enum lru_list lru) | 1640 | enum lru_list lru) |
1657 | { | 1641 | { |
1658 | struct list_head *src = &lruvec->lists[lru]; | 1642 | struct list_head *src = &lruvec->lists[lru]; |
1659 | unsigned long nr_taken = 0; | 1643 | unsigned long nr_taken = 0; |
@@ -1662,6 +1646,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1662 | unsigned long skipped = 0; | 1646 | unsigned long skipped = 0; |
1663 | unsigned long scan, total_scan, nr_pages; | 1647 | unsigned long scan, total_scan, nr_pages; |
1664 | LIST_HEAD(pages_skipped); | 1648 | LIST_HEAD(pages_skipped); |
1649 | isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED); | ||
1665 | 1650 | ||
1666 | scan = 0; | 1651 | scan = 0; |
1667 | for (total_scan = 0; | 1652 | for (total_scan = 0; |
@@ -1765,11 +1750,11 @@ int isolate_lru_page(struct page *page) | |||
1765 | WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); | 1750 | WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); |
1766 | 1751 | ||
1767 | if (PageLRU(page)) { | 1752 | if (PageLRU(page)) { |
1768 | struct zone *zone = page_zone(page); | 1753 | pg_data_t *pgdat = page_pgdat(page); |
1769 | struct lruvec *lruvec; | 1754 | struct lruvec *lruvec; |
1770 | 1755 | ||
1771 | spin_lock_irq(zone_lru_lock(zone)); | 1756 | spin_lock_irq(&pgdat->lru_lock); |
1772 | lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); | 1757 | lruvec = mem_cgroup_page_lruvec(page, pgdat); |
1773 | if (PageLRU(page)) { | 1758 | if (PageLRU(page)) { |
1774 | int lru = page_lru(page); | 1759 | int lru = page_lru(page); |
1775 | get_page(page); | 1760 | get_page(page); |
@@ -1777,7 +1762,7 @@ int isolate_lru_page(struct page *page) | |||
1777 | del_page_from_lru_list(page, lruvec, lru); | 1762 | del_page_from_lru_list(page, lruvec, lru); |
1778 | ret = 0; | 1763 | ret = 0; |
1779 | } | 1764 | } |
1780 | spin_unlock_irq(zone_lru_lock(zone)); | 1765 | spin_unlock_irq(&pgdat->lru_lock); |
1781 | } | 1766 | } |
1782 | return ret; | 1767 | return ret; |
1783 | } | 1768 | } |
@@ -1899,8 +1884,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1899 | unsigned long nr_scanned; | 1884 | unsigned long nr_scanned; |
1900 | unsigned long nr_reclaimed = 0; | 1885 | unsigned long nr_reclaimed = 0; |
1901 | unsigned long nr_taken; | 1886 | unsigned long nr_taken; |
1902 | struct reclaim_stat stat = {}; | 1887 | struct reclaim_stat stat; |
1903 | isolate_mode_t isolate_mode = 0; | ||
1904 | int file = is_file_lru(lru); | 1888 | int file = is_file_lru(lru); |
1905 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); | 1889 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); |
1906 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | 1890 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
@@ -1921,13 +1905,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1921 | 1905 | ||
1922 | lru_add_drain(); | 1906 | lru_add_drain(); |
1923 | 1907 | ||
1924 | if (!sc->may_unmap) | ||
1925 | isolate_mode |= ISOLATE_UNMAPPED; | ||
1926 | |||
1927 | spin_lock_irq(&pgdat->lru_lock); | 1908 | spin_lock_irq(&pgdat->lru_lock); |
1928 | 1909 | ||
1929 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, | 1910 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, |
1930 | &nr_scanned, sc, isolate_mode, lru); | 1911 | &nr_scanned, sc, lru); |
1931 | 1912 | ||
1932 | __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); | 1913 | __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); |
1933 | reclaim_stat->recent_scanned[file] += nr_taken; | 1914 | reclaim_stat->recent_scanned[file] += nr_taken; |
@@ -2009,9 +1990,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
2009 | * processes, from rmap. | 1990 | * processes, from rmap. |
2010 | * | 1991 | * |
2011 | * If the pages are mostly unmapped, the processing is fast and it is | 1992 | * If the pages are mostly unmapped, the processing is fast and it is |
2012 | * appropriate to hold zone_lru_lock across the whole operation. But if | 1993 | * appropriate to hold pgdat->lru_lock across the whole operation. But if |
2013 | * the pages are mapped, the processing is slow (page_referenced()) so we | 1994 | * the pages are mapped, the processing is slow (page_referenced()) so we |
2014 | * should drop zone_lru_lock around each page. It's impossible to balance | 1995 | * should drop pgdat->lru_lock around each page. It's impossible to balance |
2015 | * this, so instead we remove the pages from the LRU while processing them. | 1996 | * this, so instead we remove the pages from the LRU while processing them. |
2016 | * It is safe to rely on PG_active against the non-LRU pages in here because | 1997 | * It is safe to rely on PG_active against the non-LRU pages in here because |
2017 | * nobody will play with that bit on a non-LRU page. | 1998 | * nobody will play with that bit on a non-LRU page. |
@@ -2084,19 +2065,15 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
2084 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; | 2065 | struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; |
2085 | unsigned nr_deactivate, nr_activate; | 2066 | unsigned nr_deactivate, nr_activate; |
2086 | unsigned nr_rotated = 0; | 2067 | unsigned nr_rotated = 0; |
2087 | isolate_mode_t isolate_mode = 0; | ||
2088 | int file = is_file_lru(lru); | 2068 | int file = is_file_lru(lru); |
2089 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); | 2069 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); |
2090 | 2070 | ||
2091 | lru_add_drain(); | 2071 | lru_add_drain(); |
2092 | 2072 | ||
2093 | if (!sc->may_unmap) | ||
2094 | isolate_mode |= ISOLATE_UNMAPPED; | ||
2095 | |||
2096 | spin_lock_irq(&pgdat->lru_lock); | 2073 | spin_lock_irq(&pgdat->lru_lock); |
2097 | 2074 | ||
2098 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, | 2075 | nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, |
2099 | &nr_scanned, sc, isolate_mode, lru); | 2076 | &nr_scanned, sc, lru); |
2100 | 2077 | ||
2101 | __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); | 2078 | __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); |
2102 | reclaim_stat->recent_scanned[file] += nr_taken; | 2079 | reclaim_stat->recent_scanned[file] += nr_taken; |
@@ -2754,16 +2731,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |||
2754 | sc->nr_reclaimed - reclaimed); | 2731 | sc->nr_reclaimed - reclaimed); |
2755 | 2732 | ||
2756 | /* | 2733 | /* |
2757 | * Direct reclaim and kswapd have to scan all memory | 2734 | * Kswapd has to scan all memory cgroups to fulfill |
2758 | * cgroups to fulfill the overall scan target for the | 2735 | * the overall scan target for the node. |
2759 | * node. | ||
2760 | * | 2736 | * |
2761 | * Limit reclaim, on the other hand, only cares about | 2737 | * Limit reclaim, on the other hand, only cares about |
2762 | * nr_to_reclaim pages to be reclaimed and it will | 2738 | * nr_to_reclaim pages to be reclaimed and it will |
2763 | * retry with decreasing priority if one round over the | 2739 | * retry with decreasing priority if one round over the |
2764 | * whole hierarchy is not sufficient. | 2740 | * whole hierarchy is not sufficient. |
2765 | */ | 2741 | */ |
2766 | if (!global_reclaim(sc) && | 2742 | if (!current_is_kswapd() && |
2767 | sc->nr_reclaimed >= sc->nr_to_reclaim) { | 2743 | sc->nr_reclaimed >= sc->nr_to_reclaim) { |
2768 | mem_cgroup_iter_break(root, memcg); | 2744 | mem_cgroup_iter_break(root, memcg); |
2769 | break; | 2745 | break; |
@@ -3527,7 +3503,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, | |||
3527 | * | 3503 | * |
3528 | * kswapd scans the zones in the highmem->normal->dma direction. It skips | 3504 | * kswapd scans the zones in the highmem->normal->dma direction. It skips |
3529 | * zones which have free_pages > high_wmark_pages(zone), but once a zone is | 3505 | * zones which have free_pages > high_wmark_pages(zone), but once a zone is |
3530 | * found to have free_pages <= high_wmark_pages(zone), any page is that zone | 3506 | * found to have free_pages <= high_wmark_pages(zone), any page in that zone |
3531 | * or lower is eligible for reclaim until at least one usable zone is | 3507 | * or lower is eligible for reclaim until at least one usable zone is |
3532 | * balanced. | 3508 | * balanced. |
3533 | */ | 3509 | */ |
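Two caller-visible conventions change in the vmscan.c hunks: shrink_page_list() now zeroes and fills the reclaim_stat unconditionally, so every caller must pass a real pointer (reclaim_clean_pages_from_list() passes a dummy_stat it never reads), and isolate_lru_pages() derives ISOLATE_UNMAPPED from sc->may_unmap itself, dropping the mode argument at both call sites. A sketch of the resulting call pattern inside shrink_inactive_list(), with the surrounding declarations assumed from the hunks above:

	struct reclaim_stat stat;	/* no " = {}" needed: the callee memsets it */
	unsigned long nr_reclaimed;

	nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, &stat, false);
	/* stat.nr_dirty, stat.nr_writeback, stat.nr_activate, ... are all valid here */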
diff --git a/mm/vmstat.c b/mm/vmstat.c index 83b30edc2f7f..36b56f858f0f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -2121,21 +2121,14 @@ static int __init extfrag_debug_init(void) | |||
2121 | struct dentry *extfrag_debug_root; | 2121 | struct dentry *extfrag_debug_root; |
2122 | 2122 | ||
2123 | extfrag_debug_root = debugfs_create_dir("extfrag", NULL); | 2123 | extfrag_debug_root = debugfs_create_dir("extfrag", NULL); |
2124 | if (!extfrag_debug_root) | ||
2125 | return -ENOMEM; | ||
2126 | 2124 | ||
2127 | if (!debugfs_create_file("unusable_index", 0444, | 2125 | debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL, |
2128 | extfrag_debug_root, NULL, &unusable_file_ops)) | 2126 | &unusable_file_ops); |
2129 | goto fail; | ||
2130 | 2127 | ||
2131 | if (!debugfs_create_file("extfrag_index", 0444, | 2128 | debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL, |
2132 | extfrag_debug_root, NULL, &extfrag_file_ops)) | 2129 | &extfrag_file_ops); |
2133 | goto fail; | ||
2134 | 2130 | ||
2135 | return 0; | 2131 | return 0; |
2136 | fail: | ||
2137 | debugfs_remove_recursive(extfrag_debug_root); | ||
2138 | return -ENOMEM; | ||
2139 | } | 2132 | } |
2140 | 2133 | ||
2141 | module_init(extfrag_debug_init); | 2134 | module_init(extfrag_debug_init); |
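The extfrag_debug_init() rewrite follows the tree-wide rule that debugfs creation failures are not worth handling: debugfs_create_dir()/debugfs_create_file() degrade gracefully (a NULL or error dentry passed as a parent is tolerated by later debugfs calls), so the fail:/debugfs_remove_recursive() unwind path can go away entirely. A minimal sketch of the same pattern for a hypothetical module-private counter — the demo_* names are made up, the debugfs calls are real:

	#include <linux/debugfs.h>
	#include <linux/module.h>

	static u32 demo_counter;

	static int __init demo_debugfs_init(void)
	{
		struct dentry *dir = debugfs_create_dir("demo", NULL);

		/* No return-value checks: debugfs is best effort by design. */
		debugfs_create_u32("counter", 0444, dir, &demo_counter);
		return 0;
	}
	module_init(demo_debugfs_init);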
diff --git a/mm/workingset.c b/mm/workingset.c index dcb994f2acc2..0bedf67502d5 100644 --- a/mm/workingset.c +++ b/mm/workingset.c | |||
@@ -215,13 +215,12 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, | |||
215 | 215 | ||
216 | /** | 216 | /** |
217 | * workingset_eviction - note the eviction of a page from memory | 217 | * workingset_eviction - note the eviction of a page from memory |
218 | * @mapping: address space the page was backing | ||
219 | * @page: the page being evicted | 218 | * @page: the page being evicted |
220 | * | 219 | * |
221 | * Returns a shadow entry to be stored in @mapping->i_pages in place | 220 | * Returns a shadow entry to be stored in @page->mapping->i_pages in place |
222 | * of the evicted @page so that a later refault can be detected. | 221 | * of the evicted @page so that a later refault can be detected. |
223 | */ | 222 | */ |
224 | void *workingset_eviction(struct address_space *mapping, struct page *page) | 223 | void *workingset_eviction(struct page *page) |
225 | { | 224 | { |
226 | struct pglist_data *pgdat = page_pgdat(page); | 225 | struct pglist_data *pgdat = page_pgdat(page); |
227 | struct mem_cgroup *memcg = page_memcg(page); | 226 | struct mem_cgroup *memcg = page_memcg(page); |
diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 6ac919847ce6..f3f5a78cd062 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c | |||
@@ -158,6 +158,7 @@ | |||
158 | #include <linux/etherdevice.h> | 158 | #include <linux/etherdevice.h> |
159 | #include <linux/kthread.h> | 159 | #include <linux/kthread.h> |
160 | #include <linux/prefetch.h> | 160 | #include <linux/prefetch.h> |
161 | #include <linux/mmzone.h> | ||
161 | #include <net/net_namespace.h> | 162 | #include <net/net_namespace.h> |
162 | #include <net/checksum.h> | 163 | #include <net/checksum.h> |
163 | #include <net/ipv6.h> | 164 | #include <net/ipv6.h> |
@@ -3625,7 +3626,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) | |||
3625 | pkt_dev->svlan_cfi = 0; | 3626 | pkt_dev->svlan_cfi = 0; |
3626 | pkt_dev->svlan_id = 0xffff; | 3627 | pkt_dev->svlan_id = 0xffff; |
3627 | pkt_dev->burst = 1; | 3628 | pkt_dev->burst = 1; |
3628 | pkt_dev->node = -1; | 3629 | pkt_dev->node = NUMA_NO_NODE; |
3629 | 3630 | ||
3630 | err = pktgen_setup_dev(t->net, pkt_dev, ifname); | 3631 | err = pktgen_setup_dev(t->net, pkt_dev, ifname); |
3631 | if (err) | 3632 | if (err) |
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 86e1e37eb4e8..b37e6e0a1026 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/netlink.h> | 15 | #include <linux/netlink.h> |
16 | #include <linux/qrtr.h> | 16 | #include <linux/qrtr.h> |
17 | #include <linux/termios.h> /* For TIOCINQ/OUTQ */ | 17 | #include <linux/termios.h> /* For TIOCINQ/OUTQ */ |
18 | #include <linux/numa.h> | ||
18 | 19 | ||
19 | #include <net/sock.h> | 20 | #include <net/sock.h> |
20 | 21 | ||
@@ -101,7 +102,7 @@ static inline struct qrtr_sock *qrtr_sk(struct sock *sk) | |||
101 | return container_of(sk, struct qrtr_sock, sk); | 102 | return container_of(sk, struct qrtr_sock, sk); |
102 | } | 103 | } |
103 | 104 | ||
104 | static unsigned int qrtr_local_nid = -1; | 105 | static unsigned int qrtr_local_nid = NUMA_NO_NODE; |
105 | 106 | ||
106 | /* for node ids */ | 107 | /* for node ids */ |
107 | static RADIX_TREE(qrtr_nodes, GFP_KERNEL); | 108 | static RADIX_TREE(qrtr_nodes, GFP_KERNEL); |
diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan index 6deabedc67fc..6410bd22fe38 100644 --- a/scripts/Makefile.kasan +++ b/scripts/Makefile.kasan | |||
@@ -27,14 +27,9 @@ else | |||
27 | $(call cc-param,asan-globals=1) \ | 27 | $(call cc-param,asan-globals=1) \ |
28 | $(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \ | 28 | $(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \ |
29 | $(call cc-param,asan-stack=$(CONFIG_KASAN_STACK)) \ | 29 | $(call cc-param,asan-stack=$(CONFIG_KASAN_STACK)) \ |
30 | $(call cc-param,asan-use-after-scope=1) \ | ||
31 | $(call cc-param,asan-instrument-allocas=1) | 30 | $(call cc-param,asan-instrument-allocas=1) |
32 | endif | 31 | endif |
33 | 32 | ||
34 | ifdef CONFIG_KASAN_EXTRA | ||
35 | CFLAGS_KASAN += $(call cc-option, -fsanitize-address-use-after-scope) | ||
36 | endif | ||
37 | |||
38 | endif # CONFIG_KASAN_GENERIC | 33 | endif # CONFIG_KASAN_GENERIC |
39 | 34 | ||
40 | ifdef CONFIG_KASAN_SW_TAGS | 35 | ifdef CONFIG_KASAN_SW_TAGS |
diff --git a/scripts/decode_stacktrace.sh b/scripts/decode_stacktrace.sh index 98a7d63a723e..bcdd45df3f51 100755 --- a/scripts/decode_stacktrace.sh +++ b/scripts/decode_stacktrace.sh | |||
@@ -37,6 +37,13 @@ parse_symbol() { | |||
37 | symbol=${symbol#\(} | 37 | symbol=${symbol#\(} |
38 | symbol=${symbol%\)} | 38 | symbol=${symbol%\)} |
39 | 39 | ||
40 | # Strip segment | ||
41 | local segment | ||
42 | if [[ $symbol == *:* ]] ; then | ||
43 | segment=${symbol%%:*}: | ||
44 | symbol=${symbol#*:} | ||
45 | fi | ||
46 | |||
40 | # Strip the symbol name so that we could look it up | 47 | # Strip the symbol name so that we could look it up |
41 | local name=${symbol%+*} | 48 | local name=${symbol%+*} |
42 | 49 | ||
@@ -84,7 +91,7 @@ parse_symbol() { | |||
84 | code=${code//$'\n'/' '} | 91 | code=${code//$'\n'/' '} |
85 | 92 | ||
86 | # Replace old address with pretty line numbers | 93 | # Replace old address with pretty line numbers |
87 | symbol="$name ($code)" | 94 | symbol="$segment$name ($code)" |
88 | } | 95 | } |
89 | 96 | ||
90 | decode_code() { | 97 | decode_code() { |
diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig index d45f7f36b859..d9fd9988ef27 100644 --- a/scripts/gcc-plugins/Kconfig +++ b/scripts/gcc-plugins/Kconfig | |||
@@ -68,10 +68,6 @@ config GCC_PLUGIN_LATENT_ENTROPY | |||
68 | 68 | ||
69 | config GCC_PLUGIN_STRUCTLEAK | 69 | config GCC_PLUGIN_STRUCTLEAK |
70 | bool "Force initialization of variables containing userspace addresses" | 70 | bool "Force initialization of variables containing userspace addresses" |
71 | # Currently STRUCTLEAK inserts initialization out of live scope of | ||
72 | # variables from KASAN point of view. This leads to KASAN false | ||
73 | # positive reports. Prohibit this combination for now. | ||
74 | depends on !KASAN_EXTRA | ||
75 | help | 71 | help |
76 | This plugin zero-initializes any structures containing a | 72 | This plugin zero-initializes any structures containing a |
77 | __user attribute. This can prevent some classes of information | 73 | __user attribute. This can prevent some classes of information |
diff --git a/tools/include/linux/numa.h b/tools/include/linux/numa.h new file mode 100644 index 000000000000..110b0e5d0fb0 --- /dev/null +++ b/tools/include/linux/numa.h | |||
@@ -0,0 +1,16 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | #ifndef _LINUX_NUMA_H | ||
3 | #define _LINUX_NUMA_H | ||
4 | |||
5 | |||
6 | #ifdef CONFIG_NODES_SHIFT | ||
7 | #define NODES_SHIFT CONFIG_NODES_SHIFT | ||
8 | #else | ||
9 | #define NODES_SHIFT 0 | ||
10 | #endif | ||
11 | |||
12 | #define MAX_NUMNODES (1 << NODES_SHIFT) | ||
13 | |||
14 | #define NUMA_NO_NODE (-1) | ||
15 | |||
16 | #endif /* _LINUX_NUMA_H */ | ||
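The new tools/include/linux/numa.h gives userspace tools the same NUMA_NO_NODE spelling the kernel side of this series standardizes on (pktgen and qrtr above, perf bench numa below), so bare -1 node sentinels can be replaced on both sides. A trivial usage sketch, assuming the usual tools include path (-I tools/include); the helper name is made up:

	#include <linux/numa.h>

	/* Hypothetical option parser: negative means "no node preference". */
	static int parse_bind_node(int requested)
	{
		return requested < 0 ? NUMA_NO_NODE : requested;
	}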
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index 44195514b19e..98ad783efc69 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <sys/types.h> | 34 | #include <sys/types.h> |
35 | #include <linux/kernel.h> | 35 | #include <linux/kernel.h> |
36 | #include <linux/time64.h> | 36 | #include <linux/time64.h> |
37 | #include <linux/numa.h> | ||
37 | 38 | ||
38 | #include <numa.h> | 39 | #include <numa.h> |
39 | #include <numaif.h> | 40 | #include <numaif.h> |
@@ -298,7 +299,7 @@ static cpu_set_t bind_to_node(int target_node) | |||
298 | 299 | ||
299 | CPU_ZERO(&mask); | 300 | CPU_ZERO(&mask); |
300 | 301 | ||
301 | if (target_node == -1) { | 302 | if (target_node == NUMA_NO_NODE) { |
302 | for (cpu = 0; cpu < g->p.nr_cpus; cpu++) | 303 | for (cpu = 0; cpu < g->p.nr_cpus; cpu++) |
303 | CPU_SET(cpu, &mask); | 304 | CPU_SET(cpu, &mask); |
304 | } else { | 305 | } else { |
@@ -339,7 +340,7 @@ static void bind_to_memnode(int node) | |||
339 | unsigned long nodemask; | 340 | unsigned long nodemask; |
340 | int ret; | 341 | int ret; |
341 | 342 | ||
342 | if (node == -1) | 343 | if (node == NUMA_NO_NODE) |
343 | return; | 344 | return; |
344 | 345 | ||
345 | BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8); | 346 | BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8); |
@@ -1363,7 +1364,7 @@ static void init_thread_data(void) | |||
1363 | int cpu; | 1364 | int cpu; |
1364 | 1365 | ||
1365 | /* Allow all nodes by default: */ | 1366 | /* Allow all nodes by default: */ |
1366 | td->bind_node = -1; | 1367 | td->bind_node = NUMA_NO_NODE; |
1367 | 1368 | ||
1368 | /* Allow all CPUs by default: */ | 1369 | /* Allow all CPUs by default: */ |
1369 | CPU_ZERO(&td->bind_cpumask); | 1370 | CPU_ZERO(&td->bind_cpumask); |
diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 400ee81a3043..6a94f07c4164 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile | |||
@@ -48,6 +48,7 @@ TARGETS += sysctl | |||
48 | ifneq (1, $(quicktest)) | 48 | ifneq (1, $(quicktest)) |
49 | TARGETS += timers | 49 | TARGETS += timers |
50 | endif | 50 | endif |
51 | TARGETS += tmpfs | ||
51 | TARGETS += user | 52 | TARGETS += user |
52 | TARGETS += vm | 53 | TARGETS += vm |
53 | TARGETS += x86 | 54 | TARGETS += x86 |
diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 10baa1652fc2..c67d32eeb668 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c | |||
@@ -54,6 +54,22 @@ static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) | |||
54 | return fd; | 54 | return fd; |
55 | } | 55 | } |
56 | 56 | ||
57 | static int mfd_assert_reopen_fd(int fd_in) | ||
58 | { | ||
59 | int r, fd; | ||
60 | char path[100]; | ||
61 | |||
62 | sprintf(path, "/proc/self/fd/%d", fd_in); | ||
63 | |||
64 | fd = open(path, O_RDWR); | ||
65 | if (fd < 0) { | ||
66 | printf("re-open of existing fd %d failed\n", fd_in); | ||
67 | abort(); | ||
68 | } | ||
69 | |||
70 | return fd; | ||
71 | } | ||
72 | |||
57 | static void mfd_fail_new(const char *name, unsigned int flags) | 73 | static void mfd_fail_new(const char *name, unsigned int flags) |
58 | { | 74 | { |
59 | int r; | 75 | int r; |
@@ -255,6 +271,25 @@ static void mfd_assert_read(int fd) | |||
255 | munmap(p, mfd_def_size); | 271 | munmap(p, mfd_def_size); |
256 | } | 272 | } |
257 | 273 | ||
274 | /* Test that PROT_READ + MAP_SHARED mappings work. */ | ||
275 | static void mfd_assert_read_shared(int fd) | ||
276 | { | ||
277 | void *p; | ||
278 | |||
279 | /* verify PROT_READ and MAP_SHARED *is* allowed */ | ||
280 | p = mmap(NULL, | ||
281 | mfd_def_size, | ||
282 | PROT_READ, | ||
283 | MAP_SHARED, | ||
284 | fd, | ||
285 | 0); | ||
286 | if (p == MAP_FAILED) { | ||
287 | printf("mmap() failed: %m\n"); | ||
288 | abort(); | ||
289 | } | ||
290 | munmap(p, mfd_def_size); | ||
291 | } | ||
292 | |||
258 | static void mfd_assert_write(int fd) | 293 | static void mfd_assert_write(int fd) |
259 | { | 294 | { |
260 | ssize_t l; | 295 | ssize_t l; |
@@ -693,6 +728,44 @@ static void test_seal_write(void) | |||
693 | } | 728 | } |
694 | 729 | ||
695 | /* | 730 | /* |
731 | * Test SEAL_FUTURE_WRITE | ||
732 | * Test whether SEAL_FUTURE_WRITE actually prevents modifications. | ||
733 | */ | ||
734 | static void test_seal_future_write(void) | ||
735 | { | ||
736 | int fd, fd2; | ||
737 | void *p; | ||
738 | |||
739 | printf("%s SEAL-FUTURE-WRITE\n", memfd_str); | ||
740 | |||
741 | fd = mfd_assert_new("kern_memfd_seal_future_write", | ||
742 | mfd_def_size, | ||
743 | MFD_CLOEXEC | MFD_ALLOW_SEALING); | ||
744 | |||
745 | p = mfd_assert_mmap_shared(fd); | ||
746 | |||
747 | mfd_assert_has_seals(fd, 0); | ||
748 | |||
749 | mfd_assert_add_seals(fd, F_SEAL_FUTURE_WRITE); | ||
750 | mfd_assert_has_seals(fd, F_SEAL_FUTURE_WRITE); | ||
751 | |||
752 | /* read should pass, writes should fail */ | ||
753 | mfd_assert_read(fd); | ||
754 | mfd_assert_read_shared(fd); | ||
755 | mfd_fail_write(fd); | ||
756 | |||
757 | fd2 = mfd_assert_reopen_fd(fd); | ||
758 | /* read should pass, writes should still fail */ | ||
759 | mfd_assert_read(fd2); | ||
760 | mfd_assert_read_shared(fd2); | ||
761 | mfd_fail_write(fd2); | ||
762 | |||
763 | munmap(p, mfd_def_size); | ||
764 | close(fd2); | ||
765 | close(fd); | ||
766 | } | ||
767 | |||
768 | /* | ||
696 | * Test SEAL_SHRINK | 769 | * Test SEAL_SHRINK |
697 | * Test whether SEAL_SHRINK actually prevents shrinking | 770 | * Test whether SEAL_SHRINK actually prevents shrinking |
698 | */ | 771 | */ |
@@ -945,6 +1018,7 @@ int main(int argc, char **argv) | |||
945 | test_basic(); | 1018 | test_basic(); |
946 | 1019 | ||
947 | test_seal_write(); | 1020 | test_seal_write(); |
1021 | test_seal_future_write(); | ||
948 | test_seal_shrink(); | 1022 | test_seal_shrink(); |
949 | test_seal_grow(); | 1023 | test_seal_grow(); |
950 | test_seal_resize(); | 1024 | test_seal_resize(); |
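The new memfd test exercises the semantics of F_SEAL_FUTURE_WRITE: unlike F_SEAL_WRITE it can be applied while writable mappings still exist, and it blocks only new write access — including through a descriptor re-opened via /proc/self/fd, which is what mfd_assert_reopen_fd() checks. A stand-alone userspace sketch of the same behaviour; it assumes a glibc that provides memfd_create(), and F_SEAL_FUTURE_WRITE is defined by hand in case the toolchain headers predate this series:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#ifndef F_SEAL_FUTURE_WRITE
	#define F_SEAL_FUTURE_WRITE	0x0010	/* value added by this series */
	#endif

	int main(void)
	{
		int fd = memfd_create("demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);

		if (fd < 0)
			return 1;
		if (write(fd, "before", 6) != 6)	/* still allowed */
			return 1;
		if (fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE) < 0)
			return 1;
		if (write(fd, "after", 5) < 0)		/* now refused (EPERM) */
			printf("write blocked after seal: %m\n");
		close(fd);
		return 0;
	}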
diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 29bac5ef9a93..444ad39d3700 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore | |||
@@ -2,6 +2,7 @@ | |||
2 | /fd-002-posix-eq | 2 | /fd-002-posix-eq |
3 | /fd-003-kthread | 3 | /fd-003-kthread |
4 | /proc-loadavg-001 | 4 | /proc-loadavg-001 |
5 | /proc-pid-vm | ||
5 | /proc-self-map-files-001 | 6 | /proc-self-map-files-001 |
6 | /proc-self-map-files-002 | 7 | /proc-self-map-files-002 |
7 | /proc-self-syscall | 8 | /proc-self-syscall |
diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 434d033ee067..5163dc887aa3 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile | |||
@@ -6,6 +6,7 @@ TEST_GEN_PROGS += fd-001-lookup | |||
6 | TEST_GEN_PROGS += fd-002-posix-eq | 6 | TEST_GEN_PROGS += fd-002-posix-eq |
7 | TEST_GEN_PROGS += fd-003-kthread | 7 | TEST_GEN_PROGS += fd-003-kthread |
8 | TEST_GEN_PROGS += proc-loadavg-001 | 8 | TEST_GEN_PROGS += proc-loadavg-001 |
9 | TEST_GEN_PROGS += proc-pid-vm | ||
9 | TEST_GEN_PROGS += proc-self-map-files-001 | 10 | TEST_GEN_PROGS += proc-self-map-files-001 |
10 | TEST_GEN_PROGS += proc-self-map-files-002 | 11 | TEST_GEN_PROGS += proc-self-map-files-002 |
11 | TEST_GEN_PROGS += proc-self-syscall | 12 | TEST_GEN_PROGS += proc-self-syscall |
diff --git a/tools/testing/selftests/proc/proc-loadavg-001.c b/tools/testing/selftests/proc/proc-loadavg-001.c index fcff7047000d..471e2aa28077 100644 --- a/tools/testing/selftests/proc/proc-loadavg-001.c +++ b/tools/testing/selftests/proc/proc-loadavg-001.c | |||
@@ -30,7 +30,7 @@ int main(void) | |||
30 | 30 | ||
31 | if (unshare(CLONE_NEWPID) == -1) { | 31 | if (unshare(CLONE_NEWPID) == -1) { |
32 | if (errno == ENOSYS || errno == EPERM) | 32 | if (errno == ENOSYS || errno == EPERM) |
33 | return 2; | 33 | return 4; |
34 | return 1; | 34 | return 1; |
35 | } | 35 | } |
36 | 36 | ||
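The exit-code change above (and the exit(4) in make_private_tmp() of the new proc-pid-vm.c below) aligns the proc selftests with the kselftest convention that exit status 4 (KSFT_SKIP) marks a test skipped for environmental reasons, so the runner can report it as skipped instead of misclassifying it. Annotated for clarity, the pattern is:

	if (unshare(CLONE_NEWPID) == -1) {
		if (errno == ENOSYS || errno == EPERM)
			return 4;	/* KSFT_SKIP: this kernel/privilege level can't run the test */
		return 1;		/* KSFT_FAIL: unexpected error */
	}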
diff --git a/tools/testing/selftests/proc/proc-pid-vm.c b/tools/testing/selftests/proc/proc-pid-vm.c new file mode 100644 index 000000000000..bbe8150d18aa --- /dev/null +++ b/tools/testing/selftests/proc/proc-pid-vm.c | |||
@@ -0,0 +1,406 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com> | ||
3 | * | ||
4 | * Permission to use, copy, modify, and distribute this software for any | ||
5 | * purpose with or without fee is hereby granted, provided that the above | ||
6 | * copyright notice and this permission notice appear in all copies. | ||
7 | * | ||
8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
15 | */ | ||
16 | /* | ||
17 | * Fork and exec tiny 1 page executable which precisely controls its VM. | ||
18 | * Test /proc/$PID/maps | ||
19 | * Test /proc/$PID/smaps | ||
20 | * Test /proc/$PID/smaps_rollup | ||
21 | * Test /proc/$PID/statm | ||
22 | * | ||
23 | * FIXME require CONFIG_TMPFS which can be disabled | ||
24 | * FIXME test other values from "smaps" | ||
25 | * FIXME support other archs | ||
26 | */ | ||
27 | #undef NDEBUG | ||
28 | #include <assert.h> | ||
29 | #include <errno.h> | ||
30 | #include <sched.h> | ||
31 | #include <signal.h> | ||
32 | #include <stdint.h> | ||
33 | #include <stdio.h> | ||
34 | #include <string.h> | ||
35 | #include <stdlib.h> | ||
36 | #include <sys/mount.h> | ||
37 | #include <sys/types.h> | ||
38 | #include <sys/stat.h> | ||
39 | #include <fcntl.h> | ||
40 | #include <unistd.h> | ||
41 | #include <sys/syscall.h> | ||
42 | #include <sys/uio.h> | ||
43 | #include <linux/kdev_t.h> | ||
44 | |||
45 | static inline long sys_execveat(int dirfd, const char *pathname, char **argv, char **envp, int flags) | ||
46 | { | ||
47 | return syscall(SYS_execveat, dirfd, pathname, argv, envp, flags); | ||
48 | } | ||
49 | |||
50 | static void make_private_tmp(void) | ||
51 | { | ||
52 | if (unshare(CLONE_NEWNS) == -1) { | ||
53 | if (errno == ENOSYS || errno == EPERM) { | ||
54 | exit(4); | ||
55 | } | ||
56 | exit(1); | ||
57 | } | ||
58 | if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) { | ||
59 | exit(1); | ||
60 | } | ||
61 | if (mount(NULL, "/tmp", "tmpfs", 0, NULL) == -1) { | ||
62 | exit(1); | ||
63 | } | ||
64 | } | ||
65 | |||
66 | static pid_t pid = -1; | ||
67 | static void ate(void) | ||
68 | { | ||
69 | if (pid > 0) { | ||
70 | kill(pid, SIGTERM); | ||
71 | } | ||
72 | } | ||
73 | |||
74 | struct elf64_hdr { | ||
75 | uint8_t e_ident[16]; | ||
76 | uint16_t e_type; | ||
77 | uint16_t e_machine; | ||
78 | uint32_t e_version; | ||
79 | uint64_t e_entry; | ||
80 | uint64_t e_phoff; | ||
81 | uint64_t e_shoff; | ||
82 | uint32_t e_flags; | ||
83 | uint16_t e_ehsize; | ||
84 | uint16_t e_phentsize; | ||
85 | uint16_t e_phnum; | ||
86 | uint16_t e_shentsize; | ||
87 | uint16_t e_shnum; | ||
88 | uint16_t e_shstrndx; | ||
89 | }; | ||
90 | |||
91 | struct elf64_phdr { | ||
92 | uint32_t p_type; | ||
93 | uint32_t p_flags; | ||
94 | uint64_t p_offset; | ||
95 | uint64_t p_vaddr; | ||
96 | uint64_t p_paddr; | ||
97 | uint64_t p_filesz; | ||
98 | uint64_t p_memsz; | ||
99 | uint64_t p_align; | ||
100 | }; | ||
101 | |||
102 | #ifdef __x86_64__ | ||
103 | #define PAGE_SIZE 4096 | ||
104 | #define VADDR (1UL << 32) | ||
105 | #define MAPS_OFFSET 73 | ||
106 | |||
107 | #define syscall 0x0f, 0x05 | ||
108 | #define mov_rdi(x) \ | ||
109 | 0x48, 0xbf, \ | ||
110 | (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \ | ||
111 | ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff | ||
112 | |||
113 | #define mov_rsi(x) \ | ||
114 | 0x48, 0xbe, \ | ||
115 | (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff, \ | ||
116 | ((x)>>32)&0xff, ((x)>>40)&0xff, ((x)>>48)&0xff, ((x)>>56)&0xff | ||
117 | |||
118 | #define mov_eax(x) \ | ||
119 | 0xb8, (x)&0xff, ((x)>>8)&0xff, ((x)>>16)&0xff, ((x)>>24)&0xff | ||
120 | |||
121 | static const uint8_t payload[] = { | ||
122 | /* Casually unmap stack, vDSO and everything else. */ | ||
123 | /* munmap */ | ||
124 | mov_rdi(VADDR + 4096), | ||
125 | mov_rsi((1ULL << 47) - 4096 - VADDR - 4096), | ||
126 | mov_eax(11), | ||
127 | syscall, | ||
128 | |||
129 | /* Ping parent. */ | ||
130 | /* write(0, &c, 1); */ | ||
131 | 0x31, 0xff, /* xor edi, edi */ | ||
132 | 0x48, 0x8d, 0x35, 0x00, 0x00, 0x00, 0x00, /* lea rsi, [rip] */ | ||
133 | 0xba, 0x01, 0x00, 0x00, 0x00, /* mov edx, 1 */ | ||
134 | mov_eax(1), | ||
135 | syscall, | ||
136 | |||
137 | /* 1: pause(); */ | ||
138 | mov_eax(34), | ||
139 | syscall, | ||
140 | |||
141 | 0xeb, 0xf7, /* jmp 1b */ | ||
142 | }; | ||
143 | |||
144 | static int make_exe(const uint8_t *payload, size_t len) | ||
145 | { | ||
146 | struct elf64_hdr h; | ||
147 | struct elf64_phdr ph; | ||
148 | |||
149 | struct iovec iov[3] = { | ||
150 | {&h, sizeof(struct elf64_hdr)}, | ||
151 | {&ph, sizeof(struct elf64_phdr)}, | ||
152 | {(void *)payload, len}, | ||
153 | }; | ||
154 | int fd, fd1; | ||
155 | char buf[64]; | ||
156 | |||
157 | memset(&h, 0, sizeof(h)); | ||
158 | h.e_ident[0] = 0x7f; | ||
159 | h.e_ident[1] = 'E'; | ||
160 | h.e_ident[2] = 'L'; | ||
161 | h.e_ident[3] = 'F'; | ||
162 | h.e_ident[4] = 2; | ||
163 | h.e_ident[5] = 1; | ||
164 | h.e_ident[6] = 1; | ||
165 | h.e_ident[7] = 0; | ||
166 | h.e_type = 2; | ||
167 | h.e_machine = 0x3e; | ||
168 | h.e_version = 1; | ||
169 | h.e_entry = VADDR + sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr); | ||
170 | h.e_phoff = sizeof(struct elf64_hdr); | ||
171 | h.e_shoff = 0; | ||
172 | h.e_flags = 0; | ||
173 | h.e_ehsize = sizeof(struct elf64_hdr); | ||
174 | h.e_phentsize = sizeof(struct elf64_phdr); | ||
175 | h.e_phnum = 1; | ||
176 | h.e_shentsize = 0; | ||
177 | h.e_shnum = 0; | ||
178 | h.e_shstrndx = 0; | ||
179 | |||
180 | memset(&ph, 0, sizeof(ph)); | ||
181 | ph.p_type = 1; | ||
182 | ph.p_flags = (1<<2)|1; | ||
183 | ph.p_offset = 0; | ||
184 | ph.p_vaddr = VADDR; | ||
185 | ph.p_paddr = 0; | ||
186 | ph.p_filesz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + sizeof(payload); | ||
187 | ph.p_memsz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + sizeof(payload); | ||
188 | ph.p_align = 4096; | ||
189 | |||
190 | fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_EXCL|O_TMPFILE, 0700); | ||
191 | if (fd == -1) { | ||
192 | exit(1); | ||
193 | } | ||
194 | |||
195 | if (writev(fd, iov, 3) != sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len) { | ||
196 | exit(1); | ||
197 | } | ||
198 | |||
199 | /* Avoid ETXTBSY on exec. */ | ||
200 | snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd); | ||
201 | fd1 = open(buf, O_RDONLY|O_CLOEXEC); | ||
202 | close(fd); | ||
203 | |||
204 | return fd1; | ||
205 | } | ||
206 | #endif | ||
207 | |||
208 | #ifdef __x86_64__ | ||
209 | int main(void) | ||
210 | { | ||
211 | int pipefd[2]; | ||
212 | int exec_fd; | ||
213 | |||
214 | atexit(ate); | ||
215 | |||
216 | make_private_tmp(); | ||
217 | |||
218 | /* Reserve fd 0 for 1-byte pipe ping from child. */ | ||
219 | close(0); | ||
220 | if (open("/", O_RDONLY|O_DIRECTORY|O_PATH) != 0) { | ||
221 | return 1; | ||
222 | } | ||
223 | |||
224 | exec_fd = make_exe(payload, sizeof(payload)); | ||
225 | |||
226 | if (pipe(pipefd) == -1) { | ||
227 | return 1; | ||
228 | } | ||
229 | if (dup2(pipefd[1], 0) != 0) { | ||
230 | return 1; | ||
231 | } | ||
232 | |||
233 | pid = fork(); | ||
234 | if (pid == -1) { | ||
235 | return 1; | ||
236 | } | ||
237 | if (pid == 0) { | ||
238 | sys_execveat(exec_fd, "", NULL, NULL, AT_EMPTY_PATH); | ||
239 | return 1; | ||
240 | } | ||
241 | |||
242 | char _; | ||
243 | if (read(pipefd[0], &_, 1) != 1) { | ||
244 | return 1; | ||
245 | } | ||
246 | |||
247 | struct stat st; | ||
248 | if (fstat(exec_fd, &st) == -1) { | ||
249 | return 1; | ||
250 | } | ||
251 | |||
252 | /* Generate "head -n1 /proc/$PID/maps" */ | ||
253 | char buf0[256]; | ||
254 | memset(buf0, ' ', sizeof(buf0)); | ||
255 | int len = snprintf(buf0, sizeof(buf0), | ||
256 | "%08lx-%08lx r-xp 00000000 %02lx:%02lx %llu", | ||
257 | VADDR, VADDR + PAGE_SIZE, | ||
258 | MAJOR(st.st_dev), MINOR(st.st_dev), | ||
259 | (unsigned long long)st.st_ino); | ||
260 | buf0[len] = ' '; | ||
261 | snprintf(buf0 + MAPS_OFFSET, sizeof(buf0) - MAPS_OFFSET, | ||
262 | "/tmp/#%llu (deleted)\n", (unsigned long long)st.st_ino); | ||
263 | |||
264 | |||
265 | /* Test /proc/$PID/maps */ | ||
266 | { | ||
267 | char buf[256]; | ||
268 | ssize_t rv; | ||
269 | int fd; | ||
270 | |||
271 | snprintf(buf, sizeof(buf), "/proc/%u/maps", pid); | ||
272 | fd = open(buf, O_RDONLY); | ||
273 | if (fd == -1) { | ||
274 | return 1; | ||
275 | } | ||
276 | rv = read(fd, buf, sizeof(buf)); | ||
277 | assert(rv == strlen(buf0)); | ||
278 | assert(memcmp(buf, buf0, strlen(buf0)) == 0); | ||
279 | } | ||
280 | |||
281 | /* Test /proc/$PID/smaps */ | ||
282 | { | ||
283 | char buf[1024]; | ||
284 | ssize_t rv; | ||
285 | int fd; | ||
286 | |||
287 | snprintf(buf, sizeof(buf), "/proc/%u/smaps", pid); | ||
288 | fd = open(buf, O_RDONLY); | ||
289 | if (fd == -1) { | ||
290 | return 1; | ||
291 | } | ||
292 | rv = read(fd, buf, sizeof(buf)); | ||
293 | assert(0 <= rv && rv <= sizeof(buf)); | ||
294 | |||
295 | assert(rv >= strlen(buf0)); | ||
296 | assert(memcmp(buf, buf0, strlen(buf0)) == 0); | ||
297 | |||
298 | #define RSS1 "Rss: 4 kB\n" | ||
299 | #define RSS2 "Rss: 0 kB\n" | ||
300 | #define PSS1 "Pss: 4 kB\n" | ||
301 | #define PSS2 "Pss: 0 kB\n" | ||
302 | assert(memmem(buf, rv, RSS1, strlen(RSS1)) || | ||
303 | memmem(buf, rv, RSS2, strlen(RSS2))); | ||
304 | assert(memmem(buf, rv, PSS1, strlen(PSS1)) || | ||
305 | memmem(buf, rv, PSS2, strlen(PSS2))); | ||
306 | |||
307 | static const char *S[] = { | ||
308 | "Size: 4 kB\n", | ||
309 | "KernelPageSize: 4 kB\n", | ||
310 | "MMUPageSize: 4 kB\n", | ||
311 | "Anonymous: 0 kB\n", | ||
312 | "AnonHugePages: 0 kB\n", | ||
313 | "Shared_Hugetlb: 0 kB\n", | ||
314 | "Private_Hugetlb: 0 kB\n", | ||
315 | "Locked: 0 kB\n", | ||
316 | }; | ||
317 | int i; | ||
318 | |||
319 | for (i = 0; i < sizeof(S)/sizeof(S[0]); i++) { | ||
320 | assert(memmem(buf, rv, S[i], strlen(S[i]))); | ||
321 | } | ||
322 | } | ||
323 | |||
324 | /* Test /proc/$PID/smaps_rollup */ | ||
325 | { | ||
326 | char bufr[256]; | ||
327 | memset(bufr, ' ', sizeof(bufr)); | ||
328 | len = snprintf(bufr, sizeof(bufr), | ||
329 | "%08lx-%08lx ---p 00000000 00:00 0", | ||
330 | VADDR, VADDR + PAGE_SIZE); | ||
331 | bufr[len] = ' '; | ||
332 | snprintf(bufr + MAPS_OFFSET, sizeof(bufr) - MAPS_OFFSET, | ||
333 | "[rollup]\n"); | ||
334 | |||
335 | char buf[1024]; | ||
336 | ssize_t rv; | ||
337 | int fd; | ||
338 | |||
339 | snprintf(buf, sizeof(buf), "/proc/%u/smaps_rollup", pid); | ||
340 | fd = open(buf, O_RDONLY); | ||
341 | if (fd == -1) { | ||
342 | return 1; | ||
343 | } | ||
344 | rv = read(fd, buf, sizeof(buf)); | ||
345 | assert(0 <= rv && rv <= sizeof(buf)); | ||
346 | |||
347 | assert(rv >= strlen(bufr)); | ||
348 | assert(memcmp(buf, bufr, strlen(bufr)) == 0); | ||
349 | |||
350 | assert(memmem(buf, rv, RSS1, strlen(RSS1)) || | ||
351 | memmem(buf, rv, RSS2, strlen(RSS2))); | ||
352 | assert(memmem(buf, rv, PSS1, strlen(PSS1)) || | ||
353 | memmem(buf, rv, PSS2, strlen(PSS2))); | ||
354 | |||
355 | static const char *S[] = { | ||
356 | "Anonymous: 0 kB\n", | ||
357 | "AnonHugePages: 0 kB\n", | ||
358 | "Shared_Hugetlb: 0 kB\n", | ||
359 | "Private_Hugetlb: 0 kB\n", | ||
360 | "Locked: 0 kB\n", | ||
361 | }; | ||
362 | int i; | ||
363 | |||
364 | for (i = 0; i < sizeof(S)/sizeof(S[0]); i++) { | ||
365 | assert(memmem(buf, rv, S[i], strlen(S[i]))); | ||
366 | } | ||
367 | } | ||
368 | |||
369 | /* Test /proc/$PID/statm */ | ||
370 | { | ||
371 | char buf[64]; | ||
372 | ssize_t rv; | ||
373 | int fd; | ||
374 | |||
375 | snprintf(buf, sizeof(buf), "/proc/%u/statm", pid); | ||
376 | fd = open(buf, O_RDONLY); | ||
377 | if (fd == -1) { | ||
378 | return 1; | ||
379 | } | ||
380 | rv = read(fd, buf, sizeof(buf)); | ||
381 | assert(rv == 7 * 2); | ||
382 | |||
383 | assert(buf[0] == '1'); /* ->total_vm */ | ||
384 | assert(buf[1] == ' '); | ||
385 | assert(buf[2] == '0' || buf[2] == '1'); /* rss */ | ||
386 | assert(buf[3] == ' '); | ||
387 | assert(buf[4] == '0' || buf[4] == '1'); /* file rss */ | ||
388 | assert(buf[5] == ' '); | ||
389 | assert(buf[6] == '1'); /* ELF executable segments */ | ||
390 | assert(buf[7] == ' '); | ||
391 | assert(buf[8] == '0'); | ||
392 | assert(buf[9] == ' '); | ||
393 | assert(buf[10] == '0'); /* ->data_vm + ->stack_vm */ | ||
394 | assert(buf[11] == ' '); | ||
395 | assert(buf[12] == '0'); | ||
396 | assert(buf[13] == '\n'); | ||
397 | } | ||
398 | |||
399 | return 0; | ||
400 | } | ||
401 | #else | ||
402 | int main(void) | ||
403 | { | ||
404 | return 4; | ||
405 | } | ||
406 | #endif | ||
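Note: the make_exe() helper above sidesteps ETXTBSY by reopening the freshly written O_TMPFILE descriptor read-only through /proc/self/fd before the child execs it; exec fails with ETXTBSY while any writable descriptor to the file is still open. A minimal standalone sketch of that reopen trick, not part of the patch, assuming a writable /tmp and a kernel with O_TMPFILE (the "write the image" step is elided):

#define _GNU_SOURCE		/* for O_TMPFILE */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Writable, unnamed file in /tmp. */
	int wr_fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_EXCL|O_TMPFILE, 0700);
	char path[64];
	int rd_fd;

	if (wr_fd == -1)
		return 1;

	/* ... the executable image would be written into wr_fd here ... */

	/* Reopen the same inode read-only, then drop the writable fd so a
	 * later execveat() of rd_fd cannot fail with ETXTBSY. */
	snprintf(path, sizeof(path), "/proc/self/fd/%d", wr_fd);
	rd_fd = open(path, O_RDONLY|O_CLOEXEC);
	close(wr_fd);

	return rd_fd == -1 ? 1 : 0;
}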
diff --git a/tools/testing/selftests/proc/proc-self-map-files-002.c b/tools/testing/selftests/proc/proc-self-map-files-002.c index 85744425b08d..762cb01f2ca7 100644 --- a/tools/testing/selftests/proc/proc-self-map-files-002.c +++ b/tools/testing/selftests/proc/proc-self-map-files-002.c | |||
@@ -63,7 +63,7 @@ int main(void) | |||
63 | p = mmap((void *)va, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0); | 63 | p = mmap((void *)va, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0); |
64 | if (p == MAP_FAILED) { | 64 | if (p == MAP_FAILED) { |
65 | if (errno == EPERM) | 65 | if (errno == EPERM) |
66 | return 2; | 66 | return 4; |
67 | return 1; | 67 | return 1; |
68 | } | 68 | } |
69 | 69 | ||
diff --git a/tools/testing/selftests/proc/proc-self-syscall.c b/tools/testing/selftests/proc/proc-self-syscall.c index 5ab5f4810e43..9f6d000c0245 100644 --- a/tools/testing/selftests/proc/proc-self-syscall.c +++ b/tools/testing/selftests/proc/proc-self-syscall.c | |||
@@ -20,7 +20,6 @@ | |||
20 | #include <sys/stat.h> | 20 | #include <sys/stat.h> |
21 | #include <fcntl.h> | 21 | #include <fcntl.h> |
22 | #include <errno.h> | 22 | #include <errno.h> |
23 | #include <unistd.h> | ||
24 | #include <string.h> | 23 | #include <string.h> |
25 | #include <stdio.h> | 24 | #include <stdio.h> |
26 | 25 | ||
@@ -39,7 +38,7 @@ int main(void) | |||
39 | fd = open("/proc/self/syscall", O_RDONLY); | 38 | fd = open("/proc/self/syscall", O_RDONLY); |
40 | if (fd == -1) { | 39 | if (fd == -1) { |
41 | if (errno == ENOENT) | 40 | if (errno == ENOENT) |
42 | return 2; | 41 | return 4; |
43 | return 1; | 42 | return 1; |
44 | } | 43 | } |
45 | 44 | ||
diff --git a/tools/testing/selftests/proc/proc-self-wchan.c b/tools/testing/selftests/proc/proc-self-wchan.c index a38b2fbaa7ad..b467b98a457d 100644 --- a/tools/testing/selftests/proc/proc-self-wchan.c +++ b/tools/testing/selftests/proc/proc-self-wchan.c | |||
@@ -27,7 +27,7 @@ int main(void) | |||
27 | fd = open("/proc/self/wchan", O_RDONLY); | 27 | fd = open("/proc/self/wchan", O_RDONLY); |
28 | if (fd == -1) { | 28 | if (fd == -1) { |
29 | if (errno == ENOENT) | 29 | if (errno == ENOENT) |
30 | return 2; | 30 | return 4; |
31 | return 1; | 31 | return 1; |
32 | } | 32 | } |
33 | 33 | ||
diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c index 563e752e6eba..b3ef9e14d6cc 100644 --- a/tools/testing/selftests/proc/read.c +++ b/tools/testing/selftests/proc/read.c | |||
@@ -26,8 +26,10 @@ | |||
26 | #include <dirent.h> | 26 | #include <dirent.h> |
27 | #include <stdbool.h> | 27 | #include <stdbool.h> |
28 | #include <stdlib.h> | 28 | #include <stdlib.h> |
29 | #include <stdio.h> | ||
29 | #include <string.h> | 30 | #include <string.h> |
30 | #include <sys/stat.h> | 31 | #include <sys/stat.h> |
32 | #include <sys/vfs.h> | ||
31 | #include <fcntl.h> | 33 | #include <fcntl.h> |
32 | #include <unistd.h> | 34 | #include <unistd.h> |
33 | 35 | ||
@@ -123,10 +125,22 @@ static void f(DIR *d, unsigned int level) | |||
123 | int main(void) | 125 | int main(void) |
124 | { | 126 | { |
125 | DIR *d; | 127 | DIR *d; |
128 | struct statfs sfs; | ||
126 | 129 | ||
127 | d = opendir("/proc"); | 130 | d = opendir("/proc"); |
128 | if (!d) | 131 | if (!d) |
132 | return 4; | ||
133 | |||
134 | /* Ensure /proc is proc. */ | ||
135 | if (fstatfs(dirfd(d), &sfs) == -1) { | ||
136 | return 1; | ||
137 | } | ||
138 | if (sfs.f_type != 0x9fa0) { | ||
139 | fprintf(stderr, "error: unexpected f_type %lx\n", (long)sfs.f_type); | ||
129 | return 2; | 140 | return 2; |
141 | } | ||
142 | |||
130 | f(d, 0); | 143 | f(d, 0); |
144 | |||
131 | return 0; | 145 | return 0; |
132 | } | 146 | } |
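For reference, the fstatfs() check added above compares f_type against the raw procfs magic number 0x9fa0, which is PROC_SUPER_MAGIC from <linux/magic.h>. A small sketch of the same check written against the named constant; this is an illustration only, using statfs() on a path rather than the patch's dirfd()+fstatfs():

#include <linux/magic.h>	/* PROC_SUPER_MAGIC == 0x9fa0 */
#include <sys/vfs.h>
#include <stdio.h>

int main(void)
{
	struct statfs sfs;

	if (statfs("/proc", &sfs) == -1)
		return 1;

	/* Bail out if whatever is mounted on /proc is not procfs. */
	if (sfs.f_type != PROC_SUPER_MAGIC) {
		fprintf(stderr, "unexpected f_type %lx\n", (long)sfs.f_type);
		return 2;
	}
	return 0;
}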
diff --git a/tools/testing/selftests/tmpfs/.gitignore b/tools/testing/selftests/tmpfs/.gitignore new file mode 100644 index 000000000000..a96838fad74d --- /dev/null +++ b/tools/testing/selftests/tmpfs/.gitignore | |||
@@ -0,0 +1 @@ | |||
/bug-link-o-tmpfile | |||
diff --git a/tools/testing/selftests/tmpfs/Makefile b/tools/testing/selftests/tmpfs/Makefile new file mode 100644 index 000000000000..953c81299181 --- /dev/null +++ b/tools/testing/selftests/tmpfs/Makefile | |||
@@ -0,0 +1,7 @@ | |||
1 | CFLAGS += -Wall -O2 | ||
2 | CFLAGS += -D_GNU_SOURCE | ||
3 | |||
4 | TEST_GEN_PROGS := | ||
5 | TEST_GEN_PROGS += bug-link-o-tmpfile | ||
6 | |||
7 | include ../lib.mk | ||
diff --git a/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c new file mode 100644 index 000000000000..b5c3ddb90942 --- /dev/null +++ b/tools/testing/selftests/tmpfs/bug-link-o-tmpfile.c | |||
@@ -0,0 +1,67 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2019 Alexey Dobriyan <adobriyan@gmail.com> | ||
3 | * | ||
4 | * Permission to use, copy, modify, and distribute this software for any | ||
5 | * purpose with or without fee is hereby granted, provided that the above | ||
6 | * copyright notice and this permission notice appear in all copies. | ||
7 | * | ||
8 | * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | ||
9 | * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | ||
10 | * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR | ||
11 | * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | ||
12 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN | ||
13 | * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF | ||
14 | * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | ||
15 | */ | ||
16 | /* Test that open(O_TMPFILE) followed by linkat() doesn't break inode accounting. */ | ||
17 | #include <errno.h> | ||
18 | #include <sched.h> | ||
19 | #include <stdio.h> | ||
20 | #include <sys/types.h> | ||
21 | #include <sys/stat.h> | ||
22 | #include <fcntl.h> | ||
23 | #include <sys/mount.h> | ||
24 | #include <unistd.h> | ||
25 | |||
26 | int main(void) | ||
27 | { | ||
28 | int fd; | ||
29 | |||
30 | if (unshare(CLONE_NEWNS) == -1) { | ||
31 | if (errno == ENOSYS || errno == EPERM) { | ||
32 | fprintf(stderr, "error: unshare, errno %d\n", errno); | ||
33 | return 4; | ||
34 | } | ||
35 | fprintf(stderr, "error: unshare, errno %d\n", errno); | ||
36 | return 1; | ||
37 | } | ||
38 | if (mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL) == -1) { | ||
39 | fprintf(stderr, "error: mount '/', errno %d\n", errno); | ||
40 | return 1; | ||
41 | } | ||
42 | |||
43 | /* Our heroes: 1 root inode, 1 O_TMPFILE inode, 1 permanent inode. */ | ||
44 | if (mount(NULL, "/tmp", "tmpfs", 0, "nr_inodes=3") == -1) { | ||
45 | fprintf(stderr, "error: mount tmpfs, errno %d\n", errno); | ||
46 | return 1; | ||
47 | } | ||
48 | |||
49 | fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600); | ||
50 | if (fd == -1) { | ||
51 | fprintf(stderr, "error: open 1, errno %d\n", errno); | ||
52 | return 1; | ||
53 | } | ||
54 | if (linkat(fd, "", AT_FDCWD, "/tmp/1", AT_EMPTY_PATH) == -1) { | ||
55 | fprintf(stderr, "error: linkat, errno %d\n", errno); | ||
56 | return 1; | ||
57 | } | ||
58 | close(fd); | ||
59 | |||
60 | fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600); | ||
61 | if (fd == -1) { | ||
62 | fprintf(stderr, "error: open 2, errno %d\n", errno); | ||
63 | return 1; | ||
64 | } | ||
65 | |||
66 | return 0; | ||
67 | } | ||
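The pattern exercised by bug-link-o-tmpfile.c is: create an unnamed O_TMPFILE inode, then give it a permanent name with linkat(AT_EMPTY_PATH), and check that the temporary inode is accounted correctly. A minimal sketch of just that pattern, not the selftest itself; like the selftest it assumes a writable /tmp and sufficient privilege for AT_EMPTY_PATH, and the path name is hypothetical:

#define _GNU_SOURCE		/* for O_TMPFILE and AT_EMPTY_PATH */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Unnamed file: it holds an inode but no directory entry yet. */
	int fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_TMPFILE, 0600);

	if (fd == -1)
		return 1;

	/* Materialize it under a permanent name (hypothetical path). */
	if (linkat(fd, "", AT_FDCWD, "/tmp/linked-example", AT_EMPTY_PATH) == -1) {
		close(fd);
		return 1;
	}
	close(fd);
	unlink("/tmp/linked-example");
	return 0;
}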
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index 584a91ae4a8f..951c507a27f7 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests | |||
@@ -211,4 +211,20 @@ else | |||
211 | echo "[PASS]" | 211 | echo "[PASS]" |
212 | fi | 212 | fi |
213 | 213 | ||
214 | echo "------------------------------------" | ||
215 | echo "running vmalloc stability smoke test" | ||
216 | echo "------------------------------------" | ||
217 | ./test_vmalloc.sh smoke | ||
218 | ret_val=$? | ||
219 | |||
220 | if [ $ret_val -eq 0 ]; then | ||
221 | echo "[PASS]" | ||
222 | elif [ $ret_val -eq $ksft_skip ]; then | ||
223 | echo "[SKIP]" | ||
224 | exitcode=$ksft_skip | ||
225 | else | ||
226 | echo "[FAIL]" | ||
227 | exitcode=1 | ||
228 | fi | ||
229 | |||
214 | exit $exitcode | 230 | exit $exitcode |
diff --git a/tools/testing/selftests/vm/test_vmalloc.sh b/tools/testing/selftests/vm/test_vmalloc.sh new file mode 100644 index 000000000000..06d2bb109f06 --- /dev/null +++ b/tools/testing/selftests/vm/test_vmalloc.sh | |||
@@ -0,0 +1,176 @@ | |||
1 | #!/bin/bash | ||
2 | # SPDX-License-Identifier: GPL-2.0 | ||
3 | # | ||
4 | # Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@gmail.com> | ||
5 | # | ||
6 | # This is a test script for the kernel test driver that analyses the | ||
7 | # vmalloc allocator. It is essentially a kernel module loader: you can | ||
8 | # specify and pass different parameters in order to: | ||
9 | # a) analyse the performance of vmalloc allocations; | ||
10 | # b) stress the vmalloc subsystem and check its stability. | ||
11 | |||
12 | TEST_NAME="vmalloc" | ||
13 | DRIVER="test_${TEST_NAME}" | ||
14 | |||
15 | # Exit code is 1 on failure. | ||
16 | exitcode=1 | ||
17 | |||
18 | # Kselftest framework requirement - SKIP code is 4. | ||
19 | ksft_skip=4 | ||
20 | |||
21 | # | ||
22 | # Static templates for performance, stress and smoke tests. | ||
23 | # It is also possible to pass any supported parameters manually. | ||
24 | # | ||
25 | PERF_PARAM="single_cpu_test=1 sequential_test_order=1 test_repeat_count=3" | ||
26 | SMOKE_PARAM="single_cpu_test=1 test_loop_count=10000 test_repeat_count=10" | ||
27 | STRESS_PARAM="test_repeat_count=20" | ||
28 | |||
29 | check_test_requirements() | ||
30 | { | ||
31 | uid=$(id -u) | ||
32 | if [ $uid -ne 0 ]; then | ||
33 | echo "$0: Must be run as root" | ||
34 | exit $ksft_skip | ||
35 | fi | ||
36 | |||
37 | if ! which modprobe > /dev/null 2>&1; then | ||
38 | echo "$0: You need modprobe installed" | ||
39 | exit $ksft_skip | ||
40 | fi | ||
41 | |||
42 | if ! modinfo $DRIVER > /dev/null 2>&1; then | ||
43 | echo "$0: You must have the following enabled in your kernel:" | ||
44 | echo "CONFIG_TEST_VMALLOC=m" | ||
45 | exit $ksft_skip | ||
46 | fi | ||
47 | } | ||
48 | |||
49 | run_performance_check() | ||
50 | { | ||
51 | echo "Run performance tests to evaluate how fast vmalloc allocation is." | ||
52 | echo "It runs all test cases on one single CPU with sequential order." | ||
53 | |||
54 | modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1 | ||
55 | echo "Done." | ||
56 | echo "Ccheck the kernel message buffer to see the summary." | ||
57 | } | ||
58 | |||
59 | run_stability_check() | ||
60 | { | ||
61 | echo "Run stability tests. In order to stress vmalloc subsystem we run" | ||
62 | echo "all available test cases on all available CPUs simultaneously." | ||
63 | echo "It will take time, so be patient." | ||
64 | |||
65 | modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1 | ||
66 | echo "Done." | ||
67 | echo "Check the kernel ring buffer to see the summary." | ||
68 | } | ||
69 | |||
70 | run_smoke_check() | ||
71 | { | ||
72 | echo "Run smoke test. Note, this test provides basic coverage." | ||
73 | echo "Please check $0 output how it can be used" | ||
74 | echo "for deep performance analysis as well as stress testing." | ||
75 | |||
76 | modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1 | ||
77 | echo "Done." | ||
78 | echo "Check the kernel ring buffer to see the summary." | ||
79 | } | ||
80 | |||
81 | usage() | ||
82 | { | ||
83 | echo -n "Usage: $0 [ performance ] | [ stress ] | | [ smoke ] | " | ||
84 | echo "manual parameters" | ||
85 | echo | ||
86 | echo "Valid tests and parameters:" | ||
87 | echo | ||
88 | modinfo $DRIVER | ||
89 | echo | ||
90 | echo "Example usage:" | ||
91 | echo | ||
92 | echo "# Shows help message" | ||
93 | echo "./${DRIVER}.sh" | ||
94 | echo | ||
95 | echo "# Runs 1 test(id_1), repeats it 5 times on all online CPUs" | ||
96 | echo "./${DRIVER}.sh run_test_mask=1 test_repeat_count=5" | ||
97 | echo | ||
98 | echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with " | ||
99 | echo "sequential order" | ||
100 | echo -n "./${DRIVER}.sh single_cpu_test=1 sequential_test_order=1 " | ||
101 | echo "run_test_mask=23" | ||
102 | echo | ||
103 | echo -n "# Runs all tests on all online CPUs, shuffled order, repeats " | ||
104 | echo "20 times" | ||
105 | echo "./${DRIVER}.sh test_repeat_count=20" | ||
106 | echo | ||
107 | echo "# Performance analysis" | ||
108 | echo "./${DRIVER}.sh performance" | ||
109 | echo | ||
110 | echo "# Stress testing" | ||
111 | echo "./${DRIVER}.sh stress" | ||
112 | echo | ||
113 | exit 0 | ||
114 | } | ||
115 | |||
116 | function validate_passed_args() | ||
117 | { | ||
118 | VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'` | ||
119 | |||
120 | # | ||
121 | # Something has been passed, check it. | ||
122 | # | ||
123 | for passed_arg in $@; do | ||
124 | key=${passed_arg//=*/} | ||
125 | val="${passed_arg:$((${#key}+1))}" | ||
126 | valid=0 | ||
127 | |||
128 | for valid_arg in $VALID_ARGS; do | ||
129 | if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then | ||
130 | valid=1 | ||
131 | break | ||
132 | fi | ||
133 | done | ||
134 | |||
135 | if [[ $valid -ne 1 ]]; then | ||
136 | echo "Error: key or value is not correct: ${key} $val" | ||
137 | exit $exitcode | ||
138 | fi | ||
139 | done | ||
140 | } | ||
141 | |||
142 | function run_manual_check() | ||
143 | { | ||
144 | # | ||
145 | # Validate passed parameters. If a wrong one is found, | ||
146 | # the script exits and does not execute further. | ||
147 | # | ||
148 | validate_passed_args $@ | ||
149 | |||
150 | echo "Run the test with following parameters: $@" | ||
151 | modprobe $DRIVER $@ > /dev/null 2>&1 | ||
152 | echo "Done." | ||
153 | echo "Check the kernel ring buffer to see the summary." | ||
154 | } | ||
155 | |||
156 | function run_test() | ||
157 | { | ||
158 | if [ $# -eq 0 ]; then | ||
159 | usage | ||
160 | else | ||
161 | if [[ "$1" = "performance" ]]; then | ||
162 | run_performance_check | ||
163 | elif [[ "$1" = "stress" ]]; then | ||
164 | run_stability_check | ||
165 | elif [[ "$1" = "smoke" ]]; then | ||
166 | run_smoke_check | ||
167 | else | ||
168 | run_manual_check $@ | ||
169 | fi | ||
170 | fi | ||
171 | } | ||
172 | |||
173 | check_test_requirements | ||
174 | run_test $@ | ||
175 | |||
176 | exit 0 | ||
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 1ff3a6c0367b..6f64b2b93234 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c | |||
@@ -133,7 +133,7 @@ static const char * const page_flag_names[] = { | |||
133 | [KPF_NOPAGE] = "n:nopage", | 133 | [KPF_NOPAGE] = "n:nopage", |
134 | [KPF_KSM] = "x:ksm", | 134 | [KPF_KSM] = "x:ksm", |
135 | [KPF_THP] = "t:thp", | 135 | [KPF_THP] = "t:thp", |
136 | [KPF_BALLOON] = "o:balloon", | 136 | [KPF_OFFLINE] = "o:offline", |
137 | [KPF_PGTABLE] = "g:pgtable", | 137 | [KPF_PGTABLE] = "g:pgtable", |
138 | [KPF_ZERO_PAGE] = "z:zero_page", | 138 | [KPF_ZERO_PAGE] = "z:zero_page", |
139 | [KPF_IDLE] = "i:idle_page", | 139 | [KPF_IDLE] = "i:idle_page", |
diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 334b16db0ebb..73818f1b2ef8 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c | |||
@@ -110,39 +110,42 @@ static void fatal(const char *x, ...) | |||
110 | static void usage(void) | 110 | static void usage(void) |
111 | { | 111 | { |
112 | printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n" | 112 | printf("slabinfo 4/15/2011. (c) 2007 sgi/(c) 2011 Linux Foundation.\n\n" |
113 | "slabinfo [-ahnpvtsz] [-d debugopts] [slab-regexp]\n" | 113 | "slabinfo [-aADefhilnosrStTvz1LXBU] [N=K] [-dafzput] [slab-regexp]\n" |
114 | "-a|--aliases Show aliases\n" | 114 | "-a|--aliases Show aliases\n" |
115 | "-A|--activity Most active slabs first\n" | 115 | "-A|--activity Most active slabs first\n" |
116 | "-d<options>|--debug=<options> Set/Clear Debug options\n" | 116 | "-B|--Bytes Show size in bytes\n" |
117 | "-D|--display-active Switch line format to activity\n" | 117 | "-D|--display-active Switch line format to activity\n" |
118 | "-e|--empty Show empty slabs\n" | 118 | "-e|--empty Show empty slabs\n" |
119 | "-f|--first-alias Show first alias\n" | 119 | "-f|--first-alias Show first alias\n" |
120 | "-h|--help Show usage information\n" | 120 | "-h|--help Show usage information\n" |
121 | "-i|--inverted Inverted list\n" | 121 | "-i|--inverted Inverted list\n" |
122 | "-l|--slabs Show slabs\n" | 122 | "-l|--slabs Show slabs\n" |
123 | "-L|--Loss Sort by loss\n" | ||
123 | "-n|--numa Show NUMA information\n" | 124 | "-n|--numa Show NUMA information\n" |
124 | "-o|--ops Show kmem_cache_ops\n" | 125 | "-N|--lines=K Show the first K slabs\n" |
126 | "-o|--ops Show kmem_cache_ops\n" | ||
127 | "-r|--report Detailed report on single slabs\n" | ||
125 | "-s|--shrink Shrink slabs\n" | 128 | "-s|--shrink Shrink slabs\n" |
126 | "-r|--report Detailed report on single slabs\n" | ||
127 | "-S|--Size Sort by size\n" | 129 | "-S|--Size Sort by size\n" |
128 | "-t|--tracking Show alloc/free information\n" | 130 | "-t|--tracking Show alloc/free information\n" |
129 | "-T|--Totals Show summary information\n" | 131 | "-T|--Totals Show summary information\n" |
132 | "-U|--Unreclaim Show unreclaimable slabs only\n" | ||
130 | "-v|--validate Validate slabs\n" | 133 | "-v|--validate Validate slabs\n" |
131 | "-z|--zero Include empty slabs\n" | 134 | "-z|--zero Include empty slabs\n" |
132 | "-1|--1ref Single reference\n" | 135 | "-1|--1ref Single reference\n" |
133 | "-N|--lines=K Show the first K slabs\n" | ||
134 | "-L|--Loss Sort by loss\n" | ||
135 | "-X|--Xtotals Show extended summary information\n" | 136 | "-X|--Xtotals Show extended summary information\n" |
136 | "-B|--Bytes Show size in bytes\n" | 137 | |
137 | "-U|--Unreclaim Show unreclaimable slabs only\n" | 138 | "\n" |
138 | "\nValid debug options (FZPUT may be combined)\n" | 139 | "-d | --debug Switch off all debug options\n" |
139 | "a / A Switch on all debug options (=FZUP)\n" | 140 | "-da | --debug=a Switch on all debug options (--debug=FZPU)\n" |
140 | "- Switch off all debug options\n" | 141 | |
141 | "f / F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n" | 142 | "\n" |
142 | "z / Z Redzoning\n" | 143 | "-d[afzput] | --debug=[afzput]\n" |
143 | "p / P Poisoning\n" | 144 | " f | F Sanity Checks (SLAB_CONSISTENCY_CHECKS)\n" |
144 | "u / U Tracking\n" | 145 | " z | Z Redzoning\n" |
145 | "t / T Tracing\n" | 146 | " p | P Poisoning\n" |
147 | " u | U Tracking\n" | ||
148 | " t | T Tracing\n" | ||
146 | ); | 149 | ); |
147 | } | 150 | } |
148 | 151 | ||