author		Linus Torvalds <torvalds@linux-foundation.org>	2019-09-24 19:10:23 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-09-24 19:10:23 -0400
commit		9c9fa97a8edbc3668dfc7a25de516e80c146e86f (patch)
tree		2dc0e90203796a4b346ce190f9521c3294104058
parent		5184d449600f501a8688069f35c138c6b3bf8b94 (diff)
parent		2b38d01b4de8b1bbda7f5f7e91252609557635fc (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
- a few hot fixes
- ocfs2 updates
- almost all of -mm (slab-generic, slab, slub, kmemleak, kasan,
cleanups, debug, pagecache, memcg, gup, pagemap, memory-hotplug,
sparsemem, vmalloc, initialization, z3fold, compaction, mempolicy,
oom-kill, hugetlb, migration, thp, mmap, madvise, shmem, zswap,
zsmalloc)
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (132 commits)
mm/zsmalloc.c: fix a -Wunused-function warning
zswap: do not map same object twice
zswap: use movable memory if zpool support allocate movable memory
zpool: add malloc_support_movable to zpool_driver
shmem: fix obsolete comment in shmem_getpage_gfp()
mm/madvise: reduce code duplication in error handling paths
mm: mmap: increase sockets maximum memory size pgoff for 32bits
mm/mmap.c: refine find_vma_prev() with rb_last()
riscv: make mmap allocation top-down by default
mips: use generic mmap top-down layout and brk randomization
mips: replace arch specific way to determine 32bit task with generic version
mips: adjust brk randomization offset to fit generic version
mips: use STACK_TOP when computing mmap base address
mips: properly account for stack randomization and stack guard gap
arm: use generic mmap top-down layout and brk randomization
arm: use STACK_TOP when computing mmap base address
arm: properly account for stack randomization and stack guard gap
arm64, mm: make randomization selected by generic topdown mmap layout
arm64, mm: move generic mmap layout functions to mm
arm64: consider stack randomization for mmap base only when necessary
...
204 files changed, 2273 insertions(+), 2444 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
index 29601d93a1c2..ed35833ad7f0 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -429,10 +429,15 @@ KernelVersion:	2.6.22
 Contact:	Pekka Enberg <penberg@cs.helsinki.fi>,
 		Christoph Lameter <cl@linux-foundation.org>
 Description:
-		The shrink file is written when memory should be reclaimed from
-		a cache. Empty partial slabs are freed and the partial list is
-		sorted so the slabs with the fewest available objects are used
-		first.
+		The shrink file is used to reclaim unused slab cache
+		memory from a cache. Empty per-cpu or partial slabs
+		are freed and the partial list is sorted so the slabs
+		with the fewest available objects are used first.
+		It only accepts a value of "1" on write for shrinking
+		the cache. Other input values are considered invalid.
+		Shrinking slab caches might be expensive and can
+		adversely impact other running applications. So it
+		should be used with care.

 What:		/sys/kernel/slab/cache/slab_size
 Date:		May 2007
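For reference, the interface documented in the hunk above is driven by a single write; a minimal usage sketch (the cache name "dentry" is only an illustration -- any directory under /sys/kernel/slab/ behaves the same way):

	# echo 1 > /sys/kernel/slab/dentry/shrink

Any value other than "1" is rejected as invalid, per the new wording.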
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 41bdc038dad9..0ae4f564c2d6 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -85,8 +85,10 @@ Brief summary of control files.
  memory.oom_control		     set/show oom controls.
  memory.numa_stat		     show the number of memory usage per numa
				     node
-
  memory.kmem.limit_in_bytes	     set/show hard limit for kernel memory
+				     This knob is deprecated and shouldn't be
+				     used. It is planned that this be removed in
+				     the foreseeable future.
  memory.kmem.usage_in_bytes	     show current kernel memory allocation
  memory.kmem.failcnt		     show the number of kernel memory usage
				     hits limits
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 254d8a369f32..944e03e29f65 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -809,6 +809,8 @@
 			enables the feature at boot time. By default, it is
 			disabled and the system will work mostly the same as a
 			kernel built without CONFIG_DEBUG_PAGEALLOC.
+			Note: to get most of debug_pagealloc error reports, it's
+			useful to also enable the page_owner functionality.
 			on: enable the feature

 	debugpat	[X86] Enable PAT debugging
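To make the note above concrete, the two facilities are typically enabled together on the boot command line (parameter names as documented in this file; what actually gets reported still depends on the kernel configuration):

	debug_pagealloc=on page_owner=on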
diff --git a/arch/Kconfig b/arch/Kconfig
index 0fcf8ec1e098..5f8a5d84dbbe 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -706,6 +706,17 @@ config HAVE_ARCH_COMPAT_MMAP_BASES
 	  and vice-versa 32-bit applications to call 64-bit mmap().
 	  Required for applications doing different bitness syscalls.

+# This allows to use a set of generic functions to determine mmap base
+# address by giving priority to top-down scheme only if the process
+# is not in legacy mode (compat task, unlimited stack size or
+# sysctl_legacy_va_layout).
+# Architecture that selects this option can provide its own version of:
+# - STACK_RND_MASK
+config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+	bool
+	depends on MMU
+	select ARCH_HAS_ELF_RANDOMIZE
+
 config HAVE_COPY_THREAD_TLS
 	bool
 	help
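The policy this new option opts an architecture into is easiest to see in the per-arch copies that later hunks of this merge delete. A condensed C sketch of that shared logic, taken from those removed copies (the new generic code additionally accounts for STACK_RND_MASK and the stack guard gap when sizing the stack reservation):

	static int mmap_is_legacy(struct rlimit *rlim_stack)
	{
		if (current->personality & ADDR_COMPAT_LAYOUT)
			return 1;
		if (rlim_stack->rlim_cur == RLIM_INFINITY)
			return 1;
		return sysctl_legacy_va_layout;
	}

	void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
	{
		unsigned long random_factor = 0UL;

		if (current->flags & PF_RANDOMIZE)
			random_factor = arch_mmap_rnd();

		if (mmap_is_legacy(rlim_stack)) {
			/* legacy: bottom-up from TASK_UNMAPPED_BASE */
			mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
			mm->get_unmapped_area = arch_get_unmapped_area;
		} else {
			/* default: top-down, below the (randomized) stack gap */
			mm->mmap_base = mmap_base(random_factor, rlim_stack);
			mm->get_unmapped_area = arch_get_unmapped_area_topdown;
		}
	}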
diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h
index 71ded3b7d82d..eb91f1e85629 100644
--- a/arch/alpha/include/asm/pgalloc.h
+++ b/arch/alpha/include/asm/pgalloc.h
@@ -53,6 +53,4 @@ pmd_free(struct mm_struct *mm, pmd_t *pmd)
 	free_page((unsigned long)pmd);
 }

-#define check_pgt_cache()	do { } while (0)
-
 #endif /* _ALPHA_PGALLOC_H */
diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index 89c2032f9960..065b57f408c3 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -359,11 +359,6 @@ extern void paging_init(void);

 #include <asm-generic/pgtable.h>

-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 /* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */
 #define HAVE_ARCH_UNMAPPED_AREA

diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h
index 9bdb8ed5b0db..4751f2251cd9 100644
--- a/arch/arc/include/asm/pgalloc.h
+++ b/arch/arc/include/asm/pgalloc.h
@@ -129,7 +129,6 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptep)

 #define __pte_free_tlb(tlb, pte, addr)	pte_free((tlb)->mm, pte)

-#define check_pgt_cache()	do { } while (0)
 #define pmd_pgtable(pmd)	((pgtable_t) pmd_page_vaddr(pmd))

 #endif /* _ASM_ARC_PGALLOC_H */
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index 1d87c18a2976..7addd0301c51 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -395,11 +395,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 /* to cope with aliasing VIPT cache */
 #define HAVE_ARCH_UNMAPPED_AREA

-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 #endif /* __ASSEMBLY__ */

 #endif
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 229f2cdd81ca..8a50efb559f3 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -34,6 +34,7 @@ config ARM
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
+	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BINFMT_FLAT_ARGVP_ENVP_ON_STACK
 	select BUILDTIME_EXTABLE_SORT if MMU
diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h
index a2a68b751971..069da393110c 100644
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -15,8 +15,6 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>

-#define check_pgt_cache()	do { } while (0)
-
 #ifdef CONFIG_MMU

 #define _PAGE_USER_TABLE	(PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_USER))
diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h
index d0de24f06724..010fa1a35a68 100644
--- a/arch/arm/include/asm/pgtable-nommu.h
+++ b/arch/arm/include/asm/pgtable-nommu.h
@@ -71,11 +71,6 @@ typedef pte_t *pte_addr_t;
 extern unsigned int kobjsize(const void *objp);

 /*
- * No page table caches to initialise.
- */
-#define pgtable_cache_init()	do { } while (0)
-
-/*
  * All 32bit addresses are effectively valid for vmalloc...
  * Sort of meaningless for non-VM targets.
  */
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index f2e990dc27e7..3ae120cd1715 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -368,8 +368,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN

-#define pgtable_cache_init() do { } while (0)
-
 #endif /* !__ASSEMBLY__ */

 #endif /* CONFIG_MMU */
diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h
index 20c2f42454b8..614bf829e454 100644
--- a/arch/arm/include/asm/processor.h
+++ b/arch/arm/include/asm/processor.h
@@ -140,8 +140,6 @@ static inline void prefetchw(const void *ptr)
 #endif
 #endif

-#define HAVE_ARCH_PICK_MMAP_LAYOUT
-
 #endif

 #endif /* __ASM_ARM_PROCESSOR_H */
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index f934a6739fc0..9485acc520a4 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -319,11 +319,6 @@ unsigned long get_wchan(struct task_struct *p)
 	return 0;
 }

-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	return randomize_page(mm->brk, 0x02000000);
-}
-
 #ifdef CONFIG_MMU
 #ifdef CONFIG_KUSER_HELPERS
 /*
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 6ecbda87ee46..6d89db7895d1 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -204,18 +204,17 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page)
	 * coherent with the kernels mapping.
	 */
	if (!PageHighMem(page)) {
-		size_t page_size = PAGE_SIZE << compound_order(page);
-		__cpuc_flush_dcache_area(page_address(page), page_size);
+		__cpuc_flush_dcache_area(page_address(page), page_size(page));
	} else {
		unsigned long i;
		if (cache_is_vipt_nonaliasing()) {
-			for (i = 0; i < (1 << compound_order(page)); i++) {
+			for (i = 0; i < compound_nr(page); i++) {
				void *addr = kmap_atomic(page + i);
				__cpuc_flush_dcache_area(addr, PAGE_SIZE);
				kunmap_atomic(addr);
			}
		} else {
-			for (i = 0; i < (1 << compound_order(page)); i++) {
+			for (i = 0; i < compound_nr(page); i++) {
				void *addr = kmap_high_get(page + i);
				if (addr) {
					__cpuc_flush_dcache_area(addr, PAGE_SIZE);
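The two helpers used in the new lines above come from the core mm headers and are added elsewhere in this merge (not shown here). Roughly, and ignoring annotations the real definitions may carry, they package exactly the open-coded expressions being replaced:

	/* bytes covered by a (possibly compound) page */
	static inline unsigned long page_size(struct page *page)
	{
		return PAGE_SIZE << compound_order(page);
	}

	/* number of base pages making up a (possibly compound) page */
	static inline unsigned long compound_nr(struct page *page)
	{
		return 1UL << compound_order(page);
	}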
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index f866870db749..b8d912ac9e61 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -17,33 +17,6 @@
	((((addr)+SHMLBA-1)&~(SHMLBA-1)) + \
	 (((pgoff)<<PAGE_SHIFT) & (SHMLBA-1)))

-/* gap between mmap and stack */
-#define MIN_GAP (128*1024*1024UL)
-#define MAX_GAP ((TASK_SIZE)/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
-	if (current->personality & ADDR_COMPAT_LAYOUT)
-		return 1;
-
-	if (rlim_stack->rlim_cur == RLIM_INFINITY)
-		return 1;
-
-	return sysctl_legacy_va_layout;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
-	unsigned long gap = rlim_stack->rlim_cur;
-
-	if (gap < MIN_GAP)
-		gap = MIN_GAP;
-	else if (gap > MAX_GAP)
-		gap = MAX_GAP;
-
-	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
-}
-
 /*
  * We need to ensure that shared mappings are correctly aligned to
  * avoid aliasing issues with VIPT caches. We need to ensure that
@@ -171,31 +144,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
	return addr;
 }

-unsigned long arch_mmap_rnd(void)
-{
-	unsigned long rnd;
-
-	rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-
-	return rnd << PAGE_SHIFT;
-}
-
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
-	unsigned long random_factor = 0UL;
-
-	if (current->flags & PF_RANDOMIZE)
-		random_factor = arch_mmap_rnd();
-
-	if (mmap_is_legacy(rlim_stack)) {
-		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
-		mm->get_unmapped_area = arch_get_unmapped_area;
-	} else {
-		mm->mmap_base = mmap_base(random_factor, rlim_stack);
-		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-	}
-}
-
 /*
  * You really shouldn't be using read() or write() on /dev/mem. This
  * might go away in the future.
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 37c610963eee..866e05882799 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -15,7 +15,6 @@ config ARM64
 	select ARCH_HAS_DMA_COHERENT_TO_PFN
 	select ARCH_HAS_DMA_PREP_COHERENT
 	select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
-	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
@@ -71,6 +70,7 @@ config ARM64
 	select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
+	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 14d0bc44d451..172d76fa0245 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -15,8 +15,6 @@

 #include <asm-generic/pgalloc.h>	/* for pte_{alloc,free}_one */

-#define check_pgt_cache()		do { } while (0)
-
 #define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))

 #if CONFIG_PGTABLE_LEVELS > 2
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 57427d17580e..7576df00eb50 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -861,8 +861,6 @@ extern int kern_addr_valid(unsigned long addr);

 #include <asm-generic/pgtable.h>

-static inline void pgtable_cache_init(void) { }
-
 /*
  * On AArch64, the cache coherency is handled via the set_pte_at() function.
  */
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index c67848c55009..5623685c7d13 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -280,8 +280,6 @@ static inline void spin_lock_prefetch(const void *ptr)
		     "nop") : : "p" (ptr));
 }

-#define HAVE_ARCH_PICK_MMAP_LAYOUT
-
 extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */
 extern void __init minsigstksz_setup(void);

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 03689c0beb34..a47462def04b 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -557,14 +557,6 @@ unsigned long arch_align_stack(unsigned long sp)
	return sp & ~0xf;
 }

-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	if (is_compat_task())
-		return randomize_page(mm->brk, SZ_32M);
-	else
-		return randomize_page(mm->brk, SZ_1G);
-}
-
 /*
  * Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY.
  */
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index dc19300309d2..ac485163a4a7 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -56,8 +56,7 @@ void __sync_icache_dcache(pte_t pte)
	struct page *page = pte_page(pte);

	if (!test_and_set_bit(PG_dcache_clean, &page->flags))
-		sync_icache_aliases(page_address(page),
-				    PAGE_SIZE << compound_order(page));
+		sync_icache_aliases(page_address(page), page_size(page));
 }
 EXPORT_SYMBOL_GPL(__sync_icache_dcache);

diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index b050641b5139..3028bacbc4e9 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -21,78 +21,6 @@
 #include <asm/cputype.h>

 /*
- * Leave enough space between the mmap area and the stack to honour ulimit in
- * the face of randomisation.
- */
-#define MIN_GAP (SZ_128M)
-#define MAX_GAP	(STACK_TOP/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
-	if (current->personality & ADDR_COMPAT_LAYOUT)
-		return 1;
-
-	if (rlim_stack->rlim_cur == RLIM_INFINITY)
-		return 1;
-
-	return sysctl_legacy_va_layout;
-}
-
-unsigned long arch_mmap_rnd(void)
-{
-	unsigned long rnd;
-
-#ifdef CONFIG_COMPAT
-	if (test_thread_flag(TIF_32BIT))
-		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
-	else
-#endif
-		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-	return rnd << PAGE_SHIFT;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
-	unsigned long gap = rlim_stack->rlim_cur;
-	unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap;
-
-	/* Values close to RLIM_INFINITY can overflow. */
-	if (gap + pad > gap)
-		gap += pad;
-
-	if (gap < MIN_GAP)
-		gap = MIN_GAP;
-	else if (gap > MAX_GAP)
-		gap = MAX_GAP;
-
-	return PAGE_ALIGN(STACK_TOP - gap - rnd);
-}
-
-/*
- * This function, called very early during the creation of a new process VM
- * image, sets up which VM layout function to use:
- */
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
-	unsigned long random_factor = 0UL;
-
-	if (current->flags & PF_RANDOMIZE)
-		random_factor = arch_mmap_rnd();
-
-	/*
-	 * Fall back to the standard layout if the personality bit is set, or
-	 * if the expected stack growth is unlimited:
-	 */
-	if (mmap_is_legacy(rlim_stack)) {
-		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
-		mm->get_unmapped_area = arch_get_unmapped_area;
-	} else {
-		mm->mmap_base = mmap_base(random_factor, rlim_stack);
-		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-	}
-}
-
-/*
  * You really shouldn't be using read() or write() on /dev/mem. This might go
  * away in the future.
  */
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index 7548f9ca1f11..4a64089e5771 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -35,7 +35,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
		kmem_cache_free(pgd_cache, pgd);
 }

-void __init pgd_cache_init(void)
+void __init pgtable_cache_init(void)
 {
	if (PGD_SIZE == PAGE_SIZE)
		return;
diff --git a/arch/c6x/include/asm/pgtable.h b/arch/c6x/include/asm/pgtable.h
index 0bd805964ea6..0b6919c00413 100644
--- a/arch/c6x/include/asm/pgtable.h
+++ b/arch/c6x/include/asm/pgtable.h
@@ -60,11 +60,6 @@ extern unsigned long empty_zero_page;
 #define swapper_pg_dir ((pgd_t *) 0)

 /*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
-/*
  * c6x is !MMU, so define the simpliest implementation
  */
 #define pgprot_writecombine pgprot_noncached
diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h
index 98c5716708d6..d089113fe41f 100644
--- a/arch/csky/include/asm/pgalloc.h
+++ b/arch/csky/include/asm/pgalloc.h
@@ -75,8 +75,6 @@ do { \
	tlb_remove_page(tlb, pte);	\
 } while (0)

-#define check_pgt_cache()	do {} while (0)
-
 extern void pagetable_init(void);
 extern void pre_mmu_init(void);
 extern void pre_trap_init(void);
diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index c429a6f347de..0040b3a05b61 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -296,11 +296,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
 #define kern_addr_valid(addr)	(1)

-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do {} while (0)
-
 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
	remap_pfn_range(vma, vaddr, pfn, size, prot)

diff --git a/arch/h8300/include/asm/pgtable.h b/arch/h8300/include/asm/pgtable.h
index a99caa49d265..4d00152fab58 100644
--- a/arch/h8300/include/asm/pgtable.h
+++ b/arch/h8300/include/asm/pgtable.h
@@ -4,7 +4,6 @@
 #define __ARCH_USE_5LEVEL_HACK
 #include <asm-generic/pgtable-nopud.h>
 #include <asm-generic/pgtable.h>
-#define pgtable_cache_init()   do { } while (0)
 extern void paging_init(void);
 #define PAGE_NONE	__pgprot(0)    /* these mean nothing to NO_MM */
 #define PAGE_SHARED	__pgprot(0)    /* these mean nothing to NO_MM */
@@ -35,11 +34,6 @@ extern unsigned int kobjsize(const void *objp);
 extern int is_in_rom(unsigned long);

 /*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()   do { } while (0)
-
-/*
  * All 32bit addresses are effectively valid for vmalloc...
  * Sort of meaningless for non-VM targets.
  */
diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h
index d6544dc71258..5a6e79e7926d 100644
--- a/arch/hexagon/include/asm/pgalloc.h
+++ b/arch/hexagon/include/asm/pgalloc.h
@@ -13,8 +13,6 @@

 #include <asm-generic/pgalloc.h>	/* for pte_{alloc,free}_one */

-#define check_pgt_cache() do {} while (0)
-
 extern unsigned long long kmap_generation;

 /*
diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h
index a3ff6d24c09e..2fec20ad939e 100644
--- a/arch/hexagon/include/asm/pgtable.h
+++ b/arch/hexagon/include/asm/pgtable.h
@@ -431,9 +431,6 @@ static inline int pte_exec(pte_t pte)

 #define __pte_offset(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))

-/* I think this is in case we have page table caches; needed by init/main.c */
-#define pgtable_cache_init()    do { } while (0)
-
 /*
  * Swap/file PTE definitions. If _PAGE_PRESENT is zero, the rest of the PTE is
  * interpreted as swap information. The remaining free bits are interpreted as
diff --git a/arch/hexagon/mm/Makefile b/arch/hexagon/mm/Makefile
index 1894263ae5bc..893838499591 100644
--- a/arch/hexagon/mm/Makefile
+++ b/arch/hexagon/mm/Makefile
@@ -3,5 +3,5 @@
 # Makefile for Hexagon memory management subsystem
 #

-obj-y := init.o pgalloc.o ioremap.o uaccess.o vm_fault.o cache.o
+obj-y := init.o ioremap.o uaccess.o vm_fault.o cache.o
 obj-y += copy_to_user.o copy_from_user.o strnlen_user.o vm_tlb.o
diff --git a/arch/hexagon/mm/pgalloc.c b/arch/hexagon/mm/pgalloc.c
deleted file mode 100644
index 4d4316140237..000000000000
--- a/arch/hexagon/mm/pgalloc.c
+++ /dev/null
@@ -1,10 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
- */
-
-#include <linux/init.h>
-
-void __init pgtable_cache_init(void)
-{
-}
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 685a3df126ca..16714477eef4 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -72,10 +72,6 @@ config 64BIT
 config ZONE_DMA32
	def_bool y

-config QUICKLIST
-	bool
-	default y
-
 config MMU
	bool
	default y
diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h
index c9e481023c25..f4c491044882 100644
--- a/arch/ia64/include/asm/pgalloc.h
+++ b/arch/ia64/include/asm/pgalloc.h
@@ -19,18 +19,19 @@
 #include <linux/mm.h>
 #include <linux/page-flags.h>
 #include <linux/threads.h>
-#include <linux/quicklist.h>
+
+#include <asm-generic/pgalloc.h>

 #include <asm/mmu_context.h>

 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	return quicklist_alloc(0, GFP_KERNEL, NULL);
+	return (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 }

 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	quicklist_free(0, NULL, pgd);
+	free_page((unsigned long)pgd);
 }

 #if CONFIG_PGTABLE_LEVELS == 4
@@ -42,12 +43,12 @@ pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud)

 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return quicklist_alloc(0, GFP_KERNEL, NULL);
+	return (pud_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 }

 static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 {
-	quicklist_free(0, NULL, pud);
+	free_page((unsigned long)pud);
 }
 #define __pud_free_tlb(tlb, pud, address)	pud_free((tlb)->mm, pud)
 #endif /* CONFIG_PGTABLE_LEVELS == 4 */
@@ -60,12 +61,12 @@ pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)

 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return quicklist_alloc(0, GFP_KERNEL, NULL);
+	return (pmd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 }

 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
-	quicklist_free(0, NULL, pmd);
+	free_page((unsigned long)pmd);
 }

 #define __pmd_free_tlb(tlb, pmd, address)	pmd_free((tlb)->mm, pmd)
@@ -83,43 +84,6 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte)
	pmd_val(*pmd_entry) = __pa(pte);
 }

-static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
-{
-	struct page *page;
-	void *pg;
-
-	pg = quicklist_alloc(0, GFP_KERNEL, NULL);
-	if (!pg)
-		return NULL;
-	page = virt_to_page(pg);
-	if (!pgtable_page_ctor(page)) {
-		quicklist_free(0, NULL, pg);
-		return NULL;
-	}
-	return page;
-}
-
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
-{
-	return quicklist_alloc(0, GFP_KERNEL, NULL);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
-	pgtable_page_dtor(pte);
-	quicklist_free_page(0, NULL, pte);
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-	quicklist_free(0, NULL, pte);
-}
-
-static inline void check_pgt_cache(void)
-{
-	quicklist_trim(0, NULL, 25, 16);
-}
-
 #define __pte_free_tlb(tlb, pte, address)	pte_free((tlb)->mm, pte)

 #endif /* _ASM_IA64_PGALLOC_H */
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index b1e7468eb65a..d602e7c622db 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -566,11 +566,6 @@ extern struct page *zero_page_memmap_ptr;
 #define KERNEL_TR_PAGE_SHIFT	_PAGE_SIZE_64M
 #define KERNEL_TR_PAGE_SIZE	(1 << KERNEL_TR_PAGE_SHIFT)

-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 /* These tell get_user_pages() that the first gate page is accessible from user-level. */
 #define FIXADDR_USER_START	GATE_ADDR
 #ifdef HAVE_BUGGY_SEGREL
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 678b98a09c85..bf9df2625bc8 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -64,7 +64,7 @@ __ia64_sync_icache_dcache (pte_t pte)
	if (test_bit(PG_arch_1, &page->flags))
		return;				/* i-cache is already coherent with d-cache */

-	flush_icache_range(addr, addr + (PAGE_SIZE << compound_order(page)));
+	flush_icache_range(addr, addr + page_size(page));
	set_bit(PG_arch_1, &page->flags);	/* mark page as clean */
 }

diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h
index fde4534b974f..646c174fff99 100644
--- a/arch/m68k/include/asm/pgtable_mm.h
+++ b/arch/m68k/include/asm/pgtable_mm.h
@@ -176,11 +176,4 @@ pgprot_t pgprot_dmacoherent(pgprot_t prot);
 #include <asm-generic/pgtable.h>
 #endif /* !__ASSEMBLY__ */

-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
-#define check_pgt_cache()	do { } while (0)
-
 #endif /* _M68K_PGTABLE_H */
diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h
index fc3a96c77bd8..c18165b0d904 100644
--- a/arch/m68k/include/asm/pgtable_no.h
+++ b/arch/m68k/include/asm/pgtable_no.h
@@ -45,11 +45,6 @@ extern void paging_init(void);
 #define ZERO_PAGE(vaddr)	(virt_to_page(0))

 /*
- * No page table caches to initialise.
- */
-#define pgtable_cache_init()	do { } while (0)
-
-/*
  * All 32bit addresses are effectively valid for vmalloc...
  * Sort of meaningless for non-VM targets.
  */
@@ -60,6 +55,4 @@ extern void paging_init(void);

 #include <asm-generic/pgtable.h>

-#define check_pgt_cache()	do { } while (0)
-
 #endif /* _M68KNOMMU_PGTABLE_H */
diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h
index f4cc9ffc449e..7ecb05baa601 100644
--- a/arch/microblaze/include/asm/pgalloc.h
+++ b/arch/microblaze/include/asm/pgalloc.h
@@ -21,83 +21,23 @@
 #include <asm/cache.h>
 #include <asm/pgtable.h>

-#define PGDIR_ORDER	0
-
-/*
- * This is handled very differently on MicroBlaze since out page tables
- * are all 0's and I want to be able to use these zero'd pages elsewhere
- * as well - it gives us quite a speedup.
- * -- Cort
- */
-extern struct pgtable_cache_struct {
-	unsigned long *pgd_cache;
-	unsigned long *pte_cache;
-	unsigned long pgtable_cache_sz;
-} quicklists;
-
-#define pgd_quicklist		(quicklists.pgd_cache)
-#define pmd_quicklist		((unsigned long *)0)
-#define pte_quicklist		(quicklists.pte_cache)
-#define pgtable_cache_size	(quicklists.pgtable_cache_sz)
-
-extern unsigned long *zero_cache; /* head linked list of pre-zero'd pages */
-extern atomic_t zero_sz; /* # currently pre-zero'd pages */
-extern atomic_t zeropage_hits; /* # zero'd pages request that we've done */
-extern atomic_t zeropage_calls; /* # zero'd pages request that've been made */
-extern atomic_t zerototal; /* # pages zero'd over time */
-
-#define zero_quicklist		(zero_cache)
-#define zero_cache_sz		(zero_sz)
-#define zero_cache_calls	(zeropage_calls)
-#define zero_cache_hits		(zeropage_hits)
-#define zero_cache_total	(zerototal)
-
-/*
- * return a pre-zero'd page from the list,
- * return NULL if none available -- Cort
- */
-extern unsigned long get_zero_page_fast(void);
+#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
+#include <asm-generic/pgalloc.h>

 extern void __bad_pte(pmd_t *pmd);

-static inline pgd_t *get_pgd_slow(void)
+static inline pgd_t *get_pgd(void)
 {
-	pgd_t *ret;
-
-	ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGDIR_ORDER);
-	if (ret != NULL)
-		clear_page(ret);
-	return ret;
-}
-
-static inline pgd_t *get_pgd_fast(void)
-{
-	unsigned long *ret;
-
-	ret = pgd_quicklist;
-	if (ret != NULL) {
-		pgd_quicklist = (unsigned long *)(*ret);
-		ret[0] = 0;
-		pgtable_cache_size--;
-	} else
-		ret = (unsigned long *)get_pgd_slow();
-	return (pgd_t *)ret;
-}
-
-static inline void free_pgd_fast(pgd_t *pgd)
-{
-	*(unsigned long **)pgd = pgd_quicklist;
-	pgd_quicklist = (unsigned long *) pgd;
-	pgtable_cache_size++;
-}
-
-static inline void free_pgd_slow(pgd_t *pgd)
+	return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0);
+}
+
+static inline void free_pgd(pgd_t *pgd)
 {
	free_page((unsigned long)pgd);
 }

-#define pgd_free(mm, pgd)	free_pgd_fast(pgd)
-#define pgd_alloc(mm)		get_pgd_fast()
+#define pgd_free(mm, pgd)	free_pgd(pgd)
+#define pgd_alloc(mm)		get_pgd()

 #define pmd_pgtable(pmd)	pmd_page(pmd)

@@ -110,50 +50,6 @@ static inline void free_pgd_slow(pgd_t *pgd)

 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);

-static inline struct page *pte_alloc_one(struct mm_struct *mm)
-{
-	struct page *ptepage;
-
-#ifdef CONFIG_HIGHPTE
-	int flags = GFP_KERNEL | __GFP_HIGHMEM;
-#else
-	int flags = GFP_KERNEL;
-#endif
-
-	ptepage = alloc_pages(flags, 0);
-	if (!ptepage)
-		return NULL;
-	clear_highpage(ptepage);
-	if (!pgtable_page_ctor(ptepage)) {
-		__free_page(ptepage);
-		return NULL;
-	}
-	return ptepage;
-}
-
-static inline void pte_free_fast(pte_t *pte)
-{
-	*(unsigned long **)pte = pte_quicklist;
-	pte_quicklist = (unsigned long *) pte;
-	pgtable_cache_size++;
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-	free_page((unsigned long)pte);
-}
-
-static inline void pte_free_slow(struct page *ptepage)
-{
-	__free_page(ptepage);
-}
-
-static inline void pte_free(struct mm_struct *mm, struct page *ptepage)
-{
-	pgtable_page_dtor(ptepage);
-	__free_page(ptepage);
-}
-
 #define __pte_free_tlb(tlb, pte, addr)	pte_free((tlb)->mm, (pte))

 #define pmd_populate(mm, pmd, pte) \
@@ -171,10 +67,6 @@ static inline void pte_free(struct mm_struct *mm, struct page *ptepage)
 #define __pmd_free_tlb(tlb, x, addr)	pmd_free((tlb)->mm, x)
 #define pgd_populate(mm, pmd, pte)	BUG()

-extern int do_check_pgt_cache(int, int);
-
 #endif /* CONFIG_MMU */

-#define check_pgt_cache()		do { } while (0)
-
 #endif /* _ASM_MICROBLAZE_PGALLOC_H */
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index 142d3f004848..954b69af451f 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -46,8 +46,6 @@ extern int mem_init_done;

 #define swapper_pg_dir ((pgd_t *) NULL)

-#define pgtable_cache_init()	do {} while (0)
-
 #define arch_enter_lazy_cpu_mode()	do {} while (0)

 #define pgprot_noncached_wc(prot)	prot
@@ -526,11 +524,6 @@ extern unsigned long iopa(unsigned long addr);
 /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
 #define kern_addr_valid(addr)	(1)

-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 void do_page_fault(struct pt_regs *regs, unsigned long address,
		   unsigned long error_code);

diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c
index 8fe54fda31dc..010bb9cee2e4 100644
--- a/arch/microblaze/mm/pgtable.c
+++ b/arch/microblaze/mm/pgtable.c
@@ -44,10 +44,6 @@ unsigned long ioremap_base;
 unsigned long ioremap_bot;
 EXPORT_SYMBOL(ioremap_bot);

-#ifndef CONFIG_SMP
-struct pgtable_cache_struct quicklists;
-#endif
-
 static void __iomem *__ioremap(phys_addr_t addr, unsigned long size,
		unsigned long flags)
 {
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index cc8e2b1032a5..a0bd9bdb5f83 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -5,7 +5,6 @@ config MIPS
 	select ARCH_32BIT_OFF_T if !64BIT
 	select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT
 	select ARCH_CLOCKSOURCE_DATA
-	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_SUPPORTS_UPROBES
@@ -13,6 +12,7 @@ config MIPS
 	select ARCH_USE_CMPXCHG_LOCKREF if 64BIT
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BUILDTIME_EXTABLE_SORT
 	select CLONE_BACKWARDS
diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index aa16b85ddffc..aa73cb187a07 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -105,8 +105,6 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)

 #endif /* __PAGETABLE_PUD_FOLDED */

-#define check_pgt_cache()	do { } while (0)
-
 extern void pagetable_init(void);

 #endif /* _ASM_PGALLOC_H */
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 4dca733d5076..f85bd5b15f51 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -661,9 +661,4 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN

-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
 #endif /* _ASM_PGTABLE_H */
diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
index aca909bd7841..fba18d4a9190 100644
--- a/arch/mips/include/asm/processor.h
+++ b/arch/mips/include/asm/processor.h
@@ -29,11 +29,6 @@

 extern unsigned int vced_count, vcei_count;

-/*
- * MIPS does have an arch_pick_mmap_layout()
- */
-#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
-
 #ifdef CONFIG_32BIT
 #ifdef CONFIG_KVM_GUEST
 /* User space process size is limited to 1GB in KVM Guest Mode */
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c index d79f2b432318..00fe90c6db3e 100644 --- a/arch/mips/mm/mmap.c +++ b/arch/mips/mm/mmap.c | |||
@@ -20,33 +20,6 @@ | |||
20 | unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ | 20 | unsigned long shm_align_mask = PAGE_SIZE - 1; /* Sane caches */ |
21 | EXPORT_SYMBOL(shm_align_mask); | 21 | EXPORT_SYMBOL(shm_align_mask); |
22 | 22 | ||
23 | /* gap between mmap and stack */ | ||
24 | #define MIN_GAP (128*1024*1024UL) | ||
25 | #define MAX_GAP ((TASK_SIZE)/6*5) | ||
26 | |||
27 | static int mmap_is_legacy(struct rlimit *rlim_stack) | ||
28 | { | ||
29 | if (current->personality & ADDR_COMPAT_LAYOUT) | ||
30 | return 1; | ||
31 | |||
32 | if (rlim_stack->rlim_cur == RLIM_INFINITY) | ||
33 | return 1; | ||
34 | |||
35 | return sysctl_legacy_va_layout; | ||
36 | } | ||
37 | |||
38 | static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) | ||
39 | { | ||
40 | unsigned long gap = rlim_stack->rlim_cur; | ||
41 | |||
42 | if (gap < MIN_GAP) | ||
43 | gap = MIN_GAP; | ||
44 | else if (gap > MAX_GAP) | ||
45 | gap = MAX_GAP; | ||
46 | |||
47 | return PAGE_ALIGN(TASK_SIZE - gap - rnd); | ||
48 | } | ||
49 | |||
50 | #define COLOUR_ALIGN(addr, pgoff) \ | 23 | #define COLOUR_ALIGN(addr, pgoff) \ |
51 | ((((addr) + shm_align_mask) & ~shm_align_mask) + \ | 24 | ((((addr) + shm_align_mask) & ~shm_align_mask) + \ |
52 | (((pgoff) << PAGE_SHIFT) & shm_align_mask)) | 25 | (((pgoff) << PAGE_SHIFT) & shm_align_mask)) |
@@ -144,63 +117,6 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, | |||
144 | addr0, len, pgoff, flags, DOWN); | 117 | addr0, len, pgoff, flags, DOWN); |
145 | } | 118 | } |
146 | 119 | ||
147 | unsigned long arch_mmap_rnd(void) | ||
148 | { | ||
149 | unsigned long rnd; | ||
150 | |||
151 | #ifdef CONFIG_COMPAT | ||
152 | if (TASK_IS_32BIT_ADDR) | ||
153 | rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); | ||
154 | else | ||
155 | #endif /* CONFIG_COMPAT */ | ||
156 | rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); | ||
157 | |||
158 | return rnd << PAGE_SHIFT; | ||
159 | } | ||
160 | |||
161 | void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) | ||
162 | { | ||
163 | unsigned long random_factor = 0UL; | ||
164 | |||
165 | if (current->flags & PF_RANDOMIZE) | ||
166 | random_factor = arch_mmap_rnd(); | ||
167 | |||
168 | if (mmap_is_legacy(rlim_stack)) { | ||
169 | mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; | ||
170 | mm->get_unmapped_area = arch_get_unmapped_area; | ||
171 | } else { | ||
172 | mm->mmap_base = mmap_base(random_factor, rlim_stack); | ||
173 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; | ||
174 | } | ||
175 | } | ||
176 | |||
177 | static inline unsigned long brk_rnd(void) | ||
178 | { | ||
179 | unsigned long rnd = get_random_long(); | ||
180 | |||
181 | rnd = rnd << PAGE_SHIFT; | ||
182 | /* 8MB for 32bit, 256MB for 64bit */ | ||
183 | if (TASK_IS_32BIT_ADDR) | ||
184 | rnd = rnd & 0x7ffffful; | ||
185 | else | ||
186 | rnd = rnd & 0xffffffful; | ||
187 | |||
188 | return rnd; | ||
189 | } | ||
190 | |||
191 | unsigned long arch_randomize_brk(struct mm_struct *mm) | ||
192 | { | ||
193 | unsigned long base = mm->brk; | ||
194 | unsigned long ret; | ||
195 | |||
196 | ret = PAGE_ALIGN(base + brk_rnd()); | ||
197 | |||
198 | if (ret < mm->brk) | ||
199 | return mm->brk; | ||
200 | |||
201 | return ret; | ||
202 | } | ||
203 | |||
204 | bool __virt_addr_valid(const volatile void *kaddr) | 120 | bool __virt_addr_valid(const volatile void *kaddr) |
205 | { | 121 | { |
206 | unsigned long vaddr = (unsigned long)kaddr; | 122 | unsigned long vaddr = (unsigned long)kaddr; |
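Editor's note: the MIPS-specific layout code deleted above is not lost; the same policy now comes from the generic ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT helpers selected later in this series. As a rough illustration of the arithmetic being consolidated, here is a minimal userspace sketch of the gap clamping the removed mmap_base() performed; TASK_SIZE and the stack rlimit below are illustrative sample values, not MIPS constants.

/*
 * Standalone model of the clamping in the removed mmap_base(); the
 * generic top-down layout code keeps the same arithmetic.
 */
#include <stdio.h>

#define PAGE_SIZE     4096UL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

#define TASK_SIZE     0x80000000UL                /* sample 2 GiB user VA span */
#define MIN_GAP       (128 * 1024 * 1024UL)       /* gap between mmap and stack */
#define MAX_GAP       ((TASK_SIZE) / 6 * 5)

static unsigned long mmap_base(unsigned long rnd, unsigned long stack_rlimit)
{
	unsigned long gap = stack_rlimit;

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
}

int main(void)
{
	/* 8 MiB stack rlimit, no randomization */
	printf("mmap_base = %#lx\n", mmap_base(0, 8UL << 20));
	return 0;
}
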
diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h index e78b43d8389f..37125e6884d7 100644 --- a/arch/nds32/include/asm/pgalloc.h +++ b/arch/nds32/include/asm/pgalloc.h | |||
@@ -23,8 +23,6 @@ | |||
23 | extern pgd_t *pgd_alloc(struct mm_struct *mm); | 23 | extern pgd_t *pgd_alloc(struct mm_struct *mm); |
24 | extern void pgd_free(struct mm_struct *mm, pgd_t * pgd); | 24 | extern void pgd_free(struct mm_struct *mm, pgd_t * pgd); |
25 | 25 | ||
26 | #define check_pgt_cache() do { } while (0) | ||
27 | |||
28 | static inline pgtable_t pte_alloc_one(struct mm_struct *mm) | 26 | static inline pgtable_t pte_alloc_one(struct mm_struct *mm) |
29 | { | 27 | { |
30 | pgtable_t pte; | 28 | pgtable_t pte; |
diff --git a/arch/nds32/include/asm/pgtable.h b/arch/nds32/include/asm/pgtable.h index c70cc56bec09..0588ec99725c 100644 --- a/arch/nds32/include/asm/pgtable.h +++ b/arch/nds32/include/asm/pgtable.h | |||
@@ -403,8 +403,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; | |||
403 | * into virtual address `from' | 403 | * into virtual address `from' |
404 | */ | 404 | */ |
405 | 405 | ||
406 | #define pgtable_cache_init() do { } while (0) | ||
407 | |||
408 | #endif /* !__ASSEMBLY__ */ | 406 | #endif /* !__ASSEMBLY__ */ |
409 | 407 | ||
410 | #endif /* _ASMNDS32_PGTABLE_H */ | 408 | #endif /* _ASMNDS32_PGTABLE_H */ |
diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index 4bc8cf72067e..750d18d5980b 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h | |||
@@ -45,6 +45,4 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) | |||
45 | tlb_remove_page((tlb), (pte)); \ | 45 | tlb_remove_page((tlb), (pte)); \ |
46 | } while (0) | 46 | } while (0) |
47 | 47 | ||
48 | #define check_pgt_cache() do { } while (0) | ||
49 | |||
50 | #endif /* _ASM_NIOS2_PGALLOC_H */ | 48 | #endif /* _ASM_NIOS2_PGALLOC_H */ |
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 95237b7f6fc1..99985d8b7166 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h | |||
@@ -291,8 +291,6 @@ static inline void pte_clear(struct mm_struct *mm, | |||
291 | 291 | ||
292 | #include <asm-generic/pgtable.h> | 292 | #include <asm-generic/pgtable.h> |
293 | 293 | ||
294 | #define pgtable_cache_init() do { } while (0) | ||
295 | |||
296 | extern void __init paging_init(void); | 294 | extern void __init paging_init(void); |
297 | extern void __init mmu_init(void); | 295 | extern void __init mmu_init(void); |
298 | 296 | ||
diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index 3d4b397c2d06..787c1b9d2f6d 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h | |||
@@ -101,6 +101,4 @@ do { \ | |||
101 | 101 | ||
102 | #define pmd_pgtable(pmd) pmd_page(pmd) | 102 | #define pmd_pgtable(pmd) pmd_page(pmd) |
103 | 103 | ||
104 | #define check_pgt_cache() do { } while (0) | ||
105 | |||
106 | #endif | 104 | #endif |
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 2fe9ff5b5d6f..248d22d8faa7 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h | |||
@@ -443,11 +443,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, | |||
443 | 443 | ||
444 | #include <asm-generic/pgtable.h> | 444 | #include <asm-generic/pgtable.h> |
445 | 445 | ||
446 | /* | ||
447 | * No page table caches to initialise | ||
448 | */ | ||
449 | #define pgtable_cache_init() do { } while (0) | ||
450 | |||
451 | typedef pte_t *pte_addr_t; | 446 | typedef pte_t *pte_addr_t; |
452 | 447 | ||
453 | #endif /* __ASSEMBLY__ */ | 448 | #endif /* __ASSEMBLY__ */ |
diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index 4f2059a50fae..d98647c29b74 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h | |||
@@ -124,6 +124,4 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) | |||
124 | pmd_populate_kernel(mm, pmd, page_address(pte_page)) | 124 | pmd_populate_kernel(mm, pmd, page_address(pte_page)) |
125 | #define pmd_pgtable(pmd) pmd_page(pmd) | 125 | #define pmd_pgtable(pmd) pmd_page(pmd) |
126 | 126 | ||
127 | #define check_pgt_cache() do { } while (0) | ||
128 | |||
129 | #endif | 127 | #endif |
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 6d58c1739b42..4ac374b3a99f 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h | |||
@@ -132,8 +132,6 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) | |||
132 | #define PTRS_PER_PTE (1UL << BITS_PER_PTE) | 132 | #define PTRS_PER_PTE (1UL << BITS_PER_PTE) |
133 | 133 | ||
134 | /* Definitions for 2nd level */ | 134 | /* Definitions for 2nd level */ |
135 | #define pgtable_cache_init() do { } while (0) | ||
136 | |||
137 | #define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE) | 135 | #define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE) |
138 | #define PMD_SIZE (1UL << PMD_SHIFT) | 136 | #define PMD_SIZE (1UL << PMD_SHIFT) |
139 | #define PMD_MASK (~(PMD_SIZE-1)) | 137 | #define PMD_MASK (~(PMD_SIZE-1)) |
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 2b2c60a1a66d..6dd78a2dc03a 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h | |||
@@ -64,8 +64,6 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) | |||
64 | extern struct kmem_cache *pgtable_cache[]; | 64 | extern struct kmem_cache *pgtable_cache[]; |
65 | #define PGT_CACHE(shift) pgtable_cache[shift] | 65 | #define PGT_CACHE(shift) pgtable_cache[shift] |
66 | 66 | ||
67 | static inline void check_pgt_cache(void) { } | ||
68 | |||
69 | #ifdef CONFIG_PPC_BOOK3S | 67 | #ifdef CONFIG_PPC_BOOK3S |
70 | #include <asm/book3s/pgalloc.h> | 68 | #include <asm/book3s/pgalloc.h> |
71 | #else | 69 | #else |
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 8b7865a2d576..4053b2ab427c 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h | |||
@@ -87,7 +87,6 @@ extern unsigned long ioremap_bot; | |||
87 | unsigned long vmalloc_to_phys(void *vmalloc_addr); | 87 | unsigned long vmalloc_to_phys(void *vmalloc_addr); |
88 | 88 | ||
89 | void pgtable_cache_add(unsigned int shift); | 89 | void pgtable_cache_add(unsigned int shift); |
90 | void pgtable_cache_init(void); | ||
91 | 90 | ||
92 | #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32) | 91 | #if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32) |
93 | void mark_initmem_nx(void); | 92 | void mark_initmem_nx(void); |
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 3410ea9f4de1..6c123760164e 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c | |||
@@ -1748,7 +1748,7 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr, | |||
1748 | /* | 1748 | /* |
1749 | * IF we try to do a HUGE PTE update after a withdraw is done. | 1749 | * IF we try to do a HUGE PTE update after a withdraw is done. |
1750 | * we will find the below NULL. This happens when we do | 1750 | * we will find the below NULL. This happens when we do |
1751 | * split_huge_page_pmd | 1751 | * split_huge_pmd |
1752 | */ | 1752 | */ |
1753 | if (!hpte_slot_array) | 1753 | if (!hpte_slot_array) |
1754 | return; | 1754 | return; |
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index b056cae3388b..56cc84520577 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c | |||
@@ -129,11 +129,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, | |||
129 | * Allow to use larger than 64k IOMMU pages. Only do that | 129 | * Allow to use larger than 64k IOMMU pages. Only do that |
130 | * if we are backed by hugetlb. | 130 | * if we are backed by hugetlb. |
131 | */ | 131 | */ |
132 | if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { | 132 | if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) |
133 | struct page *head = compound_head(page); | 133 | pageshift = page_shift(compound_head(page)); |
134 | |||
135 | pageshift = compound_order(head) + PAGE_SHIFT; | ||
136 | } | ||
137 | mem->pageshift = min(mem->pageshift, pageshift); | 134 | mem->pageshift = min(mem->pageshift, pageshift); |
138 | /* | 135 | /* |
139 | * We don't need struct page reference any more, switch | 136 | * We don't need struct page reference any more, switch |
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index a8953f108808..73d4873fc7f8 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c | |||
@@ -667,7 +667,7 @@ void flush_dcache_icache_hugepage(struct page *page) | |||
667 | 667 | ||
668 | BUG_ON(!PageCompound(page)); | 668 | BUG_ON(!PageCompound(page)); |
669 | 669 | ||
670 | for (i = 0; i < (1UL << compound_order(page)); i++) { | 670 | for (i = 0; i < compound_nr(page); i++) { |
671 | if (!PageHighMem(page)) { | 671 | if (!PageHighMem(page)) { |
672 | __flush_dcache_icache(page_address(page+i)); | 672 | __flush_dcache_icache(page_address(page+i)); |
673 | } else { | 673 | } else { |
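Editor's note: this hunk and several below (chtls, ion, tcm_fc, io_uring, vfio) replace open-coded uses of compound_order() with the new page_size(), page_shift() and compound_nr() helpers. The identities these conversions rely on are restated in the userspace sketch below, with struct page reduced to a bare order value for illustration only.

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* stand-in for compound_order(page): allocation order of a compound page */
static unsigned int compound_order(unsigned int order) { return order; }

static unsigned long compound_nr(unsigned int order)
{
	return 1UL << compound_order(order);          /* pages in the compound page */
}

static unsigned int page_shift(unsigned int order)
{
	return PAGE_SHIFT + compound_order(order);    /* log2 of its size in bytes */
}

static unsigned long page_size(unsigned int order)
{
	return PAGE_SIZE << compound_order(order);    /* size in bytes */
}

int main(void)
{
	for (unsigned int order = 0; order <= 9; order++) {
		assert(page_size(order) == compound_nr(order) * PAGE_SIZE);
		assert(page_size(order) == 1UL << page_shift(order));
	}
	printf("order 9: %lu pages, %lu bytes\n", compound_nr(9), page_size(9));
	return 0;
}
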
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 71d29fb4008a..8eebbc8860bb 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig | |||
@@ -59,6 +59,18 @@ config RISCV | |||
59 | select ARCH_HAS_GIGANTIC_PAGE | 59 | select ARCH_HAS_GIGANTIC_PAGE |
60 | select ARCH_WANT_HUGE_PMD_SHARE if 64BIT | 60 | select ARCH_WANT_HUGE_PMD_SHARE if 64BIT |
61 | select SPARSEMEM_STATIC if 32BIT | 61 | select SPARSEMEM_STATIC if 32BIT |
62 | select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU | ||
63 | select HAVE_ARCH_MMAP_RND_BITS | ||
64 | |||
65 | config ARCH_MMAP_RND_BITS_MIN | ||
66 | default 18 if 64BIT | ||
67 | default 8 | ||
68 | |||
69 | # max bits determined by the following formula: | ||
70 | # VA_BITS - PAGE_SHIFT - 3 | ||
71 | config ARCH_MMAP_RND_BITS_MAX | ||
72 | default 24 if 64BIT # SV39 based | ||
73 | default 17 | ||
62 | 74 | ||
63 | config MMU | 75 | config MMU |
64 | def_bool y | 76 | def_bool y |
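Editor's note: the Kconfig comment above gives the formula for the maximum randomization bits, VA_BITS - PAGE_SHIFT - 3. Spelled out (assuming a 39-bit Sv39 and a 32-bit Sv32 address space; the 32-bit figure is an inference from the default of 17), the arithmetic is:

#include <stdio.h>

int main(void)
{
	const int page_shift = 12;
	const int va_bits_sv39 = 39, va_bits_sv32 = 32;

	printf("64BIT (Sv39): %d\n", va_bits_sv39 - page_shift - 3);  /* 24 */
	printf("32BIT (Sv32): %d\n", va_bits_sv32 - page_shift - 3);  /* 17 */
	return 0;
}
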
diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index 56a67d66f72f..f66a00d8cb19 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h | |||
@@ -82,8 +82,4 @@ do { \ | |||
82 | tlb_remove_page((tlb), pte); \ | 82 | tlb_remove_page((tlb), pte); \ |
83 | } while (0) | 83 | } while (0) |
84 | 84 | ||
85 | static inline void check_pgt_cache(void) | ||
86 | { | ||
87 | } | ||
88 | |||
89 | #endif /* _ASM_RISCV_PGALLOC_H */ | 85 | #endif /* _ASM_RISCV_PGALLOC_H */ |
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 80905b27ee98..c60123f018f5 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h | |||
@@ -424,11 +424,6 @@ extern void *dtb_early_va; | |||
424 | extern void setup_bootmem(void); | 424 | extern void setup_bootmem(void); |
425 | extern void paging_init(void); | 425 | extern void paging_init(void); |
426 | 426 | ||
427 | static inline void pgtable_cache_init(void) | ||
428 | { | ||
429 | /* No page table caches to initialize */ | ||
430 | } | ||
431 | |||
432 | #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) | 427 | #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) |
433 | #define VMALLOC_END (PAGE_OFFSET - 1) | 428 | #define VMALLOC_END (PAGE_OFFSET - 1) |
434 | #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) | 429 | #define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) |
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0c4600725fc2..36c578c0ff96 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h | |||
@@ -1682,12 +1682,6 @@ extern void s390_reset_cmma(struct mm_struct *mm); | |||
1682 | #define HAVE_ARCH_UNMAPPED_AREA | 1682 | #define HAVE_ARCH_UNMAPPED_AREA |
1683 | #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN | 1683 | #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN |
1684 | 1684 | ||
1685 | /* | ||
1686 | * No page table caches to initialise | ||
1687 | */ | ||
1688 | static inline void pgtable_cache_init(void) { } | ||
1689 | static inline void check_pgt_cache(void) { } | ||
1690 | |||
1691 | #include <asm-generic/pgtable.h> | 1685 | #include <asm-generic/pgtable.h> |
1692 | 1686 | ||
1693 | #endif /* _S390_PAGE_H */ | 1687 | #endif /* _S390_PAGE_H */ |
diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index b56f908b1395..8c6341a4d807 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h | |||
@@ -2,10 +2,8 @@ | |||
2 | #ifndef __ASM_SH_PGALLOC_H | 2 | #ifndef __ASM_SH_PGALLOC_H |
3 | #define __ASM_SH_PGALLOC_H | 3 | #define __ASM_SH_PGALLOC_H |
4 | 4 | ||
5 | #include <linux/quicklist.h> | ||
6 | #include <asm/page.h> | 5 | #include <asm/page.h> |
7 | 6 | #include <asm-generic/pgalloc.h> | |
8 | #define QUICK_PT 0 /* Other page table pages that are zero on free */ | ||
9 | 7 | ||
10 | extern pgd_t *pgd_alloc(struct mm_struct *); | 8 | extern pgd_t *pgd_alloc(struct mm_struct *); |
11 | extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); | 9 | extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); |
@@ -29,41 +27,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, | |||
29 | } | 27 | } |
30 | #define pmd_pgtable(pmd) pmd_page(pmd) | 28 | #define pmd_pgtable(pmd) pmd_page(pmd) |
31 | 29 | ||
32 | /* | ||
33 | * Allocate and free page tables. | ||
34 | */ | ||
35 | static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) | ||
36 | { | ||
37 | return quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); | ||
38 | } | ||
39 | |||
40 | static inline pgtable_t pte_alloc_one(struct mm_struct *mm) | ||
41 | { | ||
42 | struct page *page; | ||
43 | void *pg; | ||
44 | |||
45 | pg = quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); | ||
46 | if (!pg) | ||
47 | return NULL; | ||
48 | page = virt_to_page(pg); | ||
49 | if (!pgtable_page_ctor(page)) { | ||
50 | quicklist_free(QUICK_PT, NULL, pg); | ||
51 | return NULL; | ||
52 | } | ||
53 | return page; | ||
54 | } | ||
55 | |||
56 | static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) | ||
57 | { | ||
58 | quicklist_free(QUICK_PT, NULL, pte); | ||
59 | } | ||
60 | |||
61 | static inline void pte_free(struct mm_struct *mm, pgtable_t pte) | ||
62 | { | ||
63 | pgtable_page_dtor(pte); | ||
64 | quicklist_free_page(QUICK_PT, NULL, pte); | ||
65 | } | ||
66 | |||
67 | #define __pte_free_tlb(tlb,pte,addr) \ | 30 | #define __pte_free_tlb(tlb,pte,addr) \ |
68 | do { \ | 31 | do { \ |
69 | pgtable_page_dtor(pte); \ | 32 | pgtable_page_dtor(pte); \ |
@@ -79,9 +42,4 @@ do { \ | |||
79 | } while (0); | 42 | } while (0); |
80 | #endif | 43 | #endif |
81 | 44 | ||
82 | static inline void check_pgt_cache(void) | ||
83 | { | ||
84 | quicklist_trim(QUICK_PT, NULL, 25, 16); | ||
85 | } | ||
86 | |||
87 | #endif /* __ASM_SH_PGALLOC_H */ | 45 | #endif /* __ASM_SH_PGALLOC_H */ |
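Editor's note: the code removed above kept freed page-table pages on a per-arch quicklist, zeroed on free, and trimmed that cache from check_pgt_cache(); sh now relies on the asm-generic pte allocation instead. The toy model below illustrates the behaviour being dropped; the data structure and the trim watermark are illustrations loosely based on the removed quicklist_trim() call, not the kernel implementation.

#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define PAGE_SIZE 4096

struct quicklist {
	void *pages[64];
	int nr;
};

static void *ql_alloc(struct quicklist *ql)
{
	if (ql->nr)
		return ql->pages[--ql->nr];       /* already zeroed on free */
	return calloc(1, PAGE_SIZE);
}

static void ql_free(struct quicklist *ql, void *pg)
{
	if (ql->nr < 64) {
		memset(pg, 0, PAGE_SIZE);         /* "zero on free" */
		ql->pages[ql->nr++] = pg;
	} else {
		free(pg);
	}
}

static void ql_trim(struct quicklist *ql, int max_keep)
{
	while (ql->nr > max_keep)
		free(ql->pages[--ql->nr]);
}

int main(void)
{
	struct quicklist ql = { .nr = 0 };
	void *pte = ql_alloc(&ql);

	ql_free(&ql, pte);
	ql_trim(&ql, 16);                        /* roughly what check_pgt_cache() did */
	printf("cached pages: %d\n", ql.nr);
	return 0;
}
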
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 9085d1142fa3..cbd0f3c55a0c 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h | |||
@@ -123,11 +123,6 @@ typedef pte_t *pte_addr_t; | |||
123 | 123 | ||
124 | #define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) | 124 | #define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) |
125 | 125 | ||
126 | /* | ||
127 | * Initialise the page table caches | ||
128 | */ | ||
129 | extern void pgtable_cache_init(void); | ||
130 | |||
131 | struct vm_area_struct; | 126 | struct vm_area_struct; |
132 | struct mm_struct; | 127 | struct mm_struct; |
133 | 128 | ||
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index 02ed2df25a54..5c8a2ebfc720 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig | |||
@@ -1,9 +1,6 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0 | 1 | # SPDX-License-Identifier: GPL-2.0 |
2 | menu "Memory management options" | 2 | menu "Memory management options" |
3 | 3 | ||
4 | config QUICKLIST | ||
5 | def_bool y | ||
6 | |||
7 | config MMU | 4 | config MMU |
8 | bool "Support for memory management hardware" | 5 | bool "Support for memory management hardware" |
9 | depends on !CPU_SH2 | 6 | depends on !CPU_SH2 |
diff --git a/arch/sh/mm/nommu.c b/arch/sh/mm/nommu.c index cc779a90d917..dca946f426c6 100644 --- a/arch/sh/mm/nommu.c +++ b/arch/sh/mm/nommu.c | |||
@@ -97,7 +97,3 @@ void __init page_table_range_init(unsigned long start, unsigned long end, | |||
97 | void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) | 97 | void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) |
98 | { | 98 | { |
99 | } | 99 | } |
100 | |||
101 | void pgtable_cache_init(void) | ||
102 | { | ||
103 | } | ||
diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h index 282be50a4adf..10538a4d1a1e 100644 --- a/arch/sparc/include/asm/pgalloc_32.h +++ b/arch/sparc/include/asm/pgalloc_32.h | |||
@@ -17,8 +17,6 @@ void srmmu_free_nocache(void *addr, int size); | |||
17 | 17 | ||
18 | extern struct resource sparc_iomap; | 18 | extern struct resource sparc_iomap; |
19 | 19 | ||
20 | #define check_pgt_cache() do { } while (0) | ||
21 | |||
22 | pgd_t *get_pgd_fast(void); | 20 | pgd_t *get_pgd_fast(void); |
23 | static inline void free_pgd_fast(pgd_t *pgd) | 21 | static inline void free_pgd_fast(pgd_t *pgd) |
24 | { | 22 | { |
diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h index 48abccba4991..9d3e5cc95bbb 100644 --- a/arch/sparc/include/asm/pgalloc_64.h +++ b/arch/sparc/include/asm/pgalloc_64.h | |||
@@ -69,8 +69,6 @@ void pte_free(struct mm_struct *mm, pgtable_t ptepage); | |||
69 | #define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE) | 69 | #define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE) |
70 | #define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD)) | 70 | #define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD)) |
71 | 71 | ||
72 | #define check_pgt_cache() do { } while (0) | ||
73 | |||
74 | void pgtable_free(void *table, bool is_page); | 72 | void pgtable_free(void *table, bool is_page); |
75 | 73 | ||
76 | #ifdef CONFIG_SMP | 74 | #ifdef CONFIG_SMP |
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index 4eebed6c6781..31da44826645 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h | |||
@@ -445,9 +445,4 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, | |||
445 | /* We provide our own get_unmapped_area to cope with VA holes for userland */ | 445 | /* We provide our own get_unmapped_area to cope with VA holes for userland */ |
446 | #define HAVE_ARCH_UNMAPPED_AREA | 446 | #define HAVE_ARCH_UNMAPPED_AREA |
447 | 447 | ||
448 | /* | ||
449 | * No page table caches to initialise | ||
450 | */ | ||
451 | #define pgtable_cache_init() do { } while (0) | ||
452 | |||
453 | #endif /* !(_SPARC_PGTABLE_H) */ | 448 | #endif /* !(_SPARC_PGTABLE_H) */ |
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 1599de730532..b57f9c631eca 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h | |||
@@ -1135,7 +1135,6 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long, | |||
1135 | unsigned long); | 1135 | unsigned long); |
1136 | #define HAVE_ARCH_FB_UNMAPPED_AREA | 1136 | #define HAVE_ARCH_FB_UNMAPPED_AREA |
1137 | 1137 | ||
1138 | void pgtable_cache_init(void); | ||
1139 | void sun4v_register_fault_status(void); | 1138 | void sun4v_register_fault_status(void); |
1140 | void sun4v_ktsb_register(void); | 1139 | void sun4v_ktsb_register(void); |
1141 | void __init cheetah_ecache_flush_init(void); | 1140 | void __init cheetah_ecache_flush_init(void); |
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 046ab116cc8c..906eda1158b4 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c | |||
@@ -31,7 +31,6 @@ | |||
31 | #include <asm/page.h> | 31 | #include <asm/page.h> |
32 | #include <asm/pgtable.h> | 32 | #include <asm/pgtable.h> |
33 | #include <asm/vaddrs.h> | 33 | #include <asm/vaddrs.h> |
34 | #include <asm/pgalloc.h> /* bug in asm-generic/tlb.h: check_pgt_cache */ | ||
35 | #include <asm/setup.h> | 34 | #include <asm/setup.h> |
36 | #include <asm/tlb.h> | 35 | #include <asm/tlb.h> |
37 | #include <asm/prom.h> | 36 | #include <asm/prom.h> |
diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 023599c3fa51..446e0c0f4018 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h | |||
@@ -43,7 +43,5 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) | |||
43 | #define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x)) | 43 | #define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x)) |
44 | #endif | 44 | #endif |
45 | 45 | ||
46 | #define check_pgt_cache() do { } while (0) | ||
47 | |||
48 | #endif | 46 | #endif |
49 | 47 | ||
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index e4d3ed980d82..36a44d58f373 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h | |||
@@ -32,8 +32,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; | |||
32 | /* zero page used for uninitialized stuff */ | 32 | /* zero page used for uninitialized stuff */ |
33 | extern unsigned long *empty_zero_page; | 33 | extern unsigned long *empty_zero_page; |
34 | 34 | ||
35 | #define pgtable_cache_init() do ; while (0) | ||
36 | |||
37 | /* Just any arbitrary offset to the start of the vmalloc VM area: the | 35 | /* Just any arbitrary offset to the start of the vmalloc VM area: the |
38 | * current 8MB value just means that there will be a 8MB "hole" after the | 36 | * current 8MB value just means that there will be a 8MB "hole" after the |
39 | * physical memory until the kernel virtual memory starts. That means that | 37 | * physical memory until the kernel virtual memory starts. That means that |
diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h index 3f0903bd98e9..ba1c9a79993b 100644 --- a/arch/unicore32/include/asm/pgalloc.h +++ b/arch/unicore32/include/asm/pgalloc.h | |||
@@ -18,8 +18,6 @@ | |||
18 | #define __HAVE_ARCH_PTE_ALLOC_ONE | 18 | #define __HAVE_ARCH_PTE_ALLOC_ONE |
19 | #include <asm-generic/pgalloc.h> | 19 | #include <asm-generic/pgalloc.h> |
20 | 20 | ||
21 | #define check_pgt_cache() do { } while (0) | ||
22 | |||
23 | #define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) | 21 | #define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) |
24 | #define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) | 22 | #define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) |
25 | 23 | ||
diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h index 126e961a8cb0..c8f7ba12f309 100644 --- a/arch/unicore32/include/asm/pgtable.h +++ b/arch/unicore32/include/asm/pgtable.h | |||
@@ -285,8 +285,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; | |||
285 | 285 | ||
286 | #include <asm-generic/pgtable.h> | 286 | #include <asm-generic/pgtable.h> |
287 | 287 | ||
288 | #define pgtable_cache_init() do { } while (0) | ||
289 | |||
290 | #endif /* !__ASSEMBLY__ */ | 288 | #endif /* !__ASSEMBLY__ */ |
291 | 289 | ||
292 | #endif /* __UNICORE_PGTABLE_H__ */ | 290 | #endif /* __UNICORE_PGTABLE_H__ */ |
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index c78da8eda8f2..0dca7f7aeff2 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h | |||
@@ -29,8 +29,6 @@ extern pgd_t swapper_pg_dir[1024]; | |||
29 | extern pgd_t initial_page_table[1024]; | 29 | extern pgd_t initial_page_table[1024]; |
30 | extern pmd_t initial_pg_pmd[]; | 30 | extern pmd_t initial_pg_pmd[]; |
31 | 31 | ||
32 | static inline void pgtable_cache_init(void) { } | ||
33 | static inline void check_pgt_cache(void) { } | ||
34 | void paging_init(void); | 32 | void paging_init(void); |
35 | void sync_initial_page_table(void); | 33 | void sync_initial_page_table(void); |
36 | 34 | ||
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 4990d26dfc73..0b6c4042942a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h | |||
@@ -241,9 +241,6 @@ extern void cleanup_highmap(void); | |||
241 | #define HAVE_ARCH_UNMAPPED_AREA | 241 | #define HAVE_ARCH_UNMAPPED_AREA |
242 | #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN | 242 | #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN |
243 | 243 | ||
244 | #define pgtable_cache_init() do { } while (0) | ||
245 | #define check_pgt_cache() do { } while (0) | ||
246 | |||
247 | #define PAGE_AGP PAGE_KERNEL_NOCACHE | 244 | #define PAGE_AGP PAGE_KERNEL_NOCACHE |
248 | #define HAVE_PAGE_AGP 1 | 245 | #define HAVE_PAGE_AGP 1 |
249 | 246 | ||
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 44816ff6411f..463940faf52f 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -357,7 +357,7 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, | |||
357 | 357 | ||
358 | static struct kmem_cache *pgd_cache; | 358 | static struct kmem_cache *pgd_cache; |
359 | 359 | ||
360 | void __init pgd_cache_init(void) | 360 | void __init pgtable_cache_init(void) |
361 | { | 361 | { |
362 | /* | 362 | /* |
363 | * When PAE kernel is running as a Xen domain, it does not use | 363 | * When PAE kernel is running as a Xen domain, it does not use |
@@ -402,10 +402,6 @@ static inline void _pgd_free(pgd_t *pgd) | |||
402 | } | 402 | } |
403 | #else | 403 | #else |
404 | 404 | ||
405 | void __init pgd_cache_init(void) | ||
406 | { | ||
407 | } | ||
408 | |||
409 | static inline pgd_t *_pgd_alloc(void) | 405 | static inline pgd_t *_pgd_alloc(void) |
410 | { | 406 | { |
411 | return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, | 407 | return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, |
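Editor's note: throughout this series, architectures drop their empty pgtable_cache_init()/check_pgt_cache() stubs, and x86 renames pgd_cache_init() to pgtable_cache_init() above. The idea is that architectures with nothing to set up fall back to a generic default; whether the kernel implements that default exactly as a weak symbol here is an assumption, but the pattern is the classic one sketched below (GCC/Clang attribute syntax).

#include <stdio.h>

/* weak default: architectures with no page-table caches provide nothing */
void __attribute__((weak)) pgtable_cache_init(void)
{
}

/* An architecture that does need a cache supplies a strong definition:
 *
 *	void pgtable_cache_init(void)
 *	{
 *		pgd_cache = kmem_cache_create(...);
 *	}
 */

int main(void)
{
	pgtable_cache_init();
	puts("pgtable_cache_init() called (weak default)");
	return 0;
}
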
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index ce3ff5e591b9..3f7fe5a8c286 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h | |||
@@ -238,7 +238,6 @@ extern void paging_init(void); | |||
238 | # define swapper_pg_dir NULL | 238 | # define swapper_pg_dir NULL |
239 | static inline void paging_init(void) { } | 239 | static inline void paging_init(void) { } |
240 | #endif | 240 | #endif |
241 | static inline void pgtable_cache_init(void) { } | ||
242 | 241 | ||
243 | /* | 242 | /* |
244 | * The pmd contains the kernel virtual address of the pte page. | 243 | * The pmd contains the kernel virtual address of the pte page. |
diff --git a/arch/xtensa/include/asm/tlbflush.h b/arch/xtensa/include/asm/tlbflush.h index 06875feb27c2..856e2da2e397 100644 --- a/arch/xtensa/include/asm/tlbflush.h +++ b/arch/xtensa/include/asm/tlbflush.h | |||
@@ -160,9 +160,6 @@ static inline void invalidate_dtlb_mapping (unsigned address) | |||
160 | invalidate_dtlb_entry(tlb_entry); | 160 | invalidate_dtlb_entry(tlb_entry); |
161 | } | 161 | } |
162 | 162 | ||
163 | #define check_pgt_cache() do { } while (0) | ||
164 | |||
165 | |||
166 | /* | 163 | /* |
167 | * DO NOT USE THESE FUNCTIONS. These instructions aren't part of the Xtensa | 164 | * DO NOT USE THESE FUNCTIONS. These instructions aren't part of the Xtensa |
168 | * ISA and exist only for test purposes.. | 165 | * ISA and exist only for test purposes.. |
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 20c39d1bcef8..6bea4f3f8040 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
@@ -100,26 +100,9 @@ unsigned long __weak memory_block_size_bytes(void) | |||
100 | } | 100 | } |
101 | EXPORT_SYMBOL_GPL(memory_block_size_bytes); | 101 | EXPORT_SYMBOL_GPL(memory_block_size_bytes); |
102 | 102 | ||
103 | static unsigned long get_memory_block_size(void) | ||
104 | { | ||
105 | unsigned long block_sz; | ||
106 | |||
107 | block_sz = memory_block_size_bytes(); | ||
108 | |||
109 | /* Validate blk_sz is a power of 2 and not less than section size */ | ||
110 | if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) { | ||
111 | WARN_ON(1); | ||
112 | block_sz = MIN_MEMORY_BLOCK_SIZE; | ||
113 | } | ||
114 | |||
115 | return block_sz; | ||
116 | } | ||
117 | |||
118 | /* | 103 | /* |
119 | * use this as the physical section index that this memsection | 104 | * Show the first physical section index (number) of this memory block. |
120 | * uses. | ||
121 | */ | 105 | */ |
122 | |||
123 | static ssize_t phys_index_show(struct device *dev, | 106 | static ssize_t phys_index_show(struct device *dev, |
124 | struct device_attribute *attr, char *buf) | 107 | struct device_attribute *attr, char *buf) |
125 | { | 108 | { |
@@ -131,7 +114,10 @@ static ssize_t phys_index_show(struct device *dev, | |||
131 | } | 114 | } |
132 | 115 | ||
133 | /* | 116 | /* |
134 | * Show whether the section of memory is likely to be hot-removable | 117 | * Show whether the memory block is likely to be offlineable (or is already |
118 | * offline). Once offline, the memory block could be removed. The return | ||
119 | * value does, however, not indicate that there is a way to remove the | ||
120 | * memory block. | ||
135 | */ | 121 | */ |
136 | static ssize_t removable_show(struct device *dev, struct device_attribute *attr, | 122 | static ssize_t removable_show(struct device *dev, struct device_attribute *attr, |
137 | char *buf) | 123 | char *buf) |
@@ -455,12 +441,12 @@ static DEVICE_ATTR_RO(phys_device); | |||
455 | static DEVICE_ATTR_RO(removable); | 441 | static DEVICE_ATTR_RO(removable); |
456 | 442 | ||
457 | /* | 443 | /* |
458 | * Block size attribute stuff | 444 | * Show the memory block size (shared by all memory blocks). |
459 | */ | 445 | */ |
460 | static ssize_t block_size_bytes_show(struct device *dev, | 446 | static ssize_t block_size_bytes_show(struct device *dev, |
461 | struct device_attribute *attr, char *buf) | 447 | struct device_attribute *attr, char *buf) |
462 | { | 448 | { |
463 | return sprintf(buf, "%lx\n", get_memory_block_size()); | 449 | return sprintf(buf, "%lx\n", memory_block_size_bytes()); |
464 | } | 450 | } |
465 | 451 | ||
466 | static DEVICE_ATTR_RO(block_size_bytes); | 452 | static DEVICE_ATTR_RO(block_size_bytes); |
@@ -670,10 +656,10 @@ static int init_memory_block(struct memory_block **memory, | |||
670 | return -ENOMEM; | 656 | return -ENOMEM; |
671 | 657 | ||
672 | mem->start_section_nr = block_id * sections_per_block; | 658 | mem->start_section_nr = block_id * sections_per_block; |
673 | mem->end_section_nr = mem->start_section_nr + sections_per_block - 1; | ||
674 | mem->state = state; | 659 | mem->state = state; |
675 | start_pfn = section_nr_to_pfn(mem->start_section_nr); | 660 | start_pfn = section_nr_to_pfn(mem->start_section_nr); |
676 | mem->phys_device = arch_get_memory_phys_device(start_pfn); | 661 | mem->phys_device = arch_get_memory_phys_device(start_pfn); |
662 | mem->nid = NUMA_NO_NODE; | ||
677 | 663 | ||
678 | ret = register_memory(mem); | 664 | ret = register_memory(mem); |
679 | 665 | ||
@@ -810,19 +796,22 @@ static const struct attribute_group *memory_root_attr_groups[] = { | |||
810 | /* | 796 | /* |
811 | * Initialize the sysfs support for memory devices... | 797 | * Initialize the sysfs support for memory devices... |
812 | */ | 798 | */ |
813 | int __init memory_dev_init(void) | 799 | void __init memory_dev_init(void) |
814 | { | 800 | { |
815 | int ret; | 801 | int ret; |
816 | int err; | 802 | int err; |
817 | unsigned long block_sz, nr; | 803 | unsigned long block_sz, nr; |
818 | 804 | ||
805 | /* Validate the configured memory block size */ | ||
806 | block_sz = memory_block_size_bytes(); | ||
807 | if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE) | ||
808 | panic("Memory block size not suitable: 0x%lx\n", block_sz); | ||
809 | sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; | ||
810 | |||
819 | ret = subsys_system_register(&memory_subsys, memory_root_attr_groups); | 811 | ret = subsys_system_register(&memory_subsys, memory_root_attr_groups); |
820 | if (ret) | 812 | if (ret) |
821 | goto out; | 813 | goto out; |
822 | 814 | ||
823 | block_sz = get_memory_block_size(); | ||
824 | sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; | ||
825 | |||
826 | /* | 815 | /* |
827 | * Create entries for memory sections that were found | 816 | * Create entries for memory sections that were found |
828 | * during boot and have been initialized | 817 | * during boot and have been initialized |
@@ -838,8 +827,7 @@ int __init memory_dev_init(void) | |||
838 | 827 | ||
839 | out: | 828 | out: |
840 | if (ret) | 829 | if (ret) |
841 | printk(KERN_ERR "%s() failed: %d\n", __func__, ret); | 830 | panic("%s() failed: %d\n", __func__, ret); |
842 | return ret; | ||
843 | } | 831 | } |
844 | 832 | ||
845 | /** | 833 | /** |
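Editor's note: memory_dev_init() above now validates the block size once, up front, and panics instead of silently falling back to the minimum. The check it applies is sketched below; MIN_MEMORY_BLOCK_SIZE is a sample value (it is arch-defined in the kernel).

#include <stdbool.h>
#include <stdio.h>

#define MIN_MEMORY_BLOCK_SIZE (1UL << 27)   /* sample: one 128 MiB section */

static bool is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

static bool block_size_ok(unsigned long block_sz)
{
	return is_power_of_2(block_sz) && block_sz >= MIN_MEMORY_BLOCK_SIZE;
}

int main(void)
{
	printf("128M: %d\n", block_size_ok(128UL << 20));  /* 1 */
	printf("96M:  %d\n", block_size_ok(96UL << 20));   /* 0: not a power of two */
	return 0;
}
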
diff --git a/drivers/base/node.c b/drivers/base/node.c index 75b7e6f6535b..296546ffed6c 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c | |||
@@ -427,6 +427,8 @@ static ssize_t node_read_meminfo(struct device *dev, | |||
427 | "Node %d AnonHugePages: %8lu kB\n" | 427 | "Node %d AnonHugePages: %8lu kB\n" |
428 | "Node %d ShmemHugePages: %8lu kB\n" | 428 | "Node %d ShmemHugePages: %8lu kB\n" |
429 | "Node %d ShmemPmdMapped: %8lu kB\n" | 429 | "Node %d ShmemPmdMapped: %8lu kB\n" |
430 | "Node %d FileHugePages: %8lu kB\n" | ||
431 | "Node %d FilePmdMapped: %8lu kB\n" | ||
430 | #endif | 432 | #endif |
431 | , | 433 | , |
432 | nid, K(node_page_state(pgdat, NR_FILE_DIRTY)), | 434 | nid, K(node_page_state(pgdat, NR_FILE_DIRTY)), |
@@ -452,6 +454,10 @@ static ssize_t node_read_meminfo(struct device *dev, | |||
452 | nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * | 454 | nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * |
453 | HPAGE_PMD_NR), | 455 | HPAGE_PMD_NR), |
454 | nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * | 456 | nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * |
457 | HPAGE_PMD_NR), | ||
458 | nid, K(node_page_state(pgdat, NR_FILE_THPS) * | ||
459 | HPAGE_PMD_NR), | ||
460 | nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) * | ||
455 | HPAGE_PMD_NR) | 461 | HPAGE_PMD_NR) |
456 | #endif | 462 | #endif |
457 | ); | 463 | ); |
@@ -756,15 +762,13 @@ static int __ref get_nid_for_pfn(unsigned long pfn) | |||
756 | static int register_mem_sect_under_node(struct memory_block *mem_blk, | 762 | static int register_mem_sect_under_node(struct memory_block *mem_blk, |
757 | void *arg) | 763 | void *arg) |
758 | { | 764 | { |
765 | unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE; | ||
766 | unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); | ||
767 | unsigned long end_pfn = start_pfn + memory_block_pfns - 1; | ||
759 | int ret, nid = *(int *)arg; | 768 | int ret, nid = *(int *)arg; |
760 | unsigned long pfn, sect_start_pfn, sect_end_pfn; | 769 | unsigned long pfn; |
761 | 770 | ||
762 | mem_blk->nid = nid; | 771 | for (pfn = start_pfn; pfn <= end_pfn; pfn++) { |
763 | |||
764 | sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); | ||
765 | sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); | ||
766 | sect_end_pfn += PAGES_PER_SECTION - 1; | ||
767 | for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { | ||
768 | int page_nid; | 772 | int page_nid; |
769 | 773 | ||
770 | /* | 774 | /* |
@@ -789,6 +793,13 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk, | |||
789 | if (page_nid != nid) | 793 | if (page_nid != nid) |
790 | continue; | 794 | continue; |
791 | } | 795 | } |
796 | |||
797 | /* | ||
798 | * If this memory block spans multiple nodes, we only indicate | ||
799 | * the last processed node. | ||
800 | */ | ||
801 | mem_blk->nid = nid; | ||
802 | |||
792 | ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, | 803 | ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, |
793 | &mem_blk->dev.kobj, | 804 | &mem_blk->dev.kobj, |
794 | kobject_name(&mem_blk->dev.kobj)); | 805 | kobject_name(&mem_blk->dev.kobj)); |
@@ -804,32 +815,18 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk, | |||
804 | } | 815 | } |
805 | 816 | ||
806 | /* | 817 | /* |
807 | * Unregister memory block device under all nodes that it spans. | 818 | * Unregister a memory block device under the node it spans. Memory blocks |
808 | * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes). | 819 | * with multiple nodes cannot be offlined and therefore also never be removed. |
809 | */ | 820 | */ |
810 | void unregister_memory_block_under_nodes(struct memory_block *mem_blk) | 821 | void unregister_memory_block_under_nodes(struct memory_block *mem_blk) |
811 | { | 822 | { |
812 | unsigned long pfn, sect_start_pfn, sect_end_pfn; | 823 | if (mem_blk->nid == NUMA_NO_NODE) |
813 | static nodemask_t unlinked_nodes; | 824 | return; |
814 | |||
815 | nodes_clear(unlinked_nodes); | ||
816 | sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); | ||
817 | sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr); | ||
818 | for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) { | ||
819 | int nid; | ||
820 | 825 | ||
821 | nid = get_nid_for_pfn(pfn); | 826 | sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj, |
822 | if (nid < 0) | 827 | kobject_name(&mem_blk->dev.kobj)); |
823 | continue; | 828 | sysfs_remove_link(&mem_blk->dev.kobj, |
824 | if (!node_online(nid)) | 829 | kobject_name(&node_devices[mem_blk->nid]->dev.kobj)); |
825 | continue; | ||
826 | if (node_test_and_set(nid, unlinked_nodes)) | ||
827 | continue; | ||
828 | sysfs_remove_link(&node_devices[nid]->dev.kobj, | ||
829 | kobject_name(&mem_blk->dev.kobj)); | ||
830 | sysfs_remove_link(&mem_blk->dev.kobj, | ||
831 | kobject_name(&node_devices[nid]->dev.kobj)); | ||
832 | } | ||
833 | } | 830 | } |
834 | 831 | ||
835 | int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) | 832 | int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) |
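Editor's note: register_mem_sect_under_node() now derives a block's pfn range from memory_block_size_bytes() rather than from the removed end_section_nr field, and the block's nid is only set once a page belonging to that node is actually found (NUMA_NO_NODE otherwise). The range computation, with sample section and block sizes, is:

#include <stdio.h>

#define PAGE_SHIFT        12
#define PAGE_SIZE         (1UL << PAGE_SHIFT)
#define SECTION_SIZE_BITS 27                                /* sample; arch-defined */
#define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT)

static unsigned long section_nr_to_pfn(unsigned long sec)
{
	return sec << PFN_SECTION_SHIFT;
}

int main(void)
{
	unsigned long memory_block_size_bytes = 1UL << 27;      /* sample 128 MiB block */
	unsigned long start_section_nr = 4;

	unsigned long memory_block_pfns = memory_block_size_bytes / PAGE_SIZE;
	unsigned long start_pfn = section_nr_to_pfn(start_section_nr);
	unsigned long end_pfn = start_pfn + memory_block_pfns - 1;

	printf("block spans pfns %#lx..%#lx\n", start_pfn, end_pfn);
	return 0;
}
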
diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c index c70cb5f272cf..0891ab829b1b 100644 --- a/drivers/crypto/chelsio/chtls/chtls_io.c +++ b/drivers/crypto/chelsio/chtls/chtls_io.c | |||
@@ -1078,7 +1078,7 @@ new_buf: | |||
1078 | bool merge; | 1078 | bool merge; |
1079 | 1079 | ||
1080 | if (page) | 1080 | if (page) |
1081 | pg_size <<= compound_order(page); | 1081 | pg_size = page_size(page); |
1082 | if (off < pg_size && | 1082 | if (off < pg_size && |
1083 | skb_can_coalesce(skb, i, page, off)) { | 1083 | skb_can_coalesce(skb, i, page, off)) { |
1084 | merge = 1; | 1084 | merge = 1; |
@@ -1105,8 +1105,7 @@ new_buf: | |||
1105 | __GFP_NORETRY, | 1105 | __GFP_NORETRY, |
1106 | order); | 1106 | order); |
1107 | if (page) | 1107 | if (page) |
1108 | pg_size <<= | 1108 | pg_size <<= order; |
1109 | compound_order(page); | ||
1110 | } | 1109 | } |
1111 | if (!page) { | 1110 | if (!page) { |
1112 | page = alloc_page(gfp); | 1111 | page = alloc_page(gfp); |
diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c index feaa538026a0..3db000aacd26 100644 --- a/drivers/gpu/drm/via/via_dmablit.c +++ b/drivers/gpu/drm/via/via_dmablit.c | |||
@@ -174,7 +174,6 @@ via_map_blit_for_device(struct pci_dev *pdev, | |||
174 | static void | 174 | static void |
175 | via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg) | 175 | via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg) |
176 | { | 176 | { |
177 | struct page *page; | ||
178 | int i; | 177 | int i; |
179 | 178 | ||
180 | switch (vsg->state) { | 179 | switch (vsg->state) { |
@@ -189,13 +188,8 @@ via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg) | |||
189 | kfree(vsg->desc_pages); | 188 | kfree(vsg->desc_pages); |
190 | /* fall through */ | 189 | /* fall through */ |
191 | case dr_via_pages_locked: | 190 | case dr_via_pages_locked: |
192 | for (i = 0; i < vsg->num_pages; ++i) { | 191 | put_user_pages_dirty_lock(vsg->pages, vsg->num_pages, |
193 | if (NULL != (page = vsg->pages[i])) { | 192 | (vsg->direction == DMA_FROM_DEVICE)); |
194 | if (!PageReserved(page) && (DMA_FROM_DEVICE == vsg->direction)) | ||
195 | SetPageDirty(page); | ||
196 | put_page(page); | ||
197 | } | ||
198 | } | ||
199 | /* fall through */ | 193 | /* fall through */ |
200 | case dr_via_pages_alloc: | 194 | case dr_via_pages_alloc: |
201 | vfree(vsg->pages); | 195 | vfree(vsg->pages); |
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 41f9e268e3fb..24244a2f68cc 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c | |||
@@ -54,10 +54,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d | |||
54 | 54 | ||
55 | for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { | 55 | for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { |
56 | page = sg_page_iter_page(&sg_iter); | 56 | page = sg_page_iter_page(&sg_iter); |
57 | if (umem->writable && dirty) | 57 | put_user_pages_dirty_lock(&page, 1, umem->writable && dirty); |
58 | put_user_pages_dirty_lock(&page, 1); | ||
59 | else | ||
60 | put_user_page(page); | ||
61 | } | 58 | } |
62 | 59 | ||
63 | sg_free_table(&umem->sg_head); | 60 | sg_free_table(&umem->sg_head); |
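Editor's note: this hunk and the driver hunks that follow (via, hfi1, qib, usnic, siw) all collapse an "if (dirty) put_user_pages_dirty_lock(); else put_user_page(s)();" branch into a single call, because put_user_pages_dirty_lock() now takes the dirty decision as a third argument. A stubbed model of the calling-convention change (types and bodies are placeholders for illustration only):

#include <stdbool.h>
#include <stdio.h>

struct page { int id; };

static void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
				       bool make_dirty)
{
	for (unsigned long i = 0; i < npages; i++)
		printf("release page %d%s\n", pages[i]->id,
		       make_dirty ? " (set dirty)" : "");
}

/* Old pattern at every call site:
 *
 *	if (dirty)
 *		put_user_pages_dirty_lock(pages, npages);
 *	else
 *		put_user_pages(pages, npages);
 *
 * New pattern: */
static void release_user_pages(struct page **pages, unsigned long npages, bool dirty)
{
	put_user_pages_dirty_lock(pages, npages, dirty);
}

int main(void)
{
	struct page a = { 1 }, b = { 2 };
	struct page *pages[] = { &a, &b };

	release_user_pages(pages, 2, true);
	return 0;
}
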
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index b89a9b9aef7a..469acb961fbd 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c | |||
@@ -118,10 +118,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np | |||
118 | void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, | 118 | void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, |
119 | size_t npages, bool dirty) | 119 | size_t npages, bool dirty) |
120 | { | 120 | { |
121 | if (dirty) | 121 | put_user_pages_dirty_lock(p, npages, dirty); |
122 | put_user_pages_dirty_lock(p, npages); | ||
123 | else | ||
124 | put_user_pages(p, npages); | ||
125 | 122 | ||
126 | if (mm) { /* during close after signal, mm can be NULL */ | 123 | if (mm) { /* during close after signal, mm can be NULL */ |
127 | atomic64_sub(npages, &mm->pinned_vm); | 124 | atomic64_sub(npages, &mm->pinned_vm); |
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index bfbfbb7e0ff4..6bf764e41891 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c | |||
@@ -40,10 +40,7 @@ | |||
40 | static void __qib_release_user_pages(struct page **p, size_t num_pages, | 40 | static void __qib_release_user_pages(struct page **p, size_t num_pages, |
41 | int dirty) | 41 | int dirty) |
42 | { | 42 | { |
43 | if (dirty) | 43 | put_user_pages_dirty_lock(p, num_pages, dirty); |
44 | put_user_pages_dirty_lock(p, num_pages); | ||
45 | else | ||
46 | put_user_pages(p, num_pages); | ||
47 | } | 44 | } |
48 | 45 | ||
49 | /** | 46 | /** |
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 0b0237d41613..62e6ffa9ad78 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c | |||
@@ -75,10 +75,7 @@ static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) | |||
75 | for_each_sg(chunk->page_list, sg, chunk->nents, i) { | 75 | for_each_sg(chunk->page_list, sg, chunk->nents, i) { |
76 | page = sg_page(sg); | 76 | page = sg_page(sg); |
77 | pa = sg_phys(sg); | 77 | pa = sg_phys(sg); |
78 | if (dirty) | 78 | put_user_pages_dirty_lock(&page, 1, dirty); |
79 | put_user_pages_dirty_lock(&page, 1); | ||
80 | else | ||
81 | put_user_page(page); | ||
82 | usnic_dbg("pa: %pa\n", &pa); | 79 | usnic_dbg("pa: %pa\n", &pa); |
83 | } | 80 | } |
84 | kfree(chunk); | 81 | kfree(chunk); |
diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index 87a56039f0ef..e99983f07663 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c | |||
@@ -63,15 +63,7 @@ struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index) | |||
63 | static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages, | 63 | static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages, |
64 | bool dirty) | 64 | bool dirty) |
65 | { | 65 | { |
66 | struct page **p = chunk->plist; | 66 | put_user_pages_dirty_lock(chunk->plist, num_pages, dirty); |
67 | |||
68 | while (num_pages--) { | ||
69 | if (!PageDirty(*p) && dirty) | ||
70 | put_user_pages_dirty_lock(p, 1); | ||
71 | else | ||
72 | put_user_page(*p); | ||
73 | p++; | ||
74 | } | ||
75 | } | 67 | } |
76 | 68 | ||
77 | void siw_umem_release(struct siw_umem *umem, bool dirty) | 69 | void siw_umem_release(struct siw_umem *umem, bool dirty) |
diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c index aa8d8425be25..b83a1d16bd89 100644 --- a/drivers/staging/android/ion/ion_system_heap.c +++ b/drivers/staging/android/ion/ion_system_heap.c | |||
@@ -120,7 +120,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap, | |||
120 | if (!page) | 120 | if (!page) |
121 | goto free_pages; | 121 | goto free_pages; |
122 | list_add_tail(&page->lru, &pages); | 122 | list_add_tail(&page->lru, &pages); |
123 | size_remaining -= PAGE_SIZE << compound_order(page); | 123 | size_remaining -= page_size(page); |
124 | max_order = compound_order(page); | 124 | max_order = compound_order(page); |
125 | i++; | 125 | i++; |
126 | } | 126 | } |
@@ -133,7 +133,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap, | |||
133 | 133 | ||
134 | sg = table->sgl; | 134 | sg = table->sgl; |
135 | list_for_each_entry_safe(page, tmp_page, &pages, lru) { | 135 | list_for_each_entry_safe(page, tmp_page, &pages, lru) { |
136 | sg_set_page(sg, page, PAGE_SIZE << compound_order(page), 0); | 136 | sg_set_page(sg, page, page_size(page), 0); |
137 | sg = sg_next(sg); | 137 | sg = sg_next(sg); |
138 | list_del(&page->lru); | 138 | list_del(&page->lru); |
139 | } | 139 | } |
diff --git a/drivers/target/tcm_fc/tfc_io.c b/drivers/target/tcm_fc/tfc_io.c index a254792d882c..1354a157e9af 100644 --- a/drivers/target/tcm_fc/tfc_io.c +++ b/drivers/target/tcm_fc/tfc_io.c | |||
@@ -136,8 +136,7 @@ int ft_queue_data_in(struct se_cmd *se_cmd) | |||
136 | page, off_in_page, tlen); | 136 | page, off_in_page, tlen); |
137 | fr_len(fp) += tlen; | 137 | fr_len(fp) += tlen; |
138 | fp_skb(fp)->data_len += tlen; | 138 | fp_skb(fp)->data_len += tlen; |
139 | fp_skb(fp)->truesize += | 139 | fp_skb(fp)->truesize += page_size(page); |
140 | PAGE_SIZE << compound_order(page); | ||
141 | } else { | 140 | } else { |
142 | BUG_ON(!page); | 141 | BUG_ON(!page); |
143 | from = kmap_atomic(page + (mem_off >> PAGE_SHIFT)); | 142 | from = kmap_atomic(page + (mem_off >> PAGE_SHIFT)); |
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 3b18fa4d090a..26cef65b41e7 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c | |||
@@ -176,13 +176,13 @@ put_exit: | |||
176 | } | 176 | } |
177 | 177 | ||
178 | static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, | 178 | static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, |
179 | unsigned int page_shift) | 179 | unsigned int it_page_shift) |
180 | { | 180 | { |
181 | struct page *page; | 181 | struct page *page; |
182 | unsigned long size = 0; | 182 | unsigned long size = 0; |
183 | 183 | ||
184 | if (mm_iommu_is_devmem(mm, hpa, page_shift, &size)) | 184 | if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size)) |
185 | return size == (1UL << page_shift); | 185 | return size == (1UL << it_page_shift); |
186 | 186 | ||
187 | page = pfn_to_page(hpa >> PAGE_SHIFT); | 187 | page = pfn_to_page(hpa >> PAGE_SHIFT); |
188 | /* | 188 | /* |
@@ -190,7 +190,7 @@ static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, | |||
190 | * a page we just found. Otherwise the hardware can get access to | 190 | * a page we just found. Otherwise the hardware can get access to |
191 | * a bigger memory chunk that it should. | 191 | * a bigger memory chunk that it should. |
192 | */ | 192 | */ |
193 | return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift; | 193 | return page_shift(compound_head(page)) >= it_page_shift; |
194 | } | 194 | } |
195 | 195 | ||
196 | static inline bool tce_groups_attached(struct tce_container *container) | 196 | static inline bool tce_groups_attached(struct tce_container *container) |
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d4e11b2e04f6..cec3b4146440 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c | |||
@@ -670,26 +670,6 @@ out: | |||
670 | * libraries. There is no binary dependent code anywhere else. | 670 | * libraries. There is no binary dependent code anywhere else. |
671 | */ | 671 | */ |
672 | 672 | ||
673 | #ifndef STACK_RND_MASK | ||
674 | #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ | ||
675 | #endif | ||
676 | |||
677 | static unsigned long randomize_stack_top(unsigned long stack_top) | ||
678 | { | ||
679 | unsigned long random_variable = 0; | ||
680 | |||
681 | if (current->flags & PF_RANDOMIZE) { | ||
682 | random_variable = get_random_long(); | ||
683 | random_variable &= STACK_RND_MASK; | ||
684 | random_variable <<= PAGE_SHIFT; | ||
685 | } | ||
686 | #ifdef CONFIG_STACK_GROWSUP | ||
687 | return PAGE_ALIGN(stack_top) + random_variable; | ||
688 | #else | ||
689 | return PAGE_ALIGN(stack_top) - random_variable; | ||
690 | #endif | ||
691 | } | ||
692 | |||
693 | static int load_elf_binary(struct linux_binprm *bprm) | 673 | static int load_elf_binary(struct linux_binprm *bprm) |
694 | { | 674 | { |
695 | struct file *interpreter = NULL; /* to shut gcc up */ | 675 | struct file *interpreter = NULL; /* to shut gcc up */ |
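Editor's note: randomize_stack_top() disappears from binfmt_elf because an equivalent helper moves to generic code in this series; the arithmetic it performed is unchanged and is restated below as a standalone sketch (the stack-top value and use of random() are sample choices, not kernel code).

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT     12
#define PAGE_SIZE      (1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x)  (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))   /* 8MB of VA */

static unsigned long randomize_stack_top(unsigned long stack_top, int randomize)
{
	unsigned long random_variable = 0;

	if (randomize) {
		random_variable = (unsigned long)random();
		random_variable &= STACK_RND_MASK;
		random_variable <<= PAGE_SHIFT;
	}
	/* stacks grow down here; CONFIG_STACK_GROWSUP adds instead of subtracting */
	return PAGE_ALIGN(stack_top) - random_variable;
}

int main(void)
{
	printf("stack top: %#lx\n", randomize_stack_top(0x7ffffffff000UL, 1));
	return 0;
}
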
diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 1bda2ab6745b..814ad2c2ba80 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c | |||
@@ -1100,8 +1100,11 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, | |||
1100 | err = -ENOMEM; | 1100 | err = -ENOMEM; |
1101 | goto error; | 1101 | goto error; |
1102 | } | 1102 | } |
1103 | /* Avoid race with userspace read via bdev */ | ||
1104 | lock_buffer(bhs[n]); | ||
1103 | memset(bhs[n]->b_data, 0, sb->s_blocksize); | 1105 | memset(bhs[n]->b_data, 0, sb->s_blocksize); |
1104 | set_buffer_uptodate(bhs[n]); | 1106 | set_buffer_uptodate(bhs[n]); |
1107 | unlock_buffer(bhs[n]); | ||
1105 | mark_buffer_dirty_inode(bhs[n], dir); | 1108 | mark_buffer_dirty_inode(bhs[n], dir); |
1106 | 1109 | ||
1107 | n++; | 1110 | n++; |
@@ -1158,6 +1161,8 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) | |||
1158 | fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); | 1161 | fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); |
1159 | 1162 | ||
1160 | de = (struct msdos_dir_entry *)bhs[0]->b_data; | 1163 | de = (struct msdos_dir_entry *)bhs[0]->b_data; |
1164 | /* Avoid race with userspace read via bdev */ | ||
1165 | lock_buffer(bhs[0]); | ||
1161 | /* filling the new directory slots ("." and ".." entries) */ | 1166 | /* filling the new directory slots ("." and ".." entries) */ |
1162 | memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME); | 1167 | memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME); |
1163 | memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME); | 1168 | memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME); |
@@ -1180,6 +1185,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) | |||
1180 | de[0].size = de[1].size = 0; | 1185 | de[0].size = de[1].size = 0; |
1181 | memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); | 1186 | memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); |
1182 | set_buffer_uptodate(bhs[0]); | 1187 | set_buffer_uptodate(bhs[0]); |
1188 | unlock_buffer(bhs[0]); | ||
1183 | mark_buffer_dirty_inode(bhs[0], dir); | 1189 | mark_buffer_dirty_inode(bhs[0], dir); |
1184 | 1190 | ||
1185 | err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); | 1191 | err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); |
@@ -1237,11 +1243,14 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, | |||
1237 | 1243 | ||
1238 | /* fill the directory entry */ | 1244 | /* fill the directory entry */ |
1239 | copy = min(size, sb->s_blocksize); | 1245 | copy = min(size, sb->s_blocksize); |
1246 | /* Avoid race with userspace read via bdev */ | ||
1247 | lock_buffer(bhs[n]); | ||
1240 | memcpy(bhs[n]->b_data, slots, copy); | 1248 | memcpy(bhs[n]->b_data, slots, copy); |
1241 | slots += copy; | ||
1242 | size -= copy; | ||
1243 | set_buffer_uptodate(bhs[n]); | 1249 | set_buffer_uptodate(bhs[n]); |
1250 | unlock_buffer(bhs[n]); | ||
1244 | mark_buffer_dirty_inode(bhs[n], dir); | 1251 | mark_buffer_dirty_inode(bhs[n], dir); |
1252 | slots += copy; | ||
1253 | size -= copy; | ||
1245 | if (!size) | 1254 | if (!size) |
1246 | break; | 1255 | break; |
1247 | n++; | 1256 | n++; |
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 265983635f2b..3647c65a0f48 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c | |||
@@ -388,8 +388,11 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, | |||
388 | err = -ENOMEM; | 388 | err = -ENOMEM; |
389 | goto error; | 389 | goto error; |
390 | } | 390 | } |
391 | /* Avoid race with userspace read via bdev */ | ||
392 | lock_buffer(c_bh); | ||
391 | memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); | 393 | memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); |
392 | set_buffer_uptodate(c_bh); | 394 | set_buffer_uptodate(c_bh); |
395 | unlock_buffer(c_bh); | ||
393 | mark_buffer_dirty_inode(c_bh, sbi->fat_inode); | 396 | mark_buffer_dirty_inode(c_bh, sbi->fat_inode); |
394 | if (sb->s_flags & SB_SYNCHRONOUS) | 397 | if (sb->s_flags & SB_SYNCHRONOUS) |
395 | err = sync_dirty_buffer(c_bh); | 398 | err = sync_dirty_buffer(c_bh); |
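Editor's note: the FAT hunks above wrap each buffer rewrite in lock_buffer()/unlock_buffer() so a concurrent reader going through the block device never observes a half-updated buffer. The userspace analogy below uses a pthread mutex to stand in for the buffer_head lock; it illustrates the locking pattern only and is not kernel code.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

struct buffer {
	pthread_mutex_t lock;
	char data[64];
};

static void buffer_update(struct buffer *bh, const char *new_contents)
{
	pthread_mutex_lock(&bh->lock);          /* lock_buffer(bh) */
	memset(bh->data, 0, sizeof(bh->data));
	strncpy(bh->data, new_contents, sizeof(bh->data) - 1);
	pthread_mutex_unlock(&bh->lock);        /* unlock_buffer(bh) */
	/* mark_buffer_dirty_inode(bh, dir) would follow here */
}

static void buffer_read(struct buffer *bh, char *out, size_t len)
{
	pthread_mutex_lock(&bh->lock);
	strncpy(out, bh->data, len - 1);
	out[len - 1] = '\0';
	pthread_mutex_unlock(&bh->lock);
}

int main(void)
{
	struct buffer bh = { .lock = PTHREAD_MUTEX_INITIALIZER };
	char out[64];

	buffer_update(&bh, "new directory entry");
	buffer_read(&bh, out, sizeof(out));
	printf("%s\n", out);
	return 0;
}
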
diff --git a/fs/inode.c b/fs/inode.c index 64bf28cf05cd..fef457a42882 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -181,6 +181,9 @@ int inode_init_always(struct super_block *sb, struct inode *inode) | |||
181 | mapping->flags = 0; | 181 | mapping->flags = 0; |
182 | mapping->wb_err = 0; | 182 | mapping->wb_err = 0; |
183 | atomic_set(&mapping->i_mmap_writable, 0); | 183 | atomic_set(&mapping->i_mmap_writable, 0); |
184 | #ifdef CONFIG_READ_ONLY_THP_FOR_FS | ||
185 | atomic_set(&mapping->nr_thps, 0); | ||
186 | #endif | ||
184 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); | 187 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); |
185 | mapping->private_data = NULL; | 188 | mapping->private_data = NULL; |
186 | mapping->writeback_index = 0; | 189 | mapping->writeback_index = 0; |
diff --git a/fs/io_uring.c b/fs/io_uring.c index 0dadbdbead0f..f83de4c6a826 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c | |||
@@ -3319,7 +3319,7 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) | |||
3319 | } | 3319 | } |
3320 | 3320 | ||
3321 | page = virt_to_head_page(ptr); | 3321 | page = virt_to_head_page(ptr); |
3322 | if (sz > (PAGE_SIZE << compound_order(page))) | 3322 | if (sz > page_size(page)) |
3323 | return -EINVAL; | 3323 | return -EINVAL; |
3324 | 3324 | ||
3325 | pfn = virt_to_phys(ptr) >> PAGE_SHIFT; | 3325 | pfn = virt_to_phys(ptr) >> PAGE_SHIFT; |
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 953990eb70a9..1c58859aa592 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -89,8 +89,6 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); | |||
89 | EXPORT_SYMBOL(jbd2_journal_invalidatepage); | 89 | EXPORT_SYMBOL(jbd2_journal_invalidatepage); |
90 | EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); | 90 | EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); |
91 | EXPORT_SYMBOL(jbd2_journal_force_commit); | 91 | EXPORT_SYMBOL(jbd2_journal_force_commit); |
92 | EXPORT_SYMBOL(jbd2_journal_inode_add_write); | ||
93 | EXPORT_SYMBOL(jbd2_journal_inode_add_wait); | ||
94 | EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); | 92 | EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); |
95 | EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); | 93 | EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); |
96 | EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); | 94 | EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); |
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index afc06daee5bb..bee8498d7792 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -2622,18 +2622,6 @@ done: | |||
2622 | return 0; | 2622 | return 0; |
2623 | } | 2623 | } |
2624 | 2624 | ||
2625 | int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode) | ||
2626 | { | ||
2627 | return jbd2_journal_file_inode(handle, jinode, | ||
2628 | JI_WRITE_DATA | JI_WAIT_DATA, 0, LLONG_MAX); | ||
2629 | } | ||
2630 | |||
2631 | int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode) | ||
2632 | { | ||
2633 | return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, 0, | ||
2634 | LLONG_MAX); | ||
2635 | } | ||
2636 | |||
2637 | int jbd2_journal_inode_ranged_write(handle_t *handle, | 2625 | int jbd2_journal_inode_ranged_write(handle_t *handle, |
2638 | struct jbd2_inode *jinode, loff_t start_byte, loff_t length) | 2626 | struct jbd2_inode *jinode, loff_t start_byte, loff_t length) |
2639 | { | 2627 | { |
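The removed wrappers were thin shims over the ranged API covering the whole file; any caller that still wants the old whole-file behaviour can spell it out explicitly. A hedged sketch of the equivalence, based on the removed bodies above (hypothetical helper name):

#include <linux/jbd2.h>
#include <linux/limits.h>

/* Reproduce the removed jbd2_journal_inode_add_write() via the ranged API. */
static int journal_whole_file_write(handle_t *handle, struct jbd2_inode *jinode)
{
        return jbd2_journal_inode_ranged_write(handle, jinode, 0, LLONG_MAX);
}

The wait-only variant maps the same way onto jbd2_journal_inode_ranged_wait(handle, jinode, 0, LLONG_MAX).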
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 0c335b51043d..f9baefc76cf9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -5993,6 +5993,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) | |||
5993 | struct buffer_head *data_alloc_bh = NULL; | 5993 | struct buffer_head *data_alloc_bh = NULL; |
5994 | struct ocfs2_dinode *di; | 5994 | struct ocfs2_dinode *di; |
5995 | struct ocfs2_truncate_log *tl; | 5995 | struct ocfs2_truncate_log *tl; |
5996 | struct ocfs2_journal *journal = osb->journal; | ||
5996 | 5997 | ||
5997 | BUG_ON(inode_trylock(tl_inode)); | 5998 | BUG_ON(inode_trylock(tl_inode)); |
5998 | 5999 | ||
@@ -6013,6 +6014,20 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) | |||
6013 | goto out; | 6014 | goto out; |
6014 | } | 6015 | } |
6015 | 6016 | ||
6017 | /* Appending to the truncate log (TA) and flushing the truncate log (TF) are | ||
6018 | * two separate transactions. They can both be committed but not | ||
6019 | * checkpointed. If a crash occurs, both transactions will be | ||
6020 | * replayed with several clusters already released to the global bitmap. | ||
6021 | * The truncate log replay then results in a cluster double free. | ||
6022 | */ | ||
6023 | jbd2_journal_lock_updates(journal->j_journal); | ||
6024 | status = jbd2_journal_flush(journal->j_journal); | ||
6025 | jbd2_journal_unlock_updates(journal->j_journal); | ||
6026 | if (status < 0) { | ||
6027 | mlog_errno(status); | ||
6028 | goto out; | ||
6029 | } | ||
6030 | |||
6016 | data_alloc_inode = ocfs2_get_system_file_inode(osb, | 6031 | data_alloc_inode = ocfs2_get_system_file_inode(osb, |
6017 | GLOBAL_BITMAP_SYSTEM_INODE, | 6032 | GLOBAL_BITMAP_SYSTEM_INODE, |
6018 | OCFS2_INVALID_SLOT); | 6033 | OCFS2_INVALID_SLOT); |
@@ -6792,6 +6807,8 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, | |||
6792 | struct page *page, int zero, u64 *phys) | 6807 | struct page *page, int zero, u64 *phys) |
6793 | { | 6808 | { |
6794 | int ret, partial = 0; | 6809 | int ret, partial = 0; |
6810 | loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from; | ||
6811 | loff_t length = to - from; | ||
6795 | 6812 | ||
6796 | ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); | 6813 | ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); |
6797 | if (ret) | 6814 | if (ret) |
@@ -6811,7 +6828,8 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, | |||
6811 | if (ret < 0) | 6828 | if (ret < 0) |
6812 | mlog_errno(ret); | 6829 | mlog_errno(ret); |
6813 | else if (ocfs2_should_order_data(inode)) { | 6830 | else if (ocfs2_should_order_data(inode)) { |
6814 | ret = ocfs2_jbd2_file_inode(handle, inode); | 6831 | ret = ocfs2_jbd2_inode_add_write(handle, inode, |
6832 | start_byte, length); | ||
6815 | if (ret < 0) | 6833 | if (ret < 0) |
6816 | mlog_errno(ret); | 6834 | mlog_errno(ret); |
6817 | } | 6835 | } |
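The comment explains the ordering hazard; the fix is to fully checkpoint the journal before clusters are handed back to the global bitmap, so neither the append nor the flush transaction can be replayed after a crash. The jbd2 flush idiom it relies on, in isolation (sketch, kernel context assumed):

#include <linux/jbd2.h>

/* Sketch: quiesce new handles, checkpoint every committed transaction to its
 * final location, then let updates continue. After a successful flush the
 * journal is empty, so crash recovery will not replay those transactions. */
static int flush_journal(journal_t *j)
{
        int err;

        jbd2_journal_lock_updates(j);
        err = jbd2_journal_flush(j);
        jbd2_journal_unlock_updates(j);
        return err;
}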
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a4c905d6b575..8de1c9d644f6 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -942,7 +942,8 @@ static void ocfs2_write_failure(struct inode *inode, | |||
942 | 942 | ||
943 | if (tmppage && page_has_buffers(tmppage)) { | 943 | if (tmppage && page_has_buffers(tmppage)) { |
944 | if (ocfs2_should_order_data(inode)) | 944 | if (ocfs2_should_order_data(inode)) |
945 | ocfs2_jbd2_file_inode(wc->w_handle, inode); | 945 | ocfs2_jbd2_inode_add_write(wc->w_handle, inode, |
946 | user_pos, user_len); | ||
946 | 947 | ||
947 | block_commit_write(tmppage, from, to); | 948 | block_commit_write(tmppage, from, to); |
948 | } | 949 | } |
@@ -2023,8 +2024,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, | |||
2023 | } | 2024 | } |
2024 | 2025 | ||
2025 | if (page_has_buffers(tmppage)) { | 2026 | if (page_has_buffers(tmppage)) { |
2026 | if (handle && ocfs2_should_order_data(inode)) | 2027 | if (handle && ocfs2_should_order_data(inode)) { |
2027 | ocfs2_jbd2_file_inode(handle, inode); | 2028 | loff_t start_byte = |
2029 | ((loff_t)tmppage->index << PAGE_SHIFT) + | ||
2030 | from; | ||
2031 | loff_t length = to - from; | ||
2032 | ocfs2_jbd2_inode_add_write(handle, inode, | ||
2033 | start_byte, length); | ||
2034 | } | ||
2028 | block_commit_write(tmppage, from, to); | 2035 | block_commit_write(tmppage, from, to); |
2029 | } | 2036 | } |
2030 | } | 2037 | } |
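Both aops.c call sites above now describe exactly which bytes of the page are being ordered. The page-to-byte-range conversion is the only new arithmetic; a sketch with the same names as the hunk (hypothetical wrapper):

/* Order the bytes [from, to) of @page for jbd2 before the handle commits. */
static int order_page_range(handle_t *handle, struct inode *inode,
                            struct page *page, unsigned int from, unsigned int to)
{
        loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from; /* absolute file offset */
        loff_t length = to - from;                                      /* bytes dirtied in this page */

        return ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
}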
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index 429e6a8359a5..eaf042feaf5e 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c | |||
@@ -231,14 +231,6 @@ static int blockcheck_u64_get(void *data, u64 *val) | |||
231 | } | 231 | } |
232 | DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); | 232 | DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); |
233 | 233 | ||
234 | static struct dentry *blockcheck_debugfs_create(const char *name, | ||
235 | struct dentry *parent, | ||
236 | u64 *value) | ||
237 | { | ||
238 | return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value, | ||
239 | &blockcheck_fops); | ||
240 | } | ||
241 | |||
242 | static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) | 234 | static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) |
243 | { | 235 | { |
244 | if (stats) { | 236 | if (stats) { |
@@ -250,16 +242,20 @@ static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) | |||
250 | static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, | 242 | static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, |
251 | struct dentry *parent) | 243 | struct dentry *parent) |
252 | { | 244 | { |
253 | stats->b_debug_dir = debugfs_create_dir("blockcheck", parent); | 245 | struct dentry *dir; |
246 | |||
247 | dir = debugfs_create_dir("blockcheck", parent); | ||
248 | stats->b_debug_dir = dir; | ||
249 | |||
250 | debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR, dir, | ||
251 | &stats->b_check_count, &blockcheck_fops); | ||
254 | 252 | ||
255 | blockcheck_debugfs_create("blocks_checked", stats->b_debug_dir, | 253 | debugfs_create_file("checksums_failed", S_IFREG | S_IRUSR, dir, |
256 | &stats->b_check_count); | 254 | &stats->b_failure_count, &blockcheck_fops); |
257 | 255 | ||
258 | blockcheck_debugfs_create("checksums_failed", stats->b_debug_dir, | 256 | debugfs_create_file("ecc_recoveries", S_IFREG | S_IRUSR, dir, |
259 | &stats->b_failure_count); | 257 | &stats->b_recover_count, &blockcheck_fops); |
260 | 258 | ||
261 | blockcheck_debugfs_create("ecc_recoveries", stats->b_debug_dir, | ||
262 | &stats->b_recover_count); | ||
263 | } | 259 | } |
264 | #else | 260 | #else |
265 | static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, | 261 | static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, |
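This conversion follows the common debugfs convention: the return values of debugfs_create_dir()/debugfs_create_file() are deliberately not checked (debugfs failures are not meant to be fatal), per-file dentries need not be stored, and teardown is a single debugfs_remove_recursive() on the directory. The heartbeat, dlmdebug, dlmglue and super.c hunks below apply the same pattern. A minimal sketch with hypothetical names:

#include <linux/debugfs.h>
#include <linux/stat.h>

static struct dentry *my_dir;   /* hypothetical example, not ocfs2 code */

static void my_debug_install(struct dentry *parent, u64 *counter,
                             const struct file_operations *fops)
{
        my_dir = debugfs_create_dir("mydbg", parent);
        /* No error checking: debugfs creation failures are ignored by design. */
        debugfs_create_file("counter", S_IFREG | S_IRUSR, my_dir, counter, fops);
}

static void my_debug_remove(void)
{
        debugfs_remove_recursive(my_dir);       /* removes the dir and every file in it */
        my_dir = NULL;
}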
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f1b613327ac8..a368350d4c27 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -225,10 +225,6 @@ struct o2hb_region { | |||
225 | unsigned int hr_region_num; | 225 | unsigned int hr_region_num; |
226 | 226 | ||
227 | struct dentry *hr_debug_dir; | 227 | struct dentry *hr_debug_dir; |
228 | struct dentry *hr_debug_livenodes; | ||
229 | struct dentry *hr_debug_regnum; | ||
230 | struct dentry *hr_debug_elapsed_time; | ||
231 | struct dentry *hr_debug_pinned; | ||
232 | struct o2hb_debug_buf *hr_db_livenodes; | 228 | struct o2hb_debug_buf *hr_db_livenodes; |
233 | struct o2hb_debug_buf *hr_db_regnum; | 229 | struct o2hb_debug_buf *hr_db_regnum; |
234 | struct o2hb_debug_buf *hr_db_elapsed_time; | 230 | struct o2hb_debug_buf *hr_db_elapsed_time; |
@@ -1394,21 +1390,20 @@ void o2hb_exit(void) | |||
1394 | kfree(o2hb_db_failedregions); | 1390 | kfree(o2hb_db_failedregions); |
1395 | } | 1391 | } |
1396 | 1392 | ||
1397 | static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, | 1393 | static void o2hb_debug_create(const char *name, struct dentry *dir, |
1398 | struct o2hb_debug_buf **db, int db_len, | 1394 | struct o2hb_debug_buf **db, int db_len, int type, |
1399 | int type, int size, int len, void *data) | 1395 | int size, int len, void *data) |
1400 | { | 1396 | { |
1401 | *db = kmalloc(db_len, GFP_KERNEL); | 1397 | *db = kmalloc(db_len, GFP_KERNEL); |
1402 | if (!*db) | 1398 | if (!*db) |
1403 | return NULL; | 1399 | return; |
1404 | 1400 | ||
1405 | (*db)->db_type = type; | 1401 | (*db)->db_type = type; |
1406 | (*db)->db_size = size; | 1402 | (*db)->db_size = size; |
1407 | (*db)->db_len = len; | 1403 | (*db)->db_len = len; |
1408 | (*db)->db_data = data; | 1404 | (*db)->db_data = data; |
1409 | 1405 | ||
1410 | return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, | 1406 | debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, &o2hb_debug_fops); |
1411 | &o2hb_debug_fops); | ||
1412 | } | 1407 | } |
1413 | 1408 | ||
1414 | static void o2hb_debug_init(void) | 1409 | static void o2hb_debug_init(void) |
@@ -1525,11 +1520,7 @@ static void o2hb_region_release(struct config_item *item) | |||
1525 | 1520 | ||
1526 | kfree(reg->hr_slots); | 1521 | kfree(reg->hr_slots); |
1527 | 1522 | ||
1528 | debugfs_remove(reg->hr_debug_livenodes); | 1523 | debugfs_remove_recursive(reg->hr_debug_dir); |
1529 | debugfs_remove(reg->hr_debug_regnum); | ||
1530 | debugfs_remove(reg->hr_debug_elapsed_time); | ||
1531 | debugfs_remove(reg->hr_debug_pinned); | ||
1532 | debugfs_remove(reg->hr_debug_dir); | ||
1533 | kfree(reg->hr_db_livenodes); | 1524 | kfree(reg->hr_db_livenodes); |
1534 | kfree(reg->hr_db_regnum); | 1525 | kfree(reg->hr_db_regnum); |
1535 | kfree(reg->hr_db_elapsed_time); | 1526 | kfree(reg->hr_db_elapsed_time); |
@@ -1988,69 +1979,33 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group | |||
1988 | : NULL; | 1979 | : NULL; |
1989 | } | 1980 | } |
1990 | 1981 | ||
1991 | static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) | 1982 | static void o2hb_debug_region_init(struct o2hb_region *reg, |
1983 | struct dentry *parent) | ||
1992 | { | 1984 | { |
1993 | int ret = -ENOMEM; | 1985 | struct dentry *dir; |
1994 | 1986 | ||
1995 | reg->hr_debug_dir = | 1987 | dir = debugfs_create_dir(config_item_name(®->hr_item), parent); |
1996 | debugfs_create_dir(config_item_name(®->hr_item), dir); | 1988 | reg->hr_debug_dir = dir; |
1997 | if (!reg->hr_debug_dir) { | ||
1998 | mlog_errno(ret); | ||
1999 | goto bail; | ||
2000 | } | ||
2001 | 1989 | ||
2002 | reg->hr_debug_livenodes = | 1990 | o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, &(reg->hr_db_livenodes), |
2003 | o2hb_debug_create(O2HB_DEBUG_LIVENODES, | 1991 | sizeof(*(reg->hr_db_livenodes)), |
2004 | reg->hr_debug_dir, | 1992 | O2HB_DB_TYPE_REGION_LIVENODES, |
2005 | &(reg->hr_db_livenodes), | 1993 | sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES, |
2006 | sizeof(*(reg->hr_db_livenodes)), | 1994 | reg); |
2007 | O2HB_DB_TYPE_REGION_LIVENODES, | ||
2008 | sizeof(reg->hr_live_node_bitmap), | ||
2009 | O2NM_MAX_NODES, reg); | ||
2010 | if (!reg->hr_debug_livenodes) { | ||
2011 | mlog_errno(ret); | ||
2012 | goto bail; | ||
2013 | } | ||
2014 | 1995 | ||
2015 | reg->hr_debug_regnum = | 1996 | o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, &(reg->hr_db_regnum), |
2016 | o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, | 1997 | sizeof(*(reg->hr_db_regnum)), |
2017 | reg->hr_debug_dir, | 1998 | O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg); |
2018 | &(reg->hr_db_regnum), | ||
2019 | sizeof(*(reg->hr_db_regnum)), | ||
2020 | O2HB_DB_TYPE_REGION_NUMBER, | ||
2021 | 0, O2NM_MAX_NODES, reg); | ||
2022 | if (!reg->hr_debug_regnum) { | ||
2023 | mlog_errno(ret); | ||
2024 | goto bail; | ||
2025 | } | ||
2026 | 1999 | ||
2027 | reg->hr_debug_elapsed_time = | 2000 | o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir, |
2028 | o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, | 2001 | &(reg->hr_db_elapsed_time), |
2029 | reg->hr_debug_dir, | 2002 | sizeof(*(reg->hr_db_elapsed_time)), |
2030 | &(reg->hr_db_elapsed_time), | 2003 | O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg); |
2031 | sizeof(*(reg->hr_db_elapsed_time)), | ||
2032 | O2HB_DB_TYPE_REGION_ELAPSED_TIME, | ||
2033 | 0, 0, reg); | ||
2034 | if (!reg->hr_debug_elapsed_time) { | ||
2035 | mlog_errno(ret); | ||
2036 | goto bail; | ||
2037 | } | ||
2038 | 2004 | ||
2039 | reg->hr_debug_pinned = | 2005 | o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, &(reg->hr_db_pinned), |
2040 | o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, | 2006 | sizeof(*(reg->hr_db_pinned)), |
2041 | reg->hr_debug_dir, | 2007 | O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg); |
2042 | &(reg->hr_db_pinned), | ||
2043 | sizeof(*(reg->hr_db_pinned)), | ||
2044 | O2HB_DB_TYPE_REGION_PINNED, | ||
2045 | 0, 0, reg); | ||
2046 | if (!reg->hr_debug_pinned) { | ||
2047 | mlog_errno(ret); | ||
2048 | goto bail; | ||
2049 | } | ||
2050 | 2008 | ||
2051 | ret = 0; | ||
2052 | bail: | ||
2053 | return ret; | ||
2054 | } | 2009 | } |
2055 | 2010 | ||
2056 | static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, | 2011 | static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, |
@@ -2106,11 +2061,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g | |||
2106 | if (ret) | 2061 | if (ret) |
2107 | goto unregister_handler; | 2062 | goto unregister_handler; |
2108 | 2063 | ||
2109 | ret = o2hb_debug_region_init(reg, o2hb_debug_dir); | 2064 | o2hb_debug_region_init(reg, o2hb_debug_dir); |
2110 | if (ret) { | ||
2111 | config_item_put(®->hr_item); | ||
2112 | goto unregister_handler; | ||
2113 | } | ||
2114 | 2065 | ||
2115 | return ®->hr_item; | 2066 | return ®->hr_item; |
2116 | 2067 | ||
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 784426dee56c..bdef72c0f099 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c | |||
@@ -3636,7 +3636,7 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, | |||
3636 | int i, j, num_used; | 3636 | int i, j, num_used; |
3637 | u32 major_hash; | 3637 | u32 major_hash; |
3638 | struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; | 3638 | struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; |
3639 | struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list; | 3639 | struct ocfs2_dx_entry_list *orig_list, *tmp_list; |
3640 | struct ocfs2_dx_entry *dx_entry; | 3640 | struct ocfs2_dx_entry *dx_entry; |
3641 | 3641 | ||
3642 | tmp_list = &tmp_dx_leaf->dl_list; | 3642 | tmp_list = &tmp_dx_leaf->dl_list; |
@@ -3645,7 +3645,6 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, | |||
3645 | orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; | 3645 | orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; |
3646 | orig_list = &orig_dx_leaf->dl_list; | 3646 | orig_list = &orig_dx_leaf->dl_list; |
3647 | new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; | 3647 | new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; |
3648 | new_list = &new_dx_leaf->dl_list; | ||
3649 | 3648 | ||
3650 | num_used = le16_to_cpu(orig_list->de_num_used); | 3649 | num_used = le16_to_cpu(orig_list->de_num_used); |
3651 | 3650 | ||
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 69a429b625cc..aaf24548b02a 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h | |||
@@ -142,7 +142,6 @@ struct dlm_ctxt | |||
142 | atomic_t res_tot_count; | 142 | atomic_t res_tot_count; |
143 | atomic_t res_cur_count; | 143 | atomic_t res_cur_count; |
144 | 144 | ||
145 | struct dlm_debug_ctxt *dlm_debug_ctxt; | ||
146 | struct dentry *dlm_debugfs_subroot; | 145 | struct dentry *dlm_debugfs_subroot; |
147 | 146 | ||
148 | /* NOTE: Next three are protected by dlm_domain_lock */ | 147 | /* NOTE: Next three are protected by dlm_domain_lock */ |
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index a4b58ba99927..4d0b452012b2 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c | |||
@@ -853,67 +853,34 @@ static const struct file_operations debug_state_fops = { | |||
853 | /* files in subroot */ | 853 | /* files in subroot */ |
854 | void dlm_debug_init(struct dlm_ctxt *dlm) | 854 | void dlm_debug_init(struct dlm_ctxt *dlm) |
855 | { | 855 | { |
856 | struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; | ||
857 | |||
858 | /* for dumping dlm_ctxt */ | 856 | /* for dumping dlm_ctxt */ |
859 | dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, | 857 | debugfs_create_file(DLM_DEBUGFS_DLM_STATE, S_IFREG|S_IRUSR, |
860 | S_IFREG|S_IRUSR, | 858 | dlm->dlm_debugfs_subroot, dlm, &debug_state_fops); |
861 | dlm->dlm_debugfs_subroot, | ||
862 | dlm, &debug_state_fops); | ||
863 | 859 | ||
864 | /* for dumping lockres */ | 860 | /* for dumping lockres */ |
865 | dc->debug_lockres_dentry = | 861 | debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, S_IFREG|S_IRUSR, |
866 | debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, | 862 | dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops); |
867 | S_IFREG|S_IRUSR, | ||
868 | dlm->dlm_debugfs_subroot, | ||
869 | dlm, &debug_lockres_fops); | ||
870 | 863 | ||
871 | /* for dumping mles */ | 864 | /* for dumping mles */ |
872 | dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, | 865 | debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR, |
873 | S_IFREG|S_IRUSR, | 866 | dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops); |
874 | dlm->dlm_debugfs_subroot, | ||
875 | dlm, &debug_mle_fops); | ||
876 | 867 | ||
877 | /* for dumping lockres on the purge list */ | 868 | /* for dumping lockres on the purge list */ |
878 | dc->debug_purgelist_dentry = | 869 | debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, S_IFREG|S_IRUSR, |
879 | debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, | 870 | dlm->dlm_debugfs_subroot, dlm, |
880 | S_IFREG|S_IRUSR, | 871 | &debug_purgelist_fops); |
881 | dlm->dlm_debugfs_subroot, | ||
882 | dlm, &debug_purgelist_fops); | ||
883 | } | ||
884 | |||
885 | void dlm_debug_shutdown(struct dlm_ctxt *dlm) | ||
886 | { | ||
887 | struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; | ||
888 | |||
889 | if (dc) { | ||
890 | debugfs_remove(dc->debug_purgelist_dentry); | ||
891 | debugfs_remove(dc->debug_mle_dentry); | ||
892 | debugfs_remove(dc->debug_lockres_dentry); | ||
893 | debugfs_remove(dc->debug_state_dentry); | ||
894 | kfree(dc); | ||
895 | dc = NULL; | ||
896 | } | ||
897 | } | 872 | } |
898 | 873 | ||
899 | /* subroot - domain dir */ | 874 | /* subroot - domain dir */ |
900 | int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) | 875 | void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) |
901 | { | 876 | { |
902 | dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), | ||
903 | GFP_KERNEL); | ||
904 | if (!dlm->dlm_debug_ctxt) { | ||
905 | mlog_errno(-ENOMEM); | ||
906 | return -ENOMEM; | ||
907 | } | ||
908 | |||
909 | dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, | 877 | dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, |
910 | dlm_debugfs_root); | 878 | dlm_debugfs_root); |
911 | return 0; | ||
912 | } | 879 | } |
913 | 880 | ||
914 | void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) | 881 | void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) |
915 | { | 882 | { |
916 | debugfs_remove(dlm->dlm_debugfs_subroot); | 883 | debugfs_remove_recursive(dlm->dlm_debugfs_subroot); |
917 | } | 884 | } |
918 | 885 | ||
919 | /* debugfs root */ | 886 | /* debugfs root */ |
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index 7d0c7c9013ce..f8fd8680a4b6 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h | |||
@@ -14,13 +14,6 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle); | |||
14 | 14 | ||
15 | #ifdef CONFIG_DEBUG_FS | 15 | #ifdef CONFIG_DEBUG_FS |
16 | 16 | ||
17 | struct dlm_debug_ctxt { | ||
18 | struct dentry *debug_state_dentry; | ||
19 | struct dentry *debug_lockres_dentry; | ||
20 | struct dentry *debug_mle_dentry; | ||
21 | struct dentry *debug_purgelist_dentry; | ||
22 | }; | ||
23 | |||
24 | struct debug_lockres { | 17 | struct debug_lockres { |
25 | int dl_len; | 18 | int dl_len; |
26 | char *dl_buf; | 19 | char *dl_buf; |
@@ -29,9 +22,8 @@ struct debug_lockres { | |||
29 | }; | 22 | }; |
30 | 23 | ||
31 | void dlm_debug_init(struct dlm_ctxt *dlm); | 24 | void dlm_debug_init(struct dlm_ctxt *dlm); |
32 | void dlm_debug_shutdown(struct dlm_ctxt *dlm); | ||
33 | 25 | ||
34 | int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); | 26 | void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); |
35 | void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); | 27 | void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); |
36 | 28 | ||
37 | void dlm_create_debugfs_root(void); | 29 | void dlm_create_debugfs_root(void); |
@@ -42,12 +34,8 @@ void dlm_destroy_debugfs_root(void); | |||
42 | static inline void dlm_debug_init(struct dlm_ctxt *dlm) | 34 | static inline void dlm_debug_init(struct dlm_ctxt *dlm) |
43 | { | 35 | { |
44 | } | 36 | } |
45 | static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) | 37 | static inline void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) |
46 | { | ||
47 | } | ||
48 | static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) | ||
49 | { | 38 | { |
50 | return 0; | ||
51 | } | 39 | } |
52 | static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) | 40 | static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) |
53 | { | 41 | { |
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 7338b5d4647c..ee6f459f9770 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -387,7 +387,6 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) | |||
387 | static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) | 387 | static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) |
388 | { | 388 | { |
389 | dlm_unregister_domain_handlers(dlm); | 389 | dlm_unregister_domain_handlers(dlm); |
390 | dlm_debug_shutdown(dlm); | ||
391 | dlm_complete_thread(dlm); | 390 | dlm_complete_thread(dlm); |
392 | dlm_complete_recovery_thread(dlm); | 391 | dlm_complete_recovery_thread(dlm); |
393 | dlm_destroy_dlm_worker(dlm); | 392 | dlm_destroy_dlm_worker(dlm); |
@@ -1938,7 +1937,6 @@ bail: | |||
1938 | 1937 | ||
1939 | if (status) { | 1938 | if (status) { |
1940 | dlm_unregister_domain_handlers(dlm); | 1939 | dlm_unregister_domain_handlers(dlm); |
1941 | dlm_debug_shutdown(dlm); | ||
1942 | dlm_complete_thread(dlm); | 1940 | dlm_complete_thread(dlm); |
1943 | dlm_complete_recovery_thread(dlm); | 1941 | dlm_complete_recovery_thread(dlm); |
1944 | dlm_destroy_dlm_worker(dlm); | 1942 | dlm_destroy_dlm_worker(dlm); |
@@ -1992,9 +1990,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | |||
1992 | dlm->key = key; | 1990 | dlm->key = key; |
1993 | dlm->node_num = o2nm_this_node(); | 1991 | dlm->node_num = o2nm_this_node(); |
1994 | 1992 | ||
1995 | ret = dlm_create_debugfs_subroot(dlm); | 1993 | dlm_create_debugfs_subroot(dlm); |
1996 | if (ret < 0) | ||
1997 | goto leave; | ||
1998 | 1994 | ||
1999 | spin_lock_init(&dlm->spinlock); | 1995 | spin_lock_init(&dlm->spinlock); |
2000 | spin_lock_init(&dlm->master_lock); | 1996 | spin_lock_init(&dlm->master_lock); |
@@ -2056,6 +2052,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | |||
2056 | mlog(0, "context init: refcount %u\n", | 2052 | mlog(0, "context init: refcount %u\n", |
2057 | kref_read(&dlm->dlm_refs)); | 2053 | kref_read(&dlm->dlm_refs)); |
2058 | 2054 | ||
2055 | ret = 0; | ||
2059 | leave: | 2056 | leave: |
2060 | if (ret < 0 && dlm) { | 2057 | if (ret < 0 && dlm) { |
2061 | if (dlm->master_hash) | 2058 | if (dlm->master_hash) |
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index e78657742bd8..3883633e82eb 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c | |||
@@ -90,7 +90,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, | |||
90 | enum dlm_status status; | 90 | enum dlm_status status; |
91 | int actions = 0; | 91 | int actions = 0; |
92 | int in_use; | 92 | int in_use; |
93 | u8 owner; | 93 | u8 owner; |
94 | int recovery_wait = 0; | ||
94 | 95 | ||
95 | mlog(0, "master_node = %d, valblk = %d\n", master_node, | 96 | mlog(0, "master_node = %d, valblk = %d\n", master_node, |
96 | flags & LKM_VALBLK); | 97 | flags & LKM_VALBLK); |
@@ -193,9 +194,12 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, | |||
193 | } | 194 | } |
194 | if (flags & LKM_CANCEL) | 195 | if (flags & LKM_CANCEL) |
195 | lock->cancel_pending = 0; | 196 | lock->cancel_pending = 0; |
196 | else | 197 | else { |
197 | lock->unlock_pending = 0; | 198 | if (!lock->unlock_pending) |
198 | 199 | recovery_wait = 1; | |
200 | else | ||
201 | lock->unlock_pending = 0; | ||
202 | } | ||
199 | } | 203 | } |
200 | 204 | ||
201 | /* get an extra ref on lock. if we are just switching | 205 | /* get an extra ref on lock. if we are just switching |
@@ -229,6 +233,17 @@ leave: | |||
229 | spin_unlock(&res->spinlock); | 233 | spin_unlock(&res->spinlock); |
230 | wake_up(&res->wq); | 234 | wake_up(&res->wq); |
231 | 235 | ||
236 | if (recovery_wait) { | ||
237 | spin_lock(&res->spinlock); | ||
238 | /* The unlock request will succeed immediately after the owner dies, | ||
239 | * and the lock has already been removed from the grant list. We have to | ||
240 | * wait for recovery (RECOVERING) to finish or we miss the chance to | ||
241 | * purge it, since the removal is much faster than the recovery process. | ||
242 | */ | ||
243 | __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING); | ||
244 | spin_unlock(&res->spinlock); | ||
245 | } | ||
246 | |||
232 | /* let the caller's final dlm_lock_put handle the actual kfree */ | 247 | /* let the caller's final dlm_lock_put handle the actual kfree */ |
233 | if (actions & DLM_UNLOCK_FREE_LOCK) { | 248 | if (actions & DLM_UNLOCK_FREE_LOCK) { |
234 | /* this should always be coupled with list removal */ | 249 | /* this should always be coupled with list removal */ |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 14207234fa3d..6e774c5ea13b 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -2508,9 +2508,7 @@ bail: | |||
2508 | ocfs2_inode_unlock(inode, ex); | 2508 | ocfs2_inode_unlock(inode, ex); |
2509 | } | 2509 | } |
2510 | 2510 | ||
2511 | if (local_bh) | 2511 | brelse(local_bh); |
2512 | brelse(local_bh); | ||
2513 | |||
2514 | return status; | 2512 | return status; |
2515 | } | 2513 | } |
2516 | 2514 | ||
@@ -2593,8 +2591,7 @@ int ocfs2_inode_lock_atime(struct inode *inode, | |||
2593 | *level = 1; | 2591 | *level = 1; |
2594 | if (ocfs2_should_update_atime(inode, vfsmnt)) | 2592 | if (ocfs2_should_update_atime(inode, vfsmnt)) |
2595 | ocfs2_update_inode_atime(inode, bh); | 2593 | ocfs2_update_inode_atime(inode, bh); |
2596 | if (bh) | 2594 | brelse(bh); |
2597 | brelse(bh); | ||
2598 | } else | 2595 | } else |
2599 | *level = 0; | 2596 | *level = 0; |
2600 | 2597 | ||
@@ -3012,8 +3009,6 @@ struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) | |||
3012 | 3009 | ||
3013 | kref_init(&dlm_debug->d_refcnt); | 3010 | kref_init(&dlm_debug->d_refcnt); |
3014 | INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); | 3011 | INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); |
3015 | dlm_debug->d_locking_state = NULL; | ||
3016 | dlm_debug->d_locking_filter = NULL; | ||
3017 | dlm_debug->d_filter_secs = 0; | 3012 | dlm_debug->d_filter_secs = 0; |
3018 | out: | 3013 | out: |
3019 | return dlm_debug; | 3014 | return dlm_debug; |
@@ -3282,27 +3277,19 @@ static void ocfs2_dlm_init_debug(struct ocfs2_super *osb) | |||
3282 | { | 3277 | { |
3283 | struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; | 3278 | struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; |
3284 | 3279 | ||
3285 | dlm_debug->d_locking_state = debugfs_create_file("locking_state", | 3280 | debugfs_create_file("locking_state", S_IFREG|S_IRUSR, |
3286 | S_IFREG|S_IRUSR, | 3281 | osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops); |
3287 | osb->osb_debug_root, | ||
3288 | osb, | ||
3289 | &ocfs2_dlm_debug_fops); | ||
3290 | 3282 | ||
3291 | dlm_debug->d_locking_filter = debugfs_create_u32("locking_filter", | 3283 | debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root, |
3292 | 0600, | 3284 | &dlm_debug->d_filter_secs); |
3293 | osb->osb_debug_root, | ||
3294 | &dlm_debug->d_filter_secs); | ||
3295 | } | 3285 | } |
3296 | 3286 | ||
3297 | static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) | 3287 | static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) |
3298 | { | 3288 | { |
3299 | struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; | 3289 | struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; |
3300 | 3290 | ||
3301 | if (dlm_debug) { | 3291 | if (dlm_debug) |
3302 | debugfs_remove(dlm_debug->d_locking_state); | ||
3303 | debugfs_remove(dlm_debug->d_locking_filter); | ||
3304 | ocfs2_put_dlm_debug(dlm_debug); | 3292 | ocfs2_put_dlm_debug(dlm_debug); |
3305 | } | ||
3306 | } | 3293 | } |
3307 | 3294 | ||
3308 | int ocfs2_dlm_init(struct ocfs2_super *osb) | 3295 | int ocfs2_dlm_init(struct ocfs2_super *osb) |
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index e66a249fe07c..e3e2d1b2af51 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c | |||
@@ -590,8 +590,7 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, | |||
590 | *extent_flags = rec->e_flags; | 590 | *extent_flags = rec->e_flags; |
591 | } | 591 | } |
592 | out: | 592 | out: |
593 | if (eb_bh) | 593 | brelse(eb_bh); |
594 | brelse(eb_bh); | ||
595 | return ret; | 594 | return ret; |
596 | } | 595 | } |
597 | 596 | ||
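The dlmglue.c and extent_map.c hunks above are a small cleanup: brelse() already ignores a NULL buffer_head, so guarding the call is redundant. Sketch:

/* Before */
if (bh)
        brelse(bh);

/* After: brelse(NULL) is a no-op, much like kfree(NULL). */
brelse(bh);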
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4435df3e5adb..2e982db3e1ae 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -706,7 +706,9 @@ leave: | |||
706 | * Thus, we need to explicitly order the zeroed pages. | 706 | * Thus, we need to explicitly order the zeroed pages. |
707 | */ | 707 | */ |
708 | static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, | 708 | static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, |
709 | struct buffer_head *di_bh) | 709 | struct buffer_head *di_bh, |
710 | loff_t start_byte, | ||
711 | loff_t length) | ||
710 | { | 712 | { |
711 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 713 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
712 | handle_t *handle = NULL; | 714 | handle_t *handle = NULL; |
@@ -722,7 +724,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, | |||
722 | goto out; | 724 | goto out; |
723 | } | 725 | } |
724 | 726 | ||
725 | ret = ocfs2_jbd2_file_inode(handle, inode); | 727 | ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length); |
726 | if (ret < 0) { | 728 | if (ret < 0) { |
727 | mlog_errno(ret); | 729 | mlog_errno(ret); |
728 | goto out; | 730 | goto out; |
@@ -761,7 +763,9 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, | |||
761 | BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT)); | 763 | BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT)); |
762 | BUG_ON(abs_from & (inode->i_blkbits - 1)); | 764 | BUG_ON(abs_from & (inode->i_blkbits - 1)); |
763 | 765 | ||
764 | handle = ocfs2_zero_start_ordered_transaction(inode, di_bh); | 766 | handle = ocfs2_zero_start_ordered_transaction(inode, di_bh, |
767 | abs_from, | ||
768 | abs_to - abs_from); | ||
765 | if (IS_ERR(handle)) { | 769 | if (IS_ERR(handle)) { |
766 | ret = PTR_ERR(handle); | 770 | ret = PTR_ERR(handle); |
767 | goto out; | 771 | goto out; |
@@ -2126,7 +2130,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, | |||
2126 | struct dentry *dentry = file->f_path.dentry; | 2130 | struct dentry *dentry = file->f_path.dentry; |
2127 | struct inode *inode = d_inode(dentry); | 2131 | struct inode *inode = d_inode(dentry); |
2128 | struct buffer_head *di_bh = NULL; | 2132 | struct buffer_head *di_bh = NULL; |
2129 | loff_t end; | ||
2130 | 2133 | ||
2131 | /* | 2134 | /* |
2132 | * We start with a read level meta lock and only jump to an ex | 2135 | * We start with a read level meta lock and only jump to an ex |
@@ -2190,8 +2193,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file, | |||
2190 | } | 2193 | } |
2191 | } | 2194 | } |
2192 | 2195 | ||
2193 | end = pos + count; | ||
2194 | |||
2195 | ret = ocfs2_check_range_for_refcount(inode, pos, count); | 2196 | ret = ocfs2_check_range_for_refcount(inode, pos, count); |
2196 | if (ret == 1) { | 2197 | if (ret == 1) { |
2197 | ocfs2_inode_unlock(inode, meta_level); | 2198 | ocfs2_inode_unlock(inode, meta_level); |
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7ad9d6590818..7c9dfd50c1c1 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c | |||
@@ -534,7 +534,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, | |||
534 | */ | 534 | */ |
535 | mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != | 535 | mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != |
536 | !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), | 536 | !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), |
537 | "Inode %llu: system file state is ambigous\n", | 537 | "Inode %llu: system file state is ambiguous\n", |
538 | (unsigned long long)args->fi_blkno); | 538 | (unsigned long long)args->fi_blkno); |
539 | 539 | ||
540 | if (S_ISCHR(le16_to_cpu(fe->i_mode)) || | 540 | if (S_ISCHR(le16_to_cpu(fe->i_mode)) || |
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index c0fe6ed08ab1..3103ba7f97a2 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h | |||
@@ -144,7 +144,6 @@ static inline void ocfs2_ci_set_new(struct ocfs2_super *osb, | |||
144 | void ocfs2_orphan_scan_init(struct ocfs2_super *osb); | 144 | void ocfs2_orphan_scan_init(struct ocfs2_super *osb); |
145 | void ocfs2_orphan_scan_start(struct ocfs2_super *osb); | 145 | void ocfs2_orphan_scan_start(struct ocfs2_super *osb); |
146 | void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); | 146 | void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); |
147 | void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); | ||
148 | 147 | ||
149 | void ocfs2_complete_recovery(struct work_struct *work); | 148 | void ocfs2_complete_recovery(struct work_struct *work); |
150 | void ocfs2_wait_for_recovery(struct ocfs2_super *osb); | 149 | void ocfs2_wait_for_recovery(struct ocfs2_super *osb); |
@@ -232,8 +231,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) | |||
232 | * ocfs2_journal_access_*() unless you intend to | 231 | * ocfs2_journal_access_*() unless you intend to |
233 | * manage the checksum by hand. | 232 | * manage the checksum by hand. |
234 | * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. | 233 | * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. |
235 | * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before | 234 | * ocfs2_jbd2_inode_add_write - Mark an inode with range so that its data goes |
236 | * the current handle commits. | 235 | * out before the current handle commits. |
237 | */ | 236 | */ |
238 | 237 | ||
239 | /* You must always start_trans with a number of buffs > 0, but it's | 238 | /* You must always start_trans with a number of buffs > 0, but it's |
@@ -441,7 +440,7 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, | |||
441 | * previous dirblock update in the free list */ | 440 | * previous dirblock update in the free list */ |
442 | static inline int ocfs2_link_credits(struct super_block *sb) | 441 | static inline int ocfs2_link_credits(struct super_block *sb) |
443 | { | 442 | { |
444 | return 2*OCFS2_INODE_UPDATE_CREDITS + 4 + | 443 | return 2 * OCFS2_INODE_UPDATE_CREDITS + 4 + |
445 | ocfs2_quota_trans_credits(sb); | 444 | ocfs2_quota_trans_credits(sb); |
446 | } | 445 | } |
447 | 446 | ||
@@ -575,37 +574,12 @@ static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb) | |||
575 | return ocfs2_extent_recs_per_gd(sb); | 574 | return ocfs2_extent_recs_per_gd(sb); |
576 | } | 575 | } |
577 | 576 | ||
578 | static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, | 577 | static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode, |
579 | unsigned int clusters_to_del, | 578 | loff_t start_byte, loff_t length) |
580 | struct ocfs2_dinode *fe, | ||
581 | struct ocfs2_extent_list *last_el) | ||
582 | { | 579 | { |
583 | /* for dinode + all headers in this pass + update to next leaf */ | 580 | return jbd2_journal_inode_ranged_write(handle, |
584 | u16 next_free = le16_to_cpu(last_el->l_next_free_rec); | 581 | &OCFS2_I(inode)->ip_jinode, |
585 | u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); | 582 | start_byte, length); |
586 | int credits = 1 + tree_depth + 1; | ||
587 | int i; | ||
588 | |||
589 | i = next_free - 1; | ||
590 | BUG_ON(i < 0); | ||
591 | |||
592 | /* We may be deleting metadata blocks, so metadata alloc dinode + | ||
593 | one desc. block for each possible delete. */ | ||
594 | if (tree_depth && next_free == 1 && | ||
595 | ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del) | ||
596 | credits += 1 + tree_depth; | ||
597 | |||
598 | /* update to the truncate log. */ | ||
599 | credits += OCFS2_TRUNCATE_LOG_UPDATE; | ||
600 | |||
601 | credits += ocfs2_quota_trans_credits(sb); | ||
602 | |||
603 | return credits; | ||
604 | } | ||
605 | |||
606 | static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) | ||
607 | { | ||
608 | return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode); | ||
609 | } | 583 | } |
610 | 584 | ||
611 | static inline int ocfs2_begin_ordered_truncate(struct inode *inode, | 585 | static inline int ocfs2_begin_ordered_truncate(struct inode *inode, |
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 6f8e1c4fdb9c..8ea51cf27b97 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
@@ -2486,7 +2486,6 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, | |||
2486 | struct inode *inode = NULL; | 2486 | struct inode *inode = NULL; |
2487 | struct inode *orphan_dir = NULL; | 2487 | struct inode *orphan_dir = NULL; |
2488 | struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); | 2488 | struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); |
2489 | struct ocfs2_dinode *di = NULL; | ||
2490 | handle_t *handle = NULL; | 2489 | handle_t *handle = NULL; |
2491 | char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; | 2490 | char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; |
2492 | struct buffer_head *parent_di_bh = NULL; | 2491 | struct buffer_head *parent_di_bh = NULL; |
@@ -2552,7 +2551,6 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, | |||
2552 | goto leave; | 2551 | goto leave; |
2553 | } | 2552 | } |
2554 | 2553 | ||
2555 | di = (struct ocfs2_dinode *)new_di_bh->b_data; | ||
2556 | status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, | 2554 | status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, |
2557 | &orphan_insert, orphan_dir, false); | 2555 | &orphan_insert, orphan_dir, false); |
2558 | if (status < 0) { | 2556 | if (status < 0) { |
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index fddbbd60f434..9150cfa4df7d 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -223,8 +223,6 @@ struct ocfs2_orphan_scan { | |||
223 | 223 | ||
224 | struct ocfs2_dlm_debug { | 224 | struct ocfs2_dlm_debug { |
225 | struct kref d_refcnt; | 225 | struct kref d_refcnt; |
226 | struct dentry *d_locking_state; | ||
227 | struct dentry *d_locking_filter; | ||
228 | u32 d_filter_secs; | 226 | u32 d_filter_secs; |
229 | struct list_head d_lockres_tracking; | 227 | struct list_head d_lockres_tracking; |
230 | }; | 228 | }; |
@@ -401,7 +399,6 @@ struct ocfs2_super | |||
401 | struct ocfs2_dlm_debug *osb_dlm_debug; | 399 | struct ocfs2_dlm_debug *osb_dlm_debug; |
402 | 400 | ||
403 | struct dentry *osb_debug_root; | 401 | struct dentry *osb_debug_root; |
404 | struct dentry *osb_ctxt; | ||
405 | 402 | ||
406 | wait_queue_head_t recovery_event; | 403 | wait_queue_head_t recovery_event; |
407 | 404 | ||
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8b2f39506648..c81e86c62380 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -1080,10 +1080,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
1080 | osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, | 1080 | osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, |
1081 | ocfs2_debugfs_root); | 1081 | ocfs2_debugfs_root); |
1082 | 1082 | ||
1083 | osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, | 1083 | debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root, |
1084 | osb->osb_debug_root, | 1084 | osb, &ocfs2_osb_debug_fops); |
1085 | osb, | ||
1086 | &ocfs2_osb_debug_fops); | ||
1087 | 1085 | ||
1088 | if (ocfs2_meta_ecc(osb)) | 1086 | if (ocfs2_meta_ecc(osb)) |
1089 | ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, | 1087 | ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, |
@@ -1861,8 +1859,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | |||
1861 | 1859 | ||
1862 | kset_unregister(osb->osb_dev_kset); | 1860 | kset_unregister(osb->osb_dev_kset); |
1863 | 1861 | ||
1864 | debugfs_remove(osb->osb_ctxt); | ||
1865 | |||
1866 | /* Orphan scan should be stopped as early as possible */ | 1862 | /* Orphan scan should be stopped as early as possible */ |
1867 | ocfs2_orphan_scan_stop(osb); | 1863 | ocfs2_orphan_scan_stop(osb); |
1868 | 1864 | ||
@@ -1918,7 +1914,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | |||
1918 | ocfs2_dlm_shutdown(osb, hangup_needed); | 1914 | ocfs2_dlm_shutdown(osb, hangup_needed); |
1919 | 1915 | ||
1920 | ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); | 1916 | ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); |
1921 | debugfs_remove(osb->osb_debug_root); | 1917 | debugfs_remove_recursive(osb->osb_debug_root); |
1922 | 1918 | ||
1923 | if (hangup_needed) | 1919 | if (hangup_needed) |
1924 | ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); | 1920 | ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); |
diff --git a/fs/open.c b/fs/open.c --- a/fs/open.c +++ b/fs/open.c | |||
@@ -818,6 +818,14 @@ static int do_dentry_open(struct file *f, | |||
818 | if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) | 818 | if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) |
819 | return -EINVAL; | 819 | return -EINVAL; |
820 | } | 820 | } |
821 | |||
822 | /* | ||
823 | * XXX: Huge page cache doesn't support writing yet. Drop all page | ||
824 | * cache for this file before processing writes. | ||
825 | */ | ||
826 | if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping)) | ||
827 | truncate_pagecache(inode, 0); | ||
828 | |||
821 | return 0; | 829 | return 0; |
822 | 830 | ||
823 | cleanup_all: | 831 | cleanup_all: |
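This open() hook pairs with the nr_thps bookkeeping added to struct address_space further down: read-only THPs in the page cache cannot be written yet, so an open for write simply drops the file's page cache and lets it re-fault as small pages. A sketch of the check as used in the hunk (kernel context; f and inode as in do_dentry_open):

/* With CONFIG_READ_ONLY_THP_FOR_FS disabled, filemap_nr_thps() is a constant 0,
 * so the compiler can drop this whole block. */
if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping))
        truncate_pagecache(inode, 0);   /* drop cached THPs before allowing writes */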
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 465ea0153b2a..ac9247371871 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c | |||
@@ -8,7 +8,6 @@ | |||
8 | #include <linux/mmzone.h> | 8 | #include <linux/mmzone.h> |
9 | #include <linux/proc_fs.h> | 9 | #include <linux/proc_fs.h> |
10 | #include <linux/percpu.h> | 10 | #include <linux/percpu.h> |
11 | #include <linux/quicklist.h> | ||
12 | #include <linux/seq_file.h> | 11 | #include <linux/seq_file.h> |
13 | #include <linux/swap.h> | 12 | #include <linux/swap.h> |
14 | #include <linux/vmstat.h> | 13 | #include <linux/vmstat.h> |
@@ -106,9 +105,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
106 | global_zone_page_state(NR_KERNEL_STACK_KB)); | 105 | global_zone_page_state(NR_KERNEL_STACK_KB)); |
107 | show_val_kb(m, "PageTables: ", | 106 | show_val_kb(m, "PageTables: ", |
108 | global_zone_page_state(NR_PAGETABLE)); | 107 | global_zone_page_state(NR_PAGETABLE)); |
109 | #ifdef CONFIG_QUICKLIST | ||
110 | show_val_kb(m, "Quicklists: ", quicklist_total_size()); | ||
111 | #endif | ||
112 | 108 | ||
113 | show_val_kb(m, "NFS_Unstable: ", | 109 | show_val_kb(m, "NFS_Unstable: ", |
114 | global_node_page_state(NR_UNSTABLE_NFS)); | 110 | global_node_page_state(NR_UNSTABLE_NFS)); |
@@ -136,6 +132,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
136 | global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); | 132 | global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); |
137 | show_val_kb(m, "ShmemPmdMapped: ", | 133 | show_val_kb(m, "ShmemPmdMapped: ", |
138 | global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); | 134 | global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); |
135 | show_val_kb(m, "FileHugePages: ", | ||
136 | global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR); | ||
137 | show_val_kb(m, "FilePmdMapped: ", | ||
138 | global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR); | ||
139 | #endif | 139 | #endif |
140 | 140 | ||
141 | #ifdef CONFIG_CMA | 141 | #ifdef CONFIG_CMA |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bf43d1d60059..9442631fd4af 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -417,6 +417,7 @@ struct mem_size_stats { | |||
417 | unsigned long lazyfree; | 417 | unsigned long lazyfree; |
418 | unsigned long anonymous_thp; | 418 | unsigned long anonymous_thp; |
419 | unsigned long shmem_thp; | 419 | unsigned long shmem_thp; |
420 | unsigned long file_thp; | ||
420 | unsigned long swap; | 421 | unsigned long swap; |
421 | unsigned long shared_hugetlb; | 422 | unsigned long shared_hugetlb; |
422 | unsigned long private_hugetlb; | 423 | unsigned long private_hugetlb; |
@@ -461,7 +462,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, | |||
461 | static void smaps_account(struct mem_size_stats *mss, struct page *page, | 462 | static void smaps_account(struct mem_size_stats *mss, struct page *page, |
462 | bool compound, bool young, bool dirty, bool locked) | 463 | bool compound, bool young, bool dirty, bool locked) |
463 | { | 464 | { |
464 | int i, nr = compound ? 1 << compound_order(page) : 1; | 465 | int i, nr = compound ? compound_nr(page) : 1; |
465 | unsigned long size = nr * PAGE_SIZE; | 466 | unsigned long size = nr * PAGE_SIZE; |
466 | 467 | ||
467 | /* | 468 | /* |
@@ -588,7 +589,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, | |||
588 | else if (is_zone_device_page(page)) | 589 | else if (is_zone_device_page(page)) |
589 | /* pass */; | 590 | /* pass */; |
590 | else | 591 | else |
591 | VM_BUG_ON_PAGE(1, page); | 592 | mss->file_thp += HPAGE_PMD_SIZE; |
592 | smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); | 593 | smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); |
593 | } | 594 | } |
594 | #else | 595 | #else |
@@ -809,6 +810,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, | |||
809 | SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); | 810 | SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); |
810 | SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); | 811 | SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); |
811 | SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); | 812 | SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); |
813 | SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); | ||
812 | SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); | 814 | SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); |
813 | seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", | 815 | seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", |
814 | mss->private_hugetlb >> 10, 7); | 816 | mss->private_hugetlb >> 10, 7); |
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 8476175c07e7..6f8cc06ee44e 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h | |||
@@ -102,11 +102,6 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) | |||
102 | __free_page(pte_page); | 102 | __free_page(pte_page); |
103 | } | 103 | } |
104 | 104 | ||
105 | #else /* CONFIG_MMU */ | ||
106 | |||
107 | /* This is enough for a nommu architecture */ | ||
108 | #define check_pgt_cache() do { } while (0) | ||
109 | |||
110 | #endif /* CONFIG_MMU */ | 105 | #endif /* CONFIG_MMU */ |
111 | 106 | ||
112 | #endif /* __ASM_GENERIC_PGALLOC_H */ | 107 | #endif /* __ASM_GENERIC_PGALLOC_H */ |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 75d9d68a6de7..818691846c90 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -1002,9 +1002,8 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) | |||
1002 | * need this). If THP is not enabled, the pmd can't go away under the | 1002 | * need this). If THP is not enabled, the pmd can't go away under the |
1003 | * code even if MADV_DONTNEED runs, but if THP is enabled we need to | 1003 | * code even if MADV_DONTNEED runs, but if THP is enabled we need to |
1004 | * run a pmd_trans_unstable before walking the ptes after | 1004 | * run a pmd_trans_unstable before walking the ptes after |
1005 | * split_huge_page_pmd returns (because it may have run when the pmd | 1005 | * split_huge_pmd returns (because it may have run when the pmd became |
1006 | * become null, but then a page fault can map in a THP and not a | 1006 | * null, but then a page fault can map in a THP and not a regular page). |
1007 | * regular page). | ||
1008 | */ | 1007 | */ |
1009 | static inline int pmd_trans_unstable(pmd_t *pmd) | 1008 | static inline int pmd_trans_unstable(pmd_t *pmd) |
1010 | { | 1009 | { |
@@ -1126,7 +1125,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, | |||
1126 | static inline void init_espfix_bsp(void) { } | 1125 | static inline void init_espfix_bsp(void) { } |
1127 | #endif | 1126 | #endif |
1128 | 1127 | ||
1129 | extern void __init pgd_cache_init(void); | 1128 | extern void __init pgtable_cache_init(void); |
1130 | 1129 | ||
1131 | #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED | 1130 | #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED |
1132 | static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) | 1131 | static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) |
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 9569e7c786d3..4b898cdbdf05 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
@@ -129,11 +129,8 @@ static inline bool compaction_failed(enum compact_result result) | |||
129 | return false; | 129 | return false; |
130 | } | 130 | } |
131 | 131 | ||
132 | /* | 132 | /* Compaction needs reclaim to be performed first, so it can continue. */ |
133 | * Compaction has backed off for some reason. It might be throttling or | 133 | static inline bool compaction_needs_reclaim(enum compact_result result) |
134 | * lock contention. Retrying is still worthwhile. | ||
135 | */ | ||
136 | static inline bool compaction_withdrawn(enum compact_result result) | ||
137 | { | 134 | { |
138 | /* | 135 | /* |
139 | * Compaction backed off due to watermark checks for order-0 | 136 | * Compaction backed off due to watermark checks for order-0 |
@@ -142,6 +139,16 @@ static inline bool compaction_withdrawn(enum compact_result result) | |||
142 | if (result == COMPACT_SKIPPED) | 139 | if (result == COMPACT_SKIPPED) |
143 | return true; | 140 | return true; |
144 | 141 | ||
142 | return false; | ||
143 | } | ||
144 | |||
145 | /* | ||
146 | * Compaction has backed off for some reason after doing some work or none | ||
147 | * at all. It might be throttling or lock contention. Retrying might still be | ||
148 | * worthwhile, but with a higher priority if allowed. | ||
149 | */ | ||
150 | static inline bool compaction_withdrawn(enum compact_result result) | ||
151 | { | ||
145 | /* | 152 | /* |
146 | * If compaction is deferred for high-order allocations, it is | 153 | * If compaction is deferred for high-order allocations, it is |
147 | * because sync compaction recently failed. If this is the case | 154 | * because sync compaction recently failed. If this is the case |
@@ -207,6 +214,11 @@ static inline bool compaction_failed(enum compact_result result) | |||
207 | return false; | 214 | return false; |
208 | } | 215 | } |
209 | 216 | ||
217 | static inline bool compaction_needs_reclaim(enum compact_result result) | ||
218 | { | ||
219 | return false; | ||
220 | } | ||
221 | |||
210 | static inline bool compaction_withdrawn(enum compact_result result) | 222 | static inline bool compaction_withdrawn(enum compact_result result) |
211 | { | 223 | { |
212 | return true; | 224 | return true; |
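compaction_needs_reclaim() separates "compaction was skipped because there were not even enough order-0 pages to work with" from the general "compaction backed off" case, so callers can run reclaim first instead of blindly retrying. An illustrative sketch of how a caller might combine the two predicates (hypothetical helper, not the real page allocator code):

#include <linux/compaction.h>
#include <linux/types.h>

/* Hypothetical policy helper: decide what to do after one compaction attempt. */
static bool should_retry_compaction(enum compact_result result, bool *run_reclaim_first)
{
        if (compaction_needs_reclaim(result)) {
                /* Skipped for lack of order-0 pages: reclaim, then try again. */
                *run_reclaim_first = true;
                return true;
        }
        if (compaction_withdrawn(result)) {
                /* Backed off after some (or no) work, e.g. throttling or lock
                 * contention; a retry may help, ideally at a higher priority. */
                *run_reclaim_first = false;
                return true;
        }
        return false;
}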
diff --git a/include/linux/fs.h b/include/linux/fs.h index 866268c2c6e3..b0c6b0d34d02 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -429,6 +429,7 @@ int pagecache_write_end(struct file *, struct address_space *mapping, | |||
429 | * @i_pages: Cached pages. | 429 | * @i_pages: Cached pages. |
430 | * @gfp_mask: Memory allocation flags to use for allocating pages. | 430 | * @gfp_mask: Memory allocation flags to use for allocating pages. |
431 | * @i_mmap_writable: Number of VM_SHARED mappings. | 431 | * @i_mmap_writable: Number of VM_SHARED mappings. |
432 | * @nr_thps: Number of THPs in the pagecache (non-shmem only). | ||
432 | * @i_mmap: Tree of private and shared mappings. | 433 | * @i_mmap: Tree of private and shared mappings. |
433 | * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. | 434 | * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. |
434 | * @nrpages: Number of page entries, protected by the i_pages lock. | 435 | * @nrpages: Number of page entries, protected by the i_pages lock. |
@@ -446,6 +447,10 @@ struct address_space { | |||
446 | struct xarray i_pages; | 447 | struct xarray i_pages; |
447 | gfp_t gfp_mask; | 448 | gfp_t gfp_mask; |
448 | atomic_t i_mmap_writable; | 449 | atomic_t i_mmap_writable; |
450 | #ifdef CONFIG_READ_ONLY_THP_FOR_FS | ||
451 | /* number of thp, only for non-shmem files */ | ||
452 | atomic_t nr_thps; | ||
453 | #endif | ||
449 | struct rb_root_cached i_mmap; | 454 | struct rb_root_cached i_mmap; |
450 | struct rw_semaphore i_mmap_rwsem; | 455 | struct rw_semaphore i_mmap_rwsem; |
451 | unsigned long nrpages; | 456 | unsigned long nrpages; |
@@ -2798,6 +2803,33 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping) | |||
2798 | return errseq_sample(&mapping->wb_err); | 2803 | return errseq_sample(&mapping->wb_err); |
2799 | } | 2804 | } |
2800 | 2805 | ||
2806 | static inline int filemap_nr_thps(struct address_space *mapping) | ||
2807 | { | ||
2808 | #ifdef CONFIG_READ_ONLY_THP_FOR_FS | ||
2809 | return atomic_read(&mapping->nr_thps); | ||
2810 | #else | ||
2811 | return 0; | ||
2812 | #endif | ||
2813 | } | ||
2814 | |||
2815 | static inline void filemap_nr_thps_inc(struct address_space *mapping) | ||
2816 | { | ||
2817 | #ifdef CONFIG_READ_ONLY_THP_FOR_FS | ||
2818 | atomic_inc(&mapping->nr_thps); | ||
2819 | #else | ||
2820 | WARN_ON_ONCE(1); | ||
2821 | #endif | ||
2822 | } | ||
2823 | |||
2824 | static inline void filemap_nr_thps_dec(struct address_space *mapping) | ||
2825 | { | ||
2826 | #ifdef CONFIG_READ_ONLY_THP_FOR_FS | ||
2827 | atomic_dec(&mapping->nr_thps); | ||
2828 | #else | ||
2829 | WARN_ON_ONCE(1); | ||
2830 | #endif | ||
2831 | } | ||
2832 | |||
2801 | extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, | 2833 | extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, |
2802 | int datasync); | 2834 | int datasync); |
2803 | extern int vfs_fsync(struct file *file, int datasync); | 2835 | extern int vfs_fsync(struct file *file, int datasync); |
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 45ede62aa85b..61c9ffd89b05 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -267,6 +267,15 @@ static inline bool thp_migration_supported(void) | |||
267 | return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); | 267 | return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); |
268 | } | 268 | } |
269 | 269 | ||
270 | static inline struct list_head *page_deferred_list(struct page *page) | ||
271 | { | ||
272 | /* | ||
273 | * The global or per-memcg deferred list lives in the second tail | ||
274 | * page; the first word of each tail page is occupied by compound_head. | ||
275 | */ | ||
276 | return &page[2].deferred_list; | ||
277 | } | ||
278 | |||
270 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | 279 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ |
271 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) | 280 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) |
272 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) | 281 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) |
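page_deferred_list() is now visible to both the global and the per-memcg deferred split queues introduced later in this series. A short illustrative membership check; the real users hold the owning queue's split_queue_lock around list operations, which is assumed here:

	#include <linux/huge_mm.h>
	#include <linux/list.h>

	/* Illustrative only: @head must be the head page of a THP and the
	 * caller must hold the corresponding split_queue_lock. */
	static bool example_thp_queued_for_split(struct page *head)
	{
		return !list_empty(page_deferred_list(head));
	}
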
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index edfca4278319..53fc34f930d0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -454,7 +454,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, | |||
454 | static inline struct hstate *page_hstate(struct page *page) | 454 | static inline struct hstate *page_hstate(struct page *page) |
455 | { | 455 | { |
456 | VM_BUG_ON_PAGE(!PageHuge(page), page); | 456 | VM_BUG_ON_PAGE(!PageHuge(page), page); |
457 | return size_to_hstate(PAGE_SIZE << compound_order(page)); | 457 | return size_to_hstate(page_size(page)); |
458 | } | 458 | } |
459 | 459 | ||
460 | static inline unsigned hstate_index_to_shift(unsigned index) | 460 | static inline unsigned hstate_index_to_shift(unsigned index) |
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index df03825ad1a1..603fbc4e2f70 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h | |||
@@ -1410,8 +1410,6 @@ extern int jbd2_journal_clear_err (journal_t *); | |||
1410 | extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); | 1410 | extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); |
1411 | extern int jbd2_journal_force_commit(journal_t *); | 1411 | extern int jbd2_journal_force_commit(journal_t *); |
1412 | extern int jbd2_journal_force_commit_nested(journal_t *); | 1412 | extern int jbd2_journal_force_commit_nested(journal_t *); |
1413 | extern int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode); | ||
1414 | extern int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode); | ||
1415 | extern int jbd2_journal_inode_ranged_write(handle_t *handle, | 1413 | extern int jbd2_journal_inode_ranged_write(handle_t *handle, |
1416 | struct jbd2_inode *inode, loff_t start_byte, | 1414 | struct jbd2_inode *inode, loff_t start_byte, |
1417 | loff_t length); | 1415 | loff_t length); |
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 082d1d2a5216..bc45ea1efbf7 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h | |||
@@ -15,6 +15,14 @@ extern int __khugepaged_enter(struct mm_struct *mm); | |||
15 | extern void __khugepaged_exit(struct mm_struct *mm); | 15 | extern void __khugepaged_exit(struct mm_struct *mm); |
16 | extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, | 16 | extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, |
17 | unsigned long vm_flags); | 17 | unsigned long vm_flags); |
18 | #ifdef CONFIG_SHMEM | ||
19 | extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); | ||
20 | #else | ||
21 | static inline void collapse_pte_mapped_thp(struct mm_struct *mm, | ||
22 | unsigned long addr) | ||
23 | { | ||
24 | } | ||
25 | #endif | ||
18 | 26 | ||
19 | #define khugepaged_enabled() \ | 27 | #define khugepaged_enabled() \ |
20 | (transparent_hugepage_flags & \ | 28 | (transparent_hugepage_flags & \ |
@@ -73,6 +81,10 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma, | |||
73 | { | 81 | { |
74 | return 0; | 82 | return 0; |
75 | } | 83 | } |
84 | static inline void collapse_pte_mapped_thp(struct mm_struct *mm, | ||
85 | unsigned long addr) | ||
86 | { | ||
87 | } | ||
76 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 88 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
77 | 89 | ||
78 | #endif /* _LINUX_KHUGEPAGED_H */ | 90 | #endif /* _LINUX_KHUGEPAGED_H */ |
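collapse_pte_mapped_thp() now has no-op stubs for both !CONFIG_SHMEM and !CONFIG_TRANSPARENT_HUGEPAGE, so callers can invoke it unconditionally. A sketch of the calling pattern used by the uprobes change later in this patch (argument names are illustrative):

	#include <linux/khugepaged.h>

	/* Illustrative only: after restoring the original file-backed page,
	 * ask khugepaged to re-collapse the now PTE-mapped huge page. On
	 * configs without SHMEM/THP this compiles to nothing. */
	static void example_maybe_recollapse(struct mm_struct *mm,
					     unsigned long vaddr,
					     int err, bool orig_page_huge)
	{
		if (!err && orig_page_huge)
			collapse_pte_mapped_thp(mm, vaddr);
	}
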
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ad8f1a397ae4..9b60863429cc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -128,9 +128,8 @@ struct mem_cgroup_per_node { | |||
128 | 128 | ||
129 | struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; | 129 | struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; |
130 | 130 | ||
131 | #ifdef CONFIG_MEMCG_KMEM | ||
132 | struct memcg_shrinker_map __rcu *shrinker_map; | 131 | struct memcg_shrinker_map __rcu *shrinker_map; |
133 | #endif | 132 | |
134 | struct rb_node tree_node; /* RB tree node */ | 133 | struct rb_node tree_node; /* RB tree node */ |
135 | unsigned long usage_in_excess;/* Set to the value by which */ | 134 | unsigned long usage_in_excess;/* Set to the value by which */ |
136 | /* the soft limit is exceeded*/ | 135 | /* the soft limit is exceeded*/ |
@@ -331,6 +330,10 @@ struct mem_cgroup { | |||
331 | struct list_head event_list; | 330 | struct list_head event_list; |
332 | spinlock_t event_list_lock; | 331 | spinlock_t event_list_lock; |
333 | 332 | ||
333 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
334 | struct deferred_split deferred_split_queue; | ||
335 | #endif | ||
336 | |||
334 | struct mem_cgroup_per_node *nodeinfo[0]; | 337 | struct mem_cgroup_per_node *nodeinfo[0]; |
335 | /* WARNING: nodeinfo must be the last member here */ | 338 | /* WARNING: nodeinfo must be the last member here */ |
336 | }; | 339 | }; |
@@ -1311,6 +1314,11 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) | |||
1311 | } while ((memcg = parent_mem_cgroup(memcg))); | 1314 | } while ((memcg = parent_mem_cgroup(memcg))); |
1312 | return false; | 1315 | return false; |
1313 | } | 1316 | } |
1317 | |||
1318 | extern int memcg_expand_shrinker_maps(int new_id); | ||
1319 | |||
1320 | extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, | ||
1321 | int nid, int shrinker_id); | ||
1314 | #else | 1322 | #else |
1315 | #define mem_cgroup_sockets_enabled 0 | 1323 | #define mem_cgroup_sockets_enabled 0 |
1316 | static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; | 1324 | static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; |
@@ -1319,6 +1327,11 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) | |||
1319 | { | 1327 | { |
1320 | return false; | 1328 | return false; |
1321 | } | 1329 | } |
1330 | |||
1331 | static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, | ||
1332 | int nid, int shrinker_id) | ||
1333 | { | ||
1334 | } | ||
1322 | #endif | 1335 | #endif |
1323 | 1336 | ||
1324 | struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); | 1337 | struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); |
@@ -1390,10 +1403,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) | |||
1390 | return memcg ? memcg->kmemcg_id : -1; | 1403 | return memcg ? memcg->kmemcg_id : -1; |
1391 | } | 1404 | } |
1392 | 1405 | ||
1393 | extern int memcg_expand_shrinker_maps(int new_id); | ||
1394 | |||
1395 | extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, | ||
1396 | int nid, int shrinker_id); | ||
1397 | #else | 1406 | #else |
1398 | 1407 | ||
1399 | static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) | 1408 | static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) |
@@ -1435,8 +1444,6 @@ static inline void memcg_put_cache_ids(void) | |||
1435 | { | 1444 | { |
1436 | } | 1445 | } |
1437 | 1446 | ||
1438 | static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, | ||
1439 | int nid, int shrinker_id) { } | ||
1440 | #endif /* CONFIG_MEMCG_KMEM */ | 1447 | #endif /* CONFIG_MEMCG_KMEM */ |
1441 | 1448 | ||
1442 | #endif /* _LINUX_MEMCONTROL_H */ | 1449 | #endif /* _LINUX_MEMCONTROL_H */ |
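memcg_expand_shrinker_maps() and memcg_set_shrinker_bit() move from the CONFIG_MEMCG_KMEM section to plain CONFIG_MEMCG, matching the shrinker_map member above, so memcg-aware shrinkers no longer depend on kmem accounting. A sketch of the producer side, assuming CONFIG_MEMCG and a shrinker that was set up with prealloc_shrinker() and therefore has a valid ->id:

	#include <linux/memcontrol.h>
	#include <linux/shrinker.h>

	/* Illustrative only: after queueing a reclaimable object charged to
	 * @memcg on node @nid, flag the shrinker so that memcg reclaim on
	 * that node will call back into it. */
	static void example_mark_shrinker_pending(struct mem_cgroup *memcg,
						  int nid,
						  struct shrinker *shrinker)
	{
		if (memcg)
			memcg_set_shrinker_bit(memcg, nid, shrinker->id);
	}
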
diff --git a/include/linux/memory.h b/include/linux/memory.h index 02e633f3ede0..0ebb105eb261 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h | |||
@@ -25,7 +25,6 @@ | |||
25 | 25 | ||
26 | struct memory_block { | 26 | struct memory_block { |
27 | unsigned long start_section_nr; | 27 | unsigned long start_section_nr; |
28 | unsigned long end_section_nr; | ||
29 | unsigned long state; /* serialized by the dev->lock */ | 28 | unsigned long state; /* serialized by the dev->lock */ |
30 | int section_count; /* serialized by mem_sysfs_mutex */ | 29 | int section_count; /* serialized by mem_sysfs_mutex */ |
31 | int online_type; /* for passing data to online routine */ | 30 | int online_type; /* for passing data to online routine */ |
@@ -80,9 +79,9 @@ struct mem_section; | |||
80 | #define IPC_CALLBACK_PRI 10 | 79 | #define IPC_CALLBACK_PRI 10 |
81 | 80 | ||
82 | #ifndef CONFIG_MEMORY_HOTPLUG_SPARSE | 81 | #ifndef CONFIG_MEMORY_HOTPLUG_SPARSE |
83 | static inline int memory_dev_init(void) | 82 | static inline void memory_dev_init(void) |
84 | { | 83 | { |
85 | return 0; | 84 | return; |
86 | } | 85 | } |
87 | static inline int register_memory_notifier(struct notifier_block *nb) | 86 | static inline int register_memory_notifier(struct notifier_block *nb) |
88 | { | 87 | { |
@@ -113,7 +112,7 @@ extern int register_memory_isolate_notifier(struct notifier_block *nb); | |||
113 | extern void unregister_memory_isolate_notifier(struct notifier_block *nb); | 112 | extern void unregister_memory_isolate_notifier(struct notifier_block *nb); |
114 | int create_memory_block_devices(unsigned long start, unsigned long size); | 113 | int create_memory_block_devices(unsigned long start, unsigned long size); |
115 | void remove_memory_block_devices(unsigned long start, unsigned long size); | 114 | void remove_memory_block_devices(unsigned long start, unsigned long size); |
116 | extern int memory_dev_init(void); | 115 | extern void memory_dev_init(void); |
117 | extern int memory_notify(unsigned long val, void *v); | 116 | extern int memory_notify(unsigned long val, void *v); |
118 | extern int memory_isolate_notify(unsigned long val, void *v); | 117 | extern int memory_isolate_notify(unsigned long val, void *v); |
119 | extern struct memory_block *find_memory_block(struct mem_section *); | 118 | extern struct memory_block *find_memory_block(struct mem_section *); |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 7cf955feb823..294a67b94147 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -805,6 +805,24 @@ static inline void set_compound_order(struct page *page, unsigned int order) | |||
805 | page[1].compound_order = order; | 805 | page[1].compound_order = order; |
806 | } | 806 | } |
807 | 807 | ||
808 | /* Returns the number of pages in this potentially compound page. */ | ||
809 | static inline unsigned long compound_nr(struct page *page) | ||
810 | { | ||
811 | return 1UL << compound_order(page); | ||
812 | } | ||
813 | |||
814 | /* Returns the number of bytes in this potentially compound page. */ | ||
815 | static inline unsigned long page_size(struct page *page) | ||
816 | { | ||
817 | return PAGE_SIZE << compound_order(page); | ||
818 | } | ||
819 | |||
820 | /* Returns the number of bits needed for the number of bytes in a page */ | ||
821 | static inline unsigned int page_shift(struct page *page) | ||
822 | { | ||
823 | return PAGE_SHIFT + compound_order(page); | ||
824 | } | ||
825 | |||
808 | void free_compound_page(struct page *page); | 826 | void free_compound_page(struct page *page); |
809 | 827 | ||
810 | #ifdef CONFIG_MMU | 828 | #ifdef CONFIG_MMU |
@@ -1057,8 +1075,9 @@ static inline void put_user_page(struct page *page) | |||
1057 | put_page(page); | 1075 | put_page(page); |
1058 | } | 1076 | } |
1059 | 1077 | ||
1060 | void put_user_pages_dirty(struct page **pages, unsigned long npages); | 1078 | void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, |
1061 | void put_user_pages_dirty_lock(struct page **pages, unsigned long npages); | 1079 | bool make_dirty); |
1080 | |||
1062 | void put_user_pages(struct page **pages, unsigned long npages); | 1081 | void put_user_pages(struct page **pages, unsigned long npages); |
1063 | 1082 | ||
1064 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | 1083 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) |
@@ -1405,7 +1424,11 @@ extern void pagefault_out_of_memory(void); | |||
1405 | 1424 | ||
1406 | extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); | 1425 | extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); |
1407 | 1426 | ||
1427 | #ifdef CONFIG_MMU | ||
1408 | extern bool can_do_mlock(void); | 1428 | extern bool can_do_mlock(void); |
1429 | #else | ||
1430 | static inline bool can_do_mlock(void) { return false; } | ||
1431 | #endif | ||
1409 | extern int user_shm_lock(size_t, struct user_struct *); | 1432 | extern int user_shm_lock(size_t, struct user_struct *); |
1410 | extern void user_shm_unlock(size_t, struct user_struct *); | 1433 | extern void user_shm_unlock(size_t, struct user_struct *); |
1411 | 1434 | ||
@@ -2305,6 +2328,8 @@ extern int install_special_mapping(struct mm_struct *mm, | |||
2305 | unsigned long addr, unsigned long len, | 2328 | unsigned long addr, unsigned long len, |
2306 | unsigned long flags, struct page **pages); | 2329 | unsigned long flags, struct page **pages); |
2307 | 2330 | ||
2331 | unsigned long randomize_stack_top(unsigned long stack_top); | ||
2332 | |||
2308 | extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); | 2333 | extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); |
2309 | 2334 | ||
2310 | extern unsigned long mmap_region(struct file *file, unsigned long addr, | 2335 | extern unsigned long mmap_region(struct file *file, unsigned long addr, |
@@ -2568,6 +2593,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
2568 | #define FOLL_COW 0x4000 /* internal GUP flag */ | 2593 | #define FOLL_COW 0x4000 /* internal GUP flag */ |
2569 | #define FOLL_ANON 0x8000 /* don't do file mappings */ | 2594 | #define FOLL_ANON 0x8000 /* don't do file mappings */ |
2570 | #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ | 2595 | #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ |
2596 | #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ | ||
2571 | 2597 | ||
2572 | /* | 2598 | /* |
2573 | * NOTE on FOLL_LONGTERM: | 2599 | * NOTE on FOLL_LONGTERM: |
@@ -2845,5 +2871,12 @@ void __init setup_nr_node_ids(void); | |||
2845 | static inline void setup_nr_node_ids(void) {} | 2871 | static inline void setup_nr_node_ids(void) {} |
2846 | #endif | 2872 | #endif |
2847 | 2873 | ||
2874 | extern int memcmp_pages(struct page *page1, struct page *page2); | ||
2875 | |||
2876 | static inline int pages_identical(struct page *page1, struct page *page2) | ||
2877 | { | ||
2878 | return !memcmp_pages(page1, page2); | ||
2879 | } | ||
2880 | |||
2848 | #endif /* __KERNEL__ */ | 2881 | #endif /* __KERNEL__ */ |
2849 | #endif /* _LINUX_MM_H */ | 2882 | #endif /* _LINUX_MM_H */ |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0b739f360cec..5183e0d77dfa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -138,6 +138,7 @@ struct page { | |||
138 | struct { /* Second tail page of compound page */ | 138 | struct { /* Second tail page of compound page */ |
139 | unsigned long _compound_pad_1; /* compound_head */ | 139 | unsigned long _compound_pad_1; /* compound_head */ |
140 | unsigned long _compound_pad_2; | 140 | unsigned long _compound_pad_2; |
141 | /* For both global and memcg */ | ||
141 | struct list_head deferred_list; | 142 | struct list_head deferred_list; |
142 | }; | 143 | }; |
143 | struct { /* Page table pages */ | 144 | struct { /* Page table pages */ |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3f38c30d2f13..bda20282746b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -235,6 +235,8 @@ enum node_stat_item { | |||
235 | NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ | 235 | NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ |
236 | NR_SHMEM_THPS, | 236 | NR_SHMEM_THPS, |
237 | NR_SHMEM_PMDMAPPED, | 237 | NR_SHMEM_PMDMAPPED, |
238 | NR_FILE_THPS, | ||
239 | NR_FILE_PMDMAPPED, | ||
238 | NR_ANON_THPS, | 240 | NR_ANON_THPS, |
239 | NR_UNSTABLE_NFS, /* NFS unstable pages */ | 241 | NR_UNSTABLE_NFS, /* NFS unstable pages */ |
240 | NR_VMSCAN_WRITE, | 242 | NR_VMSCAN_WRITE, |
@@ -677,6 +679,14 @@ struct zonelist { | |||
677 | extern struct page *mem_map; | 679 | extern struct page *mem_map; |
678 | #endif | 680 | #endif |
679 | 681 | ||
682 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
683 | struct deferred_split { | ||
684 | spinlock_t split_queue_lock; | ||
685 | struct list_head split_queue; | ||
686 | unsigned long split_queue_len; | ||
687 | }; | ||
688 | #endif | ||
689 | |||
680 | /* | 690 | /* |
681 | * On NUMA machines, each NUMA node would have a pg_data_t to describe | 691 | * On NUMA machines, each NUMA node would have a pg_data_t to describe |
682 | * it's memory layout. On UMA machines there is a single pglist_data which | 692 | * it's memory layout. On UMA machines there is a single pglist_data which |
@@ -756,9 +766,7 @@ typedef struct pglist_data { | |||
756 | #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ | 766 | #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ |
757 | 767 | ||
758 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 768 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
759 | spinlock_t split_queue_lock; | 769 | struct deferred_split deferred_split_queue; |
760 | struct list_head split_queue; | ||
761 | unsigned long split_queue_len; | ||
762 | #endif | 770 | #endif |
763 | 771 | ||
764 | /* Fields commonly accessed by the page reclaim scanner */ | 772 | /* Fields commonly accessed by the page reclaim scanner */ |
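The split queue fields move out of pglist_data into the named struct deferred_split, which the memcontrol.h hunk above also embeds in struct mem_cgroup. Queue selection then becomes a choice between the page's memcg and its node; a rough sketch, assuming CONFIG_TRANSPARENT_HUGEPAGE and using illustrative names (the real helper lives in mm/huge_memory.c):

	#include <linux/memcontrol.h>
	#include <linux/mm.h>
	#include <linux/mmzone.h>

	/* Illustrative only: prefer the per-memcg queue when the compound
	 * page is charged to a memcg, else fall back to the node queue. */
	static struct deferred_split *example_split_queue(struct page *page)
	{
	#ifdef CONFIG_MEMCG
		struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;

		if (memcg)
			return &memcg->deferred_split_queue;
	#endif
		return &page_pgdat(page)->deferred_split_queue;
	}
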
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 09592951725c..682fd465df06 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h | |||
@@ -18,6 +18,7 @@ struct page_ext_operations { | |||
18 | 18 | ||
19 | enum page_ext_flags { | 19 | enum page_ext_flags { |
20 | PAGE_EXT_OWNER, | 20 | PAGE_EXT_OWNER, |
21 | PAGE_EXT_OWNER_ACTIVE, | ||
21 | #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) | 22 | #if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) |
22 | PAGE_EXT_YOUNG, | 23 | PAGE_EXT_YOUNG, |
23 | PAGE_EXT_IDLE, | 24 | PAGE_EXT_IDLE, |
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c7552459a15f..37a4d9e32cd3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h | |||
@@ -333,6 +333,16 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, | |||
333 | mapping_gfp_mask(mapping)); | 333 | mapping_gfp_mask(mapping)); |
334 | } | 334 | } |
335 | 335 | ||
336 | static inline struct page *find_subpage(struct page *page, pgoff_t offset) | ||
337 | { | ||
338 | if (PageHuge(page)) | ||
339 | return page; | ||
340 | |||
341 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
342 | |||
343 | return page + (offset & (compound_nr(page) - 1)); | ||
344 | } | ||
345 | |||
336 | struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); | 346 | struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); |
337 | struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); | 347 | struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); |
338 | unsigned find_get_entries(struct address_space *mapping, pgoff_t start, | 348 | unsigned find_get_entries(struct address_space *mapping, pgoff_t start, |
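find_subpage() centralises the "head page plus masked offset" arithmetic that page-cache lookups need for compound pages, reusing compound_nr() from the mm.h hunk above; hugetlbfs pages are deliberately returned whole. For a 2 MB x86-64 THP caching file indices 512-1023, the mapping works as in this illustrative check (assumes @head is such a THP, not a hugetlb page):

	#include <linux/pagemap.h>

	static void example_find_subpage(struct page *head)
	{
		/* 515 & (compound_nr(head) - 1) == 515 & 511 == 3, so the
		 * result is the fourth constituent 4 KB page. */
		WARN_ON(find_subpage(head, 515) != head + 3);
	}
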
diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h deleted file mode 100644 index 034982c98c8b..000000000000 --- a/include/linux/quicklist.h +++ /dev/null | |||
@@ -1,94 +0,0 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | #ifndef LINUX_QUICKLIST_H | ||
3 | #define LINUX_QUICKLIST_H | ||
4 | /* | ||
5 | * Fast allocations and disposal of pages. Pages must be in the condition | ||
6 | * as needed after allocation when they are freed. Per cpu lists of pages | ||
7 | * are kept that only contain node local pages. | ||
8 | * | ||
9 | * (C) 2007, SGI. Christoph Lameter <cl@linux.com> | ||
10 | */ | ||
11 | #include <linux/kernel.h> | ||
12 | #include <linux/gfp.h> | ||
13 | #include <linux/percpu.h> | ||
14 | |||
15 | #ifdef CONFIG_QUICKLIST | ||
16 | |||
17 | struct quicklist { | ||
18 | void *page; | ||
19 | int nr_pages; | ||
20 | }; | ||
21 | |||
22 | DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; | ||
23 | |||
24 | /* | ||
25 | * The two key functions quicklist_alloc and quicklist_free are inline so | ||
26 | * that they may be custom compiled for the platform. | ||
27 | * Specifying a NULL ctor can remove constructor support. Specifying | ||
28 | * a constant quicklist allows the determination of the exact address | ||
29 | * in the per cpu area. | ||
30 | * | ||
31 | * The fast patch in quicklist_alloc touched only a per cpu cacheline and | ||
32 | * the first cacheline of the page itself. There is minmal overhead involved. | ||
33 | */ | ||
34 | static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *)) | ||
35 | { | ||
36 | struct quicklist *q; | ||
37 | void **p = NULL; | ||
38 | |||
39 | q =&get_cpu_var(quicklist)[nr]; | ||
40 | p = q->page; | ||
41 | if (likely(p)) { | ||
42 | q->page = p[0]; | ||
43 | p[0] = NULL; | ||
44 | q->nr_pages--; | ||
45 | } | ||
46 | put_cpu_var(quicklist); | ||
47 | if (likely(p)) | ||
48 | return p; | ||
49 | |||
50 | p = (void *)__get_free_page(flags | __GFP_ZERO); | ||
51 | if (ctor && p) | ||
52 | ctor(p); | ||
53 | return p; | ||
54 | } | ||
55 | |||
56 | static inline void __quicklist_free(int nr, void (*dtor)(void *), void *p, | ||
57 | struct page *page) | ||
58 | { | ||
59 | struct quicklist *q; | ||
60 | |||
61 | q = &get_cpu_var(quicklist)[nr]; | ||
62 | *(void **)p = q->page; | ||
63 | q->page = p; | ||
64 | q->nr_pages++; | ||
65 | put_cpu_var(quicklist); | ||
66 | } | ||
67 | |||
68 | static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp) | ||
69 | { | ||
70 | __quicklist_free(nr, dtor, pp, virt_to_page(pp)); | ||
71 | } | ||
72 | |||
73 | static inline void quicklist_free_page(int nr, void (*dtor)(void *), | ||
74 | struct page *page) | ||
75 | { | ||
76 | __quicklist_free(nr, dtor, page_address(page), page); | ||
77 | } | ||
78 | |||
79 | void quicklist_trim(int nr, void (*dtor)(void *), | ||
80 | unsigned long min_pages, unsigned long max_free); | ||
81 | |||
82 | unsigned long quicklist_total_size(void); | ||
83 | |||
84 | #else | ||
85 | |||
86 | static inline unsigned long quicklist_total_size(void) | ||
87 | { | ||
88 | return 0; | ||
89 | } | ||
90 | |||
91 | #endif | ||
92 | |||
93 | #endif /* LINUX_QUICKLIST_H */ | ||
94 | |||
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 9443cafd1969..0f80123650e2 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h | |||
@@ -69,7 +69,7 @@ struct shrinker { | |||
69 | 69 | ||
70 | /* These are for internal use */ | 70 | /* These are for internal use */ |
71 | struct list_head list; | 71 | struct list_head list; |
72 | #ifdef CONFIG_MEMCG_KMEM | 72 | #ifdef CONFIG_MEMCG |
73 | /* ID in shrinker_idr */ | 73 | /* ID in shrinker_idr */ |
74 | int id; | 74 | int id; |
75 | #endif | 75 | #endif |
@@ -81,6 +81,11 @@ struct shrinker { | |||
81 | /* Flags */ | 81 | /* Flags */ |
82 | #define SHRINKER_NUMA_AWARE (1 << 0) | 82 | #define SHRINKER_NUMA_AWARE (1 << 0) |
83 | #define SHRINKER_MEMCG_AWARE (1 << 1) | 83 | #define SHRINKER_MEMCG_AWARE (1 << 1) |
84 | /* | ||
85 | * This flag is only meaningful for MEMCG_AWARE shrinkers for now; | ||
86 | * a non-MEMCG_AWARE shrinker should not have this flag set. | ||
87 | */ | ||
88 | #define SHRINKER_NONSLAB (1 << 2) | ||
84 | 89 | ||
85 | extern int prealloc_shrinker(struct shrinker *shrinker); | 90 | extern int prealloc_shrinker(struct shrinker *shrinker); |
86 | extern void register_shrinker_prepared(struct shrinker *shrinker); | 91 | extern void register_shrinker_prepared(struct shrinker *shrinker); |
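The new SHRINKER_NONSLAB flag lets a memcg-aware shrinker be invoked for cgroups that have no kmem/slab accounting online; the deferred-split THP shrinker added elsewhere in this series is the first user and declares exactly this flag combination. An illustrative declaration with placeholder callbacks, registered as usual with register_shrinker(&example_shrinker):

	#include <linux/shrinker.h>

	static unsigned long example_count(struct shrinker *s,
					   struct shrink_control *sc)
	{
		return 0;		/* placeholder: objects available */
	}

	static unsigned long example_scan(struct shrinker *s,
					  struct shrink_control *sc)
	{
		return SHRINK_STOP;	/* placeholder: objects freed */
	}

	static struct shrinker example_shrinker = {
		.count_objects	= example_count,
		.scan_objects	= example_scan,
		.seeks		= DEFAULT_SEEKS,
		.flags		= SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
				  SHRINKER_NONSLAB,
	};
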
diff --git a/include/linux/slab.h b/include/linux/slab.h index 56c9c7eed34e..ab2b98ad76e1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
@@ -595,68 +595,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) | |||
595 | return __kmalloc_node(size, flags, node); | 595 | return __kmalloc_node(size, flags, node); |
596 | } | 596 | } |
597 | 597 | ||
598 | struct memcg_cache_array { | ||
599 | struct rcu_head rcu; | ||
600 | struct kmem_cache *entries[0]; | ||
601 | }; | ||
602 | |||
603 | /* | ||
604 | * This is the main placeholder for memcg-related information in kmem caches. | ||
605 | * Both the root cache and the child caches will have it. For the root cache, | ||
606 | * this will hold a dynamically allocated array large enough to hold | ||
607 | * information about the currently limited memcgs in the system. To allow the | ||
608 | * array to be accessed without taking any locks, on relocation we free the old | ||
609 | * version only after a grace period. | ||
610 | * | ||
611 | * Root and child caches hold different metadata. | ||
612 | * | ||
613 | * @root_cache: Common to root and child caches. NULL for root, pointer to | ||
614 | * the root cache for children. | ||
615 | * | ||
616 | * The following fields are specific to root caches. | ||
617 | * | ||
618 | * @memcg_caches: kmemcg ID indexed table of child caches. This table is | ||
619 | * used to index child cachces during allocation and cleared | ||
620 | * early during shutdown. | ||
621 | * | ||
622 | * @root_caches_node: List node for slab_root_caches list. | ||
623 | * | ||
624 | * @children: List of all child caches. While the child caches are also | ||
625 | * reachable through @memcg_caches, a child cache remains on | ||
626 | * this list until it is actually destroyed. | ||
627 | * | ||
628 | * The following fields are specific to child caches. | ||
629 | * | ||
630 | * @memcg: Pointer to the memcg this cache belongs to. | ||
631 | * | ||
632 | * @children_node: List node for @root_cache->children list. | ||
633 | * | ||
634 | * @kmem_caches_node: List node for @memcg->kmem_caches list. | ||
635 | */ | ||
636 | struct memcg_cache_params { | ||
637 | struct kmem_cache *root_cache; | ||
638 | union { | ||
639 | struct { | ||
640 | struct memcg_cache_array __rcu *memcg_caches; | ||
641 | struct list_head __root_caches_node; | ||
642 | struct list_head children; | ||
643 | bool dying; | ||
644 | }; | ||
645 | struct { | ||
646 | struct mem_cgroup *memcg; | ||
647 | struct list_head children_node; | ||
648 | struct list_head kmem_caches_node; | ||
649 | struct percpu_ref refcnt; | ||
650 | |||
651 | void (*work_fn)(struct kmem_cache *); | ||
652 | union { | ||
653 | struct rcu_head rcu_head; | ||
654 | struct work_struct work; | ||
655 | }; | ||
656 | }; | ||
657 | }; | ||
658 | }; | ||
659 | |||
660 | int memcg_update_all_caches(int num_memcgs); | 598 | int memcg_update_all_caches(int num_memcgs); |
661 | 599 | ||
662 | /** | 600 | /** |
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index dfa718ffdd4f..4e7809408073 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h | |||
@@ -53,15 +53,21 @@ struct vmap_area { | |||
53 | unsigned long va_start; | 53 | unsigned long va_start; |
54 | unsigned long va_end; | 54 | unsigned long va_end; |
55 | 55 | ||
56 | /* | ||
57 | * Largest available free size in subtree. | ||
58 | */ | ||
59 | unsigned long subtree_max_size; | ||
60 | unsigned long flags; | ||
61 | struct rb_node rb_node; /* address sorted rbtree */ | 56 | struct rb_node rb_node; /* address sorted rbtree */ |
62 | struct list_head list; /* address sorted list */ | 57 | struct list_head list; /* address sorted list */ |
63 | struct llist_node purge_list; /* "lazy purge" list */ | 58 | |
64 | struct vm_struct *vm; | 59 | /* |
60 | * The following three variables can be packed, because | ||
61 | * a vmap_area object is always one of the three states: | ||
62 | * 1) in "free" tree (root is free_vmap_area_root) | ||
63 | * 2) in "busy" tree (root is vmap_area_root) | ||
64 | * 3) in purge list (head is vmap_purge_list) | ||
65 | */ | ||
66 | union { | ||
67 | unsigned long subtree_max_size; /* in "free" tree */ | ||
68 | struct vm_struct *vm; /* in "busy" tree */ | ||
69 | struct llist_node purge_list; /* in purge list */ | ||
70 | }; | ||
65 | }; | 71 | }; |
66 | 72 | ||
67 | /* | 73 | /* |
diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 7238865e75b0..51bf43076165 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h | |||
@@ -46,6 +46,8 @@ const char *zpool_get_type(struct zpool *pool); | |||
46 | 46 | ||
47 | void zpool_destroy_pool(struct zpool *pool); | 47 | void zpool_destroy_pool(struct zpool *pool); |
48 | 48 | ||
49 | bool zpool_malloc_support_movable(struct zpool *pool); | ||
50 | |||
49 | int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, | 51 | int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, |
50 | unsigned long *handle); | 52 | unsigned long *handle); |
51 | 53 | ||
@@ -90,6 +92,7 @@ struct zpool_driver { | |||
90 | struct zpool *zpool); | 92 | struct zpool *zpool); |
91 | void (*destroy)(void *pool); | 93 | void (*destroy)(void *pool); |
92 | 94 | ||
95 | bool malloc_support_movable; | ||
93 | int (*malloc)(void *pool, size_t size, gfp_t gfp, | 96 | int (*malloc)(void *pool, size_t size, gfp_t gfp, |
94 | unsigned long *handle); | 97 | unsigned long *handle); |
95 | void (*free)(void *pool, unsigned long handle); | 98 | void (*free)(void *pool, unsigned long handle); |
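zpool_malloc_support_movable() exposes the new malloc_support_movable driver capability to zpool users, letting them widen the gfp mask with __GFP_HIGHMEM | __GFP_MOVABLE only when the backend can honour it; this mirrors what the zswap patch in this series does. A hedged sketch:

	#include <linux/gfp.h>
	#include <linux/zpool.h>

	/* Illustrative only: base gfp flags follow zswap's choice. */
	static int example_zpool_alloc(struct zpool *pool, size_t len,
				       unsigned long *handle)
	{
		gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KERNEL_ACCOUNT;

		if (zpool_malloc_support_movable(pool))
			gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;

		return zpool_malloc(pool, len, gfp, handle);
	}
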
diff --git a/init/main.c b/init/main.c index 653693da8da6..208b8fa1808e 100644 --- a/init/main.c +++ b/init/main.c | |||
@@ -507,7 +507,7 @@ void __init __weak mem_encrypt_init(void) { } | |||
507 | 507 | ||
508 | void __init __weak poking_init(void) { } | 508 | void __init __weak poking_init(void) { } |
509 | 509 | ||
510 | void __init __weak pgd_cache_init(void) { } | 510 | void __init __weak pgtable_cache_init(void) { } |
511 | 511 | ||
512 | bool initcall_debug; | 512 | bool initcall_debug; |
513 | core_param(initcall_debug, initcall_debug, bool, 0644); | 513 | core_param(initcall_debug, initcall_debug, bool, 0644); |
@@ -556,6 +556,7 @@ static void __init mm_init(void) | |||
556 | report_meminit(); | 556 | report_meminit(); |
557 | mem_init(); | 557 | mem_init(); |
558 | kmem_cache_init(); | 558 | kmem_cache_init(); |
559 | kmemleak_init(); | ||
559 | pgtable_init(); | 560 | pgtable_init(); |
560 | debug_objects_mem_init(); | 561 | debug_objects_mem_init(); |
561 | vmalloc_init(); | 562 | vmalloc_init(); |
@@ -564,7 +565,6 @@ static void __init mm_init(void) | |||
564 | init_espfix_bsp(); | 565 | init_espfix_bsp(); |
565 | /* Should be run after espfix64 is set up. */ | 566 | /* Should be run after espfix64 is set up. */ |
566 | pti_init(); | 567 | pti_init(); |
567 | pgd_cache_init(); | ||
568 | } | 568 | } |
569 | 569 | ||
570 | void __init __weak arch_call_rest_init(void) | 570 | void __init __weak arch_call_rest_init(void) |
@@ -594,7 +594,6 @@ asmlinkage __visible void __init start_kernel(void) | |||
594 | page_address_init(); | 594 | page_address_init(); |
595 | pr_notice("%s", linux_banner); | 595 | pr_notice("%s", linux_banner); |
596 | setup_arch(&command_line); | 596 | setup_arch(&command_line); |
597 | mm_init_cpumask(&init_mm); | ||
598 | setup_command_line(command_line); | 597 | setup_command_line(command_line); |
599 | setup_nr_cpu_ids(); | 598 | setup_nr_cpu_ids(); |
600 | setup_per_cpu_areas(); | 599 | setup_per_cpu_areas(); |
@@ -740,7 +739,6 @@ asmlinkage __visible void __init start_kernel(void) | |||
740 | initrd_start = 0; | 739 | initrd_start = 0; |
741 | } | 740 | } |
742 | #endif | 741 | #endif |
743 | kmemleak_init(); | ||
744 | setup_per_cpu_pageset(); | 742 | setup_per_cpu_pageset(); |
745 | numa_policy_init(); | 743 | numa_policy_init(); |
746 | acpi_early_init(); | 744 | acpi_early_init(); |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 84fa00497c49..94d38a39d72e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/percpu-rwsem.h> | 26 | #include <linux/percpu-rwsem.h> |
27 | #include <linux/task_work.h> | 27 | #include <linux/task_work.h> |
28 | #include <linux/shmem_fs.h> | 28 | #include <linux/shmem_fs.h> |
29 | #include <linux/khugepaged.h> | ||
29 | 30 | ||
30 | #include <linux/uprobes.h> | 31 | #include <linux/uprobes.h> |
31 | 32 | ||
@@ -143,17 +144,19 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) | |||
143 | * | 144 | * |
144 | * @vma: vma that holds the pte pointing to page | 145 | * @vma: vma that holds the pte pointing to page |
145 | * @addr: address the old @page is mapped at | 146 | * @addr: address the old @page is mapped at |
146 | * @page: the cowed page we are replacing by kpage | 147 | * @old_page: the page we are replacing by new_page |
147 | * @kpage: the modified page we replace page by | 148 | * @new_page: the modified page we replace page by |
148 | * | 149 | * |
149 | * Returns 0 on success, -EFAULT on failure. | 150 | * If @new_page is NULL, only unmap @old_page. |
151 | * | ||
152 | * Returns 0 on success, negative error code otherwise. | ||
150 | */ | 153 | */ |
151 | static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | 154 | static int __replace_page(struct vm_area_struct *vma, unsigned long addr, |
152 | struct page *old_page, struct page *new_page) | 155 | struct page *old_page, struct page *new_page) |
153 | { | 156 | { |
154 | struct mm_struct *mm = vma->vm_mm; | 157 | struct mm_struct *mm = vma->vm_mm; |
155 | struct page_vma_mapped_walk pvmw = { | 158 | struct page_vma_mapped_walk pvmw = { |
156 | .page = old_page, | 159 | .page = compound_head(old_page), |
157 | .vma = vma, | 160 | .vma = vma, |
158 | .address = addr, | 161 | .address = addr, |
159 | }; | 162 | }; |
@@ -164,12 +167,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
164 | mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, | 167 | mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, |
165 | addr + PAGE_SIZE); | 168 | addr + PAGE_SIZE); |
166 | 169 | ||
167 | VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); | 170 | if (new_page) { |
168 | 171 | err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, | |
169 | err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, | 172 | &memcg, false); |
170 | false); | 173 | if (err) |
171 | if (err) | 174 | return err; |
172 | return err; | 175 | } |
173 | 176 | ||
174 | /* For try_to_free_swap() and munlock_vma_page() below */ | 177 | /* For try_to_free_swap() and munlock_vma_page() below */ |
175 | lock_page(old_page); | 178 | lock_page(old_page); |
@@ -177,15 +180,20 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
177 | mmu_notifier_invalidate_range_start(&range); | 180 | mmu_notifier_invalidate_range_start(&range); |
178 | err = -EAGAIN; | 181 | err = -EAGAIN; |
179 | if (!page_vma_mapped_walk(&pvmw)) { | 182 | if (!page_vma_mapped_walk(&pvmw)) { |
180 | mem_cgroup_cancel_charge(new_page, memcg, false); | 183 | if (new_page) |
184 | mem_cgroup_cancel_charge(new_page, memcg, false); | ||
181 | goto unlock; | 185 | goto unlock; |
182 | } | 186 | } |
183 | VM_BUG_ON_PAGE(addr != pvmw.address, old_page); | 187 | VM_BUG_ON_PAGE(addr != pvmw.address, old_page); |
184 | 188 | ||
185 | get_page(new_page); | 189 | if (new_page) { |
186 | page_add_new_anon_rmap(new_page, vma, addr, false); | 190 | get_page(new_page); |
187 | mem_cgroup_commit_charge(new_page, memcg, false, false); | 191 | page_add_new_anon_rmap(new_page, vma, addr, false); |
188 | lru_cache_add_active_or_unevictable(new_page, vma); | 192 | mem_cgroup_commit_charge(new_page, memcg, false, false); |
193 | lru_cache_add_active_or_unevictable(new_page, vma); | ||
194 | } else | ||
195 | /* no new page, just dec_mm_counter for old_page */ | ||
196 | dec_mm_counter(mm, MM_ANONPAGES); | ||
189 | 197 | ||
190 | if (!PageAnon(old_page)) { | 198 | if (!PageAnon(old_page)) { |
191 | dec_mm_counter(mm, mm_counter_file(old_page)); | 199 | dec_mm_counter(mm, mm_counter_file(old_page)); |
@@ -194,8 +202,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
194 | 202 | ||
195 | flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); | 203 | flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); |
196 | ptep_clear_flush_notify(vma, addr, pvmw.pte); | 204 | ptep_clear_flush_notify(vma, addr, pvmw.pte); |
197 | set_pte_at_notify(mm, addr, pvmw.pte, | 205 | if (new_page) |
198 | mk_pte(new_page, vma->vm_page_prot)); | 206 | set_pte_at_notify(mm, addr, pvmw.pte, |
207 | mk_pte(new_page, vma->vm_page_prot)); | ||
199 | 208 | ||
200 | page_remove_rmap(old_page, false); | 209 | page_remove_rmap(old_page, false); |
201 | if (!page_mapped(old_page)) | 210 | if (!page_mapped(old_page)) |
@@ -464,6 +473,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, | |||
464 | struct page *old_page, *new_page; | 473 | struct page *old_page, *new_page; |
465 | struct vm_area_struct *vma; | 474 | struct vm_area_struct *vma; |
466 | int ret, is_register, ref_ctr_updated = 0; | 475 | int ret, is_register, ref_ctr_updated = 0; |
476 | bool orig_page_huge = false; | ||
467 | 477 | ||
468 | is_register = is_swbp_insn(&opcode); | 478 | is_register = is_swbp_insn(&opcode); |
469 | uprobe = container_of(auprobe, struct uprobe, arch); | 479 | uprobe = container_of(auprobe, struct uprobe, arch); |
@@ -471,7 +481,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, | |||
471 | retry: | 481 | retry: |
472 | /* Read the page with vaddr into memory */ | 482 | /* Read the page with vaddr into memory */ |
473 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, | 483 | ret = get_user_pages_remote(NULL, mm, vaddr, 1, |
474 | FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL); | 484 | FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL); |
475 | if (ret <= 0) | 485 | if (ret <= 0) |
476 | return ret; | 486 | return ret; |
477 | 487 | ||
@@ -488,6 +498,10 @@ retry: | |||
488 | ref_ctr_updated = 1; | 498 | ref_ctr_updated = 1; |
489 | } | 499 | } |
490 | 500 | ||
501 | ret = 0; | ||
502 | if (!is_register && !PageAnon(old_page)) | ||
503 | goto put_old; | ||
504 | |||
491 | ret = anon_vma_prepare(vma); | 505 | ret = anon_vma_prepare(vma); |
492 | if (ret) | 506 | if (ret) |
493 | goto put_old; | 507 | goto put_old; |
@@ -501,8 +515,33 @@ retry: | |||
501 | copy_highpage(new_page, old_page); | 515 | copy_highpage(new_page, old_page); |
502 | copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); | 516 | copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); |
503 | 517 | ||
518 | if (!is_register) { | ||
519 | struct page *orig_page; | ||
520 | pgoff_t index; | ||
521 | |||
522 | VM_BUG_ON_PAGE(!PageAnon(old_page), old_page); | ||
523 | |||
524 | index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT; | ||
525 | orig_page = find_get_page(vma->vm_file->f_inode->i_mapping, | ||
526 | index); | ||
527 | |||
528 | if (orig_page) { | ||
529 | if (PageUptodate(orig_page) && | ||
530 | pages_identical(new_page, orig_page)) { | ||
531 | /* let go new_page */ | ||
532 | put_page(new_page); | ||
533 | new_page = NULL; | ||
534 | |||
535 | if (PageCompound(orig_page)) | ||
536 | orig_page_huge = true; | ||
537 | } | ||
538 | put_page(orig_page); | ||
539 | } | ||
540 | } | ||
541 | |||
504 | ret = __replace_page(vma, vaddr, old_page, new_page); | 542 | ret = __replace_page(vma, vaddr, old_page, new_page); |
505 | put_page(new_page); | 543 | if (new_page) |
544 | put_page(new_page); | ||
506 | put_old: | 545 | put_old: |
507 | put_page(old_page); | 546 | put_page(old_page); |
508 | 547 | ||
@@ -513,6 +552,10 @@ put_old: | |||
513 | if (ret && is_register && ref_ctr_updated) | 552 | if (ret && is_register && ref_ctr_updated) |
514 | update_ref_ctr(uprobe, mm, -1); | 553 | update_ref_ctr(uprobe, mm, -1); |
515 | 554 | ||
555 | /* try collapse pmd for compound page */ | ||
556 | if (!ret && orig_page_huge) | ||
557 | collapse_pte_mapped_thp(mm, vaddr); | ||
558 | |||
516 | return ret; | 559 | return ret; |
517 | } | 560 | } |
518 | 561 | ||
diff --git a/kernel/resource.c b/kernel/resource.c index 74877e9d90ca..76036a41143b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -487,8 +487,8 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, | |||
487 | while (start < end && | 487 | while (start < end && |
488 | !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, | 488 | !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, |
489 | false, &res)) { | 489 | false, &res)) { |
490 | pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; | 490 | pfn = PFN_UP(res.start); |
491 | end_pfn = (res.end + 1) >> PAGE_SHIFT; | 491 | end_pfn = PFN_DOWN(res.end + 1); |
492 | if (end_pfn > pfn) | 492 | if (end_pfn > pfn) |
493 | ret = (*func)(pfn, end_pfn - pfn, arg); | 493 | ret = (*func)(pfn, end_pfn - pfn, arg); |
494 | if (ret) | 494 | if (ret) |
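The open-coded shifts in walk_system_ram_range() are replaced by PFN_UP()/PFN_DOWN() from <linux/pfn.h> with no change in behaviour: the start of the resource is rounded up to a full page frame and the (inclusive) end is rounded down. An illustrative compile-time check, assuming 4 KiB pages:

	#include <linux/build_bug.h>
	#include <linux/mm.h>
	#include <linux/pfn.h>

	static void example_pfn_rounding(void)
	{
		/* Start 0x1001 rounds up to pfn 2, end 0x1fff rounds down to
		 * pfn 1, so a resource spanning only that range covers no
		 * complete page frame. */
		BUILD_BUG_ON(PFN_UP(0x1001) != 2);
		BUILD_BUG_ON(PFN_DOWN(0x1fff) != 1);
	}
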
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c892c6280c9f..8dad5aa600ea 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -238,7 +238,6 @@ static void do_idle(void) | |||
238 | tick_nohz_idle_enter(); | 238 | tick_nohz_idle_enter(); |
239 | 239 | ||
240 | while (!need_resched()) { | 240 | while (!need_resched()) { |
241 | check_pgt_cache(); | ||
242 | rmb(); | 241 | rmb(); |
243 | 242 | ||
244 | local_irq_disable(); | 243 | local_irq_disable(); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 078950d9605b..00fcea236eba 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -264,7 +264,8 @@ extern struct ctl_table epoll_table[]; | |||
264 | extern struct ctl_table firmware_config_table[]; | 264 | extern struct ctl_table firmware_config_table[]; |
265 | #endif | 265 | #endif |
266 | 266 | ||
267 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT | 267 | #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ |
268 | defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) | ||
268 | int sysctl_legacy_va_layout; | 269 | int sysctl_legacy_va_layout; |
269 | #endif | 270 | #endif |
270 | 271 | ||
@@ -1573,7 +1574,8 @@ static struct ctl_table vm_table[] = { | |||
1573 | .proc_handler = proc_dointvec, | 1574 | .proc_handler = proc_dointvec, |
1574 | .extra1 = SYSCTL_ZERO, | 1575 | .extra1 = SYSCTL_ZERO, |
1575 | }, | 1576 | }, |
1576 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT | 1577 | #if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \ |
1578 | defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT) | ||
1577 | { | 1579 | { |
1578 | .procname = "legacy_va_layout", | 1580 | .procname = "legacy_va_layout", |
1579 | .data = &sysctl_legacy_va_layout, | 1581 | .data = &sysctl_legacy_va_layout, |
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e0e14780a13d..6b1b1703a646 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
@@ -576,17 +576,18 @@ config DEBUG_KMEMLEAK | |||
576 | In order to access the kmemleak file, debugfs needs to be | 576 | In order to access the kmemleak file, debugfs needs to be |
577 | mounted (usually at /sys/kernel/debug). | 577 | mounted (usually at /sys/kernel/debug). |
578 | 578 | ||
579 | config DEBUG_KMEMLEAK_EARLY_LOG_SIZE | 579 | config DEBUG_KMEMLEAK_MEM_POOL_SIZE |
580 | int "Maximum kmemleak early log entries" | 580 | int "Kmemleak memory pool size" |
581 | depends on DEBUG_KMEMLEAK | 581 | depends on DEBUG_KMEMLEAK |
582 | range 200 40000 | 582 | range 200 1000000 |
583 | default 400 | 583 | default 16000 |
584 | help | 584 | help |
585 | Kmemleak must track all the memory allocations to avoid | 585 | Kmemleak must track all the memory allocations to avoid |
586 | reporting false positives. Since memory may be allocated or | 586 | reporting false positives. Since memory may be allocated or |
587 | freed before kmemleak is initialised, an early log buffer is | 587 | freed before kmemleak is fully initialised, use a static pool |
588 | used to store these actions. If kmemleak reports "early log | 588 | of metadata objects to track such callbacks. After kmemleak is |
589 | buffer exceeded", please increase this value. | 589 | fully initialised, this memory pool acts as an emergency one |
590 | if slab allocations fail. | ||
590 | 591 | ||
591 | config DEBUG_KMEMLEAK_TEST | 592 | config DEBUG_KMEMLEAK_TEST |
592 | tristate "Simple test for the kernel memory leak detector" | 593 | tristate "Simple test for the kernel memory leak detector" |
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 7fa97a8b5717..6c9682ce0254 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan | |||
@@ -134,6 +134,14 @@ config KASAN_S390_4_LEVEL_PAGING | |||
134 | to 3TB of RAM with KASan enabled). This options allows to force | 134 | to 3TB of RAM with KASan enabled). This options allows to force |
135 | 4-level paging instead. | 135 | 4-level paging instead. |
136 | 136 | ||
137 | config KASAN_SW_TAGS_IDENTIFY | ||
138 | bool "Enable memory corruption identification" | ||
139 | depends on KASAN_SW_TAGS | ||
140 | help | ||
141 | This option enables best-effort identification of bug type | ||
142 | (use-after-free or out-of-bounds) at the cost of increased | ||
143 | memory consumption. | ||
144 | |||
137 | config TEST_KASAN | 145 | config TEST_KASAN |
138 | tristate "Module for testing KASAN for bug detection" | 146 | tristate "Module for testing KASAN for bug detection" |
139 | depends on m && KASAN | 147 | depends on m && KASAN |
diff --git a/lib/iov_iter.c b/lib/iov_iter.c index f1e0569b4539..639d5e7014c1 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c | |||
@@ -878,7 +878,7 @@ static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) | |||
878 | head = compound_head(page); | 878 | head = compound_head(page); |
879 | v += (page - head) << PAGE_SHIFT; | 879 | v += (page - head) << PAGE_SHIFT; |
880 | 880 | ||
881 | if (likely(n <= v && v <= (PAGE_SIZE << compound_order(head)))) | 881 | if (likely(n <= v && v <= (page_size(head)))) |
882 | return true; | 882 | return true; |
883 | WARN_ON(1); | 883 | WARN_ON(1); |
884 | return false; | 884 | return false; |
diff --git a/lib/show_mem.c b/lib/show_mem.c index 5c86ef4c899f..1c26c14ffbb9 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c | |||
@@ -6,7 +6,6 @@ | |||
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
9 | #include <linux/quicklist.h> | ||
10 | #include <linux/cma.h> | 9 | #include <linux/cma.h> |
11 | 10 | ||
12 | void show_mem(unsigned int filter, nodemask_t *nodemask) | 11 | void show_mem(unsigned int filter, nodemask_t *nodemask) |
@@ -39,10 +38,6 @@ void show_mem(unsigned int filter, nodemask_t *nodemask) | |||
39 | #ifdef CONFIG_CMA | 38 | #ifdef CONFIG_CMA |
40 | printk("%lu pages cma reserved\n", totalcma_pages); | 39 | printk("%lu pages cma reserved\n", totalcma_pages); |
41 | #endif | 40 | #endif |
42 | #ifdef CONFIG_QUICKLIST | ||
43 | printk("%lu pages in pagetable cache\n", | ||
44 | quicklist_total_size()); | ||
45 | #endif | ||
46 | #ifdef CONFIG_MEMORY_FAILURE | 41 | #ifdef CONFIG_MEMORY_FAILURE |
47 | printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); | 42 | printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); |
48 | #endif | 43 | #endif |
diff --git a/lib/test_kasan.c b/lib/test_kasan.c index b63b367a94e8..49cc4d570a40 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c | |||
@@ -18,6 +18,9 @@ | |||
18 | #include <linux/slab.h> | 18 | #include <linux/slab.h> |
19 | #include <linux/string.h> | 19 | #include <linux/string.h> |
20 | #include <linux/uaccess.h> | 20 | #include <linux/uaccess.h> |
21 | #include <linux/io.h> | ||
22 | |||
23 | #include <asm/page.h> | ||
21 | 24 | ||
22 | /* | 25 | /* |
23 | * Note: test functions are marked noinline so that their names appear in | 26 | * Note: test functions are marked noinline so that their names appear in |
@@ -337,6 +340,42 @@ static noinline void __init kmalloc_uaf2(void) | |||
337 | kfree(ptr2); | 340 | kfree(ptr2); |
338 | } | 341 | } |
339 | 342 | ||
343 | static noinline void __init kfree_via_page(void) | ||
344 | { | ||
345 | char *ptr; | ||
346 | size_t size = 8; | ||
347 | struct page *page; | ||
348 | unsigned long offset; | ||
349 | |||
350 | pr_info("invalid-free false positive (via page)\n"); | ||
351 | ptr = kmalloc(size, GFP_KERNEL); | ||
352 | if (!ptr) { | ||
353 | pr_err("Allocation failed\n"); | ||
354 | return; | ||
355 | } | ||
356 | |||
357 | page = virt_to_page(ptr); | ||
358 | offset = offset_in_page(ptr); | ||
359 | kfree(page_address(page) + offset); | ||
360 | } | ||
361 | |||
362 | static noinline void __init kfree_via_phys(void) | ||
363 | { | ||
364 | char *ptr; | ||
365 | size_t size = 8; | ||
366 | phys_addr_t phys; | ||
367 | |||
368 | pr_info("invalid-free false positive (via phys)\n"); | ||
369 | ptr = kmalloc(size, GFP_KERNEL); | ||
370 | if (!ptr) { | ||
371 | pr_err("Allocation failed\n"); | ||
372 | return; | ||
373 | } | ||
374 | |||
375 | phys = virt_to_phys(ptr); | ||
376 | kfree(phys_to_virt(phys)); | ||
377 | } | ||
378 | |||
340 | static noinline void __init kmem_cache_oob(void) | 379 | static noinline void __init kmem_cache_oob(void) |
341 | { | 380 | { |
342 | char *p; | 381 | char *p; |
@@ -737,6 +776,8 @@ static int __init kmalloc_tests_init(void) | |||
737 | kmalloc_uaf(); | 776 | kmalloc_uaf(); |
738 | kmalloc_uaf_memset(); | 777 | kmalloc_uaf_memset(); |
739 | kmalloc_uaf2(); | 778 | kmalloc_uaf2(); |
779 | kfree_via_page(); | ||
780 | kfree_via_phys(); | ||
740 | kmem_cache_oob(); | 781 | kmem_cache_oob(); |
741 | memcg_accounted_kmem_cache(); | 782 | memcg_accounted_kmem_cache(); |
742 | kasan_stack_oob(); | 783 | kasan_stack_oob(); |
diff --git a/mm/Kconfig b/mm/Kconfig index 2fe4902ad755..a5dae9a7eb51 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -273,11 +273,6 @@ config BOUNCE | |||
273 | by default when ZONE_DMA or HIGHMEM is selected, but you | 273 | by default when ZONE_DMA or HIGHMEM is selected, but you |
274 | may say n to override this. | 274 | may say n to override this. |
275 | 275 | ||
276 | config NR_QUICK | ||
277 | int | ||
278 | depends on QUICKLIST | ||
279 | default "1" | ||
280 | |||
281 | config VIRT_TO_BUS | 276 | config VIRT_TO_BUS |
282 | bool | 277 | bool |
283 | help | 278 | help |
@@ -717,6 +712,17 @@ config GUP_BENCHMARK | |||
717 | config GUP_GET_PTE_LOW_HIGH | 712 | config GUP_GET_PTE_LOW_HIGH |
718 | bool | 713 | bool |
719 | 714 | ||
715 | config READ_ONLY_THP_FOR_FS | ||
716 | bool "Read-only THP for filesystems (EXPERIMENTAL)" | ||
717 | depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM | ||
718 | |||
719 | help | ||
720 | Allow khugepaged to put read-only file-backed pages in THP. | ||
721 | |||
722 | This is marked experimental because it is a new feature. Write | ||
723 | support of file THPs will be developed in the next few release | ||
724 | cycles. | ||
725 | |||
720 | config ARCH_HAS_PTE_SPECIAL | 726 | config ARCH_HAS_PTE_SPECIAL |
721 | bool | 727 | bool |
722 | 728 | ||
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 82b6a20898bd..327b3ebf23bf 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug | |||
@@ -21,7 +21,9 @@ config DEBUG_PAGEALLOC | |||
21 | Also, the state of page tracking structures is checked more often as | 21 | Also, the state of page tracking structures is checked more often as |
22 | pages are being allocated and freed, as unexpected state changes | 22 | pages are being allocated and freed, as unexpected state changes |
23 | often happen for same reasons as memory corruption (e.g. double free, | 23 | often happen for same reasons as memory corruption (e.g. double free, |
24 | use-after-free). | 24 | use-after-free). The error reports for these checks can be augmented |
25 | with stack traces of last allocation and freeing of the page, when | ||
26 | PAGE_OWNER is also selected and enabled on boot. | ||
25 | 27 | ||
26 | For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, | 28 | For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, |
27 | fill the pages with poison patterns after free_pages() and verify | 29 | fill the pages with poison patterns after free_pages() and verify |
diff --git a/mm/Makefile b/mm/Makefile index d0b295c3b764..d996846697ef 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n | |||
21 | KCOV_INSTRUMENT_mmzone.o := n | 21 | KCOV_INSTRUMENT_mmzone.o := n |
22 | KCOV_INSTRUMENT_vmstat.o := n | 22 | KCOV_INSTRUMENT_vmstat.o := n |
23 | 23 | ||
24 | CFLAGS_init-mm.o += $(call cc-disable-warning, override-init) | ||
25 | CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides) | ||
26 | |||
24 | mmu-y := nommu.o | 27 | mmu-y := nommu.o |
25 | mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ | 28 | mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ |
26 | mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ | 29 | mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ |
@@ -72,7 +75,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o | |||
72 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 75 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
73 | obj-$(CONFIG_MEMTEST) += memtest.o | 76 | obj-$(CONFIG_MEMTEST) += memtest.o |
74 | obj-$(CONFIG_MIGRATION) += migrate.o | 77 | obj-$(CONFIG_MIGRATION) += migrate.o |
75 | obj-$(CONFIG_QUICKLIST) += quicklist.o | ||
76 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o | 78 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o |
77 | obj-$(CONFIG_PAGE_COUNTER) += page_counter.o | 79 | obj-$(CONFIG_PAGE_COUNTER) += page_counter.o |
78 | obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o | 80 | obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o |
diff --git a/mm/compaction.c b/mm/compaction.c index 952dc2fb24e5..ce08b39d85d4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -969,7 +969,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
969 | * is safe to read and it's 0 for tail pages. | 969 | * is safe to read and it's 0 for tail pages. |
970 | */ | 970 | */ |
971 | if (unlikely(PageCompound(page))) { | 971 | if (unlikely(PageCompound(page))) { |
972 | low_pfn += (1UL << compound_order(page)) - 1; | 972 | low_pfn += compound_nr(page) - 1; |
973 | goto isolate_fail; | 973 | goto isolate_fail; |
974 | } | 974 | } |
975 | } | 975 | } |
@@ -1737,8 +1737,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) | |||
1737 | * starting at the block pointed to by the migrate scanner pfn within | 1737 | * starting at the block pointed to by the migrate scanner pfn within |
1738 | * compact_control. | 1738 | * compact_control. |
1739 | */ | 1739 | */ |
1740 | static isolate_migrate_t isolate_migratepages(struct zone *zone, | 1740 | static isolate_migrate_t isolate_migratepages(struct compact_control *cc) |
1741 | struct compact_control *cc) | ||
1742 | { | 1741 | { |
1743 | unsigned long block_start_pfn; | 1742 | unsigned long block_start_pfn; |
1744 | unsigned long block_end_pfn; | 1743 | unsigned long block_end_pfn; |
@@ -1756,8 +1755,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1756 | */ | 1755 | */ |
1757 | low_pfn = fast_find_migrateblock(cc); | 1756 | low_pfn = fast_find_migrateblock(cc); |
1758 | block_start_pfn = pageblock_start_pfn(low_pfn); | 1757 | block_start_pfn = pageblock_start_pfn(low_pfn); |
1759 | if (block_start_pfn < zone->zone_start_pfn) | 1758 | if (block_start_pfn < cc->zone->zone_start_pfn) |
1760 | block_start_pfn = zone->zone_start_pfn; | 1759 | block_start_pfn = cc->zone->zone_start_pfn; |
1761 | 1760 | ||
1762 | /* | 1761 | /* |
1763 | * fast_find_migrateblock marks a pageblock skipped so to avoid | 1762 | * fast_find_migrateblock marks a pageblock skipped so to avoid |
@@ -1787,8 +1786,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1787 | if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) | 1786 | if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) |
1788 | cond_resched(); | 1787 | cond_resched(); |
1789 | 1788 | ||
1790 | page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, | 1789 | page = pageblock_pfn_to_page(block_start_pfn, |
1791 | zone); | 1790 | block_end_pfn, cc->zone); |
1792 | if (!page) | 1791 | if (!page) |
1793 | continue; | 1792 | continue; |
1794 | 1793 | ||
@@ -2078,6 +2077,17 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) | |||
2078 | const bool sync = cc->mode != MIGRATE_ASYNC; | 2077 | const bool sync = cc->mode != MIGRATE_ASYNC; |
2079 | bool update_cached; | 2078 | bool update_cached; |
2080 | 2079 | ||
2080 | /* | ||
2081 | * These counters track activities during zone compaction. Initialize | ||
2082 | * them before compacting a new zone. | ||
2083 | */ | ||
2084 | cc->total_migrate_scanned = 0; | ||
2085 | cc->total_free_scanned = 0; | ||
2086 | cc->nr_migratepages = 0; | ||
2087 | cc->nr_freepages = 0; | ||
2088 | INIT_LIST_HEAD(&cc->freepages); | ||
2089 | INIT_LIST_HEAD(&cc->migratepages); | ||
2090 | |||
2081 | cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); | 2091 | cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); |
2082 | ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, | 2092 | ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, |
2083 | cc->classzone_idx); | 2093 | cc->classzone_idx); |
@@ -2158,7 +2168,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) | |||
2158 | cc->rescan = true; | 2168 | cc->rescan = true; |
2159 | } | 2169 | } |
2160 | 2170 | ||
2161 | switch (isolate_migratepages(cc->zone, cc)) { | 2171 | switch (isolate_migratepages(cc)) { |
2162 | case ISOLATE_ABORT: | 2172 | case ISOLATE_ABORT: |
2163 | ret = COMPACT_CONTENDED; | 2173 | ret = COMPACT_CONTENDED; |
2164 | putback_movable_pages(&cc->migratepages); | 2174 | putback_movable_pages(&cc->migratepages); |
@@ -2281,10 +2291,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, | |||
2281 | { | 2291 | { |
2282 | enum compact_result ret; | 2292 | enum compact_result ret; |
2283 | struct compact_control cc = { | 2293 | struct compact_control cc = { |
2284 | .nr_freepages = 0, | ||
2285 | .nr_migratepages = 0, | ||
2286 | .total_migrate_scanned = 0, | ||
2287 | .total_free_scanned = 0, | ||
2288 | .order = order, | 2294 | .order = order, |
2289 | .search_order = order, | 2295 | .search_order = order, |
2290 | .gfp_mask = gfp_mask, | 2296 | .gfp_mask = gfp_mask, |
@@ -2305,8 +2311,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order, | |||
2305 | 2311 | ||
2306 | if (capture) | 2312 | if (capture) |
2307 | current->capture_control = &capc; | 2313 | current->capture_control = &capc; |
2308 | INIT_LIST_HEAD(&cc.freepages); | ||
2309 | INIT_LIST_HEAD(&cc.migratepages); | ||
2310 | 2314 | ||
2311 | ret = compact_zone(&cc, &capc); | 2315 | ret = compact_zone(&cc, &capc); |
2312 | 2316 | ||
@@ -2408,8 +2412,6 @@ static void compact_node(int nid) | |||
2408 | struct zone *zone; | 2412 | struct zone *zone; |
2409 | struct compact_control cc = { | 2413 | struct compact_control cc = { |
2410 | .order = -1, | 2414 | .order = -1, |
2411 | .total_migrate_scanned = 0, | ||
2412 | .total_free_scanned = 0, | ||
2413 | .mode = MIGRATE_SYNC, | 2415 | .mode = MIGRATE_SYNC, |
2414 | .ignore_skip_hint = true, | 2416 | .ignore_skip_hint = true, |
2415 | .whole_zone = true, | 2417 | .whole_zone = true, |
@@ -2423,11 +2425,7 @@ static void compact_node(int nid) | |||
2423 | if (!populated_zone(zone)) | 2425 | if (!populated_zone(zone)) |
2424 | continue; | 2426 | continue; |
2425 | 2427 | ||
2426 | cc.nr_freepages = 0; | ||
2427 | cc.nr_migratepages = 0; | ||
2428 | cc.zone = zone; | 2428 | cc.zone = zone; |
2429 | INIT_LIST_HEAD(&cc.freepages); | ||
2430 | INIT_LIST_HEAD(&cc.migratepages); | ||
2431 | 2429 | ||
2432 | compact_zone(&cc, NULL); | 2430 | compact_zone(&cc, NULL); |
2433 | 2431 | ||
@@ -2529,8 +2527,6 @@ static void kcompactd_do_work(pg_data_t *pgdat) | |||
2529 | struct compact_control cc = { | 2527 | struct compact_control cc = { |
2530 | .order = pgdat->kcompactd_max_order, | 2528 | .order = pgdat->kcompactd_max_order, |
2531 | .search_order = pgdat->kcompactd_max_order, | 2529 | .search_order = pgdat->kcompactd_max_order, |
2532 | .total_migrate_scanned = 0, | ||
2533 | .total_free_scanned = 0, | ||
2534 | .classzone_idx = pgdat->kcompactd_classzone_idx, | 2530 | .classzone_idx = pgdat->kcompactd_classzone_idx, |
2535 | .mode = MIGRATE_SYNC_LIGHT, | 2531 | .mode = MIGRATE_SYNC_LIGHT, |
2536 | .ignore_skip_hint = false, | 2532 | .ignore_skip_hint = false, |
@@ -2554,16 +2550,10 @@ static void kcompactd_do_work(pg_data_t *pgdat) | |||
2554 | COMPACT_CONTINUE) | 2550 | COMPACT_CONTINUE) |
2555 | continue; | 2551 | continue; |
2556 | 2552 | ||
2557 | cc.nr_freepages = 0; | ||
2558 | cc.nr_migratepages = 0; | ||
2559 | cc.total_migrate_scanned = 0; | ||
2560 | cc.total_free_scanned = 0; | ||
2561 | cc.zone = zone; | ||
2562 | INIT_LIST_HEAD(&cc.freepages); | ||
2563 | INIT_LIST_HEAD(&cc.migratepages); | ||
2564 | |||
2565 | if (kthread_should_stop()) | 2553 | if (kthread_should_stop()) |
2566 | return; | 2554 | return; |
2555 | |||
2556 | cc.zone = zone; | ||
2567 | status = compact_zone(&cc, NULL); | 2557 | status = compact_zone(&cc, NULL); |
2568 | 2558 | ||
2569 | if (status == COMPACT_SUCCESS) { | 2559 | if (status == COMPACT_SUCCESS) { |
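The remaining mm/compaction.c hunks all serve one cleanup: the per-run state of compact_control (scan counters, page counts, free/migrate lists) is now initialized at the top of compact_zone() instead of at every call site. A simplified sketch of that pattern in plain C, with abbreviated names that are not the kernel's:

    #include <stdio.h>

    struct ctl {
            int order;                      /* configuration, set by the caller */
            unsigned long migrate_scanned;  /* per-run state, reset by run_zone() */
            unsigned long free_scanned;
    };

    static void run_zone(struct ctl *cc, int zone_id)
    {
            /* reset per-run counters here, once, instead of at each call site */
            cc->migrate_scanned = 0;
            cc->free_scanned = 0;

            /* ... scanning work would go here ... */
            cc->migrate_scanned += 128;
            cc->free_scanned += 64;

            printf("zone %d: scanned %lu migrate / %lu free\n",
                   zone_id, cc->migrate_scanned, cc->free_scanned);
    }

    int main(void)
    {
            struct ctl cc = { .order = 3 };
            int zone;

            /* callers just loop over zones with no re-initialization boilerplate */
            for (zone = 0; zone < 3; zone++)
                    run_zone(&cc, zone);
            return 0;
    }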
diff --git a/mm/filemap.c b/mm/filemap.c index 40667c2f3383..1146fcfa3215 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -126,7 +126,7 @@ static void page_cache_delete(struct address_space *mapping, | |||
126 | /* hugetlb pages are represented by a single entry in the xarray */ | 126 | /* hugetlb pages are represented by a single entry in the xarray */ |
127 | if (!PageHuge(page)) { | 127 | if (!PageHuge(page)) { |
128 | xas_set_order(&xas, page->index, compound_order(page)); | 128 | xas_set_order(&xas, page->index, compound_order(page)); |
129 | nr = 1U << compound_order(page); | 129 | nr = compound_nr(page); |
130 | } | 130 | } |
131 | 131 | ||
132 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 132 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
@@ -203,8 +203,9 @@ static void unaccount_page_cache_page(struct address_space *mapping, | |||
203 | __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); | 203 | __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); |
204 | if (PageTransHuge(page)) | 204 | if (PageTransHuge(page)) |
205 | __dec_node_page_state(page, NR_SHMEM_THPS); | 205 | __dec_node_page_state(page, NR_SHMEM_THPS); |
206 | } else { | 206 | } else if (PageTransHuge(page)) { |
207 | VM_BUG_ON_PAGE(PageTransHuge(page), page); | 207 | __dec_node_page_state(page, NR_FILE_THPS); |
208 | filemap_nr_thps_dec(mapping); | ||
208 | } | 209 | } |
209 | 210 | ||
210 | /* | 211 | /* |
@@ -281,11 +282,11 @@ EXPORT_SYMBOL(delete_from_page_cache); | |||
281 | * @pvec: pagevec with pages to delete | 282 | * @pvec: pagevec with pages to delete |
282 | * | 283 | * |
283 | * The function walks over mapping->i_pages and removes pages passed in @pvec | 284 | * The function walks over mapping->i_pages and removes pages passed in @pvec |
284 | * from the mapping. The function expects @pvec to be sorted by page index. | 285 | * from the mapping. The function expects @pvec to be sorted by page index |
286 | * and is optimised for it to be dense. | ||
285 | * It tolerates holes in @pvec (mapping entries at those indices are not | 287 | * It tolerates holes in @pvec (mapping entries at those indices are not |
286 | * modified). The function expects only THP head pages to be present in the | 288 | * modified). The function expects only THP head pages to be present in the |
287 | * @pvec and takes care to delete all corresponding tail pages from the | 289 | * @pvec. |
288 | * mapping as well. | ||
289 | * | 290 | * |
290 | * The function expects the i_pages lock to be held. | 291 | * The function expects the i_pages lock to be held. |
291 | */ | 292 | */ |
@@ -294,40 +295,43 @@ static void page_cache_delete_batch(struct address_space *mapping, | |||
294 | { | 295 | { |
295 | XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); | 296 | XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); |
296 | int total_pages = 0; | 297 | int total_pages = 0; |
297 | int i = 0, tail_pages = 0; | 298 | int i = 0; |
298 | struct page *page; | 299 | struct page *page; |
299 | 300 | ||
300 | mapping_set_update(&xas, mapping); | 301 | mapping_set_update(&xas, mapping); |
301 | xas_for_each(&xas, page, ULONG_MAX) { | 302 | xas_for_each(&xas, page, ULONG_MAX) { |
302 | if (i >= pagevec_count(pvec) && !tail_pages) | 303 | if (i >= pagevec_count(pvec)) |
303 | break; | 304 | break; |
305 | |||
306 | /* A swap/dax/shadow entry got inserted? Skip it. */ | ||
304 | if (xa_is_value(page)) | 307 | if (xa_is_value(page)) |
305 | continue; | 308 | continue; |
306 | if (!tail_pages) { | 309 | /* |
307 | /* | 310 | * A page got inserted in our range? Skip it. We have our |
308 | * Some page got inserted in our range? Skip it. We | 311 | * pages locked so they are protected from being removed. |
309 | * have our pages locked so they are protected from | 312 | * If we see a page whose index is higher than ours, it |
310 | * being removed. | 313 | * means our page has been removed, which shouldn't be |
311 | */ | 314 | * possible because we're holding the PageLock. |
312 | if (page != pvec->pages[i]) { | 315 | */ |
313 | VM_BUG_ON_PAGE(page->index > | 316 | if (page != pvec->pages[i]) { |
314 | pvec->pages[i]->index, page); | 317 | VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, |
315 | continue; | 318 | page); |
316 | } | 319 | continue; |
317 | WARN_ON_ONCE(!PageLocked(page)); | 320 | } |
318 | if (PageTransHuge(page) && !PageHuge(page)) | 321 | |
319 | tail_pages = HPAGE_PMD_NR - 1; | 322 | WARN_ON_ONCE(!PageLocked(page)); |
323 | |||
324 | if (page->index == xas.xa_index) | ||
320 | page->mapping = NULL; | 325 | page->mapping = NULL; |
321 | /* | 326 | /* Leave page->index set: truncation lookup relies on it */ |
322 | * Leave page->index set: truncation lookup relies | 327 | |
323 | * upon it | 328 | /* |
324 | */ | 329 | * Move to the next page in the vector if this is a regular |
330 | * page or the index is of the last sub-page of this compound | ||
331 | * page. | ||
332 | */ | ||
333 | if (page->index + compound_nr(page) - 1 == xas.xa_index) | ||
325 | i++; | 334 | i++; |
326 | } else { | ||
327 | VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages | ||
328 | != pvec->pages[i]->index, page); | ||
329 | tail_pages--; | ||
330 | } | ||
331 | xas_store(&xas, NULL); | 335 | xas_store(&xas, NULL); |
332 | total_pages++; | 336 | total_pages++; |
333 | } | 337 | } |
@@ -408,7 +412,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |||
408 | .range_end = end, | 412 | .range_end = end, |
409 | }; | 413 | }; |
410 | 414 | ||
411 | if (!mapping_cap_writeback_dirty(mapping)) | 415 | if (!mapping_cap_writeback_dirty(mapping) || |
416 | !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) | ||
412 | return 0; | 417 | return 0; |
413 | 418 | ||
414 | wbc_attach_fdatawrite_inode(&wbc, mapping->host); | 419 | wbc_attach_fdatawrite_inode(&wbc, mapping->host); |
@@ -617,10 +622,13 @@ int filemap_fdatawait_keep_errors(struct address_space *mapping) | |||
617 | } | 622 | } |
618 | EXPORT_SYMBOL(filemap_fdatawait_keep_errors); | 623 | EXPORT_SYMBOL(filemap_fdatawait_keep_errors); |
619 | 624 | ||
625 | /* Returns true if writeback might be needed or already in progress. */ | ||
620 | static bool mapping_needs_writeback(struct address_space *mapping) | 626 | static bool mapping_needs_writeback(struct address_space *mapping) |
621 | { | 627 | { |
622 | return (!dax_mapping(mapping) && mapping->nrpages) || | 628 | if (dax_mapping(mapping)) |
623 | (dax_mapping(mapping) && mapping->nrexceptional); | 629 | return mapping->nrexceptional; |
630 | |||
631 | return mapping->nrpages; | ||
624 | } | 632 | } |
625 | 633 | ||
626 | int filemap_write_and_wait(struct address_space *mapping) | 634 | int filemap_write_and_wait(struct address_space *mapping) |
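The rewritten mapping_needs_writeback() is behaviorally identical to the old one-line boolean; it just selects the relevant counter depending on whether the mapping is DAX. A small standalone check of the equivalence, using toy types in place of struct address_space:

    #include <assert.h>
    #include <stdbool.h>

    struct toy_mapping {
            bool dax;
            unsigned long nrpages;
            unsigned long nrexceptional;
    };

    static bool old_form(const struct toy_mapping *m)
    {
            return (!m->dax && m->nrpages) || (m->dax && m->nrexceptional);
    }

    static bool new_form(const struct toy_mapping *m)
    {
            if (m->dax)
                    return m->nrexceptional;
            return m->nrpages;
    }

    int main(void)
    {
            /* compare both forms over a few representative counter states */
            for (int dax = 0; dax <= 1; dax++)
                    for (unsigned long np = 0; np <= 2; np++)
                            for (unsigned long ne = 0; ne <= 2; ne++) {
                                    struct toy_mapping m = { dax, np, ne };
                                    assert(old_form(&m) == new_form(&m));
                            }
            return 0;
    }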
@@ -1516,7 +1524,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); | |||
1516 | struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) | 1524 | struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) |
1517 | { | 1525 | { |
1518 | XA_STATE(xas, &mapping->i_pages, offset); | 1526 | XA_STATE(xas, &mapping->i_pages, offset); |
1519 | struct page *head, *page; | 1527 | struct page *page; |
1520 | 1528 | ||
1521 | rcu_read_lock(); | 1529 | rcu_read_lock(); |
1522 | repeat: | 1530 | repeat: |
@@ -1531,25 +1539,19 @@ repeat: | |||
1531 | if (!page || xa_is_value(page)) | 1539 | if (!page || xa_is_value(page)) |
1532 | goto out; | 1540 | goto out; |
1533 | 1541 | ||
1534 | head = compound_head(page); | 1542 | if (!page_cache_get_speculative(page)) |
1535 | if (!page_cache_get_speculative(head)) | ||
1536 | goto repeat; | 1543 | goto repeat; |
1537 | 1544 | ||
1538 | /* The page was split under us? */ | ||
1539 | if (compound_head(page) != head) { | ||
1540 | put_page(head); | ||
1541 | goto repeat; | ||
1542 | } | ||
1543 | |||
1544 | /* | 1545 | /* |
1545 | * Has the page moved? | 1546 | * Has the page moved or been split? |
1546 | * This is part of the lockless pagecache protocol. See | 1547 | * This is part of the lockless pagecache protocol. See |
1547 | * include/linux/pagemap.h for details. | 1548 | * include/linux/pagemap.h for details. |
1548 | */ | 1549 | */ |
1549 | if (unlikely(page != xas_reload(&xas))) { | 1550 | if (unlikely(page != xas_reload(&xas))) { |
1550 | put_page(head); | 1551 | put_page(page); |
1551 | goto repeat; | 1552 | goto repeat; |
1552 | } | 1553 | } |
1554 | page = find_subpage(page, offset); | ||
1553 | out: | 1555 | out: |
1554 | rcu_read_unlock(); | 1556 | rcu_read_unlock(); |
1555 | 1557 | ||
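find_subpage() appears throughout the reworked lookups because the page cache now keeps only head pages in the XArray, so a hit on a THP must be translated back to the subpage that covers the requested index. Conceptually it does something like the toy model below (the real helper also special-cases hugetlbfs pages and uses compound_nr() instead of a fixed size):

    #include <assert.h>
    #include <stdio.h>

    #define TOY_HPAGE_NR 512u              /* subpages in a 2MB THP on x86-64 */

    struct toy_page {
            unsigned long index;           /* page-cache index of this subpage */
            int is_head;
    };

    /* return the subpage of a compound page that covers @index */
    static struct toy_page *toy_find_subpage(struct toy_page *head,
                                             unsigned long index)
    {
            if (!head->is_head)
                    return head;           /* order-0 page: nothing to adjust */
            return head + (index & (TOY_HPAGE_NR - 1));
    }

    int main(void)
    {
            static struct toy_page thp[TOY_HPAGE_NR];
            unsigned long i;

            thp[0].is_head = 1;
            for (i = 0; i < TOY_HPAGE_NR; i++)
                    thp[i].index = 1024 + i;      /* THP starts at index 1024 */

            /* a lookup for index 1027 lands on the fourth subpage */
            assert(toy_find_subpage(&thp[0], 1027) == &thp[3]);
            printf("subpage index: %lu\n",
                   toy_find_subpage(&thp[0], 1027)->index);
            return 0;
    }

This only works because a THP is naturally aligned in the file, so masking with the compound size picks the right offset.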
@@ -1646,7 +1648,7 @@ repeat: | |||
1646 | } | 1648 | } |
1647 | 1649 | ||
1648 | /* Has the page been truncated? */ | 1650 | /* Has the page been truncated? */ |
1649 | if (unlikely(page->mapping != mapping)) { | 1651 | if (unlikely(compound_head(page)->mapping != mapping)) { |
1650 | unlock_page(page); | 1652 | unlock_page(page); |
1651 | put_page(page); | 1653 | put_page(page); |
1652 | goto repeat; | 1654 | goto repeat; |
@@ -1731,7 +1733,6 @@ unsigned find_get_entries(struct address_space *mapping, | |||
1731 | 1733 | ||
1732 | rcu_read_lock(); | 1734 | rcu_read_lock(); |
1733 | xas_for_each(&xas, page, ULONG_MAX) { | 1735 | xas_for_each(&xas, page, ULONG_MAX) { |
1734 | struct page *head; | ||
1735 | if (xas_retry(&xas, page)) | 1736 | if (xas_retry(&xas, page)) |
1736 | continue; | 1737 | continue; |
1737 | /* | 1738 | /* |
@@ -1742,17 +1743,13 @@ unsigned find_get_entries(struct address_space *mapping, | |||
1742 | if (xa_is_value(page)) | 1743 | if (xa_is_value(page)) |
1743 | goto export; | 1744 | goto export; |
1744 | 1745 | ||
1745 | head = compound_head(page); | 1746 | if (!page_cache_get_speculative(page)) |
1746 | if (!page_cache_get_speculative(head)) | ||
1747 | goto retry; | 1747 | goto retry; |
1748 | 1748 | ||
1749 | /* The page was split under us? */ | 1749 | /* Has the page moved or been split? */ |
1750 | if (compound_head(page) != head) | ||
1751 | goto put_page; | ||
1752 | |||
1753 | /* Has the page moved? */ | ||
1754 | if (unlikely(page != xas_reload(&xas))) | 1750 | if (unlikely(page != xas_reload(&xas))) |
1755 | goto put_page; | 1751 | goto put_page; |
1752 | page = find_subpage(page, xas.xa_index); | ||
1756 | 1753 | ||
1757 | export: | 1754 | export: |
1758 | indices[ret] = xas.xa_index; | 1755 | indices[ret] = xas.xa_index; |
@@ -1761,7 +1758,7 @@ export: | |||
1761 | break; | 1758 | break; |
1762 | continue; | 1759 | continue; |
1763 | put_page: | 1760 | put_page: |
1764 | put_page(head); | 1761 | put_page(page); |
1765 | retry: | 1762 | retry: |
1766 | xas_reset(&xas); | 1763 | xas_reset(&xas); |
1767 | } | 1764 | } |
@@ -1803,33 +1800,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, | |||
1803 | 1800 | ||
1804 | rcu_read_lock(); | 1801 | rcu_read_lock(); |
1805 | xas_for_each(&xas, page, end) { | 1802 | xas_for_each(&xas, page, end) { |
1806 | struct page *head; | ||
1807 | if (xas_retry(&xas, page)) | 1803 | if (xas_retry(&xas, page)) |
1808 | continue; | 1804 | continue; |
1809 | /* Skip over shadow, swap and DAX entries */ | 1805 | /* Skip over shadow, swap and DAX entries */ |
1810 | if (xa_is_value(page)) | 1806 | if (xa_is_value(page)) |
1811 | continue; | 1807 | continue; |
1812 | 1808 | ||
1813 | head = compound_head(page); | 1809 | if (!page_cache_get_speculative(page)) |
1814 | if (!page_cache_get_speculative(head)) | ||
1815 | goto retry; | 1810 | goto retry; |
1816 | 1811 | ||
1817 | /* The page was split under us? */ | 1812 | /* Has the page moved or been split? */ |
1818 | if (compound_head(page) != head) | ||
1819 | goto put_page; | ||
1820 | |||
1821 | /* Has the page moved? */ | ||
1822 | if (unlikely(page != xas_reload(&xas))) | 1813 | if (unlikely(page != xas_reload(&xas))) |
1823 | goto put_page; | 1814 | goto put_page; |
1824 | 1815 | ||
1825 | pages[ret] = page; | 1816 | pages[ret] = find_subpage(page, xas.xa_index); |
1826 | if (++ret == nr_pages) { | 1817 | if (++ret == nr_pages) { |
1827 | *start = xas.xa_index + 1; | 1818 | *start = xas.xa_index + 1; |
1828 | goto out; | 1819 | goto out; |
1829 | } | 1820 | } |
1830 | continue; | 1821 | continue; |
1831 | put_page: | 1822 | put_page: |
1832 | put_page(head); | 1823 | put_page(page); |
1833 | retry: | 1824 | retry: |
1834 | xas_reset(&xas); | 1825 | xas_reset(&xas); |
1835 | } | 1826 | } |
@@ -1874,7 +1865,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
1874 | 1865 | ||
1875 | rcu_read_lock(); | 1866 | rcu_read_lock(); |
1876 | for (page = xas_load(&xas); page; page = xas_next(&xas)) { | 1867 | for (page = xas_load(&xas); page; page = xas_next(&xas)) { |
1877 | struct page *head; | ||
1878 | if (xas_retry(&xas, page)) | 1868 | if (xas_retry(&xas, page)) |
1879 | continue; | 1869 | continue; |
1880 | /* | 1870 | /* |
@@ -1884,24 +1874,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
1884 | if (xa_is_value(page)) | 1874 | if (xa_is_value(page)) |
1885 | break; | 1875 | break; |
1886 | 1876 | ||
1887 | head = compound_head(page); | 1877 | if (!page_cache_get_speculative(page)) |
1888 | if (!page_cache_get_speculative(head)) | ||
1889 | goto retry; | 1878 | goto retry; |
1890 | 1879 | ||
1891 | /* The page was split under us? */ | 1880 | /* Has the page moved or been split? */ |
1892 | if (compound_head(page) != head) | ||
1893 | goto put_page; | ||
1894 | |||
1895 | /* Has the page moved? */ | ||
1896 | if (unlikely(page != xas_reload(&xas))) | 1881 | if (unlikely(page != xas_reload(&xas))) |
1897 | goto put_page; | 1882 | goto put_page; |
1898 | 1883 | ||
1899 | pages[ret] = page; | 1884 | pages[ret] = find_subpage(page, xas.xa_index); |
1900 | if (++ret == nr_pages) | 1885 | if (++ret == nr_pages) |
1901 | break; | 1886 | break; |
1902 | continue; | 1887 | continue; |
1903 | put_page: | 1888 | put_page: |
1904 | put_page(head); | 1889 | put_page(page); |
1905 | retry: | 1890 | retry: |
1906 | xas_reset(&xas); | 1891 | xas_reset(&xas); |
1907 | } | 1892 | } |
@@ -1937,7 +1922,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, | |||
1937 | 1922 | ||
1938 | rcu_read_lock(); | 1923 | rcu_read_lock(); |
1939 | xas_for_each_marked(&xas, page, end, tag) { | 1924 | xas_for_each_marked(&xas, page, end, tag) { |
1940 | struct page *head; | ||
1941 | if (xas_retry(&xas, page)) | 1925 | if (xas_retry(&xas, page)) |
1942 | continue; | 1926 | continue; |
1943 | /* | 1927 | /* |
@@ -1948,26 +1932,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, | |||
1948 | if (xa_is_value(page)) | 1932 | if (xa_is_value(page)) |
1949 | continue; | 1933 | continue; |
1950 | 1934 | ||
1951 | head = compound_head(page); | 1935 | if (!page_cache_get_speculative(page)) |
1952 | if (!page_cache_get_speculative(head)) | ||
1953 | goto retry; | 1936 | goto retry; |
1954 | 1937 | ||
1955 | /* The page was split under us? */ | 1938 | /* Has the page moved or been split? */ |
1956 | if (compound_head(page) != head) | ||
1957 | goto put_page; | ||
1958 | |||
1959 | /* Has the page moved? */ | ||
1960 | if (unlikely(page != xas_reload(&xas))) | 1939 | if (unlikely(page != xas_reload(&xas))) |
1961 | goto put_page; | 1940 | goto put_page; |
1962 | 1941 | ||
1963 | pages[ret] = page; | 1942 | pages[ret] = find_subpage(page, xas.xa_index); |
1964 | if (++ret == nr_pages) { | 1943 | if (++ret == nr_pages) { |
1965 | *index = xas.xa_index + 1; | 1944 | *index = xas.xa_index + 1; |
1966 | goto out; | 1945 | goto out; |
1967 | } | 1946 | } |
1968 | continue; | 1947 | continue; |
1969 | put_page: | 1948 | put_page: |
1970 | put_page(head); | 1949 | put_page(page); |
1971 | retry: | 1950 | retry: |
1972 | xas_reset(&xas); | 1951 | xas_reset(&xas); |
1973 | } | 1952 | } |
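All of these lookup loops now share one shape: take a speculative reference on whatever the XArray slot held, then re-read the slot and retry if it changed because the page was freed, moved or split in the meantime. The toy userspace model below illustrates that "speculative get, then recheck" idiom with C11 atomics; it is an illustration of the idiom only, not the kernel's implementation:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct toy_page { atomic_int refcount; int id; };

    /* take a reference only if the object is still live (refcount > 0) */
    static bool toy_get_unless_zero(struct toy_page *p)
    {
            int old = atomic_load(&p->refcount);

            while (old > 0) {
                    if (atomic_compare_exchange_weak(&p->refcount, &old, old + 1))
                            return true;
            }
            return false;
    }

    static void toy_put(struct toy_page *p)
    {
            atomic_fetch_sub(&p->refcount, 1);
    }

    /* one slot of a toy page cache */
    static _Atomic(struct toy_page *) slot;

    static struct toy_page *toy_lookup(void)
    {
            struct toy_page *page;

    repeat:
            page = atomic_load(&slot);
            if (!page)
                    return NULL;
            if (!toy_get_unless_zero(page))     /* speculative reference */
                    goto repeat;
            if (page != atomic_load(&slot)) {   /* has the slot changed? */
                    toy_put(page);
                    goto repeat;                /* yes: drop the ref, retry */
            }
            return page;                        /* stable reference */
    }

    int main(void)
    {
            static struct toy_page a = { 1, 42 };
            struct toy_page *p;

            atomic_store(&slot, &a);
            p = toy_lookup();
            printf("got page %d, refcount now %d\n",
                   p->id, atomic_load(&p->refcount));
            toy_put(p);
            return 0;
    }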
@@ -2562,12 +2541,12 @@ retry_find: | |||
2562 | goto out_retry; | 2541 | goto out_retry; |
2563 | 2542 | ||
2564 | /* Did it get truncated? */ | 2543 | /* Did it get truncated? */ |
2565 | if (unlikely(page->mapping != mapping)) { | 2544 | if (unlikely(compound_head(page)->mapping != mapping)) { |
2566 | unlock_page(page); | 2545 | unlock_page(page); |
2567 | put_page(page); | 2546 | put_page(page); |
2568 | goto retry_find; | 2547 | goto retry_find; |
2569 | } | 2548 | } |
2570 | VM_BUG_ON_PAGE(page->index != offset, page); | 2549 | VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); |
2571 | 2550 | ||
2572 | /* | 2551 | /* |
2573 | * We have a locked page in the page cache, now we need to check | 2552 | * We have a locked page in the page cache, now we need to check |
@@ -2648,7 +2627,7 @@ void filemap_map_pages(struct vm_fault *vmf, | |||
2648 | pgoff_t last_pgoff = start_pgoff; | 2627 | pgoff_t last_pgoff = start_pgoff; |
2649 | unsigned long max_idx; | 2628 | unsigned long max_idx; |
2650 | XA_STATE(xas, &mapping->i_pages, start_pgoff); | 2629 | XA_STATE(xas, &mapping->i_pages, start_pgoff); |
2651 | struct page *head, *page; | 2630 | struct page *page; |
2652 | 2631 | ||
2653 | rcu_read_lock(); | 2632 | rcu_read_lock(); |
2654 | xas_for_each(&xas, page, end_pgoff) { | 2633 | xas_for_each(&xas, page, end_pgoff) { |
@@ -2657,24 +2636,19 @@ void filemap_map_pages(struct vm_fault *vmf, | |||
2657 | if (xa_is_value(page)) | 2636 | if (xa_is_value(page)) |
2658 | goto next; | 2637 | goto next; |
2659 | 2638 | ||
2660 | head = compound_head(page); | ||
2661 | |||
2662 | /* | 2639 | /* |
2663 | * Check for a locked page first, as a speculative | 2640 | * Check for a locked page first, as a speculative |
2664 | * reference may adversely influence page migration. | 2641 | * reference may adversely influence page migration. |
2665 | */ | 2642 | */ |
2666 | if (PageLocked(head)) | 2643 | if (PageLocked(page)) |
2667 | goto next; | 2644 | goto next; |
2668 | if (!page_cache_get_speculative(head)) | 2645 | if (!page_cache_get_speculative(page)) |
2669 | goto next; | 2646 | goto next; |
2670 | 2647 | ||
2671 | /* The page was split under us? */ | 2648 | /* Has the page moved or been split? */ |
2672 | if (compound_head(page) != head) | ||
2673 | goto skip; | ||
2674 | |||
2675 | /* Has the page moved? */ | ||
2676 | if (unlikely(page != xas_reload(&xas))) | 2649 | if (unlikely(page != xas_reload(&xas))) |
2677 | goto skip; | 2650 | goto skip; |
2651 | page = find_subpage(page, xas.xa_index); | ||
2678 | 2652 | ||
2679 | if (!PageUptodate(page) || | 2653 | if (!PageUptodate(page) || |
2680 | PageReadahead(page) || | 2654 | PageReadahead(page) || |
diff --git a/mm/gup.c b/mm/gup.c --- a/mm/gup.c +++ b/mm/gup.c | |||
@@ -29,85 +29,70 @@ struct follow_page_context { | |||
29 | unsigned int page_mask; | 29 | unsigned int page_mask; |
30 | }; | 30 | }; |
31 | 31 | ||
32 | typedef int (*set_dirty_func_t)(struct page *page); | ||
33 | |||
34 | static void __put_user_pages_dirty(struct page **pages, | ||
35 | unsigned long npages, | ||
36 | set_dirty_func_t sdf) | ||
37 | { | ||
38 | unsigned long index; | ||
39 | |||
40 | for (index = 0; index < npages; index++) { | ||
41 | struct page *page = compound_head(pages[index]); | ||
42 | |||
43 | /* | ||
44 | * Checking PageDirty at this point may race with | ||
45 | * clear_page_dirty_for_io(), but that's OK. Two key cases: | ||
46 | * | ||
47 | * 1) This code sees the page as already dirty, so it skips | ||
48 | * the call to sdf(). That could happen because | ||
49 | * clear_page_dirty_for_io() called page_mkclean(), | ||
50 | * followed by set_page_dirty(). However, now the page is | ||
51 | * going to get written back, which meets the original | ||
52 | * intention of setting it dirty, so all is well: | ||
53 | * clear_page_dirty_for_io() goes on to call | ||
54 | * TestClearPageDirty(), and write the page back. | ||
55 | * | ||
56 | * 2) This code sees the page as clean, so it calls sdf(). | ||
57 | * The page stays dirty, despite being written back, so it | ||
58 | * gets written back again in the next writeback cycle. | ||
59 | * This is harmless. | ||
60 | */ | ||
61 | if (!PageDirty(page)) | ||
62 | sdf(page); | ||
63 | |||
64 | put_user_page(page); | ||
65 | } | ||
66 | } | ||
67 | |||
68 | /** | 32 | /** |
69 | * put_user_pages_dirty() - release and dirty an array of gup-pinned pages | 33 | * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages |
70 | * @pages: array of pages to be marked dirty and released. | 34 | * @pages: array of pages to be maybe marked dirty, and definitely released. |
71 | * @npages: number of pages in the @pages array. | 35 | * @npages: number of pages in the @pages array. |
36 | * @make_dirty: whether to mark the pages dirty | ||
72 | * | 37 | * |
73 | * "gup-pinned page" refers to a page that has had one of the get_user_pages() | 38 | * "gup-pinned page" refers to a page that has had one of the get_user_pages() |
74 | * variants called on that page. | 39 | * variants called on that page. |
75 | * | 40 | * |
76 | * For each page in the @pages array, make that page (or its head page, if a | 41 | * For each page in the @pages array, make that page (or its head page, if a |
77 | * compound page) dirty, if it was previously listed as clean. Then, release | 42 | * compound page) dirty, if @make_dirty is true, and if the page was previously |
78 | * the page using put_user_page(). | 43 | * listed as clean. In any case, releases all pages using put_user_page(), |
44 | * possibly via put_user_pages(), for the non-dirty case. | ||
79 | * | 45 | * |
80 | * Please see the put_user_page() documentation for details. | 46 | * Please see the put_user_page() documentation for details. |
81 | * | 47 | * |
82 | * set_page_dirty(), which does not lock the page, is used here. | 48 | * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is |
83 | * Therefore, it is the caller's responsibility to ensure that this is | 49 | * required, then the caller should a) verify that this is really correct, |
84 | * safe. If not, then put_user_pages_dirty_lock() should be called instead. | 50 | * because _lock() is usually required, and b) hand code it: |
51 | * set_page_dirty_lock(), put_user_page(). | ||
85 | * | 52 | * |
86 | */ | 53 | */ |
87 | void put_user_pages_dirty(struct page **pages, unsigned long npages) | 54 | void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, |
55 | bool make_dirty) | ||
88 | { | 56 | { |
89 | __put_user_pages_dirty(pages, npages, set_page_dirty); | 57 | unsigned long index; |
90 | } | ||
91 | EXPORT_SYMBOL(put_user_pages_dirty); | ||
92 | 58 | ||
93 | /** | 59 | /* |
94 | * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages | 60 | * TODO: this can be optimized for huge pages: if a series of pages is |
95 | * @pages: array of pages to be marked dirty and released. | 61 | * physically contiguous and part of the same compound page, then a |
96 | * @npages: number of pages in the @pages array. | 62 | * single operation to the head page should suffice. |
97 | * | 63 | */ |
98 | * For each page in the @pages array, make that page (or its head page, if a | 64 | |
99 | * compound page) dirty, if it was previously listed as clean. Then, release | 65 | if (!make_dirty) { |
100 | * the page using put_user_page(). | 66 | put_user_pages(pages, npages); |
101 | * | 67 | return; |
102 | * Please see the put_user_page() documentation for details. | 68 | } |
103 | * | 69 | |
104 | * This is just like put_user_pages_dirty(), except that it invokes | 70 | for (index = 0; index < npages; index++) { |
105 | * set_page_dirty_lock(), instead of set_page_dirty(). | 71 | struct page *page = compound_head(pages[index]); |
106 | * | 72 | /* |
107 | */ | 73 | * Checking PageDirty at this point may race with |
108 | void put_user_pages_dirty_lock(struct page **pages, unsigned long npages) | 74 | * clear_page_dirty_for_io(), but that's OK. Two key |
109 | { | 75 | * cases: |
110 | __put_user_pages_dirty(pages, npages, set_page_dirty_lock); | 76 | * |
77 | * 1) This code sees the page as already dirty, so it | ||
78 | * skips the call to set_page_dirty(). That could happen | ||
79 | * because clear_page_dirty_for_io() called | ||
80 | * page_mkclean(), followed by set_page_dirty(). | ||
81 | * However, now the page is going to get written back, | ||
82 | * which meets the original intention of setting it | ||
83 | * dirty, so all is well: clear_page_dirty_for_io() goes | ||
84 | * on to call TestClearPageDirty(), and write the page | ||
85 | * back. | ||
86 | * | ||
87 | * 2) This code sees the page as clean, so it calls | ||
88 | * set_page_dirty(). The page stays dirty, despite being | ||
89 | * written back, so it gets written back again in the | ||
90 | * next writeback cycle. This is harmless. | ||
91 | */ | ||
92 | if (!PageDirty(page)) | ||
93 | set_page_dirty_lock(page); | ||
94 | put_user_page(page); | ||
95 | } | ||
111 | } | 96 | } |
112 | EXPORT_SYMBOL(put_user_pages_dirty_lock); | 97 | EXPORT_SYMBOL(put_user_pages_dirty_lock); |
113 | 98 | ||
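With the two old helpers folded into one, a gup user's release path becomes a single call that takes a make_dirty flag. A condensed sketch of how a caller might use it; this assumes kernel context and is an illustrative fragment, not code from an in-tree driver:

    /* illustrative fragment, kernel context assumed (<linux/mm.h>, <linux/slab.h>) */
    static int demo_pin_and_release(unsigned long uaddr, int nr, bool wrote)
    {
            struct page **pages;
            int pinned;

            pages = kcalloc(nr, sizeof(*pages), GFP_KERNEL);
            if (!pages)
                    return -ENOMEM;

            pinned = get_user_pages_fast(uaddr, nr, wrote ? FOLL_WRITE : 0, pages);
            if (pinned > 0) {
                    /* ... hand the pages to hardware or copy data here ... */

                    /* release every pinned page, dirtying only if we wrote */
                    put_user_pages_dirty_lock(pages, pinned, wrote);
            }

            kfree(pages);
            return pinned < 0 ? pinned : 0;
    }

Passing false for make_dirty simply falls through to put_user_pages(), as the new implementation above shows.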
@@ -399,7 +384,7 @@ retry_locked: | |||
399 | spin_unlock(ptl); | 384 | spin_unlock(ptl); |
400 | return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); | 385 | return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); |
401 | } | 386 | } |
402 | if (flags & FOLL_SPLIT) { | 387 | if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) { |
403 | int ret; | 388 | int ret; |
404 | page = pmd_page(*pmd); | 389 | page = pmd_page(*pmd); |
405 | if (is_huge_zero_page(page)) { | 390 | if (is_huge_zero_page(page)) { |
@@ -408,7 +393,7 @@ retry_locked: | |||
408 | split_huge_pmd(vma, pmd, address); | 393 | split_huge_pmd(vma, pmd, address); |
409 | if (pmd_trans_unstable(pmd)) | 394 | if (pmd_trans_unstable(pmd)) |
410 | ret = -EBUSY; | 395 | ret = -EBUSY; |
411 | } else { | 396 | } else if (flags & FOLL_SPLIT) { |
412 | if (unlikely(!try_get_page(page))) { | 397 | if (unlikely(!try_get_page(page))) { |
413 | spin_unlock(ptl); | 398 | spin_unlock(ptl); |
414 | return ERR_PTR(-ENOMEM); | 399 | return ERR_PTR(-ENOMEM); |
@@ -420,6 +405,10 @@ retry_locked: | |||
420 | put_page(page); | 405 | put_page(page); |
421 | if (pmd_none(*pmd)) | 406 | if (pmd_none(*pmd)) |
422 | return no_page_table(vma, flags); | 407 | return no_page_table(vma, flags); |
408 | } else { /* flags & FOLL_SPLIT_PMD */ | ||
409 | spin_unlock(ptl); | ||
410 | split_huge_pmd(vma, pmd, address); | ||
411 | ret = pte_alloc(mm, pmd) ? -ENOMEM : 0; | ||
423 | } | 412 | } |
424 | 413 | ||
425 | return ret ? ERR_PTR(ret) : | 414 | return ret ? ERR_PTR(ret) : |
@@ -1460,7 +1449,7 @@ check_again: | |||
1460 | * gup may start from a tail page. Advance step by the left | 1449 | * gup may start from a tail page. Advance step by the left |
1461 | * part. | 1450 | * part. |
1462 | */ | 1451 | */ |
1463 | step = (1 << compound_order(head)) - (pages[i] - head); | 1452 | step = compound_nr(head) - (pages[i] - head); |
1464 | /* | 1453 | /* |
1465 | * If we get a page from the CMA zone, since we are going to | 1454 | * If we get a page from the CMA zone, since we are going to |
1466 | * be pinning these entries, we might as well move them out | 1455 | * be pinning these entries, we might as well move them out |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index de1f15969e27..73fc517c08d2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -496,11 +496,25 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | |||
496 | return pmd; | 496 | return pmd; |
497 | } | 497 | } |
498 | 498 | ||
499 | static inline struct list_head *page_deferred_list(struct page *page) | 499 | #ifdef CONFIG_MEMCG |
500 | static inline struct deferred_split *get_deferred_split_queue(struct page *page) | ||
500 | { | 501 | { |
501 | /* ->lru in the tail pages is occupied by compound_head. */ | 502 | struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; |
502 | return &page[2].deferred_list; | 503 | struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); |
504 | |||
505 | if (memcg) | ||
506 | return &memcg->deferred_split_queue; | ||
507 | else | ||
508 | return &pgdat->deferred_split_queue; | ||
509 | } | ||
510 | #else | ||
511 | static inline struct deferred_split *get_deferred_split_queue(struct page *page) | ||
512 | { | ||
513 | struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); | ||
514 | |||
515 | return &pgdat->deferred_split_queue; | ||
503 | } | 516 | } |
517 | #endif | ||
504 | 518 | ||
505 | void prep_transhuge_page(struct page *page) | 519 | void prep_transhuge_page(struct page *page) |
506 | { | 520 | { |
@@ -2497,6 +2511,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, | |||
2497 | struct page *head = compound_head(page); | 2511 | struct page *head = compound_head(page); |
2498 | pg_data_t *pgdat = page_pgdat(head); | 2512 | pg_data_t *pgdat = page_pgdat(head); |
2499 | struct lruvec *lruvec; | 2513 | struct lruvec *lruvec; |
2514 | struct address_space *swap_cache = NULL; | ||
2515 | unsigned long offset = 0; | ||
2500 | int i; | 2516 | int i; |
2501 | 2517 | ||
2502 | lruvec = mem_cgroup_page_lruvec(head, pgdat); | 2518 | lruvec = mem_cgroup_page_lruvec(head, pgdat); |
@@ -2504,6 +2520,14 @@ static void __split_huge_page(struct page *page, struct list_head *list, | |||
2504 | /* complete memcg works before add pages to LRU */ | 2520 | /* complete memcg works before add pages to LRU */ |
2505 | mem_cgroup_split_huge_fixup(head); | 2521 | mem_cgroup_split_huge_fixup(head); |
2506 | 2522 | ||
2523 | if (PageAnon(head) && PageSwapCache(head)) { | ||
2524 | swp_entry_t entry = { .val = page_private(head) }; | ||
2525 | |||
2526 | offset = swp_offset(entry); | ||
2527 | swap_cache = swap_address_space(entry); | ||
2528 | xa_lock(&swap_cache->i_pages); | ||
2529 | } | ||
2530 | |||
2507 | for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { | 2531 | for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { |
2508 | __split_huge_page_tail(head, i, lruvec, list); | 2532 | __split_huge_page_tail(head, i, lruvec, list); |
2509 | /* Some pages can be beyond i_size: drop them from page cache */ | 2533 | /* Some pages can be beyond i_size: drop them from page cache */ |
@@ -2513,6 +2537,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, | |||
2513 | if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) | 2537 | if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) |
2514 | shmem_uncharge(head->mapping->host, 1); | 2538 | shmem_uncharge(head->mapping->host, 1); |
2515 | put_page(head + i); | 2539 | put_page(head + i); |
2540 | } else if (!PageAnon(page)) { | ||
2541 | __xa_store(&head->mapping->i_pages, head[i].index, | ||
2542 | head + i, 0); | ||
2543 | } else if (swap_cache) { | ||
2544 | __xa_store(&swap_cache->i_pages, offset + i, | ||
2545 | head + i, 0); | ||
2516 | } | 2546 | } |
2517 | } | 2547 | } |
2518 | 2548 | ||
@@ -2523,10 +2553,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, | |||
2523 | /* See comment in __split_huge_page_tail() */ | 2553 | /* See comment in __split_huge_page_tail() */ |
2524 | if (PageAnon(head)) { | 2554 | if (PageAnon(head)) { |
2525 | /* Additional pin to swap cache */ | 2555 | /* Additional pin to swap cache */ |
2526 | if (PageSwapCache(head)) | 2556 | if (PageSwapCache(head)) { |
2527 | page_ref_add(head, 2); | 2557 | page_ref_add(head, 2); |
2528 | else | 2558 | xa_unlock(&swap_cache->i_pages); |
2559 | } else { | ||
2529 | page_ref_inc(head); | 2560 | page_ref_inc(head); |
2561 | } | ||
2530 | } else { | 2562 | } else { |
2531 | /* Additional pin to page cache */ | 2563 | /* Additional pin to page cache */ |
2532 | page_ref_add(head, 2); | 2564 | page_ref_add(head, 2); |
@@ -2673,6 +2705,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |||
2673 | { | 2705 | { |
2674 | struct page *head = compound_head(page); | 2706 | struct page *head = compound_head(page); |
2675 | struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); | 2707 | struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); |
2708 | struct deferred_split *ds_queue = get_deferred_split_queue(page); | ||
2676 | struct anon_vma *anon_vma = NULL; | 2709 | struct anon_vma *anon_vma = NULL; |
2677 | struct address_space *mapping = NULL; | 2710 | struct address_space *mapping = NULL; |
2678 | int count, mapcount, extra_pins, ret; | 2711 | int count, mapcount, extra_pins, ret; |
@@ -2759,17 +2792,17 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |||
2759 | } | 2792 | } |
2760 | 2793 | ||
2761 | /* Prevent deferred_split_scan() touching ->_refcount */ | 2794 | /* Prevent deferred_split_scan() touching ->_refcount */ |
2762 | spin_lock(&pgdata->split_queue_lock); | 2795 | spin_lock(&ds_queue->split_queue_lock); |
2763 | count = page_count(head); | 2796 | count = page_count(head); |
2764 | mapcount = total_mapcount(head); | 2797 | mapcount = total_mapcount(head); |
2765 | if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { | 2798 | if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { |
2766 | if (!list_empty(page_deferred_list(head))) { | 2799 | if (!list_empty(page_deferred_list(head))) { |
2767 | pgdata->split_queue_len--; | 2800 | ds_queue->split_queue_len--; |
2768 | list_del(page_deferred_list(head)); | 2801 | list_del(page_deferred_list(head)); |
2769 | } | 2802 | } |
2770 | if (mapping) | 2803 | if (mapping) |
2771 | __dec_node_page_state(page, NR_SHMEM_THPS); | 2804 | __dec_node_page_state(page, NR_SHMEM_THPS); |
2772 | spin_unlock(&pgdata->split_queue_lock); | 2805 | spin_unlock(&ds_queue->split_queue_lock); |
2773 | __split_huge_page(page, list, end, flags); | 2806 | __split_huge_page(page, list, end, flags); |
2774 | if (PageSwapCache(head)) { | 2807 | if (PageSwapCache(head)) { |
2775 | swp_entry_t entry = { .val = page_private(head) }; | 2808 | swp_entry_t entry = { .val = page_private(head) }; |
@@ -2786,7 +2819,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |||
2786 | dump_page(page, "total_mapcount(head) > 0"); | 2819 | dump_page(page, "total_mapcount(head) > 0"); |
2787 | BUG(); | 2820 | BUG(); |
2788 | } | 2821 | } |
2789 | spin_unlock(&pgdata->split_queue_lock); | 2822 | spin_unlock(&ds_queue->split_queue_lock); |
2790 | fail: if (mapping) | 2823 | fail: if (mapping) |
2791 | xa_unlock(&mapping->i_pages); | 2824 | xa_unlock(&mapping->i_pages); |
2792 | spin_unlock_irqrestore(&pgdata->lru_lock, flags); | 2825 | spin_unlock_irqrestore(&pgdata->lru_lock, flags); |
@@ -2808,53 +2841,86 @@ out: | |||
2808 | 2841 | ||
2809 | void free_transhuge_page(struct page *page) | 2842 | void free_transhuge_page(struct page *page) |
2810 | { | 2843 | { |
2811 | struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); | 2844 | struct deferred_split *ds_queue = get_deferred_split_queue(page); |
2812 | unsigned long flags; | 2845 | unsigned long flags; |
2813 | 2846 | ||
2814 | spin_lock_irqsave(&pgdata->split_queue_lock, flags); | 2847 | spin_lock_irqsave(&ds_queue->split_queue_lock, flags); |
2815 | if (!list_empty(page_deferred_list(page))) { | 2848 | if (!list_empty(page_deferred_list(page))) { |
2816 | pgdata->split_queue_len--; | 2849 | ds_queue->split_queue_len--; |
2817 | list_del(page_deferred_list(page)); | 2850 | list_del(page_deferred_list(page)); |
2818 | } | 2851 | } |
2819 | spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); | 2852 | spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); |
2820 | free_compound_page(page); | 2853 | free_compound_page(page); |
2821 | } | 2854 | } |
2822 | 2855 | ||
2823 | void deferred_split_huge_page(struct page *page) | 2856 | void deferred_split_huge_page(struct page *page) |
2824 | { | 2857 | { |
2825 | struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); | 2858 | struct deferred_split *ds_queue = get_deferred_split_queue(page); |
2859 | #ifdef CONFIG_MEMCG | ||
2860 | struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; | ||
2861 | #endif | ||
2826 | unsigned long flags; | 2862 | unsigned long flags; |
2827 | 2863 | ||
2828 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | 2864 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
2829 | 2865 | ||
2830 | spin_lock_irqsave(&pgdata->split_queue_lock, flags); | 2866 | /* |
2867 | * The try_to_unmap() in page reclaim path might reach here too, | ||
2868 | * this may cause a race condition to corrupt deferred split queue. | ||
2869 | * And, if page reclaim is already handling the same page, it is | ||
2870 | * unnecessary to handle it again in shrinker. | ||
2871 | * | ||
2872 | * Check PageSwapCache to determine if the page is being | ||
2873 | * handled by page reclaim since THP swap would add the page into | ||
2874 | * swap cache before calling try_to_unmap(). | ||
2875 | */ | ||
2876 | if (PageSwapCache(page)) | ||
2877 | return; | ||
2878 | |||
2879 | spin_lock_irqsave(&ds_queue->split_queue_lock, flags); | ||
2831 | if (list_empty(page_deferred_list(page))) { | 2880 | if (list_empty(page_deferred_list(page))) { |
2832 | count_vm_event(THP_DEFERRED_SPLIT_PAGE); | 2881 | count_vm_event(THP_DEFERRED_SPLIT_PAGE); |
2833 | list_add_tail(page_deferred_list(page), &pgdata->split_queue); | 2882 | list_add_tail(page_deferred_list(page), &ds_queue->split_queue); |
2834 | pgdata->split_queue_len++; | 2883 | ds_queue->split_queue_len++; |
2884 | #ifdef CONFIG_MEMCG | ||
2885 | if (memcg) | ||
2886 | memcg_set_shrinker_bit(memcg, page_to_nid(page), | ||
2887 | deferred_split_shrinker.id); | ||
2888 | #endif | ||
2835 | } | 2889 | } |
2836 | spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); | 2890 | spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); |
2837 | } | 2891 | } |
2838 | 2892 | ||
2839 | static unsigned long deferred_split_count(struct shrinker *shrink, | 2893 | static unsigned long deferred_split_count(struct shrinker *shrink, |
2840 | struct shrink_control *sc) | 2894 | struct shrink_control *sc) |
2841 | { | 2895 | { |
2842 | struct pglist_data *pgdata = NODE_DATA(sc->nid); | 2896 | struct pglist_data *pgdata = NODE_DATA(sc->nid); |
2843 | return READ_ONCE(pgdata->split_queue_len); | 2897 | struct deferred_split *ds_queue = &pgdata->deferred_split_queue; |
2898 | |||
2899 | #ifdef CONFIG_MEMCG | ||
2900 | if (sc->memcg) | ||
2901 | ds_queue = &sc->memcg->deferred_split_queue; | ||
2902 | #endif | ||
2903 | return READ_ONCE(ds_queue->split_queue_len); | ||
2844 | } | 2904 | } |
2845 | 2905 | ||
2846 | static unsigned long deferred_split_scan(struct shrinker *shrink, | 2906 | static unsigned long deferred_split_scan(struct shrinker *shrink, |
2847 | struct shrink_control *sc) | 2907 | struct shrink_control *sc) |
2848 | { | 2908 | { |
2849 | struct pglist_data *pgdata = NODE_DATA(sc->nid); | 2909 | struct pglist_data *pgdata = NODE_DATA(sc->nid); |
2910 | struct deferred_split *ds_queue = &pgdata->deferred_split_queue; | ||
2850 | unsigned long flags; | 2911 | unsigned long flags; |
2851 | LIST_HEAD(list), *pos, *next; | 2912 | LIST_HEAD(list), *pos, *next; |
2852 | struct page *page; | 2913 | struct page *page; |
2853 | int split = 0; | 2914 | int split = 0; |
2854 | 2915 | ||
2855 | spin_lock_irqsave(&pgdata->split_queue_lock, flags); | 2916 | #ifdef CONFIG_MEMCG |
2917 | if (sc->memcg) | ||
2918 | ds_queue = &sc->memcg->deferred_split_queue; | ||
2919 | #endif | ||
2920 | |||
2921 | spin_lock_irqsave(&ds_queue->split_queue_lock, flags); | ||
2856 | /* Take pin on all head pages to avoid freeing them under us */ | 2922 | /* Take pin on all head pages to avoid freeing them under us */ |
2857 | list_for_each_safe(pos, next, &pgdata->split_queue) { | 2923 | list_for_each_safe(pos, next, &ds_queue->split_queue) { |
2858 | page = list_entry((void *)pos, struct page, mapping); | 2924 | page = list_entry((void *)pos, struct page, mapping); |
2859 | page = compound_head(page); | 2925 | page = compound_head(page); |
2860 | if (get_page_unless_zero(page)) { | 2926 | if (get_page_unless_zero(page)) { |
@@ -2862,12 +2928,12 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, | |||
2862 | } else { | 2928 | } else { |
2863 | /* We lost race with put_compound_page() */ | 2929 | /* We lost race with put_compound_page() */ |
2864 | list_del_init(page_deferred_list(page)); | 2930 | list_del_init(page_deferred_list(page)); |
2865 | pgdata->split_queue_len--; | 2931 | ds_queue->split_queue_len--; |
2866 | } | 2932 | } |
2867 | if (!--sc->nr_to_scan) | 2933 | if (!--sc->nr_to_scan) |
2868 | break; | 2934 | break; |
2869 | } | 2935 | } |
2870 | spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); | 2936 | spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); |
2871 | 2937 | ||
2872 | list_for_each_safe(pos, next, &list) { | 2938 | list_for_each_safe(pos, next, &list) { |
2873 | page = list_entry((void *)pos, struct page, mapping); | 2939 | page = list_entry((void *)pos, struct page, mapping); |
@@ -2881,15 +2947,15 @@ next: | |||
2881 | put_page(page); | 2947 | put_page(page); |
2882 | } | 2948 | } |
2883 | 2949 | ||
2884 | spin_lock_irqsave(&pgdata->split_queue_lock, flags); | 2950 | spin_lock_irqsave(&ds_queue->split_queue_lock, flags); |
2885 | list_splice_tail(&list, &pgdata->split_queue); | 2951 | list_splice_tail(&list, &ds_queue->split_queue); |
2886 | spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); | 2952 | spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); |
2887 | 2953 | ||
2888 | /* | 2954 | /* |
2889 | * Stop shrinker if we didn't split any page, but the queue is empty. | 2955 | * Stop shrinker if we didn't split any page, but the queue is empty. |
2890 | * This can happen if pages were freed under us. | 2956 | * This can happen if pages were freed under us. |
2891 | */ | 2957 | */ |
2892 | if (!split && list_empty(&pgdata->split_queue)) | 2958 | if (!split && list_empty(&ds_queue->split_queue)) |
2893 | return SHRINK_STOP; | 2959 | return SHRINK_STOP; |
2894 | return split; | 2960 | return split; |
2895 | } | 2961 | } |
@@ -2898,7 +2964,8 @@ static struct shrinker deferred_split_shrinker = { | |||
2898 | .count_objects = deferred_split_count, | 2964 | .count_objects = deferred_split_count, |
2899 | .scan_objects = deferred_split_scan, | 2965 | .scan_objects = deferred_split_scan, |
2900 | .seeks = DEFAULT_SEEKS, | 2966 | .seeks = DEFAULT_SEEKS, |
2901 | .flags = SHRINKER_NUMA_AWARE, | 2967 | .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | |
2968 | SHRINKER_NONSLAB, | ||
2902 | }; | 2969 | }; |
2903 | 2970 | ||
2904 | #ifdef CONFIG_DEBUG_FS | 2971 | #ifdef CONFIG_DEBUG_FS |
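The deferred-split shrinker now sets SHRINKER_MEMCG_AWARE (plus the new SHRINKER_NONSLAB flag), so its callbacks run per cgroup and receive the target memcg in shrink_control, falling back to the per-node queue when none is given. A stripped-down sketch of that callback shape; kernel context is assumed and the demo_* names and queue type are hypothetical, not the kernel's:

    /* illustrative sketch, kernel context assumed; demo_* names are made up */
    struct demo_queue { unsigned long len; };

    static struct demo_queue demo_node_queues[MAX_NUMNODES];

    /* a per-memcg queue would normally hang off struct mem_cgroup; this
     * sketch keeps only the per-node fallback to stay short */
    static struct demo_queue *demo_pick_queue(struct shrink_control *sc)
    {
            return &demo_node_queues[sc->nid];
    }

    static unsigned long demo_count(struct shrinker *shrink,
                                    struct shrink_control *sc)
    {
            return READ_ONCE(demo_pick_queue(sc)->len);
    }

    static unsigned long demo_scan(struct shrinker *shrink,
                                   struct shrink_control *sc)
    {
            /* drain up to sc->nr_to_scan entries from the queue here */
            return 0;
    }

    static struct shrinker demo_shrinker = {
            .count_objects  = demo_count,
            .scan_objects   = demo_scan,
            .seeks          = DEFAULT_SEEKS,
            .flags          = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
                              SHRINKER_NONSLAB,
    };

A register_shrinker(&demo_shrinker) call would wire this up; memcg_set_shrinker_bit(), as used in deferred_split_huge_page() above, is what marks a cgroup as having work for such a shrinker.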
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6d7296dd11b8..ef37c85423a5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -1405,12 +1405,25 @@ pgoff_t __basepage_index(struct page *page) | |||
1405 | } | 1405 | } |
1406 | 1406 | ||
1407 | static struct page *alloc_buddy_huge_page(struct hstate *h, | 1407 | static struct page *alloc_buddy_huge_page(struct hstate *h, |
1408 | gfp_t gfp_mask, int nid, nodemask_t *nmask) | 1408 | gfp_t gfp_mask, int nid, nodemask_t *nmask, |
1409 | nodemask_t *node_alloc_noretry) | ||
1409 | { | 1410 | { |
1410 | int order = huge_page_order(h); | 1411 | int order = huge_page_order(h); |
1411 | struct page *page; | 1412 | struct page *page; |
1413 | bool alloc_try_hard = true; | ||
1412 | 1414 | ||
1413 | gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; | 1415 | /* |
1416 | * By default we always try hard to allocate the page with | ||
1417 | * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in | ||
1418 | * a loop (to adjust global huge page counts) and previous allocation | ||
1419 | * failed, do not continue to try hard on the same node. Use the | ||
1420 | * node_alloc_noretry bitmap to manage this state information. | ||
1421 | */ | ||
1422 | if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry)) | ||
1423 | alloc_try_hard = false; | ||
1424 | gfp_mask |= __GFP_COMP|__GFP_NOWARN; | ||
1425 | if (alloc_try_hard) | ||
1426 | gfp_mask |= __GFP_RETRY_MAYFAIL; | ||
1414 | if (nid == NUMA_NO_NODE) | 1427 | if (nid == NUMA_NO_NODE) |
1415 | nid = numa_mem_id(); | 1428 | nid = numa_mem_id(); |
1416 | page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); | 1429 | page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); |
@@ -1419,6 +1432,22 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, | |||
1419 | else | 1432 | else |
1420 | __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); | 1433 | __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); |
1421 | 1434 | ||
1435 | /* | ||
1436 | * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this | ||
1437 | * indicates an overall state change. Clear bit so that we resume | ||
1438 | * normal 'try hard' allocations. | ||
1439 | */ | ||
1440 | if (node_alloc_noretry && page && !alloc_try_hard) | ||
1441 | node_clear(nid, *node_alloc_noretry); | ||
1442 | |||
1443 | /* | ||
1444 | * If we tried hard to get a page but failed, set bit so that | ||
1445 | * subsequent attempts will not try as hard until there is an | ||
1446 | * overall state change. | ||
1447 | */ | ||
1448 | if (node_alloc_noretry && !page && alloc_try_hard) | ||
1449 | node_set(nid, *node_alloc_noretry); | ||
1450 | |||
1422 | return page; | 1451 | return page; |
1423 | } | 1452 | } |
1424 | 1453 | ||
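The node_alloc_noretry mask implements a simple per-node backoff: once a hard (__GFP_RETRY_MAYFAIL) attempt fails on a node, later attempts on that node skip the expensive retries until an allocation succeeds there again. A toy model of just that state machine, in plain C:

    #include <stdbool.h>
    #include <stdio.h>

    #define TOY_NODES 4

    /* per-node "do not try hard" bits (models the node_alloc_noretry mask) */
    static bool noretry[TOY_NODES];

    /* one attempt; try_hard models adding __GFP_RETRY_MAYFAIL to the gfp mask */
    static bool alloc_on_node(int node, bool node_has_memory)
    {
            bool try_hard = !noretry[node];
            bool ok = node_has_memory;       /* stand-in for the real allocator */

            if (!ok && try_hard)
                    noretry[node] = true;    /* costly attempt failed: back off */
            if (ok && !try_hard)
                    noretry[node] = false;   /* cheap attempt worked: try hard again */

            printf("node %d: %s attempt -> %s\n", node,
                   try_hard ? "hard" : "soft", ok ? "ok" : "fail");
            return ok;
    }

    int main(void)
    {
            alloc_on_node(0, false);   /* hard attempt fails, node 0 backs off */
            alloc_on_node(0, false);   /* subsequent attempts stay cheap       */
            alloc_on_node(0, true);    /* a success clears the backoff bit     */
            alloc_on_node(0, false);   /* so the next attempt tries hard again */
            return 0;
    }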
@@ -1427,7 +1456,8 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, | |||
1427 | * should use this function to get new hugetlb pages | 1456 | * should use this function to get new hugetlb pages |
1428 | */ | 1457 | */ |
1429 | static struct page *alloc_fresh_huge_page(struct hstate *h, | 1458 | static struct page *alloc_fresh_huge_page(struct hstate *h, |
1430 | gfp_t gfp_mask, int nid, nodemask_t *nmask) | 1459 | gfp_t gfp_mask, int nid, nodemask_t *nmask, |
1460 | nodemask_t *node_alloc_noretry) | ||
1431 | { | 1461 | { |
1432 | struct page *page; | 1462 | struct page *page; |
1433 | 1463 | ||
@@ -1435,7 +1465,7 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, | |||
1435 | page = alloc_gigantic_page(h, gfp_mask, nid, nmask); | 1465 | page = alloc_gigantic_page(h, gfp_mask, nid, nmask); |
1436 | else | 1466 | else |
1437 | page = alloc_buddy_huge_page(h, gfp_mask, | 1467 | page = alloc_buddy_huge_page(h, gfp_mask, |
1438 | nid, nmask); | 1468 | nid, nmask, node_alloc_noretry); |
1439 | if (!page) | 1469 | if (!page) |
1440 | return NULL; | 1470 | return NULL; |
1441 | 1471 | ||
@@ -1450,14 +1480,16 @@ static struct page *alloc_fresh_huge_page(struct hstate *h, | |||
1450 | * Allocates a fresh page to the hugetlb allocator pool in the node interleaved | 1480 | * Allocates a fresh page to the hugetlb allocator pool in the node interleaved |
1451 | * manner. | 1481 | * manner. |
1452 | */ | 1482 | */ |
1453 | static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) | 1483 | static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, |
1484 | nodemask_t *node_alloc_noretry) | ||
1454 | { | 1485 | { |
1455 | struct page *page; | 1486 | struct page *page; |
1456 | int nr_nodes, node; | 1487 | int nr_nodes, node; |
1457 | gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; | 1488 | gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; |
1458 | 1489 | ||
1459 | for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { | 1490 | for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { |
1460 | page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed); | 1491 | page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, |
1492 | node_alloc_noretry); | ||
1461 | if (page) | 1493 | if (page) |
1462 | break; | 1494 | break; |
1463 | } | 1495 | } |
@@ -1601,7 +1633,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, | |||
1601 | goto out_unlock; | 1633 | goto out_unlock; |
1602 | spin_unlock(&hugetlb_lock); | 1634 | spin_unlock(&hugetlb_lock); |
1603 | 1635 | ||
1604 | page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); | 1636 | page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); |
1605 | if (!page) | 1637 | if (!page) |
1606 | return NULL; | 1638 | return NULL; |
1607 | 1639 | ||
@@ -1637,7 +1669,7 @@ struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, | |||
1637 | if (hstate_is_gigantic(h)) | 1669 | if (hstate_is_gigantic(h)) |
1638 | return NULL; | 1670 | return NULL; |
1639 | 1671 | ||
1640 | page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); | 1672 | page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); |
1641 | if (!page) | 1673 | if (!page) |
1642 | return NULL; | 1674 | return NULL; |
1643 | 1675 | ||
@@ -2207,13 +2239,33 @@ static void __init gather_bootmem_prealloc(void) | |||
2207 | static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | 2239 | static void __init hugetlb_hstate_alloc_pages(struct hstate *h) |
2208 | { | 2240 | { |
2209 | unsigned long i; | 2241 | unsigned long i; |
2242 | nodemask_t *node_alloc_noretry; | ||
2243 | |||
2244 | if (!hstate_is_gigantic(h)) { | ||
2245 | /* | ||
2246 | * Bit mask controlling how hard we retry per-node allocations. | ||
2247 | * Ignore errors as lower level routines can deal with | ||
2248 | * node_alloc_noretry == NULL. If this kmalloc fails at boot | ||
2249 | * time, we are likely in bigger trouble. | ||
2250 | */ | ||
2251 | node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry), | ||
2252 | GFP_KERNEL); | ||
2253 | } else { | ||
2254 | /* allocations done at boot time */ | ||
2255 | node_alloc_noretry = NULL; | ||
2256 | } | ||
2257 | |||
2258 | /* bit mask controlling how hard we retry per-node allocations */ | ||
2259 | if (node_alloc_noretry) | ||
2260 | nodes_clear(*node_alloc_noretry); | ||
2210 | 2261 | ||
2211 | for (i = 0; i < h->max_huge_pages; ++i) { | 2262 | for (i = 0; i < h->max_huge_pages; ++i) { |
2212 | if (hstate_is_gigantic(h)) { | 2263 | if (hstate_is_gigantic(h)) { |
2213 | if (!alloc_bootmem_huge_page(h)) | 2264 | if (!alloc_bootmem_huge_page(h)) |
2214 | break; | 2265 | break; |
2215 | } else if (!alloc_pool_huge_page(h, | 2266 | } else if (!alloc_pool_huge_page(h, |
2216 | &node_states[N_MEMORY])) | 2267 | &node_states[N_MEMORY], |
2268 | node_alloc_noretry)) | ||
2217 | break; | 2269 | break; |
2218 | cond_resched(); | 2270 | cond_resched(); |
2219 | } | 2271 | } |
@@ -2225,6 +2277,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |||
2225 | h->max_huge_pages, buf, i); | 2277 | h->max_huge_pages, buf, i); |
2226 | h->max_huge_pages = i; | 2278 | h->max_huge_pages = i; |
2227 | } | 2279 | } |
2280 | |||
2281 | kfree(node_alloc_noretry); | ||
2228 | } | 2282 | } |
2229 | 2283 | ||
2230 | static void __init hugetlb_init_hstates(void) | 2284 | static void __init hugetlb_init_hstates(void) |
@@ -2323,6 +2377,17 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, | |||
2323 | nodemask_t *nodes_allowed) | 2377 | nodemask_t *nodes_allowed) |
2324 | { | 2378 | { |
2325 | unsigned long min_count, ret; | 2379 | unsigned long min_count, ret; |
2380 | NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); | ||
2381 | |||
2382 | /* | ||
2383 | * Bit mask controlling how hard we retry per-node allocations. | ||
2384 | * If we can not allocate the bit mask, do not attempt to allocate | ||
2385 | * the requested huge pages. | ||
2386 | */ | ||
2387 | if (node_alloc_noretry) | ||
2388 | nodes_clear(*node_alloc_noretry); | ||
2389 | else | ||
2390 | return -ENOMEM; | ||
2326 | 2391 | ||
2327 | spin_lock(&hugetlb_lock); | 2392 | spin_lock(&hugetlb_lock); |
2328 | 2393 | ||
@@ -2356,6 +2421,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, | |||
2356 | if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { | 2421 | if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { |
2357 | if (count > persistent_huge_pages(h)) { | 2422 | if (count > persistent_huge_pages(h)) { |
2358 | spin_unlock(&hugetlb_lock); | 2423 | spin_unlock(&hugetlb_lock); |
2424 | NODEMASK_FREE(node_alloc_noretry); | ||
2359 | return -EINVAL; | 2425 | return -EINVAL; |
2360 | } | 2426 | } |
2361 | /* Fall through to decrease pool */ | 2427 | /* Fall through to decrease pool */ |
@@ -2388,7 +2454,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, | |||
2388 | /* yield cpu to avoid soft lockup */ | 2454 | /* yield cpu to avoid soft lockup */ |
2389 | cond_resched(); | 2455 | cond_resched(); |
2390 | 2456 | ||
2391 | ret = alloc_pool_huge_page(h, nodes_allowed); | 2457 | ret = alloc_pool_huge_page(h, nodes_allowed, |
2458 | node_alloc_noretry); | ||
2392 | spin_lock(&hugetlb_lock); | 2459 | spin_lock(&hugetlb_lock); |
2393 | if (!ret) | 2460 | if (!ret) |
2394 | goto out; | 2461 | goto out; |
@@ -2429,6 +2496,8 @@ out: | |||
2429 | h->max_huge_pages = persistent_huge_pages(h); | 2496 | h->max_huge_pages = persistent_huge_pages(h); |
2430 | spin_unlock(&hugetlb_lock); | 2497 | spin_unlock(&hugetlb_lock); |
2431 | 2498 | ||
2499 | NODEMASK_FREE(node_alloc_noretry); | ||
2500 | |||
2432 | return 0; | 2501 | return 0; |
2433 | } | 2502 | } |
2434 | 2503 | ||
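The hugetlb hunks above thread a per-node "no retry" bitmap (node_alloc_noretry) through the pool-allocation paths: when a hard allocation fails on a node, its bit is set so later rounds do not pay for the expensive retry again, and a NULL mask is tolerated. The following is a minimal userspace sketch of that pattern; the types, helpers, and failure model are illustrative stand-ins, not the kernel's.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_NODES 4

struct nodemask { unsigned long bits; };    /* toy stand-in for nodemask_t */

static bool node_isset(const struct nodemask *m, int nid)
{
    return m && (m->bits & (1UL << nid));
}

static void node_set(struct nodemask *m, int nid)
{
    if (m)                                   /* NULL mask is tolerated */
        m->bits |= 1UL << nid;
}

/* Pretend nodes 0 and 1 have no contiguous memory left. */
static bool try_alloc_on_node(int nid, bool retry_hard)
{
    (void)retry_hard;
    return nid >= 2;
}

/* One pass over the nodes, loosely like alloc_pool_huge_page(). */
static int alloc_pool_page(struct nodemask *noretry)
{
    for (int nid = 0; nid < MAX_NODES; nid++) {
        bool retry_hard = !node_isset(noretry, nid);

        if (!retry_hard)
            printf("  node %d: earlier failure recorded, not retrying hard\n", nid);
        if (try_alloc_on_node(nid, retry_hard))
            return nid;
        node_set(noretry, nid);              /* remember the failure */
    }
    return -1;
}

int main(void)
{
    struct nodemask *noretry = calloc(1, sizeof(*noretry));

    for (int round = 0; round < 2; round++) {
        printf("round %d:\n", round);
        int nid = alloc_pool_page(noretry);
        printf("  allocated on node %d, noretry mask %#lx\n",
               nid, noretry ? noretry->bits : 0UL);
    }
    free(noretry);
    return 0;
}

In the second round the mask already records the failures from the first one, which is exactly the cost the kernel change is trying to avoid repaying on every allocation attempt.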
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 68c2f2f3c05b..f1930fa0b445 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c | |||
@@ -139,7 +139,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, | |||
139 | if (!page_hcg || page_hcg != h_cg) | 139 | if (!page_hcg || page_hcg != h_cg) |
140 | goto out; | 140 | goto out; |
141 | 141 | ||
142 | nr_pages = 1 << compound_order(page); | 142 | nr_pages = compound_nr(page); |
143 | if (!parent) { | 143 | if (!parent) { |
144 | parent = root_h_cgroup; | 144 | parent = root_h_cgroup; |
145 | /* root has no limit */ | 145 | /* root has no limit */ |
diff --git a/mm/init-mm.c b/mm/init-mm.c index a787a319211e..fb1e15028ef0 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c | |||
@@ -35,6 +35,6 @@ struct mm_struct init_mm = { | |||
35 | .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), | 35 | .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), |
36 | .mmlist = LIST_HEAD_INIT(init_mm.mmlist), | 36 | .mmlist = LIST_HEAD_INIT(init_mm.mmlist), |
37 | .user_ns = &init_user_ns, | 37 | .user_ns = &init_user_ns, |
38 | .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, | 38 | .cpu_bitmap = CPU_BITS_NONE, |
39 | INIT_MM_CONTEXT(init_mm) | 39 | INIT_MM_CONTEXT(init_mm) |
40 | }; | 40 | }; |
diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 95d16a42db6b..6814d6d6a023 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c | |||
@@ -304,7 +304,6 @@ size_t kasan_metadata_size(struct kmem_cache *cache) | |||
304 | struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, | 304 | struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, |
305 | const void *object) | 305 | const void *object) |
306 | { | 306 | { |
307 | BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); | ||
308 | return (void *)object + cache->kasan_info.alloc_meta_offset; | 307 | return (void *)object + cache->kasan_info.alloc_meta_offset; |
309 | } | 308 | } |
310 | 309 | ||
@@ -315,14 +314,31 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache, | |||
315 | return (void *)object + cache->kasan_info.free_meta_offset; | 314 | return (void *)object + cache->kasan_info.free_meta_offset; |
316 | } | 315 | } |
317 | 316 | ||
317 | |||
318 | static void kasan_set_free_info(struct kmem_cache *cache, | ||
319 | void *object, u8 tag) | ||
320 | { | ||
321 | struct kasan_alloc_meta *alloc_meta; | ||
322 | u8 idx = 0; | ||
323 | |||
324 | alloc_meta = get_alloc_info(cache, object); | ||
325 | |||
326 | #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY | ||
327 | idx = alloc_meta->free_track_idx; | ||
328 | alloc_meta->free_pointer_tag[idx] = tag; | ||
329 | alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; | ||
330 | #endif | ||
331 | |||
332 | set_track(&alloc_meta->free_track[idx], GFP_NOWAIT); | ||
333 | } | ||
334 | |||
318 | void kasan_poison_slab(struct page *page) | 335 | void kasan_poison_slab(struct page *page) |
319 | { | 336 | { |
320 | unsigned long i; | 337 | unsigned long i; |
321 | 338 | ||
322 | for (i = 0; i < (1 << compound_order(page)); i++) | 339 | for (i = 0; i < compound_nr(page); i++) |
323 | page_kasan_tag_reset(page + i); | 340 | page_kasan_tag_reset(page + i); |
324 | kasan_poison_shadow(page_address(page), | 341 | kasan_poison_shadow(page_address(page), page_size(page), |
325 | PAGE_SIZE << compound_order(page), | ||
326 | KASAN_KMALLOC_REDZONE); | 342 | KASAN_KMALLOC_REDZONE); |
327 | } | 343 | } |
328 | 344 | ||
@@ -452,7 +468,8 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object, | |||
452 | unlikely(!(cache->flags & SLAB_KASAN))) | 468 | unlikely(!(cache->flags & SLAB_KASAN))) |
453 | return false; | 469 | return false; |
454 | 470 | ||
455 | set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT); | 471 | kasan_set_free_info(cache, object, tag); |
472 | |||
456 | quarantine_put(get_free_info(cache, object), cache); | 473 | quarantine_put(get_free_info(cache, object), cache); |
457 | 474 | ||
458 | return IS_ENABLED(CONFIG_KASAN_GENERIC); | 475 | return IS_ENABLED(CONFIG_KASAN_GENERIC); |
@@ -524,7 +541,7 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, | |||
524 | page = virt_to_page(ptr); | 541 | page = virt_to_page(ptr); |
525 | redzone_start = round_up((unsigned long)(ptr + size), | 542 | redzone_start = round_up((unsigned long)(ptr + size), |
526 | KASAN_SHADOW_SCALE_SIZE); | 543 | KASAN_SHADOW_SCALE_SIZE); |
527 | redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page)); | 544 | redzone_end = (unsigned long)ptr + page_size(page); |
528 | 545 | ||
529 | kasan_unpoison_shadow(ptr, size); | 546 | kasan_unpoison_shadow(ptr, size); |
530 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, | 547 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, |
@@ -560,8 +577,7 @@ void kasan_poison_kfree(void *ptr, unsigned long ip) | |||
560 | kasan_report_invalid_free(ptr, ip); | 577 | kasan_report_invalid_free(ptr, ip); |
561 | return; | 578 | return; |
562 | } | 579 | } |
563 | kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), | 580 | kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE); |
564 | KASAN_FREE_PAGE); | ||
565 | } else { | 581 | } else { |
566 | __kasan_slab_free(page->slab_cache, ptr, ip, false); | 582 | __kasan_slab_free(page->slab_cache, ptr, ip, false); |
567 | } | 583 | } |
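Several hunks in this series, including the kasan/common.c changes just above, replace the open-coded expressions 1 << compound_order(page) and PAGE_SIZE << compound_order(page) with the compound_nr() and page_size() helpers. Below is a small userspace sketch of the equivalence the substitution relies on; struct page and the helpers here are toy stand-ins, on the assumption (implied by the one-for-one replacements above) that the helpers are thin wrappers over compound_order().

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

struct page { unsigned int order; };        /* toy stand-in for struct page */

static unsigned int compound_order(const struct page *page)
{
    return page->order;
}

/* Toy equivalents of the helpers used by the new code above. */
static unsigned long compound_nr(const struct page *page)
{
    return 1UL << compound_order(page);
}

static unsigned long page_size(const struct page *page)
{
    return PAGE_SIZE << compound_order(page);
}

int main(void)
{
    struct page thp = { .order = 9 };       /* a 2 MiB compound page with 4 KiB pages */

    /* old expression == new helper, for both replacements */
    assert((1UL << compound_order(&thp)) == compound_nr(&thp));
    assert((PAGE_SIZE << compound_order(&thp)) == page_size(&thp));

    printf("pages per compound page: %lu, bytes: %lu\n",
           compound_nr(&thp), page_size(&thp));
    return 0;
}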
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 014f19e76247..35cff6bbb716 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h | |||
@@ -95,9 +95,19 @@ struct kasan_track { | |||
95 | depot_stack_handle_t stack; | 95 | depot_stack_handle_t stack; |
96 | }; | 96 | }; |
97 | 97 | ||
98 | #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY | ||
99 | #define KASAN_NR_FREE_STACKS 5 | ||
100 | #else | ||
101 | #define KASAN_NR_FREE_STACKS 1 | ||
102 | #endif | ||
103 | |||
98 | struct kasan_alloc_meta { | 104 | struct kasan_alloc_meta { |
99 | struct kasan_track alloc_track; | 105 | struct kasan_track alloc_track; |
100 | struct kasan_track free_track; | 106 | struct kasan_track free_track[KASAN_NR_FREE_STACKS]; |
107 | #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY | ||
108 | u8 free_pointer_tag[KASAN_NR_FREE_STACKS]; | ||
109 | u8 free_track_idx; | ||
110 | #endif | ||
101 | }; | 111 | }; |
102 | 112 | ||
103 | struct qlist_node { | 113 | struct qlist_node { |
@@ -146,6 +156,8 @@ void kasan_report(unsigned long addr, size_t size, | |||
146 | bool is_write, unsigned long ip); | 156 | bool is_write, unsigned long ip); |
147 | void kasan_report_invalid_free(void *object, unsigned long ip); | 157 | void kasan_report_invalid_free(void *object, unsigned long ip); |
148 | 158 | ||
159 | struct page *kasan_addr_to_page(const void *addr); | ||
160 | |||
149 | #if defined(CONFIG_KASAN_GENERIC) && \ | 161 | #if defined(CONFIG_KASAN_GENERIC) && \ |
150 | (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) | 162 | (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) |
151 | void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); | 163 | void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); |
diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 0e5f965f1882..621782100eaa 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c | |||
@@ -111,7 +111,7 @@ static void print_track(struct kasan_track *track, const char *prefix) | |||
111 | } | 111 | } |
112 | } | 112 | } |
113 | 113 | ||
114 | static struct page *addr_to_page(const void *addr) | 114 | struct page *kasan_addr_to_page(const void *addr) |
115 | { | 115 | { |
116 | if ((addr >= (void *)PAGE_OFFSET) && | 116 | if ((addr >= (void *)PAGE_OFFSET) && |
117 | (addr < high_memory)) | 117 | (addr < high_memory)) |
@@ -151,15 +151,38 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, | |||
151 | (void *)(object_addr + cache->object_size)); | 151 | (void *)(object_addr + cache->object_size)); |
152 | } | 152 | } |
153 | 153 | ||
154 | static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, | ||
155 | void *object, u8 tag) | ||
156 | { | ||
157 | struct kasan_alloc_meta *alloc_meta; | ||
158 | int i = 0; | ||
159 | |||
160 | alloc_meta = get_alloc_info(cache, object); | ||
161 | |||
162 | #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY | ||
163 | for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { | ||
164 | if (alloc_meta->free_pointer_tag[i] == tag) | ||
165 | break; | ||
166 | } | ||
167 | if (i == KASAN_NR_FREE_STACKS) | ||
168 | i = alloc_meta->free_track_idx; | ||
169 | #endif | ||
170 | |||
171 | return &alloc_meta->free_track[i]; | ||
172 | } | ||
173 | |||
154 | static void describe_object(struct kmem_cache *cache, void *object, | 174 | static void describe_object(struct kmem_cache *cache, void *object, |
155 | const void *addr) | 175 | const void *addr, u8 tag) |
156 | { | 176 | { |
157 | struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); | 177 | struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); |
158 | 178 | ||
159 | if (cache->flags & SLAB_KASAN) { | 179 | if (cache->flags & SLAB_KASAN) { |
180 | struct kasan_track *free_track; | ||
181 | |||
160 | print_track(&alloc_info->alloc_track, "Allocated"); | 182 | print_track(&alloc_info->alloc_track, "Allocated"); |
161 | pr_err("\n"); | 183 | pr_err("\n"); |
162 | print_track(&alloc_info->free_track, "Freed"); | 184 | free_track = kasan_get_free_track(cache, object, tag); |
185 | print_track(free_track, "Freed"); | ||
163 | pr_err("\n"); | 186 | pr_err("\n"); |
164 | } | 187 | } |
165 | 188 | ||
@@ -344,9 +367,9 @@ static void print_address_stack_frame(const void *addr) | |||
344 | print_decoded_frame_descr(frame_descr); | 367 | print_decoded_frame_descr(frame_descr); |
345 | } | 368 | } |
346 | 369 | ||
347 | static void print_address_description(void *addr) | 370 | static void print_address_description(void *addr, u8 tag) |
348 | { | 371 | { |
349 | struct page *page = addr_to_page(addr); | 372 | struct page *page = kasan_addr_to_page(addr); |
350 | 373 | ||
351 | dump_stack(); | 374 | dump_stack(); |
352 | pr_err("\n"); | 375 | pr_err("\n"); |
@@ -355,7 +378,7 @@ static void print_address_description(void *addr) | |||
355 | struct kmem_cache *cache = page->slab_cache; | 378 | struct kmem_cache *cache = page->slab_cache; |
356 | void *object = nearest_obj(cache, page, addr); | 379 | void *object = nearest_obj(cache, page, addr); |
357 | 380 | ||
358 | describe_object(cache, object, addr); | 381 | describe_object(cache, object, addr, tag); |
359 | } | 382 | } |
360 | 383 | ||
361 | if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { | 384 | if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { |
@@ -435,13 +458,14 @@ static bool report_enabled(void) | |||
435 | void kasan_report_invalid_free(void *object, unsigned long ip) | 458 | void kasan_report_invalid_free(void *object, unsigned long ip) |
436 | { | 459 | { |
437 | unsigned long flags; | 460 | unsigned long flags; |
461 | u8 tag = get_tag(object); | ||
438 | 462 | ||
463 | object = reset_tag(object); | ||
439 | start_report(&flags); | 464 | start_report(&flags); |
440 | pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); | 465 | pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); |
441 | print_tags(get_tag(object), reset_tag(object)); | 466 | print_tags(tag, object); |
442 | object = reset_tag(object); | ||
443 | pr_err("\n"); | 467 | pr_err("\n"); |
444 | print_address_description(object); | 468 | print_address_description(object, tag); |
445 | pr_err("\n"); | 469 | pr_err("\n"); |
446 | print_shadow_for_address(object); | 470 | print_shadow_for_address(object); |
447 | end_report(&flags); | 471 | end_report(&flags); |
@@ -479,7 +503,7 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon | |||
479 | pr_err("\n"); | 503 | pr_err("\n"); |
480 | 504 | ||
481 | if (addr_has_shadow(untagged_addr)) { | 505 | if (addr_has_shadow(untagged_addr)) { |
482 | print_address_description(untagged_addr); | 506 | print_address_description(untagged_addr, get_tag(tagged_addr)); |
483 | pr_err("\n"); | 507 | pr_err("\n"); |
484 | print_shadow_for_address(info.first_bad_addr); | 508 | print_shadow_for_address(info.first_bad_addr); |
485 | } else { | 509 | } else { |
diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c index 8eaf5f722271..969ae08f59d7 100644 --- a/mm/kasan/tags_report.c +++ b/mm/kasan/tags_report.c | |||
@@ -36,6 +36,30 @@ | |||
36 | 36 | ||
37 | const char *get_bug_type(struct kasan_access_info *info) | 37 | const char *get_bug_type(struct kasan_access_info *info) |
38 | { | 38 | { |
39 | #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY | ||
40 | struct kasan_alloc_meta *alloc_meta; | ||
41 | struct kmem_cache *cache; | ||
42 | struct page *page; | ||
43 | const void *addr; | ||
44 | void *object; | ||
45 | u8 tag; | ||
46 | int i; | ||
47 | |||
48 | tag = get_tag(info->access_addr); | ||
49 | addr = reset_tag(info->access_addr); | ||
50 | page = kasan_addr_to_page(addr); | ||
51 | if (page && PageSlab(page)) { | ||
52 | cache = page->slab_cache; | ||
53 | object = nearest_obj(cache, page, (void *)addr); | ||
54 | alloc_meta = get_alloc_info(cache, object); | ||
55 | |||
56 | for (i = 0; i < KASAN_NR_FREE_STACKS; i++) | ||
57 | if (alloc_meta->free_pointer_tag[i] == tag) | ||
58 | return "use-after-free"; | ||
59 | return "out-of-bounds"; | ||
60 | } | ||
61 | |||
62 | #endif | ||
39 | return "invalid-access"; | 63 | return "invalid-access"; |
40 | } | 64 | } |
41 | 65 | ||
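Taken together, the kasan.h, report.c, and tags_report.c hunks above make CONFIG_KASAN_SW_TAGS_IDENTIFY keep the last KASAN_NR_FREE_STACKS free stacks per object along with the pointer tag each free used; on a bad access, a recorded tag equal to the access tag lets the report say "use-after-free" and print the matching free stack, otherwise it falls back to "out-of-bounds". Below is a compact userspace model of that ring-buffer bookkeeping; the structures and tag values are simplified stand-ins, not the kernel layout.

#include <stdio.h>
#include <stdint.h>

#define KASAN_NR_FREE_STACKS 5

struct free_track { char where[32]; };      /* toy stack trace */

struct alloc_meta {
    struct free_track free_track[KASAN_NR_FREE_STACKS];
    uint8_t free_pointer_tag[KASAN_NR_FREE_STACKS];
    uint8_t free_track_idx;
};

/* Mirrors kasan_set_free_info(): record tag + stack, advance the ring. */
static void set_free_info(struct alloc_meta *m, uint8_t tag, const char *where)
{
    uint8_t idx = m->free_track_idx;

    m->free_pointer_tag[idx] = tag;
    snprintf(m->free_track[idx].where, sizeof(m->free_track[idx].where),
             "%s", where);
    m->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
}

/* Mirrors get_bug_type()/kasan_get_free_track(): match the access tag. */
static const char *classify(const struct alloc_meta *m, uint8_t access_tag,
                            const struct free_track **track)
{
    for (int i = 0; i < KASAN_NR_FREE_STACKS; i++) {
        if (m->free_pointer_tag[i] == access_tag) {
            *track = &m->free_track[i];
            return "use-after-free";
        }
    }
    *track = NULL;
    return "out-of-bounds";
}

int main(void)
{
    struct alloc_meta meta = { .free_track_idx = 0 };
    const struct free_track *track;

    set_free_info(&meta, 0xAB, "freed from foo()");
    set_free_info(&meta, 0xCD, "freed from bar()");

    printf("tag 0xAB -> %s\n", classify(&meta, 0xAB, &track)); /* use-after-free */
    printf("tag 0x11 -> %s\n", classify(&meta, 0x11, &track)); /* out-of-bounds  */
    return 0;
}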
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ccede2425c3f..0a1b4b484ac5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c | |||
@@ -48,6 +48,7 @@ enum scan_result { | |||
48 | SCAN_CGROUP_CHARGE_FAIL, | 48 | SCAN_CGROUP_CHARGE_FAIL, |
49 | SCAN_EXCEED_SWAP_PTE, | 49 | SCAN_EXCEED_SWAP_PTE, |
50 | SCAN_TRUNCATED, | 50 | SCAN_TRUNCATED, |
51 | SCAN_PAGE_HAS_PRIVATE, | ||
51 | }; | 52 | }; |
52 | 53 | ||
53 | #define CREATE_TRACE_POINTS | 54 | #define CREATE_TRACE_POINTS |
@@ -76,6 +77,8 @@ static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); | |||
76 | 77 | ||
77 | static struct kmem_cache *mm_slot_cache __read_mostly; | 78 | static struct kmem_cache *mm_slot_cache __read_mostly; |
78 | 79 | ||
80 | #define MAX_PTE_MAPPED_THP 8 | ||
81 | |||
79 | /** | 82 | /** |
80 | * struct mm_slot - hash lookup from mm to mm_slot | 83 | * struct mm_slot - hash lookup from mm to mm_slot |
81 | * @hash: hash collision list | 84 | * @hash: hash collision list |
@@ -86,6 +89,10 @@ struct mm_slot { | |||
86 | struct hlist_node hash; | 89 | struct hlist_node hash; |
87 | struct list_head mm_node; | 90 | struct list_head mm_node; |
88 | struct mm_struct *mm; | 91 | struct mm_struct *mm; |
92 | |||
93 | /* pte-mapped THP in this mm */ | ||
94 | int nr_pte_mapped_thp; | ||
95 | unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP]; | ||
89 | }; | 96 | }; |
90 | 97 | ||
91 | /** | 98 | /** |
@@ -404,7 +411,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, | |||
404 | (vm_flags & VM_NOHUGEPAGE) || | 411 | (vm_flags & VM_NOHUGEPAGE) || |
405 | test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) | 412 | test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) |
406 | return false; | 413 | return false; |
407 | if (shmem_file(vma->vm_file)) { | 414 | |
415 | if (shmem_file(vma->vm_file) || | ||
416 | (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && | ||
417 | vma->vm_file && | ||
418 | (vm_flags & VM_DENYWRITE))) { | ||
408 | if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) | 419 | if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) |
409 | return false; | 420 | return false; |
410 | return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, | 421 | return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, |
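The hugepage_vma_check() hunk above widens file-backed collapse from shmem-only to any read-only file mapping (VM_DENYWRITE), gated by CONFIG_READ_ONLY_THP_FOR_FS. A boiled-down sketch of the new eligibility test follows; the flag values, struct layouts, and config handling are toy stand-ins, not the kernel definitions.

#include <stdbool.h>
#include <stdio.h>

#define VM_DENYWRITE  0x1u          /* toy flag values, not the kernel's */
#define VM_NOHUGEPAGE 0x2u

struct file { bool is_shmem; };
struct vma  { struct file *file; unsigned int flags; };

static bool config_read_only_thp_for_fs = true;  /* CONFIG_READ_ONLY_THP_FOR_FS=y */

static bool shmem_file(const struct file *f)
{
    return f && f->is_shmem;
}

/* Condensed form of the file-backed branch added to hugepage_vma_check(). */
static bool file_backed_thp_eligible(const struct vma *vma)
{
    if (vma->flags & VM_NOHUGEPAGE)
        return false;
    return shmem_file(vma->file) ||
           (config_read_only_thp_for_fs &&
            vma->file &&
            (vma->flags & VM_DENYWRITE));
}

int main(void)
{
    struct file text = { .is_shmem = false };
    struct file shm  = { .is_shmem = true };
    struct vma ro_exec = { .file = &text, .flags = VM_DENYWRITE };
    struct vma rw_file = { .file = &text, .flags = 0 };
    struct vma tmpfs   = { .file = &shm,  .flags = 0 };

    printf("read-only executable mapping: %d\n", file_backed_thp_eligible(&ro_exec)); /* 1 */
    printf("writable file mapping:        %d\n", file_backed_thp_eligible(&rw_file)); /* 0 */
    printf("shmem/tmpfs mapping:          %d\n", file_backed_thp_eligible(&tmpfs));   /* 1 */
    return 0;
}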
@@ -456,8 +467,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma, | |||
456 | unsigned long hstart, hend; | 467 | unsigned long hstart, hend; |
457 | 468 | ||
458 | /* | 469 | /* |
459 | * khugepaged does not yet work on non-shmem files or special | 470 | * For non-shmem files, khugepaged only supports read-only mappings. |
460 | * mappings. And file-private shmem THP is not supported. | 471 | * khugepaged does not yet work on special mappings. And |
472 | * file-private shmem THP is not supported. | ||
461 | */ | 473 | */ |
462 | if (!hugepage_vma_check(vma, vm_flags)) | 474 | if (!hugepage_vma_check(vma, vm_flags)) |
463 | return 0; | 475 | return 0; |
@@ -1248,6 +1260,159 @@ static void collect_mm_slot(struct mm_slot *mm_slot) | |||
1248 | } | 1260 | } |
1249 | 1261 | ||
1250 | #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) | 1262 | #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) |
1263 | /* | ||
1264 | * Notify khugepaged that the given addr of the mm is a pte-mapped THP. Then | ||
1265 | * khugepaged should try to collapse the page table. | ||
1266 | */ | ||
1267 | static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, | ||
1268 | unsigned long addr) | ||
1269 | { | ||
1270 | struct mm_slot *mm_slot; | ||
1271 | |||
1272 | VM_BUG_ON(addr & ~HPAGE_PMD_MASK); | ||
1273 | |||
1274 | spin_lock(&khugepaged_mm_lock); | ||
1275 | mm_slot = get_mm_slot(mm); | ||
1276 | if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) | ||
1277 | mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; | ||
1278 | spin_unlock(&khugepaged_mm_lock); | ||
1279 | return 0; | ||
1280 | } | ||
1281 | |||
1282 | /** | ||
1283 | * Try to collapse a pte-mapped THP for mm at address haddr. | ||
1284 | * | ||
1285 | * This function checks whether all the PTEs in the PMD are pointing to the | ||
1286 | * right THP. If so, retract the page table so the THP can be refaulted | ||
1287 | * in as pmd-mapped. | ||
1288 | */ | ||
1289 | void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) | ||
1290 | { | ||
1291 | unsigned long haddr = addr & HPAGE_PMD_MASK; | ||
1292 | struct vm_area_struct *vma = find_vma(mm, haddr); | ||
1293 | struct page *hpage = NULL; | ||
1294 | pte_t *start_pte, *pte; | ||
1295 | pmd_t *pmd, _pmd; | ||
1296 | spinlock_t *ptl; | ||
1297 | int count = 0; | ||
1298 | int i; | ||
1299 | |||
1300 | if (!vma || !vma->vm_file || | ||
1301 | vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE) | ||
1302 | return; | ||
1303 | |||
1304 | /* | ||
1305 | * This vm_flags may not have VM_HUGEPAGE if the page was not | ||
1306 | * collapsed by this mm. But we can still collapse if the page is | ||
1307 | * a valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check() | ||
1308 | * will not fail the vma for missing VM_HUGEPAGE. | ||
1309 | */ | ||
1310 | if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE)) | ||
1311 | return; | ||
1312 | |||
1313 | pmd = mm_find_pmd(mm, haddr); | ||
1314 | if (!pmd) | ||
1315 | return; | ||
1316 | |||
1317 | start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); | ||
1318 | |||
1319 | /* step 1: check all mapped PTEs are to the right huge page */ | ||
1320 | for (i = 0, addr = haddr, pte = start_pte; | ||
1321 | i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { | ||
1322 | struct page *page; | ||
1323 | |||
1324 | /* empty pte, skip */ | ||
1325 | if (pte_none(*pte)) | ||
1326 | continue; | ||
1327 | |||
1328 | /* page swapped out, abort */ | ||
1329 | if (!pte_present(*pte)) | ||
1330 | goto abort; | ||
1331 | |||
1332 | page = vm_normal_page(vma, addr, *pte); | ||
1333 | |||
1334 | if (!page || !PageCompound(page)) | ||
1335 | goto abort; | ||
1336 | |||
1337 | if (!hpage) { | ||
1338 | hpage = compound_head(page); | ||
1339 | /* | ||
1340 | * The mapping of the THP should not change. | ||
1341 | * | ||
1342 | * Note that uprobe, debugger, or MAP_PRIVATE may | ||
1343 | * change the page table, but the new page will | ||
1344 | * not pass PageCompound() check. | ||
1345 | */ | ||
1346 | if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping)) | ||
1347 | goto abort; | ||
1348 | } | ||
1349 | |||
1350 | /* | ||
1351 | * Confirm the page maps to the correct subpage. | ||
1352 | * | ||
1353 | * Note that uprobe, debugger, or MAP_PRIVATE may change | ||
1354 | * the page table, but the new page will not pass | ||
1355 | * PageCompound() check. | ||
1356 | */ | ||
1357 | if (WARN_ON(hpage + i != page)) | ||
1358 | goto abort; | ||
1359 | count++; | ||
1360 | } | ||
1361 | |||
1362 | /* step 2: adjust rmap */ | ||
1363 | for (i = 0, addr = haddr, pte = start_pte; | ||
1364 | i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { | ||
1365 | struct page *page; | ||
1366 | |||
1367 | if (pte_none(*pte)) | ||
1368 | continue; | ||
1369 | page = vm_normal_page(vma, addr, *pte); | ||
1370 | page_remove_rmap(page, false); | ||
1371 | } | ||
1372 | |||
1373 | pte_unmap_unlock(start_pte, ptl); | ||
1374 | |||
1375 | /* step 3: set proper refcount and mm_counters. */ | ||
1376 | if (hpage) { | ||
1377 | page_ref_sub(hpage, count); | ||
1378 | add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); | ||
1379 | } | ||
1380 | |||
1381 | /* step 4: collapse pmd */ | ||
1382 | ptl = pmd_lock(vma->vm_mm, pmd); | ||
1383 | _pmd = pmdp_collapse_flush(vma, addr, pmd); | ||
1384 | spin_unlock(ptl); | ||
1385 | mm_dec_nr_ptes(mm); | ||
1386 | pte_free(mm, pmd_pgtable(_pmd)); | ||
1387 | return; | ||
1388 | |||
1389 | abort: | ||
1390 | pte_unmap_unlock(start_pte, ptl); | ||
1391 | } | ||
1392 | |||
1393 | static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) | ||
1394 | { | ||
1395 | struct mm_struct *mm = mm_slot->mm; | ||
1396 | int i; | ||
1397 | |||
1398 | if (likely(mm_slot->nr_pte_mapped_thp == 0)) | ||
1399 | return 0; | ||
1400 | |||
1401 | if (!down_write_trylock(&mm->mmap_sem)) | ||
1402 | return -EBUSY; | ||
1403 | |||
1404 | if (unlikely(khugepaged_test_exit(mm))) | ||
1405 | goto out; | ||
1406 | |||
1407 | for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) | ||
1408 | collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]); | ||
1409 | |||
1410 | out: | ||
1411 | mm_slot->nr_pte_mapped_thp = 0; | ||
1412 | up_write(&mm->mmap_sem); | ||
1413 | return 0; | ||
1414 | } | ||
1415 | |||
1251 | static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) | 1416 | static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) |
1252 | { | 1417 | { |
1253 | struct vm_area_struct *vma; | 1418 | struct vm_area_struct *vma; |
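The hunk above adds a small deferred-work queue: when retract_page_tables() cannot take mmap_sem, the huge-aligned address is parked in the mm_slot's pte_mapped_thp[] array and collapsed on the next khugepaged pass. Below is a simplified userspace model of that queue; the single mutex here stands in for the kernel's khugepaged_mm_lock and mmap_sem handling, and all names are illustrative.

/* build: cc model.c -lpthread */
#include <pthread.h>
#include <stdio.h>

#define MAX_PTE_MAPPED_THP 8

struct mm_slot {
    pthread_mutex_t lock;                 /* stands in for khugepaged_mm_lock */
    int nr_pte_mapped_thp;
    unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
};

/* Like khugepaged_add_pte_mapped_thp(): park the address, drop it if full. */
static void add_pte_mapped_thp(struct mm_slot *slot, unsigned long haddr)
{
    pthread_mutex_lock(&slot->lock);
    if (slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)
        slot->pte_mapped_thp[slot->nr_pte_mapped_thp++] = haddr;
    pthread_mutex_unlock(&slot->lock);
}

/* Like khugepaged_collapse_pte_mapped_thps(): drain and "collapse" each one. */
static void collapse_parked_thps(struct mm_slot *slot)
{
    pthread_mutex_lock(&slot->lock);
    for (int i = 0; i < slot->nr_pte_mapped_thp; i++)
        printf("collapsing pte-mapped THP at %#lx\n",
               slot->pte_mapped_thp[i]);
    slot->nr_pte_mapped_thp = 0;
    pthread_mutex_unlock(&slot->lock);
}

int main(void)
{
    struct mm_slot slot = { .lock = PTHREAD_MUTEX_INITIALIZER };

    /* retract_page_tables() failed to trylock mmap_sem for these: */
    add_pte_mapped_thp(&slot, 0x7f0000000000UL);
    add_pte_mapped_thp(&slot, 0x7f0000200000UL);

    /* the next khugepaged scan of this mm picks them up: */
    collapse_parked_thps(&slot);
    return 0;
}

The fixed array bounds the amount of deferred work per mm; anything beyond MAX_PTE_MAPPED_THP entries is simply dropped and will be found again on a later scan.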
@@ -1256,7 +1421,22 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) | |||
1256 | 1421 | ||
1257 | i_mmap_lock_write(mapping); | 1422 | i_mmap_lock_write(mapping); |
1258 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | 1423 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
1259 | /* probably overkill */ | 1424 | /* |
1425 | * Check vma->anon_vma to exclude MAP_PRIVATE mappings that | ||
1426 | * got written to. These VMAs are likely not worth the cost of | ||
1427 | * down_write(mmap_sem), as the PMD-mapping is likely to be split | ||
1428 | * later anyway. | ||
1429 | * | ||
1430 | * Note that the vma->anon_vma check is racy: it can be set up after | ||
1431 | * the check but before we took mmap_sem by the fault path. | ||
1432 | * But page lock would prevent establishing any new ptes of the | ||
1433 | * page, so we are safe. | ||
1434 | * | ||
1435 | * An alternative would be drop the check, but check that page | ||
1436 | * table is clear before calling pmdp_collapse_flush() under | ||
1437 | * ptl. It has higher chance to recover THP for the VMA, but | ||
1438 | * has higher cost too. | ||
1439 | */ | ||
1260 | if (vma->anon_vma) | 1440 | if (vma->anon_vma) |
1261 | continue; | 1441 | continue; |
1262 | addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | 1442 | addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); |
@@ -1269,9 +1449,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) | |||
1269 | continue; | 1449 | continue; |
1270 | /* | 1450 | /* |
1271 | * We need exclusive mmap_sem to retract page table. | 1451 | * We need exclusive mmap_sem to retract page table. |
1272 | * If trylock fails we would end up with pte-mapped THP after | 1452 | * |
1273 | * re-fault. Not ideal, but it's more important to not disturb | 1453 | * We use trylock due to lock inversion: we need to acquire |
1274 | * the system too much. | 1454 | * mmap_sem while holding page lock. Fault path does it in |
1455 | * reverse order. Trylock is a way to avoid deadlock. | ||
1275 | */ | 1456 | */ |
1276 | if (down_write_trylock(&vma->vm_mm->mmap_sem)) { | 1457 | if (down_write_trylock(&vma->vm_mm->mmap_sem)) { |
1277 | spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); | 1458 | spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); |
@@ -1281,18 +1462,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) | |||
1281 | up_write(&vma->vm_mm->mmap_sem); | 1462 | up_write(&vma->vm_mm->mmap_sem); |
1282 | mm_dec_nr_ptes(vma->vm_mm); | 1463 | mm_dec_nr_ptes(vma->vm_mm); |
1283 | pte_free(vma->vm_mm, pmd_pgtable(_pmd)); | 1464 | pte_free(vma->vm_mm, pmd_pgtable(_pmd)); |
1465 | } else { | ||
1466 | /* Try again later */ | ||
1467 | khugepaged_add_pte_mapped_thp(vma->vm_mm, addr); | ||
1284 | } | 1468 | } |
1285 | } | 1469 | } |
1286 | i_mmap_unlock_write(mapping); | 1470 | i_mmap_unlock_write(mapping); |
1287 | } | 1471 | } |
1288 | 1472 | ||
1289 | /** | 1473 | /** |
1290 | * collapse_shmem - collapse small tmpfs/shmem pages into huge one. | 1474 | * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. |
1291 | * | 1475 | * |
1292 | * Basic scheme is simple, details are more complex: | 1476 | * Basic scheme is simple, details are more complex: |
1293 | * - allocate and lock a new huge page; | 1477 | * - allocate and lock a new huge page; |
1294 | * - scan page cache replacing old pages with the new one | 1478 | * - scan page cache replacing old pages with the new one |
1295 | * + swap in pages if necessary; | 1479 | * + swap/gup in pages if necessary; |
1296 | * + fill in gaps; | 1480 | * + fill in gaps; |
1297 | * + keep old pages around in case rollback is required; | 1481 | * + keep old pages around in case rollback is required; |
1298 | * - if replacing succeeds: | 1482 | * - if replacing succeeds: |
@@ -1304,10 +1488,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) | |||
1304 | * + restore gaps in the page cache; | 1488 | * + restore gaps in the page cache; |
1305 | * + unlock and free huge page; | 1489 | * + unlock and free huge page; |
1306 | */ | 1490 | */ |
1307 | static void collapse_shmem(struct mm_struct *mm, | 1491 | static void collapse_file(struct mm_struct *mm, |
1308 | struct address_space *mapping, pgoff_t start, | 1492 | struct file *file, pgoff_t start, |
1309 | struct page **hpage, int node) | 1493 | struct page **hpage, int node) |
1310 | { | 1494 | { |
1495 | struct address_space *mapping = file->f_mapping; | ||
1311 | gfp_t gfp; | 1496 | gfp_t gfp; |
1312 | struct page *new_page; | 1497 | struct page *new_page; |
1313 | struct mem_cgroup *memcg; | 1498 | struct mem_cgroup *memcg; |
@@ -1315,7 +1500,9 @@ static void collapse_shmem(struct mm_struct *mm, | |||
1315 | LIST_HEAD(pagelist); | 1500 | LIST_HEAD(pagelist); |
1316 | XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); | 1501 | XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); |
1317 | int nr_none = 0, result = SCAN_SUCCEED; | 1502 | int nr_none = 0, result = SCAN_SUCCEED; |
1503 | bool is_shmem = shmem_file(file); | ||
1318 | 1504 | ||
1505 | VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); | ||
1319 | VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); | 1506 | VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); |
1320 | 1507 | ||
1321 | /* Only allocate from the target node */ | 1508 | /* Only allocate from the target node */ |
@@ -1347,7 +1534,8 @@ static void collapse_shmem(struct mm_struct *mm, | |||
1347 | } while (1); | 1534 | } while (1); |
1348 | 1535 | ||
1349 | __SetPageLocked(new_page); | 1536 | __SetPageLocked(new_page); |
1350 | __SetPageSwapBacked(new_page); | 1537 | if (is_shmem) |
1538 | __SetPageSwapBacked(new_page); | ||
1351 | new_page->index = start; | 1539 | new_page->index = start; |
1352 | new_page->mapping = mapping; | 1540 | new_page->mapping = mapping; |
1353 | 1541 | ||
@@ -1362,41 +1550,75 @@ static void collapse_shmem(struct mm_struct *mm, | |||
1362 | struct page *page = xas_next(&xas); | 1550 | struct page *page = xas_next(&xas); |
1363 | 1551 | ||
1364 | VM_BUG_ON(index != xas.xa_index); | 1552 | VM_BUG_ON(index != xas.xa_index); |
1365 | if (!page) { | 1553 | if (is_shmem) { |
1366 | /* | 1554 | if (!page) { |
1367 | * Stop if extent has been truncated or hole-punched, | 1555 | /* |
1368 | * and is now completely empty. | 1556 | * Stop if extent has been truncated or |
1369 | */ | 1557 | * hole-punched, and is now completely |
1370 | if (index == start) { | 1558 | * empty. |
1371 | if (!xas_next_entry(&xas, end - 1)) { | 1559 | */ |
1372 | result = SCAN_TRUNCATED; | 1560 | if (index == start) { |
1561 | if (!xas_next_entry(&xas, end - 1)) { | ||
1562 | result = SCAN_TRUNCATED; | ||
1563 | goto xa_locked; | ||
1564 | } | ||
1565 | xas_set(&xas, index); | ||
1566 | } | ||
1567 | if (!shmem_charge(mapping->host, 1)) { | ||
1568 | result = SCAN_FAIL; | ||
1373 | goto xa_locked; | 1569 | goto xa_locked; |
1374 | } | 1570 | } |
1375 | xas_set(&xas, index); | 1571 | xas_store(&xas, new_page); |
1572 | nr_none++; | ||
1573 | continue; | ||
1376 | } | 1574 | } |
1377 | if (!shmem_charge(mapping->host, 1)) { | 1575 | |
1378 | result = SCAN_FAIL; | 1576 | if (xa_is_value(page) || !PageUptodate(page)) { |
1577 | xas_unlock_irq(&xas); | ||
1578 | /* swap in or instantiate fallocated page */ | ||
1579 | if (shmem_getpage(mapping->host, index, &page, | ||
1580 | SGP_NOHUGE)) { | ||
1581 | result = SCAN_FAIL; | ||
1582 | goto xa_unlocked; | ||
1583 | } | ||
1584 | } else if (trylock_page(page)) { | ||
1585 | get_page(page); | ||
1586 | xas_unlock_irq(&xas); | ||
1587 | } else { | ||
1588 | result = SCAN_PAGE_LOCK; | ||
1379 | goto xa_locked; | 1589 | goto xa_locked; |
1380 | } | 1590 | } |
1381 | xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); | 1591 | } else { /* !is_shmem */ |
1382 | nr_none++; | 1592 | if (!page || xa_is_value(page)) { |
1383 | continue; | 1593 | xas_unlock_irq(&xas); |
1384 | } | 1594 | page_cache_sync_readahead(mapping, &file->f_ra, |
1385 | 1595 | file, index, | |
1386 | if (xa_is_value(page) || !PageUptodate(page)) { | 1596 | PAGE_SIZE); |
1387 | xas_unlock_irq(&xas); | 1597 | /* drain pagevecs to help isolate_lru_page() */ |
1388 | /* swap in or instantiate fallocated page */ | 1598 | lru_add_drain(); |
1389 | if (shmem_getpage(mapping->host, index, &page, | 1599 | page = find_lock_page(mapping, index); |
1390 | SGP_NOHUGE)) { | 1600 | if (unlikely(page == NULL)) { |
1601 | result = SCAN_FAIL; | ||
1602 | goto xa_unlocked; | ||
1603 | } | ||
1604 | } else if (!PageUptodate(page)) { | ||
1605 | xas_unlock_irq(&xas); | ||
1606 | wait_on_page_locked(page); | ||
1607 | if (!trylock_page(page)) { | ||
1608 | result = SCAN_PAGE_LOCK; | ||
1609 | goto xa_unlocked; | ||
1610 | } | ||
1611 | get_page(page); | ||
1612 | } else if (PageDirty(page)) { | ||
1391 | result = SCAN_FAIL; | 1613 | result = SCAN_FAIL; |
1392 | goto xa_unlocked; | 1614 | goto xa_locked; |
1615 | } else if (trylock_page(page)) { | ||
1616 | get_page(page); | ||
1617 | xas_unlock_irq(&xas); | ||
1618 | } else { | ||
1619 | result = SCAN_PAGE_LOCK; | ||
1620 | goto xa_locked; | ||
1393 | } | 1621 | } |
1394 | } else if (trylock_page(page)) { | ||
1395 | get_page(page); | ||
1396 | xas_unlock_irq(&xas); | ||
1397 | } else { | ||
1398 | result = SCAN_PAGE_LOCK; | ||
1399 | goto xa_locked; | ||
1400 | } | 1622 | } |
1401 | 1623 | ||
1402 | /* | 1624 | /* |
@@ -1425,6 +1647,12 @@ static void collapse_shmem(struct mm_struct *mm, | |||
1425 | goto out_unlock; | 1647 | goto out_unlock; |
1426 | } | 1648 | } |
1427 | 1649 | ||
1650 | if (page_has_private(page) && | ||
1651 | !try_to_release_page(page, GFP_KERNEL)) { | ||
1652 | result = SCAN_PAGE_HAS_PRIVATE; | ||
1653 | goto out_unlock; | ||
1654 | } | ||
1655 | |||
1428 | if (page_mapped(page)) | 1656 | if (page_mapped(page)) |
1429 | unmap_mapping_pages(mapping, index, 1, false); | 1657 | unmap_mapping_pages(mapping, index, 1, false); |
1430 | 1658 | ||
@@ -1454,7 +1682,7 @@ static void collapse_shmem(struct mm_struct *mm, | |||
1454 | list_add_tail(&page->lru, &pagelist); | 1682 | list_add_tail(&page->lru, &pagelist); |
1455 | 1683 | ||
1456 | /* Finally, replace with the new page. */ | 1684 | /* Finally, replace with the new page. */ |
1457 | xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); | 1685 | xas_store(&xas, new_page); |
1458 | continue; | 1686 | continue; |
1459 | out_unlock: | 1687 | out_unlock: |
1460 | unlock_page(page); | 1688 | unlock_page(page); |
@@ -1462,12 +1690,20 @@ out_unlock: | |||
1462 | goto xa_unlocked; | 1690 | goto xa_unlocked; |
1463 | } | 1691 | } |
1464 | 1692 | ||
1465 | __inc_node_page_state(new_page, NR_SHMEM_THPS); | 1693 | if (is_shmem) |
1694 | __inc_node_page_state(new_page, NR_SHMEM_THPS); | ||
1695 | else { | ||
1696 | __inc_node_page_state(new_page, NR_FILE_THPS); | ||
1697 | filemap_nr_thps_inc(mapping); | ||
1698 | } | ||
1699 | |||
1466 | if (nr_none) { | 1700 | if (nr_none) { |
1467 | struct zone *zone = page_zone(new_page); | 1701 | struct zone *zone = page_zone(new_page); |
1468 | 1702 | ||
1469 | __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); | 1703 | __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); |
1470 | __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); | 1704 | if (is_shmem) |
1705 | __mod_node_page_state(zone->zone_pgdat, | ||
1706 | NR_SHMEM, nr_none); | ||
1471 | } | 1707 | } |
1472 | 1708 | ||
1473 | xa_locked: | 1709 | xa_locked: |
@@ -1505,10 +1741,15 @@ xa_unlocked: | |||
1505 | 1741 | ||
1506 | SetPageUptodate(new_page); | 1742 | SetPageUptodate(new_page); |
1507 | page_ref_add(new_page, HPAGE_PMD_NR - 1); | 1743 | page_ref_add(new_page, HPAGE_PMD_NR - 1); |
1508 | set_page_dirty(new_page); | ||
1509 | mem_cgroup_commit_charge(new_page, memcg, false, true); | 1744 | mem_cgroup_commit_charge(new_page, memcg, false, true); |
1745 | |||
1746 | if (is_shmem) { | ||
1747 | set_page_dirty(new_page); | ||
1748 | lru_cache_add_anon(new_page); | ||
1749 | } else { | ||
1750 | lru_cache_add_file(new_page); | ||
1751 | } | ||
1510 | count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); | 1752 | count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); |
1511 | lru_cache_add_anon(new_page); | ||
1512 | 1753 | ||
1513 | /* | 1754 | /* |
1514 | * Remove pte page tables, so we can re-fault the page as huge. | 1755 | * Remove pte page tables, so we can re-fault the page as huge. |
@@ -1523,7 +1764,9 @@ xa_unlocked: | |||
1523 | /* Something went wrong: roll back page cache changes */ | 1764 | /* Something went wrong: roll back page cache changes */ |
1524 | xas_lock_irq(&xas); | 1765 | xas_lock_irq(&xas); |
1525 | mapping->nrpages -= nr_none; | 1766 | mapping->nrpages -= nr_none; |
1526 | shmem_uncharge(mapping->host, nr_none); | 1767 | |
1768 | if (is_shmem) | ||
1769 | shmem_uncharge(mapping->host, nr_none); | ||
1527 | 1770 | ||
1528 | xas_set(&xas, start); | 1771 | xas_set(&xas, start); |
1529 | xas_for_each(&xas, page, end - 1) { | 1772 | xas_for_each(&xas, page, end - 1) { |
@@ -1563,11 +1806,11 @@ out: | |||
1563 | /* TODO: tracepoints */ | 1806 | /* TODO: tracepoints */ |
1564 | } | 1807 | } |
1565 | 1808 | ||
1566 | static void khugepaged_scan_shmem(struct mm_struct *mm, | 1809 | static void khugepaged_scan_file(struct mm_struct *mm, |
1567 | struct address_space *mapping, | 1810 | struct file *file, pgoff_t start, struct page **hpage) |
1568 | pgoff_t start, struct page **hpage) | ||
1569 | { | 1811 | { |
1570 | struct page *page = NULL; | 1812 | struct page *page = NULL; |
1813 | struct address_space *mapping = file->f_mapping; | ||
1571 | XA_STATE(xas, &mapping->i_pages, start); | 1814 | XA_STATE(xas, &mapping->i_pages, start); |
1572 | int present, swap; | 1815 | int present, swap; |
1573 | int node = NUMA_NO_NODE; | 1816 | int node = NUMA_NO_NODE; |
@@ -1606,7 +1849,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, | |||
1606 | break; | 1849 | break; |
1607 | } | 1850 | } |
1608 | 1851 | ||
1609 | if (page_count(page) != 1 + page_mapcount(page)) { | 1852 | if (page_count(page) != |
1853 | 1 + page_mapcount(page) + page_has_private(page)) { | ||
1610 | result = SCAN_PAGE_COUNT; | 1854 | result = SCAN_PAGE_COUNT; |
1611 | break; | 1855 | break; |
1612 | } | 1856 | } |
@@ -1631,19 +1875,23 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, | |||
1631 | result = SCAN_EXCEED_NONE_PTE; | 1875 | result = SCAN_EXCEED_NONE_PTE; |
1632 | } else { | 1876 | } else { |
1633 | node = khugepaged_find_target_node(); | 1877 | node = khugepaged_find_target_node(); |
1634 | collapse_shmem(mm, mapping, start, hpage, node); | 1878 | collapse_file(mm, file, start, hpage, node); |
1635 | } | 1879 | } |
1636 | } | 1880 | } |
1637 | 1881 | ||
1638 | /* TODO: tracepoints */ | 1882 | /* TODO: tracepoints */ |
1639 | } | 1883 | } |
1640 | #else | 1884 | #else |
1641 | static void khugepaged_scan_shmem(struct mm_struct *mm, | 1885 | static void khugepaged_scan_file(struct mm_struct *mm, |
1642 | struct address_space *mapping, | 1886 | struct file *file, pgoff_t start, struct page **hpage) |
1643 | pgoff_t start, struct page **hpage) | ||
1644 | { | 1887 | { |
1645 | BUILD_BUG(); | 1888 | BUILD_BUG(); |
1646 | } | 1889 | } |
1890 | |||
1891 | static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) | ||
1892 | { | ||
1893 | return 0; | ||
1894 | } | ||
1647 | #endif | 1895 | #endif |
1648 | 1896 | ||
1649 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | 1897 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, |
@@ -1668,6 +1916,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | |||
1668 | khugepaged_scan.mm_slot = mm_slot; | 1916 | khugepaged_scan.mm_slot = mm_slot; |
1669 | } | 1917 | } |
1670 | spin_unlock(&khugepaged_mm_lock); | 1918 | spin_unlock(&khugepaged_mm_lock); |
1919 | khugepaged_collapse_pte_mapped_thps(mm_slot); | ||
1671 | 1920 | ||
1672 | mm = mm_slot->mm; | 1921 | mm = mm_slot->mm; |
1673 | /* | 1922 | /* |
@@ -1713,17 +1962,18 @@ skip: | |||
1713 | VM_BUG_ON(khugepaged_scan.address < hstart || | 1962 | VM_BUG_ON(khugepaged_scan.address < hstart || |
1714 | khugepaged_scan.address + HPAGE_PMD_SIZE > | 1963 | khugepaged_scan.address + HPAGE_PMD_SIZE > |
1715 | hend); | 1964 | hend); |
1716 | if (shmem_file(vma->vm_file)) { | 1965 | if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { |
1717 | struct file *file; | 1966 | struct file *file; |
1718 | pgoff_t pgoff = linear_page_index(vma, | 1967 | pgoff_t pgoff = linear_page_index(vma, |
1719 | khugepaged_scan.address); | 1968 | khugepaged_scan.address); |
1720 | if (!shmem_huge_enabled(vma)) | 1969 | |
1970 | if (shmem_file(vma->vm_file) | ||
1971 | && !shmem_huge_enabled(vma)) | ||
1721 | goto skip; | 1972 | goto skip; |
1722 | file = get_file(vma->vm_file); | 1973 | file = get_file(vma->vm_file); |
1723 | up_read(&mm->mmap_sem); | 1974 | up_read(&mm->mmap_sem); |
1724 | ret = 1; | 1975 | ret = 1; |
1725 | khugepaged_scan_shmem(mm, file->f_mapping, | 1976 | khugepaged_scan_file(mm, file, pgoff, hpage); |
1726 | pgoff, hpage); | ||
1727 | fput(file); | 1977 | fput(file); |
1728 | } else { | 1978 | } else { |
1729 | ret = khugepaged_scan_pmd(mm, vma, | 1979 | ret = khugepaged_scan_pmd(mm, vma, |
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f6e602918dac..03a8d84badad 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -168,6 +168,8 @@ struct kmemleak_object { | |||
168 | #define OBJECT_REPORTED (1 << 1) | 168 | #define OBJECT_REPORTED (1 << 1) |
169 | /* flag set to not scan the object */ | 169 | /* flag set to not scan the object */ |
170 | #define OBJECT_NO_SCAN (1 << 2) | 170 | #define OBJECT_NO_SCAN (1 << 2) |
171 | /* flag set to fully scan the object when scan_area allocation failed */ | ||
172 | #define OBJECT_FULL_SCAN (1 << 3) | ||
171 | 173 | ||
172 | #define HEX_PREFIX " " | 174 | #define HEX_PREFIX " " |
173 | /* number of bytes to print per line; must be 16 or 32 */ | 175 | /* number of bytes to print per line; must be 16 or 32 */ |
@@ -183,6 +185,10 @@ struct kmemleak_object { | |||
183 | static LIST_HEAD(object_list); | 185 | static LIST_HEAD(object_list); |
184 | /* the list of gray-colored objects (see color_gray comment below) */ | 186 | /* the list of gray-colored objects (see color_gray comment below) */ |
185 | static LIST_HEAD(gray_list); | 187 | static LIST_HEAD(gray_list); |
188 | /* memory pool allocation */ | ||
189 | static struct kmemleak_object mem_pool[CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE]; | ||
190 | static int mem_pool_free_count = ARRAY_SIZE(mem_pool); | ||
191 | static LIST_HEAD(mem_pool_free_list); | ||
186 | /* search tree for object boundaries */ | 192 | /* search tree for object boundaries */ |
187 | static struct rb_root object_tree_root = RB_ROOT; | 193 | static struct rb_root object_tree_root = RB_ROOT; |
188 | /* rw_lock protecting the access to object_list and object_tree_root */ | 194 | /* rw_lock protecting the access to object_list and object_tree_root */ |
@@ -193,13 +199,11 @@ static struct kmem_cache *object_cache; | |||
193 | static struct kmem_cache *scan_area_cache; | 199 | static struct kmem_cache *scan_area_cache; |
194 | 200 | ||
195 | /* set if tracing memory operations is enabled */ | 201 | /* set if tracing memory operations is enabled */ |
196 | static int kmemleak_enabled; | 202 | static int kmemleak_enabled = 1; |
197 | /* same as above but only for the kmemleak_free() callback */ | 203 | /* same as above but only for the kmemleak_free() callback */ |
198 | static int kmemleak_free_enabled; | 204 | static int kmemleak_free_enabled = 1; |
199 | /* set in the late_initcall if there were no errors */ | 205 | /* set in the late_initcall if there were no errors */ |
200 | static int kmemleak_initialized; | 206 | static int kmemleak_initialized; |
201 | /* enables or disables early logging of the memory operations */ | ||
202 | static int kmemleak_early_log = 1; | ||
203 | /* set if a kmemleak warning was issued */ | 207 | /* set if a kmemleak warning was issued */ |
204 | static int kmemleak_warning; | 208 | static int kmemleak_warning; |
205 | /* set if a fatal kmemleak error has occurred */ | 209 | /* set if a fatal kmemleak error has occurred */ |
@@ -227,49 +231,6 @@ static bool kmemleak_found_leaks; | |||
227 | static bool kmemleak_verbose; | 231 | static bool kmemleak_verbose; |
228 | module_param_named(verbose, kmemleak_verbose, bool, 0600); | 232 | module_param_named(verbose, kmemleak_verbose, bool, 0600); |
229 | 233 | ||
230 | /* | ||
231 | * Early object allocation/freeing logging. Kmemleak is initialized after the | ||
232 | * kernel allocator. However, both the kernel allocator and kmemleak may | ||
233 | * allocate memory blocks which need to be tracked. Kmemleak defines an | ||
234 | * arbitrary buffer to hold the allocation/freeing information before it is | ||
235 | * fully initialized. | ||
236 | */ | ||
237 | |||
238 | /* kmemleak operation type for early logging */ | ||
239 | enum { | ||
240 | KMEMLEAK_ALLOC, | ||
241 | KMEMLEAK_ALLOC_PERCPU, | ||
242 | KMEMLEAK_FREE, | ||
243 | KMEMLEAK_FREE_PART, | ||
244 | KMEMLEAK_FREE_PERCPU, | ||
245 | KMEMLEAK_NOT_LEAK, | ||
246 | KMEMLEAK_IGNORE, | ||
247 | KMEMLEAK_SCAN_AREA, | ||
248 | KMEMLEAK_NO_SCAN, | ||
249 | KMEMLEAK_SET_EXCESS_REF | ||
250 | }; | ||
251 | |||
252 | /* | ||
253 | * Structure holding the information passed to kmemleak callbacks during the | ||
254 | * early logging. | ||
255 | */ | ||
256 | struct early_log { | ||
257 | int op_type; /* kmemleak operation type */ | ||
258 | int min_count; /* minimum reference count */ | ||
259 | const void *ptr; /* allocated/freed memory block */ | ||
260 | union { | ||
261 | size_t size; /* memory block size */ | ||
262 | unsigned long excess_ref; /* surplus reference passing */ | ||
263 | }; | ||
264 | unsigned long trace[MAX_TRACE]; /* stack trace */ | ||
265 | unsigned int trace_len; /* stack trace length */ | ||
266 | }; | ||
267 | |||
268 | /* early logging buffer and current position */ | ||
269 | static struct early_log | ||
270 | early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata; | ||
271 | static int crt_early_log __initdata; | ||
272 | |||
273 | static void kmemleak_disable(void); | 234 | static void kmemleak_disable(void); |
274 | 235 | ||
275 | /* | 236 | /* |
@@ -450,6 +411,54 @@ static int get_object(struct kmemleak_object *object) | |||
450 | } | 411 | } |
451 | 412 | ||
452 | /* | 413 | /* |
414 | * Memory pool allocation and freeing. kmemleak_lock must not be held. | ||
415 | */ | ||
416 | static struct kmemleak_object *mem_pool_alloc(gfp_t gfp) | ||
417 | { | ||
418 | unsigned long flags; | ||
419 | struct kmemleak_object *object; | ||
420 | |||
421 | /* try the slab allocator first */ | ||
422 | if (object_cache) { | ||
423 | object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); | ||
424 | if (object) | ||
425 | return object; | ||
426 | } | ||
427 | |||
428 | /* slab allocation failed, try the memory pool */ | ||
429 | write_lock_irqsave(&kmemleak_lock, flags); | ||
430 | object = list_first_entry_or_null(&mem_pool_free_list, | ||
431 | typeof(*object), object_list); | ||
432 | if (object) | ||
433 | list_del(&object->object_list); | ||
434 | else if (mem_pool_free_count) | ||
435 | object = &mem_pool[--mem_pool_free_count]; | ||
436 | else | ||
437 | pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n"); | ||
438 | write_unlock_irqrestore(&kmemleak_lock, flags); | ||
439 | |||
440 | return object; | ||
441 | } | ||
442 | |||
443 | /* | ||
444 | * Return the object to either the slab allocator or the memory pool. | ||
445 | */ | ||
446 | static void mem_pool_free(struct kmemleak_object *object) | ||
447 | { | ||
448 | unsigned long flags; | ||
449 | |||
450 | if (object < mem_pool || object >= mem_pool + ARRAY_SIZE(mem_pool)) { | ||
451 | kmem_cache_free(object_cache, object); | ||
452 | return; | ||
453 | } | ||
454 | |||
455 | /* add the object to the memory pool free list */ | ||
456 | write_lock_irqsave(&kmemleak_lock, flags); | ||
457 | list_add(&object->object_list, &mem_pool_free_list); | ||
458 | write_unlock_irqrestore(&kmemleak_lock, flags); | ||
459 | } | ||
460 | |||
461 | /* | ||
453 | * RCU callback to free a kmemleak_object. | 462 | * RCU callback to free a kmemleak_object. |
454 | */ | 463 | */ |
455 | static void free_object_rcu(struct rcu_head *rcu) | 464 | static void free_object_rcu(struct rcu_head *rcu) |
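The mem_pool_alloc()/mem_pool_free() hunk above lets kmemleak track objects before the slab allocator is usable, replacing the old early-log buffer removed later in this file: allocation tries the object cache first, then a free list of returned pool entries, then carves a fresh entry out of the static pool; freeing routes by address range. A userspace sketch of the same fallback order follows, with malloc() standing in for the slab cache and the kmemleak_lock locking omitted; names and sizes are illustrative only.

#include <stdio.h>
#include <stdlib.h>

#define MEM_POOL_SIZE 4

struct object {
    struct object *next;       /* free-list link (object_list in the kernel) */
    int data;
};

static struct object mem_pool[MEM_POOL_SIZE];
static int mem_pool_free_count = MEM_POOL_SIZE;
static struct object *mem_pool_free_list;
static int allocator_works = 1;          /* flip to 0 to simulate slab failure */

static struct object *mem_pool_alloc(void)
{
    struct object *obj;

    /* try the regular allocator first */
    if (allocator_works) {
        obj = malloc(sizeof(*obj));
        if (obj)
            return obj;
    }

    /* then the free list of returned pool objects */
    if (mem_pool_free_list) {
        obj = mem_pool_free_list;
        mem_pool_free_list = obj->next;
        return obj;
    }

    /* finally carve a fresh entry out of the static pool */
    if (mem_pool_free_count)
        return &mem_pool[--mem_pool_free_count];

    fprintf(stderr, "memory pool empty\n");
    return NULL;
}

static void mem_pool_free(struct object *obj)
{
    /* pool objects go back on the free list, heap objects to free() */
    if (obj < mem_pool || obj >= mem_pool + MEM_POOL_SIZE) {
        free(obj);
        return;
    }
    obj->next = mem_pool_free_list;
    mem_pool_free_list = obj;
}

int main(void)
{
    allocator_works = 0;                  /* pretend kmem_cache_alloc() fails */

    struct object *a = mem_pool_alloc();  /* from the static pool */
    struct object *b = mem_pool_alloc();
    mem_pool_free(a);                     /* back onto the free list */
    struct object *c = mem_pool_alloc();  /* reuses a's slot */

    printf("a=%p b=%p c=%p (c == a: %d)\n",
           (void *)a, (void *)b, (void *)c, a == c);
    return 0;
}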
@@ -467,7 +476,7 @@ static void free_object_rcu(struct rcu_head *rcu) | |||
467 | hlist_del(&area->node); | 476 | hlist_del(&area->node); |
468 | kmem_cache_free(scan_area_cache, area); | 477 | kmem_cache_free(scan_area_cache, area); |
469 | } | 478 | } |
470 | kmem_cache_free(object_cache, object); | 479 | mem_pool_free(object); |
471 | } | 480 | } |
472 | 481 | ||
473 | /* | 482 | /* |
@@ -485,7 +494,15 @@ static void put_object(struct kmemleak_object *object) | |||
485 | /* should only get here after delete_object was called */ | 494 | /* should only get here after delete_object was called */ |
486 | WARN_ON(object->flags & OBJECT_ALLOCATED); | 495 | WARN_ON(object->flags & OBJECT_ALLOCATED); |
487 | 496 | ||
488 | call_rcu(&object->rcu, free_object_rcu); | 497 | /* |
498 | * It may be too early for the RCU callbacks, however, there is no | ||
499 | * concurrent object_list traversal when !object_cache and all objects | ||
500 | * came from the memory pool. Free the object directly. | ||
501 | */ | ||
502 | if (object_cache) | ||
503 | call_rcu(&object->rcu, free_object_rcu); | ||
504 | else | ||
505 | free_object_rcu(&object->rcu); | ||
489 | } | 506 | } |
490 | 507 | ||
491 | /* | 508 | /* |
@@ -550,7 +567,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
550 | struct rb_node **link, *rb_parent; | 567 | struct rb_node **link, *rb_parent; |
551 | unsigned long untagged_ptr; | 568 | unsigned long untagged_ptr; |
552 | 569 | ||
553 | object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); | 570 | object = mem_pool_alloc(gfp); |
554 | if (!object) { | 571 | if (!object) { |
555 | pr_warn("Cannot allocate a kmemleak_object structure\n"); | 572 | pr_warn("Cannot allocate a kmemleak_object structure\n"); |
556 | kmemleak_disable(); | 573 | kmemleak_disable(); |
@@ -689,9 +706,7 @@ static void delete_object_part(unsigned long ptr, size_t size) | |||
689 | /* | 706 | /* |
690 | * Create one or two objects that may result from the memory block | 707 | * Create one or two objects that may result from the memory block |
691 | * split. Note that partial freeing is only done by free_bootmem() and | 708 | * split. Note that partial freeing is only done by free_bootmem() and |
692 | * this happens before kmemleak_init() is called. The path below is | 709 | * this happens before kmemleak_init() is called. |
693 | * only executed during early log recording in kmemleak_init(), so | ||
694 | * GFP_KERNEL is enough. | ||
695 | */ | 710 | */ |
696 | start = object->pointer; | 711 | start = object->pointer; |
697 | end = object->pointer + object->size; | 712 | end = object->pointer + object->size; |
@@ -763,7 +778,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) | |||
763 | { | 778 | { |
764 | unsigned long flags; | 779 | unsigned long flags; |
765 | struct kmemleak_object *object; | 780 | struct kmemleak_object *object; |
766 | struct kmemleak_scan_area *area; | 781 | struct kmemleak_scan_area *area = NULL; |
767 | 782 | ||
768 | object = find_and_get_object(ptr, 1); | 783 | object = find_and_get_object(ptr, 1); |
769 | if (!object) { | 784 | if (!object) { |
@@ -772,13 +787,16 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) | |||
772 | return; | 787 | return; |
773 | } | 788 | } |
774 | 789 | ||
775 | area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); | 790 | if (scan_area_cache) |
776 | if (!area) { | 791 | area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); |
777 | pr_warn("Cannot allocate a scan area\n"); | ||
778 | goto out; | ||
779 | } | ||
780 | 792 | ||
781 | spin_lock_irqsave(&object->lock, flags); | 793 | spin_lock_irqsave(&object->lock, flags); |
794 | if (!area) { | ||
795 | pr_warn_once("Cannot allocate a scan area, scanning the full object\n"); | ||
796 | /* mark the object for full scan to avoid false positives */ | ||
797 | object->flags |= OBJECT_FULL_SCAN; | ||
798 | goto out_unlock; | ||
799 | } | ||
782 | if (size == SIZE_MAX) { | 800 | if (size == SIZE_MAX) { |
783 | size = object->pointer + object->size - ptr; | 801 | size = object->pointer + object->size - ptr; |
784 | } else if (ptr + size > object->pointer + object->size) { | 802 | } else if (ptr + size > object->pointer + object->size) { |
@@ -795,7 +813,6 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) | |||
795 | hlist_add_head(&area->node, &object->area_list); | 813 | hlist_add_head(&area->node, &object->area_list); |
796 | out_unlock: | 814 | out_unlock: |
797 | spin_unlock_irqrestore(&object->lock, flags); | 815 | spin_unlock_irqrestore(&object->lock, flags); |
798 | out: | ||
799 | put_object(object); | 816 | put_object(object); |
800 | } | 817 | } |
801 | 818 | ||
@@ -845,86 +862,6 @@ static void object_no_scan(unsigned long ptr) | |||
845 | put_object(object); | 862 | put_object(object); |
846 | } | 863 | } |
847 | 864 | ||
848 | /* | ||
849 | * Log an early kmemleak_* call to the early_log buffer. These calls will be | ||
850 | * processed later once kmemleak is fully initialized. | ||
851 | */ | ||
852 | static void __init log_early(int op_type, const void *ptr, size_t size, | ||
853 | int min_count) | ||
854 | { | ||
855 | unsigned long flags; | ||
856 | struct early_log *log; | ||
857 | |||
858 | if (kmemleak_error) { | ||
859 | /* kmemleak stopped recording, just count the requests */ | ||
860 | crt_early_log++; | ||
861 | return; | ||
862 | } | ||
863 | |||
864 | if (crt_early_log >= ARRAY_SIZE(early_log)) { | ||
865 | crt_early_log++; | ||
866 | kmemleak_disable(); | ||
867 | return; | ||
868 | } | ||
869 | |||
870 | /* | ||
871 | * There is no need for locking since the kernel is still in UP mode | ||
872 | * at this stage. Disabling the IRQs is enough. | ||
873 | */ | ||
874 | local_irq_save(flags); | ||
875 | log = &early_log[crt_early_log]; | ||
876 | log->op_type = op_type; | ||
877 | log->ptr = ptr; | ||
878 | log->size = size; | ||
879 | log->min_count = min_count; | ||
880 | log->trace_len = __save_stack_trace(log->trace); | ||
881 | crt_early_log++; | ||
882 | local_irq_restore(flags); | ||
883 | } | ||
884 | |||
885 | /* | ||
886 | * Log an early allocated block and populate the stack trace. | ||
887 | */ | ||
888 | static void early_alloc(struct early_log *log) | ||
889 | { | ||
890 | struct kmemleak_object *object; | ||
891 | unsigned long flags; | ||
892 | int i; | ||
893 | |||
894 | if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr)) | ||
895 | return; | ||
896 | |||
897 | /* | ||
898 | * RCU locking needed to ensure object is not freed via put_object(). | ||
899 | */ | ||
900 | rcu_read_lock(); | ||
901 | object = create_object((unsigned long)log->ptr, log->size, | ||
902 | log->min_count, GFP_ATOMIC); | ||
903 | if (!object) | ||
904 | goto out; | ||
905 | spin_lock_irqsave(&object->lock, flags); | ||
906 | for (i = 0; i < log->trace_len; i++) | ||
907 | object->trace[i] = log->trace[i]; | ||
908 | object->trace_len = log->trace_len; | ||
909 | spin_unlock_irqrestore(&object->lock, flags); | ||
910 | out: | ||
911 | rcu_read_unlock(); | ||
912 | } | ||
913 | |||
914 | /* | ||
915 | * Log an early allocated block and populate the stack trace. | ||
916 | */ | ||
917 | static void early_alloc_percpu(struct early_log *log) | ||
918 | { | ||
919 | unsigned int cpu; | ||
920 | const void __percpu *ptr = log->ptr; | ||
921 | |||
922 | for_each_possible_cpu(cpu) { | ||
923 | log->ptr = per_cpu_ptr(ptr, cpu); | ||
924 | early_alloc(log); | ||
925 | } | ||
926 | } | ||
927 | |||
928 | /** | 865 | /** |
929 | * kmemleak_alloc - register a newly allocated object | 866 | * kmemleak_alloc - register a newly allocated object |
930 | * @ptr: pointer to beginning of the object | 867 | * @ptr: pointer to beginning of the object |
@@ -946,8 +883,6 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, | |||
946 | 883 | ||
947 | if (kmemleak_enabled && ptr && !IS_ERR(ptr)) | 884 | if (kmemleak_enabled && ptr && !IS_ERR(ptr)) |
948 | create_object((unsigned long)ptr, size, min_count, gfp); | 885 | create_object((unsigned long)ptr, size, min_count, gfp); |
949 | else if (kmemleak_early_log) | ||
950 | log_early(KMEMLEAK_ALLOC, ptr, size, min_count); | ||
951 | } | 886 | } |
952 | EXPORT_SYMBOL_GPL(kmemleak_alloc); | 887 | EXPORT_SYMBOL_GPL(kmemleak_alloc); |
953 | 888 | ||
@@ -975,8 +910,6 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, | |||
975 | for_each_possible_cpu(cpu) | 910 | for_each_possible_cpu(cpu) |
976 | create_object((unsigned long)per_cpu_ptr(ptr, cpu), | 911 | create_object((unsigned long)per_cpu_ptr(ptr, cpu), |
977 | size, 0, gfp); | 912 | size, 0, gfp); |
978 | else if (kmemleak_early_log) | ||
979 | log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); | ||
980 | } | 913 | } |
981 | EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); | 914 | EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); |
982 | 915 | ||
@@ -1001,11 +934,6 @@ void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp | |||
1001 | create_object((unsigned long)area->addr, size, 2, gfp); | 934 | create_object((unsigned long)area->addr, size, 2, gfp); |
1002 | object_set_excess_ref((unsigned long)area, | 935 | object_set_excess_ref((unsigned long)area, |
1003 | (unsigned long)area->addr); | 936 | (unsigned long)area->addr); |
1004 | } else if (kmemleak_early_log) { | ||
1005 | log_early(KMEMLEAK_ALLOC, area->addr, size, 2); | ||
1006 | /* reusing early_log.size for storing area->addr */ | ||
1007 | log_early(KMEMLEAK_SET_EXCESS_REF, | ||
1008 | area, (unsigned long)area->addr, 0); | ||
1009 | } | 937 | } |
1010 | } | 938 | } |
1011 | EXPORT_SYMBOL_GPL(kmemleak_vmalloc); | 939 | EXPORT_SYMBOL_GPL(kmemleak_vmalloc); |
@@ -1023,8 +951,6 @@ void __ref kmemleak_free(const void *ptr) | |||
1023 | 951 | ||
1024 | if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) | 952 | if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) |
1025 | delete_object_full((unsigned long)ptr); | 953 | delete_object_full((unsigned long)ptr); |
1026 | else if (kmemleak_early_log) | ||
1027 | log_early(KMEMLEAK_FREE, ptr, 0, 0); | ||
1028 | } | 954 | } |
1029 | EXPORT_SYMBOL_GPL(kmemleak_free); | 955 | EXPORT_SYMBOL_GPL(kmemleak_free); |
1030 | 956 | ||
@@ -1043,8 +969,6 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) | |||
1043 | 969 | ||
1044 | if (kmemleak_enabled && ptr && !IS_ERR(ptr)) | 970 | if (kmemleak_enabled && ptr && !IS_ERR(ptr)) |
1045 | delete_object_part((unsigned long)ptr, size); | 971 | delete_object_part((unsigned long)ptr, size); |
1046 | else if (kmemleak_early_log) | ||
1047 | log_early(KMEMLEAK_FREE_PART, ptr, size, 0); | ||
1048 | } | 972 | } |
1049 | EXPORT_SYMBOL_GPL(kmemleak_free_part); | 973 | EXPORT_SYMBOL_GPL(kmemleak_free_part); |
1050 | 974 | ||
@@ -1065,8 +989,6 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) | |||
1065 | for_each_possible_cpu(cpu) | 989 | for_each_possible_cpu(cpu) |
1066 | delete_object_full((unsigned long)per_cpu_ptr(ptr, | 990 | delete_object_full((unsigned long)per_cpu_ptr(ptr, |
1067 | cpu)); | 991 | cpu)); |
1068 | else if (kmemleak_early_log) | ||
1069 | log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0); | ||
1070 | } | 992 | } |
1071 | EXPORT_SYMBOL_GPL(kmemleak_free_percpu); | 993 | EXPORT_SYMBOL_GPL(kmemleak_free_percpu); |
1072 | 994 | ||
@@ -1117,8 +1039,6 @@ void __ref kmemleak_not_leak(const void *ptr) | |||
1117 | 1039 | ||
1118 | if (kmemleak_enabled && ptr && !IS_ERR(ptr)) | 1040 | if (kmemleak_enabled && ptr && !IS_ERR(ptr)) |
1119 | make_gray_object((unsigned long)ptr); | 1041 | make_gray_object((unsigned long)ptr); |
1120 | else if (kmemleak_early_log) | ||
1121 | log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); | ||
1122 | } | 1042 | } |
1123 | EXPORT_SYMBOL(kmemleak_not_leak); | 1043 | EXPORT_SYMBOL(kmemleak_not_leak); |
1124 | 1044 | ||
@@ -1137,8 +1057,6 @@ void __ref kmemleak_ignore(const void *ptr) | |||
1137 | 1057 | ||
1138 | if (kmemleak_enabled && ptr && !IS_ERR(ptr)) | 1058 | if (kmemleak_enabled && ptr && !IS_ERR(ptr)) |
1139 | make_black_object((unsigned long)ptr); | 1059 | make_black_object((unsigned long)ptr); |
1140 | else if (kmemleak_early_log) | ||
1141 | log_early(KMEMLEAK_IGNORE, ptr, 0, 0); | ||
1142 | } | 1060 | } |
1143 | EXPORT_SYMBOL(kmemleak_ignore); | 1061 | EXPORT_SYMBOL(kmemleak_ignore); |
1144 | 1062 | ||
@@ -1159,8 +1077,6 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) | |||
1159 | 1077 | ||
1160 | if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) | 1078 | if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) |
1161 | add_scan_area((unsigned long)ptr, size, gfp); | 1079 | add_scan_area((unsigned long)ptr, size, gfp); |
1162 | else if (kmemleak_early_log) | ||
1163 | log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); | ||
1164 | } | 1080 | } |
1165 | EXPORT_SYMBOL(kmemleak_scan_area); | 1081 | EXPORT_SYMBOL(kmemleak_scan_area); |
1166 | 1082 | ||
@@ -1179,8 +1095,6 @@ void __ref kmemleak_no_scan(const void *ptr) | |||
1179 | 1095 | ||
1180 | if (kmemleak_enabled && ptr && !IS_ERR(ptr)) | 1096 | if (kmemleak_enabled && ptr && !IS_ERR(ptr)) |
1181 | object_no_scan((unsigned long)ptr); | 1097 | object_no_scan((unsigned long)ptr); |
1182 | else if (kmemleak_early_log) | ||
1183 | log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); | ||
1184 | } | 1098 | } |
1185 | EXPORT_SYMBOL(kmemleak_no_scan); | 1099 | EXPORT_SYMBOL(kmemleak_no_scan); |
1186 | 1100 | ||
@@ -1408,7 +1322,8 @@ static void scan_object(struct kmemleak_object *object) | |||
1408 | if (!(object->flags & OBJECT_ALLOCATED)) | 1322 | if (!(object->flags & OBJECT_ALLOCATED)) |
1409 | /* already freed object */ | 1323 | /* already freed object */ |
1410 | goto out; | 1324 | goto out; |
1411 | if (hlist_empty(&object->area_list)) { | 1325 | if (hlist_empty(&object->area_list) || |
1326 | object->flags & OBJECT_FULL_SCAN) { | ||
1412 | void *start = (void *)object->pointer; | 1327 | void *start = (void *)object->pointer; |
1413 | void *end = (void *)(object->pointer + object->size); | 1328 | void *end = (void *)(object->pointer + object->size); |
1414 | void *next; | 1329 | void *next; |
@@ -1966,7 +1881,6 @@ static void kmemleak_disable(void) | |||
1966 | 1881 | ||
1967 | /* stop any memory operation tracing */ | 1882 | /* stop any memory operation tracing */ |
1968 | kmemleak_enabled = 0; | 1883 | kmemleak_enabled = 0; |
1969 | kmemleak_early_log = 0; | ||
1970 | 1884 | ||
1971 | /* check whether it is too early for a kernel thread */ | 1885 | /* check whether it is too early for a kernel thread */ |
1972 | if (kmemleak_initialized) | 1886 | if (kmemleak_initialized) |
@@ -1994,20 +1908,11 @@ static int __init kmemleak_boot_config(char *str) | |||
1994 | } | 1908 | } |
1995 | early_param("kmemleak", kmemleak_boot_config); | 1909 | early_param("kmemleak", kmemleak_boot_config); |
1996 | 1910 | ||
1997 | static void __init print_log_trace(struct early_log *log) | ||
1998 | { | ||
1999 | pr_notice("Early log backtrace:\n"); | ||
2000 | stack_trace_print(log->trace, log->trace_len, 2); | ||
2001 | } | ||
2002 | |||
2003 | /* | 1911 | /* |
2004 | * Kmemleak initialization. | 1912 | * Kmemleak initialization. |
2005 | */ | 1913 | */ |
2006 | void __init kmemleak_init(void) | 1914 | void __init kmemleak_init(void) |
2007 | { | 1915 | { |
2008 | int i; | ||
2009 | unsigned long flags; | ||
2010 | |||
2011 | #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF | 1916 | #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF |
2012 | if (!kmemleak_skip_disable) { | 1917 | if (!kmemleak_skip_disable) { |
2013 | kmemleak_disable(); | 1918 | kmemleak_disable(); |
@@ -2015,28 +1920,15 @@ void __init kmemleak_init(void) | |||
2015 | } | 1920 | } |
2016 | #endif | 1921 | #endif |
2017 | 1922 | ||
1923 | if (kmemleak_error) | ||
1924 | return; | ||
1925 | |||
2018 | jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); | 1926 | jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); |
2019 | jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); | 1927 | jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); |
2020 | 1928 | ||
2021 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); | 1929 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); |
2022 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); | 1930 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); |
2023 | 1931 | ||
2024 | if (crt_early_log > ARRAY_SIZE(early_log)) | ||
2025 | pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", | ||
2026 | crt_early_log); | ||
2027 | |||
2028 | /* the kernel is still in UP mode, so disabling the IRQs is enough */ | ||
2029 | local_irq_save(flags); | ||
2030 | kmemleak_early_log = 0; | ||
2031 | if (kmemleak_error) { | ||
2032 | local_irq_restore(flags); | ||
2033 | return; | ||
2034 | } else { | ||
2035 | kmemleak_enabled = 1; | ||
2036 | kmemleak_free_enabled = 1; | ||
2037 | } | ||
2038 | local_irq_restore(flags); | ||
2039 | |||
2040 | /* register the data/bss sections */ | 1932 | /* register the data/bss sections */ |
2041 | create_object((unsigned long)_sdata, _edata - _sdata, | 1933 | create_object((unsigned long)_sdata, _edata - _sdata, |
2042 | KMEMLEAK_GREY, GFP_ATOMIC); | 1934 | KMEMLEAK_GREY, GFP_ATOMIC); |
@@ -2047,57 +1939,6 @@ void __init kmemleak_init(void) | |||
2047 | create_object((unsigned long)__start_ro_after_init, | 1939 | create_object((unsigned long)__start_ro_after_init, |
2048 | __end_ro_after_init - __start_ro_after_init, | 1940 | __end_ro_after_init - __start_ro_after_init, |
2049 | KMEMLEAK_GREY, GFP_ATOMIC); | 1941 | KMEMLEAK_GREY, GFP_ATOMIC); |
2050 | |||
2051 | /* | ||
2052 | * This is the point where tracking allocations is safe. Automatic | ||
2053 | * scanning is started during the late initcall. Add the early logged | ||
2054 | * callbacks to the kmemleak infrastructure. | ||
2055 | */ | ||
2056 | for (i = 0; i < crt_early_log; i++) { | ||
2057 | struct early_log *log = &early_log[i]; | ||
2058 | |||
2059 | switch (log->op_type) { | ||
2060 | case KMEMLEAK_ALLOC: | ||
2061 | early_alloc(log); | ||
2062 | break; | ||
2063 | case KMEMLEAK_ALLOC_PERCPU: | ||
2064 | early_alloc_percpu(log); | ||
2065 | break; | ||
2066 | case KMEMLEAK_FREE: | ||
2067 | kmemleak_free(log->ptr); | ||
2068 | break; | ||
2069 | case KMEMLEAK_FREE_PART: | ||
2070 | kmemleak_free_part(log->ptr, log->size); | ||
2071 | break; | ||
2072 | case KMEMLEAK_FREE_PERCPU: | ||
2073 | kmemleak_free_percpu(log->ptr); | ||
2074 | break; | ||
2075 | case KMEMLEAK_NOT_LEAK: | ||
2076 | kmemleak_not_leak(log->ptr); | ||
2077 | break; | ||
2078 | case KMEMLEAK_IGNORE: | ||
2079 | kmemleak_ignore(log->ptr); | ||
2080 | break; | ||
2081 | case KMEMLEAK_SCAN_AREA: | ||
2082 | kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL); | ||
2083 | break; | ||
2084 | case KMEMLEAK_NO_SCAN: | ||
2085 | kmemleak_no_scan(log->ptr); | ||
2086 | break; | ||
2087 | case KMEMLEAK_SET_EXCESS_REF: | ||
2088 | object_set_excess_ref((unsigned long)log->ptr, | ||
2089 | log->excess_ref); | ||
2090 | break; | ||
2091 | default: | ||
2092 | kmemleak_warn("Unknown early log operation: %d\n", | ||
2093 | log->op_type); | ||
2094 | } | ||
2095 | |||
2096 | if (kmemleak_warning) { | ||
2097 | print_log_trace(log); | ||
2098 | kmemleak_warning = 0; | ||
2099 | } | ||
2100 | } | ||
2101 | } | 1942 | } |
2102 | 1943 | ||
2103 | /* | 1944 | /* |
@@ -2126,7 +1967,8 @@ static int __init kmemleak_late_init(void) | |||
2126 | mutex_unlock(&scan_mutex); | 1967 | mutex_unlock(&scan_mutex); |
2127 | } | 1968 | } |
2128 | 1969 | ||
2129 | pr_info("Kernel memory leak detector initialized\n"); | 1970 | pr_info("Kernel memory leak detector initialized (mem pool available: %d)\n", |
1971 | mem_pool_free_count); | ||
2130 | 1972 | ||
2131 | return 0; | 1973 | return 0; |
2132 | } | 1974 | } |
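The early-log machinery removed above becomes unnecessary because kmemleak objects can now be carved out of a static pool before the slab allocator is up, which is also why kmemleak_late_init() now prints the remaining "mem pool available" count. A self-contained sketch of that allocation strategy, with a made-up pool size and a stand-in object type rather than the kernel's implementation:

#include <stdio.h>
#include <stdlib.h>

struct tracked_object {
	unsigned long ptr;
	unsigned long size;
};

#define POOL_SIZE	16000		/* made-up; stands in for the boot-time pool */

static struct tracked_object mem_pool[POOL_SIZE];
static int mem_pool_free_count = POOL_SIZE;
static int slab_is_up;			/* flipped once the real allocator works */

static struct tracked_object *alloc_tracked(void)
{
	if (slab_is_up)			/* normal path after kmemleak_init() */
		return malloc(sizeof(struct tracked_object));
	if (mem_pool_free_count > 0)	/* early boot: take a slot from the pool */
		return &mem_pool[--mem_pool_free_count];
	return NULL;			/* pool exhausted: tracking is disabled */
}

int main(void)
{
	struct tracked_object *obj = alloc_tracked();

	obj->ptr = 0xdeadbeef;
	printf("mem pool available: %d\n", mem_pool_free_count);
	return 0;
}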
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -1029,24 +1029,6 @@ static u32 calc_checksum(struct page *page) | |||
1029 | return checksum; | 1029 | return checksum; |
1030 | } | 1030 | } |
1031 | 1031 | ||
1032 | static int memcmp_pages(struct page *page1, struct page *page2) | ||
1033 | { | ||
1034 | char *addr1, *addr2; | ||
1035 | int ret; | ||
1036 | |||
1037 | addr1 = kmap_atomic(page1); | ||
1038 | addr2 = kmap_atomic(page2); | ||
1039 | ret = memcmp(addr1, addr2, PAGE_SIZE); | ||
1040 | kunmap_atomic(addr2); | ||
1041 | kunmap_atomic(addr1); | ||
1042 | return ret; | ||
1043 | } | ||
1044 | |||
1045 | static inline int pages_identical(struct page *page1, struct page *page2) | ||
1046 | { | ||
1047 | return !memcmp_pages(page1, page2); | ||
1048 | } | ||
1049 | |||
1050 | static int write_protect_page(struct vm_area_struct *vma, struct page *page, | 1032 | static int write_protect_page(struct vm_area_struct *vma, struct page *page, |
1051 | pte_t *orig_pte) | 1033 | pte_t *orig_pte) |
1052 | { | 1034 | { |
diff --git a/mm/madvise.c b/mm/madvise.c index 88babcc384b9..68ab988ad433 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -107,28 +107,14 @@ static long madvise_behavior(struct vm_area_struct *vma, | |||
107 | case MADV_MERGEABLE: | 107 | case MADV_MERGEABLE: |
108 | case MADV_UNMERGEABLE: | 108 | case MADV_UNMERGEABLE: |
109 | error = ksm_madvise(vma, start, end, behavior, &new_flags); | 109 | error = ksm_madvise(vma, start, end, behavior, &new_flags); |
110 | if (error) { | 110 | if (error) |
111 | /* | 111 | goto out_convert_errno; |
112 | * madvise() returns EAGAIN if kernel resources, such as | ||
113 | * slab, are temporarily unavailable. | ||
114 | */ | ||
115 | if (error == -ENOMEM) | ||
116 | error = -EAGAIN; | ||
117 | goto out; | ||
118 | } | ||
119 | break; | 112 | break; |
120 | case MADV_HUGEPAGE: | 113 | case MADV_HUGEPAGE: |
121 | case MADV_NOHUGEPAGE: | 114 | case MADV_NOHUGEPAGE: |
122 | error = hugepage_madvise(vma, &new_flags, behavior); | 115 | error = hugepage_madvise(vma, &new_flags, behavior); |
123 | if (error) { | 116 | if (error) |
124 | /* | 117 | goto out_convert_errno; |
125 | * madvise() returns EAGAIN if kernel resources, such as | ||
126 | * slab, are temporarily unavailable. | ||
127 | */ | ||
128 | if (error == -ENOMEM) | ||
129 | error = -EAGAIN; | ||
130 | goto out; | ||
131 | } | ||
132 | break; | 118 | break; |
133 | } | 119 | } |
134 | 120 | ||
@@ -154,15 +140,8 @@ static long madvise_behavior(struct vm_area_struct *vma, | |||
154 | goto out; | 140 | goto out; |
155 | } | 141 | } |
156 | error = __split_vma(mm, vma, start, 1); | 142 | error = __split_vma(mm, vma, start, 1); |
157 | if (error) { | 143 | if (error) |
158 | /* | 144 | goto out_convert_errno; |
159 | * madvise() returns EAGAIN if kernel resources, such as | ||
160 | * slab, are temporarily unavailable. | ||
161 | */ | ||
162 | if (error == -ENOMEM) | ||
163 | error = -EAGAIN; | ||
164 | goto out; | ||
165 | } | ||
166 | } | 145 | } |
167 | 146 | ||
168 | if (end != vma->vm_end) { | 147 | if (end != vma->vm_end) { |
@@ -171,15 +150,8 @@ static long madvise_behavior(struct vm_area_struct *vma, | |||
171 | goto out; | 150 | goto out; |
172 | } | 151 | } |
173 | error = __split_vma(mm, vma, end, 0); | 152 | error = __split_vma(mm, vma, end, 0); |
174 | if (error) { | 153 | if (error) |
175 | /* | 154 | goto out_convert_errno; |
176 | * madvise() returns EAGAIN if kernel resources, such as | ||
177 | * slab, are temporarily unavailable. | ||
178 | */ | ||
179 | if (error == -ENOMEM) | ||
180 | error = -EAGAIN; | ||
181 | goto out; | ||
182 | } | ||
183 | } | 155 | } |
184 | 156 | ||
185 | success: | 157 | success: |
@@ -187,6 +159,14 @@ success: | |||
187 | * vm_flags is protected by the mmap_sem held in write mode. | 159 | * vm_flags is protected by the mmap_sem held in write mode. |
188 | */ | 160 | */ |
189 | vma->vm_flags = new_flags; | 161 | vma->vm_flags = new_flags; |
162 | |||
163 | out_convert_errno: | ||
164 | /* | ||
165 | * madvise() returns EAGAIN if kernel resources, such as | ||
166 | * slab, are temporarily unavailable. | ||
167 | */ | ||
168 | if (error == -ENOMEM) | ||
169 | error = -EAGAIN; | ||
190 | out: | 170 | out: |
191 | return error; | 171 | return error; |
192 | } | 172 | } |
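The madvise.c hunks above collapse three copies of the ENOMEM-to-EAGAIN translation into a single out_convert_errno exit label. A minimal userspace sketch of that pattern, with a hypothetical step_that_fails() standing in for ksm_madvise()/hugepage_madvise()/__split_vma():

#include <errno.h>
#include <stdio.h>

/* hypothetical stand-in for ksm_madvise()/hugepage_madvise()/__split_vma() */
static int step_that_fails(void)
{
	return -ENOMEM;		/* e.g. a transient slab allocation failure */
}

static long do_behavior(void)
{
	int error;

	error = step_that_fails();
	if (error)
		goto out_convert_errno;
	/* ... further steps share the same exit path ... */
	return 0;

out_convert_errno:
	/*
	 * madvise() reports a transient resource shortage as EAGAIN rather
	 * than ENOMEM, so the translation now lives in exactly one place.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

int main(void)
{
	printf("do_behavior() = %ld (expect %d)\n", do_behavior(), -EAGAIN);
	return 0;
}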
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f3c15bb07cce..2156ef775d04 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/lockdep.h> | 57 | #include <linux/lockdep.h> |
58 | #include <linux/file.h> | 58 | #include <linux/file.h> |
59 | #include <linux/tracehook.h> | 59 | #include <linux/tracehook.h> |
60 | #include <linux/psi.h> | ||
60 | #include <linux/seq_buf.h> | 61 | #include <linux/seq_buf.h> |
61 | #include "internal.h" | 62 | #include "internal.h" |
62 | #include <net/sock.h> | 63 | #include <net/sock.h> |
@@ -317,6 +318,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); | |||
317 | EXPORT_SYMBOL(memcg_kmem_enabled_key); | 318 | EXPORT_SYMBOL(memcg_kmem_enabled_key); |
318 | 319 | ||
319 | struct workqueue_struct *memcg_kmem_cache_wq; | 320 | struct workqueue_struct *memcg_kmem_cache_wq; |
321 | #endif | ||
320 | 322 | ||
321 | static int memcg_shrinker_map_size; | 323 | static int memcg_shrinker_map_size; |
322 | static DEFINE_MUTEX(memcg_shrinker_map_mutex); | 324 | static DEFINE_MUTEX(memcg_shrinker_map_mutex); |
@@ -440,14 +442,6 @@ void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) | |||
440 | } | 442 | } |
441 | } | 443 | } |
442 | 444 | ||
443 | #else /* CONFIG_MEMCG_KMEM */ | ||
444 | static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg) | ||
445 | { | ||
446 | return 0; | ||
447 | } | ||
448 | static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { } | ||
449 | #endif /* CONFIG_MEMCG_KMEM */ | ||
450 | |||
451 | /** | 445 | /** |
452 | * mem_cgroup_css_from_page - css of the memcg associated with a page | 446 | * mem_cgroup_css_from_page - css of the memcg associated with a page |
453 | * @page: page of interest | 447 | * @page: page of interest |
@@ -2270,21 +2264,22 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) | |||
2270 | for_each_online_cpu(cpu) { | 2264 | for_each_online_cpu(cpu) { |
2271 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | 2265 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); |
2272 | struct mem_cgroup *memcg; | 2266 | struct mem_cgroup *memcg; |
2267 | bool flush = false; | ||
2273 | 2268 | ||
2269 | rcu_read_lock(); | ||
2274 | memcg = stock->cached; | 2270 | memcg = stock->cached; |
2275 | if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css)) | 2271 | if (memcg && stock->nr_pages && |
2276 | continue; | 2272 | mem_cgroup_is_descendant(memcg, root_memcg)) |
2277 | if (!mem_cgroup_is_descendant(memcg, root_memcg)) { | 2273 | flush = true; |
2278 | css_put(&memcg->css); | 2274 | rcu_read_unlock(); |
2279 | continue; | 2275 | |
2280 | } | 2276 | if (flush && |
2281 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { | 2277 | !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { |
2282 | if (cpu == curcpu) | 2278 | if (cpu == curcpu) |
2283 | drain_local_stock(&stock->work); | 2279 | drain_local_stock(&stock->work); |
2284 | else | 2280 | else |
2285 | schedule_work_on(cpu, &stock->work); | 2281 | schedule_work_on(cpu, &stock->work); |
2286 | } | 2282 | } |
2287 | css_put(&memcg->css); | ||
2288 | } | 2283 | } |
2289 | put_cpu(); | 2284 | put_cpu(); |
2290 | mutex_unlock(&percpu_charge_mutex); | 2285 | mutex_unlock(&percpu_charge_mutex); |
@@ -2359,11 +2354,67 @@ static void high_work_func(struct work_struct *work) | |||
2359 | } | 2354 | } |
2360 | 2355 | ||
2361 | /* | 2356 | /* |
2357 | * Clamp the maximum sleep time per allocation batch to 2 seconds. This is | ||
2358 | * enough to still cause a significant slowdown in most cases, while still | ||
2359 | * allowing diagnostics and tracing to proceed without becoming stuck. | ||
2360 | */ | ||
2361 | #define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ) | ||
2362 | |||
2363 | /* | ||
2364 | * When calculating the delay, we use these either side of the exponentiation to | ||
2365 | * maintain precision and scale to a reasonable number of jiffies (see the table | ||
2366 | * below). | ||
2367 | * | ||
2368 | * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the | ||
2369 | * overage ratio to a delay. | ||
2370 | * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the | ||
2371 | * proposed penalty in order to reduce to a reasonable number of jiffies, and | ||
2372 | * to produce a reasonable delay curve. | ||
2373 | * | ||
2374 | * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a | ||
2375 | * reasonable delay curve compared to precision-adjusted overage, not | ||
2376 | * penalising heavily at first, but still making sure that growth beyond the | ||
2377 | * limit penalises misbehaving cgroups by slowing them down exponentially. For | ||
2378 | * example, with a high of 100 megabytes: | ||
2379 | * | ||
2380 | * +-------+------------------------+ | ||
2381 | * | usage | time to allocate in ms | | ||
2382 | * +-------+------------------------+ | ||
2383 | * | 100M | 0 | | ||
2384 | * | 101M | 6 | | ||
2385 | * | 102M | 25 | | ||
2386 | * | 103M | 57 | | ||
2387 | * | 104M | 102 | | ||
2388 | * | 105M | 159 | | ||
2389 | * | 106M | 230 | | ||
2390 | * | 107M | 313 | | ||
2391 | * | 108M | 409 | | ||
2392 | * | 109M | 518 | | ||
2393 | * | 110M | 639 | | ||
2394 | * | 111M | 774 | | ||
2395 | * | 112M | 921 | | ||
2396 | * | 113M | 1081 | | ||
2397 | * | 114M | 1254 | | ||
2398 | * | 115M | 1439 | | ||
2399 | * | 116M | 1638 | | ||
2400 | * | 117M | 1849 | | ||
2401 | * | 118M | 2000 | | ||
2402 | * | 119M | 2000 | | ||
2403 | * | 120M | 2000 | | ||
2404 | * +-------+------------------------+ | ||
2405 | */ | ||
2406 | #define MEMCG_DELAY_PRECISION_SHIFT 20 | ||
2407 | #define MEMCG_DELAY_SCALING_SHIFT 14 | ||
2408 | |||
2409 | /* | ||
2362 | * Scheduled by try_charge() to be executed from the userland return path | 2410 | * Scheduled by try_charge() to be executed from the userland return path |
2363 | * and reclaims memory over the high limit. | 2411 | * and reclaims memory over the high limit. |
2364 | */ | 2412 | */ |
2365 | void mem_cgroup_handle_over_high(void) | 2413 | void mem_cgroup_handle_over_high(void) |
2366 | { | 2414 | { |
2415 | unsigned long usage, high, clamped_high; | ||
2416 | unsigned long pflags; | ||
2417 | unsigned long penalty_jiffies, overage; | ||
2367 | unsigned int nr_pages = current->memcg_nr_pages_over_high; | 2418 | unsigned int nr_pages = current->memcg_nr_pages_over_high; |
2368 | struct mem_cgroup *memcg; | 2419 | struct mem_cgroup *memcg; |
2369 | 2420 | ||
@@ -2372,8 +2423,75 @@ void mem_cgroup_handle_over_high(void) | |||
2372 | 2423 | ||
2373 | memcg = get_mem_cgroup_from_mm(current->mm); | 2424 | memcg = get_mem_cgroup_from_mm(current->mm); |
2374 | reclaim_high(memcg, nr_pages, GFP_KERNEL); | 2425 | reclaim_high(memcg, nr_pages, GFP_KERNEL); |
2375 | css_put(&memcg->css); | ||
2376 | current->memcg_nr_pages_over_high = 0; | 2426 | current->memcg_nr_pages_over_high = 0; |
2427 | |||
2428 | /* | ||
2429 | * memory.high is breached and reclaim is unable to keep up. Throttle | ||
2430 | * allocators proactively to slow down excessive growth. | ||
2431 | * | ||
2432 | * We use overage compared to memory.high to calculate the number of | ||
2433 | * jiffies to sleep (penalty_jiffies). Ideally this value should be | ||
2434 | * fairly lenient on small overages, and increasingly harsh when the | ||
2435 | * memcg in question makes it clear that it has no intention of stopping | ||
2436 | * its crazy behaviour, so we exponentially increase the delay based on | ||
2437 | * overage amount. | ||
2438 | */ | ||
2439 | |||
2440 | usage = page_counter_read(&memcg->memory); | ||
2441 | high = READ_ONCE(memcg->high); | ||
2442 | |||
2443 | if (usage <= high) | ||
2444 | goto out; | ||
2445 | |||
2446 | /* | ||
2447 | * Prevent division by 0 in overage calculation by acting as if it was a | ||
2448 | * threshold of 1 page | ||
2449 | */ | ||
2450 | clamped_high = max(high, 1UL); | ||
2451 | |||
2452 | overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT, | ||
2453 | clamped_high); | ||
2454 | |||
2455 | penalty_jiffies = ((u64)overage * overage * HZ) | ||
2456 | >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT); | ||
2457 | |||
2458 | /* | ||
2459 | * Factor in the task's own contribution to the overage, such that four | ||
2460 | * N-sized allocations are throttled approximately the same as one | ||
2461 | * 4N-sized allocation. | ||
2462 | * | ||
2463 | * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or | ||
2464 | * larger the current charge batch is than that. | ||
2465 | */ | ||
2466 | penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH; | ||
2467 | |||
2468 | /* | ||
2469 | * Clamp the max delay per usermode return so as to still keep the | ||
2470 | * application moving forwards and also permit diagnostics, albeit | ||
2471 | * extremely slowly. | ||
2472 | */ | ||
2473 | penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES); | ||
2474 | |||
2475 | /* | ||
2476 | * Don't sleep if the amount of jiffies this memcg owes us is so low | ||
2477 | * that it's not even worth doing, in an attempt to be nice to those who | ||
2478 | * go only a small amount over their memory.high value and maybe haven't | ||
2479 | * been aggressively reclaimed enough yet. | ||
2480 | */ | ||
2481 | if (penalty_jiffies <= HZ / 100) | ||
2482 | goto out; | ||
2483 | |||
2484 | /* | ||
2485 | * If we exit early, we're guaranteed to die (since | ||
2486 | * schedule_timeout_killable sets TASK_KILLABLE). This means we don't | ||
2487 | * need to account for any ill-begotten jiffies to pay them off later. | ||
2488 | */ | ||
2489 | psi_memstall_enter(&pflags); | ||
2490 | schedule_timeout_killable(penalty_jiffies); | ||
2491 | psi_memstall_leave(&pflags); | ||
2492 | |||
2493 | out: | ||
2494 | css_put(&memcg->css); | ||
2377 | } | 2495 | } |
2378 | 2496 | ||
2379 | static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | 2497 | static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, |
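The mem_cgroup_handle_over_high() hunk above is where the new throttling bites: once usage exceeds memory.high, the squared, precision-shifted overage is turned into a sleep. A standalone sketch that reproduces the delay table above, assuming HZ=1000 (one jiffy per millisecond) and a full MEMCG_CHARGE_BATCH charge so the per-task scaling factor is 1:

#include <stdio.h>

#define HZ				1000ULL	/* assumption: CONFIG_HZ=1000 */
#define MEMCG_DELAY_PRECISION_SHIFT	20
#define MEMCG_DELAY_SCALING_SHIFT	14
#define MEMCG_MAX_HIGH_DELAY_JIFFIES	(2 * HZ)

static unsigned long long penalty_jiffies(unsigned long long usage,
					  unsigned long long high)
{
	unsigned long long overage, penalty;

	if (usage <= high)
		return 0;
	if (!high)			/* mirror the clamped_high guard */
		high = 1;

	overage = ((usage - high) << MEMCG_DELAY_PRECISION_SHIFT) / high;
	penalty = (overage * overage * HZ) >>
		  (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);

	return penalty < MEMCG_MAX_HIGH_DELAY_JIFFIES ?
	       penalty : MEMCG_MAX_HIGH_DELAY_JIFFIES;
}

int main(void)
{
	unsigned long long mb, high = 100ULL << 20;	/* memory.high = 100M */

	/* prints 0, 159, 639, 1439 and the 2000ms clamp, matching the table */
	for (mb = 100; mb <= 120; mb += 5)
		printf("usage %3lluM -> %llu ms\n", mb,
		       penalty_jiffies(mb << 20, high));
	return 0;
}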
@@ -3512,6 +3630,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, | |||
3512 | ret = mem_cgroup_resize_max(memcg, nr_pages, true); | 3630 | ret = mem_cgroup_resize_max(memcg, nr_pages, true); |
3513 | break; | 3631 | break; |
3514 | case _KMEM: | 3632 | case _KMEM: |
3633 | pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " | ||
3634 | "Please report your usecase to linux-mm@kvack.org if you " | ||
3635 | "depend on this functionality.\n"); | ||
3515 | ret = memcg_update_kmem_max(memcg, nr_pages); | 3636 | ret = memcg_update_kmem_max(memcg, nr_pages); |
3516 | break; | 3637 | break; |
3517 | case _TCP: | 3638 | case _TCP: |
@@ -4805,11 +4926,6 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) | |||
4805 | } | 4926 | } |
4806 | } | 4927 | } |
4807 | 4928 | ||
4808 | static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) | ||
4809 | { | ||
4810 | mem_cgroup_id_get_many(memcg, 1); | ||
4811 | } | ||
4812 | |||
4813 | static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) | 4929 | static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) |
4814 | { | 4930 | { |
4815 | mem_cgroup_id_put_many(memcg, 1); | 4931 | mem_cgroup_id_put_many(memcg, 1); |
@@ -4955,6 +5071,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void) | |||
4955 | memcg->cgwb_frn[i].done = | 5071 | memcg->cgwb_frn[i].done = |
4956 | __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); | 5072 | __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); |
4957 | #endif | 5073 | #endif |
5074 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
5075 | spin_lock_init(&memcg->deferred_split_queue.split_queue_lock); | ||
5076 | INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); | ||
5077 | memcg->deferred_split_queue.split_queue_len = 0; | ||
5078 | #endif | ||
4958 | idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); | 5079 | idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); |
4959 | return memcg; | 5080 | return memcg; |
4960 | fail: | 5081 | fail: |
@@ -5333,6 +5454,14 @@ static int mem_cgroup_move_account(struct page *page, | |||
5333 | __mod_memcg_state(to, NR_WRITEBACK, nr_pages); | 5454 | __mod_memcg_state(to, NR_WRITEBACK, nr_pages); |
5334 | } | 5455 | } |
5335 | 5456 | ||
5457 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
5458 | if (compound && !list_empty(page_deferred_list(page))) { | ||
5459 | spin_lock(&from->deferred_split_queue.split_queue_lock); | ||
5460 | list_del_init(page_deferred_list(page)); | ||
5461 | from->deferred_split_queue.split_queue_len--; | ||
5462 | spin_unlock(&from->deferred_split_queue.split_queue_lock); | ||
5463 | } | ||
5464 | #endif | ||
5336 | /* | 5465 | /* |
5337 | * It is safe to change page->mem_cgroup here because the page | 5466 | * It is safe to change page->mem_cgroup here because the page |
5338 | * is referenced, charged, and isolated - we can't race with | 5467 | * is referenced, charged, and isolated - we can't race with |
@@ -5341,6 +5470,17 @@ static int mem_cgroup_move_account(struct page *page, | |||
5341 | 5470 | ||
5342 | /* caller should have done css_get */ | 5471 | /* caller should have done css_get */ |
5343 | page->mem_cgroup = to; | 5472 | page->mem_cgroup = to; |
5473 | |||
5474 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
5475 | if (compound && list_empty(page_deferred_list(page))) { | ||
5476 | spin_lock(&to->deferred_split_queue.split_queue_lock); | ||
5477 | list_add_tail(page_deferred_list(page), | ||
5478 | &to->deferred_split_queue.split_queue); | ||
5479 | to->deferred_split_queue.split_queue_len++; | ||
5480 | spin_unlock(&to->deferred_split_queue.split_queue_lock); | ||
5481 | } | ||
5482 | #endif | ||
5483 | |||
5344 | spin_unlock_irqrestore(&from->move_lock, flags); | 5484 | spin_unlock_irqrestore(&from->move_lock, flags); |
5345 | 5485 | ||
5346 | ret = 0; | 5486 | ret = 0; |
@@ -6511,7 +6651,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) | |||
6511 | unsigned int nr_pages = 1; | 6651 | unsigned int nr_pages = 1; |
6512 | 6652 | ||
6513 | if (PageTransHuge(page)) { | 6653 | if (PageTransHuge(page)) { |
6514 | nr_pages <<= compound_order(page); | 6654 | nr_pages = compound_nr(page); |
6515 | ug->nr_huge += nr_pages; | 6655 | ug->nr_huge += nr_pages; |
6516 | } | 6656 | } |
6517 | if (PageAnon(page)) | 6657 | if (PageAnon(page)) |
@@ -6523,7 +6663,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) | |||
6523 | } | 6663 | } |
6524 | ug->pgpgout++; | 6664 | ug->pgpgout++; |
6525 | } else { | 6665 | } else { |
6526 | ug->nr_kmem += 1 << compound_order(page); | 6666 | ug->nr_kmem += compound_nr(page); |
6527 | __ClearPageKmemcg(page); | 6667 | __ClearPageKmemcg(page); |
6528 | } | 6668 | } |
6529 | 6669 | ||
diff --git a/mm/memfd.c b/mm/memfd.c index 650e65a46b9c..2647c898990c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c | |||
@@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas) | |||
39 | xas_for_each(xas, page, ULONG_MAX) { | 39 | xas_for_each(xas, page, ULONG_MAX) { |
40 | if (xa_is_value(page)) | 40 | if (xa_is_value(page)) |
41 | continue; | 41 | continue; |
42 | page = find_subpage(page, xas->xa_index); | ||
42 | if (page_count(page) - page_mapcount(page) > 1) | 43 | if (page_count(page) - page_mapcount(page) > 1) |
43 | xas_set_mark(xas, MEMFD_TAG_PINNED); | 44 | xas_set_mark(xas, MEMFD_TAG_PINNED); |
44 | 45 | ||
@@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) | |||
88 | bool clear = true; | 89 | bool clear = true; |
89 | if (xa_is_value(page)) | 90 | if (xa_is_value(page)) |
90 | continue; | 91 | continue; |
92 | page = find_subpage(page, xas.xa_index); | ||
91 | if (page_count(page) - page_mapcount(page) != 1) { | 93 | if (page_count(page) - page_mapcount(page) != 1) { |
92 | /* | 94 | /* |
93 | * On the last scan, we clean up all those tags | 95 | * On the last scan, we clean up all those tags |
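Both memfd hunks now call find_subpage() so the pin check looks at the tail page a given index maps to rather than the compound head. A toy model of that index-to-subpage mapping, not the kernel helper, assuming the compound page occupies 2^order consecutive, naturally aligned xarray slots:

#include <stdio.h>

int main(void)
{
	/* a 2MiB THP in the page cache: head index 512, 2^9 = 512 slots */
	unsigned long head_index = 512;
	unsigned long nr = 1UL << 9;
	unsigned long index;

	/* every index in [head, head + nr) maps to head + (index & (nr - 1)) */
	for (index = head_index; index < head_index + 4; index++)
		printf("xarray index %lu -> subpage %lu of the compound page\n",
		       index, index & (nr - 1));
	return 0;
}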
diff --git a/mm/memory.c b/mm/memory.c index b1dff75640b7..b1ca51a079f2 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -518,7 +518,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
518 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); | 518 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); |
519 | if (page) | 519 | if (page) |
520 | dump_page(page, "bad pte"); | 520 | dump_page(page, "bad pte"); |
521 | pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", | 521 | pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", |
522 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); | 522 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); |
523 | pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n", | 523 | pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n", |
524 | vma->vm_file, | 524 | vma->vm_file, |
@@ -1026,6 +1026,9 @@ again: | |||
1026 | if (pte_none(ptent)) | 1026 | if (pte_none(ptent)) |
1027 | continue; | 1027 | continue; |
1028 | 1028 | ||
1029 | if (need_resched()) | ||
1030 | break; | ||
1031 | |||
1029 | if (pte_present(ptent)) { | 1032 | if (pte_present(ptent)) { |
1030 | struct page *page; | 1033 | struct page *page; |
1031 | 1034 | ||
@@ -1093,7 +1096,6 @@ again: | |||
1093 | if (unlikely(details)) | 1096 | if (unlikely(details)) |
1094 | continue; | 1097 | continue; |
1095 | 1098 | ||
1096 | entry = pte_to_swp_entry(ptent); | ||
1097 | if (!non_swap_entry(entry)) | 1099 | if (!non_swap_entry(entry)) |
1098 | rss[MM_SWAPENTS]--; | 1100 | rss[MM_SWAPENTS]--; |
1099 | else if (is_migration_entry(entry)) { | 1101 | else if (is_migration_entry(entry)) { |
@@ -1124,8 +1126,11 @@ again: | |||
1124 | if (force_flush) { | 1126 | if (force_flush) { |
1125 | force_flush = 0; | 1127 | force_flush = 0; |
1126 | tlb_flush_mmu(tlb); | 1128 | tlb_flush_mmu(tlb); |
1127 | if (addr != end) | 1129 | } |
1128 | goto again; | 1130 | |
1131 | if (addr != end) { | ||
1132 | cond_resched(); | ||
1133 | goto again; | ||
1129 | } | 1134 | } |
1130 | 1135 | ||
1131 | return addr; | 1136 | return addr; |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c73f09913165..b1be791f772d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -632,33 +632,30 @@ static void generic_online_page(struct page *page, unsigned int order) | |||
632 | #endif | 632 | #endif |
633 | } | 633 | } |
634 | 634 | ||
635 | static int online_pages_blocks(unsigned long start, unsigned long nr_pages) | ||
636 | { | ||
637 | unsigned long end = start + nr_pages; | ||
638 | int order, onlined_pages = 0; | ||
639 | |||
640 | while (start < end) { | ||
641 | order = min(MAX_ORDER - 1, | ||
642 | get_order(PFN_PHYS(end) - PFN_PHYS(start))); | ||
643 | (*online_page_callback)(pfn_to_page(start), order); | ||
644 | |||
645 | onlined_pages += (1UL << order); | ||
646 | start += (1UL << order); | ||
647 | } | ||
648 | return onlined_pages; | ||
649 | } | ||
650 | |||
651 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | 635 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, |
652 | void *arg) | 636 | void *arg) |
653 | { | 637 | { |
654 | unsigned long onlined_pages = *(unsigned long *)arg; | 638 | const unsigned long end_pfn = start_pfn + nr_pages; |
639 | unsigned long pfn; | ||
640 | int order; | ||
655 | 641 | ||
656 | if (PageReserved(pfn_to_page(start_pfn))) | 642 | /* |
657 | onlined_pages += online_pages_blocks(start_pfn, nr_pages); | 643 | * Online the pages. The callback might decide to keep some pages |
644 | * PG_reserved (to add them to the buddy later), but we still account | ||
645 | * them as being online/belonging to this zone ("present"). | ||
646 | */ | ||
647 | for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) { | ||
648 | order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn))); | ||
649 | /* __free_pages_core() wants pfns to be aligned to the order */ | ||
650 | if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order))) | ||
651 | order = 0; | ||
652 | (*online_page_callback)(pfn_to_page(pfn), order); | ||
653 | } | ||
658 | 654 | ||
659 | online_mem_sections(start_pfn, start_pfn + nr_pages); | 655 | /* mark all involved sections as online */ |
656 | online_mem_sections(start_pfn, end_pfn); | ||
660 | 657 | ||
661 | *(unsigned long *)arg = onlined_pages; | 658 | *(unsigned long *)arg += nr_pages; |
662 | return 0; | 659 | return 0; |
663 | } | 660 | } |
664 | 661 | ||
@@ -714,8 +711,13 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon | |||
714 | pgdat->node_start_pfn = start_pfn; | 711 | pgdat->node_start_pfn = start_pfn; |
715 | 712 | ||
716 | pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; | 713 | pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; |
717 | } | ||
718 | 714 | ||
715 | } | ||
716 | /* | ||
717 | * Associate the pfn range with the given zone, initializing the memmaps | ||
718 | * and resizing the pgdat/zone data to span the added pages. After this | ||
719 | * call, all affected pages are PG_reserved. | ||
720 | */ | ||
719 | void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, | 721 | void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, |
720 | unsigned long nr_pages, struct vmem_altmap *altmap) | 722 | unsigned long nr_pages, struct vmem_altmap *altmap) |
721 | { | 723 | { |
@@ -804,20 +806,6 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, | |||
804 | return default_zone_for_pfn(nid, start_pfn, nr_pages); | 806 | return default_zone_for_pfn(nid, start_pfn, nr_pages); |
805 | } | 807 | } |
806 | 808 | ||
807 | /* | ||
808 | * Associates the given pfn range with the given node and the zone appropriate | ||
809 | * for the given online type. | ||
810 | */ | ||
811 | static struct zone * __meminit move_pfn_range(int online_type, int nid, | ||
812 | unsigned long start_pfn, unsigned long nr_pages) | ||
813 | { | ||
814 | struct zone *zone; | ||
815 | |||
816 | zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); | ||
817 | move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL); | ||
818 | return zone; | ||
819 | } | ||
820 | |||
821 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) | 809 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) |
822 | { | 810 | { |
823 | unsigned long flags; | 811 | unsigned long flags; |
@@ -840,7 +828,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
840 | put_device(&mem->dev); | 828 | put_device(&mem->dev); |
841 | 829 | ||
842 | /* associate pfn range with the zone */ | 830 | /* associate pfn range with the zone */ |
843 | zone = move_pfn_range(online_type, nid, pfn, nr_pages); | 831 | zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); |
832 | move_pfn_range_to_zone(zone, pfn, nr_pages, NULL); | ||
844 | 833 | ||
845 | arg.start_pfn = pfn; | 834 | arg.start_pfn = pfn; |
846 | arg.nr_pages = nr_pages; | 835 | arg.nr_pages = nr_pages; |
@@ -864,6 +853,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
864 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, | 853 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, |
865 | online_pages_range); | 854 | online_pages_range); |
866 | if (ret) { | 855 | if (ret) { |
856 | /* not a single memory resource was applicable */ | ||
867 | if (need_zonelists_rebuild) | 857 | if (need_zonelists_rebuild) |
868 | zone_pcp_reset(zone); | 858 | zone_pcp_reset(zone); |
869 | goto failed_addition; | 859 | goto failed_addition; |
@@ -877,27 +867,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
877 | 867 | ||
878 | shuffle_zone(zone); | 868 | shuffle_zone(zone); |
879 | 869 | ||
880 | if (onlined_pages) { | 870 | node_states_set_node(nid, &arg); |
881 | node_states_set_node(nid, &arg); | 871 | if (need_zonelists_rebuild) |
882 | if (need_zonelists_rebuild) | 872 | build_all_zonelists(NULL); |
883 | build_all_zonelists(NULL); | 873 | else |
884 | else | 874 | zone_pcp_update(zone); |
885 | zone_pcp_update(zone); | ||
886 | } | ||
887 | 875 | ||
888 | init_per_zone_wmark_min(); | 876 | init_per_zone_wmark_min(); |
889 | 877 | ||
890 | if (onlined_pages) { | 878 | kswapd_run(nid); |
891 | kswapd_run(nid); | 879 | kcompactd_run(nid); |
892 | kcompactd_run(nid); | ||
893 | } | ||
894 | 880 | ||
895 | vm_total_pages = nr_free_pagecache_pages(); | 881 | vm_total_pages = nr_free_pagecache_pages(); |
896 | 882 | ||
897 | writeback_set_ratelimit(); | 883 | writeback_set_ratelimit(); |
898 | 884 | ||
899 | if (onlined_pages) | 885 | memory_notify(MEM_ONLINE, &arg); |
900 | memory_notify(MEM_ONLINE, &arg); | ||
901 | mem_hotplug_done(); | 886 | mem_hotplug_done(); |
902 | return 0; | 887 | return 0; |
903 | 888 | ||
@@ -933,8 +918,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
933 | if (!pgdat) | 918 | if (!pgdat) |
934 | return NULL; | 919 | return NULL; |
935 | 920 | ||
921 | pgdat->per_cpu_nodestats = | ||
922 | alloc_percpu(struct per_cpu_nodestat); | ||
936 | arch_refresh_nodedata(nid, pgdat); | 923 | arch_refresh_nodedata(nid, pgdat); |
937 | } else { | 924 | } else { |
925 | int cpu; | ||
938 | /* | 926 | /* |
939 | * Reset the nr_zones, order and classzone_idx before reuse. | 927 | * Reset the nr_zones, order and classzone_idx before reuse. |
940 | * Note that kswapd will init kswapd_classzone_idx properly | 928 | * Note that kswapd will init kswapd_classzone_idx properly |
@@ -943,6 +931,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
943 | pgdat->nr_zones = 0; | 931 | pgdat->nr_zones = 0; |
944 | pgdat->kswapd_order = 0; | 932 | pgdat->kswapd_order = 0; |
945 | pgdat->kswapd_classzone_idx = 0; | 933 | pgdat->kswapd_classzone_idx = 0; |
934 | for_each_online_cpu(cpu) { | ||
935 | struct per_cpu_nodestat *p; | ||
936 | |||
937 | p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu); | ||
938 | memset(p, 0, sizeof(*p)); | ||
939 | } | ||
946 | } | 940 | } |
947 | 941 | ||
948 | /* we can use NODE_DATA(nid) from here */ | 942 | /* we can use NODE_DATA(nid) from here */ |
@@ -952,7 +946,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
952 | 946 | ||
953 | /* init node's zones as empty zones, we don't have any present pages.*/ | 947 | /* init node's zones as empty zones, we don't have any present pages.*/ |
954 | free_area_init_core_hotplug(nid); | 948 | free_area_init_core_hotplug(nid); |
955 | pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); | ||
956 | 949 | ||
957 | /* | 950 | /* |
958 | * The node we allocated has no zone fallback lists. For avoiding | 951 | * The node we allocated has no zone fallback lists. For avoiding |
@@ -1309,7 +1302,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) | |||
1309 | head = compound_head(page); | 1302 | head = compound_head(page); |
1310 | if (page_huge_active(head)) | 1303 | if (page_huge_active(head)) |
1311 | return pfn; | 1304 | return pfn; |
1312 | skip = (1 << compound_order(head)) - (page - head); | 1305 | skip = compound_nr(head) - (page - head); |
1313 | pfn += skip - 1; | 1306 | pfn += skip - 1; |
1314 | } | 1307 | } |
1315 | return 0; | 1308 | return 0; |
@@ -1347,7 +1340,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1347 | 1340 | ||
1348 | if (PageHuge(page)) { | 1341 | if (PageHuge(page)) { |
1349 | struct page *head = compound_head(page); | 1342 | struct page *head = compound_head(page); |
1350 | pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; | 1343 | pfn = page_to_pfn(head) + compound_nr(head) - 1; |
1351 | isolate_huge_page(head, &source); | 1344 | isolate_huge_page(head, &source); |
1352 | continue; | 1345 | continue; |
1353 | } else if (PageTransHuge(page)) | 1346 | } else if (PageTransHuge(page)) |
@@ -1662,7 +1655,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) | |||
1662 | phys_addr_t beginpa, endpa; | 1655 | phys_addr_t beginpa, endpa; |
1663 | 1656 | ||
1664 | beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); | 1657 | beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); |
1665 | endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; | 1658 | endpa = beginpa + memory_block_size_bytes() - 1; |
1666 | pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", | 1659 | pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", |
1667 | &beginpa, &endpa); | 1660 | &beginpa, &endpa); |
1668 | 1661 | ||
@@ -1800,7 +1793,7 @@ void __remove_memory(int nid, u64 start, u64 size) | |||
1800 | { | 1793 | { |
1801 | 1794 | ||
1802 | /* | 1795 | /* |
1803 | * trigger BUG() is some memory is not offlined prior to calling this | 1796 | * trigger BUG() if some memory is not offlined prior to calling this |
1804 | * function | 1797 | * function |
1805 | */ | 1798 | */ |
1806 | if (try_remove_memory(nid, start, size)) | 1799 | if (try_remove_memory(nid, start, size)) |
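The rewritten online_pages_range() above walks the range in the largest buddy-sized chunks it can, falling back to order 0 whenever a pfn is not aligned to the chosen order. A userspace simulation of that chunking, assuming 4KiB pages and MAX_ORDER of 11; get_order() here is a plain reimplementation for the sketch:

#include <stdio.h>

#define MAX_ORDER	11
#define PAGE_SHIFT	12

/* plain reimplementation of get_order(): smallest order covering @size bytes */
static int get_order(unsigned long size)
{
	int order = 0;

	size = (size - 1) >> PAGE_SHIFT;
	while (size) {
		order++;
		size >>= 1;
	}
	return order;
}

int main(void)
{
	unsigned long start_pfn = 0x8000;	/* section-aligned, as onlining requires */
	unsigned long nr_pages = 0x8000;	/* 32768 pages = 128MiB of 4KiB pages */
	unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn;
	int order = 0;

	for (pfn = start_pfn; pfn < end_pfn; pfn += 1UL << order) {
		order = get_order((end_pfn - pfn) << PAGE_SHIFT);
		if (order > MAX_ORDER - 1)
			order = MAX_ORDER - 1;
		if (pfn & ((1UL << order) - 1))	/* misaligned pfn: fall back */
			order = 0;
		printf("online pfn %#lx as an order-%d block\n", pfn, order);
	}
	return 0;
}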
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f000771558d8..464406e8da91 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1512,10 +1512,6 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, | |||
1512 | if (nodes_empty(*new)) | 1512 | if (nodes_empty(*new)) |
1513 | goto out_put; | 1513 | goto out_put; |
1514 | 1514 | ||
1515 | nodes_and(*new, *new, node_states[N_MEMORY]); | ||
1516 | if (nodes_empty(*new)) | ||
1517 | goto out_put; | ||
1518 | |||
1519 | err = security_task_movememory(task); | 1515 | err = security_task_movememory(task); |
1520 | if (err) | 1516 | if (err) |
1521 | goto out_put; | 1517 | goto out_put; |
diff --git a/mm/migrate.c b/mm/migrate.c index 9f4ed4e985c1..73d476d690b1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -460,7 +460,7 @@ int migrate_page_move_mapping(struct address_space *mapping, | |||
460 | 460 | ||
461 | for (i = 1; i < HPAGE_PMD_NR; i++) { | 461 | for (i = 1; i < HPAGE_PMD_NR; i++) { |
462 | xas_next(&xas); | 462 | xas_next(&xas); |
463 | xas_store(&xas, newpage + i); | 463 | xas_store(&xas, newpage); |
464 | } | 464 | } |
465 | } | 465 | } |
466 | 466 | ||
@@ -1892,7 +1892,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | |||
1892 | VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); | 1892 | VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); |
1893 | 1893 | ||
1894 | /* Avoid migrating to a node that is nearly full */ | 1894 | /* Avoid migrating to a node that is nearly full */ |
1895 | if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) | 1895 | if (!migrate_balanced_pgdat(pgdat, compound_nr(page))) |
1896 | return 0; | 1896 | return 0; |
1897 | 1897 | ||
1898 | if (isolate_lru_page(page)) | 1898 | if (isolate_lru_page(page)) |
@@ -2218,17 +2218,15 @@ again: | |||
2218 | pte_t pte; | 2218 | pte_t pte; |
2219 | 2219 | ||
2220 | pte = *ptep; | 2220 | pte = *ptep; |
2221 | pfn = pte_pfn(pte); | ||
2222 | 2221 | ||
2223 | if (pte_none(pte)) { | 2222 | if (pte_none(pte)) { |
2224 | mpfn = MIGRATE_PFN_MIGRATE; | 2223 | mpfn = MIGRATE_PFN_MIGRATE; |
2225 | migrate->cpages++; | 2224 | migrate->cpages++; |
2226 | pfn = 0; | ||
2227 | goto next; | 2225 | goto next; |
2228 | } | 2226 | } |
2229 | 2227 | ||
2230 | if (!pte_present(pte)) { | 2228 | if (!pte_present(pte)) { |
2231 | mpfn = pfn = 0; | 2229 | mpfn = 0; |
2232 | 2230 | ||
2233 | /* | 2231 | /* |
2234 | * Only care about unaddressable device page special | 2232 | * Only care about unaddressable device page special |
@@ -2245,10 +2243,10 @@ again: | |||
2245 | if (is_write_device_private_entry(entry)) | 2243 | if (is_write_device_private_entry(entry)) |
2246 | mpfn |= MIGRATE_PFN_WRITE; | 2244 | mpfn |= MIGRATE_PFN_WRITE; |
2247 | } else { | 2245 | } else { |
2246 | pfn = pte_pfn(pte); | ||
2248 | if (is_zero_pfn(pfn)) { | 2247 | if (is_zero_pfn(pfn)) { |
2249 | mpfn = MIGRATE_PFN_MIGRATE; | 2248 | mpfn = MIGRATE_PFN_MIGRATE; |
2250 | migrate->cpages++; | 2249 | migrate->cpages++; |
2251 | pfn = 0; | ||
2252 | goto next; | 2250 | goto next; |
2253 | } | 2251 | } |
2254 | page = vm_normal_page(migrate->vma, addr, pte); | 2252 | page = vm_normal_page(migrate->vma, addr, pte); |
@@ -2258,10 +2256,9 @@ again: | |||
2258 | 2256 | ||
2259 | /* FIXME support THP */ | 2257 | /* FIXME support THP */ |
2260 | if (!page || !page->mapping || PageTransCompound(page)) { | 2258 | if (!page || !page->mapping || PageTransCompound(page)) { |
2261 | mpfn = pfn = 0; | 2259 | mpfn = 0; |
2262 | goto next; | 2260 | goto next; |
2263 | } | 2261 | } |
2264 | pfn = page_to_pfn(page); | ||
2265 | 2262 | ||
2266 | /* | 2263 | /* |
2267 | * By getting a reference on the page we pin it and that blocks | 2264 | * By getting a reference on the page we pin it and that blocks |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -1358,6 +1358,9 @@ static inline u64 file_mmap_size_max(struct file *file, struct inode *inode) | |||
1358 | if (S_ISBLK(inode->i_mode)) | 1358 | if (S_ISBLK(inode->i_mode)) |
1359 | return MAX_LFS_FILESIZE; | 1359 | return MAX_LFS_FILESIZE; |
1360 | 1360 | ||
1361 | if (S_ISSOCK(inode->i_mode)) | ||
1362 | return MAX_LFS_FILESIZE; | ||
1363 | |||
1361 | /* Special "we do even unsigned file positions" case */ | 1364 | /* Special "we do even unsigned file positions" case */ |
1362 | if (file->f_mode & FMODE_UNSIGNED_OFFSET) | 1365 | if (file->f_mode & FMODE_UNSIGNED_OFFSET) |
1363 | return 0; | 1366 | return 0; |
@@ -2274,12 +2277,9 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr, | |||
2274 | if (vma) { | 2277 | if (vma) { |
2275 | *pprev = vma->vm_prev; | 2278 | *pprev = vma->vm_prev; |
2276 | } else { | 2279 | } else { |
2277 | struct rb_node *rb_node = mm->mm_rb.rb_node; | 2280 | struct rb_node *rb_node = rb_last(&mm->mm_rb); |
2278 | *pprev = NULL; | 2281 | |
2279 | while (rb_node) { | 2282 | *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL; |
2280 | *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb); | ||
2281 | rb_node = rb_node->rb_right; | ||
2282 | } | ||
2283 | } | 2283 | } |
2284 | return vma; | 2284 | return vma; |
2285 | } | 2285 | } |
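find_vma_prev() now uses rb_last() instead of open-coding the walk to the rightmost node of mm->mm_rb. The two are equivalent; a sketch on a plain binary tree (not the kernel rbtree type) of the loop rb_last() encapsulates:

#include <stdio.h>
#include <stddef.h>

struct node {
	unsigned long start;
	struct node *left, *right;
};

/* keep taking the right child to reach the maximum element, which for
 * mm->mm_rb is the highest-addressed VMA */
static struct node *tree_last(struct node *root)
{
	if (!root)
		return NULL;
	while (root->right)
		root = root->right;
	return root;
}

int main(void)
{
	struct node high = { 0x7ffff000UL, NULL, NULL };
	struct node mid  = { 0x601000UL, NULL, &high };
	struct node root = { 0x400000UL, NULL, &mid };

	printf("last vma starts at %#lx\n", tree_last(&root)->start);
	return 0;
}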
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 8c943a6e1696..7d70e5c78f97 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c | |||
@@ -271,8 +271,6 @@ void tlb_finish_mmu(struct mmu_gather *tlb, | |||
271 | 271 | ||
272 | tlb_flush_mmu(tlb); | 272 | tlb_flush_mmu(tlb); |
273 | 273 | ||
274 | /* keep the page table cache within bounds */ | ||
275 | check_pgt_cache(); | ||
276 | #ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER | 274 | #ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER |
277 | tlb_batch_list_free(tlb); | 275 | tlb_batch_list_free(tlb); |
278 | #endif | 276 | #endif |
diff --git a/mm/nommu.c b/mm/nommu.c index fed1b6e9c89b..99b7ec318824 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -108,7 +108,7 @@ unsigned int kobjsize(const void *objp) | |||
108 | * The ksize() function is only guaranteed to work for pointers | 108 | * The ksize() function is only guaranteed to work for pointers |
109 | * returned by kmalloc(). So handle arbitrary pointers here. | 109 | * returned by kmalloc(). So handle arbitrary pointers here. |
110 | */ | 110 | */ |
111 | return PAGE_SIZE << compound_order(page); | 111 | return page_size(page); |
112 | } | 112 | } |
113 | 113 | ||
114 | /** | 114 | /** |
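Several hunks in this series (memcontrol, memory_hotplug, migrate, page_alloc, nommu) replace open-coded 1 << compound_order(page) and PAGE_SIZE << compound_order(page) with the compound_nr() and page_size() helpers. A small sketch of the arithmetic those helpers stand for, assuming 4KiB base pages:

#include <stdio.h>

#define PAGE_SIZE	4096UL		/* assumption: 4KiB base pages */

int main(void)
{
	unsigned int order;

	/*
	 * compound_nr(page) == 1UL << compound_order(page)
	 * page_size(page)   == PAGE_SIZE << compound_order(page)
	 */
	for (order = 0; order <= 9; order += 3)
		printf("order %u: %4lu pages, %8lu bytes\n", order,
		       1UL << order, PAGE_SIZE << order);
	return 0;
}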
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eda2e2a0bdc6..c1d9496b4c43 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -73,7 +73,7 @@ static inline bool is_memcg_oom(struct oom_control *oc) | |||
73 | /** | 73 | /** |
74 | * oom_cpuset_eligible() - check task eligibility for kill | 74 | * oom_cpuset_eligible() - check task eligibility for kill |
75 | * @start: task struct of which task to consider | 75 | * @start: task struct of which task to consider |
76 | * @mask: nodemask passed to page allocator for mempolicy ooms | 76 | * @oc: pointer to struct oom_control |
77 | * | 77 | * |
78 | * Task eligibility is determined by whether or not a candidate task, @tsk, | 78 | * Task eligibility is determined by whether or not a candidate task, @tsk, |
79 | * shares the same mempolicy nodes as current if it is bound by such a policy | 79 | * shares the same mempolicy nodes as current if it is bound by such a policy |
@@ -287,7 +287,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) | |||
287 | !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { | 287 | !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { |
288 | oc->totalpages = total_swap_pages; | 288 | oc->totalpages = total_swap_pages; |
289 | for_each_node_mask(nid, *oc->nodemask) | 289 | for_each_node_mask(nid, *oc->nodemask) |
290 | oc->totalpages += node_spanned_pages(nid); | 290 | oc->totalpages += node_present_pages(nid); |
291 | return CONSTRAINT_MEMORY_POLICY; | 291 | return CONSTRAINT_MEMORY_POLICY; |
292 | } | 292 | } |
293 | 293 | ||
@@ -300,7 +300,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) | |||
300 | if (cpuset_limited) { | 300 | if (cpuset_limited) { |
301 | oc->totalpages = total_swap_pages; | 301 | oc->totalpages = total_swap_pages; |
302 | for_each_node_mask(nid, cpuset_current_mems_allowed) | 302 | for_each_node_mask(nid, cpuset_current_mems_allowed) |
303 | oc->totalpages += node_spanned_pages(nid); | 303 | oc->totalpages += node_present_pages(nid); |
304 | return CONSTRAINT_CPUSET; | 304 | return CONSTRAINT_CPUSET; |
305 | } | 305 | } |
306 | return CONSTRAINT_NONE; | 306 | return CONSTRAINT_NONE; |
@@ -884,12 +884,13 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) | |||
884 | */ | 884 | */ |
885 | do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); | 885 | do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); |
886 | mark_oom_victim(victim); | 886 | mark_oom_victim(victim); |
887 | pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", | 887 | pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n", |
888 | message, task_pid_nr(victim), victim->comm, | 888 | message, task_pid_nr(victim), victim->comm, K(mm->total_vm), |
889 | K(victim->mm->total_vm), | 889 | K(get_mm_counter(mm, MM_ANONPAGES)), |
890 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), | 890 | K(get_mm_counter(mm, MM_FILEPAGES)), |
891 | K(get_mm_counter(victim->mm, MM_FILEPAGES)), | 891 | K(get_mm_counter(mm, MM_SHMEMPAGES)), |
892 | K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); | 892 | from_kuid(&init_user_ns, task_uid(victim)), |
893 | mm_pgtables_bytes(mm), victim->signal->oom_score_adj); | ||
893 | task_unlock(victim); | 894 | task_unlock(victim); |
894 | 895 | ||
895 | /* | 896 | /* |
@@ -1068,9 +1069,10 @@ bool out_of_memory(struct oom_control *oc) | |||
1068 | * The OOM killer does not compensate for IO-less reclaim. | 1069 | * The OOM killer does not compensate for IO-less reclaim. |
1069 | * pagefault_out_of_memory lost its gfp context so we have to | 1070 | * pagefault_out_of_memory lost its gfp context so we have to |
1070 | * make sure to exclude 0 mask - all other users should have at least | 1071 | * make sure to exclude 0 mask - all other users should have at least |
1071 | * ___GFP_DIRECT_RECLAIM to get here. | 1072 | * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to |
1073 | * invoke the OOM killer even if it is a GFP_NOFS allocation. | ||
1072 | */ | 1074 | */ |
1073 | if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) | 1075 | if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) |
1074 | return true; | 1076 | return true; |
1075 | 1077 | ||
1076 | /* | 1078 | /* |
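constrained_alloc() now sums node_present_pages() rather than node_spanned_pages(): spanned covers the node's whole pfn range including holes, while present counts only pages actually backed by memory, so the OOM totalpages estimate no longer overcounts on sparse nodes. A toy illustration of the difference, with made-up layout numbers:

#include <stdio.h>

int main(void)
{
	/* made-up sparse node: 1M pfns spanned, 64K of them are a hole */
	unsigned long start_pfn = 0x100000, end_pfn = 0x200000;
	unsigned long hole_pages = 0x10000;

	unsigned long spanned = end_pfn - start_pfn;	/* includes the hole */
	unsigned long present = spanned - hole_pages;	/* backed by real RAM */

	printf("spanned: %lu pages, present: %lu pages\n", spanned, present);
	return 0;
}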
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ff5484fdbdf9..3334a769eb91 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -670,6 +670,7 @@ out: | |||
670 | 670 | ||
671 | void free_compound_page(struct page *page) | 671 | void free_compound_page(struct page *page) |
672 | { | 672 | { |
673 | mem_cgroup_uncharge(page); | ||
673 | __free_pages_ok(page, compound_order(page)); | 674 | __free_pages_ok(page, compound_order(page)); |
674 | } | 675 | } |
675 | 676 | ||
@@ -3955,14 +3956,22 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, | |||
3955 | goto check_priority; | 3956 | goto check_priority; |
3956 | 3957 | ||
3957 | /* | 3958 | /* |
3959 | * compaction was skipped because there are not enough order-0 pages | ||
3960 | * to work with, so we retry only if it looks like reclaim can help. | ||
3961 | */ | ||
3962 | if (compaction_needs_reclaim(compact_result)) { | ||
3963 | ret = compaction_zonelist_suitable(ac, order, alloc_flags); | ||
3964 | goto out; | ||
3965 | } | ||
3966 | |||
3967 | /* | ||
3958 | * make sure the compaction wasn't deferred or didn't bail out early | 3968 | * make sure the compaction wasn't deferred or didn't bail out early |
3959 | * due to locks contention before we declare that we should give up. | 3969 | * due to locks contention before we declare that we should give up. |
3960 | * But do not retry if the given zonelist is not suitable for | 3970 | * But the next retry should use a higher priority if allowed, so |
3961 | * compaction. | 3971 | * we don't just keep bailing out endlessly. |
3962 | */ | 3972 | */ |
3963 | if (compaction_withdrawn(compact_result)) { | 3973 | if (compaction_withdrawn(compact_result)) { |
3964 | ret = compaction_zonelist_suitable(ac, order, alloc_flags); | 3974 | goto check_priority; |
3965 | goto out; | ||
3966 | } | 3975 | } |
3967 | 3976 | ||
3968 | /* | 3977 | /* |
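With the page_alloc.c hunk above, should_compact_retry() separates two situations that used to share a branch: a compaction skipped for lack of order-0 pages retries only if reclaim looks useful, while a withdrawn (deferred or contention-aborted) compaction now retries at a higher priority. The sketch below is a schematic user-space rendering of that decision order; the enum and the helper are invented for illustration and are not kernel symbols.

#include <stdio.h>

enum sketch_compact_result { NEEDS_RECLAIM, WITHDRAWN, FAILED };

/* Describe what the retry logic would do for each outcome. */
static const char *retry_action(enum sketch_compact_result result,
				int zonelist_suitable)
{
	if (result == NEEDS_RECLAIM)	/* skipped: too few order-0 pages */
		return zonelist_suitable ? "retry after reclaim" : "give up";
	if (result == WITHDRAWN)	/* deferred or lock contention */
		return "retry at a higher compaction priority";
	return "fall through to the costly-order checks";
}

int main(void)
{
	printf("%s\n", retry_action(NEEDS_RECLAIM, 1));
	printf("%s\n", retry_action(WITHDRAWN, 0));
	printf("%s\n", retry_action(FAILED, 0));
	return 0;
}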
@@ -6638,9 +6647,11 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages, | |||
6638 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 6647 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
6639 | static void pgdat_init_split_queue(struct pglist_data *pgdat) | 6648 | static void pgdat_init_split_queue(struct pglist_data *pgdat) |
6640 | { | 6649 | { |
6641 | spin_lock_init(&pgdat->split_queue_lock); | 6650 | struct deferred_split *ds_queue = &pgdat->deferred_split_queue; |
6642 | INIT_LIST_HEAD(&pgdat->split_queue); | 6651 | |
6643 | pgdat->split_queue_len = 0; | 6652 | spin_lock_init(&ds_queue->split_queue_lock); |
6653 | INIT_LIST_HEAD(&ds_queue->split_queue); | ||
6654 | ds_queue->split_queue_len = 0; | ||
6644 | } | 6655 | } |
6645 | #else | 6656 | #else |
6646 | static void pgdat_init_split_queue(struct pglist_data *pgdat) {} | 6657 | static void pgdat_init_split_queue(struct pglist_data *pgdat) {} |
@@ -8196,7 +8207,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, | |||
8196 | if (!hugepage_migration_supported(page_hstate(head))) | 8207 | if (!hugepage_migration_supported(page_hstate(head))) |
8197 | goto unmovable; | 8208 | goto unmovable; |
8198 | 8209 | ||
8199 | skip_pages = (1 << compound_order(head)) - (page - head); | 8210 | skip_pages = compound_nr(head) - (page - head); |
8200 | iter += skip_pages - 1; | 8211 | iter += skip_pages - 1; |
8201 | continue; | 8212 | continue; |
8202 | } | 8213 | } |
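The skip_pages computation above is one of many conversions in this series from 1 << compound_order(page) to compound_nr(page), which is just the number of base pages in a compound page. A small user-space sketch of the arithmetic, with the page pointer difference reduced to a plain index (assumed semantics, not kernel code):

#include <stdio.h>

/* compound_nr() of a head page is 1UL << compound_order() */
static unsigned long sketch_compound_nr(unsigned int order)
{
	return 1UL << order;
}

int main(void)
{
	unsigned int order = 9;		/* e.g. a 2MB hugepage with 4K base pages */
	unsigned long offset = 37;	/* page - head: position inside the compound page */
	unsigned long skip = sketch_compound_nr(order) - offset;

	/* the pageblock scan jumps over the remaining tail pages in one step */
	printf("pages in compound: %lu, pages to skip: %lu\n",
	       sketch_compound_nr(order), skip);
	return 0;
}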
diff --git a/mm/page_owner.c b/mm/page_owner.c index addcbb2ae4e4..dee931184788 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c | |||
@@ -24,6 +24,9 @@ struct page_owner { | |||
24 | short last_migrate_reason; | 24 | short last_migrate_reason; |
25 | gfp_t gfp_mask; | 25 | gfp_t gfp_mask; |
26 | depot_stack_handle_t handle; | 26 | depot_stack_handle_t handle; |
27 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
28 | depot_stack_handle_t free_handle; | ||
29 | #endif | ||
27 | }; | 30 | }; |
28 | 31 | ||
29 | static bool page_owner_disabled = true; | 32 | static bool page_owner_disabled = true; |
@@ -102,19 +105,6 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext) | |||
102 | return (void *)page_ext + page_owner_ops.offset; | 105 | return (void *)page_ext + page_owner_ops.offset; |
103 | } | 106 | } |
104 | 107 | ||
105 | void __reset_page_owner(struct page *page, unsigned int order) | ||
106 | { | ||
107 | int i; | ||
108 | struct page_ext *page_ext; | ||
109 | |||
110 | for (i = 0; i < (1 << order); i++) { | ||
111 | page_ext = lookup_page_ext(page + i); | ||
112 | if (unlikely(!page_ext)) | ||
113 | continue; | ||
114 | __clear_bit(PAGE_EXT_OWNER, &page_ext->flags); | ||
115 | } | ||
116 | } | ||
117 | |||
118 | static inline bool check_recursive_alloc(unsigned long *entries, | 108 | static inline bool check_recursive_alloc(unsigned long *entries, |
119 | unsigned int nr_entries, | 109 | unsigned int nr_entries, |
120 | unsigned long ip) | 110 | unsigned long ip) |
@@ -154,18 +144,50 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags) | |||
154 | return handle; | 144 | return handle; |
155 | } | 145 | } |
156 | 146 | ||
157 | static inline void __set_page_owner_handle(struct page_ext *page_ext, | 147 | void __reset_page_owner(struct page *page, unsigned int order) |
158 | depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask) | ||
159 | { | 148 | { |
149 | int i; | ||
150 | struct page_ext *page_ext; | ||
151 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
152 | depot_stack_handle_t handle = 0; | ||
160 | struct page_owner *page_owner; | 153 | struct page_owner *page_owner; |
161 | 154 | ||
162 | page_owner = get_page_owner(page_ext); | 155 | if (debug_pagealloc_enabled()) |
163 | page_owner->handle = handle; | 156 | handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); |
164 | page_owner->order = order; | 157 | #endif |
165 | page_owner->gfp_mask = gfp_mask; | ||
166 | page_owner->last_migrate_reason = -1; | ||
167 | 158 | ||
168 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); | 159 | for (i = 0; i < (1 << order); i++) { |
160 | page_ext = lookup_page_ext(page + i); | ||
161 | if (unlikely(!page_ext)) | ||
162 | continue; | ||
163 | __clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); | ||
164 | #ifdef CONFIG_DEBUG_PAGEALLOC | ||
165 | if (debug_pagealloc_enabled()) { | ||
166 | page_owner = get_page_owner(page_ext); | ||
167 | page_owner->free_handle = handle; | ||
168 | } | ||
169 | #endif | ||
170 | } | ||
171 | } | ||
172 | |||
173 | static inline void __set_page_owner_handle(struct page *page, | ||
174 | struct page_ext *page_ext, depot_stack_handle_t handle, | ||
175 | unsigned int order, gfp_t gfp_mask) | ||
176 | { | ||
177 | struct page_owner *page_owner; | ||
178 | int i; | ||
179 | |||
180 | for (i = 0; i < (1 << order); i++) { | ||
181 | page_owner = get_page_owner(page_ext); | ||
182 | page_owner->handle = handle; | ||
183 | page_owner->order = order; | ||
184 | page_owner->gfp_mask = gfp_mask; | ||
185 | page_owner->last_migrate_reason = -1; | ||
186 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); | ||
187 | __set_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags); | ||
188 | |||
189 | page_ext = lookup_page_ext(page + i); | ||
190 | } | ||
169 | } | 191 | } |
170 | 192 | ||
171 | noinline void __set_page_owner(struct page *page, unsigned int order, | 193 | noinline void __set_page_owner(struct page *page, unsigned int order, |
@@ -178,7 +200,7 @@ noinline void __set_page_owner(struct page *page, unsigned int order, | |||
178 | return; | 200 | return; |
179 | 201 | ||
180 | handle = save_stack(gfp_mask); | 202 | handle = save_stack(gfp_mask); |
181 | __set_page_owner_handle(page_ext, handle, order, gfp_mask); | 203 | __set_page_owner_handle(page, page_ext, handle, order, gfp_mask); |
182 | } | 204 | } |
183 | 205 | ||
184 | void __set_page_owner_migrate_reason(struct page *page, int reason) | 206 | void __set_page_owner_migrate_reason(struct page *page, int reason) |
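The page_owner hunks above split the old single flag into two: PAGE_EXT_OWNER records that owner information was ever written, while PAGE_EXT_OWNER_ACTIVE says the page is currently allocated; with CONFIG_DEBUG_PAGEALLOC a free-time stack handle is saved as well. The following user-space model shows only that state machine; the flag values, handle type and helper names are stand-ins.

#include <stdint.h>
#include <stdio.h>

#define SK_OWNER	0x1u	/* info recorded at least once */
#define SK_OWNER_ACTIVE	0x2u	/* page currently allocated */

struct sketch_owner {
	unsigned int flags;
	uint32_t alloc_handle;	/* stand-in for the stack depot handle */
	uint32_t free_handle;
};

static void on_alloc(struct sketch_owner *o, uint32_t handle)
{
	o->alloc_handle = handle;
	o->flags |= SK_OWNER | SK_OWNER_ACTIVE;
}

static void on_free(struct sketch_owner *o, uint32_t handle)
{
	o->flags &= ~SK_OWNER_ACTIVE;	/* SK_OWNER stays set: history is kept */
	o->free_handle = handle;	/* in the kernel, only with debug_pagealloc */
}

int main(void)
{
	struct sketch_owner o = { 0 };

	on_alloc(&o, 101);
	on_free(&o, 202);
	printf("tracked=%d active=%d alloc=%u free=%u\n",
	       !!(o.flags & SK_OWNER), !!(o.flags & SK_OWNER_ACTIVE),
	       o.alloc_handle, o.free_handle);
	return 0;
}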
@@ -204,8 +226,11 @@ void __split_page_owner(struct page *page, unsigned int order) | |||
204 | 226 | ||
205 | page_owner = get_page_owner(page_ext); | 227 | page_owner = get_page_owner(page_ext); |
206 | page_owner->order = 0; | 228 | page_owner->order = 0; |
207 | for (i = 1; i < (1 << order); i++) | 229 | for (i = 1; i < (1 << order); i++) { |
208 | __copy_page_owner(page, page + i); | 230 | page_ext = lookup_page_ext(page + i); |
231 | page_owner = get_page_owner(page_ext); | ||
232 | page_owner->order = 0; | ||
233 | } | ||
209 | } | 234 | } |
210 | 235 | ||
211 | void __copy_page_owner(struct page *oldpage, struct page *newpage) | 236 | void __copy_page_owner(struct page *oldpage, struct page *newpage) |
@@ -235,6 +260,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) | |||
235 | * the new page, which will be freed. | 260 | * the new page, which will be freed. |
236 | */ | 261 | */ |
237 | __set_bit(PAGE_EXT_OWNER, &new_ext->flags); | 262 | __set_bit(PAGE_EXT_OWNER, &new_ext->flags); |
263 | __set_bit(PAGE_EXT_OWNER_ACTIVE, &new_ext->flags); | ||
238 | } | 264 | } |
239 | 265 | ||
240 | void pagetypeinfo_showmixedcount_print(struct seq_file *m, | 266 | void pagetypeinfo_showmixedcount_print(struct seq_file *m, |
@@ -294,7 +320,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, | |||
294 | if (unlikely(!page_ext)) | 320 | if (unlikely(!page_ext)) |
295 | continue; | 321 | continue; |
296 | 322 | ||
297 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | 323 | if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) |
298 | continue; | 324 | continue; |
299 | 325 | ||
300 | page_owner = get_page_owner(page_ext); | 326 | page_owner = get_page_owner(page_ext); |
@@ -405,20 +431,36 @@ void __dump_page_owner(struct page *page) | |||
405 | mt = gfpflags_to_migratetype(gfp_mask); | 431 | mt = gfpflags_to_migratetype(gfp_mask); |
406 | 432 | ||
407 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { | 433 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { |
408 | pr_alert("page_owner info is not active (free page?)\n"); | 434 | pr_alert("page_owner info is not present (never set?)\n"); |
409 | return; | 435 | return; |
410 | } | 436 | } |
411 | 437 | ||
438 | if (test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) | ||
439 | pr_alert("page_owner tracks the page as allocated\n"); | ||
440 | else | ||
441 | pr_alert("page_owner tracks the page as freed\n"); | ||
442 | |||
443 | pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", | ||
444 | page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); | ||
445 | |||
412 | handle = READ_ONCE(page_owner->handle); | 446 | handle = READ_ONCE(page_owner->handle); |
413 | if (!handle) { | 447 | if (!handle) { |
414 | pr_alert("page_owner info is not active (free page?)\n"); | 448 | pr_alert("page_owner allocation stack trace missing\n"); |
415 | return; | 449 | } else { |
450 | nr_entries = stack_depot_fetch(handle, &entries); | ||
451 | stack_trace_print(entries, nr_entries, 0); | ||
416 | } | 452 | } |
417 | 453 | ||
418 | nr_entries = stack_depot_fetch(handle, &entries); | 454 | #ifdef CONFIG_DEBUG_PAGEALLOC |
419 | pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", | 455 | handle = READ_ONCE(page_owner->free_handle); |
420 | page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); | 456 | if (!handle) { |
421 | stack_trace_print(entries, nr_entries, 0); | 457 | pr_alert("page_owner free stack trace missing\n"); |
458 | } else { | ||
459 | nr_entries = stack_depot_fetch(handle, &entries); | ||
460 | pr_alert("page last free stack trace:\n"); | ||
461 | stack_trace_print(entries, nr_entries, 0); | ||
462 | } | ||
463 | #endif | ||
422 | 464 | ||
423 | if (page_owner->last_migrate_reason != -1) | 465 | if (page_owner->last_migrate_reason != -1) |
424 | pr_alert("page has been migrated, last migrate reason: %s\n", | 466 | pr_alert("page has been migrated, last migrate reason: %s\n", |
@@ -481,9 +523,23 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
481 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | 523 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) |
482 | continue; | 524 | continue; |
483 | 525 | ||
526 | /* | ||
527 | * Although we do have the info about past allocation of free | ||
528 | * pages, it's not relevant for current memory usage. | ||
529 | */ | ||
530 | if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags)) | ||
531 | continue; | ||
532 | |||
484 | page_owner = get_page_owner(page_ext); | 533 | page_owner = get_page_owner(page_ext); |
485 | 534 | ||
486 | /* | 535 | /* |
536 | * Don't print "tail" pages of high-order allocations as that | ||
537 | * would inflate the stats. | ||
538 | */ | ||
539 | if (!IS_ALIGNED(pfn, 1 << page_owner->order)) | ||
540 | continue; | ||
541 | |||
542 | /* | ||
487 | * Access to page_ext->handle isn't synchronous so we should | 543 | * Access to page_ext->handle isn't synchronous so we should |
488 | * be careful to access it. | 544 | * be careful to access it. |
489 | */ | 545 | */ |
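The read_page_owner() hunk above drops stale entries (owner recorded but no longer active) and tail pages of high-order allocations, which would otherwise be reported once per base page. The tail-page test is a plain pfn alignment check; a short sketch of that arithmetic under the same assumption that head pfns are naturally aligned:

#include <stdbool.h>
#include <stdio.h>

/* Same test as IS_ALIGNED(pfn, 1 << order): head pfns are naturally aligned. */
static bool is_head_pfn(unsigned long pfn, unsigned int order)
{
	unsigned long nr = 1UL << order;

	return (pfn & (nr - 1)) == 0;
}

int main(void)
{
	printf("%d\n", is_head_pfn(512, 9));	/* head of an order-9 block: reported */
	printf("%d\n", is_head_pfn(513, 9));	/* tail page: skipped */
	return 0;
}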
@@ -562,7 +618,8 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) | |||
562 | continue; | 618 | continue; |
563 | 619 | ||
564 | /* Found early allocated page */ | 620 | /* Found early allocated page */ |
565 | __set_page_owner_handle(page_ext, early_handle, 0, 0); | 621 | __set_page_owner_handle(page, page_ext, early_handle, |
622 | 0, 0); | ||
566 | count++; | 623 | count++; |
567 | } | 624 | } |
568 | cond_resched(); | 625 | cond_resched(); |
diff --git a/mm/page_poison.c b/mm/page_poison.c index 21d4f97cb49b..34b9181ee5d1 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c | |||
@@ -101,7 +101,7 @@ static void unpoison_page(struct page *page) | |||
101 | /* | 101 | /* |
102 | * Page poisoning when enabled poisons each and every page | 102 | * Page poisoning when enabled poisons each and every page |
103 | * that is freed to buddy. Thus no extra check is done to | 103 | * that is freed to buddy. Thus no extra check is done to |
104 | * see if a page was posioned. | 104 | * see if a page was poisoned. |
105 | */ | 105 | */ |
106 | check_poison_mem(addr, PAGE_SIZE); | 106 | check_poison_mem(addr, PAGE_SIZE); |
107 | kunmap_atomic(addr); | 107 | kunmap_atomic(addr); |
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 11df03e71288..eff4b4520c8d 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c | |||
@@ -153,8 +153,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) | |||
153 | 153 | ||
154 | if (unlikely(PageHuge(pvmw->page))) { | 154 | if (unlikely(PageHuge(pvmw->page))) { |
155 | /* when pud is not present, pte will be NULL */ | 155 | /* when pud is not present, pte will be NULL */ |
156 | pvmw->pte = huge_pte_offset(mm, pvmw->address, | 156 | pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page)); |
157 | PAGE_SIZE << compound_order(page)); | ||
158 | if (!pvmw->pte) | 157 | if (!pvmw->pte) |
159 | return false; | 158 | return false; |
160 | 159 | ||
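Several hunks in this section (page_vma_mapped, rmap, slob, slub) replace the open-coded PAGE_SIZE << compound_order(page) with page_size(page); for a head page the two are the same value. A one-line sketch of that identity, with PAGE_SIZE reduced to a stand-in constant:

#include <stdio.h>

#define SK_PAGE_SHIFT 12
#define SK_PAGE_SIZE  (1UL << SK_PAGE_SHIFT)

/* page_size() of a compound page of the given order */
static unsigned long sketch_page_size(unsigned int order)
{
	return SK_PAGE_SIZE << order;
}

int main(void)
{
	/* order 0: 4KB; order 9: 2MB on a 4K-page configuration */
	printf("%lu %lu\n", sketch_page_size(0), sketch_page_size(9));
	return 0;
}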
diff --git a/mm/quicklist.c b/mm/quicklist.c deleted file mode 100644 index 5e98ac78e410..000000000000 --- a/mm/quicklist.c +++ /dev/null | |||
@@ -1,103 +0,0 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Quicklist support. | ||
4 | * | ||
5 | * Quicklists are light weight lists of pages that have a defined state | ||
6 | * on alloc and free. Pages must be in the quicklist specific defined state | ||
7 | * (zero by default) when the page is freed. It seems that the initial idea | ||
8 | * for such lists first came from Dave Miller and then various other people | ||
9 | * improved on it. | ||
10 | * | ||
11 | * Copyright (C) 2007 SGI, | ||
12 | * Christoph Lameter <cl@linux.com> | ||
13 | * Generalized, added support for multiple lists and | ||
14 | * constructors / destructors. | ||
15 | */ | ||
16 | #include <linux/kernel.h> | ||
17 | |||
18 | #include <linux/gfp.h> | ||
19 | #include <linux/mm.h> | ||
20 | #include <linux/mmzone.h> | ||
21 | #include <linux/quicklist.h> | ||
22 | |||
23 | DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); | ||
24 | |||
25 | #define FRACTION_OF_NODE_MEM 16 | ||
26 | |||
27 | static unsigned long max_pages(unsigned long min_pages) | ||
28 | { | ||
29 | unsigned long node_free_pages, max; | ||
30 | int node = numa_node_id(); | ||
31 | struct zone *zones = NODE_DATA(node)->node_zones; | ||
32 | int num_cpus_on_node; | ||
33 | |||
34 | node_free_pages = | ||
35 | #ifdef CONFIG_ZONE_DMA | ||
36 | zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) + | ||
37 | #endif | ||
38 | #ifdef CONFIG_ZONE_DMA32 | ||
39 | zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) + | ||
40 | #endif | ||
41 | zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); | ||
42 | |||
43 | max = node_free_pages / FRACTION_OF_NODE_MEM; | ||
44 | |||
45 | num_cpus_on_node = cpumask_weight(cpumask_of_node(node)); | ||
46 | max /= num_cpus_on_node; | ||
47 | |||
48 | return max(max, min_pages); | ||
49 | } | ||
50 | |||
51 | static long min_pages_to_free(struct quicklist *q, | ||
52 | unsigned long min_pages, long max_free) | ||
53 | { | ||
54 | long pages_to_free; | ||
55 | |||
56 | pages_to_free = q->nr_pages - max_pages(min_pages); | ||
57 | |||
58 | return min(pages_to_free, max_free); | ||
59 | } | ||
60 | |||
61 | /* | ||
62 | * Trim down the number of pages in the quicklist | ||
63 | */ | ||
64 | void quicklist_trim(int nr, void (*dtor)(void *), | ||
65 | unsigned long min_pages, unsigned long max_free) | ||
66 | { | ||
67 | long pages_to_free; | ||
68 | struct quicklist *q; | ||
69 | |||
70 | q = &get_cpu_var(quicklist)[nr]; | ||
71 | if (q->nr_pages > min_pages) { | ||
72 | pages_to_free = min_pages_to_free(q, min_pages, max_free); | ||
73 | |||
74 | while (pages_to_free > 0) { | ||
75 | /* | ||
76 | * We pass a gfp_t of 0 to quicklist_alloc here | ||
77 | * because we will never call into the page allocator. | ||
78 | */ | ||
79 | void *p = quicklist_alloc(nr, 0, NULL); | ||
80 | |||
81 | if (dtor) | ||
82 | dtor(p); | ||
83 | free_page((unsigned long)p); | ||
84 | pages_to_free--; | ||
85 | } | ||
86 | } | ||
87 | put_cpu_var(quicklist); | ||
88 | } | ||
89 | |||
90 | unsigned long quicklist_total_size(void) | ||
91 | { | ||
92 | unsigned long count = 0; | ||
93 | int cpu; | ||
94 | struct quicklist *ql, *q; | ||
95 | |||
96 | for_each_online_cpu(cpu) { | ||
97 | ql = per_cpu(quicklist, cpu); | ||
98 | for (q = ql; q < ql + CONFIG_NR_QUICK; q++) | ||
99 | count += q->nr_pages; | ||
100 | } | ||
101 | return count; | ||
102 | } | ||
103 | |||
@@ -898,15 +898,13 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
898 | */ | 898 | */ |
899 | mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, | 899 | mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, |
900 | 0, vma, vma->vm_mm, address, | 900 | 0, vma, vma->vm_mm, address, |
901 | min(vma->vm_end, address + | 901 | min(vma->vm_end, address + page_size(page))); |
902 | (PAGE_SIZE << compound_order(page)))); | ||
903 | mmu_notifier_invalidate_range_start(&range); | 902 | mmu_notifier_invalidate_range_start(&range); |
904 | 903 | ||
905 | while (page_vma_mapped_walk(&pvmw)) { | 904 | while (page_vma_mapped_walk(&pvmw)) { |
906 | unsigned long cstart; | ||
907 | int ret = 0; | 905 | int ret = 0; |
908 | 906 | ||
909 | cstart = address = pvmw.address; | 907 | address = pvmw.address; |
910 | if (pvmw.pte) { | 908 | if (pvmw.pte) { |
911 | pte_t entry; | 909 | pte_t entry; |
912 | pte_t *pte = pvmw.pte; | 910 | pte_t *pte = pvmw.pte; |
@@ -933,7 +931,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
933 | entry = pmd_wrprotect(entry); | 931 | entry = pmd_wrprotect(entry); |
934 | entry = pmd_mkclean(entry); | 932 | entry = pmd_mkclean(entry); |
935 | set_pmd_at(vma->vm_mm, address, pmd, entry); | 933 | set_pmd_at(vma->vm_mm, address, pmd, entry); |
936 | cstart &= PMD_MASK; | ||
937 | ret = 1; | 934 | ret = 1; |
938 | #else | 935 | #else |
939 | /* unexpected pmd-mapped page? */ | 936 | /* unexpected pmd-mapped page? */ |
@@ -1192,8 +1189,10 @@ void page_add_file_rmap(struct page *page, bool compound) | |||
1192 | } | 1189 | } |
1193 | if (!atomic_inc_and_test(compound_mapcount_ptr(page))) | 1190 | if (!atomic_inc_and_test(compound_mapcount_ptr(page))) |
1194 | goto out; | 1191 | goto out; |
1195 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | 1192 | if (PageSwapBacked(page)) |
1196 | __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); | 1193 | __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); |
1194 | else | ||
1195 | __inc_node_page_state(page, NR_FILE_PMDMAPPED); | ||
1197 | } else { | 1196 | } else { |
1198 | if (PageTransCompound(page) && page_mapping(page)) { | 1197 | if (PageTransCompound(page) && page_mapping(page)) { |
1199 | VM_WARN_ON_ONCE(!PageLocked(page)); | 1198 | VM_WARN_ON_ONCE(!PageLocked(page)); |
@@ -1232,8 +1231,10 @@ static void page_remove_file_rmap(struct page *page, bool compound) | |||
1232 | } | 1231 | } |
1233 | if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) | 1232 | if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) |
1234 | goto out; | 1233 | goto out; |
1235 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | 1234 | if (PageSwapBacked(page)) |
1236 | __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); | 1235 | __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); |
1236 | else | ||
1237 | __dec_node_page_state(page, NR_FILE_PMDMAPPED); | ||
1237 | } else { | 1238 | } else { |
1238 | if (!atomic_add_negative(-1, &page->_mapcount)) | 1239 | if (!atomic_add_negative(-1, &page->_mapcount)) |
1239 | goto out; | 1240 | goto out; |
@@ -1374,8 +1375,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1374 | */ | 1375 | */ |
1375 | mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, | 1376 | mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, |
1376 | address, | 1377 | address, |
1377 | min(vma->vm_end, address + | 1378 | min(vma->vm_end, address + page_size(page))); |
1378 | (PAGE_SIZE << compound_order(page)))); | ||
1379 | if (PageHuge(page)) { | 1379 | if (PageHuge(page)) { |
1380 | /* | 1380 | /* |
1381 | * If sharing is possible, start and end will be adjusted | 1381 | * If sharing is possible, start and end will be adjusted |
@@ -1524,8 +1524,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1524 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { | 1524 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
1525 | pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); | 1525 | pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); |
1526 | if (PageHuge(page)) { | 1526 | if (PageHuge(page)) { |
1527 | int nr = 1 << compound_order(page); | 1527 | hugetlb_count_sub(compound_nr(page), mm); |
1528 | hugetlb_count_sub(nr, mm); | ||
1529 | set_huge_swap_pte_at(mm, address, | 1528 | set_huge_swap_pte_at(mm, address, |
1530 | pvmw.pte, pteval, | 1529 | pvmw.pte, pteval, |
1531 | vma_mmu_pagesize(vma)); | 1530 | vma_mmu_pagesize(vma)); |
diff --git a/mm/shmem.c b/mm/shmem.c index 0f7fd4a85db6..30ce722c23fa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -609,7 +609,7 @@ static int shmem_add_to_page_cache(struct page *page, | |||
609 | { | 609 | { |
610 | XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); | 610 | XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); |
611 | unsigned long i = 0; | 611 | unsigned long i = 0; |
612 | unsigned long nr = 1UL << compound_order(page); | 612 | unsigned long nr = compound_nr(page); |
613 | 613 | ||
614 | VM_BUG_ON_PAGE(PageTail(page), page); | 614 | VM_BUG_ON_PAGE(PageTail(page), page); |
615 | VM_BUG_ON_PAGE(index != round_down(index, nr), page); | 615 | VM_BUG_ON_PAGE(index != round_down(index, nr), page); |
@@ -631,7 +631,7 @@ static int shmem_add_to_page_cache(struct page *page, | |||
631 | if (xas_error(&xas)) | 631 | if (xas_error(&xas)) |
632 | goto unlock; | 632 | goto unlock; |
633 | next: | 633 | next: |
634 | xas_store(&xas, page + i); | 634 | xas_store(&xas, page); |
635 | if (++i < nr) { | 635 | if (++i < nr) { |
636 | xas_next(&xas); | 636 | xas_next(&xas); |
637 | goto next; | 637 | goto next; |
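The shmem_add_to_page_cache() change above (mirrored later in add_to_swap_cache()) stores the compound head at every index the page covers instead of page + i; readers at any sub-index resolve to the same head. A user-space sketch with a plain pointer array standing in for the XArray slots, modelling only the indexing convention, not XArray internals:

#include <stdio.h>

struct sketch_page { int id; };

int main(void)
{
	struct sketch_page head = { 42 };	/* compound head page */
	struct sketch_page *slots[8];		/* stand-in for 8 page cache indices */
	unsigned long i, nr = 8;		/* compound_nr(page) */

	/* new convention: every slot in the range points at the head */
	for (i = 0; i < nr; i++)
		slots[i] = &head;

	/* a lookup at any sub-index still reaches the same head page */
	printf("slot 0 -> %d, slot 5 -> %d\n", slots[0]->id, slots[5]->id);
	return 0;
}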
@@ -1734,7 +1734,7 @@ unlock: | |||
1734 | * vm. If we swap it in we mark it dirty since we also free the swap | 1734 | * vm. If we swap it in we mark it dirty since we also free the swap |
1735 | * entry since a page cannot live in both the swap and page cache. | 1735 | * entry since a page cannot live in both the swap and page cache. |
1736 | * | 1736 | * |
1737 | * fault_mm and fault_type are only supplied by shmem_fault: | 1737 | * vmf and fault_type are only supplied by shmem_fault: |
1738 | * otherwise they are NULL. | 1738 | * otherwise they are NULL. |
1739 | */ | 1739 | */ |
1740 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | 1740 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
@@ -1884,7 +1884,7 @@ alloc_nohuge: | |||
1884 | lru_cache_add_anon(page); | 1884 | lru_cache_add_anon(page); |
1885 | 1885 | ||
1886 | spin_lock_irq(&info->lock); | 1886 | spin_lock_irq(&info->lock); |
1887 | info->alloced += 1 << compound_order(page); | 1887 | info->alloced += compound_nr(page); |
1888 | inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); | 1888 | inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); |
1889 | shmem_recalc_inode(inode); | 1889 | shmem_recalc_inode(inode); |
1890 | spin_unlock_irq(&info->lock); | 1890 | spin_unlock_irq(&info->lock); |
@@ -1925,7 +1925,7 @@ clear: | |||
1925 | struct page *head = compound_head(page); | 1925 | struct page *head = compound_head(page); |
1926 | int i; | 1926 | int i; |
1927 | 1927 | ||
1928 | for (i = 0; i < (1 << compound_order(head)); i++) { | 1928 | for (i = 0; i < compound_nr(head); i++) { |
1929 | clear_highpage(head + i); | 1929 | clear_highpage(head + i); |
1930 | flush_dcache_page(head + i); | 1930 | flush_dcache_page(head + i); |
1931 | } | 1931 | } |
@@ -1952,7 +1952,7 @@ clear: | |||
1952 | * Error recovery. | 1952 | * Error recovery. |
1953 | */ | 1953 | */ |
1954 | unacct: | 1954 | unacct: |
1955 | shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); | 1955 | shmem_inode_unacct_blocks(inode, compound_nr(page)); |
1956 | 1956 | ||
1957 | if (PageTransHuge(page)) { | 1957 | if (PageTransHuge(page)) { |
1958 | unlock_page(page); | 1958 | unlock_page(page); |
@@ -30,6 +30,69 @@ struct kmem_cache { | |||
30 | struct list_head list; /* List of all slab caches on the system */ | 30 | struct list_head list; /* List of all slab caches on the system */ |
31 | }; | 31 | }; |
32 | 32 | ||
33 | #else /* !CONFIG_SLOB */ | ||
34 | |||
35 | struct memcg_cache_array { | ||
36 | struct rcu_head rcu; | ||
37 | struct kmem_cache *entries[0]; | ||
38 | }; | ||
39 | |||
40 | /* | ||
41 | * This is the main placeholder for memcg-related information in kmem caches. | ||
42 | * Both the root cache and the child caches will have it. For the root cache, | ||
43 | * this will hold a dynamically allocated array large enough to hold | ||
44 | * information about the currently limited memcgs in the system. To allow the | ||
45 | * array to be accessed without taking any locks, on relocation we free the old | ||
46 | * version only after a grace period. | ||
47 | * | ||
48 | * Root and child caches hold different metadata. | ||
49 | * | ||
50 | * @root_cache: Common to root and child caches. NULL for root, pointer to | ||
51 | * the root cache for children. | ||
52 | * | ||
53 | * The following fields are specific to root caches. | ||
54 | * | ||
55 | * @memcg_caches: kmemcg ID indexed table of child caches. This table is | ||
56 | * used to index child caches during allocation and cleared | ||
57 | * early during shutdown. | ||
58 | * | ||
59 | * @root_caches_node: List node for slab_root_caches list. | ||
60 | * | ||
61 | * @children: List of all child caches. While the child caches are also | ||
62 | * reachable through @memcg_caches, a child cache remains on | ||
63 | * this list until it is actually destroyed. | ||
64 | * | ||
65 | * The following fields are specific to child caches. | ||
66 | * | ||
67 | * @memcg: Pointer to the memcg this cache belongs to. | ||
68 | * | ||
69 | * @children_node: List node for @root_cache->children list. | ||
70 | * | ||
71 | * @kmem_caches_node: List node for @memcg->kmem_caches list. | ||
72 | */ | ||
73 | struct memcg_cache_params { | ||
74 | struct kmem_cache *root_cache; | ||
75 | union { | ||
76 | struct { | ||
77 | struct memcg_cache_array __rcu *memcg_caches; | ||
78 | struct list_head __root_caches_node; | ||
79 | struct list_head children; | ||
80 | bool dying; | ||
81 | }; | ||
82 | struct { | ||
83 | struct mem_cgroup *memcg; | ||
84 | struct list_head children_node; | ||
85 | struct list_head kmem_caches_node; | ||
86 | struct percpu_ref refcnt; | ||
87 | |||
88 | void (*work_fn)(struct kmem_cache *); | ||
89 | union { | ||
90 | struct rcu_head rcu_head; | ||
91 | struct work_struct work; | ||
92 | }; | ||
93 | }; | ||
94 | }; | ||
95 | }; | ||
33 | #endif /* CONFIG_SLOB */ | 96 | #endif /* CONFIG_SLOB */ |
34 | 97 | ||
35 | #ifdef CONFIG_SLAB | 98 | #ifdef CONFIG_SLAB |
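The memcg_cache_params structure added above keeps root-cache and child-cache metadata in one union, discriminated by whether root_cache is NULL. A stripped-down user-space sketch of that layout and the discriminator; names are abbreviated and list heads, refcounts and work items are replaced by placeholders.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct sketch_cache;

struct sketch_memcg_params {
	struct sketch_cache *root_cache;	/* NULL for a root cache */
	union {
		struct {			/* root-only fields */
			void *memcg_caches;	/* table of child caches */
			bool dying;
		};
		struct {			/* child-only fields */
			void *memcg;		/* owning memory cgroup */
		};
	};
};

struct sketch_cache {
	const char *name;
	struct sketch_memcg_params params;
};

static bool is_root_cache(const struct sketch_cache *s)
{
	return s->params.root_cache == NULL;
}

int main(void)
{
	struct sketch_cache root = { "root", { .root_cache = NULL } };
	struct sketch_cache child = { "child", { .root_cache = &root } };

	printf("%s is root? %d, %s is root? %d\n",
	       root.name, is_root_cache(&root),
	       child.name, is_root_cache(&child));
	return 0;
}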
@@ -174,6 +237,7 @@ int __kmem_cache_shrink(struct kmem_cache *); | |||
174 | void __kmemcg_cache_deactivate(struct kmem_cache *s); | 237 | void __kmemcg_cache_deactivate(struct kmem_cache *s); |
175 | void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s); | 238 | void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s); |
176 | void slab_kmem_cache_release(struct kmem_cache *); | 239 | void slab_kmem_cache_release(struct kmem_cache *); |
240 | void kmem_cache_shrink_all(struct kmem_cache *s); | ||
177 | 241 | ||
178 | struct seq_file; | 242 | struct seq_file; |
179 | struct file; | 243 | struct file; |
diff --git a/mm/slab_common.c b/mm/slab_common.c index 807490fe217a..6491c3a41805 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -981,6 +981,43 @@ int kmem_cache_shrink(struct kmem_cache *cachep) | |||
981 | } | 981 | } |
982 | EXPORT_SYMBOL(kmem_cache_shrink); | 982 | EXPORT_SYMBOL(kmem_cache_shrink); |
983 | 983 | ||
984 | /** | ||
985 | * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache | ||
986 | * @s: The cache pointer | ||
987 | */ | ||
988 | void kmem_cache_shrink_all(struct kmem_cache *s) | ||
989 | { | ||
990 | struct kmem_cache *c; | ||
991 | |||
992 | if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) { | ||
993 | kmem_cache_shrink(s); | ||
994 | return; | ||
995 | } | ||
996 | |||
997 | get_online_cpus(); | ||
998 | get_online_mems(); | ||
999 | kasan_cache_shrink(s); | ||
1000 | __kmem_cache_shrink(s); | ||
1001 | |||
1002 | /* | ||
1003 | * We have to take the slab_mutex to protect from the memcg list | ||
1004 | * modification. | ||
1005 | */ | ||
1006 | mutex_lock(&slab_mutex); | ||
1007 | for_each_memcg_cache(c, s) { | ||
1008 | /* | ||
1009 | * Don't need to shrink deactivated memcg caches. | ||
1010 | */ | ||
1011 | if (s->flags & SLAB_DEACTIVATED) | ||
1012 | continue; | ||
1013 | kasan_cache_shrink(c); | ||
1014 | __kmem_cache_shrink(c); | ||
1015 | } | ||
1016 | mutex_unlock(&slab_mutex); | ||
1017 | put_online_mems(); | ||
1018 | put_online_cpus(); | ||
1019 | } | ||
1020 | |||
984 | bool slab_is_available(void) | 1021 | bool slab_is_available(void) |
985 | { | 1022 | { |
986 | return slab_state >= UP; | 1023 | return slab_state >= UP; |
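kmem_cache_shrink_all() above shrinks the root cache and then walks its memcg child caches under slab_mutex, skipping caches that are already deactivated. A simplified user-space sketch of that walk-and-skip pattern; the array, flag and shrink helper stand in for the kernel's list iteration, SLAB_DEACTIVATED flag and locking.

#include <stdio.h>

#define SK_DEACTIVATED 0x1u

struct sketch_cache {
	const char *name;
	unsigned int flags;
};

static void sketch_shrink(const struct sketch_cache *c)
{
	printf("shrinking %s\n", c->name);
}

static void shrink_all(const struct sketch_cache *root,
		       const struct sketch_cache *children, int nr_children)
{
	int i;

	sketch_shrink(root);			/* root cache first */
	/* in the kernel this loop runs with slab_mutex held */
	for (i = 0; i < nr_children; i++) {
		if (children[i].flags & SK_DEACTIVATED)
			continue;		/* already being torn down */
		sketch_shrink(&children[i]);
	}
}

int main(void)
{
	struct sketch_cache root = { "dentry", 0 };
	struct sketch_cache kids[] = {
		{ "dentry (memcg A)", 0 },
		{ "dentry (memcg B)", SK_DEACTIVATED },
	};

	shrink_all(&root, kids, 2);
	return 0;
}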
@@ -539,7 +539,7 @@ size_t __ksize(const void *block) | |||
539 | 539 | ||
540 | sp = virt_to_page(block); | 540 | sp = virt_to_page(block); |
541 | if (unlikely(!PageSlab(sp))) | 541 | if (unlikely(!PageSlab(sp))) |
542 | return PAGE_SIZE << compound_order(sp); | 542 | return page_size(sp); |
543 | 543 | ||
544 | align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 544 | align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
545 | m = (unsigned int *)(block - align); | 545 | m = (unsigned int *)(block - align); |
@@ -829,7 +829,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) | |||
829 | return 1; | 829 | return 1; |
830 | 830 | ||
831 | start = page_address(page); | 831 | start = page_address(page); |
832 | length = PAGE_SIZE << compound_order(page); | 832 | length = page_size(page); |
833 | end = start + length; | 833 | end = start + length; |
834 | remainder = length % s->size; | 834 | remainder = length % s->size; |
835 | if (!remainder) | 835 | if (!remainder) |
@@ -1074,13 +1074,14 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, | |||
1074 | init_tracking(s, object); | 1074 | init_tracking(s, object); |
1075 | } | 1075 | } |
1076 | 1076 | ||
1077 | static void setup_page_debug(struct kmem_cache *s, void *addr, int order) | 1077 | static |
1078 | void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) | ||
1078 | { | 1079 | { |
1079 | if (!(s->flags & SLAB_POISON)) | 1080 | if (!(s->flags & SLAB_POISON)) |
1080 | return; | 1081 | return; |
1081 | 1082 | ||
1082 | metadata_access_enable(); | 1083 | metadata_access_enable(); |
1083 | memset(addr, POISON_INUSE, PAGE_SIZE << order); | 1084 | memset(addr, POISON_INUSE, page_size(page)); |
1084 | metadata_access_disable(); | 1085 | metadata_access_disable(); |
1085 | } | 1086 | } |
1086 | 1087 | ||
@@ -1340,8 +1341,8 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, | |||
1340 | #else /* !CONFIG_SLUB_DEBUG */ | 1341 | #else /* !CONFIG_SLUB_DEBUG */ |
1341 | static inline void setup_object_debug(struct kmem_cache *s, | 1342 | static inline void setup_object_debug(struct kmem_cache *s, |
1342 | struct page *page, void *object) {} | 1343 | struct page *page, void *object) {} |
1343 | static inline void setup_page_debug(struct kmem_cache *s, | 1344 | static inline |
1344 | void *addr, int order) {} | 1345 | void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {} |
1345 | 1346 | ||
1346 | static inline int alloc_debug_processing(struct kmem_cache *s, | 1347 | static inline int alloc_debug_processing(struct kmem_cache *s, |
1347 | struct page *page, void *object, unsigned long addr) { return 0; } | 1348 | struct page *page, void *object, unsigned long addr) { return 0; } |
@@ -1639,7 +1640,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1639 | struct kmem_cache_order_objects oo = s->oo; | 1640 | struct kmem_cache_order_objects oo = s->oo; |
1640 | gfp_t alloc_gfp; | 1641 | gfp_t alloc_gfp; |
1641 | void *start, *p, *next; | 1642 | void *start, *p, *next; |
1642 | int idx, order; | 1643 | int idx; |
1643 | bool shuffle; | 1644 | bool shuffle; |
1644 | 1645 | ||
1645 | flags &= gfp_allowed_mask; | 1646 | flags &= gfp_allowed_mask; |
@@ -1673,7 +1674,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1673 | 1674 | ||
1674 | page->objects = oo_objects(oo); | 1675 | page->objects = oo_objects(oo); |
1675 | 1676 | ||
1676 | order = compound_order(page); | ||
1677 | page->slab_cache = s; | 1677 | page->slab_cache = s; |
1678 | __SetPageSlab(page); | 1678 | __SetPageSlab(page); |
1679 | if (page_is_pfmemalloc(page)) | 1679 | if (page_is_pfmemalloc(page)) |
@@ -1683,7 +1683,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1683 | 1683 | ||
1684 | start = page_address(page); | 1684 | start = page_address(page); |
1685 | 1685 | ||
1686 | setup_page_debug(s, start, order); | 1686 | setup_page_debug(s, page, start); |
1687 | 1687 | ||
1688 | shuffle = shuffle_freelist(s, page); | 1688 | shuffle = shuffle_freelist(s, page); |
1689 | 1689 | ||
@@ -2004,6 +2004,7 @@ static inline unsigned long next_tid(unsigned long tid) | |||
2004 | return tid + TID_STEP; | 2004 | return tid + TID_STEP; |
2005 | } | 2005 | } |
2006 | 2006 | ||
2007 | #ifdef SLUB_DEBUG_CMPXCHG | ||
2007 | static inline unsigned int tid_to_cpu(unsigned long tid) | 2008 | static inline unsigned int tid_to_cpu(unsigned long tid) |
2008 | { | 2009 | { |
2009 | return tid % TID_STEP; | 2010 | return tid % TID_STEP; |
@@ -2013,6 +2014,7 @@ static inline unsigned long tid_to_event(unsigned long tid) | |||
2013 | { | 2014 | { |
2014 | return tid / TID_STEP; | 2015 | return tid / TID_STEP; |
2015 | } | 2016 | } |
2017 | #endif | ||
2016 | 2018 | ||
2017 | static inline unsigned int init_tid(int cpu) | 2019 | static inline unsigned int init_tid(int cpu) |
2018 | { | 2020 | { |
@@ -3930,7 +3932,7 @@ size_t __ksize(const void *object) | |||
3930 | 3932 | ||
3931 | if (unlikely(!PageSlab(page))) { | 3933 | if (unlikely(!PageSlab(page))) { |
3932 | WARN_ON(!PageCompound(page)); | 3934 | WARN_ON(!PageCompound(page)); |
3933 | return PAGE_SIZE << compound_order(page); | 3935 | return page_size(page); |
3934 | } | 3936 | } |
3935 | 3937 | ||
3936 | return slab_ksize(page->slab_cache); | 3938 | return slab_ksize(page->slab_cache); |
@@ -5298,7 +5300,7 @@ static ssize_t shrink_store(struct kmem_cache *s, | |||
5298 | const char *buf, size_t length) | 5300 | const char *buf, size_t length) |
5299 | { | 5301 | { |
5300 | if (buf[0] == '1') | 5302 | if (buf[0] == '1') |
5301 | kmem_cache_shrink(s); | 5303 | kmem_cache_shrink_all(s); |
5302 | else | 5304 | else |
5303 | return -EINVAL; | 5305 | return -EINVAL; |
5304 | return length; | 5306 | return length; |
diff --git a/mm/sparse.c b/mm/sparse.c index 72f010d9bff5..bf32de9e666b 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -11,6 +11,8 @@ | |||
11 | #include <linux/export.h> | 11 | #include <linux/export.h> |
12 | #include <linux/spinlock.h> | 12 | #include <linux/spinlock.h> |
13 | #include <linux/vmalloc.h> | 13 | #include <linux/vmalloc.h> |
14 | #include <linux/swap.h> | ||
15 | #include <linux/swapops.h> | ||
14 | 16 | ||
15 | #include "internal.h" | 17 | #include "internal.h" |
16 | #include <asm/dma.h> | 18 | #include <asm/dma.h> |
@@ -470,6 +472,12 @@ struct page __init *__populate_section_memmap(unsigned long pfn, | |||
470 | static void *sparsemap_buf __meminitdata; | 472 | static void *sparsemap_buf __meminitdata; |
471 | static void *sparsemap_buf_end __meminitdata; | 473 | static void *sparsemap_buf_end __meminitdata; |
472 | 474 | ||
475 | static inline void __meminit sparse_buffer_free(unsigned long size) | ||
476 | { | ||
477 | WARN_ON(!sparsemap_buf || size == 0); | ||
478 | memblock_free_early(__pa(sparsemap_buf), size); | ||
479 | } | ||
480 | |||
473 | static void __init sparse_buffer_init(unsigned long size, int nid) | 481 | static void __init sparse_buffer_init(unsigned long size, int nid) |
474 | { | 482 | { |
475 | phys_addr_t addr = __pa(MAX_DMA_ADDRESS); | 483 | phys_addr_t addr = __pa(MAX_DMA_ADDRESS); |
@@ -486,7 +494,7 @@ static void __init sparse_buffer_fini(void) | |||
486 | unsigned long size = sparsemap_buf_end - sparsemap_buf; | 494 | unsigned long size = sparsemap_buf_end - sparsemap_buf; |
487 | 495 | ||
488 | if (sparsemap_buf && size > 0) | 496 | if (sparsemap_buf && size > 0) |
489 | memblock_free_early(__pa(sparsemap_buf), size); | 497 | sparse_buffer_free(size); |
490 | sparsemap_buf = NULL; | 498 | sparsemap_buf = NULL; |
491 | } | 499 | } |
492 | 500 | ||
@@ -495,11 +503,15 @@ void * __meminit sparse_buffer_alloc(unsigned long size) | |||
495 | void *ptr = NULL; | 503 | void *ptr = NULL; |
496 | 504 | ||
497 | if (sparsemap_buf) { | 505 | if (sparsemap_buf) { |
498 | ptr = PTR_ALIGN(sparsemap_buf, size); | 506 | ptr = (void *) roundup((unsigned long)sparsemap_buf, size); |
499 | if (ptr + size > sparsemap_buf_end) | 507 | if (ptr + size > sparsemap_buf_end) |
500 | ptr = NULL; | 508 | ptr = NULL; |
501 | else | 509 | else { |
510 | /* Free redundant aligned space */ | ||
511 | if ((unsigned long)(ptr - sparsemap_buf) > 0) | ||
512 | sparse_buffer_free((unsigned long)(ptr - sparsemap_buf)); | ||
502 | sparsemap_buf = ptr + size; | 513 | sparsemap_buf = ptr + size; |
514 | } | ||
503 | } | 515 | } |
504 | return ptr; | 516 | return ptr; |
505 | } | 517 | } |
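sparse_buffer_alloc() above now rounds the buffer cursor up with roundup(), which works for request sizes that are not powers of two, and hands the leading alignment slack back through sparse_buffer_free() instead of leaking it. A user-space sketch of that cursor arithmetic; the buffer address and section size are made-up values.

#include <stdio.h>

/* roundup(x, y): smallest multiple of y that is >= x (y need not be a power of two) */
static unsigned long sketch_roundup(unsigned long x, unsigned long y)
{
	return ((x + y - 1) / y) * y;
}

int main(void)
{
	unsigned long buf = 0x10010;	/* current sparsemap_buf cursor */
	unsigned long size = 0x6000;	/* per-section memmap size, not a power of two */
	unsigned long ptr = sketch_roundup(buf, size);
	unsigned long slack = ptr - buf;

	/* the leading slack would be returned to memblock rather than lost */
	printf("alloc at %#lx, freed slack %#lx, next cursor %#lx\n",
	       ptr, slack, ptr + size);
	return 0;
}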
@@ -867,7 +879,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn, | |||
867 | */ | 879 | */ |
868 | page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); | 880 | page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); |
869 | 881 | ||
870 | ms = __pfn_to_section(start_pfn); | 882 | ms = __nr_to_section(section_nr); |
871 | set_section_nid(section_nr, nid); | 883 | set_section_nid(section_nr, nid); |
872 | section_mark_present(ms); | 884 | section_mark_present(ms); |
873 | 885 | ||
@@ -884,9 +896,6 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | |||
884 | { | 896 | { |
885 | int i; | 897 | int i; |
886 | 898 | ||
887 | if (!memmap) | ||
888 | return; | ||
889 | |||
890 | /* | 899 | /* |
891 | * A further optimization is to have per section refcounted | 900 | * A further optimization is to have per section refcounted |
892 | * num_poisoned_pages. But that would need more space per memmap, so | 901 | * num_poisoned_pages. But that would need more space per memmap, so |
@@ -898,7 +907,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | |||
898 | 907 | ||
899 | for (i = 0; i < nr_pages; i++) { | 908 | for (i = 0; i < nr_pages; i++) { |
900 | if (PageHWPoison(&memmap[i])) { | 909 | if (PageHWPoison(&memmap[i])) { |
901 | atomic_long_sub(1, &num_poisoned_pages); | 910 | num_poisoned_pages_dec(); |
902 | ClearPageHWPoison(&memmap[i]); | 911 | ClearPageHWPoison(&memmap[i]); |
903 | } | 912 | } |
904 | } | 913 | } |
@@ -71,12 +71,12 @@ static void __page_cache_release(struct page *page) | |||
71 | spin_unlock_irqrestore(&pgdat->lru_lock, flags); | 71 | spin_unlock_irqrestore(&pgdat->lru_lock, flags); |
72 | } | 72 | } |
73 | __ClearPageWaiters(page); | 73 | __ClearPageWaiters(page); |
74 | mem_cgroup_uncharge(page); | ||
75 | } | 74 | } |
76 | 75 | ||
77 | static void __put_single_page(struct page *page) | 76 | static void __put_single_page(struct page *page) |
78 | { | 77 | { |
79 | __page_cache_release(page); | 78 | __page_cache_release(page); |
79 | mem_cgroup_uncharge(page); | ||
80 | free_unref_page(page); | 80 | free_unref_page(page); |
81 | } | 81 | } |
82 | 82 | ||
@@ -515,7 +515,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, | |||
515 | del_page_from_lru_list(page, lruvec, lru + active); | 515 | del_page_from_lru_list(page, lruvec, lru + active); |
516 | ClearPageActive(page); | 516 | ClearPageActive(page); |
517 | ClearPageReferenced(page); | 517 | ClearPageReferenced(page); |
518 | add_page_to_lru_list(page, lruvec, lru); | ||
519 | 518 | ||
520 | if (PageWriteback(page) || PageDirty(page)) { | 519 | if (PageWriteback(page) || PageDirty(page)) { |
521 | /* | 520 | /* |
@@ -523,13 +522,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, | |||
523 | * It can make readahead confusing. But race window | 522 | * It can make readahead confusing. But race window |
524 | * is _really_ small and it's non-critical problem. | 523 | * is _really_ small and it's non-critical problem. |
525 | */ | 524 | */ |
525 | add_page_to_lru_list(page, lruvec, lru); | ||
526 | SetPageReclaim(page); | 526 | SetPageReclaim(page); |
527 | } else { | 527 | } else { |
528 | /* | 528 | /* |
529 | * The page's writeback ended while it was on the pagevec, | 529 | * The page's writeback ended while it was on the pagevec, |
530 | * so we move the page to the tail of the inactive list. | 530 | * so we move the page to the tail of the inactive list. |
531 | */ | 531 | */ |
532 | list_move_tail(&page->lru, &lruvec->lists[lru]); | 532 | add_page_to_lru_list_tail(page, lruvec, lru); |
533 | __count_vm_event(PGROTATED); | 533 | __count_vm_event(PGROTATED); |
534 | } | 534 | } |
535 | 535 | ||
@@ -844,17 +844,15 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, | |||
844 | get_page(page_tail); | 844 | get_page(page_tail); |
845 | list_add_tail(&page_tail->lru, list); | 845 | list_add_tail(&page_tail->lru, list); |
846 | } else { | 846 | } else { |
847 | struct list_head *list_head; | ||
848 | /* | 847 | /* |
849 | * Head page has not yet been counted, as an hpage, | 848 | * Head page has not yet been counted, as an hpage, |
850 | * so we must account for each subpage individually. | 849 | * so we must account for each subpage individually. |
851 | * | 850 | * |
852 | * Use the standard add function to put page_tail on the list, | 851 | * Put page_tail on the list at the correct position |
853 | * but then correct its position so they all end up in order. | 852 | * so they all end up in order. |
854 | */ | 853 | */ |
855 | add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail)); | 854 | add_page_to_lru_list_tail(page_tail, lruvec, |
856 | list_head = page_tail->lru.prev; | 855 | page_lru(page_tail)); |
857 | list_move_tail(&page_tail->lru, list_head); | ||
858 | } | 856 | } |
859 | 857 | ||
860 | if (!PageUnevictable(page)) | 858 | if (!PageUnevictable(page)) |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 8368621a0fc7..8e7ce9a9bc5e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -116,7 +116,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) | |||
116 | struct address_space *address_space = swap_address_space(entry); | 116 | struct address_space *address_space = swap_address_space(entry); |
117 | pgoff_t idx = swp_offset(entry); | 117 | pgoff_t idx = swp_offset(entry); |
118 | XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); | 118 | XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); |
119 | unsigned long i, nr = 1UL << compound_order(page); | 119 | unsigned long i, nr = compound_nr(page); |
120 | 120 | ||
121 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 121 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
122 | VM_BUG_ON_PAGE(PageSwapCache(page), page); | 122 | VM_BUG_ON_PAGE(PageSwapCache(page), page); |
@@ -133,7 +133,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) | |||
133 | for (i = 0; i < nr; i++) { | 133 | for (i = 0; i < nr; i++) { |
134 | VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); | 134 | VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); |
135 | set_page_private(page + i, entry.val + i); | 135 | set_page_private(page + i, entry.val + i); |
136 | xas_store(&xas, page + i); | 136 | xas_store(&xas, page); |
137 | xas_next(&xas); | 137 | xas_next(&xas); |
138 | } | 138 | } |
139 | address_space->nrpages += nr; | 139 | address_space->nrpages += nr; |
@@ -168,7 +168,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry) | |||
168 | 168 | ||
169 | for (i = 0; i < nr; i++) { | 169 | for (i = 0; i < nr; i++) { |
170 | void *entry = xas_store(&xas, NULL); | 170 | void *entry = xas_store(&xas, NULL); |
171 | VM_BUG_ON_PAGE(entry != page + i, entry); | 171 | VM_BUG_ON_PAGE(entry != page, entry); |
172 | set_page_private(page + i, 0); | 172 | set_page_private(page + i, 0); |
173 | xas_next(&xas); | 173 | xas_next(&xas); |
174 | } | 174 | } |
@@ -16,6 +16,13 @@ | |||
16 | #include <linux/hugetlb.h> | 16 | #include <linux/hugetlb.h> |
17 | #include <linux/vmalloc.h> | 17 | #include <linux/vmalloc.h> |
18 | #include <linux/userfaultfd_k.h> | 18 | #include <linux/userfaultfd_k.h> |
19 | #include <linux/elf.h> | ||
20 | #include <linux/elf-randomize.h> | ||
21 | #include <linux/personality.h> | ||
22 | #include <linux/random.h> | ||
23 | #include <linux/processor.h> | ||
24 | #include <linux/sizes.h> | ||
25 | #include <linux/compat.h> | ||
19 | 26 | ||
20 | #include <linux/uaccess.h> | 27 | #include <linux/uaccess.h> |
21 | 28 | ||
@@ -293,7 +300,105 @@ int vma_is_stack_for_current(struct vm_area_struct *vma) | |||
293 | return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); | 300 | return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); |
294 | } | 301 | } |
295 | 302 | ||
296 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) | 303 | #ifndef STACK_RND_MASK |
304 | #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ | ||
305 | #endif | ||
306 | |||
307 | unsigned long randomize_stack_top(unsigned long stack_top) | ||
308 | { | ||
309 | unsigned long random_variable = 0; | ||
310 | |||
311 | if (current->flags & PF_RANDOMIZE) { | ||
312 | random_variable = get_random_long(); | ||
313 | random_variable &= STACK_RND_MASK; | ||
314 | random_variable <<= PAGE_SHIFT; | ||
315 | } | ||
316 | #ifdef CONFIG_STACK_GROWSUP | ||
317 | return PAGE_ALIGN(stack_top) + random_variable; | ||
318 | #else | ||
319 | return PAGE_ALIGN(stack_top) - random_variable; | ||
320 | #endif | ||
321 | } | ||
322 | |||
323 | #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT | ||
324 | unsigned long arch_randomize_brk(struct mm_struct *mm) | ||
325 | { | ||
326 | /* Is the current task 32bit ? */ | ||
327 | if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) | ||
328 | return randomize_page(mm->brk, SZ_32M); | ||
329 | |||
330 | return randomize_page(mm->brk, SZ_1G); | ||
331 | } | ||
332 | |||
333 | unsigned long arch_mmap_rnd(void) | ||
334 | { | ||
335 | unsigned long rnd; | ||
336 | |||
337 | #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS | ||
338 | if (is_compat_task()) | ||
339 | rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); | ||
340 | else | ||
341 | #endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */ | ||
342 | rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); | ||
343 | |||
344 | return rnd << PAGE_SHIFT; | ||
345 | } | ||
346 | |||
347 | static int mmap_is_legacy(struct rlimit *rlim_stack) | ||
348 | { | ||
349 | if (current->personality & ADDR_COMPAT_LAYOUT) | ||
350 | return 1; | ||
351 | |||
352 | if (rlim_stack->rlim_cur == RLIM_INFINITY) | ||
353 | return 1; | ||
354 | |||
355 | return sysctl_legacy_va_layout; | ||
356 | } | ||
357 | |||
358 | /* | ||
359 | * Leave enough space between the mmap area and the stack to honour ulimit in | ||
360 | * the face of randomisation. | ||
361 | */ | ||
362 | #define MIN_GAP (SZ_128M) | ||
363 | #define MAX_GAP (STACK_TOP / 6 * 5) | ||
364 | |||
365 | static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) | ||
366 | { | ||
367 | unsigned long gap = rlim_stack->rlim_cur; | ||
368 | unsigned long pad = stack_guard_gap; | ||
369 | |||
370 | /* Account for stack randomization if necessary */ | ||
371 | if (current->flags & PF_RANDOMIZE) | ||
372 | pad += (STACK_RND_MASK << PAGE_SHIFT); | ||
373 | |||
374 | /* Values close to RLIM_INFINITY can overflow. */ | ||
375 | if (gap + pad > gap) | ||
376 | gap += pad; | ||
377 | |||
378 | if (gap < MIN_GAP) | ||
379 | gap = MIN_GAP; | ||
380 | else if (gap > MAX_GAP) | ||
381 | gap = MAX_GAP; | ||
382 | |||
383 | return PAGE_ALIGN(STACK_TOP - gap - rnd); | ||
384 | } | ||
385 | |||
386 | void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) | ||
387 | { | ||
388 | unsigned long random_factor = 0UL; | ||
389 | |||
390 | if (current->flags & PF_RANDOMIZE) | ||
391 | random_factor = arch_mmap_rnd(); | ||
392 | |||
393 | if (mmap_is_legacy(rlim_stack)) { | ||
394 | mm->mmap_base = TASK_UNMAPPED_BASE + random_factor; | ||
395 | mm->get_unmapped_area = arch_get_unmapped_area; | ||
396 | } else { | ||
397 | mm->mmap_base = mmap_base(random_factor, rlim_stack); | ||
398 | mm->get_unmapped_area = arch_get_unmapped_area_topdown; | ||
399 | } | ||
400 | } | ||
401 | #elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) | ||
297 | void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) | 402 | void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) |
298 | { | 403 | { |
299 | mm->mmap_base = TASK_UNMAPPED_BASE; | 404 | mm->mmap_base = TASK_UNMAPPED_BASE; |
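The generic top-down layout added above places mmap_base at STACK_TOP minus a gap built from the stack rlimit, the stack guard gap and, when PF_RANDOMIZE is set, the maximum stack randomization, clamped between MIN_GAP and MAX_GAP and then lowered by the mmap randomization value. The sketch below redoes that arithmetic in user space with made-up constants standing in for STACK_TOP, PAGE_SIZE and the limits.

#include <stdio.h>

#define SK_PAGE_SIZE	4096UL
#define SK_STACK_TOP	(1UL << 47)		/* illustrative; arch-specific in reality */
#define SK_MIN_GAP	(128UL << 20)		/* 128M */
#define SK_MAX_GAP	(SK_STACK_TOP / 6 * 5)

/* round up to a page boundary, like PAGE_ALIGN() */
static unsigned long page_align_up(unsigned long x)
{
	return (x + SK_PAGE_SIZE - 1) & ~(SK_PAGE_SIZE - 1);
}

static unsigned long sketch_mmap_base(unsigned long rlim_stack,
				      unsigned long guard_gap,
				      unsigned long stack_rnd_max,
				      unsigned long mmap_rnd)
{
	unsigned long gap = rlim_stack;
	unsigned long pad = guard_gap + stack_rnd_max;

	if (gap + pad > gap)		/* avoid overflow near RLIM_INFINITY */
		gap += pad;
	if (gap < SK_MIN_GAP)
		gap = SK_MIN_GAP;
	else if (gap > SK_MAX_GAP)
		gap = SK_MAX_GAP;

	return page_align_up(SK_STACK_TOP - gap - mmap_rnd);
}

int main(void)
{
	/* 8M stack limit, 1M guard gap, 8M max stack ASLR, ~1G of mmap ASLR */
	printf("mmap_base = %#lx\n",
	       sketch_mmap_base(8UL << 20, 1UL << 20, 8UL << 20, 0x3f000000UL));
	return 0;
}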
@@ -521,7 +626,7 @@ bool page_mapped(struct page *page) | |||
521 | return true; | 626 | return true; |
522 | if (PageHuge(page)) | 627 | if (PageHuge(page)) |
523 | return false; | 628 | return false; |
524 | for (i = 0; i < (1 << compound_order(page)); i++) { | 629 | for (i = 0; i < compound_nr(page); i++) { |
525 | if (atomic_read(&page[i]._mapcount) >= 0) | 630 | if (atomic_read(&page[i]._mapcount) >= 0) |
526 | return true; | 631 | return true; |
527 | } | 632 | } |
@@ -783,3 +888,16 @@ out_mm: | |||
783 | out: | 888 | out: |
784 | return res; | 889 | return res; |
785 | } | 890 | } |
891 | |||
892 | int memcmp_pages(struct page *page1, struct page *page2) | ||
893 | { | ||
894 | char *addr1, *addr2; | ||
895 | int ret; | ||
896 | |||
897 | addr1 = kmap_atomic(page1); | ||
898 | addr2 = kmap_atomic(page2); | ||
899 | ret = memcmp(addr1, addr2, PAGE_SIZE); | ||
900 | kunmap_atomic(addr2); | ||
901 | kunmap_atomic(addr1); | ||
902 | return ret; | ||
903 | } | ||
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c1246d77cf75..fcadd3e25c0c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -329,8 +329,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); | |||
329 | #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 | 329 | #define DEBUG_AUGMENT_PROPAGATE_CHECK 0 |
330 | #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 | 330 | #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 |
331 | 331 | ||
332 | #define VM_LAZY_FREE 0x02 | ||
333 | #define VM_VM_AREA 0x04 | ||
334 | 332 | ||
335 | static DEFINE_SPINLOCK(vmap_area_lock); | 333 | static DEFINE_SPINLOCK(vmap_area_lock); |
336 | /* Export for kexec only */ | 334 | /* Export for kexec only */ |
@@ -1116,7 +1114,7 @@ retry: | |||
1116 | 1114 | ||
1117 | va->va_start = addr; | 1115 | va->va_start = addr; |
1118 | va->va_end = addr + size; | 1116 | va->va_end = addr + size; |
1119 | va->flags = 0; | 1117 | va->vm = NULL; |
1120 | insert_vmap_area(va, &vmap_area_root, &vmap_area_list); | 1118 | insert_vmap_area(va, &vmap_area_root, &vmap_area_list); |
1121 | 1119 | ||
1122 | spin_unlock(&vmap_area_lock); | 1120 | spin_unlock(&vmap_area_lock); |
@@ -1282,7 +1280,14 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) | |||
1282 | llist_for_each_entry_safe(va, n_va, valist, purge_list) { | 1280 | llist_for_each_entry_safe(va, n_va, valist, purge_list) { |
1283 | unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; | 1281 | unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; |
1284 | 1282 | ||
1285 | __free_vmap_area(va); | 1283 | /* |
1284 | * Finally insert or merge lazily-freed area. It is | ||
1285 | * detached and there is no need to "unlink" it from | ||
1286 | * anything. | ||
1287 | */ | ||
1288 | merge_or_add_vmap_area(va, | ||
1289 | &free_vmap_area_root, &free_vmap_area_list); | ||
1290 | |||
1286 | atomic_long_sub(nr, &vmap_lazy_nr); | 1291 | atomic_long_sub(nr, &vmap_lazy_nr); |
1287 | 1292 | ||
1288 | if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) | 1293 | if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) |
@@ -1324,6 +1329,10 @@ static void free_vmap_area_noflush(struct vmap_area *va) | |||
1324 | { | 1329 | { |
1325 | unsigned long nr_lazy; | 1330 | unsigned long nr_lazy; |
1326 | 1331 | ||
1332 | spin_lock(&vmap_area_lock); | ||
1333 | unlink_va(va, &vmap_area_root); | ||
1334 | spin_unlock(&vmap_area_lock); | ||
1335 | |||
1327 | nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> | 1336 | nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> |
1328 | PAGE_SHIFT, &vmap_lazy_nr); | 1337 | PAGE_SHIFT, &vmap_lazy_nr); |
1329 | 1338 | ||
@@ -1918,7 +1927,6 @@ void __init vmalloc_init(void) | |||
1918 | if (WARN_ON_ONCE(!va)) | 1927 | if (WARN_ON_ONCE(!va)) |
1919 | continue; | 1928 | continue; |
1920 | 1929 | ||
1921 | va->flags = VM_VM_AREA; | ||
1922 | va->va_start = (unsigned long)tmp->addr; | 1930 | va->va_start = (unsigned long)tmp->addr; |
1923 | va->va_end = va->va_start + tmp->size; | 1931 | va->va_end = va->va_start + tmp->size; |
1924 | va->vm = tmp; | 1932 | va->vm = tmp; |
@@ -2016,7 +2024,6 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | |||
2016 | vm->size = va->va_end - va->va_start; | 2024 | vm->size = va->va_end - va->va_start; |
2017 | vm->caller = caller; | 2025 | vm->caller = caller; |
2018 | va->vm = vm; | 2026 | va->vm = vm; |
2019 | va->flags |= VM_VM_AREA; | ||
2020 | spin_unlock(&vmap_area_lock); | 2027 | spin_unlock(&vmap_area_lock); |
2021 | } | 2028 | } |
2022 | 2029 | ||
@@ -2121,10 +2128,10 @@ struct vm_struct *find_vm_area(const void *addr) | |||
2121 | struct vmap_area *va; | 2128 | struct vmap_area *va; |
2122 | 2129 | ||
2123 | va = find_vmap_area((unsigned long)addr); | 2130 | va = find_vmap_area((unsigned long)addr); |
2124 | if (va && va->flags & VM_VM_AREA) | 2131 | if (!va) |
2125 | return va->vm; | 2132 | return NULL; |
2126 | 2133 | ||
2127 | return NULL; | 2134 | return va->vm; |
2128 | } | 2135 | } |
2129 | 2136 | ||
2130 | /** | 2137 | /** |
@@ -2143,14 +2150,12 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
2143 | 2150 | ||
2144 | might_sleep(); | 2151 | might_sleep(); |
2145 | 2152 | ||
2146 | va = find_vmap_area((unsigned long)addr); | 2153 | spin_lock(&vmap_area_lock); |
2147 | if (va && va->flags & VM_VM_AREA) { | 2154 | va = __find_vmap_area((unsigned long)addr); |
2155 | if (va && va->vm) { | ||
2148 | struct vm_struct *vm = va->vm; | 2156 | struct vm_struct *vm = va->vm; |
2149 | 2157 | ||
2150 | spin_lock(&vmap_area_lock); | ||
2151 | va->vm = NULL; | 2158 | va->vm = NULL; |
2152 | va->flags &= ~VM_VM_AREA; | ||
2153 | va->flags |= VM_LAZY_FREE; | ||
2154 | spin_unlock(&vmap_area_lock); | 2159 | spin_unlock(&vmap_area_lock); |
2155 | 2160 | ||
2156 | kasan_free_shadow(vm); | 2161 | kasan_free_shadow(vm); |
@@ -2158,6 +2163,8 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
2158 | 2163 | ||
2159 | return vm; | 2164 | return vm; |
2160 | } | 2165 | } |
2166 | |||
2167 | spin_unlock(&vmap_area_lock); | ||
2161 | return NULL; | 2168 | return NULL; |
2162 | } | 2169 | } |
2163 | 2170 | ||
@@ -2402,7 +2409,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
2402 | nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; | 2409 | nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; |
2403 | array_size = (nr_pages * sizeof(struct page *)); | 2410 | array_size = (nr_pages * sizeof(struct page *)); |
2404 | 2411 | ||
2405 | area->nr_pages = nr_pages; | ||
2406 | /* Please note that the recursion is strictly bounded. */ | 2412 | /* Please note that the recursion is strictly bounded. */ |
2407 | if (array_size > PAGE_SIZE) { | 2413 | if (array_size > PAGE_SIZE) { |
2408 | pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, | 2414 | pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, |
@@ -2410,13 +2416,16 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
2410 | } else { | 2416 | } else { |
2411 | pages = kmalloc_node(array_size, nested_gfp, node); | 2417 | pages = kmalloc_node(array_size, nested_gfp, node); |
2412 | } | 2418 | } |
2413 | area->pages = pages; | 2419 | |
2414 | if (!area->pages) { | 2420 | if (!pages) { |
2415 | remove_vm_area(area->addr); | 2421 | remove_vm_area(area->addr); |
2416 | kfree(area); | 2422 | kfree(area); |
2417 | return NULL; | 2423 | return NULL; |
2418 | } | 2424 | } |
2419 | 2425 | ||
2426 | area->pages = pages; | ||
2427 | area->nr_pages = nr_pages; | ||
2428 | |||
2420 | for (i = 0; i < area->nr_pages; i++) { | 2429 | for (i = 0; i < area->nr_pages; i++) { |
2421 | struct page *page; | 2430 | struct page *page; |
2422 | 2431 | ||
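The reordering in __vmalloc_area_node() is a publish-after-success pattern: area->pages and area->nr_pages are only set once the page-pointer array exists, so the failure path (remove_vm_area() plus kfree()) never sees half-initialized state. The same idea in a generic, self-contained sketch (type and function names are made up for this note):

#include <linux/errno.h>
#include <linux/slab.h>

struct demo_page_array {
	struct page **pages;
	unsigned int nr_pages;
};

static int demo_page_array_alloc(struct demo_page_array *a, unsigned int nr,
				 gfp_t gfp)
{
	struct page **pages = kcalloc(nr, sizeof(*pages), gfp);

	if (!pages)
		return -ENOMEM;		/* a->pages / a->nr_pages stay untouched */

	/* publish only after every allocation has succeeded */
	a->pages = pages;
	a->nr_pages = nr;
	return 0;
}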
@@ -2851,7 +2860,7 @@ long vread(char *buf, char *addr, unsigned long count) | |||
2851 | if (!count) | 2860 | if (!count) |
2852 | break; | 2861 | break; |
2853 | 2862 | ||
2854 | if (!(va->flags & VM_VM_AREA)) | 2863 | if (!va->vm) |
2855 | continue; | 2864 | continue; |
2856 | 2865 | ||
2857 | vm = va->vm; | 2866 | vm = va->vm; |
@@ -2931,7 +2940,7 @@ long vwrite(char *buf, char *addr, unsigned long count) | |||
2931 | if (!count) | 2940 | if (!count) |
2932 | break; | 2941 | break; |
2933 | 2942 | ||
2934 | if (!(va->flags & VM_VM_AREA)) | 2943 | if (!va->vm) |
2935 | continue; | 2944 | continue; |
2936 | 2945 | ||
2937 | vm = va->vm; | 2946 | vm = va->vm; |
@@ -3450,6 +3459,22 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) | |||
3450 | } | 3459 | } |
3451 | } | 3460 | } |
3452 | 3461 | ||
3462 | static void show_purge_info(struct seq_file *m) | ||
3463 | { | ||
3464 | struct llist_node *head; | ||
3465 | struct vmap_area *va; | ||
3466 | |||
3467 | head = READ_ONCE(vmap_purge_list.first); | ||
3468 | if (head == NULL) | ||
3469 | return; | ||
3470 | |||
3471 | llist_for_each_entry(va, head, purge_list) { | ||
3472 | seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", | ||
3473 | (void *)va->va_start, (void *)va->va_end, | ||
3474 | va->va_end - va->va_start); | ||
3475 | } | ||
3476 | } | ||
3477 | |||
3453 | static int s_show(struct seq_file *m, void *p) | 3478 | static int s_show(struct seq_file *m, void *p) |
3454 | { | 3479 | { |
3455 | struct vmap_area *va; | 3480 | struct vmap_area *va; |
@@ -3458,14 +3483,13 @@ static int s_show(struct seq_file *m, void *p) | |||
3458 | va = list_entry(p, struct vmap_area, list); | 3483 | va = list_entry(p, struct vmap_area, list); |
3459 | 3484 | ||
3460 | /* | 3485 | /* |
3461 | * s_show can encounter race with remove_vm_area, !VM_VM_AREA on | 3486 | * s_show can encounter race with remove_vm_area, !vm on behalf |
3462 | * behalf of vmap area is being tear down or vm_map_ram allocation. | 3487 | * of vmap area is being tear down or vm_map_ram allocation. |
3463 | */ | 3488 | */ |
3464 | if (!(va->flags & VM_VM_AREA)) { | 3489 | if (!va->vm) { |
3465 | seq_printf(m, "0x%pK-0x%pK %7ld %s\n", | 3490 | seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", |
3466 | (void *)va->va_start, (void *)va->va_end, | 3491 | (void *)va->va_start, (void *)va->va_end, |
3467 | va->va_end - va->va_start, | 3492 | va->va_end - va->va_start); |
3468 | va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram"); | ||
3469 | 3493 | ||
3470 | return 0; | 3494 | return 0; |
3471 | } | 3495 | } |
@@ -3504,6 +3528,16 @@ static int s_show(struct seq_file *m, void *p) | |||
3504 | 3528 | ||
3505 | show_numa_info(m, v); | 3529 | show_numa_info(m, v); |
3506 | seq_putc(m, '\n'); | 3530 | seq_putc(m, '\n'); |
3531 | |||
3532 | /* | ||
3533 | * As a final step, dump "unpurged" areas. Note, | ||
3534 | * that entire "/proc/vmallocinfo" output will not | ||
3535 | * be address sorted, because the purge list is not | ||
3536 | * sorted. | ||
3537 | */ | ||
3538 | if (list_is_last(&va->list, &vmap_area_list)) | ||
3539 | show_purge_info(m); | ||
3540 | |||
3507 | return 0; | 3541 | return 0; |
3508 | } | 3542 | } |
3509 | 3543 | ||
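The new show_purge_info() output is only visible from userspace: lazily freed ranges are appended to /proc/vmallocinfo after the regular, address-sorted entries, tagged "unpurged vm_area". A small userspace reader to observe that (illustrative, not part of the patch; the file usually needs root to show real addresses):

#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/vmallocinfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* "unpurged vm_area" lines print last, unsorted */
	fclose(f);
	return 0;
}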
diff --git a/mm/vmscan.c b/mm/vmscan.c index a6c5d0b28321..4911754c93b7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -171,11 +171,22 @@ int vm_swappiness = 60; | |||
171 | */ | 171 | */ |
172 | unsigned long vm_total_pages; | 172 | unsigned long vm_total_pages; |
173 | 173 | ||
174 | static void set_task_reclaim_state(struct task_struct *task, | ||
175 | struct reclaim_state *rs) | ||
176 | { | ||
177 | /* Check for an overwrite */ | ||
178 | WARN_ON_ONCE(rs && task->reclaim_state); | ||
179 | |||
180 | /* Check for the nulling of an already-nulled member */ | ||
181 | WARN_ON_ONCE(!rs && !task->reclaim_state); | ||
182 | |||
183 | task->reclaim_state = rs; | ||
184 | } | ||
185 | |||
174 | static LIST_HEAD(shrinker_list); | 186 | static LIST_HEAD(shrinker_list); |
175 | static DECLARE_RWSEM(shrinker_rwsem); | 187 | static DECLARE_RWSEM(shrinker_rwsem); |
176 | 188 | ||
177 | #ifdef CONFIG_MEMCG_KMEM | 189 | #ifdef CONFIG_MEMCG |
178 | |||
179 | /* | 190 | /* |
180 | * We allow subsystems to populate their shrinker-related | 191 | * We allow subsystems to populate their shrinker-related |
181 | * LRU lists before register_shrinker_prepared() is called | 192 | * LRU lists before register_shrinker_prepared() is called |
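set_task_reclaim_state() is a guarded setter for current->reclaim_state; the two WARN_ON_ONCE() checks catch setting it twice or clearing it twice. Its callers in vmscan.c bracket a reclaim pass roughly like this (condensed sketch, reclaim body elided):

	set_task_reclaim_state(current, &sc.reclaim_state);
	/* ... shrink zones and slab; freed slab pages accumulate in sc.reclaim_state ... */
	set_task_reclaim_state(current, NULL);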
@@ -227,30 +238,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) | |||
227 | idr_remove(&shrinker_idr, id); | 238 | idr_remove(&shrinker_idr, id); |
228 | up_write(&shrinker_rwsem); | 239 | up_write(&shrinker_rwsem); |
229 | } | 240 | } |
230 | #else /* CONFIG_MEMCG_KMEM */ | ||
231 | static int prealloc_memcg_shrinker(struct shrinker *shrinker) | ||
232 | { | ||
233 | return 0; | ||
234 | } | ||
235 | |||
236 | static void unregister_memcg_shrinker(struct shrinker *shrinker) | ||
237 | { | ||
238 | } | ||
239 | #endif /* CONFIG_MEMCG_KMEM */ | ||
240 | |||
241 | static void set_task_reclaim_state(struct task_struct *task, | ||
242 | struct reclaim_state *rs) | ||
243 | { | ||
244 | /* Check for an overwrite */ | ||
245 | WARN_ON_ONCE(rs && task->reclaim_state); | ||
246 | |||
247 | /* Check for the nulling of an already-nulled member */ | ||
248 | WARN_ON_ONCE(!rs && !task->reclaim_state); | ||
249 | 241 | ||
250 | task->reclaim_state = rs; | ||
251 | } | ||
252 | |||
253 | #ifdef CONFIG_MEMCG | ||
254 | static bool global_reclaim(struct scan_control *sc) | 242 | static bool global_reclaim(struct scan_control *sc) |
255 | { | 243 | { |
256 | return !sc->target_mem_cgroup; | 244 | return !sc->target_mem_cgroup; |
@@ -305,6 +293,15 @@ static bool memcg_congested(pg_data_t *pgdat, | |||
305 | 293 | ||
306 | } | 294 | } |
307 | #else | 295 | #else |
296 | static int prealloc_memcg_shrinker(struct shrinker *shrinker) | ||
297 | { | ||
298 | return 0; | ||
299 | } | ||
300 | |||
301 | static void unregister_memcg_shrinker(struct shrinker *shrinker) | ||
302 | { | ||
303 | } | ||
304 | |||
308 | static bool global_reclaim(struct scan_control *sc) | 305 | static bool global_reclaim(struct scan_control *sc) |
309 | { | 306 | { |
310 | return true; | 307 | return true; |
@@ -591,7 +588,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, | |||
591 | return freed; | 588 | return freed; |
592 | } | 589 | } |
593 | 590 | ||
594 | #ifdef CONFIG_MEMCG_KMEM | 591 | #ifdef CONFIG_MEMCG |
595 | static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, | 592 | static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, |
596 | struct mem_cgroup *memcg, int priority) | 593 | struct mem_cgroup *memcg, int priority) |
597 | { | 594 | { |
@@ -599,7 +596,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, | |||
599 | unsigned long ret, freed = 0; | 596 | unsigned long ret, freed = 0; |
600 | int i; | 597 | int i; |
601 | 598 | ||
602 | if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)) | 599 | if (!mem_cgroup_online(memcg)) |
603 | return 0; | 600 | return 0; |
604 | 601 | ||
605 | if (!down_read_trylock(&shrinker_rwsem)) | 602 | if (!down_read_trylock(&shrinker_rwsem)) |
@@ -625,6 +622,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, | |||
625 | continue; | 622 | continue; |
626 | } | 623 | } |
627 | 624 | ||
625 | /* Call non-slab shrinkers even though kmem is disabled */ | ||
626 | if (!memcg_kmem_enabled() && | ||
627 | !(shrinker->flags & SHRINKER_NONSLAB)) | ||
628 | continue; | ||
629 | |||
628 | ret = do_shrink_slab(&sc, shrinker, priority); | 630 | ret = do_shrink_slab(&sc, shrinker, priority); |
629 | if (ret == SHRINK_EMPTY) { | 631 | if (ret == SHRINK_EMPTY) { |
630 | clear_bit(i, map->map); | 632 | clear_bit(i, map->map); |
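The SHRINKER_NONSLAB test above serves memcg-aware shrinkers whose objects are not kmem-charged slab memory (the deferred-split THP shrinker introduced elsewhere in this series); those still need to run when memcg_kmem_enabled() is false. A registration sketch, with the two callbacks left as hypothetical names:

#include <linux/shrinker.h>

static unsigned long demo_count(struct shrinker *s, struct shrink_control *sc);
static unsigned long demo_scan(struct shrinker *s, struct shrink_control *sc);

static struct shrinker demo_shrinker = {
	.count_objects	= demo_count,	/* hypothetical callbacks */
	.scan_objects	= demo_scan,
	.seeks		= DEFAULT_SEEKS,
	.flags		= SHRINKER_MEMCG_AWARE | SHRINKER_NONSLAB,
};

static int __init demo_shrinker_init(void)
{
	return register_shrinker(&demo_shrinker);
}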
@@ -661,13 +663,13 @@ unlock: | |||
661 | up_read(&shrinker_rwsem); | 663 | up_read(&shrinker_rwsem); |
662 | return freed; | 664 | return freed; |
663 | } | 665 | } |
664 | #else /* CONFIG_MEMCG_KMEM */ | 666 | #else /* CONFIG_MEMCG */ |
665 | static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, | 667 | static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, |
666 | struct mem_cgroup *memcg, int priority) | 668 | struct mem_cgroup *memcg, int priority) |
667 | { | 669 | { |
668 | return 0; | 670 | return 0; |
669 | } | 671 | } |
670 | #endif /* CONFIG_MEMCG_KMEM */ | 672 | #endif /* CONFIG_MEMCG */ |
671 | 673 | ||
672 | /** | 674 | /** |
673 | * shrink_slab - shrink slab caches | 675 | * shrink_slab - shrink slab caches |
@@ -1149,7 +1151,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1149 | 1151 | ||
1150 | VM_BUG_ON_PAGE(PageActive(page), page); | 1152 | VM_BUG_ON_PAGE(PageActive(page), page); |
1151 | 1153 | ||
1152 | nr_pages = 1 << compound_order(page); | 1154 | nr_pages = compound_nr(page); |
1153 | 1155 | ||
1154 | /* Account the number of base pages even though THP */ | 1156 | /* Account the number of base pages even though THP */ |
1155 | sc->nr_scanned += nr_pages; | 1157 | sc->nr_scanned += nr_pages; |
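compound_nr() is one of the small helpers added by this series: it packages the open-coded "1 << compound_order(page)" count of base pages in a compound page. Equivalence sketch only (the wrapper name is illustrative):

#include <linux/mm.h>

/* what the old open-coded form computed; compound_nr(page) returns the same */
static inline unsigned long demo_nr_subpages(struct page *page)
{
	return 1UL << compound_order(page);	/* == compound_nr(page) */
}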
@@ -1487,10 +1489,9 @@ free_it: | |||
1487 | * Is there need to periodically free_page_list? It would | 1489 | * Is there need to periodically free_page_list? It would |
1488 | * appear not as the counts should be low | 1490 | * appear not as the counts should be low |
1489 | */ | 1491 | */ |
1490 | if (unlikely(PageTransHuge(page))) { | 1492 | if (unlikely(PageTransHuge(page))) |
1491 | mem_cgroup_uncharge(page); | ||
1492 | (*get_compound_page_dtor(page))(page); | 1493 | (*get_compound_page_dtor(page))(page); |
1493 | } else | 1494 | else |
1494 | list_add(&page->lru, &free_pages); | 1495 | list_add(&page->lru, &free_pages); |
1495 | continue; | 1496 | continue; |
1496 | 1497 | ||
@@ -1705,7 +1706,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1705 | 1706 | ||
1706 | VM_BUG_ON_PAGE(!PageLRU(page), page); | 1707 | VM_BUG_ON_PAGE(!PageLRU(page), page); |
1707 | 1708 | ||
1708 | nr_pages = 1 << compound_order(page); | 1709 | nr_pages = compound_nr(page); |
1709 | total_scan += nr_pages; | 1710 | total_scan += nr_pages; |
1710 | 1711 | ||
1711 | if (page_zonenum(page) > sc->reclaim_idx) { | 1712 | if (page_zonenum(page) > sc->reclaim_idx) { |
@@ -1911,7 +1912,6 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, | |||
1911 | 1912 | ||
1912 | if (unlikely(PageCompound(page))) { | 1913 | if (unlikely(PageCompound(page))) { |
1913 | spin_unlock_irq(&pgdat->lru_lock); | 1914 | spin_unlock_irq(&pgdat->lru_lock); |
1914 | mem_cgroup_uncharge(page); | ||
1915 | (*get_compound_page_dtor(page))(page); | 1915 | (*get_compound_page_dtor(page))(page); |
1916 | spin_lock_irq(&pgdat->lru_lock); | 1916 | spin_lock_irq(&pgdat->lru_lock); |
1917 | } else | 1917 | } else |
@@ -2586,7 +2586,6 @@ static bool in_reclaim_compaction(struct scan_control *sc) | |||
2586 | */ | 2586 | */ |
2587 | static inline bool should_continue_reclaim(struct pglist_data *pgdat, | 2587 | static inline bool should_continue_reclaim(struct pglist_data *pgdat, |
2588 | unsigned long nr_reclaimed, | 2588 | unsigned long nr_reclaimed, |
2589 | unsigned long nr_scanned, | ||
2590 | struct scan_control *sc) | 2589 | struct scan_control *sc) |
2591 | { | 2590 | { |
2592 | unsigned long pages_for_compaction; | 2591 | unsigned long pages_for_compaction; |
@@ -2597,40 +2596,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, | |||
2597 | if (!in_reclaim_compaction(sc)) | 2596 | if (!in_reclaim_compaction(sc)) |
2598 | return false; | 2597 | return false; |
2599 | 2598 | ||
2600 | /* Consider stopping depending on scan and reclaim activity */ | ||
2601 | if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) { | ||
2602 | /* | ||
2603 | * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the | ||
2604 | * full LRU list has been scanned and we are still failing | ||
2605 | * to reclaim pages. This full LRU scan is potentially | ||
2606 | * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed | ||
2607 | */ | ||
2608 | if (!nr_reclaimed && !nr_scanned) | ||
2609 | return false; | ||
2610 | } else { | ||
2611 | /* | ||
2612 | * For non-__GFP_RETRY_MAYFAIL allocations which can presumably | ||
2613 | * fail without consequence, stop if we failed to reclaim | ||
2614 | * any pages from the last SWAP_CLUSTER_MAX number of | ||
2615 | * pages that were scanned. This will return to the | ||
2616 | * caller faster at the risk reclaim/compaction and | ||
2617 | * the resulting allocation attempt fails | ||
2618 | */ | ||
2619 | if (!nr_reclaimed) | ||
2620 | return false; | ||
2621 | } | ||
2622 | |||
2623 | /* | 2599 | /* |
2624 | * If we have not reclaimed enough pages for compaction and the | 2600 | * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX |
2625 | * inactive lists are large enough, continue reclaiming | 2601 | * number of pages that were scanned. This will return to the caller |
2602 | * with the risk reclaim/compaction and the resulting allocation attempt | ||
2603 | * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL | ||
2604 | * allocations through requiring that the full LRU list has been scanned | ||
2605 | * first, by assuming that zero delta of sc->nr_scanned means full LRU | ||
2606 | * scan, but that approximation was wrong, and there were corner cases | ||
2607 | * where always a non-zero amount of pages were scanned. | ||
2626 | */ | 2608 | */ |
2627 | pages_for_compaction = compact_gap(sc->order); | 2609 | if (!nr_reclaimed) |
2628 | inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); | 2610 | return false; |
2629 | if (get_nr_swap_pages() > 0) | ||
2630 | inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); | ||
2631 | if (sc->nr_reclaimed < pages_for_compaction && | ||
2632 | inactive_lru_pages > pages_for_compaction) | ||
2633 | return true; | ||
2634 | 2611 | ||
2635 | /* If compaction would go ahead or the allocation would succeed, stop */ | 2612 | /* If compaction would go ahead or the allocation would succeed, stop */ |
2636 | for (z = 0; z <= sc->reclaim_idx; z++) { | 2613 | for (z = 0; z <= sc->reclaim_idx; z++) { |
@@ -2647,7 +2624,17 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, | |||
2647 | ; | 2624 | ; |
2648 | } | 2625 | } |
2649 | } | 2626 | } |
2650 | return true; | 2627 | |
2628 | /* | ||
2629 | * If we have not reclaimed enough pages for compaction and the | ||
2630 | * inactive lists are large enough, continue reclaiming | ||
2631 | */ | ||
2632 | pages_for_compaction = compact_gap(sc->order); | ||
2633 | inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); | ||
2634 | if (get_nr_swap_pages() > 0) | ||
2635 | inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON); | ||
2636 | |||
2637 | return inactive_lru_pages > pages_for_compaction; | ||
2651 | } | 2638 | } |
2652 | 2639 | ||
2653 | static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) | 2640 | static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) |
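Condensed, the reworked should_continue_reclaim() reads: stop when the last pass reclaimed nothing, stop when some eligible zone is already compaction-ready (or the allocation would succeed), and otherwise continue only while the inactive lists still exceed the compaction gap. Restated as a standalone predicate (a sketch, not the literal kernel function):

#include <linux/types.h>
#include <linux/compaction.h>

static inline bool demo_keep_reclaiming(unsigned long nr_reclaimed,
					bool compaction_ready,
					unsigned long inactive_lru_pages,
					unsigned int order)
{
	if (!nr_reclaimed)		/* last pass made no progress */
		return false;
	if (compaction_ready)		/* compaction or the allocation can go ahead */
		return false;
	return inactive_lru_pages > compact_gap(order);
}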
@@ -2664,10 +2651,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |||
2664 | 2651 | ||
2665 | do { | 2652 | do { |
2666 | struct mem_cgroup *root = sc->target_mem_cgroup; | 2653 | struct mem_cgroup *root = sc->target_mem_cgroup; |
2667 | struct mem_cgroup_reclaim_cookie reclaim = { | ||
2668 | .pgdat = pgdat, | ||
2669 | .priority = sc->priority, | ||
2670 | }; | ||
2671 | unsigned long node_lru_pages = 0; | 2654 | unsigned long node_lru_pages = 0; |
2672 | struct mem_cgroup *memcg; | 2655 | struct mem_cgroup *memcg; |
2673 | 2656 | ||
@@ -2676,7 +2659,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |||
2676 | nr_reclaimed = sc->nr_reclaimed; | 2659 | nr_reclaimed = sc->nr_reclaimed; |
2677 | nr_scanned = sc->nr_scanned; | 2660 | nr_scanned = sc->nr_scanned; |
2678 | 2661 | ||
2679 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 2662 | memcg = mem_cgroup_iter(root, NULL, NULL); |
2680 | do { | 2663 | do { |
2681 | unsigned long lru_pages; | 2664 | unsigned long lru_pages; |
2682 | unsigned long reclaimed; | 2665 | unsigned long reclaimed; |
@@ -2719,21 +2702,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |||
2719 | sc->nr_scanned - scanned, | 2702 | sc->nr_scanned - scanned, |
2720 | sc->nr_reclaimed - reclaimed); | 2703 | sc->nr_reclaimed - reclaimed); |
2721 | 2704 | ||
2722 | /* | 2705 | } while ((memcg = mem_cgroup_iter(root, memcg, NULL))); |
2723 | * Kswapd have to scan all memory cgroups to fulfill | ||
2724 | * the overall scan target for the node. | ||
2725 | * | ||
2726 | * Limit reclaim, on the other hand, only cares about | ||
2727 | * nr_to_reclaim pages to be reclaimed and it will | ||
2728 | * retry with decreasing priority if one round over the | ||
2729 | * whole hierarchy is not sufficient. | ||
2730 | */ | ||
2731 | if (!current_is_kswapd() && | ||
2732 | sc->nr_reclaimed >= sc->nr_to_reclaim) { | ||
2733 | mem_cgroup_iter_break(root, memcg); | ||
2734 | break; | ||
2735 | } | ||
2736 | } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); | ||
2737 | 2706 | ||
2738 | if (reclaim_state) { | 2707 | if (reclaim_state) { |
2739 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | 2708 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; |
@@ -2810,7 +2779,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |||
2810 | wait_iff_congested(BLK_RW_ASYNC, HZ/10); | 2779 | wait_iff_congested(BLK_RW_ASYNC, HZ/10); |
2811 | 2780 | ||
2812 | } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, | 2781 | } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, |
2813 | sc->nr_scanned - nr_scanned, sc)); | 2782 | sc)); |
2814 | 2783 | ||
2815 | /* | 2784 | /* |
2816 | * Kswapd gives up on balancing particular nodes after too | 2785 | * Kswapd gives up on balancing particular nodes after too |
diff --git a/mm/vmstat.c b/mm/vmstat.c index fd7e16ca6996..6afc892a148a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -1158,6 +1158,8 @@ const char * const vmstat_text[] = { | |||
1158 | "nr_shmem", | 1158 | "nr_shmem", |
1159 | "nr_shmem_hugepages", | 1159 | "nr_shmem_hugepages", |
1160 | "nr_shmem_pmdmapped", | 1160 | "nr_shmem_pmdmapped", |
1161 | "nr_file_hugepages", | ||
1162 | "nr_file_pmdmapped", | ||
1161 | "nr_anon_transparent_hugepages", | 1163 | "nr_anon_transparent_hugepages", |
1162 | "nr_unstable", | 1164 | "nr_unstable", |
1163 | "nr_vmscan_write", | 1165 | "nr_vmscan_write", |
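The two new counters show up in /proc/vmstat as "nr_file_hugepages" and "nr_file_pmdmapped". A quick userspace check (illustrative only):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "nr_file_hugepages", 17) ||
		    !strncmp(line, "nr_file_pmdmapped", 17))
			fputs(line, stdout);
	fclose(f);
	return 0;
}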
diff --git a/mm/z3fold.c b/mm/z3fold.c index 75b7962439ff..05bdf90646e7 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c | |||
@@ -41,7 +41,6 @@ | |||
41 | #include <linux/workqueue.h> | 41 | #include <linux/workqueue.h> |
42 | #include <linux/slab.h> | 42 | #include <linux/slab.h> |
43 | #include <linux/spinlock.h> | 43 | #include <linux/spinlock.h> |
44 | #include <linux/wait.h> | ||
45 | #include <linux/zpool.h> | 44 | #include <linux/zpool.h> |
46 | #include <linux/magic.h> | 45 | #include <linux/magic.h> |
47 | 46 | ||
@@ -146,8 +145,6 @@ struct z3fold_header { | |||
146 | * @release_wq: workqueue for safe page release | 145 | * @release_wq: workqueue for safe page release |
147 | * @work: work_struct for safe page release | 146 | * @work: work_struct for safe page release |
148 | * @inode: inode for z3fold pseudo filesystem | 147 | * @inode: inode for z3fold pseudo filesystem |
149 | * @destroying: bool to stop migration once we start destruction | ||
150 | * @isolated: int to count the number of pages currently in isolation | ||
151 | * | 148 | * |
152 | * This structure is allocated at pool creation time and maintains metadata | 149 | * This structure is allocated at pool creation time and maintains metadata |
153 | * pertaining to a particular z3fold pool. | 150 | * pertaining to a particular z3fold pool. |
@@ -166,11 +163,8 @@ struct z3fold_pool { | |||
166 | const struct zpool_ops *zpool_ops; | 163 | const struct zpool_ops *zpool_ops; |
167 | struct workqueue_struct *compact_wq; | 164 | struct workqueue_struct *compact_wq; |
168 | struct workqueue_struct *release_wq; | 165 | struct workqueue_struct *release_wq; |
169 | struct wait_queue_head isolate_wait; | ||
170 | struct work_struct work; | 166 | struct work_struct work; |
171 | struct inode *inode; | 167 | struct inode *inode; |
172 | bool destroying; | ||
173 | int isolated; | ||
174 | }; | 168 | }; |
175 | 169 | ||
176 | /* | 170 | /* |
@@ -301,14 +295,11 @@ static void z3fold_unregister_migration(struct z3fold_pool *pool) | |||
301 | } | 295 | } |
302 | 296 | ||
303 | /* Initializes the z3fold header of a newly allocated z3fold page */ | 297 | /* Initializes the z3fold header of a newly allocated z3fold page */ |
304 | static struct z3fold_header *init_z3fold_page(struct page *page, | 298 | static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, |
305 | struct z3fold_pool *pool, gfp_t gfp) | 299 | struct z3fold_pool *pool, gfp_t gfp) |
306 | { | 300 | { |
307 | struct z3fold_header *zhdr = page_address(page); | 301 | struct z3fold_header *zhdr = page_address(page); |
308 | struct z3fold_buddy_slots *slots = alloc_slots(pool, gfp); | 302 | struct z3fold_buddy_slots *slots; |
309 | |||
310 | if (!slots) | ||
311 | return NULL; | ||
312 | 303 | ||
313 | INIT_LIST_HEAD(&page->lru); | 304 | INIT_LIST_HEAD(&page->lru); |
314 | clear_bit(PAGE_HEADLESS, &page->private); | 305 | clear_bit(PAGE_HEADLESS, &page->private); |
@@ -316,6 +307,12 @@ static struct z3fold_header *init_z3fold_page(struct page *page, | |||
316 | clear_bit(NEEDS_COMPACTING, &page->private); | 307 | clear_bit(NEEDS_COMPACTING, &page->private); |
317 | clear_bit(PAGE_STALE, &page->private); | 308 | clear_bit(PAGE_STALE, &page->private); |
318 | clear_bit(PAGE_CLAIMED, &page->private); | 309 | clear_bit(PAGE_CLAIMED, &page->private); |
310 | if (headless) | ||
311 | return zhdr; | ||
312 | |||
313 | slots = alloc_slots(pool, gfp); | ||
314 | if (!slots) | ||
315 | return NULL; | ||
319 | 316 | ||
320 | spin_lock_init(&zhdr->page_lock); | 317 | spin_lock_init(&zhdr->page_lock); |
321 | kref_init(&zhdr->refcount); | 318 | kref_init(&zhdr->refcount); |
@@ -372,9 +369,10 @@ static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) | |||
372 | * Encodes the handle of a particular buddy within a z3fold page | 369 | * Encodes the handle of a particular buddy within a z3fold page |
373 | * Pool lock should be held as this function accesses first_num | 370 | * Pool lock should be held as this function accesses first_num |
374 | */ | 371 | */ |
375 | static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) | 372 | static unsigned long __encode_handle(struct z3fold_header *zhdr, |
373 | struct z3fold_buddy_slots *slots, | ||
374 | enum buddy bud) | ||
376 | { | 375 | { |
377 | struct z3fold_buddy_slots *slots; | ||
378 | unsigned long h = (unsigned long)zhdr; | 376 | unsigned long h = (unsigned long)zhdr; |
379 | int idx = 0; | 377 | int idx = 0; |
380 | 378 | ||
@@ -391,11 +389,15 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) | |||
391 | if (bud == LAST) | 389 | if (bud == LAST) |
392 | h |= (zhdr->last_chunks << BUDDY_SHIFT); | 390 | h |= (zhdr->last_chunks << BUDDY_SHIFT); |
393 | 391 | ||
394 | slots = zhdr->slots; | ||
395 | slots->slot[idx] = h; | 392 | slots->slot[idx] = h; |
396 | return (unsigned long)&slots->slot[idx]; | 393 | return (unsigned long)&slots->slot[idx]; |
397 | } | 394 | } |
398 | 395 | ||
396 | static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) | ||
397 | { | ||
398 | return __encode_handle(zhdr, zhdr->slots, bud); | ||
399 | } | ||
400 | |||
399 | /* Returns the z3fold page where a given handle is stored */ | 401 | /* Returns the z3fold page where a given handle is stored */ |
400 | static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) | 402 | static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) |
401 | { | 403 | { |
@@ -630,6 +632,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) | |||
630 | } | 632 | } |
631 | 633 | ||
632 | if (unlikely(PageIsolated(page) || | 634 | if (unlikely(PageIsolated(page) || |
635 | test_bit(PAGE_CLAIMED, &page->private) || | ||
633 | test_bit(PAGE_STALE, &page->private))) { | 636 | test_bit(PAGE_STALE, &page->private))) { |
634 | z3fold_page_unlock(zhdr); | 637 | z3fold_page_unlock(zhdr); |
635 | return; | 638 | return; |
@@ -775,7 +778,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, | |||
775 | goto out_c; | 778 | goto out_c; |
776 | spin_lock_init(&pool->lock); | 779 | spin_lock_init(&pool->lock); |
777 | spin_lock_init(&pool->stale_lock); | 780 | spin_lock_init(&pool->stale_lock); |
778 | init_waitqueue_head(&pool->isolate_wait); | ||
779 | pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); | 781 | pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); |
780 | if (!pool->unbuddied) | 782 | if (!pool->unbuddied) |
781 | goto out_pool; | 783 | goto out_pool; |
@@ -815,15 +817,6 @@ out: | |||
815 | return NULL; | 817 | return NULL; |
816 | } | 818 | } |
817 | 819 | ||
818 | static bool pool_isolated_are_drained(struct z3fold_pool *pool) | ||
819 | { | ||
820 | bool ret; | ||
821 | |||
822 | spin_lock(&pool->lock); | ||
823 | ret = pool->isolated == 0; | ||
824 | spin_unlock(&pool->lock); | ||
825 | return ret; | ||
826 | } | ||
827 | /** | 820 | /** |
828 | * z3fold_destroy_pool() - destroys an existing z3fold pool | 821 | * z3fold_destroy_pool() - destroys an existing z3fold pool |
829 | * @pool: the z3fold pool to be destroyed | 822 | * @pool: the z3fold pool to be destroyed |
@@ -833,22 +826,6 @@ static bool pool_isolated_are_drained(struct z3fold_pool *pool) | |||
833 | static void z3fold_destroy_pool(struct z3fold_pool *pool) | 826 | static void z3fold_destroy_pool(struct z3fold_pool *pool) |
834 | { | 827 | { |
835 | kmem_cache_destroy(pool->c_handle); | 828 | kmem_cache_destroy(pool->c_handle); |
836 | /* | ||
837 | * We set pool-> destroying under lock to ensure that | ||
838 | * z3fold_page_isolate() sees any changes to destroying. This way we | ||
839 | * avoid the need for any memory barriers. | ||
840 | */ | ||
841 | |||
842 | spin_lock(&pool->lock); | ||
843 | pool->destroying = true; | ||
844 | spin_unlock(&pool->lock); | ||
845 | |||
846 | /* | ||
847 | * We need to ensure that no pages are being migrated while we destroy | ||
848 | * these workqueues, as migration can queue work on either of the | ||
849 | * workqueues. | ||
850 | */ | ||
851 | wait_event(pool->isolate_wait, !pool_isolated_are_drained(pool)); | ||
852 | 829 | ||
853 | /* | 830 | /* |
854 | * We need to destroy pool->compact_wq before pool->release_wq, | 831 | * We need to destroy pool->compact_wq before pool->release_wq, |
@@ -956,7 +933,7 @@ retry: | |||
956 | if (!page) | 933 | if (!page) |
957 | return -ENOMEM; | 934 | return -ENOMEM; |
958 | 935 | ||
959 | zhdr = init_z3fold_page(page, pool, gfp); | 936 | zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp); |
960 | if (!zhdr) { | 937 | if (!zhdr) { |
961 | __free_page(page); | 938 | __free_page(page); |
962 | return -ENOMEM; | 939 | return -ENOMEM; |
@@ -1132,6 +1109,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) | |||
1132 | struct z3fold_header *zhdr = NULL; | 1109 | struct z3fold_header *zhdr = NULL; |
1133 | struct page *page = NULL; | 1110 | struct page *page = NULL; |
1134 | struct list_head *pos; | 1111 | struct list_head *pos; |
1112 | struct z3fold_buddy_slots slots; | ||
1135 | unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; | 1113 | unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; |
1136 | 1114 | ||
1137 | spin_lock(&pool->lock); | 1115 | spin_lock(&pool->lock); |
@@ -1150,16 +1128,22 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) | |||
1150 | /* this bit could have been set by free, in which case | 1128 | /* this bit could have been set by free, in which case |
1151 | * we pass over to the next page in the pool. | 1129 | * we pass over to the next page in the pool. |
1152 | */ | 1130 | */ |
1153 | if (test_and_set_bit(PAGE_CLAIMED, &page->private)) | 1131 | if (test_and_set_bit(PAGE_CLAIMED, &page->private)) { |
1132 | page = NULL; | ||
1154 | continue; | 1133 | continue; |
1134 | } | ||
1155 | 1135 | ||
1156 | if (unlikely(PageIsolated(page))) | 1136 | if (unlikely(PageIsolated(page))) { |
1137 | clear_bit(PAGE_CLAIMED, &page->private); | ||
1138 | page = NULL; | ||
1157 | continue; | 1139 | continue; |
1140 | } | ||
1141 | zhdr = page_address(page); | ||
1158 | if (test_bit(PAGE_HEADLESS, &page->private)) | 1142 | if (test_bit(PAGE_HEADLESS, &page->private)) |
1159 | break; | 1143 | break; |
1160 | 1144 | ||
1161 | zhdr = page_address(page); | ||
1162 | if (!z3fold_page_trylock(zhdr)) { | 1145 | if (!z3fold_page_trylock(zhdr)) { |
1146 | clear_bit(PAGE_CLAIMED, &page->private); | ||
1163 | zhdr = NULL; | 1147 | zhdr = NULL; |
1164 | continue; /* can't evict at this point */ | 1148 | continue; /* can't evict at this point */ |
1165 | } | 1149 | } |
@@ -1177,26 +1161,30 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) | |||
1177 | 1161 | ||
1178 | if (!test_bit(PAGE_HEADLESS, &page->private)) { | 1162 | if (!test_bit(PAGE_HEADLESS, &page->private)) { |
1179 | /* | 1163 | /* |
1180 | * We need encode the handles before unlocking, since | 1164 | * We need encode the handles before unlocking, and |
1181 | * we can race with free that will set | 1165 | * use our local slots structure because z3fold_free |
1182 | * (first|last)_chunks to 0 | 1166 | * can zero out zhdr->slots and we can't do much |
1167 | * about that | ||
1183 | */ | 1168 | */ |
1184 | first_handle = 0; | 1169 | first_handle = 0; |
1185 | last_handle = 0; | 1170 | last_handle = 0; |
1186 | middle_handle = 0; | 1171 | middle_handle = 0; |
1187 | if (zhdr->first_chunks) | 1172 | if (zhdr->first_chunks) |
1188 | first_handle = encode_handle(zhdr, FIRST); | 1173 | first_handle = __encode_handle(zhdr, &slots, |
1174 | FIRST); | ||
1189 | if (zhdr->middle_chunks) | 1175 | if (zhdr->middle_chunks) |
1190 | middle_handle = encode_handle(zhdr, MIDDLE); | 1176 | middle_handle = __encode_handle(zhdr, &slots, |
1177 | MIDDLE); | ||
1191 | if (zhdr->last_chunks) | 1178 | if (zhdr->last_chunks) |
1192 | last_handle = encode_handle(zhdr, LAST); | 1179 | last_handle = __encode_handle(zhdr, &slots, |
1180 | LAST); | ||
1193 | /* | 1181 | /* |
1194 | * it's safe to unlock here because we hold a | 1182 | * it's safe to unlock here because we hold a |
1195 | * reference to this page | 1183 | * reference to this page |
1196 | */ | 1184 | */ |
1197 | z3fold_page_unlock(zhdr); | 1185 | z3fold_page_unlock(zhdr); |
1198 | } else { | 1186 | } else { |
1199 | first_handle = encode_handle(zhdr, HEADLESS); | 1187 | first_handle = __encode_handle(zhdr, &slots, HEADLESS); |
1200 | last_handle = middle_handle = 0; | 1188 | last_handle = middle_handle = 0; |
1201 | } | 1189 | } |
1202 | 1190 | ||
@@ -1226,9 +1214,9 @@ next: | |||
1226 | spin_lock(&pool->lock); | 1214 | spin_lock(&pool->lock); |
1227 | list_add(&page->lru, &pool->lru); | 1215 | list_add(&page->lru, &pool->lru); |
1228 | spin_unlock(&pool->lock); | 1216 | spin_unlock(&pool->lock); |
1217 | clear_bit(PAGE_CLAIMED, &page->private); | ||
1229 | } else { | 1218 | } else { |
1230 | z3fold_page_lock(zhdr); | 1219 | z3fold_page_lock(zhdr); |
1231 | clear_bit(PAGE_CLAIMED, &page->private); | ||
1232 | if (kref_put(&zhdr->refcount, | 1220 | if (kref_put(&zhdr->refcount, |
1233 | release_z3fold_page_locked)) { | 1221 | release_z3fold_page_locked)) { |
1234 | atomic64_dec(&pool->pages_nr); | 1222 | atomic64_dec(&pool->pages_nr); |
@@ -1243,6 +1231,7 @@ next: | |||
1243 | list_add(&page->lru, &pool->lru); | 1231 | list_add(&page->lru, &pool->lru); |
1244 | spin_unlock(&pool->lock); | 1232 | spin_unlock(&pool->lock); |
1245 | z3fold_page_unlock(zhdr); | 1233 | z3fold_page_unlock(zhdr); |
1234 | clear_bit(PAGE_CLAIMED, &page->private); | ||
1246 | } | 1235 | } |
1247 | 1236 | ||
1248 | /* We started off locked to we need to lock the pool back */ | 1237 | /* We started off locked to we need to lock the pool back */ |
@@ -1339,28 +1328,6 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool) | |||
1339 | return atomic64_read(&pool->pages_nr); | 1328 | return atomic64_read(&pool->pages_nr); |
1340 | } | 1329 | } |
1341 | 1330 | ||
1342 | /* | ||
1343 | * z3fold_dec_isolated() expects to be called while pool->lock is held. | ||
1344 | */ | ||
1345 | static void z3fold_dec_isolated(struct z3fold_pool *pool) | ||
1346 | { | ||
1347 | assert_spin_locked(&pool->lock); | ||
1348 | VM_BUG_ON(pool->isolated <= 0); | ||
1349 | pool->isolated--; | ||
1350 | |||
1351 | /* | ||
1352 | * If we have no more isolated pages, we have to see if | ||
1353 | * z3fold_destroy_pool() is waiting for a signal. | ||
1354 | */ | ||
1355 | if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait)) | ||
1356 | wake_up_all(&pool->isolate_wait); | ||
1357 | } | ||
1358 | |||
1359 | static void z3fold_inc_isolated(struct z3fold_pool *pool) | ||
1360 | { | ||
1361 | pool->isolated++; | ||
1362 | } | ||
1363 | |||
1364 | static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) | 1331 | static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) |
1365 | { | 1332 | { |
1366 | struct z3fold_header *zhdr; | 1333 | struct z3fold_header *zhdr; |
@@ -1369,7 +1336,8 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) | |||
1369 | VM_BUG_ON_PAGE(!PageMovable(page), page); | 1336 | VM_BUG_ON_PAGE(!PageMovable(page), page); |
1370 | VM_BUG_ON_PAGE(PageIsolated(page), page); | 1337 | VM_BUG_ON_PAGE(PageIsolated(page), page); |
1371 | 1338 | ||
1372 | if (test_bit(PAGE_HEADLESS, &page->private)) | 1339 | if (test_bit(PAGE_HEADLESS, &page->private) || |
1340 | test_bit(PAGE_CLAIMED, &page->private)) | ||
1373 | return false; | 1341 | return false; |
1374 | 1342 | ||
1375 | zhdr = page_address(page); | 1343 | zhdr = page_address(page); |
@@ -1387,34 +1355,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) | |||
1387 | spin_lock(&pool->lock); | 1355 | spin_lock(&pool->lock); |
1388 | if (!list_empty(&page->lru)) | 1356 | if (!list_empty(&page->lru)) |
1389 | list_del(&page->lru); | 1357 | list_del(&page->lru); |
1390 | /* | ||
1391 | * We need to check for destruction while holding pool->lock, as | ||
1392 | * otherwise destruction could see 0 isolated pages, and | ||
1393 | * proceed. | ||
1394 | */ | ||
1395 | if (unlikely(pool->destroying)) { | ||
1396 | spin_unlock(&pool->lock); | ||
1397 | /* | ||
1398 | * If this page isn't stale, somebody else holds a | ||
1399 | * reference to it. Let't drop our refcount so that they | ||
1400 | * can call the release logic. | ||
1401 | */ | ||
1402 | if (unlikely(kref_put(&zhdr->refcount, | ||
1403 | release_z3fold_page_locked))) { | ||
1404 | /* | ||
1405 | * If we get here we have kref problems, so we | ||
1406 | * should freak out. | ||
1407 | */ | ||
1408 | WARN(1, "Z3fold is experiencing kref problems\n"); | ||
1409 | z3fold_page_unlock(zhdr); | ||
1410 | return false; | ||
1411 | } | ||
1412 | z3fold_page_unlock(zhdr); | ||
1413 | return false; | ||
1414 | } | ||
1415 | |||
1416 | |||
1417 | z3fold_inc_isolated(pool); | ||
1418 | spin_unlock(&pool->lock); | 1358 | spin_unlock(&pool->lock); |
1419 | z3fold_page_unlock(zhdr); | 1359 | z3fold_page_unlock(zhdr); |
1420 | return true; | 1360 | return true; |
@@ -1483,10 +1423,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa | |||
1483 | 1423 | ||
1484 | queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); | 1424 | queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); |
1485 | 1425 | ||
1486 | spin_lock(&pool->lock); | ||
1487 | z3fold_dec_isolated(pool); | ||
1488 | spin_unlock(&pool->lock); | ||
1489 | |||
1490 | page_mapcount_reset(page); | 1426 | page_mapcount_reset(page); |
1491 | put_page(page); | 1427 | put_page(page); |
1492 | return 0; | 1428 | return 0; |
@@ -1506,14 +1442,10 @@ static void z3fold_page_putback(struct page *page) | |||
1506 | INIT_LIST_HEAD(&page->lru); | 1442 | INIT_LIST_HEAD(&page->lru); |
1507 | if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { | 1443 | if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { |
1508 | atomic64_dec(&pool->pages_nr); | 1444 | atomic64_dec(&pool->pages_nr); |
1509 | spin_lock(&pool->lock); | ||
1510 | z3fold_dec_isolated(pool); | ||
1511 | spin_unlock(&pool->lock); | ||
1512 | return; | 1445 | return; |
1513 | } | 1446 | } |
1514 | spin_lock(&pool->lock); | 1447 | spin_lock(&pool->lock); |
1515 | list_add(&page->lru, &pool->lru); | 1448 | list_add(&page->lru, &pool->lru); |
1516 | z3fold_dec_isolated(pool); | ||
1517 | spin_unlock(&pool->lock); | 1449 | spin_unlock(&pool->lock); |
1518 | z3fold_page_unlock(zhdr); | 1450 | z3fold_page_unlock(zhdr); |
1519 | } | 1451 | } |
diff --git a/mm/zpool.c b/mm/zpool.c index a2dd9107857d..863669212070 100644 --- a/mm/zpool.c +++ b/mm/zpool.c | |||
@@ -239,6 +239,22 @@ const char *zpool_get_type(struct zpool *zpool) | |||
239 | } | 239 | } |
240 | 240 | ||
241 | /** | 241 | /** |
242 | * zpool_malloc_support_movable() - Check if the zpool supports | ||
243 | * allocating movable memory | ||
244 | * @zpool: The zpool to check | ||
245 | * | ||
246 | * This returns whether the zpool supports allocating movable memory. | ||
247 | * | ||
248 | * Implementations must guarantee this to be thread-safe. | ||
249 | * | ||
250 | * Returns: true if the zpool supports allocating movable memory, false if not | ||
251 | */ | ||
252 | bool zpool_malloc_support_movable(struct zpool *zpool) | ||
253 | { | ||
254 | return zpool->driver->malloc_support_movable; | ||
255 | } | ||
256 | |||
257 | /** | ||
242 | * zpool_malloc() - Allocate memory | 258 | * zpool_malloc() - Allocate memory |
243 | * @zpool: The zpool to allocate from. | 259 | * @zpool: The zpool to allocate from. |
244 | * @size: The amount of memory to allocate. | 260 | * @size: The amount of memory to allocate. |
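The new query lets a zpool user pick its GFP flags per backend, which is exactly how zswap uses it later in this merge: request highmem/movable memory only when the driver declared malloc_support_movable. A caller-side sketch (the function name is illustrative):

#include <linux/gfp.h>
#include <linux/zpool.h>

static int demo_zpool_store(struct zpool *zpool, size_t len,
			    unsigned long *handle)
{
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;

	if (zpool_malloc_support_movable(zpool))
		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;

	return zpool_malloc(zpool, len, gfp, handle);
}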
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e98bb6ab4f7e..2b2b9aae8a3c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -443,15 +443,16 @@ static u64 zs_zpool_total_size(void *pool) | |||
443 | } | 443 | } |
444 | 444 | ||
445 | static struct zpool_driver zs_zpool_driver = { | 445 | static struct zpool_driver zs_zpool_driver = { |
446 | .type = "zsmalloc", | 446 | .type = "zsmalloc", |
447 | .owner = THIS_MODULE, | 447 | .owner = THIS_MODULE, |
448 | .create = zs_zpool_create, | 448 | .create = zs_zpool_create, |
449 | .destroy = zs_zpool_destroy, | 449 | .destroy = zs_zpool_destroy, |
450 | .malloc = zs_zpool_malloc, | 450 | .malloc_support_movable = true, |
451 | .free = zs_zpool_free, | 451 | .malloc = zs_zpool_malloc, |
452 | .map = zs_zpool_map, | 452 | .free = zs_zpool_free, |
453 | .unmap = zs_zpool_unmap, | 453 | .map = zs_zpool_map, |
454 | .total_size = zs_zpool_total_size, | 454 | .unmap = zs_zpool_unmap, |
455 | .total_size = zs_zpool_total_size, | ||
455 | }; | 456 | }; |
456 | 457 | ||
457 | MODULE_ALIAS("zpool-zsmalloc"); | 458 | MODULE_ALIAS("zpool-zsmalloc"); |
@@ -476,10 +477,6 @@ static inline int get_zspage_inuse(struct zspage *zspage) | |||
476 | return zspage->inuse; | 477 | return zspage->inuse; |
477 | } | 478 | } |
478 | 479 | ||
479 | static inline void set_zspage_inuse(struct zspage *zspage, int val) | ||
480 | { | ||
481 | zspage->inuse = val; | ||
482 | } | ||
483 | 480 | ||
484 | static inline void mod_zspage_inuse(struct zspage *zspage, int val) | 481 | static inline void mod_zspage_inuse(struct zspage *zspage, int val) |
485 | { | 482 | { |
diff --git a/mm/zswap.c b/mm/zswap.c index 0e22744a76cb..46a322316e52 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
@@ -856,7 +856,6 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) | |||
856 | /* extract swpentry from data */ | 856 | /* extract swpentry from data */ |
857 | zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); | 857 | zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); |
858 | swpentry = zhdr->swpentry; /* here */ | 858 | swpentry = zhdr->swpentry; /* here */ |
859 | zpool_unmap_handle(pool, handle); | ||
860 | tree = zswap_trees[swp_type(swpentry)]; | 859 | tree = zswap_trees[swp_type(swpentry)]; |
861 | offset = swp_offset(swpentry); | 860 | offset = swp_offset(swpentry); |
862 | 861 | ||
@@ -866,6 +865,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) | |||
866 | if (!entry) { | 865 | if (!entry) { |
867 | /* entry was invalidated */ | 866 | /* entry was invalidated */ |
868 | spin_unlock(&tree->lock); | 867 | spin_unlock(&tree->lock); |
868 | zpool_unmap_handle(pool, handle); | ||
869 | return 0; | 869 | return 0; |
870 | } | 870 | } |
871 | spin_unlock(&tree->lock); | 871 | spin_unlock(&tree->lock); |
@@ -886,15 +886,13 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) | |||
886 | case ZSWAP_SWAPCACHE_NEW: /* page is locked */ | 886 | case ZSWAP_SWAPCACHE_NEW: /* page is locked */ |
887 | /* decompress */ | 887 | /* decompress */ |
888 | dlen = PAGE_SIZE; | 888 | dlen = PAGE_SIZE; |
889 | src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, | 889 | src = (u8 *)zhdr + sizeof(struct zswap_header); |
890 | ZPOOL_MM_RO) + sizeof(struct zswap_header); | ||
891 | dst = kmap_atomic(page); | 890 | dst = kmap_atomic(page); |
892 | tfm = *get_cpu_ptr(entry->pool->tfm); | 891 | tfm = *get_cpu_ptr(entry->pool->tfm); |
893 | ret = crypto_comp_decompress(tfm, src, entry->length, | 892 | ret = crypto_comp_decompress(tfm, src, entry->length, |
894 | dst, &dlen); | 893 | dst, &dlen); |
895 | put_cpu_ptr(entry->pool->tfm); | 894 | put_cpu_ptr(entry->pool->tfm); |
896 | kunmap_atomic(dst); | 895 | kunmap_atomic(dst); |
897 | zpool_unmap_handle(entry->pool->zpool, entry->handle); | ||
898 | BUG_ON(ret); | 896 | BUG_ON(ret); |
899 | BUG_ON(dlen != PAGE_SIZE); | 897 | BUG_ON(dlen != PAGE_SIZE); |
900 | 898 | ||
@@ -940,6 +938,7 @@ fail: | |||
940 | spin_unlock(&tree->lock); | 938 | spin_unlock(&tree->lock); |
941 | 939 | ||
942 | end: | 940 | end: |
941 | zpool_unmap_handle(pool, handle); | ||
943 | return ret; | 942 | return ret; |
944 | } | 943 | } |
945 | 944 | ||
@@ -997,6 +996,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
997 | char *buf; | 996 | char *buf; |
998 | u8 *src, *dst; | 997 | u8 *src, *dst; |
999 | struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; | 998 | struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; |
999 | gfp_t gfp; | ||
1000 | 1000 | ||
1001 | /* THP isn't supported */ | 1001 | /* THP isn't supported */ |
1002 | if (PageTransHuge(page)) { | 1002 | if (PageTransHuge(page)) { |
@@ -1070,9 +1070,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, | |||
1070 | 1070 | ||
1071 | /* store */ | 1071 | /* store */ |
1072 | hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0; | 1072 | hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0; |
1073 | ret = zpool_malloc(entry->pool->zpool, hlen + dlen, | 1073 | gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; |
1074 | __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, | 1074 | if (zpool_malloc_support_movable(entry->pool->zpool)) |
1075 | &handle); | 1075 | gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; |
1076 | ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle); | ||
1076 | if (ret == -ENOSPC) { | 1077 | if (ret == -ENOSPC) { |
1077 | zswap_reject_compress_poor++; | 1078 | zswap_reject_compress_poor++; |
1078 | goto put_dstmem; | 1079 | goto put_dstmem; |
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 947b8ff0227e..bba3104f128f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c | |||
@@ -206,14 +206,7 @@ static int xdp_umem_map_pages(struct xdp_umem *umem) | |||
206 | 206 | ||
207 | static void xdp_umem_unpin_pages(struct xdp_umem *umem) | 207 | static void xdp_umem_unpin_pages(struct xdp_umem *umem) |
208 | { | 208 | { |
209 | unsigned int i; | 209 | put_user_pages_dirty_lock(umem->pgs, umem->npgs, true); |
210 | |||
211 | for (i = 0; i < umem->npgs; i++) { | ||
212 | struct page *page = umem->pgs[i]; | ||
213 | |||
214 | set_page_dirty_lock(page); | ||
215 | put_page(page); | ||
216 | } | ||
217 | 210 | ||
218 | kfree(umem->pgs); | 211 | kfree(umem->pgs); |
219 | umem->pgs = NULL; | 212 | umem->pgs = NULL; |
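put_user_pages_dirty_lock() folds the old per-page set_page_dirty_lock() plus put_page() loop into one call over the whole pinned array. The intended pairing with the pinning side looks roughly like this (sketch with error handling trimmed; the wrapper name is made up):

#include <linux/mm.h>

static int demo_pin_use_release(unsigned long address, int nr,
				struct page **pgs)
{
	int npgs;

	npgs = get_user_pages_fast(address, nr, FOLL_WRITE, pgs);
	if (npgs <= 0)
		return npgs;

	/* ... read/write through the pinned pages ... */

	/* one call releases them all, marking each dirty under the page lock */
	put_user_pages_dirty_lock(pgs, npgs, true);
	return npgs;
}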
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c2f1af3b6a7c..fa8fbb8fa3c8 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c | |||
@@ -977,7 +977,7 @@ static int xsk_mmap(struct file *file, struct socket *sock, | |||
977 | /* Matches the smp_wmb() in xsk_init_queue */ | 977 | /* Matches the smp_wmb() in xsk_init_queue */ |
978 | smp_rmb(); | 978 | smp_rmb(); |
979 | qpg = virt_to_head_page(q->ring); | 979 | qpg = virt_to_head_page(q->ring); |
980 | if (size > (PAGE_SIZE << compound_order(qpg))) | 980 | if (size > page_size(qpg)) |
981 | return -EINVAL; | 981 | return -EINVAL; |
982 | 982 | ||
983 | pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; | 983 | pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; |
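page_size() is another helper introduced in this series; it packages the "PAGE_SIZE << compound_order(page)" expression that xsk_mmap() and others used to open-code. Equivalence sketch only (the wrapper name is illustrative):

#include <linux/mm.h>

/* the old open-coded form; page_size(page) returns the same value */
static inline unsigned long demo_page_size(struct page *page)
{
	return PAGE_SIZE << compound_order(page);
}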
diff --git a/usr/Makefile b/usr/Makefile index 6a89eb019275..e6f7cb2f81db 100644 --- a/usr/Makefile +++ b/usr/Makefile | |||
@@ -11,6 +11,9 @@ datafile_y = initramfs_data.cpio$(suffix_y) | |||
11 | datafile_d_y = .$(datafile_y).d | 11 | datafile_d_y = .$(datafile_y).d |
12 | AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/$(datafile_y)" | 12 | AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/$(datafile_y)" |
13 | 13 | ||
14 | # clean rules do not have CONFIG_INITRAMFS_COMPRESSION. So clean up after all | ||
15 | # possible compression formats. | ||
16 | clean-files += initramfs_data.cpio* | ||
14 | 17 | ||
15 | # Generate builtin.o based on initramfs_data.o | 18 | # Generate builtin.o based on initramfs_data.o |
16 | obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data.o | 19 | obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data.o |