author		Linus Torvalds <torvalds@linux-foundation.org>	2019-09-24 19:10:23 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2019-09-24 19:10:23 -0400
commit		9c9fa97a8edbc3668dfc7a25de516e80c146e86f (patch)
tree		2dc0e90203796a4b346ce190f9521c3294104058
parent		5184d449600f501a8688069f35c138c6b3bf8b94 (diff)
parent		2b38d01b4de8b1bbda7f5f7e91252609557635fc (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
 - a few hot fixes
 - ocfs2 updates
 - almost all of -mm (slab-generic, slab, slub, kmemleak, kasan, cleanups,
   debug, pagecache, memcg, gup, pagemap, memory-hotplug, sparsemem,
   vmalloc, initialization, z3fold, compaction, mempolicy, oom-kill,
   hugetlb, migration, thp, mmap, madvise, shmem, zswap, zsmalloc)

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (132 commits)
  mm/zsmalloc.c: fix a -Wunused-function warning
  zswap: do not map same object twice
  zswap: use movable memory if zpool support allocate movable memory
  zpool: add malloc_support_movable to zpool_driver
  shmem: fix obsolete comment in shmem_getpage_gfp()
  mm/madvise: reduce code duplication in error handling paths
  mm: mmap: increase sockets maximum memory size pgoff for 32bits
  mm/mmap.c: refine find_vma_prev() with rb_last()
  riscv: make mmap allocation top-down by default
  mips: use generic mmap top-down layout and brk randomization
  mips: replace arch specific way to determine 32bit task with generic version
  mips: adjust brk randomization offset to fit generic version
  mips: use STACK_TOP when computing mmap base address
  mips: properly account for stack randomization and stack guard gap
  arm: use generic mmap top-down layout and brk randomization
  arm: use STACK_TOP when computing mmap base address
  arm: properly account for stack randomization and stack guard gap
  arm64, mm: make randomization selected by generic topdown mmap layout
  arm64, mm: move generic mmap layout functions to mm
  arm64: consider stack randomization for mmap base only when necessary
  ...
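A large part of this merge replaces each architecture's private copy of the top-down mmap layout code (the arm, arm64 and mips mm/mmap.c deletions further below) with one generic implementation selected via ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT. As a rough, hedged illustration only — not the kernel code itself — the following standalone C sketch mirrors the gap-clamping arithmetic of the removed arm mmap_base() shown in this diff; TASK_SIZE, the page size and the example rlimit value are illustrative stand-ins, not the kernel's definitions.

/* Sketch of the per-arch mmap_base() arithmetic this merge consolidates. */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define TASK_SIZE	0xbf000000UL		/* illustrative 32-bit split */
#define MIN_GAP		(128 * 1024 * 1024UL)	/* gap between mmap and stack */
#define MAX_GAP		((TASK_SIZE) / 6 * 5)

/* Mirrors the removed arm mmap_base(): clamp the stack gap, then subtract. */
static unsigned long mmap_base(unsigned long rnd, unsigned long rlim_stack_cur)
{
	unsigned long gap = rlim_stack_cur;

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
}

int main(void)
{
	unsigned long rlim = 8 * 1024 * 1024UL;	/* a typical 8 MiB stack limit */

	printf("top-down mmap base: %#lx\n", mmap_base(0, rlim));
	return 0;
}

Compiled with any C compiler, it prints the page-aligned base that the removed per-arch code would have picked for the given (hypothetical) stack limit.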
-rw-r--r--	Documentation/ABI/testing/sysfs-kernel-slab | 13
-rw-r--r--	Documentation/admin-guide/cgroup-v1/memory.rst | 4
-rw-r--r--	Documentation/admin-guide/kernel-parameters.txt | 2
-rw-r--r--	arch/Kconfig | 11
-rw-r--r--	arch/alpha/include/asm/pgalloc.h | 2
-rw-r--r--	arch/alpha/include/asm/pgtable.h | 5
-rw-r--r--	arch/arc/include/asm/pgalloc.h | 1
-rw-r--r--	arch/arc/include/asm/pgtable.h | 5
-rw-r--r--	arch/arm/Kconfig | 1
-rw-r--r--	arch/arm/include/asm/pgalloc.h | 2
-rw-r--r--	arch/arm/include/asm/pgtable-nommu.h | 5
-rw-r--r--	arch/arm/include/asm/pgtable.h | 2
-rw-r--r--	arch/arm/include/asm/processor.h | 2
-rw-r--r--	arch/arm/kernel/process.c | 5
-rw-r--r--	arch/arm/mm/flush.c | 7
-rw-r--r--	arch/arm/mm/mmap.c | 52
-rw-r--r--	arch/arm64/Kconfig | 2
-rw-r--r--	arch/arm64/include/asm/pgalloc.h | 2
-rw-r--r--	arch/arm64/include/asm/pgtable.h | 2
-rw-r--r--	arch/arm64/include/asm/processor.h | 2
-rw-r--r--	arch/arm64/kernel/process.c | 8
-rw-r--r--	arch/arm64/mm/flush.c | 3
-rw-r--r--	arch/arm64/mm/mmap.c | 72
-rw-r--r--	arch/arm64/mm/pgd.c | 2
-rw-r--r--	arch/c6x/include/asm/pgtable.h | 5
-rw-r--r--	arch/csky/include/asm/pgalloc.h | 2
-rw-r--r--	arch/csky/include/asm/pgtable.h | 5
-rw-r--r--	arch/h8300/include/asm/pgtable.h | 6
-rw-r--r--	arch/hexagon/include/asm/pgalloc.h | 2
-rw-r--r--	arch/hexagon/include/asm/pgtable.h | 3
-rw-r--r--	arch/hexagon/mm/Makefile | 2
-rw-r--r--	arch/hexagon/mm/pgalloc.c | 10
-rw-r--r--	arch/ia64/Kconfig | 4
-rw-r--r--	arch/ia64/include/asm/pgalloc.h | 52
-rw-r--r--	arch/ia64/include/asm/pgtable.h | 5
-rw-r--r--	arch/ia64/mm/init.c | 2
-rw-r--r--	arch/m68k/include/asm/pgtable_mm.h | 7
-rw-r--r--	arch/m68k/include/asm/pgtable_no.h | 7
-rw-r--r--	arch/microblaze/include/asm/pgalloc.h | 122
-rw-r--r--	arch/microblaze/include/asm/pgtable.h | 7
-rw-r--r--	arch/microblaze/mm/pgtable.c | 4
-rw-r--r--	arch/mips/Kconfig | 2
-rw-r--r--	arch/mips/include/asm/pgalloc.h | 2
-rw-r--r--	arch/mips/include/asm/pgtable.h | 5
-rw-r--r--	arch/mips/include/asm/processor.h | 5
-rw-r--r--	arch/mips/mm/mmap.c | 84
-rw-r--r--	arch/nds32/include/asm/pgalloc.h | 2
-rw-r--r--	arch/nds32/include/asm/pgtable.h | 2
-rw-r--r--	arch/nios2/include/asm/pgalloc.h | 2
-rw-r--r--	arch/nios2/include/asm/pgtable.h | 2
-rw-r--r--	arch/openrisc/include/asm/pgalloc.h | 2
-rw-r--r--	arch/openrisc/include/asm/pgtable.h | 5
-rw-r--r--	arch/parisc/include/asm/pgalloc.h | 2
-rw-r--r--	arch/parisc/include/asm/pgtable.h | 2
-rw-r--r--	arch/powerpc/include/asm/pgalloc.h | 2
-rw-r--r--	arch/powerpc/include/asm/pgtable.h | 1
-rw-r--r--	arch/powerpc/mm/book3s64/hash_utils.c | 2
-rw-r--r--	arch/powerpc/mm/book3s64/iommu_api.c | 7
-rw-r--r--	arch/powerpc/mm/hugetlbpage.c | 2
-rw-r--r--	arch/riscv/Kconfig | 12
-rw-r--r--	arch/riscv/include/asm/pgalloc.h | 4
-rw-r--r--	arch/riscv/include/asm/pgtable.h | 5
-rw-r--r--	arch/s390/include/asm/pgtable.h | 6
-rw-r--r--	arch/sh/include/asm/pgalloc.h | 44
-rw-r--r--	arch/sh/include/asm/pgtable.h | 5
-rw-r--r--	arch/sh/mm/Kconfig | 3
-rw-r--r--	arch/sh/mm/nommu.c | 4
-rw-r--r--	arch/sparc/include/asm/pgalloc_32.h | 2
-rw-r--r--	arch/sparc/include/asm/pgalloc_64.h | 2
-rw-r--r--	arch/sparc/include/asm/pgtable_32.h | 5
-rw-r--r--	arch/sparc/include/asm/pgtable_64.h | 1
-rw-r--r--	arch/sparc/mm/init_32.c | 1
-rw-r--r--	arch/um/include/asm/pgalloc.h | 2
-rw-r--r--	arch/um/include/asm/pgtable.h | 2
-rw-r--r--	arch/unicore32/include/asm/pgalloc.h | 2
-rw-r--r--	arch/unicore32/include/asm/pgtable.h | 2
-rw-r--r--	arch/x86/include/asm/pgtable_32.h | 2
-rw-r--r--	arch/x86/include/asm/pgtable_64.h | 3
-rw-r--r--	arch/x86/mm/pgtable.c | 6
-rw-r--r--	arch/xtensa/include/asm/pgtable.h | 1
-rw-r--r--	arch/xtensa/include/asm/tlbflush.h | 3
-rw-r--r--	drivers/base/memory.c | 44
-rw-r--r--	drivers/base/node.c | 55
-rw-r--r--	drivers/crypto/chelsio/chtls/chtls_io.c | 5
-rw-r--r--	drivers/gpu/drm/via/via_dmablit.c | 10
-rw-r--r--	drivers/infiniband/core/umem.c | 5
-rw-r--r--	drivers/infiniband/hw/hfi1/user_pages.c | 5
-rw-r--r--	drivers/infiniband/hw/qib/qib_user_pages.c | 5
-rw-r--r--	drivers/infiniband/hw/usnic/usnic_uiom.c | 5
-rw-r--r--	drivers/infiniband/sw/siw/siw_mem.c | 10
-rw-r--r--	drivers/staging/android/ion/ion_system_heap.c | 4
-rw-r--r--	drivers/target/tcm_fc/tfc_io.c | 3
-rw-r--r--	drivers/vfio/vfio_iommu_spapr_tce.c | 8
-rw-r--r--	fs/binfmt_elf.c | 20
-rw-r--r--	fs/fat/dir.c | 13
-rw-r--r--	fs/fat/fatent.c | 3
-rw-r--r--	fs/inode.c | 3
-rw-r--r--	fs/io_uring.c | 2
-rw-r--r--	fs/jbd2/journal.c | 2
-rw-r--r--	fs/jbd2/transaction.c | 12
-rw-r--r--	fs/ocfs2/alloc.c | 20
-rw-r--r--	fs/ocfs2/aops.c | 13
-rw-r--r--	fs/ocfs2/blockcheck.c | 26
-rw-r--r--	fs/ocfs2/cluster/heartbeat.c | 103
-rw-r--r--	fs/ocfs2/dir.c | 3
-rw-r--r--	fs/ocfs2/dlm/dlmcommon.h | 1
-rw-r--r--	fs/ocfs2/dlm/dlmdebug.c | 55
-rw-r--r--	fs/ocfs2/dlm/dlmdebug.h | 16
-rw-r--r--	fs/ocfs2/dlm/dlmdomain.c | 7
-rw-r--r--	fs/ocfs2/dlm/dlmunlock.c | 23
-rw-r--r--	fs/ocfs2/dlmglue.c | 27
-rw-r--r--	fs/ocfs2/extent_map.c | 3
-rw-r--r--	fs/ocfs2/file.c | 13
-rw-r--r--	fs/ocfs2/inode.c | 2
-rw-r--r--	fs/ocfs2/journal.h | 42
-rw-r--r--	fs/ocfs2/namei.c | 2
-rw-r--r--	fs/ocfs2/ocfs2.h | 3
-rw-r--r--	fs/ocfs2/super.c | 10
-rw-r--r--	fs/open.c | 8
-rw-r--r--	fs/proc/meminfo.c | 8
-rw-r--r--	fs/proc/task_mmu.c | 6
-rw-r--r--	include/asm-generic/pgalloc.h | 5
-rw-r--r--	include/asm-generic/pgtable.h | 7
-rw-r--r--	include/linux/compaction.h | 22
-rw-r--r--	include/linux/fs.h | 32
-rw-r--r--	include/linux/huge_mm.h | 9
-rw-r--r--	include/linux/hugetlb.h | 2
-rw-r--r--	include/linux/jbd2.h | 2
-rw-r--r--	include/linux/khugepaged.h | 12
-rw-r--r--	include/linux/memcontrol.h | 23
-rw-r--r--	include/linux/memory.h | 7
-rw-r--r--	include/linux/mm.h | 37
-rw-r--r--	include/linux/mm_types.h | 1
-rw-r--r--	include/linux/mmzone.h | 14
-rw-r--r--	include/linux/page_ext.h | 1
-rw-r--r--	include/linux/pagemap.h | 10
-rw-r--r--	include/linux/quicklist.h | 94
-rw-r--r--	include/linux/shrinker.h | 7
-rw-r--r--	include/linux/slab.h | 62
-rw-r--r--	include/linux/vmalloc.h | 20
-rw-r--r--	include/linux/zpool.h | 3
-rw-r--r--	init/main.c | 6
-rw-r--r--	kernel/events/uprobes.c | 81
-rw-r--r--	kernel/resource.c | 4
-rw-r--r--	kernel/sched/idle.c | 1
-rw-r--r--	kernel/sysctl.c | 6
-rw-r--r--	lib/Kconfig.debug | 15
-rw-r--r--	lib/Kconfig.kasan | 8
-rw-r--r--	lib/iov_iter.c | 2
-rw-r--r--	lib/show_mem.c | 5
-rw-r--r--	lib/test_kasan.c | 41
-rw-r--r--	mm/Kconfig | 16
-rw-r--r--	mm/Kconfig.debug | 4
-rw-r--r--	mm/Makefile | 4
-rw-r--r--	mm/compaction.c | 50
-rw-r--r--	mm/filemap.c | 168
-rw-r--r--	mm/gup.c | 125
-rw-r--r--	mm/huge_memory.c | 123
-rw-r--r--	mm/hugetlb.c | 89
-rw-r--r--	mm/hugetlb_cgroup.c | 2
-rw-r--r--	mm/init-mm.c | 2
-rw-r--r--	mm/kasan/common.c | 32
-rw-r--r--	mm/kasan/kasan.h | 14
-rw-r--r--	mm/kasan/report.c | 44
-rw-r--r--	mm/kasan/tags_report.c | 24
-rw-r--r--	mm/khugepaged.c | 366
-rw-r--r--	mm/kmemleak.c | 326
-rw-r--r--	mm/ksm.c | 18
-rw-r--r--	mm/madvise.c | 52
-rw-r--r--	mm/memcontrol.c | 188
-rw-r--r--	mm/memfd.c | 2
-rw-r--r--	mm/memory.c | 13
-rw-r--r--	mm/memory_hotplug.c | 103
-rw-r--r--	mm/mempolicy.c | 4
-rw-r--r--	mm/migrate.c | 13
-rw-r--r--	mm/mmap.c | 12
-rw-r--r--	mm/mmu_gather.c | 2
-rw-r--r--	mm/nommu.c | 2
-rw-r--r--	mm/oom_kill.c | 24
-rw-r--r--	mm/page_alloc.c | 27
-rw-r--r--	mm/page_owner.c | 123
-rw-r--r--	mm/page_poison.c | 2
-rw-r--r--	mm/page_vma_mapped.c | 3
-rw-r--r--	mm/quicklist.c | 103
-rw-r--r--	mm/rmap.c | 25
-rw-r--r--	mm/shmem.c | 12
-rw-r--r--	mm/slab.h | 64
-rw-r--r--	mm/slab_common.c | 37
-rw-r--r--	mm/slob.c | 2
-rw-r--r--	mm/slub.c | 22
-rw-r--r--	mm/sparse.c | 25
-rw-r--r--	mm/swap.c | 16
-rw-r--r--	mm/swap_state.c | 6
-rw-r--r--	mm/util.c | 122
-rw-r--r--	mm/vmalloc.c | 84
-rw-r--r--	mm/vmscan.c | 149
-rw-r--r--	mm/vmstat.c | 2
-rw-r--r--	mm/z3fold.c | 154
-rw-r--r--	mm/zpool.c | 16
-rw-r--r--	mm/zsmalloc.c | 23
-rw-r--r--	mm/zswap.c | 15
-rw-r--r--	net/xdp/xdp_umem.c | 9
-rw-r--r--	net/xdp/xsk.c | 2
-rw-r--r--	usr/Makefile | 3
204 files changed, 2273 insertions, 2444 deletions
diff --git a/Documentation/ABI/testing/sysfs-kernel-slab b/Documentation/ABI/testing/sysfs-kernel-slab
index 29601d93a1c2..ed35833ad7f0 100644
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -429,10 +429,15 @@ KernelVersion: 2.6.22
 Contact:	Pekka Enberg <penberg@cs.helsinki.fi>,
 		Christoph Lameter <cl@linux-foundation.org>
 Description:
-		The shrink file is written when memory should be reclaimed from
-		a cache.  Empty partial slabs are freed and the partial list is
-		sorted so the slabs with the fewest available objects are used
-		first.
+		The shrink file is used to reclaim unused slab cache
+		memory from a cache.  Empty per-cpu or partial slabs
+		are freed and the partial list is sorted so the slabs
+		with the fewest available objects are used first.
+		It only accepts a value of "1" on write for shrinking
+		the cache. Other input values are considered invalid.
+		Shrinking slab caches might be expensive and can
+		adversely impact other running applications. So it
+		should be used with care.
 
 What:		/sys/kernel/slab/cache/slab_size
 Date:		May 2007
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index 41bdc038dad9..0ae4f564c2d6 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -85,8 +85,10 @@ Brief summary of control files.
 memory.oom_control		     set/show oom controls.
 memory.numa_stat		     show the number of memory usage per numa
				     node
-
 memory.kmem.limit_in_bytes          set/show hard limit for kernel memory
+                                     This knob is deprecated and shouldn't be
+                                     used. It is planned that this be removed in
+                                     the foreseeable future.
 memory.kmem.usage_in_bytes          show current kernel memory allocation
 memory.kmem.failcnt                 show the number of kernel memory usage
				     hits limits
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 254d8a369f32..944e03e29f65 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -809,6 +809,8 @@
 			enables the feature at boot time. By default, it is
 			disabled and the system will work mostly the same as a
 			kernel built without CONFIG_DEBUG_PAGEALLOC.
+			Note: to get most of debug_pagealloc error reports, it's
+			useful to also enable the page_owner functionality.
 			on: enable the feature
 
 	debugpat	[X86] Enable PAT debugging
diff --git a/arch/Kconfig b/arch/Kconfig
index 0fcf8ec1e098..5f8a5d84dbbe 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -706,6 +706,17 @@ config HAVE_ARCH_COMPAT_MMAP_BASES
 	  and vice-versa 32-bit applications to call 64-bit mmap().
 	  Required for applications doing different bitness syscalls.
 
+# This allows to use a set of generic functions to determine mmap base
+# address by giving priority to top-down scheme only if the process
+# is not in legacy mode (compat task, unlimited stack size or
+# sysctl_legacy_va_layout).
+# Architecture that selects this option can provide its own version of:
+# - STACK_RND_MASK
+config ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+	bool
+	depends on MMU
+	select ARCH_HAS_ELF_RANDOMIZE
+
 config HAVE_COPY_THREAD_TLS
 	bool
 	help
diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h
index 71ded3b7d82d..eb91f1e85629 100644
--- a/arch/alpha/include/asm/pgalloc.h
+++ b/arch/alpha/include/asm/pgalloc.h
@@ -53,6 +53,4 @@ pmd_free(struct mm_struct *mm, pmd_t *pmd)
 	free_page((unsigned long)pmd);
 }
 
-#define check_pgt_cache()	do { } while (0)
-
 #endif /* _ALPHA_PGALLOC_H */
diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index 89c2032f9960..065b57f408c3 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -359,11 +359,6 @@ extern void paging_init(void);
 
 #include <asm-generic/pgtable.h>
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 /* We have our own get_unmapped_area to cope with ADDR_LIMIT_32BIT. */
 #define HAVE_ARCH_UNMAPPED_AREA
 
diff --git a/arch/arc/include/asm/pgalloc.h b/arch/arc/include/asm/pgalloc.h
index 9bdb8ed5b0db..4751f2251cd9 100644
--- a/arch/arc/include/asm/pgalloc.h
+++ b/arch/arc/include/asm/pgalloc.h
@@ -129,7 +129,6 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptep)
 
 #define __pte_free_tlb(tlb, pte, addr)  pte_free((tlb)->mm, pte)
 
-#define check_pgt_cache()   do { } while (0)
 #define pmd_pgtable(pmd)	((pgtable_t) pmd_page_vaddr(pmd))
 
 #endif /* _ASM_ARC_PGALLOC_H */
diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h
index 1d87c18a2976..7addd0301c51 100644
--- a/arch/arc/include/asm/pgtable.h
+++ b/arch/arc/include/asm/pgtable.h
@@ -395,11 +395,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 /* to cope with aliasing VIPT cache */
 #define HAVE_ARCH_UNMAPPED_AREA
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()   do { } while (0)
-
 #endif /* __ASSEMBLY__ */
 
 #endif
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 229f2cdd81ca..8a50efb559f3 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -34,6 +34,7 @@ config ARM
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
+	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BINFMT_FLAT_ARGVP_ENVP_ON_STACK
 	select BUILDTIME_EXTABLE_SORT if MMU
diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h
index a2a68b751971..069da393110c 100644
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -15,8 +15,6 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-#define check_pgt_cache()		do { } while (0)
-
 #ifdef CONFIG_MMU
 
 #define _PAGE_USER_TABLE	(PMD_TYPE_TABLE | PMD_BIT4 | PMD_DOMAIN(DOMAIN_USER))
diff --git a/arch/arm/include/asm/pgtable-nommu.h b/arch/arm/include/asm/pgtable-nommu.h
index d0de24f06724..010fa1a35a68 100644
--- a/arch/arm/include/asm/pgtable-nommu.h
+++ b/arch/arm/include/asm/pgtable-nommu.h
@@ -71,11 +71,6 @@ typedef pte_t *pte_addr_t;
 extern unsigned int kobjsize(const void *objp);
 
 /*
- * No page table caches to initialise.
- */
-#define pgtable_cache_init()	do { } while (0)
-
-/*
  * All 32bit addresses are effectively valid for vmalloc...
  * Sort of meaningless for non-VM targets.
  */
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index f2e990dc27e7..3ae120cd1715 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -368,8 +368,6 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 
-#define pgtable_cache_init() do { } while (0)
-
 #endif /* !__ASSEMBLY__ */
 
 #endif /* CONFIG_MMU */
diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h
index 20c2f42454b8..614bf829e454 100644
--- a/arch/arm/include/asm/processor.h
+++ b/arch/arm/include/asm/processor.h
@@ -140,8 +140,6 @@ static inline void prefetchw(const void *ptr)
 #endif
 #endif
 
-#define HAVE_ARCH_PICK_MMAP_LAYOUT
-
 #endif
 
 #endif /* __ASM_ARM_PROCESSOR_H */
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index f934a6739fc0..9485acc520a4 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -319,11 +319,6 @@ unsigned long get_wchan(struct task_struct *p)
 	return 0;
 }
 
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	return randomize_page(mm->brk, 0x02000000);
-}
-
 #ifdef CONFIG_MMU
 #ifdef CONFIG_KUSER_HELPERS
 /*
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 6ecbda87ee46..6d89db7895d1 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -204,18 +204,17 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page)
 	 * coherent with the kernels mapping.
 	 */
 	if (!PageHighMem(page)) {
-		size_t page_size = PAGE_SIZE << compound_order(page);
-		__cpuc_flush_dcache_area(page_address(page), page_size);
+		__cpuc_flush_dcache_area(page_address(page), page_size(page));
 	} else {
 		unsigned long i;
 		if (cache_is_vipt_nonaliasing()) {
-			for (i = 0; i < (1 << compound_order(page)); i++) {
+			for (i = 0; i < compound_nr(page); i++) {
 				void *addr = kmap_atomic(page + i);
 				__cpuc_flush_dcache_area(addr, PAGE_SIZE);
 				kunmap_atomic(addr);
 			}
 		} else {
-			for (i = 0; i < (1 << compound_order(page)); i++) {
+			for (i = 0; i < compound_nr(page); i++) {
 				void *addr = kmap_high_get(page + i);
 				if (addr) {
 					__cpuc_flush_dcache_area(addr, PAGE_SIZE);
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index f866870db749..b8d912ac9e61 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -17,33 +17,6 @@
 	((((addr)+SHMLBA-1)&~(SHMLBA-1)) +	\
 	 (((pgoff)<<PAGE_SHIFT) & (SHMLBA-1)))
 
-/* gap between mmap and stack */
-#define MIN_GAP (128*1024*1024UL)
-#define MAX_GAP ((TASK_SIZE)/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
-	if (current->personality & ADDR_COMPAT_LAYOUT)
-		return 1;
-
-	if (rlim_stack->rlim_cur == RLIM_INFINITY)
-		return 1;
-
-	return sysctl_legacy_va_layout;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
-	unsigned long gap = rlim_stack->rlim_cur;
-
-	if (gap < MIN_GAP)
-		gap = MIN_GAP;
-	else if (gap > MAX_GAP)
-		gap = MAX_GAP;
-
-	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
-}
-
 /*
  * We need to ensure that shared mappings are correctly aligned to
  * avoid aliasing issues with VIPT caches.  We need to ensure that
@@ -171,31 +144,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	return addr;
 }
 
-unsigned long arch_mmap_rnd(void)
-{
-	unsigned long rnd;
-
-	rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-
-	return rnd << PAGE_SHIFT;
-}
-
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
-	unsigned long random_factor = 0UL;
-
-	if (current->flags & PF_RANDOMIZE)
-		random_factor = arch_mmap_rnd();
-
-	if (mmap_is_legacy(rlim_stack)) {
-		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
-		mm->get_unmapped_area = arch_get_unmapped_area;
-	} else {
-		mm->mmap_base = mmap_base(random_factor, rlim_stack);
-		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-	}
-}
-
 /*
  * You really shouldn't be using read() or write() on /dev/mem.  This
  * might go away in the future.
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 37c610963eee..866e05882799 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -15,7 +15,6 @@ config ARM64
 	select ARCH_HAS_DMA_COHERENT_TO_PFN
 	select ARCH_HAS_DMA_PREP_COHERENT
 	select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
-	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
@@ -71,6 +70,7 @@ config ARM64
 	select ARCH_SUPPORTS_INT128 if GCC_VERSION >= 50000 || CC_IS_CLANG
 	select ARCH_SUPPORTS_NUMA_BALANCING
 	select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
+	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
 	select ARCH_WANT_FRAME_POINTERS
 	select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h
index 14d0bc44d451..172d76fa0245 100644
--- a/arch/arm64/include/asm/pgalloc.h
+++ b/arch/arm64/include/asm/pgalloc.h
@@ -15,8 +15,6 @@
 
 #include <asm-generic/pgalloc.h>	/* for pte_{alloc,free}_one */
 
-#define check_pgt_cache()		do { } while (0)
-
 #define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
 
 #if CONFIG_PGTABLE_LEVELS > 2
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 57427d17580e..7576df00eb50 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -861,8 +861,6 @@ extern int kern_addr_valid(unsigned long addr);
 
 #include <asm-generic/pgtable.h>
 
-static inline void pgtable_cache_init(void) { }
-
 /*
  * On AArch64, the cache coherency is handled via the set_pte_at() function.
  */
diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h
index c67848c55009..5623685c7d13 100644
--- a/arch/arm64/include/asm/processor.h
+++ b/arch/arm64/include/asm/processor.h
@@ -280,8 +280,6 @@ static inline void spin_lock_prefetch(const void *ptr)
 		     "nop") : : "p" (ptr));
 }
 
-#define HAVE_ARCH_PICK_MMAP_LAYOUT
-
 extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */
 extern void __init minsigstksz_setup(void);
 
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 03689c0beb34..a47462def04b 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -557,14 +557,6 @@ unsigned long arch_align_stack(unsigned long sp)
 	return sp & ~0xf;
 }
 
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	if (is_compat_task())
-		return randomize_page(mm->brk, SZ_32M);
-	else
-		return randomize_page(mm->brk, SZ_1G);
-}
-
 /*
  * Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY.
  */
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index dc19300309d2..ac485163a4a7 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -56,8 +56,7 @@ void __sync_icache_dcache(pte_t pte)
 	struct page *page = pte_page(pte);
 
 	if (!test_and_set_bit(PG_dcache_clean, &page->flags))
-		sync_icache_aliases(page_address(page),
-				    PAGE_SIZE << compound_order(page));
+		sync_icache_aliases(page_address(page), page_size(page));
 }
 EXPORT_SYMBOL_GPL(__sync_icache_dcache);
 
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index b050641b5139..3028bacbc4e9 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -21,78 +21,6 @@
 #include <asm/cputype.h>
 
 /*
- * Leave enough space between the mmap area and the stack to honour ulimit in
- * the face of randomisation.
- */
-#define MIN_GAP (SZ_128M)
-#define MAX_GAP (STACK_TOP/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
-	if (current->personality & ADDR_COMPAT_LAYOUT)
-		return 1;
-
-	if (rlim_stack->rlim_cur == RLIM_INFINITY)
-		return 1;
-
-	return sysctl_legacy_va_layout;
-}
-
-unsigned long arch_mmap_rnd(void)
-{
-	unsigned long rnd;
-
-#ifdef CONFIG_COMPAT
-	if (test_thread_flag(TIF_32BIT))
-		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
-	else
-#endif
-		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-	return rnd << PAGE_SHIFT;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
-	unsigned long gap = rlim_stack->rlim_cur;
-	unsigned long pad = (STACK_RND_MASK << PAGE_SHIFT) + stack_guard_gap;
-
-	/* Values close to RLIM_INFINITY can overflow. */
-	if (gap + pad > gap)
-		gap += pad;
-
-	if (gap < MIN_GAP)
-		gap = MIN_GAP;
-	else if (gap > MAX_GAP)
-		gap = MAX_GAP;
-
-	return PAGE_ALIGN(STACK_TOP - gap - rnd);
-}
-
-/*
- * This function, called very early during the creation of a new process VM
- * image, sets up which VM layout function to use:
- */
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
-	unsigned long random_factor = 0UL;
-
-	if (current->flags & PF_RANDOMIZE)
-		random_factor = arch_mmap_rnd();
-
-	/*
-	 * Fall back to the standard layout if the personality bit is set, or
-	 * if the expected stack growth is unlimited:
-	 */
-	if (mmap_is_legacy(rlim_stack)) {
-		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
-		mm->get_unmapped_area = arch_get_unmapped_area;
-	} else {
-		mm->mmap_base = mmap_base(random_factor, rlim_stack);
-		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-	}
-}
-
-/*
  * You really shouldn't be using read() or write() on /dev/mem.  This might go
  * away in the future.
  */
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index 7548f9ca1f11..4a64089e5771 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -35,7 +35,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 		kmem_cache_free(pgd_cache, pgd);
 }
 
-void __init pgd_cache_init(void)
+void __init pgtable_cache_init(void)
 {
 	if (PGD_SIZE == PAGE_SIZE)
 		return;
diff --git a/arch/c6x/include/asm/pgtable.h b/arch/c6x/include/asm/pgtable.h
index 0bd805964ea6..0b6919c00413 100644
--- a/arch/c6x/include/asm/pgtable.h
+++ b/arch/c6x/include/asm/pgtable.h
@@ -60,11 +60,6 @@ extern unsigned long empty_zero_page;
 #define swapper_pg_dir ((pgd_t *) 0)
 
 /*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
-/*
  * c6x is !MMU, so define the simpliest implementation
  */
 #define pgprot_writecombine pgprot_noncached
diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h
index 98c5716708d6..d089113fe41f 100644
--- a/arch/csky/include/asm/pgalloc.h
+++ b/arch/csky/include/asm/pgalloc.h
@@ -75,8 +75,6 @@ do { \
 	tlb_remove_page(tlb, pte);			\
 } while (0)
 
-#define check_pgt_cache()	do {} while (0)
-
 extern void pagetable_init(void);
 extern void pre_mmu_init(void);
 extern void pre_trap_init(void);
diff --git a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h
index c429a6f347de..0040b3a05b61 100644
--- a/arch/csky/include/asm/pgtable.h
+++ b/arch/csky/include/asm/pgtable.h
@@ -296,11 +296,6 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address,
 /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
 #define kern_addr_valid(addr)	(1)
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do {} while (0)
-
 #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
 	remap_pfn_range(vma, vaddr, pfn, size, prot)
 
diff --git a/arch/h8300/include/asm/pgtable.h b/arch/h8300/include/asm/pgtable.h
index a99caa49d265..4d00152fab58 100644
--- a/arch/h8300/include/asm/pgtable.h
+++ b/arch/h8300/include/asm/pgtable.h
@@ -4,7 +4,6 @@
 #define __ARCH_USE_5LEVEL_HACK
 #include <asm-generic/pgtable-nopud.h>
 #include <asm-generic/pgtable.h>
-#define pgtable_cache_init()   do { } while (0)
 extern void paging_init(void);
 #define PAGE_NONE		__pgprot(0)    /* these mean nothing to NO_MM */
 #define PAGE_SHARED		__pgprot(0)    /* these mean nothing to NO_MM */
@@ -35,11 +34,6 @@ extern unsigned int kobjsize(const void *objp);
 extern int is_in_rom(unsigned long);
 
 /*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()   do { } while (0)
-
-/*
  * All 32bit addresses are effectively valid for vmalloc...
  * Sort of meaningless for non-VM targets.
  */
diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h
index d6544dc71258..5a6e79e7926d 100644
--- a/arch/hexagon/include/asm/pgalloc.h
+++ b/arch/hexagon/include/asm/pgalloc.h
@@ -13,8 +13,6 @@
 
 #include <asm-generic/pgalloc.h>	/* for pte_{alloc,free}_one */
 
-#define check_pgt_cache() do {} while (0)
-
 extern unsigned long long kmap_generation;
 
 /*
diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h
index a3ff6d24c09e..2fec20ad939e 100644
--- a/arch/hexagon/include/asm/pgtable.h
+++ b/arch/hexagon/include/asm/pgtable.h
@@ -431,9 +431,6 @@ static inline int pte_exec(pte_t pte)
 
 #define __pte_offset(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 
-/* I think this is in case we have page table caches; needed by init/main.c */
-#define pgtable_cache_init()    do { } while (0)
-
 /*
  * Swap/file PTE definitions.  If _PAGE_PRESENT is zero, the rest of the PTE is
  * interpreted as swap information.  The remaining free bits are interpreted as
diff --git a/arch/hexagon/mm/Makefile b/arch/hexagon/mm/Makefile
index 1894263ae5bc..893838499591 100644
--- a/arch/hexagon/mm/Makefile
+++ b/arch/hexagon/mm/Makefile
@@ -3,5 +3,5 @@
 # Makefile for Hexagon memory management subsystem
 #
 
-obj-y := init.o pgalloc.o ioremap.o uaccess.o vm_fault.o cache.o
+obj-y := init.o ioremap.o uaccess.o vm_fault.o cache.o
 obj-y += copy_to_user.o copy_from_user.o strnlen_user.o vm_tlb.o
diff --git a/arch/hexagon/mm/pgalloc.c b/arch/hexagon/mm/pgalloc.c
deleted file mode 100644
index 4d4316140237..000000000000
--- a/arch/hexagon/mm/pgalloc.c
+++ /dev/null
@@ -1,10 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
- */
-
-#include <linux/init.h>
-
-void __init pgtable_cache_init(void)
-{
-}
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 685a3df126ca..16714477eef4 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -72,10 +72,6 @@ config 64BIT
 config ZONE_DMA32
 	def_bool y
 
-config QUICKLIST
-	bool
-	default y
-
 config MMU
 	bool
 	default y
diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h
index c9e481023c25..f4c491044882 100644
--- a/arch/ia64/include/asm/pgalloc.h
+++ b/arch/ia64/include/asm/pgalloc.h
@@ -19,18 +19,19 @@
 #include <linux/mm.h>
 #include <linux/page-flags.h>
 #include <linux/threads.h>
-#include <linux/quicklist.h>
+
+#include <asm-generic/pgalloc.h>
 
 #include <asm/mmu_context.h>
 
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	return quicklist_alloc(0, GFP_KERNEL, NULL);
+	return (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 }
 
 static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	quicklist_free(0, NULL, pgd);
+	free_page((unsigned long)pgd);
 }
 
 #if CONFIG_PGTABLE_LEVELS == 4
@@ -42,12 +43,12 @@ pgd_populate(struct mm_struct *mm, pgd_t * pgd_entry, pud_t * pud)
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return quicklist_alloc(0, GFP_KERNEL, NULL);
+	return (pud_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 }
 
 static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 {
-	quicklist_free(0, NULL, pud);
+	free_page((unsigned long)pud);
 }
 #define __pud_free_tlb(tlb, pud, address)	pud_free((tlb)->mm, pud)
 #endif /* CONFIG_PGTABLE_LEVELS == 4 */
@@ -60,12 +61,12 @@ pud_populate(struct mm_struct *mm, pud_t * pud_entry, pmd_t * pmd)
 
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return quicklist_alloc(0, GFP_KERNEL, NULL);
+	return (pmd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 }
 
 static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 {
-	quicklist_free(0, NULL, pmd);
+	free_page((unsigned long)pmd);
 }
 
 #define __pmd_free_tlb(tlb, pmd, address)	pmd_free((tlb)->mm, pmd)
@@ -83,43 +84,6 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t * pmd_entry, pte_t * pte)
 	pmd_val(*pmd_entry) = __pa(pte);
 }
 
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
-{
-	struct page *page;
-	void *pg;
-
-	pg = quicklist_alloc(0, GFP_KERNEL, NULL);
-	if (!pg)
-		return NULL;
-	page = virt_to_page(pg);
-	if (!pgtable_page_ctor(page)) {
-		quicklist_free(0, NULL, pg);
-		return NULL;
-	}
-	return page;
-}
-
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
-{
-	return quicklist_alloc(0, GFP_KERNEL, NULL);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
-	pgtable_page_dtor(pte);
-	quicklist_free_page(0, NULL, pte);
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-	quicklist_free(0, NULL, pte);
-}
-
-static inline void check_pgt_cache(void)
-{
-	quicklist_trim(0, NULL, 25, 16);
-}
-
 #define __pte_free_tlb(tlb, pte, address)	pte_free((tlb)->mm, pte)
 
 #endif /* _ASM_IA64_PGALLOC_H */
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index b1e7468eb65a..d602e7c622db 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -566,11 +566,6 @@ extern struct page *zero_page_memmap_ptr;
 #define KERNEL_TR_PAGE_SHIFT	_PAGE_SIZE_64M
 #define KERNEL_TR_PAGE_SIZE	(1 << KERNEL_TR_PAGE_SHIFT)
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 /* These tell get_user_pages() that the first gate page is accessible from user-level. */
 #define FIXADDR_USER_START	GATE_ADDR
 #ifdef HAVE_BUGGY_SEGREL
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 678b98a09c85..bf9df2625bc8 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -64,7 +64,7 @@ __ia64_sync_icache_dcache (pte_t pte)
 	if (test_bit(PG_arch_1, &page->flags))
 		return;				/* i-cache is already coherent with d-cache */
 
-	flush_icache_range(addr, addr + (PAGE_SIZE << compound_order(page)));
+	flush_icache_range(addr, addr + page_size(page));
 	set_bit(PG_arch_1, &page->flags);	/* mark page as clean */
 }
 
diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h
index fde4534b974f..646c174fff99 100644
--- a/arch/m68k/include/asm/pgtable_mm.h
+++ b/arch/m68k/include/asm/pgtable_mm.h
@@ -176,11 +176,4 @@ pgprot_t pgprot_dmacoherent(pgprot_t prot);
 #include <asm-generic/pgtable.h>
 #endif /* !__ASSEMBLY__ */
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
-#define check_pgt_cache()	do { } while (0)
-
 #endif /* _M68K_PGTABLE_H */
diff --git a/arch/m68k/include/asm/pgtable_no.h b/arch/m68k/include/asm/pgtable_no.h
index fc3a96c77bd8..c18165b0d904 100644
--- a/arch/m68k/include/asm/pgtable_no.h
+++ b/arch/m68k/include/asm/pgtable_no.h
@@ -45,11 +45,6 @@ extern void paging_init(void);
 #define ZERO_PAGE(vaddr)	(virt_to_page(0))
 
 /*
- * No page table caches to initialise.
- */
-#define pgtable_cache_init()	do { } while (0)
-
-/*
  * All 32bit addresses are effectively valid for vmalloc...
  * Sort of meaningless for non-VM targets.
  */
@@ -60,6 +55,4 @@ extern void paging_init(void);
 
 #include <asm-generic/pgtable.h>
 
-#define check_pgt_cache()	do { } while (0)
-
 #endif /* _M68KNOMMU_PGTABLE_H */
diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h
index f4cc9ffc449e..7ecb05baa601 100644
--- a/arch/microblaze/include/asm/pgalloc.h
+++ b/arch/microblaze/include/asm/pgalloc.h
@@ -21,83 +21,23 @@
 #include <asm/cache.h>
 #include <asm/pgtable.h>
 
-#define PGDIR_ORDER	0
-
-/*
- * This is handled very differently on MicroBlaze since out page tables
- * are all 0's and I want to be able to use these zero'd pages elsewhere
- * as well - it gives us quite a speedup.
- * -- Cort
- */
-extern struct pgtable_cache_struct {
-	unsigned long *pgd_cache;
-	unsigned long *pte_cache;
-	unsigned long pgtable_cache_sz;
-} quicklists;
-
-#define pgd_quicklist		(quicklists.pgd_cache)
-#define pmd_quicklist		((unsigned long *)0)
-#define pte_quicklist		(quicklists.pte_cache)
-#define pgtable_cache_size	(quicklists.pgtable_cache_sz)
-
-extern unsigned long *zero_cache; /* head linked list of pre-zero'd pages */
-extern atomic_t zero_sz; /* # currently pre-zero'd pages */
-extern atomic_t zeropage_hits; /* # zero'd pages request that we've done */
-extern atomic_t zeropage_calls; /* # zero'd pages request that've been made */
-extern atomic_t zerototal; /* # pages zero'd over time */
-
-#define zero_quicklist		(zero_cache)
-#define zero_cache_sz		(zero_sz)
-#define zero_cache_calls	(zeropage_calls)
-#define zero_cache_hits		(zeropage_hits)
-#define zero_cache_total	(zerototal)
-
-/*
- * return a pre-zero'd page from the list,
- * return NULL if none available -- Cort
- */
-extern unsigned long get_zero_page_fast(void);
+#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
+#include <asm-generic/pgalloc.h>
 
 extern void __bad_pte(pmd_t *pmd);
 
-static inline pgd_t *get_pgd_slow(void)
+static inline pgd_t *get_pgd(void)
 {
-	pgd_t *ret;
-
-	ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGDIR_ORDER);
-	if (ret != NULL)
-		clear_page(ret);
-	return ret;
+	return (pgd_t *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, 0);
 }
 
-static inline pgd_t *get_pgd_fast(void)
-{
-	unsigned long *ret;
-
-	ret = pgd_quicklist;
-	if (ret != NULL) {
-		pgd_quicklist = (unsigned long *)(*ret);
-		ret[0] = 0;
-		pgtable_cache_size--;
-	} else
-		ret = (unsigned long *)get_pgd_slow();
-	return (pgd_t *)ret;
-}
-
-static inline void free_pgd_fast(pgd_t *pgd)
-{
-	*(unsigned long **)pgd = pgd_quicklist;
-	pgd_quicklist = (unsigned long *) pgd;
-	pgtable_cache_size++;
-}
-
-static inline void free_pgd_slow(pgd_t *pgd)
+static inline void free_pgd(pgd_t *pgd)
 {
 	free_page((unsigned long)pgd);
 }
 
-#define pgd_free(mm, pgd)	free_pgd_fast(pgd)
-#define pgd_alloc(mm)		get_pgd_fast()
+#define pgd_free(mm, pgd)	free_pgd(pgd)
+#define pgd_alloc(mm)		get_pgd()
 
 #define pmd_pgtable(pmd)	pmd_page(pmd)
 
@@ -110,50 +50,6 @@ static inline void free_pgd_slow(pgd_t *pgd)
 
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm);
 
-static inline struct page *pte_alloc_one(struct mm_struct *mm)
-{
-	struct page *ptepage;
-
-#ifdef CONFIG_HIGHPTE
-	int flags = GFP_KERNEL | __GFP_HIGHMEM;
-#else
-	int flags = GFP_KERNEL;
-#endif
-
-	ptepage = alloc_pages(flags, 0);
-	if (!ptepage)
-		return NULL;
-	clear_highpage(ptepage);
-	if (!pgtable_page_ctor(ptepage)) {
-		__free_page(ptepage);
-		return NULL;
-	}
-	return ptepage;
-}
-
-static inline void pte_free_fast(pte_t *pte)
-{
-	*(unsigned long **)pte = pte_quicklist;
-	pte_quicklist = (unsigned long *) pte;
-	pgtable_cache_size++;
-}
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
-	free_page((unsigned long)pte);
-}
-
-static inline void pte_free_slow(struct page *ptepage)
-{
-	__free_page(ptepage);
-}
-
-static inline void pte_free(struct mm_struct *mm, struct page *ptepage)
-{
-	pgtable_page_dtor(ptepage);
-	__free_page(ptepage);
-}
-
 #define __pte_free_tlb(tlb, pte, addr)	pte_free((tlb)->mm, (pte))
 
 #define pmd_populate(mm, pmd, pte) \
@@ -171,10 +67,6 @@ static inline void pte_free(struct mm_struct *mm, struct page *ptepage)
 #define __pmd_free_tlb(tlb, x, addr)	pmd_free((tlb)->mm, x)
 #define pgd_populate(mm, pmd, pte)	BUG()
 
-extern int do_check_pgt_cache(int, int);
-
 #endif /* CONFIG_MMU */
 
-#define check_pgt_cache()		do { } while (0)
-
 #endif /* _ASM_MICROBLAZE_PGALLOC_H */
diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h
index 142d3f004848..954b69af451f 100644
--- a/arch/microblaze/include/asm/pgtable.h
+++ b/arch/microblaze/include/asm/pgtable.h
@@ -46,8 +46,6 @@ extern int mem_init_done;
 
 #define swapper_pg_dir ((pgd_t *) NULL)
 
-#define pgtable_cache_init()	do {} while (0)
-
 #define arch_enter_lazy_cpu_mode()	do {} while (0)
 
 #define pgprot_noncached_wc(prot)	prot
@@ -526,11 +524,6 @@ extern unsigned long iopa(unsigned long addr);
 /* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
 #define kern_addr_valid(addr)	(1)
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()	do { } while (0)
-
 void do_page_fault(struct pt_regs *regs, unsigned long address,
 		   unsigned long error_code);
 
diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c
index 8fe54fda31dc..010bb9cee2e4 100644
--- a/arch/microblaze/mm/pgtable.c
+++ b/arch/microblaze/mm/pgtable.c
@@ -44,10 +44,6 @@ unsigned long ioremap_base;
 unsigned long ioremap_bot;
 EXPORT_SYMBOL(ioremap_bot);
 
-#ifndef CONFIG_SMP
-struct pgtable_cache_struct quicklists;
-#endif
-
 static void __iomem *__ioremap(phys_addr_t addr, unsigned long size,
 		unsigned long flags)
 {
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index cc8e2b1032a5..a0bd9bdb5f83 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -5,7 +5,6 @@ config MIPS
 	select ARCH_32BIT_OFF_T if !64BIT
 	select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT
 	select ARCH_CLOCKSOURCE_DATA
-	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_SUPPORTS_UPROBES
@@ -13,6 +12,7 @@ config MIPS
 	select ARCH_USE_CMPXCHG_LOCKREF if 64BIT
 	select ARCH_USE_QUEUED_RWLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS
+	select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
 	select ARCH_WANT_IPC_PARSE_VERSION
 	select BUILDTIME_EXTABLE_SORT
 	select CLONE_BACKWARDS
diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h
index aa16b85ddffc..aa73cb187a07 100644
--- a/arch/mips/include/asm/pgalloc.h
+++ b/arch/mips/include/asm/pgalloc.h
@@ -105,8 +105,6 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 
 #endif /* __PAGETABLE_PUD_FOLDED */
 
-#define check_pgt_cache()	do { } while (0)
-
 extern void pagetable_init(void);
 
 #endif /* _ASM_PGALLOC_H */
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 4dca733d5076..f85bd5b15f51 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -661,9 +661,4 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 #define HAVE_ARCH_UNMAPPED_AREA
 #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
 
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init() do { } while (0)
-
 #endif /* _ASM_PGTABLE_H */
diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
index aca909bd7841..fba18d4a9190 100644
--- a/arch/mips/include/asm/processor.h
+++ b/arch/mips/include/asm/processor.h
@@ -29,11 +29,6 @@
 
 extern unsigned int vced_count, vcei_count;
 
-/*
- * MIPS does have an arch_pick_mmap_layout()
- */
-#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
-
 #ifdef CONFIG_32BIT
 #ifdef CONFIG_KVM_GUEST
 /* User space process size is limited to 1GB in KVM Guest Mode */
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index d79f2b432318..00fe90c6db3e 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -20,33 +20,6 @@
 unsigned long shm_align_mask = PAGE_SIZE - 1;	/* Sane caches */
 EXPORT_SYMBOL(shm_align_mask);
 
-/* gap between mmap and stack */
-#define MIN_GAP (128*1024*1024UL)
-#define MAX_GAP ((TASK_SIZE)/6*5)
-
-static int mmap_is_legacy(struct rlimit *rlim_stack)
-{
-	if (current->personality & ADDR_COMPAT_LAYOUT)
-		return 1;
-
-	if (rlim_stack->rlim_cur == RLIM_INFINITY)
-		return 1;
-
-	return sysctl_legacy_va_layout;
-}
-
-static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
-{
-	unsigned long gap = rlim_stack->rlim_cur;
-
-	if (gap < MIN_GAP)
-		gap = MIN_GAP;
-	else if (gap > MAX_GAP)
-		gap = MAX_GAP;
-
-	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
-}
-
 #define COLOUR_ALIGN(addr, pgoff)				\
 	((((addr) + shm_align_mask) & ~shm_align_mask) +	\
 	 (((pgoff) << PAGE_SHIFT) & shm_align_mask))
@@ -144,63 +117,6 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp,
 			addr0, len, pgoff, flags, DOWN);
 }
 
-unsigned long arch_mmap_rnd(void)
-{
-	unsigned long rnd;
-
-#ifdef CONFIG_COMPAT
-	if (TASK_IS_32BIT_ADDR)
-		rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
-	else
-#endif /* CONFIG_COMPAT */
-		rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
-
-	return rnd << PAGE_SHIFT;
-}
-
-void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
-{
-	unsigned long random_factor = 0UL;
-
-	if (current->flags & PF_RANDOMIZE)
-		random_factor = arch_mmap_rnd();
-
-	if (mmap_is_legacy(rlim_stack)) {
-		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
-		mm->get_unmapped_area = arch_get_unmapped_area;
-	} else {
-		mm->mmap_base = mmap_base(random_factor, rlim_stack);
-		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
-	}
-}
-
-static inline unsigned long brk_rnd(void)
-{
-	unsigned long rnd = get_random_long();
-
-	rnd = rnd << PAGE_SHIFT;
-	/* 8MB for 32bit, 256MB for 64bit */
-	if (TASK_IS_32BIT_ADDR)
-		rnd = rnd & 0x7ffffful;
-	else
-		rnd = rnd & 0xffffffful;
-
-	return rnd;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	unsigned long base = mm->brk;
-	unsigned long ret;
-
-	ret = PAGE_ALIGN(base + brk_rnd());
-
-	if (ret < mm->brk)
-		return mm->brk;
-
-	return ret;
-}
-
 bool __virt_addr_valid(const volatile void *kaddr)
 {
 	unsigned long vaddr = (unsigned long)kaddr;
diff --git a/arch/nds32/include/asm/pgalloc.h b/arch/nds32/include/asm/pgalloc.h
index e78b43d8389f..37125e6884d7 100644
--- a/arch/nds32/include/asm/pgalloc.h
+++ b/arch/nds32/include/asm/pgalloc.h
@@ -23,8 +23,6 @@
 extern pgd_t *pgd_alloc(struct mm_struct *mm);
 extern void pgd_free(struct mm_struct *mm, pgd_t * pgd);
 
-#define check_pgt_cache()		do { } while (0)
-
 static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
 {
 	pgtable_t pte;
diff --git a/arch/nds32/include/asm/pgtable.h b/arch/nds32/include/asm/pgtable.h
index c70cc56bec09..0588ec99725c 100644
--- a/arch/nds32/include/asm/pgtable.h
+++ b/arch/nds32/include/asm/pgtable.h
@@ -403,8 +403,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
403 * into virtual address `from' 403 * into virtual address `from'
404 */ 404 */
405 405
406#define pgtable_cache_init() do { } while (0)
407
408#endif /* !__ASSEMBLY__ */ 406#endif /* !__ASSEMBLY__ */
409 407
410#endif /* _ASMNDS32_PGTABLE_H */ 408#endif /* _ASMNDS32_PGTABLE_H */
diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h
index 4bc8cf72067e..750d18d5980b 100644
--- a/arch/nios2/include/asm/pgalloc.h
+++ b/arch/nios2/include/asm/pgalloc.h
@@ -45,6 +45,4 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
45 tlb_remove_page((tlb), (pte)); \ 45 tlb_remove_page((tlb), (pte)); \
46 } while (0) 46 } while (0)
47 47
48#define check_pgt_cache() do { } while (0)
49
50#endif /* _ASM_NIOS2_PGALLOC_H */ 48#endif /* _ASM_NIOS2_PGALLOC_H */
diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h
index 95237b7f6fc1..99985d8b7166 100644
--- a/arch/nios2/include/asm/pgtable.h
+++ b/arch/nios2/include/asm/pgtable.h
@@ -291,8 +291,6 @@ static inline void pte_clear(struct mm_struct *mm,
291 291
292#include <asm-generic/pgtable.h> 292#include <asm-generic/pgtable.h>
293 293
294#define pgtable_cache_init() do { } while (0)
295
296extern void __init paging_init(void); 294extern void __init paging_init(void);
297extern void __init mmu_init(void); 295extern void __init mmu_init(void);
298 296
diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h
index 3d4b397c2d06..787c1b9d2f6d 100644
--- a/arch/openrisc/include/asm/pgalloc.h
+++ b/arch/openrisc/include/asm/pgalloc.h
@@ -101,6 +101,4 @@ do { \
101 101
102#define pmd_pgtable(pmd) pmd_page(pmd) 102#define pmd_pgtable(pmd) pmd_page(pmd)
103 103
104#define check_pgt_cache() do { } while (0)
105
106#endif 104#endif
diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h
index 2fe9ff5b5d6f..248d22d8faa7 100644
--- a/arch/openrisc/include/asm/pgtable.h
+++ b/arch/openrisc/include/asm/pgtable.h
@@ -443,11 +443,6 @@ static inline void update_mmu_cache(struct vm_area_struct *vma,
443 443
444#include <asm-generic/pgtable.h> 444#include <asm-generic/pgtable.h>
445 445
446/*
447 * No page table caches to initialise
448 */
449#define pgtable_cache_init() do { } while (0)
450
451typedef pte_t *pte_addr_t; 446typedef pte_t *pte_addr_t;
452 447
453#endif /* __ASSEMBLY__ */ 448#endif /* __ASSEMBLY__ */
diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h
index 4f2059a50fae..d98647c29b74 100644
--- a/arch/parisc/include/asm/pgalloc.h
+++ b/arch/parisc/include/asm/pgalloc.h
@@ -124,6 +124,4 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
124 pmd_populate_kernel(mm, pmd, page_address(pte_page)) 124 pmd_populate_kernel(mm, pmd, page_address(pte_page))
125#define pmd_pgtable(pmd) pmd_page(pmd) 125#define pmd_pgtable(pmd) pmd_page(pmd)
126 126
127#define check_pgt_cache() do { } while (0)
128
129#endif 127#endif
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index 6d58c1739b42..4ac374b3a99f 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -132,8 +132,6 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
132#define PTRS_PER_PTE (1UL << BITS_PER_PTE) 132#define PTRS_PER_PTE (1UL << BITS_PER_PTE)
133 133
134/* Definitions for 2nd level */ 134/* Definitions for 2nd level */
135#define pgtable_cache_init() do { } while (0)
136
137#define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE) 135#define PMD_SHIFT (PLD_SHIFT + BITS_PER_PTE)
138#define PMD_SIZE (1UL << PMD_SHIFT) 136#define PMD_SIZE (1UL << PMD_SHIFT)
139#define PMD_MASK (~(PMD_SIZE-1)) 137#define PMD_MASK (~(PMD_SIZE-1))
diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h
index 2b2c60a1a66d..6dd78a2dc03a 100644
--- a/arch/powerpc/include/asm/pgalloc.h
+++ b/arch/powerpc/include/asm/pgalloc.h
@@ -64,8 +64,6 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage)
64extern struct kmem_cache *pgtable_cache[]; 64extern struct kmem_cache *pgtable_cache[];
65#define PGT_CACHE(shift) pgtable_cache[shift] 65#define PGT_CACHE(shift) pgtable_cache[shift]
66 66
67static inline void check_pgt_cache(void) { }
68
69#ifdef CONFIG_PPC_BOOK3S 67#ifdef CONFIG_PPC_BOOK3S
70#include <asm/book3s/pgalloc.h> 68#include <asm/book3s/pgalloc.h>
71#else 69#else
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index 8b7865a2d576..4053b2ab427c 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -87,7 +87,6 @@ extern unsigned long ioremap_bot;
87unsigned long vmalloc_to_phys(void *vmalloc_addr); 87unsigned long vmalloc_to_phys(void *vmalloc_addr);
88 88
89void pgtable_cache_add(unsigned int shift); 89void pgtable_cache_add(unsigned int shift);
90void pgtable_cache_init(void);
91 90
92#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32) 91#if defined(CONFIG_STRICT_KERNEL_RWX) || defined(CONFIG_PPC32)
93void mark_initmem_nx(void); 92void mark_initmem_nx(void);
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c
index 3410ea9f4de1..6c123760164e 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1748,7 +1748,7 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
1748 /* 1748 /*
1749 * If we try to do a HUGE PTE update after a withdraw is done, 1749 * If we try to do a HUGE PTE update after a withdraw is done,
1750 * we will find the below NULL. This happens when we do 1750 * we will find the below NULL. This happens when we do
1751 * split_huge_page_pmd 1751 * split_huge_pmd
1752 */ 1752 */
1753 if (!hpte_slot_array) 1753 if (!hpte_slot_array)
1754 return; 1754 return;
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index b056cae3388b..56cc84520577 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -129,11 +129,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
129 * Allow to use larger than 64k IOMMU pages. Only do that 129 * Allow to use larger than 64k IOMMU pages. Only do that
130 * if we are backed by hugetlb. 130 * if we are backed by hugetlb.
131 */ 131 */
132 if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) { 132 if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page))
133 struct page *head = compound_head(page); 133 pageshift = page_shift(compound_head(page));
134
135 pageshift = compound_order(head) + PAGE_SHIFT;
136 }
137 mem->pageshift = min(mem->pageshift, pageshift); 134 mem->pageshift = min(mem->pageshift, pageshift);
138 /* 135 /*
139 * We don't need struct page reference any more, switch 136 * We don't need struct page reference any more, switch
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index a8953f108808..73d4873fc7f8 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -667,7 +667,7 @@ void flush_dcache_icache_hugepage(struct page *page)
667 667
668 BUG_ON(!PageCompound(page)); 668 BUG_ON(!PageCompound(page));
669 669
670 for (i = 0; i < (1UL << compound_order(page)); i++) { 670 for (i = 0; i < compound_nr(page); i++) {
671 if (!PageHighMem(page)) { 671 if (!PageHighMem(page)) {
672 __flush_dcache_icache(page_address(page+i)); 672 __flush_dcache_icache(page_address(page+i));
673 } else { 673 } else {
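The page_shift()/page_size()/compound_nr() conversions above depend on small helpers added earlier in this series; as I read them (in include/linux/mm.h), they are thin wrappers around compound_order():

/* Number of bytes spanned by a (possibly compound) page. */
static inline unsigned long page_size(struct page *page)
{
	return PAGE_SIZE << compound_order(page);
}

/* log2 of page_size(); PAGE_SHIFT for a base page. */
static inline unsigned int page_shift(struct page *page)
{
	return PAGE_SHIFT + compound_order(page);
}

/* Number of base pages making up a (possibly compound) page. */
static inline unsigned long compound_nr(struct page *page)
{
	return 1UL << compound_order(page);
}

They let call sites such as the two hunks above drop the open-coded "PAGE_SIZE << compound_order(page)" and "1UL << compound_order(page)" expressions.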
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 71d29fb4008a..8eebbc8860bb 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -59,6 +59,18 @@ config RISCV
59 select ARCH_HAS_GIGANTIC_PAGE 59 select ARCH_HAS_GIGANTIC_PAGE
60 select ARCH_WANT_HUGE_PMD_SHARE if 64BIT 60 select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
61 select SPARSEMEM_STATIC if 32BIT 61 select SPARSEMEM_STATIC if 32BIT
62 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
63 select HAVE_ARCH_MMAP_RND_BITS
64
65config ARCH_MMAP_RND_BITS_MIN
66 default 18 if 64BIT
67 default 8
68
69# max bits determined by the following formula:
70# VA_BITS - PAGE_SHIFT - 3
71config ARCH_MMAP_RND_BITS_MAX
72 default 24 if 64BIT # SV39 based
73 default 17
62 74
63config MMU 75config MMU
64 def_bool y 76 def_bool y
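Working that formula out for the defaults above: with Sv39, VA_BITS = 39 and PAGE_SHIFT = 12, so 39 - 12 - 3 = 24; taking VA_BITS = 32 for the 32-bit (Sv32) case gives 32 - 12 - 3 = 17, matching ARCH_MMAP_RND_BITS_MAX.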
diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
index 56a67d66f72f..f66a00d8cb19 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -82,8 +82,4 @@ do { \
82 tlb_remove_page((tlb), pte); \ 82 tlb_remove_page((tlb), pte); \
83} while (0) 83} while (0)
84 84
85static inline void check_pgt_cache(void)
86{
87}
88
89#endif /* _ASM_RISCV_PGALLOC_H */ 85#endif /* _ASM_RISCV_PGALLOC_H */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 80905b27ee98..c60123f018f5 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -424,11 +424,6 @@ extern void *dtb_early_va;
424extern void setup_bootmem(void); 424extern void setup_bootmem(void);
425extern void paging_init(void); 425extern void paging_init(void);
426 426
427static inline void pgtable_cache_init(void)
428{
429 /* No page table caches to initialize */
430}
431
432#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) 427#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
433#define VMALLOC_END (PAGE_OFFSET - 1) 428#define VMALLOC_END (PAGE_OFFSET - 1)
434#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE) 429#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE)
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 0c4600725fc2..36c578c0ff96 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1682,12 +1682,6 @@ extern void s390_reset_cmma(struct mm_struct *mm);
1682#define HAVE_ARCH_UNMAPPED_AREA 1682#define HAVE_ARCH_UNMAPPED_AREA
1683#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 1683#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
1684 1684
1685/*
1686 * No page table caches to initialise
1687 */
1688static inline void pgtable_cache_init(void) { }
1689static inline void check_pgt_cache(void) { }
1690
1691#include <asm-generic/pgtable.h> 1685#include <asm-generic/pgtable.h>
1692 1686
1693#endif /* _S390_PAGE_H */ 1687#endif /* _S390_PAGE_H */
diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h
index b56f908b1395..8c6341a4d807 100644
--- a/arch/sh/include/asm/pgalloc.h
+++ b/arch/sh/include/asm/pgalloc.h
@@ -2,10 +2,8 @@
2#ifndef __ASM_SH_PGALLOC_H 2#ifndef __ASM_SH_PGALLOC_H
3#define __ASM_SH_PGALLOC_H 3#define __ASM_SH_PGALLOC_H
4 4
5#include <linux/quicklist.h>
6#include <asm/page.h> 5#include <asm/page.h>
7 6#include <asm-generic/pgalloc.h>
8#define QUICK_PT 0 /* Other page table pages that are zero on free */
9 7
10extern pgd_t *pgd_alloc(struct mm_struct *); 8extern pgd_t *pgd_alloc(struct mm_struct *);
11extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); 9extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
@@ -29,41 +27,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
29} 27}
30#define pmd_pgtable(pmd) pmd_page(pmd) 28#define pmd_pgtable(pmd) pmd_page(pmd)
31 29
32/*
33 * Allocate and free page tables.
34 */
35static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
36{
37 return quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL);
38}
39
40static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
41{
42 struct page *page;
43 void *pg;
44
45 pg = quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL);
46 if (!pg)
47 return NULL;
48 page = virt_to_page(pg);
49 if (!pgtable_page_ctor(page)) {
50 quicklist_free(QUICK_PT, NULL, pg);
51 return NULL;
52 }
53 return page;
54}
55
56static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
57{
58 quicklist_free(QUICK_PT, NULL, pte);
59}
60
61static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
62{
63 pgtable_page_dtor(pte);
64 quicklist_free_page(QUICK_PT, NULL, pte);
65}
66
67#define __pte_free_tlb(tlb,pte,addr) \ 30#define __pte_free_tlb(tlb,pte,addr) \
68do { \ 31do { \
69 pgtable_page_dtor(pte); \ 32 pgtable_page_dtor(pte); \
@@ -79,9 +42,4 @@ do { \
79} while (0); 42} while (0);
80#endif 43#endif
81 44
82static inline void check_pgt_cache(void)
83{
84 quicklist_trim(QUICK_PT, NULL, 25, 16);
85}
86
87#endif /* __ASM_SH_PGALLOC_H */ 45#endif /* __ASM_SH_PGALLOC_H */
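With the quicklist-based allocator gone, sh falls back to the common PTE helpers from <asm-generic/pgalloc.h> pulled in above. Roughly, and assuming the generic GFP_PGTABLE_* flags, those helpers do:

static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm)
{
	return (pte_t *)__get_free_page(GFP_PGTABLE_KERNEL);
}

static inline pgtable_t pte_alloc_one(struct mm_struct *mm)
{
	struct page *pte;

	pte = alloc_page(GFP_PGTABLE_USER);
	if (!pte)
		return NULL;
	if (!pgtable_page_ctor(pte)) {	/* page-table accounting + ptl init */
		__free_page(pte);
		return NULL;
	}
	return pte;
}

static inline void pte_free(struct mm_struct *mm, pgtable_t pte_page)
{
	pgtable_page_dtor(pte_page);
	__free_page(pte_page);
}

The behavioural change is that freed PTE pages return straight to the page allocator instead of sitting on a per-CPU quicklist that check_pgt_cache() used to trim.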
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index 9085d1142fa3..cbd0f3c55a0c 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -123,11 +123,6 @@ typedef pte_t *pte_addr_t;
123 123
124#define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT))) 124#define pte_pfn(x) ((unsigned long)(((x).pte_low >> PAGE_SHIFT)))
125 125
126/*
127 * Initialise the page table caches
128 */
129extern void pgtable_cache_init(void);
130
131struct vm_area_struct; 126struct vm_area_struct;
132struct mm_struct; 127struct mm_struct;
133 128
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index 02ed2df25a54..5c8a2ebfc720 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -1,9 +1,6 @@
1# SPDX-License-Identifier: GPL-2.0 1# SPDX-License-Identifier: GPL-2.0
2menu "Memory management options" 2menu "Memory management options"
3 3
4config QUICKLIST
5 def_bool y
6
7config MMU 4config MMU
8 bool "Support for memory management hardware" 5 bool "Support for memory management hardware"
9 depends on !CPU_SH2 6 depends on !CPU_SH2
diff --git a/arch/sh/mm/nommu.c b/arch/sh/mm/nommu.c
index cc779a90d917..dca946f426c6 100644
--- a/arch/sh/mm/nommu.c
+++ b/arch/sh/mm/nommu.c
@@ -97,7 +97,3 @@ void __init page_table_range_init(unsigned long start, unsigned long end,
97void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) 97void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
98{ 98{
99} 99}
100
101void pgtable_cache_init(void)
102{
103}
diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h
index 282be50a4adf..10538a4d1a1e 100644
--- a/arch/sparc/include/asm/pgalloc_32.h
+++ b/arch/sparc/include/asm/pgalloc_32.h
@@ -17,8 +17,6 @@ void srmmu_free_nocache(void *addr, int size);
17 17
18extern struct resource sparc_iomap; 18extern struct resource sparc_iomap;
19 19
20#define check_pgt_cache() do { } while (0)
21
22pgd_t *get_pgd_fast(void); 20pgd_t *get_pgd_fast(void);
23static inline void free_pgd_fast(pgd_t *pgd) 21static inline void free_pgd_fast(pgd_t *pgd)
24{ 22{
diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h
index 48abccba4991..9d3e5cc95bbb 100644
--- a/arch/sparc/include/asm/pgalloc_64.h
+++ b/arch/sparc/include/asm/pgalloc_64.h
@@ -69,8 +69,6 @@ void pte_free(struct mm_struct *mm, pgtable_t ptepage);
69#define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE) 69#define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE)
70#define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD)) 70#define pmd_pgtable(PMD) ((pte_t *)__pmd_page(PMD))
71 71
72#define check_pgt_cache() do { } while (0)
73
74void pgtable_free(void *table, bool is_page); 72void pgtable_free(void *table, bool is_page);
75 73
76#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index 4eebed6c6781..31da44826645 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -445,9 +445,4 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
445/* We provide our own get_unmapped_area to cope with VA holes for userland */ 445/* We provide our own get_unmapped_area to cope with VA holes for userland */
446#define HAVE_ARCH_UNMAPPED_AREA 446#define HAVE_ARCH_UNMAPPED_AREA
447 447
448/*
449 * No page table caches to initialise
450 */
451#define pgtable_cache_init() do { } while (0)
452
453#endif /* !(_SPARC_PGTABLE_H) */ 448#endif /* !(_SPARC_PGTABLE_H) */
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 1599de730532..b57f9c631eca 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -1135,7 +1135,6 @@ unsigned long get_fb_unmapped_area(struct file *filp, unsigned long,
1135 unsigned long); 1135 unsigned long);
1136#define HAVE_ARCH_FB_UNMAPPED_AREA 1136#define HAVE_ARCH_FB_UNMAPPED_AREA
1137 1137
1138void pgtable_cache_init(void);
1139void sun4v_register_fault_status(void); 1138void sun4v_register_fault_status(void);
1140void sun4v_ktsb_register(void); 1139void sun4v_ktsb_register(void);
1141void __init cheetah_ecache_flush_init(void); 1140void __init cheetah_ecache_flush_init(void);
diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c
index 046ab116cc8c..906eda1158b4 100644
--- a/arch/sparc/mm/init_32.c
+++ b/arch/sparc/mm/init_32.c
@@ -31,7 +31,6 @@
31#include <asm/page.h> 31#include <asm/page.h>
32#include <asm/pgtable.h> 32#include <asm/pgtable.h>
33#include <asm/vaddrs.h> 33#include <asm/vaddrs.h>
34#include <asm/pgalloc.h> /* bug in asm-generic/tlb.h: check_pgt_cache */
35#include <asm/setup.h> 34#include <asm/setup.h>
36#include <asm/tlb.h> 35#include <asm/tlb.h>
37#include <asm/prom.h> 36#include <asm/prom.h>
diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h
index 023599c3fa51..446e0c0f4018 100644
--- a/arch/um/include/asm/pgalloc.h
+++ b/arch/um/include/asm/pgalloc.h
@@ -43,7 +43,5 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
43#define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x)) 43#define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x))
44#endif 44#endif
45 45
46#define check_pgt_cache() do { } while (0)
47
48#endif 46#endif
49 47
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
index e4d3ed980d82..36a44d58f373 100644
--- a/arch/um/include/asm/pgtable.h
+++ b/arch/um/include/asm/pgtable.h
@@ -32,8 +32,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
32/* zero page used for uninitialized stuff */ 32/* zero page used for uninitialized stuff */
33extern unsigned long *empty_zero_page; 33extern unsigned long *empty_zero_page;
34 34
35#define pgtable_cache_init() do ; while (0)
36
37/* Just any arbitrary offset to the start of the vmalloc VM area: the 35/* Just any arbitrary offset to the start of the vmalloc VM area: the
38 * current 8MB value just means that there will be a 8MB "hole" after the 36 * current 8MB value just means that there will be a 8MB "hole" after the
39 * physical memory until the kernel virtual memory starts. That means that 37 * physical memory until the kernel virtual memory starts. That means that
diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h
index 3f0903bd98e9..ba1c9a79993b 100644
--- a/arch/unicore32/include/asm/pgalloc.h
+++ b/arch/unicore32/include/asm/pgalloc.h
@@ -18,8 +18,6 @@
18#define __HAVE_ARCH_PTE_ALLOC_ONE 18#define __HAVE_ARCH_PTE_ALLOC_ONE
19#include <asm-generic/pgalloc.h> 19#include <asm-generic/pgalloc.h>
20 20
21#define check_pgt_cache() do { } while (0)
22
23#define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) 21#define _PAGE_USER_TABLE (PMD_TYPE_TABLE | PMD_PRESENT)
24#define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_PRESENT) 22#define _PAGE_KERNEL_TABLE (PMD_TYPE_TABLE | PMD_PRESENT)
25 23
diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h
index 126e961a8cb0..c8f7ba12f309 100644
--- a/arch/unicore32/include/asm/pgtable.h
+++ b/arch/unicore32/include/asm/pgtable.h
@@ -285,8 +285,6 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
285 285
286#include <asm-generic/pgtable.h> 286#include <asm-generic/pgtable.h>
287 287
288#define pgtable_cache_init() do { } while (0)
289
290#endif /* !__ASSEMBLY__ */ 288#endif /* !__ASSEMBLY__ */
291 289
292#endif /* __UNICORE_PGTABLE_H__ */ 290#endif /* __UNICORE_PGTABLE_H__ */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index c78da8eda8f2..0dca7f7aeff2 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -29,8 +29,6 @@ extern pgd_t swapper_pg_dir[1024];
29extern pgd_t initial_page_table[1024]; 29extern pgd_t initial_page_table[1024];
30extern pmd_t initial_pg_pmd[]; 30extern pmd_t initial_pg_pmd[];
31 31
32static inline void pgtable_cache_init(void) { }
33static inline void check_pgt_cache(void) { }
34void paging_init(void); 32void paging_init(void);
35void sync_initial_page_table(void); 33void sync_initial_page_table(void);
36 34
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 4990d26dfc73..0b6c4042942a 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -241,9 +241,6 @@ extern void cleanup_highmap(void);
241#define HAVE_ARCH_UNMAPPED_AREA 241#define HAVE_ARCH_UNMAPPED_AREA
242#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN 242#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
243 243
244#define pgtable_cache_init() do { } while (0)
245#define check_pgt_cache() do { } while (0)
246
247#define PAGE_AGP PAGE_KERNEL_NOCACHE 244#define PAGE_AGP PAGE_KERNEL_NOCACHE
248#define HAVE_PAGE_AGP 1 245#define HAVE_PAGE_AGP 1
249 246
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 44816ff6411f..463940faf52f 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -357,7 +357,7 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm,
357 357
358static struct kmem_cache *pgd_cache; 358static struct kmem_cache *pgd_cache;
359 359
360void __init pgd_cache_init(void) 360void __init pgtable_cache_init(void)
361{ 361{
362 /* 362 /*
363 * When PAE kernel is running as a Xen domain, it does not use 363 * When PAE kernel is running as a Xen domain, it does not use
@@ -402,10 +402,6 @@ static inline void _pgd_free(pgd_t *pgd)
402} 402}
403#else 403#else
404 404
405void __init pgd_cache_init(void)
406{
407}
408
409static inline pgd_t *_pgd_alloc(void) 405static inline pgd_t *_pgd_alloc(void)
410{ 406{
411 return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER, 407 return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
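The x86 rename above is the flip side of deleting all the empty pgtable_cache_init()/check_pgt_cache() stubs in this diff: the work that used to hide behind pgd_cache_init() now happens in pgtable_cache_init(), and architectures with nothing to do simply stop defining it. The fallback is a no-op default; its exact form and location are an assumption here, but it amounts to:

/* Assumed default; architectures that really keep kmem caches for page
 * tables (x86 PAE above, powerpc, sparc64, sh) override it. */
void __init __weak pgtable_cache_init(void)
{
}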
diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h
index ce3ff5e591b9..3f7fe5a8c286 100644
--- a/arch/xtensa/include/asm/pgtable.h
+++ b/arch/xtensa/include/asm/pgtable.h
@@ -238,7 +238,6 @@ extern void paging_init(void);
238# define swapper_pg_dir NULL 238# define swapper_pg_dir NULL
239static inline void paging_init(void) { } 239static inline void paging_init(void) { }
240#endif 240#endif
241static inline void pgtable_cache_init(void) { }
242 241
243/* 242/*
244 * The pmd contains the kernel virtual address of the pte page. 243 * The pmd contains the kernel virtual address of the pte page.
diff --git a/arch/xtensa/include/asm/tlbflush.h b/arch/xtensa/include/asm/tlbflush.h
index 06875feb27c2..856e2da2e397 100644
--- a/arch/xtensa/include/asm/tlbflush.h
+++ b/arch/xtensa/include/asm/tlbflush.h
@@ -160,9 +160,6 @@ static inline void invalidate_dtlb_mapping (unsigned address)
160 invalidate_dtlb_entry(tlb_entry); 160 invalidate_dtlb_entry(tlb_entry);
161} 161}
162 162
163#define check_pgt_cache() do { } while (0)
164
165
166/* 163/*
167 * DO NOT USE THESE FUNCTIONS. These instructions aren't part of the Xtensa 164 * DO NOT USE THESE FUNCTIONS. These instructions aren't part of the Xtensa
168 * ISA and exist only for test purposes. 165 * ISA and exist only for test purposes.
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 20c39d1bcef8..6bea4f3f8040 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -100,26 +100,9 @@ unsigned long __weak memory_block_size_bytes(void)
100} 100}
101EXPORT_SYMBOL_GPL(memory_block_size_bytes); 101EXPORT_SYMBOL_GPL(memory_block_size_bytes);
102 102
103static unsigned long get_memory_block_size(void)
104{
105 unsigned long block_sz;
106
107 block_sz = memory_block_size_bytes();
108
109 /* Validate blk_sz is a power of 2 and not less than section size */
110 if ((block_sz & (block_sz - 1)) || (block_sz < MIN_MEMORY_BLOCK_SIZE)) {
111 WARN_ON(1);
112 block_sz = MIN_MEMORY_BLOCK_SIZE;
113 }
114
115 return block_sz;
116}
117
118/* 103/*
119 * use this as the physical section index that this memsection 104 * Show the first physical section index (number) of this memory block.
120 * uses.
121 */ 105 */
122
123static ssize_t phys_index_show(struct device *dev, 106static ssize_t phys_index_show(struct device *dev,
124 struct device_attribute *attr, char *buf) 107 struct device_attribute *attr, char *buf)
125{ 108{
@@ -131,7 +114,10 @@ static ssize_t phys_index_show(struct device *dev,
131} 114}
132 115
133/* 116/*
134 * Show whether the section of memory is likely to be hot-removable 117 * Show whether the memory block is likely to be offlineable (or is already
118 * offline). Once offline, the memory block could be removed. The return
119 * value does not, however, guarantee that there is a way to remove the
120 * memory block.
135 */ 121 */
136static ssize_t removable_show(struct device *dev, struct device_attribute *attr, 122static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
137 char *buf) 123 char *buf)
@@ -455,12 +441,12 @@ static DEVICE_ATTR_RO(phys_device);
455static DEVICE_ATTR_RO(removable); 441static DEVICE_ATTR_RO(removable);
456 442
457/* 443/*
458 * Block size attribute stuff 444 * Show the memory block size (shared by all memory blocks).
459 */ 445 */
460static ssize_t block_size_bytes_show(struct device *dev, 446static ssize_t block_size_bytes_show(struct device *dev,
461 struct device_attribute *attr, char *buf) 447 struct device_attribute *attr, char *buf)
462{ 448{
463 return sprintf(buf, "%lx\n", get_memory_block_size()); 449 return sprintf(buf, "%lx\n", memory_block_size_bytes());
464} 450}
465 451
466static DEVICE_ATTR_RO(block_size_bytes); 452static DEVICE_ATTR_RO(block_size_bytes);
@@ -670,10 +656,10 @@ static int init_memory_block(struct memory_block **memory,
670 return -ENOMEM; 656 return -ENOMEM;
671 657
672 mem->start_section_nr = block_id * sections_per_block; 658 mem->start_section_nr = block_id * sections_per_block;
673 mem->end_section_nr = mem->start_section_nr + sections_per_block - 1;
674 mem->state = state; 659 mem->state = state;
675 start_pfn = section_nr_to_pfn(mem->start_section_nr); 660 start_pfn = section_nr_to_pfn(mem->start_section_nr);
676 mem->phys_device = arch_get_memory_phys_device(start_pfn); 661 mem->phys_device = arch_get_memory_phys_device(start_pfn);
662 mem->nid = NUMA_NO_NODE;
677 663
678 ret = register_memory(mem); 664 ret = register_memory(mem);
679 665
@@ -810,19 +796,22 @@ static const struct attribute_group *memory_root_attr_groups[] = {
810/* 796/*
811 * Initialize the sysfs support for memory devices... 797 * Initialize the sysfs support for memory devices...
812 */ 798 */
813int __init memory_dev_init(void) 799void __init memory_dev_init(void)
814{ 800{
815 int ret; 801 int ret;
816 int err; 802 int err;
817 unsigned long block_sz, nr; 803 unsigned long block_sz, nr;
818 804
805 /* Validate the configured memory block size */
806 block_sz = memory_block_size_bytes();
807 if (!is_power_of_2(block_sz) || block_sz < MIN_MEMORY_BLOCK_SIZE)
808 panic("Memory block size not suitable: 0x%lx\n", block_sz);
809 sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
810
819 ret = subsys_system_register(&memory_subsys, memory_root_attr_groups); 811 ret = subsys_system_register(&memory_subsys, memory_root_attr_groups);
820 if (ret) 812 if (ret)
821 goto out; 813 goto out;
822 814
823 block_sz = get_memory_block_size();
824 sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
825
826 /* 815 /*
827 * Create entries for memory sections that were found 816 * Create entries for memory sections that were found
828 * during boot and have been initialized 817 * during boot and have been initialized
@@ -838,8 +827,7 @@ int __init memory_dev_init(void)
838 827
839out: 828out:
840 if (ret) 829 if (ret)
841 printk(KERN_ERR "%s() failed: %d\n", __func__, ret); 830 panic("%s() failed: %d\n", __func__, ret);
842 return ret;
843} 831}
844 832
845/** 833/**
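memory_dev_init() now validates the memory block size once, up front, and treats a bad value, like a later registration failure, as fatal rather than something to merely log. The power-of-two check uses the stock helper from <linux/log2.h>, reproduced here for reference:

static inline __attribute__((const))
bool is_power_of_2(unsigned long n)
{
	return (n != 0 && ((n & (n - 1)) == 0));
}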
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 75b7e6f6535b..296546ffed6c 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -427,6 +427,8 @@ static ssize_t node_read_meminfo(struct device *dev,
427 "Node %d AnonHugePages: %8lu kB\n" 427 "Node %d AnonHugePages: %8lu kB\n"
428 "Node %d ShmemHugePages: %8lu kB\n" 428 "Node %d ShmemHugePages: %8lu kB\n"
429 "Node %d ShmemPmdMapped: %8lu kB\n" 429 "Node %d ShmemPmdMapped: %8lu kB\n"
430 "Node %d FileHugePages: %8lu kB\n"
431 "Node %d FilePmdMapped: %8lu kB\n"
430#endif 432#endif
431 , 433 ,
432 nid, K(node_page_state(pgdat, NR_FILE_DIRTY)), 434 nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
@@ -452,6 +454,10 @@ static ssize_t node_read_meminfo(struct device *dev,
452 nid, K(node_page_state(pgdat, NR_SHMEM_THPS) * 454 nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
453 HPAGE_PMD_NR), 455 HPAGE_PMD_NR),
454 nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) * 456 nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
457 HPAGE_PMD_NR),
458 nid, K(node_page_state(pgdat, NR_FILE_THPS) *
459 HPAGE_PMD_NR),
460 nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) *
455 HPAGE_PMD_NR) 461 HPAGE_PMD_NR)
456#endif 462#endif
457 ); 463 );
@@ -756,15 +762,13 @@ static int __ref get_nid_for_pfn(unsigned long pfn)
756static int register_mem_sect_under_node(struct memory_block *mem_blk, 762static int register_mem_sect_under_node(struct memory_block *mem_blk,
757 void *arg) 763 void *arg)
758{ 764{
765 unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE;
766 unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
767 unsigned long end_pfn = start_pfn + memory_block_pfns - 1;
759 int ret, nid = *(int *)arg; 768 int ret, nid = *(int *)arg;
760 unsigned long pfn, sect_start_pfn, sect_end_pfn; 769 unsigned long pfn;
761 770
762 mem_blk->nid = nid; 771 for (pfn = start_pfn; pfn <= end_pfn; pfn++) {
763
764 sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
765 sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
766 sect_end_pfn += PAGES_PER_SECTION - 1;
767 for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
768 int page_nid; 772 int page_nid;
769 773
770 /* 774 /*
@@ -789,6 +793,13 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
789 if (page_nid != nid) 793 if (page_nid != nid)
790 continue; 794 continue;
791 } 795 }
796
797 /*
798 * If this memory block spans multiple nodes, we only indicate
799 * the last processed node.
800 */
801 mem_blk->nid = nid;
802
792 ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, 803 ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
793 &mem_blk->dev.kobj, 804 &mem_blk->dev.kobj,
794 kobject_name(&mem_blk->dev.kobj)); 805 kobject_name(&mem_blk->dev.kobj));
@@ -804,32 +815,18 @@ static int register_mem_sect_under_node(struct memory_block *mem_blk,
804} 815}
805 816
806/* 817/*
807 * Unregister memory block device under all nodes that it spans. 818 * Unregister a memory block device under the node it spans. Memory blocks
808 * Has to be called with mem_sysfs_mutex held (due to unlinked_nodes). 819 * with multiple nodes cannot be offlined and can therefore never be removed.
809 */ 820 */
810void unregister_memory_block_under_nodes(struct memory_block *mem_blk) 821void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
811{ 822{
812 unsigned long pfn, sect_start_pfn, sect_end_pfn; 823 if (mem_blk->nid == NUMA_NO_NODE)
813 static nodemask_t unlinked_nodes; 824 return;
814
815 nodes_clear(unlinked_nodes);
816 sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
817 sect_end_pfn = section_nr_to_pfn(mem_blk->end_section_nr);
818 for (pfn = sect_start_pfn; pfn <= sect_end_pfn; pfn++) {
819 int nid;
820 825
821 nid = get_nid_for_pfn(pfn); 826 sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj,
822 if (nid < 0) 827 kobject_name(&mem_blk->dev.kobj));
823 continue; 828 sysfs_remove_link(&mem_blk->dev.kobj,
824 if (!node_online(nid)) 829 kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
825 continue;
826 if (node_test_and_set(nid, unlinked_nodes))
827 continue;
828 sysfs_remove_link(&node_devices[nid]->dev.kobj,
829 kobject_name(&mem_blk->dev.kobj));
830 sysfs_remove_link(&mem_blk->dev.kobj,
831 kobject_name(&node_devices[nid]->dev.kobj));
832 }
833} 830}
834 831
835int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) 832int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn)
diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c
index c70cb5f272cf..0891ab829b1b 100644
--- a/drivers/crypto/chelsio/chtls/chtls_io.c
+++ b/drivers/crypto/chelsio/chtls/chtls_io.c
@@ -1078,7 +1078,7 @@ new_buf:
1078 bool merge; 1078 bool merge;
1079 1079
1080 if (page) 1080 if (page)
1081 pg_size <<= compound_order(page); 1081 pg_size = page_size(page);
1082 if (off < pg_size && 1082 if (off < pg_size &&
1083 skb_can_coalesce(skb, i, page, off)) { 1083 skb_can_coalesce(skb, i, page, off)) {
1084 merge = 1; 1084 merge = 1;
@@ -1105,8 +1105,7 @@ new_buf:
1105 __GFP_NORETRY, 1105 __GFP_NORETRY,
1106 order); 1106 order);
1107 if (page) 1107 if (page)
1108 pg_size <<= 1108 pg_size <<= order;
1109 compound_order(page);
1110 } 1109 }
1111 if (!page) { 1110 if (!page) {
1112 page = alloc_page(gfp); 1111 page = alloc_page(gfp);
diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c
index feaa538026a0..3db000aacd26 100644
--- a/drivers/gpu/drm/via/via_dmablit.c
+++ b/drivers/gpu/drm/via/via_dmablit.c
@@ -174,7 +174,6 @@ via_map_blit_for_device(struct pci_dev *pdev,
174static void 174static void
175via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg) 175via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg)
176{ 176{
177 struct page *page;
178 int i; 177 int i;
179 178
180 switch (vsg->state) { 179 switch (vsg->state) {
@@ -189,13 +188,8 @@ via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg)
189 kfree(vsg->desc_pages); 188 kfree(vsg->desc_pages);
190 /* fall through */ 189 /* fall through */
191 case dr_via_pages_locked: 190 case dr_via_pages_locked:
192 for (i = 0; i < vsg->num_pages; ++i) { 191 put_user_pages_dirty_lock(vsg->pages, vsg->num_pages,
193 if (NULL != (page = vsg->pages[i])) { 192 (vsg->direction == DMA_FROM_DEVICE));
194 if (!PageReserved(page) && (DMA_FROM_DEVICE == vsg->direction))
195 SetPageDirty(page);
196 put_page(page);
197 }
198 }
199 /* fall through */ 193 /* fall through */
200 case dr_via_pages_alloc: 194 case dr_via_pages_alloc:
201 vfree(vsg->pages); 195 vfree(vsg->pages);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 41f9e268e3fb..24244a2f68cc 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -54,10 +54,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
54 54
55 for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { 55 for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
56 page = sg_page_iter_page(&sg_iter); 56 page = sg_page_iter_page(&sg_iter);
57 if (umem->writable && dirty) 57 put_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
58 put_user_pages_dirty_lock(&page, 1);
59 else
60 put_user_page(page);
61 } 58 }
62 59
63 sg_free_table(&umem->sg_head); 60 sg_free_table(&umem->sg_head);
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index b89a9b9aef7a..469acb961fbd 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -118,10 +118,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np
118void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, 118void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
119 size_t npages, bool dirty) 119 size_t npages, bool dirty)
120{ 120{
121 if (dirty) 121 put_user_pages_dirty_lock(p, npages, dirty);
122 put_user_pages_dirty_lock(p, npages);
123 else
124 put_user_pages(p, npages);
125 122
126 if (mm) { /* during close after signal, mm can be NULL */ 123 if (mm) { /* during close after signal, mm can be NULL */
127 atomic64_sub(npages, &mm->pinned_vm); 124 atomic64_sub(npages, &mm->pinned_vm);
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index bfbfbb7e0ff4..6bf764e41891 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -40,10 +40,7 @@
40static void __qib_release_user_pages(struct page **p, size_t num_pages, 40static void __qib_release_user_pages(struct page **p, size_t num_pages,
41 int dirty) 41 int dirty)
42{ 42{
43 if (dirty) 43 put_user_pages_dirty_lock(p, num_pages, dirty);
44 put_user_pages_dirty_lock(p, num_pages);
45 else
46 put_user_pages(p, num_pages);
47} 44}
48 45
49/** 46/**
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 0b0237d41613..62e6ffa9ad78 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -75,10 +75,7 @@ static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty)
75 for_each_sg(chunk->page_list, sg, chunk->nents, i) { 75 for_each_sg(chunk->page_list, sg, chunk->nents, i) {
76 page = sg_page(sg); 76 page = sg_page(sg);
77 pa = sg_phys(sg); 77 pa = sg_phys(sg);
78 if (dirty) 78 put_user_pages_dirty_lock(&page, 1, dirty);
79 put_user_pages_dirty_lock(&page, 1);
80 else
81 put_user_page(page);
82 usnic_dbg("pa: %pa\n", &pa); 79 usnic_dbg("pa: %pa\n", &pa);
83 } 80 }
84 kfree(chunk); 81 kfree(chunk);
diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
index 87a56039f0ef..e99983f07663 100644
--- a/drivers/infiniband/sw/siw/siw_mem.c
+++ b/drivers/infiniband/sw/siw/siw_mem.c
@@ -63,15 +63,7 @@ struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index)
63static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages, 63static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
64 bool dirty) 64 bool dirty)
65{ 65{
66 struct page **p = chunk->plist; 66 put_user_pages_dirty_lock(chunk->plist, num_pages, dirty);
67
68 while (num_pages--) {
69 if (!PageDirty(*p) && dirty)
70 put_user_pages_dirty_lock(p, 1);
71 else
72 put_user_page(*p);
73 p++;
74 }
75} 67}
76 68
77void siw_umem_release(struct siw_umem *umem, bool dirty) 69void siw_umem_release(struct siw_umem *umem, bool dirty)
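Every call site in the hunks above funnels into the same consolidated helper, which gains a make_dirty argument in this series. Its prototype, and a hypothetical example_release() showing the before/after caller pattern:

/* Prototype after this series (declared in <linux/mm.h>): */
void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
			       bool make_dirty);

/* Hypothetical driver release path, for illustration only: */
static void example_release(struct page **pages, unsigned long npages,
			    bool dirty)
{
	/*
	 * Old open-coded pattern:
	 *	if (dirty)
	 *		put_user_pages_dirty_lock(pages, npages);
	 *	else
	 *		put_user_pages(pages, npages);
	 */
	put_user_pages_dirty_lock(pages, npages, dirty);
}

Centralizing the dirty decision also removes the slightly different variants above (siw skipping pages that were already PageDirty(), via checking PageReserved()).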
diff --git a/drivers/staging/android/ion/ion_system_heap.c b/drivers/staging/android/ion/ion_system_heap.c
index aa8d8425be25..b83a1d16bd89 100644
--- a/drivers/staging/android/ion/ion_system_heap.c
+++ b/drivers/staging/android/ion/ion_system_heap.c
@@ -120,7 +120,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap,
120 if (!page) 120 if (!page)
121 goto free_pages; 121 goto free_pages;
122 list_add_tail(&page->lru, &pages); 122 list_add_tail(&page->lru, &pages);
123 size_remaining -= PAGE_SIZE << compound_order(page); 123 size_remaining -= page_size(page);
124 max_order = compound_order(page); 124 max_order = compound_order(page);
125 i++; 125 i++;
126 } 126 }
@@ -133,7 +133,7 @@ static int ion_system_heap_allocate(struct ion_heap *heap,
133 133
134 sg = table->sgl; 134 sg = table->sgl;
135 list_for_each_entry_safe(page, tmp_page, &pages, lru) { 135 list_for_each_entry_safe(page, tmp_page, &pages, lru) {
136 sg_set_page(sg, page, PAGE_SIZE << compound_order(page), 0); 136 sg_set_page(sg, page, page_size(page), 0);
137 sg = sg_next(sg); 137 sg = sg_next(sg);
138 list_del(&page->lru); 138 list_del(&page->lru);
139 } 139 }
diff --git a/drivers/target/tcm_fc/tfc_io.c b/drivers/target/tcm_fc/tfc_io.c
index a254792d882c..1354a157e9af 100644
--- a/drivers/target/tcm_fc/tfc_io.c
+++ b/drivers/target/tcm_fc/tfc_io.c
@@ -136,8 +136,7 @@ int ft_queue_data_in(struct se_cmd *se_cmd)
136 page, off_in_page, tlen); 136 page, off_in_page, tlen);
137 fr_len(fp) += tlen; 137 fr_len(fp) += tlen;
138 fp_skb(fp)->data_len += tlen; 138 fp_skb(fp)->data_len += tlen;
139 fp_skb(fp)->truesize += 139 fp_skb(fp)->truesize += page_size(page);
140 PAGE_SIZE << compound_order(page);
141 } else { 140 } else {
142 BUG_ON(!page); 141 BUG_ON(!page);
143 from = kmap_atomic(page + (mem_off >> PAGE_SHIFT)); 142 from = kmap_atomic(page + (mem_off >> PAGE_SHIFT));
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 3b18fa4d090a..26cef65b41e7 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -176,13 +176,13 @@ put_exit:
176} 176}
177 177
178static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa, 178static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
179 unsigned int page_shift) 179 unsigned int it_page_shift)
180{ 180{
181 struct page *page; 181 struct page *page;
182 unsigned long size = 0; 182 unsigned long size = 0;
183 183
184 if (mm_iommu_is_devmem(mm, hpa, page_shift, &size)) 184 if (mm_iommu_is_devmem(mm, hpa, it_page_shift, &size))
185 return size == (1UL << page_shift); 185 return size == (1UL << it_page_shift);
186 186
187 page = pfn_to_page(hpa >> PAGE_SHIFT); 187 page = pfn_to_page(hpa >> PAGE_SHIFT);
188 /* 188 /*
@@ -190,7 +190,7 @@ static bool tce_page_is_contained(struct mm_struct *mm, unsigned long hpa,
190 * a page we just found. Otherwise the hardware can get access to 190 * a page we just found. Otherwise the hardware can get access to
191 * a bigger memory chunk that it should. 191 * a bigger memory chunk that it should.
192 */ 192 */
193 return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift; 193 return page_shift(compound_head(page)) >= it_page_shift;
194} 194}
195 195
196static inline bool tce_groups_attached(struct tce_container *container) 196static inline bool tce_groups_attached(struct tce_container *container)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d4e11b2e04f6..cec3b4146440 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -670,26 +670,6 @@ out:
670 * libraries. There is no binary dependent code anywhere else. 670 * libraries. There is no binary dependent code anywhere else.
671 */ 671 */
672 672
673#ifndef STACK_RND_MASK
674#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
675#endif
676
677static unsigned long randomize_stack_top(unsigned long stack_top)
678{
679 unsigned long random_variable = 0;
680
681 if (current->flags & PF_RANDOMIZE) {
682 random_variable = get_random_long();
683 random_variable &= STACK_RND_MASK;
684 random_variable <<= PAGE_SHIFT;
685 }
686#ifdef CONFIG_STACK_GROWSUP
687 return PAGE_ALIGN(stack_top) + random_variable;
688#else
689 return PAGE_ALIGN(stack_top) - random_variable;
690#endif
691}
692
693static int load_elf_binary(struct linux_binprm *bprm) 673static int load_elf_binary(struct linux_binprm *bprm)
694{ 674{
695 struct file *interpreter = NULL; /* to shut gcc up */ 675 struct file *interpreter = NULL; /* to shut gcc up */
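The stack-top randomization removed here is not lost: the series moves randomize_stack_top() into common code (mm/util.c, declared in <linux/mm.h>, as I read the commit titles) so that other architectures' loaders can share it, and load_elf_binary() keeps calling it with unchanged semantics:

/* Declaration after the move; behaviour matches the deleted copy above
 * (PF_RANDOMIZE, STACK_RND_MASK, PAGE_SHIFT, CONFIG_STACK_GROWSUP). */
unsigned long randomize_stack_top(unsigned long stack_top);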
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 1bda2ab6745b..814ad2c2ba80 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1100,8 +1100,11 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used,
1100 err = -ENOMEM; 1100 err = -ENOMEM;
1101 goto error; 1101 goto error;
1102 } 1102 }
1103 /* Avoid race with userspace read via bdev */
1104 lock_buffer(bhs[n]);
1103 memset(bhs[n]->b_data, 0, sb->s_blocksize); 1105 memset(bhs[n]->b_data, 0, sb->s_blocksize);
1104 set_buffer_uptodate(bhs[n]); 1106 set_buffer_uptodate(bhs[n]);
1107 unlock_buffer(bhs[n]);
1105 mark_buffer_dirty_inode(bhs[n], dir); 1108 mark_buffer_dirty_inode(bhs[n], dir);
1106 1109
1107 n++; 1110 n++;
@@ -1158,6 +1161,8 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts)
1158 fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); 1161 fat_time_unix2fat(sbi, ts, &time, &date, &time_cs);
1159 1162
1160 de = (struct msdos_dir_entry *)bhs[0]->b_data; 1163 de = (struct msdos_dir_entry *)bhs[0]->b_data;
1164 /* Avoid race with userspace read via bdev */
1165 lock_buffer(bhs[0]);
1161 /* filling the new directory slots ("." and ".." entries) */ 1166 /* filling the new directory slots ("." and ".." entries) */
1162 memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME); 1167 memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME);
1163 memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME); 1168 memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME);
@@ -1180,6 +1185,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts)
1180 de[0].size = de[1].size = 0; 1185 de[0].size = de[1].size = 0;
1181 memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); 1186 memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
1182 set_buffer_uptodate(bhs[0]); 1187 set_buffer_uptodate(bhs[0]);
1188 unlock_buffer(bhs[0]);
1183 mark_buffer_dirty_inode(bhs[0], dir); 1189 mark_buffer_dirty_inode(bhs[0], dir);
1184 1190
1185 err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); 1191 err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
@@ -1237,11 +1243,14 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
1237 1243
1238 /* fill the directory entry */ 1244 /* fill the directory entry */
1239 copy = min(size, sb->s_blocksize); 1245 copy = min(size, sb->s_blocksize);
1246 /* Avoid race with userspace read via bdev */
1247 lock_buffer(bhs[n]);
1240 memcpy(bhs[n]->b_data, slots, copy); 1248 memcpy(bhs[n]->b_data, slots, copy);
1241 slots += copy;
1242 size -= copy;
1243 set_buffer_uptodate(bhs[n]); 1249 set_buffer_uptodate(bhs[n]);
1250 unlock_buffer(bhs[n]);
1244 mark_buffer_dirty_inode(bhs[n], dir); 1251 mark_buffer_dirty_inode(bhs[n], dir);
1252 slots += copy;
1253 size -= copy;
1245 if (!size) 1254 if (!size)
1246 break; 1255 break;
1247 n++; 1256 n++;
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 265983635f2b..3647c65a0f48 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -388,8 +388,11 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
388 err = -ENOMEM; 388 err = -ENOMEM;
389 goto error; 389 goto error;
390 } 390 }
391 /* Avoid race with userspace read via bdev */
392 lock_buffer(c_bh);
391 memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); 393 memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
392 set_buffer_uptodate(c_bh); 394 set_buffer_uptodate(c_bh);
395 unlock_buffer(c_bh);
393 mark_buffer_dirty_inode(c_bh, sbi->fat_inode); 396 mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
394 if (sb->s_flags & SB_SYNCHRONOUS) 397 if (sb->s_flags & SB_SYNCHRONOUS)
395 err = sync_dirty_buffer(c_bh); 398 err = sync_dirty_buffer(c_bh);
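All three fat hunks apply the same pattern: the buffer contents are updated only while the buffer lock is held, presumably so a concurrent read arriving through the block device never sees the buffer marked uptodate while it is still being filled in. The sketch below just restates that pattern with placeholder bh/sb/dir variables:

	lock_buffer(bh);			/* exclude the racing reader   */
	memset(bh->b_data, 0, sb->s_blocksize);	/* or memcpy() of new contents */
	set_buffer_uptodate(bh);
	unlock_buffer(bh);
	mark_buffer_dirty_inode(bh, dir);	/* queue for writeback, as before */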
diff --git a/fs/inode.c b/fs/inode.c
index 64bf28cf05cd..fef457a42882 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -181,6 +181,9 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
181 mapping->flags = 0; 181 mapping->flags = 0;
182 mapping->wb_err = 0; 182 mapping->wb_err = 0;
183 atomic_set(&mapping->i_mmap_writable, 0); 183 atomic_set(&mapping->i_mmap_writable, 0);
184#ifdef CONFIG_READ_ONLY_THP_FOR_FS
185 atomic_set(&mapping->nr_thps, 0);
186#endif
184 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 187 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
185 mapping->private_data = NULL; 188 mapping->private_data = NULL;
186 mapping->writeback_index = 0; 189 mapping->writeback_index = 0;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0dadbdbead0f..f83de4c6a826 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3319,7 +3319,7 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
3319 } 3319 }
3320 3320
3321 page = virt_to_head_page(ptr); 3321 page = virt_to_head_page(ptr);
3322 if (sz > (PAGE_SIZE << compound_order(page))) 3322 if (sz > page_size(page))
3323 return -EINVAL; 3323 return -EINVAL;
3324 3324
3325 pfn = virt_to_phys(ptr) >> PAGE_SHIFT; 3325 pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 953990eb70a9..1c58859aa592 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -89,8 +89,6 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
89EXPORT_SYMBOL(jbd2_journal_invalidatepage); 89EXPORT_SYMBOL(jbd2_journal_invalidatepage);
90EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); 90EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
91EXPORT_SYMBOL(jbd2_journal_force_commit); 91EXPORT_SYMBOL(jbd2_journal_force_commit);
92EXPORT_SYMBOL(jbd2_journal_inode_add_write);
93EXPORT_SYMBOL(jbd2_journal_inode_add_wait);
94EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); 92EXPORT_SYMBOL(jbd2_journal_inode_ranged_write);
95EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); 93EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait);
96EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); 94EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index afc06daee5bb..bee8498d7792 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2622,18 +2622,6 @@ done:
2622 return 0; 2622 return 0;
2623} 2623}
2624 2624
2625int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode)
2626{
2627 return jbd2_journal_file_inode(handle, jinode,
2628 JI_WRITE_DATA | JI_WAIT_DATA, 0, LLONG_MAX);
2629}
2630
2631int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode)
2632{
2633 return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, 0,
2634 LLONG_MAX);
2635}
2636
2637int jbd2_journal_inode_ranged_write(handle_t *handle, 2625int jbd2_journal_inode_ranged_write(handle_t *handle,
2638 struct jbd2_inode *jinode, loff_t start_byte, loff_t length) 2626 struct jbd2_inode *jinode, loff_t start_byte, loff_t length)
2639{ 2627{
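The two wrappers deleted here were trivial special cases of the ranged API that ext4 and ocfs2 now call directly; the surviving entry points (matching the exports kept in journal.c above) are:

int jbd2_journal_inode_ranged_write(handle_t *handle,
		struct jbd2_inode *jinode, loff_t start_byte, loff_t length);
int jbd2_journal_inode_ranged_wait(handle_t *handle,
		struct jbd2_inode *jinode, loff_t start_byte, loff_t length);

/*
 * The removed jbd2_journal_inode_add_write(handle, jinode) was simply
 * jbd2_journal_inode_ranged_write(handle, jinode, 0, LLONG_MAX); callers
 * that know the affected byte range now pass it explicitly, which is what
 * the ocfs2 hunks below do via ocfs2_jbd2_inode_add_write().
 */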
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0c335b51043d..f9baefc76cf9 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -5993,6 +5993,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5993 struct buffer_head *data_alloc_bh = NULL; 5993 struct buffer_head *data_alloc_bh = NULL;
5994 struct ocfs2_dinode *di; 5994 struct ocfs2_dinode *di;
5995 struct ocfs2_truncate_log *tl; 5995 struct ocfs2_truncate_log *tl;
5996 struct ocfs2_journal *journal = osb->journal;
5996 5997
5997 BUG_ON(inode_trylock(tl_inode)); 5998 BUG_ON(inode_trylock(tl_inode));
5998 5999
@@ -6013,6 +6014,20 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
6013 goto out; 6014 goto out;
6014 } 6015 }
6015 6016
 6017 /* Appending to the truncate log (TA) and flushing the truncate log (TF)
 6018  * are two separate transactions. Both can be committed but not yet
 6019  * checkpointed. If a crash occurs, both are replayed: clusters that were
 6020  * already released to the global bitmap are released again, and replaying
 6021  * the truncate log then results in a cluster double free.
 6022  */
6023 jbd2_journal_lock_updates(journal->j_journal);
6024 status = jbd2_journal_flush(journal->j_journal);
6025 jbd2_journal_unlock_updates(journal->j_journal);
6026 if (status < 0) {
6027 mlog_errno(status);
6028 goto out;
6029 }
6030
6016 data_alloc_inode = ocfs2_get_system_file_inode(osb, 6031 data_alloc_inode = ocfs2_get_system_file_inode(osb,
6017 GLOBAL_BITMAP_SYSTEM_INODE, 6032 GLOBAL_BITMAP_SYSTEM_INODE,
6018 OCFS2_INVALID_SLOT); 6033 OCFS2_INVALID_SLOT);
@@ -6792,6 +6807,8 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6792 struct page *page, int zero, u64 *phys) 6807 struct page *page, int zero, u64 *phys)
6793{ 6808{
6794 int ret, partial = 0; 6809 int ret, partial = 0;
6810 loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from;
6811 loff_t length = to - from;
6795 6812
6796 ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); 6813 ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6797 if (ret) 6814 if (ret)
@@ -6811,7 +6828,8 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6811 if (ret < 0) 6828 if (ret < 0)
6812 mlog_errno(ret); 6829 mlog_errno(ret);
6813 else if (ocfs2_should_order_data(inode)) { 6830 else if (ocfs2_should_order_data(inode)) {
6814 ret = ocfs2_jbd2_file_inode(handle, inode); 6831 ret = ocfs2_jbd2_inode_add_write(handle, inode,
6832 start_byte, length);
6815 if (ret < 0) 6833 if (ret < 0)
6816 mlog_errno(ret); 6834 mlog_errno(ret);
6817 } 6835 }
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a4c905d6b575..8de1c9d644f6 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -942,7 +942,8 @@ static void ocfs2_write_failure(struct inode *inode,
942 942
943 if (tmppage && page_has_buffers(tmppage)) { 943 if (tmppage && page_has_buffers(tmppage)) {
944 if (ocfs2_should_order_data(inode)) 944 if (ocfs2_should_order_data(inode))
945 ocfs2_jbd2_file_inode(wc->w_handle, inode); 945 ocfs2_jbd2_inode_add_write(wc->w_handle, inode,
946 user_pos, user_len);
946 947
947 block_commit_write(tmppage, from, to); 948 block_commit_write(tmppage, from, to);
948 } 949 }
@@ -2023,8 +2024,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
2023 } 2024 }
2024 2025
2025 if (page_has_buffers(tmppage)) { 2026 if (page_has_buffers(tmppage)) {
2026 if (handle && ocfs2_should_order_data(inode)) 2027 if (handle && ocfs2_should_order_data(inode)) {
2027 ocfs2_jbd2_file_inode(handle, inode); 2028 loff_t start_byte =
2029 ((loff_t)tmppage->index << PAGE_SHIFT) +
2030 from;
2031 loff_t length = to - from;
2032 ocfs2_jbd2_inode_add_write(handle, inode,
2033 start_byte, length);
2034 }
2028 block_commit_write(tmppage, from, to); 2035 block_commit_write(tmppage, from, to);
2029 } 2036 }
2030 } 2037 }
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index 429e6a8359a5..eaf042feaf5e 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -231,14 +231,6 @@ static int blockcheck_u64_get(void *data, u64 *val)
231} 231}
232DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); 232DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n");
233 233
234static struct dentry *blockcheck_debugfs_create(const char *name,
235 struct dentry *parent,
236 u64 *value)
237{
238 return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value,
239 &blockcheck_fops);
240}
241
242static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) 234static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
243{ 235{
244 if (stats) { 236 if (stats) {
@@ -250,16 +242,20 @@ static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats)
250static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, 242static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
251 struct dentry *parent) 243 struct dentry *parent)
252{ 244{
253 stats->b_debug_dir = debugfs_create_dir("blockcheck", parent); 245 struct dentry *dir;
246
247 dir = debugfs_create_dir("blockcheck", parent);
248 stats->b_debug_dir = dir;
249
250 debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR, dir,
251 &stats->b_check_count, &blockcheck_fops);
254 252
255 blockcheck_debugfs_create("blocks_checked", stats->b_debug_dir, 253 debugfs_create_file("checksums_failed", S_IFREG | S_IRUSR, dir,
256 &stats->b_check_count); 254 &stats->b_failure_count, &blockcheck_fops);
257 255
258 blockcheck_debugfs_create("checksums_failed", stats->b_debug_dir, 256 debugfs_create_file("ecc_recoveries", S_IFREG | S_IRUSR, dir,
259 &stats->b_failure_count); 257 &stats->b_recover_count, &blockcheck_fops);
260 258
261 blockcheck_debugfs_create("ecc_recoveries", stats->b_debug_dir,
262 &stats->b_recover_count);
263} 259}
264#else 260#else
265static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, 261static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats,
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index f1b613327ac8..a368350d4c27 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -225,10 +225,6 @@ struct o2hb_region {
225 unsigned int hr_region_num; 225 unsigned int hr_region_num;
226 226
227 struct dentry *hr_debug_dir; 227 struct dentry *hr_debug_dir;
228 struct dentry *hr_debug_livenodes;
229 struct dentry *hr_debug_regnum;
230 struct dentry *hr_debug_elapsed_time;
231 struct dentry *hr_debug_pinned;
232 struct o2hb_debug_buf *hr_db_livenodes; 228 struct o2hb_debug_buf *hr_db_livenodes;
233 struct o2hb_debug_buf *hr_db_regnum; 229 struct o2hb_debug_buf *hr_db_regnum;
234 struct o2hb_debug_buf *hr_db_elapsed_time; 230 struct o2hb_debug_buf *hr_db_elapsed_time;
@@ -1394,21 +1390,20 @@ void o2hb_exit(void)
1394 kfree(o2hb_db_failedregions); 1390 kfree(o2hb_db_failedregions);
1395} 1391}
1396 1392
1397static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, 1393static void o2hb_debug_create(const char *name, struct dentry *dir,
1398 struct o2hb_debug_buf **db, int db_len, 1394 struct o2hb_debug_buf **db, int db_len, int type,
1399 int type, int size, int len, void *data) 1395 int size, int len, void *data)
1400{ 1396{
1401 *db = kmalloc(db_len, GFP_KERNEL); 1397 *db = kmalloc(db_len, GFP_KERNEL);
1402 if (!*db) 1398 if (!*db)
1403 return NULL; 1399 return;
1404 1400
1405 (*db)->db_type = type; 1401 (*db)->db_type = type;
1406 (*db)->db_size = size; 1402 (*db)->db_size = size;
1407 (*db)->db_len = len; 1403 (*db)->db_len = len;
1408 (*db)->db_data = data; 1404 (*db)->db_data = data;
1409 1405
1410 return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, 1406 debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, &o2hb_debug_fops);
1411 &o2hb_debug_fops);
1412} 1407}
1413 1408
1414static void o2hb_debug_init(void) 1409static void o2hb_debug_init(void)
@@ -1525,11 +1520,7 @@ static void o2hb_region_release(struct config_item *item)
1525 1520
1526 kfree(reg->hr_slots); 1521 kfree(reg->hr_slots);
1527 1522
1528 debugfs_remove(reg->hr_debug_livenodes); 1523 debugfs_remove_recursive(reg->hr_debug_dir);
1529 debugfs_remove(reg->hr_debug_regnum);
1530 debugfs_remove(reg->hr_debug_elapsed_time);
1531 debugfs_remove(reg->hr_debug_pinned);
1532 debugfs_remove(reg->hr_debug_dir);
1533 kfree(reg->hr_db_livenodes); 1524 kfree(reg->hr_db_livenodes);
1534 kfree(reg->hr_db_regnum); 1525 kfree(reg->hr_db_regnum);
1535 kfree(reg->hr_db_elapsed_time); 1526 kfree(reg->hr_db_elapsed_time);
@@ -1988,69 +1979,33 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group
1988 : NULL; 1979 : NULL;
1989} 1980}
1990 1981
1991static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) 1982static void o2hb_debug_region_init(struct o2hb_region *reg,
1983 struct dentry *parent)
1992{ 1984{
1993 int ret = -ENOMEM; 1985 struct dentry *dir;
1994 1986
1995 reg->hr_debug_dir = 1987 dir = debugfs_create_dir(config_item_name(&reg->hr_item), parent);
1996 debugfs_create_dir(config_item_name(&reg->hr_item), dir); 1988 reg->hr_debug_dir = dir;
1997 if (!reg->hr_debug_dir) {
1998 mlog_errno(ret);
1999 goto bail;
2000 }
2001 1989
2002 reg->hr_debug_livenodes = 1990 o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, &(reg->hr_db_livenodes),
2003 o2hb_debug_create(O2HB_DEBUG_LIVENODES, 1991 sizeof(*(reg->hr_db_livenodes)),
2004 reg->hr_debug_dir, 1992 O2HB_DB_TYPE_REGION_LIVENODES,
2005 &(reg->hr_db_livenodes), 1993 sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES,
2006 sizeof(*(reg->hr_db_livenodes)), 1994 reg);
2007 O2HB_DB_TYPE_REGION_LIVENODES,
2008 sizeof(reg->hr_live_node_bitmap),
2009 O2NM_MAX_NODES, reg);
2010 if (!reg->hr_debug_livenodes) {
2011 mlog_errno(ret);
2012 goto bail;
2013 }
2014 1995
2015 reg->hr_debug_regnum = 1996 o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, &(reg->hr_db_regnum),
2016 o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, 1997 sizeof(*(reg->hr_db_regnum)),
2017 reg->hr_debug_dir, 1998 O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg);
2018 &(reg->hr_db_regnum),
2019 sizeof(*(reg->hr_db_regnum)),
2020 O2HB_DB_TYPE_REGION_NUMBER,
2021 0, O2NM_MAX_NODES, reg);
2022 if (!reg->hr_debug_regnum) {
2023 mlog_errno(ret);
2024 goto bail;
2025 }
2026 1999
2027 reg->hr_debug_elapsed_time = 2000 o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir,
2028 o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, 2001 &(reg->hr_db_elapsed_time),
2029 reg->hr_debug_dir, 2002 sizeof(*(reg->hr_db_elapsed_time)),
2030 &(reg->hr_db_elapsed_time), 2003 O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg);
2031 sizeof(*(reg->hr_db_elapsed_time)),
2032 O2HB_DB_TYPE_REGION_ELAPSED_TIME,
2033 0, 0, reg);
2034 if (!reg->hr_debug_elapsed_time) {
2035 mlog_errno(ret);
2036 goto bail;
2037 }
2038 2004
2039 reg->hr_debug_pinned = 2005 o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, &(reg->hr_db_pinned),
2040 o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, 2006 sizeof(*(reg->hr_db_pinned)),
2041 reg->hr_debug_dir, 2007 O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg);
2042 &(reg->hr_db_pinned),
2043 sizeof(*(reg->hr_db_pinned)),
2044 O2HB_DB_TYPE_REGION_PINNED,
2045 0, 0, reg);
2046 if (!reg->hr_debug_pinned) {
2047 mlog_errno(ret);
2048 goto bail;
2049 }
2050 2008
2051 ret = 0;
2052bail:
2053 return ret;
2054} 2009}
2055 2010
2056static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, 2011static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group,
@@ -2106,11 +2061,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
2106 if (ret) 2061 if (ret)
2107 goto unregister_handler; 2062 goto unregister_handler;
2108 2063
2109 ret = o2hb_debug_region_init(reg, o2hb_debug_dir); 2064 o2hb_debug_region_init(reg, o2hb_debug_dir);
2110 if (ret) {
2111 config_item_put(&reg->hr_item);
2112 goto unregister_handler;
2113 }
2114 2065
2115 return &reg->hr_item; 2066 return &reg->hr_item;
2116 2067
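Since the per-file dentries are no longer stored in struct o2hb_region, teardown now relies on debugfs_remove_recursive() deleting the whole directory tree in one call. A minimal sketch of that teardown pattern with hypothetical names (not part of this diff):

    #include <linux/debugfs.h>

    static struct dentry *example_dir;    /* directory created at init time */

    static void example_debugfs_exit(void)
    {
            /* Removes the directory and every file beneath it; NULL and
             * error pointers are tolerated, so no checks are needed. */
            debugfs_remove_recursive(example_dir);
            example_dir = NULL;
    }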
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 784426dee56c..bdef72c0f099 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3636,7 +3636,7 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
3636 int i, j, num_used; 3636 int i, j, num_used;
3637 u32 major_hash; 3637 u32 major_hash;
3638 struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; 3638 struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf;
3639 struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list; 3639 struct ocfs2_dx_entry_list *orig_list, *tmp_list;
3640 struct ocfs2_dx_entry *dx_entry; 3640 struct ocfs2_dx_entry *dx_entry;
3641 3641
3642 tmp_list = &tmp_dx_leaf->dl_list; 3642 tmp_list = &tmp_dx_leaf->dl_list;
@@ -3645,7 +3645,6 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash,
3645 orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; 3645 orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data;
3646 orig_list = &orig_dx_leaf->dl_list; 3646 orig_list = &orig_dx_leaf->dl_list;
3647 new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; 3647 new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data;
3648 new_list = &new_dx_leaf->dl_list;
3649 3648
3650 num_used = le16_to_cpu(orig_list->de_num_used); 3649 num_used = le16_to_cpu(orig_list->de_num_used);
3651 3650
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 69a429b625cc..aaf24548b02a 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -142,7 +142,6 @@ struct dlm_ctxt
142 atomic_t res_tot_count; 142 atomic_t res_tot_count;
143 atomic_t res_cur_count; 143 atomic_t res_cur_count;
144 144
145 struct dlm_debug_ctxt *dlm_debug_ctxt;
146 struct dentry *dlm_debugfs_subroot; 145 struct dentry *dlm_debugfs_subroot;
147 146
148 /* NOTE: Next three are protected by dlm_domain_lock */ 147 /* NOTE: Next three are protected by dlm_domain_lock */
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index a4b58ba99927..4d0b452012b2 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -853,67 +853,34 @@ static const struct file_operations debug_state_fops = {
853/* files in subroot */ 853/* files in subroot */
854void dlm_debug_init(struct dlm_ctxt *dlm) 854void dlm_debug_init(struct dlm_ctxt *dlm)
855{ 855{
856 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
857
858 /* for dumping dlm_ctxt */ 856 /* for dumping dlm_ctxt */
859 dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, 857 debugfs_create_file(DLM_DEBUGFS_DLM_STATE, S_IFREG|S_IRUSR,
860 S_IFREG|S_IRUSR, 858 dlm->dlm_debugfs_subroot, dlm, &debug_state_fops);
861 dlm->dlm_debugfs_subroot,
862 dlm, &debug_state_fops);
863 859
864 /* for dumping lockres */ 860 /* for dumping lockres */
865 dc->debug_lockres_dentry = 861 debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, S_IFREG|S_IRUSR,
866 debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, 862 dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops);
867 S_IFREG|S_IRUSR,
868 dlm->dlm_debugfs_subroot,
869 dlm, &debug_lockres_fops);
870 863
871 /* for dumping mles */ 864 /* for dumping mles */
872 dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, 865 debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR,
873 S_IFREG|S_IRUSR, 866 dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops);
874 dlm->dlm_debugfs_subroot,
875 dlm, &debug_mle_fops);
876 867
877 /* for dumping lockres on the purge list */ 868 /* for dumping lockres on the purge list */
878 dc->debug_purgelist_dentry = 869 debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, S_IFREG|S_IRUSR,
879 debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, 870 dlm->dlm_debugfs_subroot, dlm,
880 S_IFREG|S_IRUSR, 871 &debug_purgelist_fops);
881 dlm->dlm_debugfs_subroot,
882 dlm, &debug_purgelist_fops);
883}
884
885void dlm_debug_shutdown(struct dlm_ctxt *dlm)
886{
887 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
888
889 if (dc) {
890 debugfs_remove(dc->debug_purgelist_dentry);
891 debugfs_remove(dc->debug_mle_dentry);
892 debugfs_remove(dc->debug_lockres_dentry);
893 debugfs_remove(dc->debug_state_dentry);
894 kfree(dc);
895 dc = NULL;
896 }
897} 872}
898 873
899/* subroot - domain dir */ 874/* subroot - domain dir */
900int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) 875void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
901{ 876{
902 dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt),
903 GFP_KERNEL);
904 if (!dlm->dlm_debug_ctxt) {
905 mlog_errno(-ENOMEM);
906 return -ENOMEM;
907 }
908
909 dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, 877 dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name,
910 dlm_debugfs_root); 878 dlm_debugfs_root);
911 return 0;
912} 879}
913 880
914void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) 881void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
915{ 882{
916 debugfs_remove(dlm->dlm_debugfs_subroot); 883 debugfs_remove_recursive(dlm->dlm_debugfs_subroot);
917} 884}
918 885
919/* debugfs root */ 886/* debugfs root */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 7d0c7c9013ce..f8fd8680a4b6 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -14,13 +14,6 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle);
14 14
15#ifdef CONFIG_DEBUG_FS 15#ifdef CONFIG_DEBUG_FS
16 16
17struct dlm_debug_ctxt {
18 struct dentry *debug_state_dentry;
19 struct dentry *debug_lockres_dentry;
20 struct dentry *debug_mle_dentry;
21 struct dentry *debug_purgelist_dentry;
22};
23
24struct debug_lockres { 17struct debug_lockres {
25 int dl_len; 18 int dl_len;
26 char *dl_buf; 19 char *dl_buf;
@@ -29,9 +22,8 @@ struct debug_lockres {
29}; 22};
30 23
31void dlm_debug_init(struct dlm_ctxt *dlm); 24void dlm_debug_init(struct dlm_ctxt *dlm);
32void dlm_debug_shutdown(struct dlm_ctxt *dlm);
33 25
34int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); 26void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
35void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); 27void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
36 28
37void dlm_create_debugfs_root(void); 29void dlm_create_debugfs_root(void);
@@ -42,12 +34,8 @@ void dlm_destroy_debugfs_root(void);
42static inline void dlm_debug_init(struct dlm_ctxt *dlm) 34static inline void dlm_debug_init(struct dlm_ctxt *dlm)
43{ 35{
44} 36}
45static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) 37static inline void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
46{
47}
48static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
49{ 38{
50 return 0;
51} 39}
52static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) 40static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
53{ 41{
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 7338b5d4647c..ee6f459f9770 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -387,7 +387,6 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
387static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) 387static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
388{ 388{
389 dlm_unregister_domain_handlers(dlm); 389 dlm_unregister_domain_handlers(dlm);
390 dlm_debug_shutdown(dlm);
391 dlm_complete_thread(dlm); 390 dlm_complete_thread(dlm);
392 dlm_complete_recovery_thread(dlm); 391 dlm_complete_recovery_thread(dlm);
393 dlm_destroy_dlm_worker(dlm); 392 dlm_destroy_dlm_worker(dlm);
@@ -1938,7 +1937,6 @@ bail:
1938 1937
1939 if (status) { 1938 if (status) {
1940 dlm_unregister_domain_handlers(dlm); 1939 dlm_unregister_domain_handlers(dlm);
1941 dlm_debug_shutdown(dlm);
1942 dlm_complete_thread(dlm); 1940 dlm_complete_thread(dlm);
1943 dlm_complete_recovery_thread(dlm); 1941 dlm_complete_recovery_thread(dlm);
1944 dlm_destroy_dlm_worker(dlm); 1942 dlm_destroy_dlm_worker(dlm);
@@ -1992,9 +1990,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1992 dlm->key = key; 1990 dlm->key = key;
1993 dlm->node_num = o2nm_this_node(); 1991 dlm->node_num = o2nm_this_node();
1994 1992
1995 ret = dlm_create_debugfs_subroot(dlm); 1993 dlm_create_debugfs_subroot(dlm);
1996 if (ret < 0)
1997 goto leave;
1998 1994
1999 spin_lock_init(&dlm->spinlock); 1995 spin_lock_init(&dlm->spinlock);
2000 spin_lock_init(&dlm->master_lock); 1996 spin_lock_init(&dlm->master_lock);
@@ -2056,6 +2052,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
2056 mlog(0, "context init: refcount %u\n", 2052 mlog(0, "context init: refcount %u\n",
2057 kref_read(&dlm->dlm_refs)); 2053 kref_read(&dlm->dlm_refs));
2058 2054
2055 ret = 0;
2059leave: 2056leave:
2060 if (ret < 0 && dlm) { 2057 if (ret < 0 && dlm) {
2061 if (dlm->master_hash) 2058 if (dlm->master_hash)
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index e78657742bd8..3883633e82eb 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -90,7 +90,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
90 enum dlm_status status; 90 enum dlm_status status;
91 int actions = 0; 91 int actions = 0;
92 int in_use; 92 int in_use;
93 u8 owner; 93 u8 owner;
94 int recovery_wait = 0;
94 95
95 mlog(0, "master_node = %d, valblk = %d\n", master_node, 96 mlog(0, "master_node = %d, valblk = %d\n", master_node,
96 flags & LKM_VALBLK); 97 flags & LKM_VALBLK);
@@ -193,9 +194,12 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
193 } 194 }
194 if (flags & LKM_CANCEL) 195 if (flags & LKM_CANCEL)
195 lock->cancel_pending = 0; 196 lock->cancel_pending = 0;
196 else 197 else {
197 lock->unlock_pending = 0; 198 if (!lock->unlock_pending)
198 199 recovery_wait = 1;
200 else
201 lock->unlock_pending = 0;
202 }
199 } 203 }
200 204
201 /* get an extra ref on lock. if we are just switching 205 /* get an extra ref on lock. if we are just switching
@@ -229,6 +233,17 @@ leave:
229 spin_unlock(&res->spinlock); 233 spin_unlock(&res->spinlock);
230 wake_up(&res->wq); 234 wake_up(&res->wq);
231 235
236 if (recovery_wait) {
237 spin_lock(&res->spinlock);
 238 /* An unlock request succeeds immediately once the owner dies,
 239 * and the lock has already been removed from the grant list.
 240 * We must wait for recovery to finish, or we miss the chance to
 241 * purge the resource, since removal is much faster than recovery.
242 */
243 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING);
244 spin_unlock(&res->spinlock);
245 }
246
232 /* let the caller's final dlm_lock_put handle the actual kfree */ 247 /* let the caller's final dlm_lock_put handle the actual kfree */
233 if (actions & DLM_UNLOCK_FREE_LOCK) { 248 if (actions & DLM_UNLOCK_FREE_LOCK) {
234 /* this should always be coupled with list removal */ 249 /* this should always be coupled with list removal */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 14207234fa3d..6e774c5ea13b 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2508,9 +2508,7 @@ bail:
2508 ocfs2_inode_unlock(inode, ex); 2508 ocfs2_inode_unlock(inode, ex);
2509 } 2509 }
2510 2510
2511 if (local_bh) 2511 brelse(local_bh);
2512 brelse(local_bh);
2513
2514 return status; 2512 return status;
2515} 2513}
2516 2514
@@ -2593,8 +2591,7 @@ int ocfs2_inode_lock_atime(struct inode *inode,
2593 *level = 1; 2591 *level = 1;
2594 if (ocfs2_should_update_atime(inode, vfsmnt)) 2592 if (ocfs2_should_update_atime(inode, vfsmnt))
2595 ocfs2_update_inode_atime(inode, bh); 2593 ocfs2_update_inode_atime(inode, bh);
2596 if (bh) 2594 brelse(bh);
2597 brelse(bh);
2598 } else 2595 } else
2599 *level = 0; 2596 *level = 0;
2600 2597
@@ -3012,8 +3009,6 @@ struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
3012 3009
3013 kref_init(&dlm_debug->d_refcnt); 3010 kref_init(&dlm_debug->d_refcnt);
3014 INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); 3011 INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
3015 dlm_debug->d_locking_state = NULL;
3016 dlm_debug->d_locking_filter = NULL;
3017 dlm_debug->d_filter_secs = 0; 3012 dlm_debug->d_filter_secs = 0;
3018out: 3013out:
3019 return dlm_debug; 3014 return dlm_debug;
@@ -3282,27 +3277,19 @@ static void ocfs2_dlm_init_debug(struct ocfs2_super *osb)
3282{ 3277{
3283 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; 3278 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
3284 3279
3285 dlm_debug->d_locking_state = debugfs_create_file("locking_state", 3280 debugfs_create_file("locking_state", S_IFREG|S_IRUSR,
3286 S_IFREG|S_IRUSR, 3281 osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops);
3287 osb->osb_debug_root,
3288 osb,
3289 &ocfs2_dlm_debug_fops);
3290 3282
3291 dlm_debug->d_locking_filter = debugfs_create_u32("locking_filter", 3283 debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root,
3292 0600, 3284 &dlm_debug->d_filter_secs);
3293 osb->osb_debug_root,
3294 &dlm_debug->d_filter_secs);
3295} 3285}
3296 3286
3297static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) 3287static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
3298{ 3288{
3299 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; 3289 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
3300 3290
3301 if (dlm_debug) { 3291 if (dlm_debug)
3302 debugfs_remove(dlm_debug->d_locking_state);
3303 debugfs_remove(dlm_debug->d_locking_filter);
3304 ocfs2_put_dlm_debug(dlm_debug); 3292 ocfs2_put_dlm_debug(dlm_debug);
3305 }
3306} 3293}
3307 3294
3308int ocfs2_dlm_init(struct ocfs2_super *osb) 3295int ocfs2_dlm_init(struct ocfs2_super *osb)
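The NULL checks removed above are safe to drop because brelse() is itself a no-op on a NULL buffer_head, so the guard was redundant at every call site. For example (hypothetical caller, not from this diff):

    #include <linux/buffer_head.h>

    static void example_put_bh(struct buffer_head *bh)
    {
            /* brelse() already checks for NULL internally. */
            brelse(bh);
    }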
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index e66a249fe07c..e3e2d1b2af51 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -590,8 +590,7 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
590 *extent_flags = rec->e_flags; 590 *extent_flags = rec->e_flags;
591 } 591 }
592out: 592out:
593 if (eb_bh) 593 brelse(eb_bh);
594 brelse(eb_bh);
595 return ret; 594 return ret;
596} 595}
597 596
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4435df3e5adb..2e982db3e1ae 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -706,7 +706,9 @@ leave:
706 * Thus, we need to explicitly order the zeroed pages. 706 * Thus, we need to explicitly order the zeroed pages.
707 */ 707 */
708static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, 708static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
709 struct buffer_head *di_bh) 709 struct buffer_head *di_bh,
710 loff_t start_byte,
711 loff_t length)
710{ 712{
711 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 713 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
712 handle_t *handle = NULL; 714 handle_t *handle = NULL;
@@ -722,7 +724,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode,
722 goto out; 724 goto out;
723 } 725 }
724 726
725 ret = ocfs2_jbd2_file_inode(handle, inode); 727 ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length);
726 if (ret < 0) { 728 if (ret < 0) {
727 mlog_errno(ret); 729 mlog_errno(ret);
728 goto out; 730 goto out;
@@ -761,7 +763,9 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
761 BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT)); 763 BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT));
762 BUG_ON(abs_from & (inode->i_blkbits - 1)); 764 BUG_ON(abs_from & (inode->i_blkbits - 1));
763 765
764 handle = ocfs2_zero_start_ordered_transaction(inode, di_bh); 766 handle = ocfs2_zero_start_ordered_transaction(inode, di_bh,
767 abs_from,
768 abs_to - abs_from);
765 if (IS_ERR(handle)) { 769 if (IS_ERR(handle)) {
766 ret = PTR_ERR(handle); 770 ret = PTR_ERR(handle);
767 goto out; 771 goto out;
@@ -2126,7 +2130,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2126 struct dentry *dentry = file->f_path.dentry; 2130 struct dentry *dentry = file->f_path.dentry;
2127 struct inode *inode = d_inode(dentry); 2131 struct inode *inode = d_inode(dentry);
2128 struct buffer_head *di_bh = NULL; 2132 struct buffer_head *di_bh = NULL;
2129 loff_t end;
2130 2133
2131 /* 2134 /*
2132 * We start with a read level meta lock and only jump to an ex 2135 * We start with a read level meta lock and only jump to an ex
@@ -2190,8 +2193,6 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
2190 } 2193 }
2191 } 2194 }
2192 2195
2193 end = pos + count;
2194
2195 ret = ocfs2_check_range_for_refcount(inode, pos, count); 2196 ret = ocfs2_check_range_for_refcount(inode, pos, count);
2196 if (ret == 1) { 2197 if (ret == 1) {
2197 ocfs2_inode_unlock(inode, meta_level); 2198 ocfs2_inode_unlock(inode, meta_level);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7ad9d6590818..7c9dfd50c1c1 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -534,7 +534,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
534 */ 534 */
535 mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != 535 mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) !=
536 !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), 536 !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE),
537 "Inode %llu: system file state is ambigous\n", 537 "Inode %llu: system file state is ambiguous\n",
538 (unsigned long long)args->fi_blkno); 538 (unsigned long long)args->fi_blkno);
539 539
540 if (S_ISCHR(le16_to_cpu(fe->i_mode)) || 540 if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index c0fe6ed08ab1..3103ba7f97a2 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -144,7 +144,6 @@ static inline void ocfs2_ci_set_new(struct ocfs2_super *osb,
144void ocfs2_orphan_scan_init(struct ocfs2_super *osb); 144void ocfs2_orphan_scan_init(struct ocfs2_super *osb);
145void ocfs2_orphan_scan_start(struct ocfs2_super *osb); 145void ocfs2_orphan_scan_start(struct ocfs2_super *osb);
146void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); 146void ocfs2_orphan_scan_stop(struct ocfs2_super *osb);
147void ocfs2_orphan_scan_exit(struct ocfs2_super *osb);
148 147
149void ocfs2_complete_recovery(struct work_struct *work); 148void ocfs2_complete_recovery(struct work_struct *work);
150void ocfs2_wait_for_recovery(struct ocfs2_super *osb); 149void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
@@ -232,8 +231,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
232 * ocfs2_journal_access_*() unless you intend to 231 * ocfs2_journal_access_*() unless you intend to
233 * manage the checksum by hand. 232 * manage the checksum by hand.
234 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. 233 * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
235 * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before 234 * ocfs2_jbd2_inode_add_write - Mark an inode with range so that its data goes
236 * the current handle commits. 235 * out before the current handle commits.
237 */ 236 */
238 237
239/* You must always start_trans with a number of buffs > 0, but it's 238/* You must always start_trans with a number of buffs > 0, but it's
@@ -441,7 +440,7 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir,
441 * previous dirblock update in the free list */ 440 * previous dirblock update in the free list */
442static inline int ocfs2_link_credits(struct super_block *sb) 441static inline int ocfs2_link_credits(struct super_block *sb)
443{ 442{
444 return 2*OCFS2_INODE_UPDATE_CREDITS + 4 + 443 return 2 * OCFS2_INODE_UPDATE_CREDITS + 4 +
445 ocfs2_quota_trans_credits(sb); 444 ocfs2_quota_trans_credits(sb);
446} 445}
447 446
@@ -575,37 +574,12 @@ static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb)
575 return ocfs2_extent_recs_per_gd(sb); 574 return ocfs2_extent_recs_per_gd(sb);
576} 575}
577 576
578static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, 577static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode,
579 unsigned int clusters_to_del, 578 loff_t start_byte, loff_t length)
580 struct ocfs2_dinode *fe,
581 struct ocfs2_extent_list *last_el)
582{ 579{
583 /* for dinode + all headers in this pass + update to next leaf */ 580 return jbd2_journal_inode_ranged_write(handle,
584 u16 next_free = le16_to_cpu(last_el->l_next_free_rec); 581 &OCFS2_I(inode)->ip_jinode,
585 u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); 582 start_byte, length);
586 int credits = 1 + tree_depth + 1;
587 int i;
588
589 i = next_free - 1;
590 BUG_ON(i < 0);
591
592 /* We may be deleting metadata blocks, so metadata alloc dinode +
593 one desc. block for each possible delete. */
594 if (tree_depth && next_free == 1 &&
595 ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
596 credits += 1 + tree_depth;
597
598 /* update to the truncate log. */
599 credits += OCFS2_TRUNCATE_LOG_UPDATE;
600
601 credits += ocfs2_quota_trans_credits(sb);
602
603 return credits;
604}
605
606static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode)
607{
608 return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode);
609} 583}
610 584
611static inline int ocfs2_begin_ordered_truncate(struct inode *inode, 585static inline int ocfs2_begin_ordered_truncate(struct inode *inode,
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 6f8e1c4fdb9c..8ea51cf27b97 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2486,7 +2486,6 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2486 struct inode *inode = NULL; 2486 struct inode *inode = NULL;
2487 struct inode *orphan_dir = NULL; 2487 struct inode *orphan_dir = NULL;
2488 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); 2488 struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
2489 struct ocfs2_dinode *di = NULL;
2490 handle_t *handle = NULL; 2489 handle_t *handle = NULL;
2491 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; 2490 char orphan_name[OCFS2_ORPHAN_NAMELEN + 1];
2492 struct buffer_head *parent_di_bh = NULL; 2491 struct buffer_head *parent_di_bh = NULL;
@@ -2552,7 +2551,6 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2552 goto leave; 2551 goto leave;
2553 } 2552 }
2554 2553
2555 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2556 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, 2554 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2557 &orphan_insert, orphan_dir, false); 2555 &orphan_insert, orphan_dir, false);
2558 if (status < 0) { 2556 if (status < 0) {
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index fddbbd60f434..9150cfa4df7d 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -223,8 +223,6 @@ struct ocfs2_orphan_scan {
223 223
224struct ocfs2_dlm_debug { 224struct ocfs2_dlm_debug {
225 struct kref d_refcnt; 225 struct kref d_refcnt;
226 struct dentry *d_locking_state;
227 struct dentry *d_locking_filter;
228 u32 d_filter_secs; 226 u32 d_filter_secs;
229 struct list_head d_lockres_tracking; 227 struct list_head d_lockres_tracking;
230}; 228};
@@ -401,7 +399,6 @@ struct ocfs2_super
401 struct ocfs2_dlm_debug *osb_dlm_debug; 399 struct ocfs2_dlm_debug *osb_dlm_debug;
402 400
403 struct dentry *osb_debug_root; 401 struct dentry *osb_debug_root;
404 struct dentry *osb_ctxt;
405 402
406 wait_queue_head_t recovery_event; 403 wait_queue_head_t recovery_event;
407 404
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8b2f39506648..c81e86c62380 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1080,10 +1080,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1080 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, 1080 osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
1081 ocfs2_debugfs_root); 1081 ocfs2_debugfs_root);
1082 1082
1083 osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, 1083 debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root,
1084 osb->osb_debug_root, 1084 osb, &ocfs2_osb_debug_fops);
1085 osb,
1086 &ocfs2_osb_debug_fops);
1087 1085
1088 if (ocfs2_meta_ecc(osb)) 1086 if (ocfs2_meta_ecc(osb))
1089 ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, 1087 ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats,
@@ -1861,8 +1859,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1861 1859
1862 kset_unregister(osb->osb_dev_kset); 1860 kset_unregister(osb->osb_dev_kset);
1863 1861
1864 debugfs_remove(osb->osb_ctxt);
1865
1866 /* Orphan scan should be stopped as early as possible */ 1862 /* Orphan scan should be stopped as early as possible */
1867 ocfs2_orphan_scan_stop(osb); 1863 ocfs2_orphan_scan_stop(osb);
1868 1864
@@ -1918,7 +1914,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1918 ocfs2_dlm_shutdown(osb, hangup_needed); 1914 ocfs2_dlm_shutdown(osb, hangup_needed);
1919 1915
1920 ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); 1916 ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
1921 debugfs_remove(osb->osb_debug_root); 1917 debugfs_remove_recursive(osb->osb_debug_root);
1922 1918
1923 if (hangup_needed) 1919 if (hangup_needed)
1924 ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); 1920 ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
diff --git a/fs/open.c b/fs/open.c
index a59abe3c669a..c60cd22cc052 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -818,6 +818,14 @@ static int do_dentry_open(struct file *f,
818 if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) 818 if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
819 return -EINVAL; 819 return -EINVAL;
820 } 820 }
821
822 /*
823 * XXX: Huge page cache doesn't support writing yet. Drop all page
824 * cache for this file before processing writes.
825 */
826 if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping))
827 truncate_pagecache(inode, 0);
828
821 return 0; 829 return 0;
822 830
823cleanup_all: 831cleanup_all:
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 465ea0153b2a..ac9247371871 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -8,7 +8,6 @@
8#include <linux/mmzone.h> 8#include <linux/mmzone.h>
9#include <linux/proc_fs.h> 9#include <linux/proc_fs.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/quicklist.h>
12#include <linux/seq_file.h> 11#include <linux/seq_file.h>
13#include <linux/swap.h> 12#include <linux/swap.h>
14#include <linux/vmstat.h> 13#include <linux/vmstat.h>
@@ -106,9 +105,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
106 global_zone_page_state(NR_KERNEL_STACK_KB)); 105 global_zone_page_state(NR_KERNEL_STACK_KB));
107 show_val_kb(m, "PageTables: ", 106 show_val_kb(m, "PageTables: ",
108 global_zone_page_state(NR_PAGETABLE)); 107 global_zone_page_state(NR_PAGETABLE));
109#ifdef CONFIG_QUICKLIST
110 show_val_kb(m, "Quicklists: ", quicklist_total_size());
111#endif
112 108
113 show_val_kb(m, "NFS_Unstable: ", 109 show_val_kb(m, "NFS_Unstable: ",
114 global_node_page_state(NR_UNSTABLE_NFS)); 110 global_node_page_state(NR_UNSTABLE_NFS));
@@ -136,6 +132,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
136 global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); 132 global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR);
137 show_val_kb(m, "ShmemPmdMapped: ", 133 show_val_kb(m, "ShmemPmdMapped: ",
138 global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); 134 global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR);
135 show_val_kb(m, "FileHugePages: ",
136 global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR);
137 show_val_kb(m, "FilePmdMapped: ",
138 global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR);
139#endif 139#endif
140 140
141#ifdef CONFIG_CMA 141#ifdef CONFIG_CMA
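The new FileHugePages and FilePmdMapped counters appear in /proc/meminfo alongside the existing shmem THP fields and can be read from userspace like any other entry; a small illustrative reader (assumes a kernel with this patch and CONFIG_TRANSPARENT_HUGEPAGE):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            char line[256];
            FILE *f = fopen("/proc/meminfo", "r");

            if (!f)
                    return 1;
            while (fgets(line, sizeof(line), f)) {
                    /* Print only the two fields added by this series. */
                    if (!strncmp(line, "FileHugePages:", 14) ||
                        !strncmp(line, "FilePmdMapped:", 14))
                            fputs(line, stdout);
            }
            fclose(f);
            return 0;
    }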
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index bf43d1d60059..9442631fd4af 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -417,6 +417,7 @@ struct mem_size_stats {
417 unsigned long lazyfree; 417 unsigned long lazyfree;
418 unsigned long anonymous_thp; 418 unsigned long anonymous_thp;
419 unsigned long shmem_thp; 419 unsigned long shmem_thp;
420 unsigned long file_thp;
420 unsigned long swap; 421 unsigned long swap;
421 unsigned long shared_hugetlb; 422 unsigned long shared_hugetlb;
422 unsigned long private_hugetlb; 423 unsigned long private_hugetlb;
@@ -461,7 +462,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
461static void smaps_account(struct mem_size_stats *mss, struct page *page, 462static void smaps_account(struct mem_size_stats *mss, struct page *page,
462 bool compound, bool young, bool dirty, bool locked) 463 bool compound, bool young, bool dirty, bool locked)
463{ 464{
464 int i, nr = compound ? 1 << compound_order(page) : 1; 465 int i, nr = compound ? compound_nr(page) : 1;
465 unsigned long size = nr * PAGE_SIZE; 466 unsigned long size = nr * PAGE_SIZE;
466 467
467 /* 468 /*
@@ -588,7 +589,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
588 else if (is_zone_device_page(page)) 589 else if (is_zone_device_page(page))
589 /* pass */; 590 /* pass */;
590 else 591 else
591 VM_BUG_ON_PAGE(1, page); 592 mss->file_thp += HPAGE_PMD_SIZE;
592 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); 593 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked);
593} 594}
594#else 595#else
@@ -809,6 +810,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss,
809 SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); 810 SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree);
810 SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); 811 SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp);
811 SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); 812 SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp);
813 SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp);
812 SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); 814 SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb);
813 seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", 815 seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ",
814 mss->private_hugetlb >> 10, 7); 816 mss->private_hugetlb >> 10, 7);
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index 8476175c07e7..6f8cc06ee44e 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -102,11 +102,6 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
102 __free_page(pte_page); 102 __free_page(pte_page);
103} 103}
104 104
105#else /* CONFIG_MMU */
106
107/* This is enough for a nommu architecture */
108#define check_pgt_cache() do { } while (0)
109
110#endif /* CONFIG_MMU */ 105#endif /* CONFIG_MMU */
111 106
112#endif /* __ASM_GENERIC_PGALLOC_H */ 107#endif /* __ASM_GENERIC_PGALLOC_H */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 75d9d68a6de7..818691846c90 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1002,9 +1002,8 @@ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
1002 * need this). If THP is not enabled, the pmd can't go away under the 1002 * need this). If THP is not enabled, the pmd can't go away under the
1003 * code even if MADV_DONTNEED runs, but if THP is enabled we need to 1003 * code even if MADV_DONTNEED runs, but if THP is enabled we need to
1004 * run a pmd_trans_unstable before walking the ptes after 1004 * run a pmd_trans_unstable before walking the ptes after
1005 * split_huge_page_pmd returns (because it may have run when the pmd 1005 * split_huge_pmd returns (because it may have run when the pmd become
1006 * become null, but then a page fault can map in a THP and not a 1006 * null, but then a page fault can map in a THP and not a regular page).
1007 * regular page).
1008 */ 1007 */
1009static inline int pmd_trans_unstable(pmd_t *pmd) 1008static inline int pmd_trans_unstable(pmd_t *pmd)
1010{ 1009{
@@ -1126,7 +1125,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
1126static inline void init_espfix_bsp(void) { } 1125static inline void init_espfix_bsp(void) { }
1127#endif 1126#endif
1128 1127
1129extern void __init pgd_cache_init(void); 1128extern void __init pgtable_cache_init(void);
1130 1129
1131#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED 1130#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
1132static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) 1131static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 9569e7c786d3..4b898cdbdf05 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -129,11 +129,8 @@ static inline bool compaction_failed(enum compact_result result)
129 return false; 129 return false;
130} 130}
131 131
132/* 132/* Compaction needs reclaim to be performed first, so it can continue. */
133 * Compaction has backed off for some reason. It might be throttling or 133static inline bool compaction_needs_reclaim(enum compact_result result)
134 * lock contention. Retrying is still worthwhile.
135 */
136static inline bool compaction_withdrawn(enum compact_result result)
137{ 134{
138 /* 135 /*
139 * Compaction backed off due to watermark checks for order-0 136 * Compaction backed off due to watermark checks for order-0
@@ -142,6 +139,16 @@ static inline bool compaction_withdrawn(enum compact_result result)
142 if (result == COMPACT_SKIPPED) 139 if (result == COMPACT_SKIPPED)
143 return true; 140 return true;
144 141
142 return false;
143}
144
145/*
146 * Compaction has backed off for some reason after doing some work or none
147 * at all. It might be throttling or lock contention. Retrying might be still
148 * worthwhile, but with a higher priority if allowed.
149 */
150static inline bool compaction_withdrawn(enum compact_result result)
151{
145 /* 152 /*
146 * If compaction is deferred for high-order allocations, it is 153 * If compaction is deferred for high-order allocations, it is
147 * because sync compaction recently failed. If this is the case 154 * because sync compaction recently failed. If this is the case
@@ -207,6 +214,11 @@ static inline bool compaction_failed(enum compact_result result)
207 return false; 214 return false;
208} 215}
209 216
217static inline bool compaction_needs_reclaim(enum compact_result result)
218{
219 return false;
220}
221
210static inline bool compaction_withdrawn(enum compact_result result) 222static inline bool compaction_withdrawn(enum compact_result result)
211{ 223{
212 return true; 224 return true;
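The split above lets callers distinguish "compaction needs reclaim to run first" (COMPACT_SKIPPED) from the other back-off reasons still covered by compaction_withdrawn(). A hypothetical caller shape, not the page allocator's actual retry code:

    #include <linux/compaction.h>

    static bool example_should_retry(enum compact_result result)
    {
            if (compaction_needs_reclaim(result))
                    return true;    /* reclaim some memory, then compact again */
            if (compaction_withdrawn(result))
                    return true;    /* retry, possibly at a higher priority */
            return false;
    }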
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 866268c2c6e3..b0c6b0d34d02 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -429,6 +429,7 @@ int pagecache_write_end(struct file *, struct address_space *mapping,
429 * @i_pages: Cached pages. 429 * @i_pages: Cached pages.
430 * @gfp_mask: Memory allocation flags to use for allocating pages. 430 * @gfp_mask: Memory allocation flags to use for allocating pages.
431 * @i_mmap_writable: Number of VM_SHARED mappings. 431 * @i_mmap_writable: Number of VM_SHARED mappings.
432 * @nr_thps: Number of THPs in the pagecache (non-shmem only).
432 * @i_mmap: Tree of private and shared mappings. 433 * @i_mmap: Tree of private and shared mappings.
433 * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. 434 * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
434 * @nrpages: Number of page entries, protected by the i_pages lock. 435 * @nrpages: Number of page entries, protected by the i_pages lock.
@@ -446,6 +447,10 @@ struct address_space {
446 struct xarray i_pages; 447 struct xarray i_pages;
447 gfp_t gfp_mask; 448 gfp_t gfp_mask;
448 atomic_t i_mmap_writable; 449 atomic_t i_mmap_writable;
450#ifdef CONFIG_READ_ONLY_THP_FOR_FS
451 /* number of thp, only for non-shmem files */
452 atomic_t nr_thps;
453#endif
449 struct rb_root_cached i_mmap; 454 struct rb_root_cached i_mmap;
450 struct rw_semaphore i_mmap_rwsem; 455 struct rw_semaphore i_mmap_rwsem;
451 unsigned long nrpages; 456 unsigned long nrpages;
@@ -2798,6 +2803,33 @@ static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
2798 return errseq_sample(&mapping->wb_err); 2803 return errseq_sample(&mapping->wb_err);
2799} 2804}
2800 2805
2806static inline int filemap_nr_thps(struct address_space *mapping)
2807{
2808#ifdef CONFIG_READ_ONLY_THP_FOR_FS
2809 return atomic_read(&mapping->nr_thps);
2810#else
2811 return 0;
2812#endif
2813}
2814
2815static inline void filemap_nr_thps_inc(struct address_space *mapping)
2816{
2817#ifdef CONFIG_READ_ONLY_THP_FOR_FS
2818 atomic_inc(&mapping->nr_thps);
2819#else
2820 WARN_ON_ONCE(1);
2821#endif
2822}
2823
2824static inline void filemap_nr_thps_dec(struct address_space *mapping)
2825{
2826#ifdef CONFIG_READ_ONLY_THP_FOR_FS
2827 atomic_dec(&mapping->nr_thps);
2828#else
2829 WARN_ON_ONCE(1);
2830#endif
2831}
2832
2801extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, 2833extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
2802 int datasync); 2834 int datasync);
2803extern int vfs_fsync(struct file *file, int datasync); 2835extern int vfs_fsync(struct file *file, int datasync);
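The nr_thps counter and its accessors give non-shmem filesystems a cheap way to record that huge pages are present in a file's page cache, which do_dentry_open() (earlier in this diff) consults before allowing writes. A hypothetical accounting helper, only meaningful with CONFIG_READ_ONLY_THP_FOR_FS:

    #include <linux/fs.h>

    /* Hypothetical: called after a THP has been added to a non-shmem
     * file's page cache (in this series the collapse path is expected
     * to do this accounting). */
    static void example_account_file_thp(struct address_space *mapping)
    {
            filemap_nr_thps_inc(mapping);
    }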
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 45ede62aa85b..61c9ffd89b05 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -267,6 +267,15 @@ static inline bool thp_migration_supported(void)
267 return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); 267 return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
268} 268}
269 269
270static inline struct list_head *page_deferred_list(struct page *page)
271{
272 /*
273 * Global or memcg deferred list in the second tail pages is
274 * occupied by compound_head.
275 */
276 return &page[2].deferred_list;
277}
278
270#else /* CONFIG_TRANSPARENT_HUGEPAGE */ 279#else /* CONFIG_TRANSPARENT_HUGEPAGE */
271#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) 280#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
272#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) 281#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index edfca4278319..53fc34f930d0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -454,7 +454,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
454static inline struct hstate *page_hstate(struct page *page) 454static inline struct hstate *page_hstate(struct page *page)
455{ 455{
456 VM_BUG_ON_PAGE(!PageHuge(page), page); 456 VM_BUG_ON_PAGE(!PageHuge(page), page);
457 return size_to_hstate(PAGE_SIZE << compound_order(page)); 457 return size_to_hstate(page_size(page));
458} 458}
459 459
460static inline unsigned hstate_index_to_shift(unsigned index) 460static inline unsigned hstate_index_to_shift(unsigned index)
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index df03825ad1a1..603fbc4e2f70 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1410,8 +1410,6 @@ extern int jbd2_journal_clear_err (journal_t *);
1410extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); 1410extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
1411extern int jbd2_journal_force_commit(journal_t *); 1411extern int jbd2_journal_force_commit(journal_t *);
1412extern int jbd2_journal_force_commit_nested(journal_t *); 1412extern int jbd2_journal_force_commit_nested(journal_t *);
1413extern int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode);
1414extern int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode);
1415extern int jbd2_journal_inode_ranged_write(handle_t *handle, 1413extern int jbd2_journal_inode_ranged_write(handle_t *handle,
1416 struct jbd2_inode *inode, loff_t start_byte, 1414 struct jbd2_inode *inode, loff_t start_byte,
1417 loff_t length); 1415 loff_t length);
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index 082d1d2a5216..bc45ea1efbf7 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -15,6 +15,14 @@ extern int __khugepaged_enter(struct mm_struct *mm);
15extern void __khugepaged_exit(struct mm_struct *mm); 15extern void __khugepaged_exit(struct mm_struct *mm);
16extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, 16extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
17 unsigned long vm_flags); 17 unsigned long vm_flags);
18#ifdef CONFIG_SHMEM
19extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr);
20#else
21static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
22 unsigned long addr)
23{
24}
25#endif
18 26
19#define khugepaged_enabled() \ 27#define khugepaged_enabled() \
20 (transparent_hugepage_flags & \ 28 (transparent_hugepage_flags & \
@@ -73,6 +81,10 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
73{ 81{
74 return 0; 82 return 0;
75} 83}
84static inline void collapse_pte_mapped_thp(struct mm_struct *mm,
85 unsigned long addr)
86{
87}
76#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 88#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
77 89
78#endif /* _LINUX_KHUGEPAGED_H */ 90#endif /* _LINUX_KHUGEPAGED_H */
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ad8f1a397ae4..9b60863429cc 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -128,9 +128,8 @@ struct mem_cgroup_per_node {
128 128
129 struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; 129 struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
130 130
131#ifdef CONFIG_MEMCG_KMEM
132 struct memcg_shrinker_map __rcu *shrinker_map; 131 struct memcg_shrinker_map __rcu *shrinker_map;
133#endif 132
134 struct rb_node tree_node; /* RB tree node */ 133 struct rb_node tree_node; /* RB tree node */
135 unsigned long usage_in_excess;/* Set to the value by which */ 134 unsigned long usage_in_excess;/* Set to the value by which */
136 /* the soft limit is exceeded*/ 135 /* the soft limit is exceeded*/
@@ -331,6 +330,10 @@ struct mem_cgroup {
331 struct list_head event_list; 330 struct list_head event_list;
332 spinlock_t event_list_lock; 331 spinlock_t event_list_lock;
333 332
333#ifdef CONFIG_TRANSPARENT_HUGEPAGE
334 struct deferred_split deferred_split_queue;
335#endif
336
334 struct mem_cgroup_per_node *nodeinfo[0]; 337 struct mem_cgroup_per_node *nodeinfo[0];
335 /* WARNING: nodeinfo must be the last member here */ 338 /* WARNING: nodeinfo must be the last member here */
336}; 339};
@@ -1311,6 +1314,11 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
1311 } while ((memcg = parent_mem_cgroup(memcg))); 1314 } while ((memcg = parent_mem_cgroup(memcg)));
1312 return false; 1315 return false;
1313} 1316}
1317
1318extern int memcg_expand_shrinker_maps(int new_id);
1319
1320extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
1321 int nid, int shrinker_id);
1314#else 1322#else
1315#define mem_cgroup_sockets_enabled 0 1323#define mem_cgroup_sockets_enabled 0
1316static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; 1324static inline void mem_cgroup_sk_alloc(struct sock *sk) { };
@@ -1319,6 +1327,11 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
1319{ 1327{
1320 return false; 1328 return false;
1321} 1329}
1330
1331static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
1332 int nid, int shrinker_id)
1333{
1334}
1322#endif 1335#endif
1323 1336
1324struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); 1337struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
@@ -1390,10 +1403,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
1390 return memcg ? memcg->kmemcg_id : -1; 1403 return memcg ? memcg->kmemcg_id : -1;
1391} 1404}
1392 1405
1393extern int memcg_expand_shrinker_maps(int new_id);
1394
1395extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
1396 int nid, int shrinker_id);
1397#else 1406#else
1398 1407
1399static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) 1408static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
@@ -1435,8 +1444,6 @@ static inline void memcg_put_cache_ids(void)
1435{ 1444{
1436} 1445}
1437 1446
1438static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg,
1439 int nid, int shrinker_id) { }
1440#endif /* CONFIG_MEMCG_KMEM */ 1447#endif /* CONFIG_MEMCG_KMEM */
1441 1448
1442#endif /* _LINUX_MEMCONTROL_H */ 1449#endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 02e633f3ede0..0ebb105eb261 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -25,7 +25,6 @@
25 25
26struct memory_block { 26struct memory_block {
27 unsigned long start_section_nr; 27 unsigned long start_section_nr;
28 unsigned long end_section_nr;
29 unsigned long state; /* serialized by the dev->lock */ 28 unsigned long state; /* serialized by the dev->lock */
30 int section_count; /* serialized by mem_sysfs_mutex */ 29 int section_count; /* serialized by mem_sysfs_mutex */
31 int online_type; /* for passing data to online routine */ 30 int online_type; /* for passing data to online routine */
@@ -80,9 +79,9 @@ struct mem_section;
80#define IPC_CALLBACK_PRI 10 79#define IPC_CALLBACK_PRI 10
81 80
82#ifndef CONFIG_MEMORY_HOTPLUG_SPARSE 81#ifndef CONFIG_MEMORY_HOTPLUG_SPARSE
83static inline int memory_dev_init(void) 82static inline void memory_dev_init(void)
84{ 83{
85 return 0; 84 return;
86} 85}
87static inline int register_memory_notifier(struct notifier_block *nb) 86static inline int register_memory_notifier(struct notifier_block *nb)
88{ 87{
@@ -113,7 +112,7 @@ extern int register_memory_isolate_notifier(struct notifier_block *nb);
113extern void unregister_memory_isolate_notifier(struct notifier_block *nb); 112extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
114int create_memory_block_devices(unsigned long start, unsigned long size); 113int create_memory_block_devices(unsigned long start, unsigned long size);
115void remove_memory_block_devices(unsigned long start, unsigned long size); 114void remove_memory_block_devices(unsigned long start, unsigned long size);
116extern int memory_dev_init(void); 115extern void memory_dev_init(void);
117extern int memory_notify(unsigned long val, void *v); 116extern int memory_notify(unsigned long val, void *v);
118extern int memory_isolate_notify(unsigned long val, void *v); 117extern int memory_isolate_notify(unsigned long val, void *v);
119extern struct memory_block *find_memory_block(struct mem_section *); 118extern struct memory_block *find_memory_block(struct mem_section *);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7cf955feb823..294a67b94147 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -805,6 +805,24 @@ static inline void set_compound_order(struct page *page, unsigned int order)
805 page[1].compound_order = order; 805 page[1].compound_order = order;
806} 806}
807 807
808/* Returns the number of pages in this potentially compound page. */
809static inline unsigned long compound_nr(struct page *page)
810{
811 return 1UL << compound_order(page);
812}
813
814/* Returns the number of bytes in this potentially compound page. */
815static inline unsigned long page_size(struct page *page)
816{
817 return PAGE_SIZE << compound_order(page);
818}
819
820/* Returns the number of bits needed for the number of bytes in a page */
821static inline unsigned int page_shift(struct page *page)
822{
823 return PAGE_SHIFT + compound_order(page);
824}
825
808void free_compound_page(struct page *page); 826void free_compound_page(struct page *page);
809 827
810#ifdef CONFIG_MMU 828#ifdef CONFIG_MMU
@@ -1057,8 +1075,9 @@ static inline void put_user_page(struct page *page)
1057 put_page(page); 1075 put_page(page);
1058} 1076}
1059 1077
1060void put_user_pages_dirty(struct page **pages, unsigned long npages); 1078void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
1061void put_user_pages_dirty_lock(struct page **pages, unsigned long npages); 1079 bool make_dirty);
1080
1062void put_user_pages(struct page **pages, unsigned long npages); 1081void put_user_pages(struct page **pages, unsigned long npages);
1063 1082
1064#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) 1083#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
@@ -1405,7 +1424,11 @@ extern void pagefault_out_of_memory(void);
1405 1424
1406extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); 1425extern void show_free_areas(unsigned int flags, nodemask_t *nodemask);
1407 1426
1427#ifdef CONFIG_MMU
1408extern bool can_do_mlock(void); 1428extern bool can_do_mlock(void);
1429#else
1430static inline bool can_do_mlock(void) { return false; }
1431#endif
1409extern int user_shm_lock(size_t, struct user_struct *); 1432extern int user_shm_lock(size_t, struct user_struct *);
1410extern void user_shm_unlock(size_t, struct user_struct *); 1433extern void user_shm_unlock(size_t, struct user_struct *);
1411 1434
@@ -2305,6 +2328,8 @@ extern int install_special_mapping(struct mm_struct *mm,
2305 unsigned long addr, unsigned long len, 2328 unsigned long addr, unsigned long len,
2306 unsigned long flags, struct page **pages); 2329 unsigned long flags, struct page **pages);
2307 2330
2331unsigned long randomize_stack_top(unsigned long stack_top);
2332
2308extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); 2333extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
2309 2334
2310extern unsigned long mmap_region(struct file *file, unsigned long addr, 2335extern unsigned long mmap_region(struct file *file, unsigned long addr,
@@ -2568,6 +2593,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
2568#define FOLL_COW 0x4000 /* internal GUP flag */ 2593#define FOLL_COW 0x4000 /* internal GUP flag */
2569#define FOLL_ANON 0x8000 /* don't do file mappings */ 2594#define FOLL_ANON 0x8000 /* don't do file mappings */
2570#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ 2595#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */
2596#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
2571 2597
2572/* 2598/*
2573 * NOTE on FOLL_LONGTERM: 2599 * NOTE on FOLL_LONGTERM:
@@ -2845,5 +2871,12 @@ void __init setup_nr_node_ids(void);
2845static inline void setup_nr_node_ids(void) {} 2871static inline void setup_nr_node_ids(void) {}
2846#endif 2872#endif
2847 2873
2874extern int memcmp_pages(struct page *page1, struct page *page2);
2875
2876static inline int pages_identical(struct page *page1, struct page *page2)
2877{
2878 return !memcmp_pages(page1, page2);
2879}
2880
2848#endif /* __KERNEL__ */ 2881#endif /* __KERNEL__ */
2849#endif /* _LINUX_MM_H */ 2882#endif /* _LINUX_MM_H */
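compound_nr(), page_size() and page_shift() added above are simple transforms of compound_order(); for a PMD-sized THP on a system with 4 KiB base pages (order 9), they relate as follows (illustrative arithmetic only):

    /* compound_order(page) == 9, PAGE_SIZE == 4096, PAGE_SHIFT == 12:
     *   compound_nr(page) == 1UL << 9   ==     512 pages
     *   page_size(page)   == 4096 << 9  == 2097152 bytes (2 MiB)
     *   page_shift(page)  == 12 + 9     ==      21 (2 MiB == 1UL << 21)
     */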
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0b739f360cec..5183e0d77dfa 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -138,6 +138,7 @@ struct page {
138 struct { /* Second tail page of compound page */ 138 struct { /* Second tail page of compound page */
139 unsigned long _compound_pad_1; /* compound_head */ 139 unsigned long _compound_pad_1; /* compound_head */
140 unsigned long _compound_pad_2; 140 unsigned long _compound_pad_2;
141 /* For both global and memcg */
141 struct list_head deferred_list; 142 struct list_head deferred_list;
142 }; 143 };
143 struct { /* Page table pages */ 144 struct { /* Page table pages */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 3f38c30d2f13..bda20282746b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -235,6 +235,8 @@ enum node_stat_item {
235 NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ 235 NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */
236 NR_SHMEM_THPS, 236 NR_SHMEM_THPS,
237 NR_SHMEM_PMDMAPPED, 237 NR_SHMEM_PMDMAPPED,
238 NR_FILE_THPS,
239 NR_FILE_PMDMAPPED,
238 NR_ANON_THPS, 240 NR_ANON_THPS,
239 NR_UNSTABLE_NFS, /* NFS unstable pages */ 241 NR_UNSTABLE_NFS, /* NFS unstable pages */
240 NR_VMSCAN_WRITE, 242 NR_VMSCAN_WRITE,
@@ -677,6 +679,14 @@ struct zonelist {
677extern struct page *mem_map; 679extern struct page *mem_map;
678#endif 680#endif
679 681
682#ifdef CONFIG_TRANSPARENT_HUGEPAGE
683struct deferred_split {
684 spinlock_t split_queue_lock;
685 struct list_head split_queue;
686 unsigned long split_queue_len;
687};
688#endif
689
680/* 690/*
681 * On NUMA machines, each NUMA node would have a pg_data_t to describe 691 * On NUMA machines, each NUMA node would have a pg_data_t to describe
682 * it's memory layout. On UMA machines there is a single pglist_data which 692 * it's memory layout. On UMA machines there is a single pglist_data which
@@ -756,9 +766,7 @@ typedef struct pglist_data {
756#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ 766#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
757 767
758#ifdef CONFIG_TRANSPARENT_HUGEPAGE 768#ifdef CONFIG_TRANSPARENT_HUGEPAGE
759 spinlock_t split_queue_lock; 769 struct deferred_split deferred_split_queue;
760 struct list_head split_queue;
761 unsigned long split_queue_len;
762#endif 770#endif
763 771
764 /* Fields commonly accessed by the page reclaim scanner */ 772 /* Fields commonly accessed by the page reclaim scanner */
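The mmzone.h hunk above folds the per-node THP split queue into the new struct deferred_split, so the same type can also back a per-memcg queue elsewhere in this series. A hedged sketch of touching the node-level instance, assuming CONFIG_TRANSPARENT_HUGEPAGE and using only the field names shown above; the function itself is illustrative:

#include <linux/mmzone.h>
#include <linux/spinlock.h>
#include <linux/printk.h>

static void report_deferred_split(struct pglist_data *pgdat)
{
	struct deferred_split *ds = &pgdat->deferred_split_queue;
	unsigned long flags;

	/* split_queue_len is protected by split_queue_lock */
	spin_lock_irqsave(&ds->split_queue_lock, flags);
	pr_info("node %d: %lu THPs queued for deferred split\n",
		pgdat->node_id, ds->split_queue_len);
	spin_unlock_irqrestore(&ds->split_queue_lock, flags);
}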
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index 09592951725c..682fd465df06 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -18,6 +18,7 @@ struct page_ext_operations {
18 18
19enum page_ext_flags { 19enum page_ext_flags {
20 PAGE_EXT_OWNER, 20 PAGE_EXT_OWNER,
21 PAGE_EXT_OWNER_ACTIVE,
21#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) 22#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT)
22 PAGE_EXT_YOUNG, 23 PAGE_EXT_YOUNG,
23 PAGE_EXT_IDLE, 24 PAGE_EXT_IDLE,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c7552459a15f..37a4d9e32cd3 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -333,6 +333,16 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
333 mapping_gfp_mask(mapping)); 333 mapping_gfp_mask(mapping));
334} 334}
335 335
336static inline struct page *find_subpage(struct page *page, pgoff_t offset)
337{
338 if (PageHuge(page))
339 return page;
340
341 VM_BUG_ON_PAGE(PageTail(page), page);
342
343 return page + (offset & (compound_nr(page) - 1));
344}
345
336struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); 346struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
337struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); 347struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
338unsigned find_get_entries(struct address_space *mapping, pgoff_t start, 348unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
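find_subpage() above underpins the filemap.c changes later in this patch: the page cache keeps one XArray entry per compound page, and lookups convert the head entry plus the requested index into the exact subpage. A simplified sketch of that pattern, assuming the caller already holds rcu_read_lock() and skipping the speculative-reference/reload dance the real lookups perform; lookup_subpage() is an illustrative name:

#include <linux/pagemap.h>
#include <linux/xarray.h>

static struct page *lookup_subpage(struct address_space *mapping, pgoff_t index)
{
	/* A THP occupies a multi-index entry, so xa_load() returns its head page */
	struct page *head = xa_load(&mapping->i_pages, index);

	if (!head || xa_is_value(head))	/* empty slot, or a shadow/swap entry */
		return NULL;

	return find_subpage(head, index);
}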
diff --git a/include/linux/quicklist.h b/include/linux/quicklist.h
deleted file mode 100644
index 034982c98c8b..000000000000
--- a/include/linux/quicklist.h
+++ /dev/null
@@ -1,94 +0,0 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef LINUX_QUICKLIST_H
3#define LINUX_QUICKLIST_H
4/*
5 * Fast allocations and disposal of pages. Pages must be in the condition
6 * as needed after allocation when they are freed. Per cpu lists of pages
7 * are kept that only contain node local pages.
8 *
9 * (C) 2007, SGI. Christoph Lameter <cl@linux.com>
10 */
11#include <linux/kernel.h>
12#include <linux/gfp.h>
13#include <linux/percpu.h>
14
15#ifdef CONFIG_QUICKLIST
16
17struct quicklist {
18 void *page;
19 int nr_pages;
20};
21
22DECLARE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
23
24/*
25 * The two key functions quicklist_alloc and quicklist_free are inline so
26 * that they may be custom compiled for the platform.
27 * Specifying a NULL ctor can remove constructor support. Specifying
28 * a constant quicklist allows the determination of the exact address
29 * in the per cpu area.
30 *
31 * The fast patch in quicklist_alloc touched only a per cpu cacheline and
32 * the first cacheline of the page itself. There is minmal overhead involved.
33 */
34static inline void *quicklist_alloc(int nr, gfp_t flags, void (*ctor)(void *))
35{
36 struct quicklist *q;
37 void **p = NULL;
38
39 q =&get_cpu_var(quicklist)[nr];
40 p = q->page;
41 if (likely(p)) {
42 q->page = p[0];
43 p[0] = NULL;
44 q->nr_pages--;
45 }
46 put_cpu_var(quicklist);
47 if (likely(p))
48 return p;
49
50 p = (void *)__get_free_page(flags | __GFP_ZERO);
51 if (ctor && p)
52 ctor(p);
53 return p;
54}
55
56static inline void __quicklist_free(int nr, void (*dtor)(void *), void *p,
57 struct page *page)
58{
59 struct quicklist *q;
60
61 q = &get_cpu_var(quicklist)[nr];
62 *(void **)p = q->page;
63 q->page = p;
64 q->nr_pages++;
65 put_cpu_var(quicklist);
66}
67
68static inline void quicklist_free(int nr, void (*dtor)(void *), void *pp)
69{
70 __quicklist_free(nr, dtor, pp, virt_to_page(pp));
71}
72
73static inline void quicklist_free_page(int nr, void (*dtor)(void *),
74 struct page *page)
75{
76 __quicklist_free(nr, dtor, page_address(page), page);
77}
78
79void quicklist_trim(int nr, void (*dtor)(void *),
80 unsigned long min_pages, unsigned long max_free);
81
82unsigned long quicklist_total_size(void);
83
84#else
85
86static inline unsigned long quicklist_total_size(void)
87{
88 return 0;
89}
90
91#endif
92
93#endif /* LINUX_QUICKLIST_H */
94
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 9443cafd1969..0f80123650e2 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -69,7 +69,7 @@ struct shrinker {
69 69
70 /* These are for internal use */ 70 /* These are for internal use */
71 struct list_head list; 71 struct list_head list;
72#ifdef CONFIG_MEMCG_KMEM 72#ifdef CONFIG_MEMCG
73 /* ID in shrinker_idr */ 73 /* ID in shrinker_idr */
74 int id; 74 int id;
75#endif 75#endif
@@ -81,6 +81,11 @@ struct shrinker {
81/* Flags */ 81/* Flags */
82#define SHRINKER_NUMA_AWARE (1 << 0) 82#define SHRINKER_NUMA_AWARE (1 << 0)
83#define SHRINKER_MEMCG_AWARE (1 << 1) 83#define SHRINKER_MEMCG_AWARE (1 << 1)
84/*
85 * It just makes sense when the shrinker is also MEMCG_AWARE for now,
86 * non-MEMCG_AWARE shrinker should not have this flag set.
87 */
88#define SHRINKER_NONSLAB (1 << 2)
84 89
85extern int prealloc_shrinker(struct shrinker *shrinker); 90extern int prealloc_shrinker(struct shrinker *shrinker);
86extern void register_shrinker_prepared(struct shrinker *shrinker); 91extern void register_shrinker_prepared(struct shrinker *shrinker);
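SHRINKER_NONSLAB above lets a memcg-aware shrinker track objects that are not slab-accounted (the deferred-split THP shrinker elsewhere in this series is the first user). A hypothetical registration sketch, assuming only the flags defined above; the demo_* names and empty callbacks are placeholders, not kernel APIs:

#include <linux/shrinker.h>

static unsigned long demo_count(struct shrinker *shrink,
				struct shrink_control *sc)
{
	return 0;		/* nothing to reclaim in this sketch */
}

static unsigned long demo_scan(struct shrinker *shrink,
			       struct shrink_control *sc)
{
	return SHRINK_STOP;
}

static struct shrinker demo_shrinker = {
	.count_objects	= demo_count,
	.scan_objects	= demo_scan,
	.seeks		= DEFAULT_SEEKS,
	.flags		= SHRINKER_MEMCG_AWARE | SHRINKER_NONSLAB,
};

/* registered at init time with register_shrinker(&demo_shrinker) */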
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 56c9c7eed34e..ab2b98ad76e1 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -595,68 +595,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
595 return __kmalloc_node(size, flags, node); 595 return __kmalloc_node(size, flags, node);
596} 596}
597 597
598struct memcg_cache_array {
599 struct rcu_head rcu;
600 struct kmem_cache *entries[0];
601};
602
603/*
604 * This is the main placeholder for memcg-related information in kmem caches.
605 * Both the root cache and the child caches will have it. For the root cache,
606 * this will hold a dynamically allocated array large enough to hold
607 * information about the currently limited memcgs in the system. To allow the
608 * array to be accessed without taking any locks, on relocation we free the old
609 * version only after a grace period.
610 *
611 * Root and child caches hold different metadata.
612 *
613 * @root_cache: Common to root and child caches. NULL for root, pointer to
614 * the root cache for children.
615 *
616 * The following fields are specific to root caches.
617 *
618 * @memcg_caches: kmemcg ID indexed table of child caches. This table is
619 * used to index child cachces during allocation and cleared
620 * early during shutdown.
621 *
622 * @root_caches_node: List node for slab_root_caches list.
623 *
624 * @children: List of all child caches. While the child caches are also
625 * reachable through @memcg_caches, a child cache remains on
626 * this list until it is actually destroyed.
627 *
628 * The following fields are specific to child caches.
629 *
630 * @memcg: Pointer to the memcg this cache belongs to.
631 *
632 * @children_node: List node for @root_cache->children list.
633 *
634 * @kmem_caches_node: List node for @memcg->kmem_caches list.
635 */
636struct memcg_cache_params {
637 struct kmem_cache *root_cache;
638 union {
639 struct {
640 struct memcg_cache_array __rcu *memcg_caches;
641 struct list_head __root_caches_node;
642 struct list_head children;
643 bool dying;
644 };
645 struct {
646 struct mem_cgroup *memcg;
647 struct list_head children_node;
648 struct list_head kmem_caches_node;
649 struct percpu_ref refcnt;
650
651 void (*work_fn)(struct kmem_cache *);
652 union {
653 struct rcu_head rcu_head;
654 struct work_struct work;
655 };
656 };
657 };
658};
659
660int memcg_update_all_caches(int num_memcgs); 598int memcg_update_all_caches(int num_memcgs);
661 599
662/** 600/**
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index dfa718ffdd4f..4e7809408073 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -53,15 +53,21 @@ struct vmap_area {
53 unsigned long va_start; 53 unsigned long va_start;
54 unsigned long va_end; 54 unsigned long va_end;
55 55
56 /*
57 * Largest available free size in subtree.
58 */
59 unsigned long subtree_max_size;
60 unsigned long flags;
61 struct rb_node rb_node; /* address sorted rbtree */ 56 struct rb_node rb_node; /* address sorted rbtree */
62 struct list_head list; /* address sorted list */ 57 struct list_head list; /* address sorted list */
63 struct llist_node purge_list; /* "lazy purge" list */ 58
64 struct vm_struct *vm; 59 /*
60 * The following three variables can be packed, because
61 * a vmap_area object is always one of the three states:
62 * 1) in "free" tree (root is vmap_area_root)
63 * 2) in "busy" tree (root is free_vmap_area_root)
64 * 3) in purge list (head is vmap_purge_list)
65 */
66 union {
67 unsigned long subtree_max_size; /* in "free" tree */
68 struct vm_struct *vm; /* in "busy" tree */
69 struct llist_node purge_list; /* in purge list */
70 };
65}; 71};
66 72
67/* 73/*
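The vmalloc.h rework above relies on the three states being mutually exclusive, so subtree_max_size, vm and purge_list can share one word and the separate flags word can be dropped. On a 64-bit build that saves roughly three words per object (about 88 bytes down to 64, counting 8 bytes per scalar or pointer, 24 for the rb_node and 16 for the list_head), which adds up because the kernel keeps one vmap_area per vmalloc allocation.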
diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 7238865e75b0..51bf43076165 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -46,6 +46,8 @@ const char *zpool_get_type(struct zpool *pool);
46 46
47void zpool_destroy_pool(struct zpool *pool); 47void zpool_destroy_pool(struct zpool *pool);
48 48
49bool zpool_malloc_support_movable(struct zpool *pool);
50
49int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, 51int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp,
50 unsigned long *handle); 52 unsigned long *handle);
51 53
@@ -90,6 +92,7 @@ struct zpool_driver {
90 struct zpool *zpool); 92 struct zpool *zpool);
91 void (*destroy)(void *pool); 93 void (*destroy)(void *pool);
92 94
95 bool malloc_support_movable;
93 int (*malloc)(void *pool, size_t size, gfp_t gfp, 96 int (*malloc)(void *pool, size_t size, gfp_t gfp,
94 unsigned long *handle); 97 unsigned long *handle);
95 void (*free)(void *pool, unsigned long handle); 98 void (*free)(void *pool, unsigned long handle);
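zpool_malloc_support_movable() above lets a zpool user ask whether the backing allocator (e.g. zsmalloc) can satisfy movable allocations. A sketch of the intended call pattern, modeled on the zswap change in this series; demo_zpool_store() and the specific GFP mask are illustrative assumptions:

#include <linux/zpool.h>
#include <linux/gfp.h>

static int demo_zpool_store(struct zpool *pool, size_t len,
			    unsigned long *handle)
{
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;

	/* Only request movable highmem pages if the driver can honour them */
	if (zpool_malloc_support_movable(pool))
		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;

	return zpool_malloc(pool, len, gfp, handle);
}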
diff --git a/init/main.c b/init/main.c
index 653693da8da6..208b8fa1808e 100644
--- a/init/main.c
+++ b/init/main.c
@@ -507,7 +507,7 @@ void __init __weak mem_encrypt_init(void) { }
507 507
508void __init __weak poking_init(void) { } 508void __init __weak poking_init(void) { }
509 509
510void __init __weak pgd_cache_init(void) { } 510void __init __weak pgtable_cache_init(void) { }
511 511
512bool initcall_debug; 512bool initcall_debug;
513core_param(initcall_debug, initcall_debug, bool, 0644); 513core_param(initcall_debug, initcall_debug, bool, 0644);
@@ -556,6 +556,7 @@ static void __init mm_init(void)
556 report_meminit(); 556 report_meminit();
557 mem_init(); 557 mem_init();
558 kmem_cache_init(); 558 kmem_cache_init();
559 kmemleak_init();
559 pgtable_init(); 560 pgtable_init();
560 debug_objects_mem_init(); 561 debug_objects_mem_init();
561 vmalloc_init(); 562 vmalloc_init();
@@ -564,7 +565,6 @@ static void __init mm_init(void)
564 init_espfix_bsp(); 565 init_espfix_bsp();
565 /* Should be run after espfix64 is set up. */ 566 /* Should be run after espfix64 is set up. */
566 pti_init(); 567 pti_init();
567 pgd_cache_init();
568} 568}
569 569
570void __init __weak arch_call_rest_init(void) 570void __init __weak arch_call_rest_init(void)
@@ -594,7 +594,6 @@ asmlinkage __visible void __init start_kernel(void)
594 page_address_init(); 594 page_address_init();
595 pr_notice("%s", linux_banner); 595 pr_notice("%s", linux_banner);
596 setup_arch(&command_line); 596 setup_arch(&command_line);
597 mm_init_cpumask(&init_mm);
598 setup_command_line(command_line); 597 setup_command_line(command_line);
599 setup_nr_cpu_ids(); 598 setup_nr_cpu_ids();
600 setup_per_cpu_areas(); 599 setup_per_cpu_areas();
@@ -740,7 +739,6 @@ asmlinkage __visible void __init start_kernel(void)
740 initrd_start = 0; 739 initrd_start = 0;
741 } 740 }
742#endif 741#endif
743 kmemleak_init();
744 setup_per_cpu_pageset(); 742 setup_per_cpu_pageset();
745 numa_policy_init(); 743 numa_policy_init();
746 acpi_early_init(); 744 acpi_early_init();
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 84fa00497c49..94d38a39d72e 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -26,6 +26,7 @@
26#include <linux/percpu-rwsem.h> 26#include <linux/percpu-rwsem.h>
27#include <linux/task_work.h> 27#include <linux/task_work.h>
28#include <linux/shmem_fs.h> 28#include <linux/shmem_fs.h>
29#include <linux/khugepaged.h>
29 30
30#include <linux/uprobes.h> 31#include <linux/uprobes.h>
31 32
@@ -143,17 +144,19 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
143 * 144 *
144 * @vma: vma that holds the pte pointing to page 145 * @vma: vma that holds the pte pointing to page
145 * @addr: address the old @page is mapped at 146 * @addr: address the old @page is mapped at
146 * @page: the cowed page we are replacing by kpage 147 * @old_page: the page we are replacing by new_page
147 * @kpage: the modified page we replace page by 148 * @new_page: the modified page we replace page by
148 * 149 *
149 * Returns 0 on success, -EFAULT on failure. 150 * If @new_page is NULL, only unmap @old_page.
151 *
152 * Returns 0 on success, negative error code otherwise.
150 */ 153 */
151static int __replace_page(struct vm_area_struct *vma, unsigned long addr, 154static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
152 struct page *old_page, struct page *new_page) 155 struct page *old_page, struct page *new_page)
153{ 156{
154 struct mm_struct *mm = vma->vm_mm; 157 struct mm_struct *mm = vma->vm_mm;
155 struct page_vma_mapped_walk pvmw = { 158 struct page_vma_mapped_walk pvmw = {
156 .page = old_page, 159 .page = compound_head(old_page),
157 .vma = vma, 160 .vma = vma,
158 .address = addr, 161 .address = addr,
159 }; 162 };
@@ -164,12 +167,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
164 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, 167 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
165 addr + PAGE_SIZE); 168 addr + PAGE_SIZE);
166 169
167 VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); 170 if (new_page) {
168 171 err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
169 err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, 172 &memcg, false);
170 false); 173 if (err)
171 if (err) 174 return err;
172 return err; 175 }
173 176
174 /* For try_to_free_swap() and munlock_vma_page() below */ 177 /* For try_to_free_swap() and munlock_vma_page() below */
175 lock_page(old_page); 178 lock_page(old_page);
@@ -177,15 +180,20 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
177 mmu_notifier_invalidate_range_start(&range); 180 mmu_notifier_invalidate_range_start(&range);
178 err = -EAGAIN; 181 err = -EAGAIN;
179 if (!page_vma_mapped_walk(&pvmw)) { 182 if (!page_vma_mapped_walk(&pvmw)) {
180 mem_cgroup_cancel_charge(new_page, memcg, false); 183 if (new_page)
184 mem_cgroup_cancel_charge(new_page, memcg, false);
181 goto unlock; 185 goto unlock;
182 } 186 }
183 VM_BUG_ON_PAGE(addr != pvmw.address, old_page); 187 VM_BUG_ON_PAGE(addr != pvmw.address, old_page);
184 188
185 get_page(new_page); 189 if (new_page) {
186 page_add_new_anon_rmap(new_page, vma, addr, false); 190 get_page(new_page);
187 mem_cgroup_commit_charge(new_page, memcg, false, false); 191 page_add_new_anon_rmap(new_page, vma, addr, false);
188 lru_cache_add_active_or_unevictable(new_page, vma); 192 mem_cgroup_commit_charge(new_page, memcg, false, false);
193 lru_cache_add_active_or_unevictable(new_page, vma);
194 } else
195 /* no new page, just dec_mm_counter for old_page */
196 dec_mm_counter(mm, MM_ANONPAGES);
189 197
190 if (!PageAnon(old_page)) { 198 if (!PageAnon(old_page)) {
191 dec_mm_counter(mm, mm_counter_file(old_page)); 199 dec_mm_counter(mm, mm_counter_file(old_page));
@@ -194,8 +202,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
194 202
195 flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); 203 flush_cache_page(vma, addr, pte_pfn(*pvmw.pte));
196 ptep_clear_flush_notify(vma, addr, pvmw.pte); 204 ptep_clear_flush_notify(vma, addr, pvmw.pte);
197 set_pte_at_notify(mm, addr, pvmw.pte, 205 if (new_page)
198 mk_pte(new_page, vma->vm_page_prot)); 206 set_pte_at_notify(mm, addr, pvmw.pte,
207 mk_pte(new_page, vma->vm_page_prot));
199 208
200 page_remove_rmap(old_page, false); 209 page_remove_rmap(old_page, false);
201 if (!page_mapped(old_page)) 210 if (!page_mapped(old_page))
@@ -464,6 +473,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
464 struct page *old_page, *new_page; 473 struct page *old_page, *new_page;
465 struct vm_area_struct *vma; 474 struct vm_area_struct *vma;
466 int ret, is_register, ref_ctr_updated = 0; 475 int ret, is_register, ref_ctr_updated = 0;
476 bool orig_page_huge = false;
467 477
468 is_register = is_swbp_insn(&opcode); 478 is_register = is_swbp_insn(&opcode);
469 uprobe = container_of(auprobe, struct uprobe, arch); 479 uprobe = container_of(auprobe, struct uprobe, arch);
@@ -471,7 +481,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
471retry: 481retry:
472 /* Read the page with vaddr into memory */ 482 /* Read the page with vaddr into memory */
473 ret = get_user_pages_remote(NULL, mm, vaddr, 1, 483 ret = get_user_pages_remote(NULL, mm, vaddr, 1,
474 FOLL_FORCE | FOLL_SPLIT, &old_page, &vma, NULL); 484 FOLL_FORCE | FOLL_SPLIT_PMD, &old_page, &vma, NULL);
475 if (ret <= 0) 485 if (ret <= 0)
476 return ret; 486 return ret;
477 487
@@ -488,6 +498,10 @@ retry:
488 ref_ctr_updated = 1; 498 ref_ctr_updated = 1;
489 } 499 }
490 500
501 ret = 0;
502 if (!is_register && !PageAnon(old_page))
503 goto put_old;
504
491 ret = anon_vma_prepare(vma); 505 ret = anon_vma_prepare(vma);
492 if (ret) 506 if (ret)
493 goto put_old; 507 goto put_old;
@@ -501,8 +515,33 @@ retry:
501 copy_highpage(new_page, old_page); 515 copy_highpage(new_page, old_page);
502 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); 516 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
503 517
518 if (!is_register) {
519 struct page *orig_page;
520 pgoff_t index;
521
522 VM_BUG_ON_PAGE(!PageAnon(old_page), old_page);
523
524 index = vaddr_to_offset(vma, vaddr & PAGE_MASK) >> PAGE_SHIFT;
525 orig_page = find_get_page(vma->vm_file->f_inode->i_mapping,
526 index);
527
528 if (orig_page) {
529 if (PageUptodate(orig_page) &&
530 pages_identical(new_page, orig_page)) {
531 /* let go new_page */
532 put_page(new_page);
533 new_page = NULL;
534
535 if (PageCompound(orig_page))
536 orig_page_huge = true;
537 }
538 put_page(orig_page);
539 }
540 }
541
504 ret = __replace_page(vma, vaddr, old_page, new_page); 542 ret = __replace_page(vma, vaddr, old_page, new_page);
505 put_page(new_page); 543 if (new_page)
544 put_page(new_page);
506put_old: 545put_old:
507 put_page(old_page); 546 put_page(old_page);
508 547
@@ -513,6 +552,10 @@ put_old:
513 if (ret && is_register && ref_ctr_updated) 552 if (ret && is_register && ref_ctr_updated)
514 update_ref_ctr(uprobe, mm, -1); 553 update_ref_ctr(uprobe, mm, -1);
515 554
555 /* try collapse pmd for compound page */
556 if (!ret && orig_page_huge)
557 collapse_pte_mapped_thp(mm, vaddr);
558
516 return ret; 559 return ret;
517} 560}
518 561
diff --git a/kernel/resource.c b/kernel/resource.c
index 74877e9d90ca..76036a41143b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -487,8 +487,8 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
487 while (start < end && 487 while (start < end &&
488 !find_next_iomem_res(start, end, flags, IORES_DESC_NONE, 488 !find_next_iomem_res(start, end, flags, IORES_DESC_NONE,
489 false, &res)) { 489 false, &res)) {
490 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; 490 pfn = PFN_UP(res.start);
491 end_pfn = (res.end + 1) >> PAGE_SHIFT; 491 end_pfn = PFN_DOWN(res.end + 1);
492 if (end_pfn > pfn) 492 if (end_pfn > pfn)
493 ret = (*func)(pfn, end_pfn - pfn, arg); 493 ret = (*func)(pfn, end_pfn - pfn, arg);
494 if (ret) 494 if (ret)
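The walk_system_ram_range() change is purely a readability cleanup: PFN_UP() and PFN_DOWN() from include/linux/pfn.h expand to the same shifts that were previously open-coded, so the computed pfn range is unchanged:

/* include/linux/pfn.h (for reference):
 *   #define PFN_UP(x)	(((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
 *   #define PFN_DOWN(x)	((x) >> PAGE_SHIFT)
 */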
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c892c6280c9f..8dad5aa600ea 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -238,7 +238,6 @@ static void do_idle(void)
238 tick_nohz_idle_enter(); 238 tick_nohz_idle_enter();
239 239
240 while (!need_resched()) { 240 while (!need_resched()) {
241 check_pgt_cache();
242 rmb(); 241 rmb();
243 242
244 local_irq_disable(); 243 local_irq_disable();
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 078950d9605b..00fcea236eba 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -264,7 +264,8 @@ extern struct ctl_table epoll_table[];
264extern struct ctl_table firmware_config_table[]; 264extern struct ctl_table firmware_config_table[];
265#endif 265#endif
266 266
267#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 267#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
268 defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
268int sysctl_legacy_va_layout; 269int sysctl_legacy_va_layout;
269#endif 270#endif
270 271
@@ -1573,7 +1574,8 @@ static struct ctl_table vm_table[] = {
1573 .proc_handler = proc_dointvec, 1574 .proc_handler = proc_dointvec,
1574 .extra1 = SYSCTL_ZERO, 1575 .extra1 = SYSCTL_ZERO,
1575 }, 1576 },
1576#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 1577#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
1578 defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
1577 { 1579 {
1578 .procname = "legacy_va_layout", 1580 .procname = "legacy_va_layout",
1579 .data = &sysctl_legacy_va_layout, 1581 .data = &sysctl_legacy_va_layout,
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e0e14780a13d..6b1b1703a646 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -576,17 +576,18 @@ config DEBUG_KMEMLEAK
576 In order to access the kmemleak file, debugfs needs to be 576 In order to access the kmemleak file, debugfs needs to be
577 mounted (usually at /sys/kernel/debug). 577 mounted (usually at /sys/kernel/debug).
578 578
579config DEBUG_KMEMLEAK_EARLY_LOG_SIZE 579config DEBUG_KMEMLEAK_MEM_POOL_SIZE
580 int "Maximum kmemleak early log entries" 580 int "Kmemleak memory pool size"
581 depends on DEBUG_KMEMLEAK 581 depends on DEBUG_KMEMLEAK
582 range 200 40000 582 range 200 1000000
583 default 400 583 default 16000
584 help 584 help
585 Kmemleak must track all the memory allocations to avoid 585 Kmemleak must track all the memory allocations to avoid
586 reporting false positives. Since memory may be allocated or 586 reporting false positives. Since memory may be allocated or
587 freed before kmemleak is initialised, an early log buffer is 587 freed before kmemleak is fully initialised, use a static pool
588 used to store these actions. If kmemleak reports "early log 588 of metadata objects to track such callbacks. After kmemleak is
589 buffer exceeded", please increase this value. 589 fully initialised, this memory pool acts as an emergency one
590 if slab allocations fail.
590 591
591config DEBUG_KMEMLEAK_TEST 592config DEBUG_KMEMLEAK_TEST
592 tristate "Simple test for the kernel memory leak detector" 593 tristate "Simple test for the kernel memory leak detector"
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 7fa97a8b5717..6c9682ce0254 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -134,6 +134,14 @@ config KASAN_S390_4_LEVEL_PAGING
134 to 3TB of RAM with KASan enabled). This options allows to force 134 to 3TB of RAM with KASan enabled). This options allows to force
135 4-level paging instead. 135 4-level paging instead.
136 136
137config KASAN_SW_TAGS_IDENTIFY
138 bool "Enable memory corruption identification"
139 depends on KASAN_SW_TAGS
140 help
141 This option enables best-effort identification of bug type
142 (use-after-free or out-of-bounds) at the cost of increased
143 memory consumption.
144
137config TEST_KASAN 145config TEST_KASAN
138 tristate "Module for testing KASAN for bug detection" 146 tristate "Module for testing KASAN for bug detection"
139 depends on m && KASAN 147 depends on m && KASAN
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f1e0569b4539..639d5e7014c1 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -878,7 +878,7 @@ static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
878 head = compound_head(page); 878 head = compound_head(page);
879 v += (page - head) << PAGE_SHIFT; 879 v += (page - head) << PAGE_SHIFT;
880 880
881 if (likely(n <= v && v <= (PAGE_SIZE << compound_order(head)))) 881 if (likely(n <= v && v <= (page_size(head))))
882 return true; 882 return true;
883 WARN_ON(1); 883 WARN_ON(1);
884 return false; 884 return false;
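page_size() here, like compound_nr() in the compaction and filemap hunks below, is one of the small helpers introduced earlier in this series (in include/linux/mm.h, outside this excerpt). Assuming those definitions, the replacements are equivalent rewrites rather than behavioural changes:

/* Helper semantics (paraphrased from the mm.h additions in this series):
 *   page_size(page)   == PAGE_SIZE << compound_order(page)
 *   compound_nr(page) == 1UL << compound_order(page)
 * i.e. the total byte size and the number of base pages of a
 * possibly-compound page.
 */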
diff --git a/lib/show_mem.c b/lib/show_mem.c
index 5c86ef4c899f..1c26c14ffbb9 100644
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -6,7 +6,6 @@
6 */ 6 */
7 7
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/quicklist.h>
10#include <linux/cma.h> 9#include <linux/cma.h>
11 10
12void show_mem(unsigned int filter, nodemask_t *nodemask) 11void show_mem(unsigned int filter, nodemask_t *nodemask)
@@ -39,10 +38,6 @@ void show_mem(unsigned int filter, nodemask_t *nodemask)
39#ifdef CONFIG_CMA 38#ifdef CONFIG_CMA
40 printk("%lu pages cma reserved\n", totalcma_pages); 39 printk("%lu pages cma reserved\n", totalcma_pages);
41#endif 40#endif
42#ifdef CONFIG_QUICKLIST
43 printk("%lu pages in pagetable cache\n",
44 quicklist_total_size());
45#endif
46#ifdef CONFIG_MEMORY_FAILURE 41#ifdef CONFIG_MEMORY_FAILURE
47 printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); 42 printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
48#endif 43#endif
diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index b63b367a94e8..49cc4d570a40 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -18,6 +18,9 @@
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/uaccess.h> 20#include <linux/uaccess.h>
21#include <linux/io.h>
22
23#include <asm/page.h>
21 24
22/* 25/*
23 * Note: test functions are marked noinline so that their names appear in 26 * Note: test functions are marked noinline so that their names appear in
@@ -337,6 +340,42 @@ static noinline void __init kmalloc_uaf2(void)
337 kfree(ptr2); 340 kfree(ptr2);
338} 341}
339 342
343static noinline void __init kfree_via_page(void)
344{
345 char *ptr;
346 size_t size = 8;
347 struct page *page;
348 unsigned long offset;
349
350 pr_info("invalid-free false positive (via page)\n");
351 ptr = kmalloc(size, GFP_KERNEL);
352 if (!ptr) {
353 pr_err("Allocation failed\n");
354 return;
355 }
356
357 page = virt_to_page(ptr);
358 offset = offset_in_page(ptr);
359 kfree(page_address(page) + offset);
360}
361
362static noinline void __init kfree_via_phys(void)
363{
364 char *ptr;
365 size_t size = 8;
366 phys_addr_t phys;
367
368 pr_info("invalid-free false positive (via phys)\n");
369 ptr = kmalloc(size, GFP_KERNEL);
370 if (!ptr) {
371 pr_err("Allocation failed\n");
372 return;
373 }
374
375 phys = virt_to_phys(ptr);
376 kfree(phys_to_virt(phys));
377}
378
340static noinline void __init kmem_cache_oob(void) 379static noinline void __init kmem_cache_oob(void)
341{ 380{
342 char *p; 381 char *p;
@@ -737,6 +776,8 @@ static int __init kmalloc_tests_init(void)
737 kmalloc_uaf(); 776 kmalloc_uaf();
738 kmalloc_uaf_memset(); 777 kmalloc_uaf_memset();
739 kmalloc_uaf2(); 778 kmalloc_uaf2();
779 kfree_via_page();
780 kfree_via_phys();
740 kmem_cache_oob(); 781 kmem_cache_oob();
741 memcg_accounted_kmem_cache(); 782 memcg_accounted_kmem_cache();
742 kasan_stack_oob(); 783 kasan_stack_oob();
diff --git a/mm/Kconfig b/mm/Kconfig
index 2fe4902ad755..a5dae9a7eb51 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -273,11 +273,6 @@ config BOUNCE
273 by default when ZONE_DMA or HIGHMEM is selected, but you 273 by default when ZONE_DMA or HIGHMEM is selected, but you
274 may say n to override this. 274 may say n to override this.
275 275
276config NR_QUICK
277 int
278 depends on QUICKLIST
279 default "1"
280
281config VIRT_TO_BUS 276config VIRT_TO_BUS
282 bool 277 bool
283 help 278 help
@@ -717,6 +712,17 @@ config GUP_BENCHMARK
717config GUP_GET_PTE_LOW_HIGH 712config GUP_GET_PTE_LOW_HIGH
718 bool 713 bool
719 714
715config READ_ONLY_THP_FOR_FS
716 bool "Read-only THP for filesystems (EXPERIMENTAL)"
717 depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM
718
719 help
720 Allow khugepaged to put read-only file-backed pages in THP.
721
722 This is marked experimental because it is a new feature. Write
723 support of file THPs will be developed in the next few release
724 cycles.
725
720config ARCH_HAS_PTE_SPECIAL 726config ARCH_HAS_PTE_SPECIAL
721 bool 727 bool
722 728
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 82b6a20898bd..327b3ebf23bf 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -21,7 +21,9 @@ config DEBUG_PAGEALLOC
21 Also, the state of page tracking structures is checked more often as 21 Also, the state of page tracking structures is checked more often as
22 pages are being allocated and freed, as unexpected state changes 22 pages are being allocated and freed, as unexpected state changes
23 often happen for same reasons as memory corruption (e.g. double free, 23 often happen for same reasons as memory corruption (e.g. double free,
24 use-after-free). 24 use-after-free). The error reports for these checks can be augmented
25 with stack traces of last allocation and freeing of the page, when
26 PAGE_OWNER is also selected and enabled on boot.
25 27
26 For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, 28 For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
27 fill the pages with poison patterns after free_pages() and verify 29 fill the pages with poison patterns after free_pages() and verify
diff --git a/mm/Makefile b/mm/Makefile
index d0b295c3b764..d996846697ef 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n
21KCOV_INSTRUMENT_mmzone.o := n 21KCOV_INSTRUMENT_mmzone.o := n
22KCOV_INSTRUMENT_vmstat.o := n 22KCOV_INSTRUMENT_vmstat.o := n
23 23
24CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
25CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)
26
24mmu-y := nommu.o 27mmu-y := nommu.o
25mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \ 28mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
26 mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \ 29 mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
@@ -72,7 +75,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
72obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 75obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
73obj-$(CONFIG_MEMTEST) += memtest.o 76obj-$(CONFIG_MEMTEST) += memtest.o
74obj-$(CONFIG_MIGRATION) += migrate.o 77obj-$(CONFIG_MIGRATION) += migrate.o
75obj-$(CONFIG_QUICKLIST) += quicklist.o
76obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o 78obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
77obj-$(CONFIG_PAGE_COUNTER) += page_counter.o 79obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
78obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o 80obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 952dc2fb24e5..ce08b39d85d4 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -969,7 +969,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
969 * is safe to read and it's 0 for tail pages. 969 * is safe to read and it's 0 for tail pages.
970 */ 970 */
971 if (unlikely(PageCompound(page))) { 971 if (unlikely(PageCompound(page))) {
972 low_pfn += (1UL << compound_order(page)) - 1; 972 low_pfn += compound_nr(page) - 1;
973 goto isolate_fail; 973 goto isolate_fail;
974 } 974 }
975 } 975 }
@@ -1737,8 +1737,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
1737 * starting at the block pointed to by the migrate scanner pfn within 1737 * starting at the block pointed to by the migrate scanner pfn within
1738 * compact_control. 1738 * compact_control.
1739 */ 1739 */
1740static isolate_migrate_t isolate_migratepages(struct zone *zone, 1740static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
1741 struct compact_control *cc)
1742{ 1741{
1743 unsigned long block_start_pfn; 1742 unsigned long block_start_pfn;
1744 unsigned long block_end_pfn; 1743 unsigned long block_end_pfn;
@@ -1756,8 +1755,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1756 */ 1755 */
1757 low_pfn = fast_find_migrateblock(cc); 1756 low_pfn = fast_find_migrateblock(cc);
1758 block_start_pfn = pageblock_start_pfn(low_pfn); 1757 block_start_pfn = pageblock_start_pfn(low_pfn);
1759 if (block_start_pfn < zone->zone_start_pfn) 1758 if (block_start_pfn < cc->zone->zone_start_pfn)
1760 block_start_pfn = zone->zone_start_pfn; 1759 block_start_pfn = cc->zone->zone_start_pfn;
1761 1760
1762 /* 1761 /*
1763 * fast_find_migrateblock marks a pageblock skipped so to avoid 1762 * fast_find_migrateblock marks a pageblock skipped so to avoid
@@ -1787,8 +1786,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1787 if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))) 1786 if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
1788 cond_resched(); 1787 cond_resched();
1789 1788
1790 page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, 1789 page = pageblock_pfn_to_page(block_start_pfn,
1791 zone); 1790 block_end_pfn, cc->zone);
1792 if (!page) 1791 if (!page)
1793 continue; 1792 continue;
1794 1793
@@ -2078,6 +2077,17 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
2078 const bool sync = cc->mode != MIGRATE_ASYNC; 2077 const bool sync = cc->mode != MIGRATE_ASYNC;
2079 bool update_cached; 2078 bool update_cached;
2080 2079
2080 /*
2081 * These counters track activities during zone compaction. Initialize
2082 * them before compacting a new zone.
2083 */
2084 cc->total_migrate_scanned = 0;
2085 cc->total_free_scanned = 0;
2086 cc->nr_migratepages = 0;
2087 cc->nr_freepages = 0;
2088 INIT_LIST_HEAD(&cc->freepages);
2089 INIT_LIST_HEAD(&cc->migratepages);
2090
2081 cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); 2091 cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
2082 ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, 2092 ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
2083 cc->classzone_idx); 2093 cc->classzone_idx);
@@ -2158,7 +2168,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
2158 cc->rescan = true; 2168 cc->rescan = true;
2159 } 2169 }
2160 2170
2161 switch (isolate_migratepages(cc->zone, cc)) { 2171 switch (isolate_migratepages(cc)) {
2162 case ISOLATE_ABORT: 2172 case ISOLATE_ABORT:
2163 ret = COMPACT_CONTENDED; 2173 ret = COMPACT_CONTENDED;
2164 putback_movable_pages(&cc->migratepages); 2174 putback_movable_pages(&cc->migratepages);
@@ -2281,10 +2291,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
2281{ 2291{
2282 enum compact_result ret; 2292 enum compact_result ret;
2283 struct compact_control cc = { 2293 struct compact_control cc = {
2284 .nr_freepages = 0,
2285 .nr_migratepages = 0,
2286 .total_migrate_scanned = 0,
2287 .total_free_scanned = 0,
2288 .order = order, 2294 .order = order,
2289 .search_order = order, 2295 .search_order = order,
2290 .gfp_mask = gfp_mask, 2296 .gfp_mask = gfp_mask,
@@ -2305,8 +2311,6 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
2305 2311
2306 if (capture) 2312 if (capture)
2307 current->capture_control = &capc; 2313 current->capture_control = &capc;
2308 INIT_LIST_HEAD(&cc.freepages);
2309 INIT_LIST_HEAD(&cc.migratepages);
2310 2314
2311 ret = compact_zone(&cc, &capc); 2315 ret = compact_zone(&cc, &capc);
2312 2316
@@ -2408,8 +2412,6 @@ static void compact_node(int nid)
2408 struct zone *zone; 2412 struct zone *zone;
2409 struct compact_control cc = { 2413 struct compact_control cc = {
2410 .order = -1, 2414 .order = -1,
2411 .total_migrate_scanned = 0,
2412 .total_free_scanned = 0,
2413 .mode = MIGRATE_SYNC, 2415 .mode = MIGRATE_SYNC,
2414 .ignore_skip_hint = true, 2416 .ignore_skip_hint = true,
2415 .whole_zone = true, 2417 .whole_zone = true,
@@ -2423,11 +2425,7 @@ static void compact_node(int nid)
2423 if (!populated_zone(zone)) 2425 if (!populated_zone(zone))
2424 continue; 2426 continue;
2425 2427
2426 cc.nr_freepages = 0;
2427 cc.nr_migratepages = 0;
2428 cc.zone = zone; 2428 cc.zone = zone;
2429 INIT_LIST_HEAD(&cc.freepages);
2430 INIT_LIST_HEAD(&cc.migratepages);
2431 2429
2432 compact_zone(&cc, NULL); 2430 compact_zone(&cc, NULL);
2433 2431
@@ -2529,8 +2527,6 @@ static void kcompactd_do_work(pg_data_t *pgdat)
2529 struct compact_control cc = { 2527 struct compact_control cc = {
2530 .order = pgdat->kcompactd_max_order, 2528 .order = pgdat->kcompactd_max_order,
2531 .search_order = pgdat->kcompactd_max_order, 2529 .search_order = pgdat->kcompactd_max_order,
2532 .total_migrate_scanned = 0,
2533 .total_free_scanned = 0,
2534 .classzone_idx = pgdat->kcompactd_classzone_idx, 2530 .classzone_idx = pgdat->kcompactd_classzone_idx,
2535 .mode = MIGRATE_SYNC_LIGHT, 2531 .mode = MIGRATE_SYNC_LIGHT,
2536 .ignore_skip_hint = false, 2532 .ignore_skip_hint = false,
@@ -2554,16 +2550,10 @@ static void kcompactd_do_work(pg_data_t *pgdat)
2554 COMPACT_CONTINUE) 2550 COMPACT_CONTINUE)
2555 continue; 2551 continue;
2556 2552
2557 cc.nr_freepages = 0;
2558 cc.nr_migratepages = 0;
2559 cc.total_migrate_scanned = 0;
2560 cc.total_free_scanned = 0;
2561 cc.zone = zone;
2562 INIT_LIST_HEAD(&cc.freepages);
2563 INIT_LIST_HEAD(&cc.migratepages);
2564
2565 if (kthread_should_stop()) 2553 if (kthread_should_stop())
2566 return; 2554 return;
2555
2556 cc.zone = zone;
2567 status = compact_zone(&cc, NULL); 2557 status = compact_zone(&cc, NULL);
2568 2558
2569 if (status == COMPACT_SUCCESS) { 2559 if (status == COMPACT_SUCCESS) {
diff --git a/mm/filemap.c b/mm/filemap.c
index 40667c2f3383..1146fcfa3215 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -126,7 +126,7 @@ static void page_cache_delete(struct address_space *mapping,
126 /* hugetlb pages are represented by a single entry in the xarray */ 126 /* hugetlb pages are represented by a single entry in the xarray */
127 if (!PageHuge(page)) { 127 if (!PageHuge(page)) {
128 xas_set_order(&xas, page->index, compound_order(page)); 128 xas_set_order(&xas, page->index, compound_order(page));
129 nr = 1U << compound_order(page); 129 nr = compound_nr(page);
130 } 130 }
131 131
132 VM_BUG_ON_PAGE(!PageLocked(page), page); 132 VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -203,8 +203,9 @@ static void unaccount_page_cache_page(struct address_space *mapping,
203 __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); 203 __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
204 if (PageTransHuge(page)) 204 if (PageTransHuge(page))
205 __dec_node_page_state(page, NR_SHMEM_THPS); 205 __dec_node_page_state(page, NR_SHMEM_THPS);
206 } else { 206 } else if (PageTransHuge(page)) {
207 VM_BUG_ON_PAGE(PageTransHuge(page), page); 207 __dec_node_page_state(page, NR_FILE_THPS);
208 filemap_nr_thps_dec(mapping);
208 } 209 }
209 210
210 /* 211 /*
@@ -281,11 +282,11 @@ EXPORT_SYMBOL(delete_from_page_cache);
281 * @pvec: pagevec with pages to delete 282 * @pvec: pagevec with pages to delete
282 * 283 *
283 * The function walks over mapping->i_pages and removes pages passed in @pvec 284 * The function walks over mapping->i_pages and removes pages passed in @pvec
284 * from the mapping. The function expects @pvec to be sorted by page index. 285 * from the mapping. The function expects @pvec to be sorted by page index
286 * and is optimised for it to be dense.
285 * It tolerates holes in @pvec (mapping entries at those indices are not 287 * It tolerates holes in @pvec (mapping entries at those indices are not
286 * modified). The function expects only THP head pages to be present in the 288 * modified). The function expects only THP head pages to be present in the
287 * @pvec and takes care to delete all corresponding tail pages from the 289 * @pvec.
288 * mapping as well.
289 * 290 *
290 * The function expects the i_pages lock to be held. 291 * The function expects the i_pages lock to be held.
291 */ 292 */
@@ -294,40 +295,43 @@ static void page_cache_delete_batch(struct address_space *mapping,
294{ 295{
295 XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); 296 XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
296 int total_pages = 0; 297 int total_pages = 0;
297 int i = 0, tail_pages = 0; 298 int i = 0;
298 struct page *page; 299 struct page *page;
299 300
300 mapping_set_update(&xas, mapping); 301 mapping_set_update(&xas, mapping);
301 xas_for_each(&xas, page, ULONG_MAX) { 302 xas_for_each(&xas, page, ULONG_MAX) {
302 if (i >= pagevec_count(pvec) && !tail_pages) 303 if (i >= pagevec_count(pvec))
303 break; 304 break;
305
306 /* A swap/dax/shadow entry got inserted? Skip it. */
304 if (xa_is_value(page)) 307 if (xa_is_value(page))
305 continue; 308 continue;
306 if (!tail_pages) { 309 /*
307 /* 310 * A page got inserted in our range? Skip it. We have our
308 * Some page got inserted in our range? Skip it. We 311 * pages locked so they are protected from being removed.
309 * have our pages locked so they are protected from 312 * If we see a page whose index is higher than ours, it
310 * being removed. 313 * means our page has been removed, which shouldn't be
311 */ 314 * possible because we're holding the PageLock.
312 if (page != pvec->pages[i]) { 315 */
313 VM_BUG_ON_PAGE(page->index > 316 if (page != pvec->pages[i]) {
314 pvec->pages[i]->index, page); 317 VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
315 continue; 318 page);
316 } 319 continue;
317 WARN_ON_ONCE(!PageLocked(page)); 320 }
318 if (PageTransHuge(page) && !PageHuge(page)) 321
319 tail_pages = HPAGE_PMD_NR - 1; 322 WARN_ON_ONCE(!PageLocked(page));
323
324 if (page->index == xas.xa_index)
320 page->mapping = NULL; 325 page->mapping = NULL;
321 /* 326 /* Leave page->index set: truncation lookup relies on it */
322 * Leave page->index set: truncation lookup relies 327
323 * upon it 328 /*
324 */ 329 * Move to the next page in the vector if this is a regular
330 * page or the index is of the last sub-page of this compound
331 * page.
332 */
333 if (page->index + compound_nr(page) - 1 == xas.xa_index)
325 i++; 334 i++;
326 } else {
327 VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages
328 != pvec->pages[i]->index, page);
329 tail_pages--;
330 }
331 xas_store(&xas, NULL); 335 xas_store(&xas, NULL);
332 total_pages++; 336 total_pages++;
333 } 337 }
@@ -408,7 +412,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
408 .range_end = end, 412 .range_end = end,
409 }; 413 };
410 414
411 if (!mapping_cap_writeback_dirty(mapping)) 415 if (!mapping_cap_writeback_dirty(mapping) ||
416 !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
412 return 0; 417 return 0;
413 418
414 wbc_attach_fdatawrite_inode(&wbc, mapping->host); 419 wbc_attach_fdatawrite_inode(&wbc, mapping->host);
@@ -617,10 +622,13 @@ int filemap_fdatawait_keep_errors(struct address_space *mapping)
617} 622}
618EXPORT_SYMBOL(filemap_fdatawait_keep_errors); 623EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
619 624
625/* Returns true if writeback might be needed or already in progress. */
620static bool mapping_needs_writeback(struct address_space *mapping) 626static bool mapping_needs_writeback(struct address_space *mapping)
621{ 627{
622 return (!dax_mapping(mapping) && mapping->nrpages) || 628 if (dax_mapping(mapping))
623 (dax_mapping(mapping) && mapping->nrexceptional); 629 return mapping->nrexceptional;
630
631 return mapping->nrpages;
624} 632}
625 633
626int filemap_write_and_wait(struct address_space *mapping) 634int filemap_write_and_wait(struct address_space *mapping)
@@ -1516,7 +1524,7 @@ EXPORT_SYMBOL(page_cache_prev_miss);
1516struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) 1524struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
1517{ 1525{
1518 XA_STATE(xas, &mapping->i_pages, offset); 1526 XA_STATE(xas, &mapping->i_pages, offset);
1519 struct page *head, *page; 1527 struct page *page;
1520 1528
1521 rcu_read_lock(); 1529 rcu_read_lock();
1522repeat: 1530repeat:
@@ -1531,25 +1539,19 @@ repeat:
1531 if (!page || xa_is_value(page)) 1539 if (!page || xa_is_value(page))
1532 goto out; 1540 goto out;
1533 1541
1534 head = compound_head(page); 1542 if (!page_cache_get_speculative(page))
1535 if (!page_cache_get_speculative(head))
1536 goto repeat; 1543 goto repeat;
1537 1544
1538 /* The page was split under us? */
1539 if (compound_head(page) != head) {
1540 put_page(head);
1541 goto repeat;
1542 }
1543
1544 /* 1545 /*
1545 * Has the page moved? 1546 * Has the page moved or been split?
1546 * This is part of the lockless pagecache protocol. See 1547 * This is part of the lockless pagecache protocol. See
1547 * include/linux/pagemap.h for details. 1548 * include/linux/pagemap.h for details.
1548 */ 1549 */
1549 if (unlikely(page != xas_reload(&xas))) { 1550 if (unlikely(page != xas_reload(&xas))) {
1550 put_page(head); 1551 put_page(page);
1551 goto repeat; 1552 goto repeat;
1552 } 1553 }
1554 page = find_subpage(page, offset);
1553out: 1555out:
1554 rcu_read_unlock(); 1556 rcu_read_unlock();
1555 1557
@@ -1646,7 +1648,7 @@ repeat:
1646 } 1648 }
1647 1649
1648 /* Has the page been truncated? */ 1650 /* Has the page been truncated? */
1649 if (unlikely(page->mapping != mapping)) { 1651 if (unlikely(compound_head(page)->mapping != mapping)) {
1650 unlock_page(page); 1652 unlock_page(page);
1651 put_page(page); 1653 put_page(page);
1652 goto repeat; 1654 goto repeat;
@@ -1731,7 +1733,6 @@ unsigned find_get_entries(struct address_space *mapping,
1731 1733
1732 rcu_read_lock(); 1734 rcu_read_lock();
1733 xas_for_each(&xas, page, ULONG_MAX) { 1735 xas_for_each(&xas, page, ULONG_MAX) {
1734 struct page *head;
1735 if (xas_retry(&xas, page)) 1736 if (xas_retry(&xas, page))
1736 continue; 1737 continue;
1737 /* 1738 /*
@@ -1742,17 +1743,13 @@ unsigned find_get_entries(struct address_space *mapping,
1742 if (xa_is_value(page)) 1743 if (xa_is_value(page))
1743 goto export; 1744 goto export;
1744 1745
1745 head = compound_head(page); 1746 if (!page_cache_get_speculative(page))
1746 if (!page_cache_get_speculative(head))
1747 goto retry; 1747 goto retry;
1748 1748
1749 /* The page was split under us? */ 1749 /* Has the page moved or been split? */
1750 if (compound_head(page) != head)
1751 goto put_page;
1752
1753 /* Has the page moved? */
1754 if (unlikely(page != xas_reload(&xas))) 1750 if (unlikely(page != xas_reload(&xas)))
1755 goto put_page; 1751 goto put_page;
1752 page = find_subpage(page, xas.xa_index);
1756 1753
1757export: 1754export:
1758 indices[ret] = xas.xa_index; 1755 indices[ret] = xas.xa_index;
@@ -1761,7 +1758,7 @@ export:
1761 break; 1758 break;
1762 continue; 1759 continue;
1763put_page: 1760put_page:
1764 put_page(head); 1761 put_page(page);
1765retry: 1762retry:
1766 xas_reset(&xas); 1763 xas_reset(&xas);
1767 } 1764 }
@@ -1803,33 +1800,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
1803 1800
1804 rcu_read_lock(); 1801 rcu_read_lock();
1805 xas_for_each(&xas, page, end) { 1802 xas_for_each(&xas, page, end) {
1806 struct page *head;
1807 if (xas_retry(&xas, page)) 1803 if (xas_retry(&xas, page))
1808 continue; 1804 continue;
1809 /* Skip over shadow, swap and DAX entries */ 1805 /* Skip over shadow, swap and DAX entries */
1810 if (xa_is_value(page)) 1806 if (xa_is_value(page))
1811 continue; 1807 continue;
1812 1808
1813 head = compound_head(page); 1809 if (!page_cache_get_speculative(page))
1814 if (!page_cache_get_speculative(head))
1815 goto retry; 1810 goto retry;
1816 1811
1817 /* The page was split under us? */ 1812 /* Has the page moved or been split? */
1818 if (compound_head(page) != head)
1819 goto put_page;
1820
1821 /* Has the page moved? */
1822 if (unlikely(page != xas_reload(&xas))) 1813 if (unlikely(page != xas_reload(&xas)))
1823 goto put_page; 1814 goto put_page;
1824 1815
1825 pages[ret] = page; 1816 pages[ret] = find_subpage(page, xas.xa_index);
1826 if (++ret == nr_pages) { 1817 if (++ret == nr_pages) {
1827 *start = xas.xa_index + 1; 1818 *start = xas.xa_index + 1;
1828 goto out; 1819 goto out;
1829 } 1820 }
1830 continue; 1821 continue;
1831put_page: 1822put_page:
1832 put_page(head); 1823 put_page(page);
1833retry: 1824retry:
1834 xas_reset(&xas); 1825 xas_reset(&xas);
1835 } 1826 }
@@ -1874,7 +1865,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
1874 1865
1875 rcu_read_lock(); 1866 rcu_read_lock();
1876 for (page = xas_load(&xas); page; page = xas_next(&xas)) { 1867 for (page = xas_load(&xas); page; page = xas_next(&xas)) {
1877 struct page *head;
1878 if (xas_retry(&xas, page)) 1868 if (xas_retry(&xas, page))
1879 continue; 1869 continue;
1880 /* 1870 /*
@@ -1884,24 +1874,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
1884 if (xa_is_value(page)) 1874 if (xa_is_value(page))
1885 break; 1875 break;
1886 1876
1887 head = compound_head(page); 1877 if (!page_cache_get_speculative(page))
1888 if (!page_cache_get_speculative(head))
1889 goto retry; 1878 goto retry;
1890 1879
1891 /* The page was split under us? */ 1880 /* Has the page moved or been split? */
1892 if (compound_head(page) != head)
1893 goto put_page;
1894
1895 /* Has the page moved? */
1896 if (unlikely(page != xas_reload(&xas))) 1881 if (unlikely(page != xas_reload(&xas)))
1897 goto put_page; 1882 goto put_page;
1898 1883
1899 pages[ret] = page; 1884 pages[ret] = find_subpage(page, xas.xa_index);
1900 if (++ret == nr_pages) 1885 if (++ret == nr_pages)
1901 break; 1886 break;
1902 continue; 1887 continue;
1903put_page: 1888put_page:
1904 put_page(head); 1889 put_page(page);
1905retry: 1890retry:
1906 xas_reset(&xas); 1891 xas_reset(&xas);
1907 } 1892 }
@@ -1937,7 +1922,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
1937 1922
1938 rcu_read_lock(); 1923 rcu_read_lock();
1939 xas_for_each_marked(&xas, page, end, tag) { 1924 xas_for_each_marked(&xas, page, end, tag) {
1940 struct page *head;
1941 if (xas_retry(&xas, page)) 1925 if (xas_retry(&xas, page))
1942 continue; 1926 continue;
1943 /* 1927 /*
@@ -1948,26 +1932,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
1948 if (xa_is_value(page)) 1932 if (xa_is_value(page))
1949 continue; 1933 continue;
1950 1934
1951 head = compound_head(page); 1935 if (!page_cache_get_speculative(page))
1952 if (!page_cache_get_speculative(head))
1953 goto retry; 1936 goto retry;
1954 1937
1955 /* The page was split under us? */ 1938 /* Has the page moved or been split? */
1956 if (compound_head(page) != head)
1957 goto put_page;
1958
1959 /* Has the page moved? */
1960 if (unlikely(page != xas_reload(&xas))) 1939 if (unlikely(page != xas_reload(&xas)))
1961 goto put_page; 1940 goto put_page;
1962 1941
1963 pages[ret] = page; 1942 pages[ret] = find_subpage(page, xas.xa_index);
1964 if (++ret == nr_pages) { 1943 if (++ret == nr_pages) {
1965 *index = xas.xa_index + 1; 1944 *index = xas.xa_index + 1;
1966 goto out; 1945 goto out;
1967 } 1946 }
1968 continue; 1947 continue;
1969put_page: 1948put_page:
1970 put_page(head); 1949 put_page(page);
1971retry: 1950retry:
1972 xas_reset(&xas); 1951 xas_reset(&xas);
1973 } 1952 }
@@ -2562,12 +2541,12 @@ retry_find:
2562 goto out_retry; 2541 goto out_retry;
2563 2542
2564 /* Did it get truncated? */ 2543 /* Did it get truncated? */
2565 if (unlikely(page->mapping != mapping)) { 2544 if (unlikely(compound_head(page)->mapping != mapping)) {
2566 unlock_page(page); 2545 unlock_page(page);
2567 put_page(page); 2546 put_page(page);
2568 goto retry_find; 2547 goto retry_find;
2569 } 2548 }
2570 VM_BUG_ON_PAGE(page->index != offset, page); 2549 VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
2571 2550
2572 /* 2551 /*
2573 * We have a locked page in the page cache, now we need to check 2552 * We have a locked page in the page cache, now we need to check
@@ -2648,7 +2627,7 @@ void filemap_map_pages(struct vm_fault *vmf,
2648 pgoff_t last_pgoff = start_pgoff; 2627 pgoff_t last_pgoff = start_pgoff;
2649 unsigned long max_idx; 2628 unsigned long max_idx;
2650 XA_STATE(xas, &mapping->i_pages, start_pgoff); 2629 XA_STATE(xas, &mapping->i_pages, start_pgoff);
2651 struct page *head, *page; 2630 struct page *page;
2652 2631
2653 rcu_read_lock(); 2632 rcu_read_lock();
2654 xas_for_each(&xas, page, end_pgoff) { 2633 xas_for_each(&xas, page, end_pgoff) {
@@ -2657,24 +2636,19 @@ void filemap_map_pages(struct vm_fault *vmf,
2657 if (xa_is_value(page)) 2636 if (xa_is_value(page))
2658 goto next; 2637 goto next;
2659 2638
2660 head = compound_head(page);
2661
2662 /* 2639 /*
2663 * Check for a locked page first, as a speculative 2640 * Check for a locked page first, as a speculative
2664 * reference may adversely influence page migration. 2641 * reference may adversely influence page migration.
2665 */ 2642 */
2666 if (PageLocked(head)) 2643 if (PageLocked(page))
2667 goto next; 2644 goto next;
2668 if (!page_cache_get_speculative(head)) 2645 if (!page_cache_get_speculative(page))
2669 goto next; 2646 goto next;
2670 2647
2671 /* The page was split under us? */ 2648 /* Has the page moved or been split? */
2672 if (compound_head(page) != head)
2673 goto skip;
2674
2675 /* Has the page moved? */
2676 if (unlikely(page != xas_reload(&xas))) 2649 if (unlikely(page != xas_reload(&xas)))
2677 goto skip; 2650 goto skip;
2651 page = find_subpage(page, xas.xa_index);
2678 2652
2679 if (!PageUptodate(page) || 2653 if (!PageUptodate(page) ||
2680 PageReadahead(page) || 2654 PageReadahead(page) ||
diff --git a/mm/gup.c b/mm/gup.c
index 98f13ab37bac..60c3915c8ee6 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,85 +29,70 @@ struct follow_page_context {
29 unsigned int page_mask; 29 unsigned int page_mask;
30}; 30};
31 31
32typedef int (*set_dirty_func_t)(struct page *page);
33
34static void __put_user_pages_dirty(struct page **pages,
35 unsigned long npages,
36 set_dirty_func_t sdf)
37{
38 unsigned long index;
39
40 for (index = 0; index < npages; index++) {
41 struct page *page = compound_head(pages[index]);
42
43 /*
44 * Checking PageDirty at this point may race with
45 * clear_page_dirty_for_io(), but that's OK. Two key cases:
46 *
47 * 1) This code sees the page as already dirty, so it skips
48 * the call to sdf(). That could happen because
49 * clear_page_dirty_for_io() called page_mkclean(),
50 * followed by set_page_dirty(). However, now the page is
51 * going to get written back, which meets the original
52 * intention of setting it dirty, so all is well:
53 * clear_page_dirty_for_io() goes on to call
54 * TestClearPageDirty(), and write the page back.
55 *
56 * 2) This code sees the page as clean, so it calls sdf().
57 * The page stays dirty, despite being written back, so it
58 * gets written back again in the next writeback cycle.
59 * This is harmless.
60 */
61 if (!PageDirty(page))
62 sdf(page);
63
64 put_user_page(page);
65 }
66}
67
68/** 32/**
69 * put_user_pages_dirty() - release and dirty an array of gup-pinned pages 33 * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
70 * @pages: array of pages to be marked dirty and released. 34 * @pages: array of pages to be maybe marked dirty, and definitely released.
71 * @npages: number of pages in the @pages array. 35 * @npages: number of pages in the @pages array.
36 * @make_dirty: whether to mark the pages dirty
72 * 37 *
73 * "gup-pinned page" refers to a page that has had one of the get_user_pages() 38 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
74 * variants called on that page. 39 * variants called on that page.
75 * 40 *
76 * For each page in the @pages array, make that page (or its head page, if a 41 * For each page in the @pages array, make that page (or its head page, if a
77 * compound page) dirty, if it was previously listed as clean. Then, release 42 * compound page) dirty, if @make_dirty is true, and if the page was previously
78 * the page using put_user_page(). 43 * listed as clean. In any case, releases all pages using put_user_page(),
44 * possibly via put_user_pages(), for the non-dirty case.
79 * 45 *
80 * Please see the put_user_page() documentation for details. 46 * Please see the put_user_page() documentation for details.
81 * 47 *
82 * set_page_dirty(), which does not lock the page, is used here. 48 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
83 * Therefore, it is the caller's responsibility to ensure that this is 49 * required, then the caller should a) verify that this is really correct,
84 * safe. If not, then put_user_pages_dirty_lock() should be called instead. 50 * because _lock() is usually required, and b) hand code it:
51 * set_page_dirty_lock(), put_user_page().
85 * 52 *
86 */ 53 */
87void put_user_pages_dirty(struct page **pages, unsigned long npages) 54void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
55 bool make_dirty)
88{ 56{
89 __put_user_pages_dirty(pages, npages, set_page_dirty); 57 unsigned long index;
90}
91EXPORT_SYMBOL(put_user_pages_dirty);
92 58
93/** 59 /*
94 * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages 60 * TODO: this can be optimized for huge pages: if a series of pages is
95 * @pages: array of pages to be marked dirty and released. 61 * physically contiguous and part of the same compound page, then a
96 * @npages: number of pages in the @pages array. 62 * single operation to the head page should suffice.
97 * 63 */
98 * For each page in the @pages array, make that page (or its head page, if a 64
99 * compound page) dirty, if it was previously listed as clean. Then, release 65 if (!make_dirty) {
100 * the page using put_user_page(). 66 put_user_pages(pages, npages);
101 * 67 return;
102 * Please see the put_user_page() documentation for details. 68 }
103 * 69
104 * This is just like put_user_pages_dirty(), except that it invokes 70 for (index = 0; index < npages; index++) {
105 * set_page_dirty_lock(), instead of set_page_dirty(). 71 struct page *page = compound_head(pages[index]);
106 * 72 /*
107 */ 73 * Checking PageDirty at this point may race with
108void put_user_pages_dirty_lock(struct page **pages, unsigned long npages) 74 * clear_page_dirty_for_io(), but that's OK. Two key
109{ 75 * cases:
110 __put_user_pages_dirty(pages, npages, set_page_dirty_lock); 76 *
77 * 1) This code sees the page as already dirty, so it
78 * skips the call to set_page_dirty(). That could happen
79 * because clear_page_dirty_for_io() called
80 * page_mkclean(), followed by set_page_dirty().
81 * However, now the page is going to get written back,
82 * which meets the original intention of setting it
83 * dirty, so all is well: clear_page_dirty_for_io() goes
84 * on to call TestClearPageDirty(), and write the page
85 * back.
86 *
87 * 2) This code sees the page as clean, so it calls
88 * set_page_dirty(). The page stays dirty, despite being
89 * written back, so it gets written back again in the
90 * next writeback cycle. This is harmless.
91 */
92 if (!PageDirty(page))
93 set_page_dirty_lock(page);
94 put_user_page(page);
95 }
111} 96}
112EXPORT_SYMBOL(put_user_pages_dirty_lock); 97EXPORT_SYMBOL(put_user_pages_dirty_lock);
113 98
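The two exported helpers collapse into a single put_user_pages_dirty_lock() that takes a make_dirty flag. A hypothetical caller (not part of this patch) releasing a get_user_pages() pin would now do:

	/* sketch: 'pages', 'nr_pinned' and 'writable' are assumed to come
	 * from an earlier get_user_pages*() call in the caller */
	put_user_pages_dirty_lock(pages, nr_pinned, writable);
	/* ...instead of choosing between put_user_pages_dirty(),
	 * put_user_pages_dirty_lock() and plain put_user_pages(). */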
@@ -399,7 +384,7 @@ retry_locked:
399 spin_unlock(ptl); 384 spin_unlock(ptl);
400 return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); 385 return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
401 } 386 }
402 if (flags & FOLL_SPLIT) { 387 if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
403 int ret; 388 int ret;
404 page = pmd_page(*pmd); 389 page = pmd_page(*pmd);
405 if (is_huge_zero_page(page)) { 390 if (is_huge_zero_page(page)) {
@@ -408,7 +393,7 @@ retry_locked:
408 split_huge_pmd(vma, pmd, address); 393 split_huge_pmd(vma, pmd, address);
409 if (pmd_trans_unstable(pmd)) 394 if (pmd_trans_unstable(pmd))
410 ret = -EBUSY; 395 ret = -EBUSY;
411 } else { 396 } else if (flags & FOLL_SPLIT) {
412 if (unlikely(!try_get_page(page))) { 397 if (unlikely(!try_get_page(page))) {
413 spin_unlock(ptl); 398 spin_unlock(ptl);
414 return ERR_PTR(-ENOMEM); 399 return ERR_PTR(-ENOMEM);
@@ -420,6 +405,10 @@ retry_locked:
420 put_page(page); 405 put_page(page);
421 if (pmd_none(*pmd)) 406 if (pmd_none(*pmd))
422 return no_page_table(vma, flags); 407 return no_page_table(vma, flags);
408 } else { /* flags & FOLL_SPLIT_PMD */
409 spin_unlock(ptl);
410 split_huge_pmd(vma, pmd, address);
411 ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
423 } 412 }
424 413
425 return ret ? ERR_PTR(ret) : 414 return ret ? ERR_PTR(ret) :
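Condensed view of the branch added above (illustrative only, locking and error paths trimmed): FOLL_SPLIT tears the compound page itself apart, while the new FOLL_SPLIT_PMD only splits the PMD mapping, leaves the THP intact, and repopulates the range with PTEs:

	if (flags & FOLL_SPLIT) {
		get_page(page);
		lock_page(page);
		ret = split_huge_page(page);	/* whole THP torn down */
		unlock_page(page);
		put_page(page);
	} else {				/* FOLL_SPLIT_PMD */
		split_huge_pmd(vma, pmd, address);
		ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;	/* remap as PTEs */
	}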
@@ -1460,7 +1449,7 @@ check_again:
1460 * gup may start from a tail page. Advance step by the left 1449 * gup may start from a tail page. Advance step by the left
1461 * part. 1450 * part.
1462 */ 1451 */
1463 step = (1 << compound_order(head)) - (pages[i] - head); 1452 step = compound_nr(head) - (pages[i] - head);
1464 /* 1453 /*
1465 * If we get a page from the CMA zone, since we are going to 1454 * If we get a page from the CMA zone, since we are going to
1466 * be pinning these entries, we might as well move them out 1455 * be pinning these entries, we might as well move them out
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de1f15969e27..73fc517c08d2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -496,11 +496,25 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
496 return pmd; 496 return pmd;
497} 497}
498 498
499static inline struct list_head *page_deferred_list(struct page *page) 499#ifdef CONFIG_MEMCG
500static inline struct deferred_split *get_deferred_split_queue(struct page *page)
500{ 501{
501 /* ->lru in the tail pages is occupied by compound_head. */ 502 struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
502 return &page[2].deferred_list; 503 struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
504
505 if (memcg)
506 return &memcg->deferred_split_queue;
507 else
508 return &pgdat->deferred_split_queue;
509}
510#else
511static inline struct deferred_split *get_deferred_split_queue(struct page *page)
512{
513 struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
514
515 return &pgdat->deferred_split_queue;
503} 516}
517#endif
504 518
505void prep_transhuge_page(struct page *page) 519void prep_transhuge_page(struct page *page)
506{ 520{
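get_deferred_split_queue() assumes the deferred-split state has been factored out of struct pglist_data into a small structure that also lives in struct mem_cgroup (done elsewhere in this series). Roughly:

	struct deferred_split {
		spinlock_t split_queue_lock;
		struct list_head split_queue;
		unsigned long split_queue_len;
	};
	/* one instance per node (pgdat->deferred_split_queue) and, with
	 * CONFIG_MEMCG, one per memory cgroup, so partially mapped THPs are
	 * tracked and shrunk in the cgroup that owns them */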
@@ -2497,6 +2511,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
2497 struct page *head = compound_head(page); 2511 struct page *head = compound_head(page);
2498 pg_data_t *pgdat = page_pgdat(head); 2512 pg_data_t *pgdat = page_pgdat(head);
2499 struct lruvec *lruvec; 2513 struct lruvec *lruvec;
2514 struct address_space *swap_cache = NULL;
2515 unsigned long offset = 0;
2500 int i; 2516 int i;
2501 2517
2502 lruvec = mem_cgroup_page_lruvec(head, pgdat); 2518 lruvec = mem_cgroup_page_lruvec(head, pgdat);
@@ -2504,6 +2520,14 @@ static void __split_huge_page(struct page *page, struct list_head *list,
2504 /* complete memcg works before add pages to LRU */ 2520 /* complete memcg works before add pages to LRU */
2505 mem_cgroup_split_huge_fixup(head); 2521 mem_cgroup_split_huge_fixup(head);
2506 2522
2523 if (PageAnon(head) && PageSwapCache(head)) {
2524 swp_entry_t entry = { .val = page_private(head) };
2525
2526 offset = swp_offset(entry);
2527 swap_cache = swap_address_space(entry);
2528 xa_lock(&swap_cache->i_pages);
2529 }
2530
2507 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { 2531 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
2508 __split_huge_page_tail(head, i, lruvec, list); 2532 __split_huge_page_tail(head, i, lruvec, list);
2509 /* Some pages can be beyond i_size: drop them from page cache */ 2533 /* Some pages can be beyond i_size: drop them from page cache */
@@ -2513,6 +2537,12 @@ static void __split_huge_page(struct page *page, struct list_head *list,
2513 if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) 2537 if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
2514 shmem_uncharge(head->mapping->host, 1); 2538 shmem_uncharge(head->mapping->host, 1);
2515 put_page(head + i); 2539 put_page(head + i);
2540 } else if (!PageAnon(page)) {
2541 __xa_store(&head->mapping->i_pages, head[i].index,
2542 head + i, 0);
2543 } else if (swap_cache) {
2544 __xa_store(&swap_cache->i_pages, offset + i,
2545 head + i, 0);
2516 } 2546 }
2517 } 2547 }
2518 2548
@@ -2523,10 +2553,12 @@ static void __split_huge_page(struct page *page, struct list_head *list,
2523 /* See comment in __split_huge_page_tail() */ 2553 /* See comment in __split_huge_page_tail() */
2524 if (PageAnon(head)) { 2554 if (PageAnon(head)) {
2525 /* Additional pin to swap cache */ 2555 /* Additional pin to swap cache */
2526 if (PageSwapCache(head)) 2556 if (PageSwapCache(head)) {
2527 page_ref_add(head, 2); 2557 page_ref_add(head, 2);
2528 else 2558 xa_unlock(&swap_cache->i_pages);
2559 } else {
2529 page_ref_inc(head); 2560 page_ref_inc(head);
2561 }
2530 } else { 2562 } else {
2531 /* Additional pin to page cache */ 2563 /* Additional pin to page cache */
2532 page_ref_add(head, 2); 2564 page_ref_add(head, 2);
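Summarizing the __split_huge_page() hunks above: tail pages of an anonymous THP that sits in the swap cache are now re-inserted into the swap-cache XArray under its lock, mirroring what was already done for file-backed THPs in the page cache. Condensed (illustrative only):

	if (PageAnon(head) && PageSwapCache(head)) {
		swp_entry_t entry = { .val = page_private(head) };

		offset = swp_offset(entry);
		swap_cache = swap_address_space(entry);
		xa_lock(&swap_cache->i_pages);	/* held across the tail loop */
	}
	/* for each tail page i that stays in the swap cache: */
	__xa_store(&swap_cache->i_pages, offset + i, head + i, 0);
	/* and once the extra swap-cache pin is re-taken on the head: */
	xa_unlock(&swap_cache->i_pages);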
@@ -2673,6 +2705,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
2673{ 2705{
2674 struct page *head = compound_head(page); 2706 struct page *head = compound_head(page);
2675 struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); 2707 struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
2708 struct deferred_split *ds_queue = get_deferred_split_queue(page);
2676 struct anon_vma *anon_vma = NULL; 2709 struct anon_vma *anon_vma = NULL;
2677 struct address_space *mapping = NULL; 2710 struct address_space *mapping = NULL;
2678 int count, mapcount, extra_pins, ret; 2711 int count, mapcount, extra_pins, ret;
@@ -2759,17 +2792,17 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
2759 } 2792 }
2760 2793
2761 /* Prevent deferred_split_scan() touching ->_refcount */ 2794 /* Prevent deferred_split_scan() touching ->_refcount */
2762 spin_lock(&pgdata->split_queue_lock); 2795 spin_lock(&ds_queue->split_queue_lock);
2763 count = page_count(head); 2796 count = page_count(head);
2764 mapcount = total_mapcount(head); 2797 mapcount = total_mapcount(head);
2765 if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { 2798 if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
2766 if (!list_empty(page_deferred_list(head))) { 2799 if (!list_empty(page_deferred_list(head))) {
2767 pgdata->split_queue_len--; 2800 ds_queue->split_queue_len--;
2768 list_del(page_deferred_list(head)); 2801 list_del(page_deferred_list(head));
2769 } 2802 }
2770 if (mapping) 2803 if (mapping)
2771 __dec_node_page_state(page, NR_SHMEM_THPS); 2804 __dec_node_page_state(page, NR_SHMEM_THPS);
2772 spin_unlock(&pgdata->split_queue_lock); 2805 spin_unlock(&ds_queue->split_queue_lock);
2773 __split_huge_page(page, list, end, flags); 2806 __split_huge_page(page, list, end, flags);
2774 if (PageSwapCache(head)) { 2807 if (PageSwapCache(head)) {
2775 swp_entry_t entry = { .val = page_private(head) }; 2808 swp_entry_t entry = { .val = page_private(head) };
@@ -2786,7 +2819,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
2786 dump_page(page, "total_mapcount(head) > 0"); 2819 dump_page(page, "total_mapcount(head) > 0");
2787 BUG(); 2820 BUG();
2788 } 2821 }
2789 spin_unlock(&pgdata->split_queue_lock); 2822 spin_unlock(&ds_queue->split_queue_lock);
2790fail: if (mapping) 2823fail: if (mapping)
2791 xa_unlock(&mapping->i_pages); 2824 xa_unlock(&mapping->i_pages);
2792 spin_unlock_irqrestore(&pgdata->lru_lock, flags); 2825 spin_unlock_irqrestore(&pgdata->lru_lock, flags);
@@ -2808,53 +2841,86 @@ out:
2808 2841
2809void free_transhuge_page(struct page *page) 2842void free_transhuge_page(struct page *page)
2810{ 2843{
2811 struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 2844 struct deferred_split *ds_queue = get_deferred_split_queue(page);
2812 unsigned long flags; 2845 unsigned long flags;
2813 2846
2814 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2847 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2815 if (!list_empty(page_deferred_list(page))) { 2848 if (!list_empty(page_deferred_list(page))) {
2816 pgdata->split_queue_len--; 2849 ds_queue->split_queue_len--;
2817 list_del(page_deferred_list(page)); 2850 list_del(page_deferred_list(page));
2818 } 2851 }
2819 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2852 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
2820 free_compound_page(page); 2853 free_compound_page(page);
2821} 2854}
2822 2855
2823void deferred_split_huge_page(struct page *page) 2856void deferred_split_huge_page(struct page *page)
2824{ 2857{
2825 struct pglist_data *pgdata = NODE_DATA(page_to_nid(page)); 2858 struct deferred_split *ds_queue = get_deferred_split_queue(page);
2859#ifdef CONFIG_MEMCG
2860 struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
2861#endif
2826 unsigned long flags; 2862 unsigned long flags;
2827 2863
2828 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 2864 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
2829 2865
2830 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2866 /*
2867 * The try_to_unmap() in page reclaim path might reach here too,
 2868 * and this race may corrupt the deferred split queue.
2869 * And, if page reclaim is already handling the same page, it is
2870 * unnecessary to handle it again in shrinker.
2871 *
2872 * Check PageSwapCache to determine if the page is being
2873 * handled by page reclaim since THP swap would add the page into
2874 * swap cache before calling try_to_unmap().
2875 */
2876 if (PageSwapCache(page))
2877 return;
2878
2879 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2831 if (list_empty(page_deferred_list(page))) { 2880 if (list_empty(page_deferred_list(page))) {
2832 count_vm_event(THP_DEFERRED_SPLIT_PAGE); 2881 count_vm_event(THP_DEFERRED_SPLIT_PAGE);
2833 list_add_tail(page_deferred_list(page), &pgdata->split_queue); 2882 list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
2834 pgdata->split_queue_len++; 2883 ds_queue->split_queue_len++;
2884#ifdef CONFIG_MEMCG
2885 if (memcg)
2886 memcg_set_shrinker_bit(memcg, page_to_nid(page),
2887 deferred_split_shrinker.id);
2888#endif
2835 } 2889 }
2836 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2890 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
2837} 2891}
2838 2892
2839static unsigned long deferred_split_count(struct shrinker *shrink, 2893static unsigned long deferred_split_count(struct shrinker *shrink,
2840 struct shrink_control *sc) 2894 struct shrink_control *sc)
2841{ 2895{
2842 struct pglist_data *pgdata = NODE_DATA(sc->nid); 2896 struct pglist_data *pgdata = NODE_DATA(sc->nid);
2843 return READ_ONCE(pgdata->split_queue_len); 2897 struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
2898
2899#ifdef CONFIG_MEMCG
2900 if (sc->memcg)
2901 ds_queue = &sc->memcg->deferred_split_queue;
2902#endif
2903 return READ_ONCE(ds_queue->split_queue_len);
2844} 2904}
2845 2905
2846static unsigned long deferred_split_scan(struct shrinker *shrink, 2906static unsigned long deferred_split_scan(struct shrinker *shrink,
2847 struct shrink_control *sc) 2907 struct shrink_control *sc)
2848{ 2908{
2849 struct pglist_data *pgdata = NODE_DATA(sc->nid); 2909 struct pglist_data *pgdata = NODE_DATA(sc->nid);
2910 struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
2850 unsigned long flags; 2911 unsigned long flags;
2851 LIST_HEAD(list), *pos, *next; 2912 LIST_HEAD(list), *pos, *next;
2852 struct page *page; 2913 struct page *page;
2853 int split = 0; 2914 int split = 0;
2854 2915
2855 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2916#ifdef CONFIG_MEMCG
2917 if (sc->memcg)
2918 ds_queue = &sc->memcg->deferred_split_queue;
2919#endif
2920
2921 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2856 /* Take pin on all head pages to avoid freeing them under us */ 2922 /* Take pin on all head pages to avoid freeing them under us */
2857 list_for_each_safe(pos, next, &pgdata->split_queue) { 2923 list_for_each_safe(pos, next, &ds_queue->split_queue) {
2858 page = list_entry((void *)pos, struct page, mapping); 2924 page = list_entry((void *)pos, struct page, mapping);
2859 page = compound_head(page); 2925 page = compound_head(page);
2860 if (get_page_unless_zero(page)) { 2926 if (get_page_unless_zero(page)) {
@@ -2862,12 +2928,12 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
2862 } else { 2928 } else {
2863 /* We lost race with put_compound_page() */ 2929 /* We lost race with put_compound_page() */
2864 list_del_init(page_deferred_list(page)); 2930 list_del_init(page_deferred_list(page));
2865 pgdata->split_queue_len--; 2931 ds_queue->split_queue_len--;
2866 } 2932 }
2867 if (!--sc->nr_to_scan) 2933 if (!--sc->nr_to_scan)
2868 break; 2934 break;
2869 } 2935 }
2870 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2936 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
2871 2937
2872 list_for_each_safe(pos, next, &list) { 2938 list_for_each_safe(pos, next, &list) {
2873 page = list_entry((void *)pos, struct page, mapping); 2939 page = list_entry((void *)pos, struct page, mapping);
@@ -2881,15 +2947,15 @@ next:
2881 put_page(page); 2947 put_page(page);
2882 } 2948 }
2883 2949
2884 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2950 spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2885 list_splice_tail(&list, &pgdata->split_queue); 2951 list_splice_tail(&list, &ds_queue->split_queue);
2886 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2952 spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
2887 2953
2888 /* 2954 /*
2889 * Stop shrinker if we didn't split any page, but the queue is empty. 2955 * Stop shrinker if we didn't split any page, but the queue is empty.
2890 * This can happen if pages were freed under us. 2956 * This can happen if pages were freed under us.
2891 */ 2957 */
2892 if (!split && list_empty(&pgdata->split_queue)) 2958 if (!split && list_empty(&ds_queue->split_queue))
2893 return SHRINK_STOP; 2959 return SHRINK_STOP;
2894 return split; 2960 return split;
2895} 2961}
@@ -2898,7 +2964,8 @@ static struct shrinker deferred_split_shrinker = {
2898 .count_objects = deferred_split_count, 2964 .count_objects = deferred_split_count,
2899 .scan_objects = deferred_split_scan, 2965 .scan_objects = deferred_split_scan,
2900 .seeks = DEFAULT_SEEKS, 2966 .seeks = DEFAULT_SEEKS,
2901 .flags = SHRINKER_NUMA_AWARE, 2967 .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
2968 SHRINKER_NONSLAB,
2902}; 2969};
2903 2970
2904#ifdef CONFIG_DEBUG_FS 2971#ifdef CONFIG_DEBUG_FS
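The shrinker becomes memcg aware: SHRINKER_MEMCG_AWARE routes sc->memcg into the callbacks, and SHRINKER_NONSLAB (new in this series) lets it run for cgroups that hold no slab objects. The producer side pairs with it by flagging the memcg when work is queued, roughly (sketch of deferred_split_huge_page() above):

	list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
	ds_queue->split_queue_len++;
	if (memcg)	/* NULL for root / !CONFIG_MEMCG */
		memcg_set_shrinker_bit(memcg, page_to_nid(page),
				       deferred_split_shrinker.id);
	/* reclaim later walks only memcgs with that bit set and invokes
	 * deferred_split_scan() with sc->memcg pointing at the cgroup */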
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6d7296dd11b8..ef37c85423a5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1405,12 +1405,25 @@ pgoff_t __basepage_index(struct page *page)
1405} 1405}
1406 1406
1407static struct page *alloc_buddy_huge_page(struct hstate *h, 1407static struct page *alloc_buddy_huge_page(struct hstate *h,
1408 gfp_t gfp_mask, int nid, nodemask_t *nmask) 1408 gfp_t gfp_mask, int nid, nodemask_t *nmask,
1409 nodemask_t *node_alloc_noretry)
1409{ 1410{
1410 int order = huge_page_order(h); 1411 int order = huge_page_order(h);
1411 struct page *page; 1412 struct page *page;
1413 bool alloc_try_hard = true;
1412 1414
1413 gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; 1415 /*
1416 * By default we always try hard to allocate the page with
1417 * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
1418 * a loop (to adjust global huge page counts) and previous allocation
1419 * failed, do not continue to try hard on the same node. Use the
1420 * node_alloc_noretry bitmap to manage this state information.
1421 */
1422 if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
1423 alloc_try_hard = false;
1424 gfp_mask |= __GFP_COMP|__GFP_NOWARN;
1425 if (alloc_try_hard)
1426 gfp_mask |= __GFP_RETRY_MAYFAIL;
1414 if (nid == NUMA_NO_NODE) 1427 if (nid == NUMA_NO_NODE)
1415 nid = numa_mem_id(); 1428 nid = numa_mem_id();
1416 page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); 1429 page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
@@ -1419,6 +1432,22 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
1419 else 1432 else
1420 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 1433 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
1421 1434
1435 /*
1436 * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
1437 * indicates an overall state change. Clear bit so that we resume
1438 * normal 'try hard' allocations.
1439 */
1440 if (node_alloc_noretry && page && !alloc_try_hard)
1441 node_clear(nid, *node_alloc_noretry);
1442
1443 /*
1444 * If we tried hard to get a page but failed, set bit so that
1445 * subsequent attempts will not try as hard until there is an
1446 * overall state change.
1447 */
1448 if (node_alloc_noretry && !page && alloc_try_hard)
1449 node_set(nid, *node_alloc_noretry);
1450
1422 return page; 1451 return page;
1423} 1452}
1424 1453
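Net effect of the node_alloc_noretry handling above, summarized as a sketch (equivalent logic, not a verbatim copy):

	bool try_hard = !node_alloc_noretry ||
			!node_isset(nid, *node_alloc_noretry);

	gfp_mask |= __GFP_COMP | __GFP_NOWARN;
	if (try_hard)
		gfp_mask |= __GFP_RETRY_MAYFAIL;
	page = __alloc_pages_nodemask(gfp_mask, huge_page_order(h), nid, nmask);

	if (node_alloc_noretry) {
		if (page && !try_hard)
			node_clear(nid, *node_alloc_noretry);	/* node recovered */
		else if (!page && try_hard)
			node_set(nid, *node_alloc_noretry);	/* back off next time */
	}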
@@ -1427,7 +1456,8 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
1427 * should use this function to get new hugetlb pages 1456 * should use this function to get new hugetlb pages
1428 */ 1457 */
1429static struct page *alloc_fresh_huge_page(struct hstate *h, 1458static struct page *alloc_fresh_huge_page(struct hstate *h,
1430 gfp_t gfp_mask, int nid, nodemask_t *nmask) 1459 gfp_t gfp_mask, int nid, nodemask_t *nmask,
1460 nodemask_t *node_alloc_noretry)
1431{ 1461{
1432 struct page *page; 1462 struct page *page;
1433 1463
@@ -1435,7 +1465,7 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
1435 page = alloc_gigantic_page(h, gfp_mask, nid, nmask); 1465 page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
1436 else 1466 else
1437 page = alloc_buddy_huge_page(h, gfp_mask, 1467 page = alloc_buddy_huge_page(h, gfp_mask,
1438 nid, nmask); 1468 nid, nmask, node_alloc_noretry);
1439 if (!page) 1469 if (!page)
1440 return NULL; 1470 return NULL;
1441 1471
@@ -1450,14 +1480,16 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
1450 * Allocates a fresh page to the hugetlb allocator pool in the node interleaved 1480 * Allocates a fresh page to the hugetlb allocator pool in the node interleaved
1451 * manner. 1481 * manner.
1452 */ 1482 */
1453static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) 1483static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
1484 nodemask_t *node_alloc_noretry)
1454{ 1485{
1455 struct page *page; 1486 struct page *page;
1456 int nr_nodes, node; 1487 int nr_nodes, node;
1457 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; 1488 gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
1458 1489
1459 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { 1490 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1460 page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed); 1491 page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
1492 node_alloc_noretry);
1461 if (page) 1493 if (page)
1462 break; 1494 break;
1463 } 1495 }
@@ -1601,7 +1633,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
1601 goto out_unlock; 1633 goto out_unlock;
1602 spin_unlock(&hugetlb_lock); 1634 spin_unlock(&hugetlb_lock);
1603 1635
1604 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); 1636 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
1605 if (!page) 1637 if (!page)
1606 return NULL; 1638 return NULL;
1607 1639
@@ -1637,7 +1669,7 @@ struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
1637 if (hstate_is_gigantic(h)) 1669 if (hstate_is_gigantic(h))
1638 return NULL; 1670 return NULL;
1639 1671
1640 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); 1672 page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
1641 if (!page) 1673 if (!page)
1642 return NULL; 1674 return NULL;
1643 1675
@@ -2207,13 +2239,33 @@ static void __init gather_bootmem_prealloc(void)
2207static void __init hugetlb_hstate_alloc_pages(struct hstate *h) 2239static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
2208{ 2240{
2209 unsigned long i; 2241 unsigned long i;
2242 nodemask_t *node_alloc_noretry;
2243
2244 if (!hstate_is_gigantic(h)) {
2245 /*
2246 * Bit mask controlling how hard we retry per-node allocations.
2247 * Ignore errors as lower level routines can deal with
2248 * node_alloc_noretry == NULL. If this kmalloc fails at boot
2249 * time, we are likely in bigger trouble.
2250 */
2251 node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
2252 GFP_KERNEL);
2253 } else {
2254 /* allocations done at boot time */
2255 node_alloc_noretry = NULL;
2256 }
2257
2258 /* bit mask controlling how hard we retry per-node allocations */
2259 if (node_alloc_noretry)
2260 nodes_clear(*node_alloc_noretry);
2210 2261
2211 for (i = 0; i < h->max_huge_pages; ++i) { 2262 for (i = 0; i < h->max_huge_pages; ++i) {
2212 if (hstate_is_gigantic(h)) { 2263 if (hstate_is_gigantic(h)) {
2213 if (!alloc_bootmem_huge_page(h)) 2264 if (!alloc_bootmem_huge_page(h))
2214 break; 2265 break;
2215 } else if (!alloc_pool_huge_page(h, 2266 } else if (!alloc_pool_huge_page(h,
2216 &node_states[N_MEMORY])) 2267 &node_states[N_MEMORY],
2268 node_alloc_noretry))
2217 break; 2269 break;
2218 cond_resched(); 2270 cond_resched();
2219 } 2271 }
@@ -2225,6 +2277,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
2225 h->max_huge_pages, buf, i); 2277 h->max_huge_pages, buf, i);
2226 h->max_huge_pages = i; 2278 h->max_huge_pages = i;
2227 } 2279 }
2280
2281 kfree(node_alloc_noretry);
2228} 2282}
2229 2283
2230static void __init hugetlb_init_hstates(void) 2284static void __init hugetlb_init_hstates(void)
@@ -2323,6 +2377,17 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
2323 nodemask_t *nodes_allowed) 2377 nodemask_t *nodes_allowed)
2324{ 2378{
2325 unsigned long min_count, ret; 2379 unsigned long min_count, ret;
2380 NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
2381
2382 /*
2383 * Bit mask controlling how hard we retry per-node allocations.
2384 * If we can not allocate the bit mask, do not attempt to allocate
2385 * the requested huge pages.
2386 */
2387 if (node_alloc_noretry)
2388 nodes_clear(*node_alloc_noretry);
2389 else
2390 return -ENOMEM;
2326 2391
2327 spin_lock(&hugetlb_lock); 2392 spin_lock(&hugetlb_lock);
2328 2393
@@ -2356,6 +2421,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
2356 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { 2421 if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
2357 if (count > persistent_huge_pages(h)) { 2422 if (count > persistent_huge_pages(h)) {
2358 spin_unlock(&hugetlb_lock); 2423 spin_unlock(&hugetlb_lock);
2424 NODEMASK_FREE(node_alloc_noretry);
2359 return -EINVAL; 2425 return -EINVAL;
2360 } 2426 }
2361 /* Fall through to decrease pool */ 2427 /* Fall through to decrease pool */
@@ -2388,7 +2454,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
2388 /* yield cpu to avoid soft lockup */ 2454 /* yield cpu to avoid soft lockup */
2389 cond_resched(); 2455 cond_resched();
2390 2456
2391 ret = alloc_pool_huge_page(h, nodes_allowed); 2457 ret = alloc_pool_huge_page(h, nodes_allowed,
2458 node_alloc_noretry);
2392 spin_lock(&hugetlb_lock); 2459 spin_lock(&hugetlb_lock);
2393 if (!ret) 2460 if (!ret)
2394 goto out; 2461 goto out;
@@ -2429,6 +2496,8 @@ out:
2429 h->max_huge_pages = persistent_huge_pages(h); 2496 h->max_huge_pages = persistent_huge_pages(h);
2430 spin_unlock(&hugetlb_lock); 2497 spin_unlock(&hugetlb_lock);
2431 2498
2499 NODEMASK_FREE(node_alloc_noretry);
2500
2432 return 0; 2501 return 0;
2433} 2502}
2434 2503
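set_max_huge_pages() uses NODEMASK_ALLOC() because nodemask_t can be too large for the stack on big CONFIG_NODES_SHIFT configurations. The usual pairing, as a sketch:

	NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);

	if (!node_alloc_noretry)	/* only possible in the kmalloc case */
		return -ENOMEM;
	nodes_clear(*node_alloc_noretry);
	/* ... use *node_alloc_noretry ... */
	NODEMASK_FREE(node_alloc_noretry);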
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 68c2f2f3c05b..f1930fa0b445 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -139,7 +139,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
139 if (!page_hcg || page_hcg != h_cg) 139 if (!page_hcg || page_hcg != h_cg)
140 goto out; 140 goto out;
141 141
142 nr_pages = 1 << compound_order(page); 142 nr_pages = compound_nr(page);
143 if (!parent) { 143 if (!parent) {
144 parent = root_h_cgroup; 144 parent = root_h_cgroup;
145 /* root has no limit */ 145 /* root has no limit */
diff --git a/mm/init-mm.c b/mm/init-mm.c
index a787a319211e..fb1e15028ef0 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -35,6 +35,6 @@ struct mm_struct init_mm = {
35 .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), 35 .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
36 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 36 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
37 .user_ns = &init_user_ns, 37 .user_ns = &init_user_ns,
38 .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, 38 .cpu_bitmap = CPU_BITS_NONE,
39 INIT_MM_CONTEXT(init_mm) 39 INIT_MM_CONTEXT(init_mm)
40}; 40};
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 95d16a42db6b..6814d6d6a023 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -304,7 +304,6 @@ size_t kasan_metadata_size(struct kmem_cache *cache)
304struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, 304struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
305 const void *object) 305 const void *object)
306{ 306{
307 BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
308 return (void *)object + cache->kasan_info.alloc_meta_offset; 307 return (void *)object + cache->kasan_info.alloc_meta_offset;
309} 308}
310 309
@@ -315,14 +314,31 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
315 return (void *)object + cache->kasan_info.free_meta_offset; 314 return (void *)object + cache->kasan_info.free_meta_offset;
316} 315}
317 316
317
318static void kasan_set_free_info(struct kmem_cache *cache,
319 void *object, u8 tag)
320{
321 struct kasan_alloc_meta *alloc_meta;
322 u8 idx = 0;
323
324 alloc_meta = get_alloc_info(cache, object);
325
326#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
327 idx = alloc_meta->free_track_idx;
328 alloc_meta->free_pointer_tag[idx] = tag;
329 alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
330#endif
331
332 set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
333}
334
318void kasan_poison_slab(struct page *page) 335void kasan_poison_slab(struct page *page)
319{ 336{
320 unsigned long i; 337 unsigned long i;
321 338
322 for (i = 0; i < (1 << compound_order(page)); i++) 339 for (i = 0; i < compound_nr(page); i++)
323 page_kasan_tag_reset(page + i); 340 page_kasan_tag_reset(page + i);
324 kasan_poison_shadow(page_address(page), 341 kasan_poison_shadow(page_address(page), page_size(page),
325 PAGE_SIZE << compound_order(page),
326 KASAN_KMALLOC_REDZONE); 342 KASAN_KMALLOC_REDZONE);
327} 343}
328 344
@@ -452,7 +468,8 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
452 unlikely(!(cache->flags & SLAB_KASAN))) 468 unlikely(!(cache->flags & SLAB_KASAN)))
453 return false; 469 return false;
454 470
455 set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT); 471 kasan_set_free_info(cache, object, tag);
472
456 quarantine_put(get_free_info(cache, object), cache); 473 quarantine_put(get_free_info(cache, object), cache);
457 474
458 return IS_ENABLED(CONFIG_KASAN_GENERIC); 475 return IS_ENABLED(CONFIG_KASAN_GENERIC);
@@ -524,7 +541,7 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
524 page = virt_to_page(ptr); 541 page = virt_to_page(ptr);
525 redzone_start = round_up((unsigned long)(ptr + size), 542 redzone_start = round_up((unsigned long)(ptr + size),
526 KASAN_SHADOW_SCALE_SIZE); 543 KASAN_SHADOW_SCALE_SIZE);
527 redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page)); 544 redzone_end = (unsigned long)ptr + page_size(page);
528 545
529 kasan_unpoison_shadow(ptr, size); 546 kasan_unpoison_shadow(ptr, size);
530 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, 547 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
@@ -560,8 +577,7 @@ void kasan_poison_kfree(void *ptr, unsigned long ip)
560 kasan_report_invalid_free(ptr, ip); 577 kasan_report_invalid_free(ptr, ip);
561 return; 578 return;
562 } 579 }
563 kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), 580 kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE);
564 KASAN_FREE_PAGE);
565 } else { 581 } else {
566 __kasan_slab_free(page->slab_cache, ptr, ip, false); 582 __kasan_slab_free(page->slab_cache, ptr, ip, false);
567 } 583 }
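page_size() is introduced elsewhere in this series as a readability helper, so the three conversions above are mechanical:

	static inline unsigned long page_size(struct page *page)
	{
		return PAGE_SIZE << compound_order(page);
	}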
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 014f19e76247..35cff6bbb716 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -95,9 +95,19 @@ struct kasan_track {
95 depot_stack_handle_t stack; 95 depot_stack_handle_t stack;
96}; 96};
97 97
98#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
99#define KASAN_NR_FREE_STACKS 5
100#else
101#define KASAN_NR_FREE_STACKS 1
102#endif
103
98struct kasan_alloc_meta { 104struct kasan_alloc_meta {
99 struct kasan_track alloc_track; 105 struct kasan_track alloc_track;
100 struct kasan_track free_track; 106 struct kasan_track free_track[KASAN_NR_FREE_STACKS];
107#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
108 u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
109 u8 free_track_idx;
110#endif
101}; 111};
102 112
103struct qlist_node { 113struct qlist_node {
@@ -146,6 +156,8 @@ void kasan_report(unsigned long addr, size_t size,
146 bool is_write, unsigned long ip); 156 bool is_write, unsigned long ip);
147void kasan_report_invalid_free(void *object, unsigned long ip); 157void kasan_report_invalid_free(void *object, unsigned long ip);
148 158
159struct page *kasan_addr_to_page(const void *addr);
160
149#if defined(CONFIG_KASAN_GENERIC) && \ 161#if defined(CONFIG_KASAN_GENERIC) && \
150 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) 162 (defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
151void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); 163void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
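With CONFIG_KASAN_SW_TAGS_IDENTIFY, each object keeps a small ring of recent free stacks keyed by the pointer tag used at free time. The write side (kasan_set_free_info() in mm/kasan/common.c above) boils down to:

	u8 idx = alloc_meta->free_track_idx;

	alloc_meta->free_pointer_tag[idx] = tag;	/* remember who freed it */
	set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
	alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
	/* the report side scans free_pointer_tag[] for the faulting tag to
	 * pick the matching free stack (see kasan_get_free_track() below) */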
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 0e5f965f1882..621782100eaa 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -111,7 +111,7 @@ static void print_track(struct kasan_track *track, const char *prefix)
111 } 111 }
112} 112}
113 113
114static struct page *addr_to_page(const void *addr) 114struct page *kasan_addr_to_page(const void *addr)
115{ 115{
116 if ((addr >= (void *)PAGE_OFFSET) && 116 if ((addr >= (void *)PAGE_OFFSET) &&
117 (addr < high_memory)) 117 (addr < high_memory))
@@ -151,15 +151,38 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
151 (void *)(object_addr + cache->object_size)); 151 (void *)(object_addr + cache->object_size));
152} 152}
153 153
154static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
155 void *object, u8 tag)
156{
157 struct kasan_alloc_meta *alloc_meta;
158 int i = 0;
159
160 alloc_meta = get_alloc_info(cache, object);
161
162#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
163 for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
164 if (alloc_meta->free_pointer_tag[i] == tag)
165 break;
166 }
167 if (i == KASAN_NR_FREE_STACKS)
168 i = alloc_meta->free_track_idx;
169#endif
170
171 return &alloc_meta->free_track[i];
172}
173
154static void describe_object(struct kmem_cache *cache, void *object, 174static void describe_object(struct kmem_cache *cache, void *object,
155 const void *addr) 175 const void *addr, u8 tag)
156{ 176{
157 struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); 177 struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
158 178
159 if (cache->flags & SLAB_KASAN) { 179 if (cache->flags & SLAB_KASAN) {
180 struct kasan_track *free_track;
181
160 print_track(&alloc_info->alloc_track, "Allocated"); 182 print_track(&alloc_info->alloc_track, "Allocated");
161 pr_err("\n"); 183 pr_err("\n");
162 print_track(&alloc_info->free_track, "Freed"); 184 free_track = kasan_get_free_track(cache, object, tag);
185 print_track(free_track, "Freed");
163 pr_err("\n"); 186 pr_err("\n");
164 } 187 }
165 188
@@ -344,9 +367,9 @@ static void print_address_stack_frame(const void *addr)
344 print_decoded_frame_descr(frame_descr); 367 print_decoded_frame_descr(frame_descr);
345} 368}
346 369
347static void print_address_description(void *addr) 370static void print_address_description(void *addr, u8 tag)
348{ 371{
349 struct page *page = addr_to_page(addr); 372 struct page *page = kasan_addr_to_page(addr);
350 373
351 dump_stack(); 374 dump_stack();
352 pr_err("\n"); 375 pr_err("\n");
@@ -355,7 +378,7 @@ static void print_address_description(void *addr)
355 struct kmem_cache *cache = page->slab_cache; 378 struct kmem_cache *cache = page->slab_cache;
356 void *object = nearest_obj(cache, page, addr); 379 void *object = nearest_obj(cache, page, addr);
357 380
358 describe_object(cache, object, addr); 381 describe_object(cache, object, addr, tag);
359 } 382 }
360 383
361 if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { 384 if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
@@ -435,13 +458,14 @@ static bool report_enabled(void)
435void kasan_report_invalid_free(void *object, unsigned long ip) 458void kasan_report_invalid_free(void *object, unsigned long ip)
436{ 459{
437 unsigned long flags; 460 unsigned long flags;
461 u8 tag = get_tag(object);
438 462
463 object = reset_tag(object);
439 start_report(&flags); 464 start_report(&flags);
440 pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); 465 pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
441 print_tags(get_tag(object), reset_tag(object)); 466 print_tags(tag, object);
442 object = reset_tag(object);
443 pr_err("\n"); 467 pr_err("\n");
444 print_address_description(object); 468 print_address_description(object, tag);
445 pr_err("\n"); 469 pr_err("\n");
446 print_shadow_for_address(object); 470 print_shadow_for_address(object);
447 end_report(&flags); 471 end_report(&flags);
@@ -479,7 +503,7 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon
479 pr_err("\n"); 503 pr_err("\n");
480 504
481 if (addr_has_shadow(untagged_addr)) { 505 if (addr_has_shadow(untagged_addr)) {
482 print_address_description(untagged_addr); 506 print_address_description(untagged_addr, get_tag(tagged_addr));
483 pr_err("\n"); 507 pr_err("\n");
484 print_shadow_for_address(info.first_bad_addr); 508 print_shadow_for_address(info.first_bad_addr);
485 } else { 509 } else {
diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c
index 8eaf5f722271..969ae08f59d7 100644
--- a/mm/kasan/tags_report.c
+++ b/mm/kasan/tags_report.c
@@ -36,6 +36,30 @@
36 36
37const char *get_bug_type(struct kasan_access_info *info) 37const char *get_bug_type(struct kasan_access_info *info)
38{ 38{
39#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
40 struct kasan_alloc_meta *alloc_meta;
41 struct kmem_cache *cache;
42 struct page *page;
43 const void *addr;
44 void *object;
45 u8 tag;
46 int i;
47
48 tag = get_tag(info->access_addr);
49 addr = reset_tag(info->access_addr);
50 page = kasan_addr_to_page(addr);
51 if (page && PageSlab(page)) {
52 cache = page->slab_cache;
53 object = nearest_obj(cache, page, (void *)addr);
54 alloc_meta = get_alloc_info(cache, object);
55
56 for (i = 0; i < KASAN_NR_FREE_STACKS; i++)
57 if (alloc_meta->free_pointer_tag[i] == tag)
58 return "use-after-free";
59 return "out-of-bounds";
60 }
61
62#endif
39 return "invalid-access"; 63 return "invalid-access";
40} 64}
41 65
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ccede2425c3f..0a1b4b484ac5 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -48,6 +48,7 @@ enum scan_result {
48 SCAN_CGROUP_CHARGE_FAIL, 48 SCAN_CGROUP_CHARGE_FAIL,
49 SCAN_EXCEED_SWAP_PTE, 49 SCAN_EXCEED_SWAP_PTE,
50 SCAN_TRUNCATED, 50 SCAN_TRUNCATED,
51 SCAN_PAGE_HAS_PRIVATE,
51}; 52};
52 53
53#define CREATE_TRACE_POINTS 54#define CREATE_TRACE_POINTS
@@ -76,6 +77,8 @@ static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
76 77
77static struct kmem_cache *mm_slot_cache __read_mostly; 78static struct kmem_cache *mm_slot_cache __read_mostly;
78 79
80#define MAX_PTE_MAPPED_THP 8
81
79/** 82/**
80 * struct mm_slot - hash lookup from mm to mm_slot 83 * struct mm_slot - hash lookup from mm to mm_slot
81 * @hash: hash collision list 84 * @hash: hash collision list
@@ -86,6 +89,10 @@ struct mm_slot {
86 struct hlist_node hash; 89 struct hlist_node hash;
87 struct list_head mm_node; 90 struct list_head mm_node;
88 struct mm_struct *mm; 91 struct mm_struct *mm;
92
93 /* pte-mapped THP in this mm */
94 int nr_pte_mapped_thp;
95 unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
89}; 96};
90 97
91/** 98/**
@@ -404,7 +411,11 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
404 (vm_flags & VM_NOHUGEPAGE) || 411 (vm_flags & VM_NOHUGEPAGE) ||
405 test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) 412 test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
406 return false; 413 return false;
407 if (shmem_file(vma->vm_file)) { 414
415 if (shmem_file(vma->vm_file) ||
416 (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
417 vma->vm_file &&
418 (vm_flags & VM_DENYWRITE))) {
408 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) 419 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
409 return false; 420 return false;
410 return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, 421 return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
@@ -456,8 +467,9 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
456 unsigned long hstart, hend; 467 unsigned long hstart, hend;
457 468
458 /* 469 /*
459 * khugepaged does not yet work on non-shmem files or special 470 * khugepaged only supports read-only files for non-shmem files.
460 * mappings. And file-private shmem THP is not supported. 471 * khugepaged does not yet work on special mappings. And
472 * file-private shmem THP is not supported.
461 */ 473 */
462 if (!hugepage_vma_check(vma, vm_flags)) 474 if (!hugepage_vma_check(vma, vm_flags))
463 return 0; 475 return 0;
@@ -1248,6 +1260,159 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
1248} 1260}
1249 1261
1250#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) 1262#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
1263/*
1264 * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
1265 * khugepaged should try to collapse the page table.
1266 */
1267static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
1268 unsigned long addr)
1269{
1270 struct mm_slot *mm_slot;
1271
1272 VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
1273
1274 spin_lock(&khugepaged_mm_lock);
1275 mm_slot = get_mm_slot(mm);
1276 if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
1277 mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
1278 spin_unlock(&khugepaged_mm_lock);
1279 return 0;
1280}
1281
1282/**
1283 * Try to collapse a pte-mapped THP for mm at address haddr.
1284 *
1285 * This function checks whether all the PTEs in the PMD are pointing to the
 1286 * right THP. If so, retract the page table so the THP can refault in
1287 * as pmd-mapped.
1288 */
1289void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
1290{
1291 unsigned long haddr = addr & HPAGE_PMD_MASK;
1292 struct vm_area_struct *vma = find_vma(mm, haddr);
1293 struct page *hpage = NULL;
1294 pte_t *start_pte, *pte;
1295 pmd_t *pmd, _pmd;
1296 spinlock_t *ptl;
1297 int count = 0;
1298 int i;
1299
1300 if (!vma || !vma->vm_file ||
1301 vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
1302 return;
1303
1304 /*
1305 * This vm_flags may not have VM_HUGEPAGE if the page was not
1306 * collapsed by this mm. But we can still collapse if the page is
1307 * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
1308 * will not fail the vma for missing VM_HUGEPAGE
1309 */
1310 if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
1311 return;
1312
1313 pmd = mm_find_pmd(mm, haddr);
1314 if (!pmd)
1315 return;
1316
1317 start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
1318
1319 /* step 1: check all mapped PTEs are to the right huge page */
1320 for (i = 0, addr = haddr, pte = start_pte;
1321 i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
1322 struct page *page;
1323
1324 /* empty pte, skip */
1325 if (pte_none(*pte))
1326 continue;
1327
1328 /* page swapped out, abort */
1329 if (!pte_present(*pte))
1330 goto abort;
1331
1332 page = vm_normal_page(vma, addr, *pte);
1333
1334 if (!page || !PageCompound(page))
1335 goto abort;
1336
1337 if (!hpage) {
1338 hpage = compound_head(page);
1339 /*
1340 * The mapping of the THP should not change.
1341 *
1342 * Note that uprobe, debugger, or MAP_PRIVATE may
1343 * change the page table, but the new page will
1344 * not pass PageCompound() check.
1345 */
1346 if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping))
1347 goto abort;
1348 }
1349
1350 /*
1351 * Confirm the page maps to the correct subpage.
1352 *
1353 * Note that uprobe, debugger, or MAP_PRIVATE may change
1354 * the page table, but the new page will not pass
1355 * PageCompound() check.
1356 */
1357 if (WARN_ON(hpage + i != page))
1358 goto abort;
1359 count++;
1360 }
1361
1362 /* step 2: adjust rmap */
1363 for (i = 0, addr = haddr, pte = start_pte;
1364 i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
1365 struct page *page;
1366
1367 if (pte_none(*pte))
1368 continue;
1369 page = vm_normal_page(vma, addr, *pte);
1370 page_remove_rmap(page, false);
1371 }
1372
1373 pte_unmap_unlock(start_pte, ptl);
1374
1375 /* step 3: set proper refcount and mm_counters. */
1376 if (hpage) {
1377 page_ref_sub(hpage, count);
1378 add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
1379 }
1380
1381 /* step 4: collapse pmd */
1382 ptl = pmd_lock(vma->vm_mm, pmd);
1383 _pmd = pmdp_collapse_flush(vma, addr, pmd);
1384 spin_unlock(ptl);
1385 mm_dec_nr_ptes(mm);
1386 pte_free(mm, pmd_pgtable(_pmd));
1387 return;
1388
1389abort:
1390 pte_unmap_unlock(start_pte, ptl);
1391}
1392
1393static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
1394{
1395 struct mm_struct *mm = mm_slot->mm;
1396 int i;
1397
1398 if (likely(mm_slot->nr_pte_mapped_thp == 0))
1399 return 0;
1400
1401 if (!down_write_trylock(&mm->mmap_sem))
1402 return -EBUSY;
1403
1404 if (unlikely(khugepaged_test_exit(mm)))
1405 goto out;
1406
1407 for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
1408 collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]);
1409
1410out:
1411 mm_slot->nr_pte_mapped_thp = 0;
1412 up_write(&mm->mmap_sem);
1413 return 0;
1414}
1415
1251static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) 1416static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1252{ 1417{
1253 struct vm_area_struct *vma; 1418 struct vm_area_struct *vma;
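Tying the new khugepaged pieces together: when retract_page_tables() cannot take mmap_sem (see the hunk below), the address is parked in the mm_slot and collapsed later from khugepaged's own context. The lifecycle, condensed:

	/* retract_page_tables(), down_write_trylock() failed: */
	khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);

	/* khugepaged main loop, per mm_slot, with mmap_sem held for write: */
	khugepaged_collapse_pte_mapped_thps(mm_slot);
		/* -> collapse_pte_mapped_thp(mm, addr) for each recorded
		 *    address: verify every PTE maps the same THP, drop the
		 *    rmap and refcounts, then pmdp_collapse_flush() so the
		 *    next fault installs a huge PMD */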
@@ -1256,7 +1421,22 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1256 1421
1257 i_mmap_lock_write(mapping); 1422 i_mmap_lock_write(mapping);
1258 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { 1423 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1259 /* probably overkill */ 1424 /*
1425 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
1426 * got written to. These VMAs are likely not worth investing
1427 * down_write(mmap_sem) as PMD-mapping is likely to be split
1428 * later.
1429 *
 1430 * Note that vma->anon_vma check is racy: it can be set up after
1431 * the check but before we took mmap_sem by the fault path.
1432 * But page lock would prevent establishing any new ptes of the
1433 * page, so we are safe.
1434 *
1435 * An alternative would be drop the check, but check that page
1436 * table is clear before calling pmdp_collapse_flush() under
1437 * ptl. It has higher chance to recover THP for the VMA, but
1438 * has higher cost too.
1439 */
1260 if (vma->anon_vma) 1440 if (vma->anon_vma)
1261 continue; 1441 continue;
1262 addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 1442 addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -1269,9 +1449,10 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1269 continue; 1449 continue;
1270 /* 1450 /*
1271 * We need exclusive mmap_sem to retract page table. 1451 * We need exclusive mmap_sem to retract page table.
1272 * If trylock fails we would end up with pte-mapped THP after 1452 *
1273 * re-fault. Not ideal, but it's more important to not disturb 1453 * We use trylock due to lock inversion: we need to acquire
1274 * the system too much. 1454 * mmap_sem while holding page lock. Fault path does it in
1455 * reverse order. Trylock is a way to avoid deadlock.
1275 */ 1456 */
1276 if (down_write_trylock(&vma->vm_mm->mmap_sem)) { 1457 if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
1277 spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); 1458 spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
@@ -1281,18 +1462,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1281 up_write(&vma->vm_mm->mmap_sem); 1462 up_write(&vma->vm_mm->mmap_sem);
1282 mm_dec_nr_ptes(vma->vm_mm); 1463 mm_dec_nr_ptes(vma->vm_mm);
1283 pte_free(vma->vm_mm, pmd_pgtable(_pmd)); 1464 pte_free(vma->vm_mm, pmd_pgtable(_pmd));
1465 } else {
1466 /* Try again later */
1467 khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
1284 } 1468 }
1285 } 1469 }
1286 i_mmap_unlock_write(mapping); 1470 i_mmap_unlock_write(mapping);
1287} 1471}
1288 1472
1289/** 1473/**
1290 * collapse_shmem - collapse small tmpfs/shmem pages into huge one. 1474 * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
1291 * 1475 *
1292 * Basic scheme is simple, details are more complex: 1476 * Basic scheme is simple, details are more complex:
1293 * - allocate and lock a new huge page; 1477 * - allocate and lock a new huge page;
1294 * - scan page cache replacing old pages with the new one 1478 * - scan page cache replacing old pages with the new one
1295 * + swap in pages if necessary; 1479 * + swap/gup in pages if necessary;
1296 * + fill in gaps; 1480 * + fill in gaps;
1297 * + keep old pages around in case rollback is required; 1481 * + keep old pages around in case rollback is required;
1298 * - if replacing succeeds: 1482 * - if replacing succeeds:
@@ -1304,10 +1488,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1304 * + restore gaps in the page cache; 1488 * + restore gaps in the page cache;
1305 * + unlock and free huge page; 1489 * + unlock and free huge page;
1306 */ 1490 */
1307static void collapse_shmem(struct mm_struct *mm, 1491static void collapse_file(struct mm_struct *mm,
1308 struct address_space *mapping, pgoff_t start, 1492 struct file *file, pgoff_t start,
1309 struct page **hpage, int node) 1493 struct page **hpage, int node)
1310{ 1494{
1495 struct address_space *mapping = file->f_mapping;
1311 gfp_t gfp; 1496 gfp_t gfp;
1312 struct page *new_page; 1497 struct page *new_page;
1313 struct mem_cgroup *memcg; 1498 struct mem_cgroup *memcg;
@@ -1315,7 +1500,9 @@ static void collapse_shmem(struct mm_struct *mm,
1315 LIST_HEAD(pagelist); 1500 LIST_HEAD(pagelist);
1316 XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); 1501 XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
1317 int nr_none = 0, result = SCAN_SUCCEED; 1502 int nr_none = 0, result = SCAN_SUCCEED;
1503 bool is_shmem = shmem_file(file);
1318 1504
1505 VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
1319 VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); 1506 VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
1320 1507
1321 /* Only allocate from the target node */ 1508 /* Only allocate from the target node */
@@ -1347,7 +1534,8 @@ static void collapse_shmem(struct mm_struct *mm,
1347 } while (1); 1534 } while (1);
1348 1535
1349 __SetPageLocked(new_page); 1536 __SetPageLocked(new_page);
1350 __SetPageSwapBacked(new_page); 1537 if (is_shmem)
1538 __SetPageSwapBacked(new_page);
1351 new_page->index = start; 1539 new_page->index = start;
1352 new_page->mapping = mapping; 1540 new_page->mapping = mapping;
1353 1541
@@ -1362,41 +1550,75 @@ static void collapse_shmem(struct mm_struct *mm,
1362 struct page *page = xas_next(&xas); 1550 struct page *page = xas_next(&xas);
1363 1551
1364 VM_BUG_ON(index != xas.xa_index); 1552 VM_BUG_ON(index != xas.xa_index);
1365 if (!page) { 1553 if (is_shmem) {
1366 /* 1554 if (!page) {
1367 * Stop if extent has been truncated or hole-punched, 1555 /*
1368 * and is now completely empty. 1556 * Stop if extent has been truncated or
1369 */ 1557 * hole-punched, and is now completely
1370 if (index == start) { 1558 * empty.
1371 if (!xas_next_entry(&xas, end - 1)) { 1559 */
1372 result = SCAN_TRUNCATED; 1560 if (index == start) {
1561 if (!xas_next_entry(&xas, end - 1)) {
1562 result = SCAN_TRUNCATED;
1563 goto xa_locked;
1564 }
1565 xas_set(&xas, index);
1566 }
1567 if (!shmem_charge(mapping->host, 1)) {
1568 result = SCAN_FAIL;
1373 goto xa_locked; 1569 goto xa_locked;
1374 } 1570 }
1375 xas_set(&xas, index); 1571 xas_store(&xas, new_page);
1572 nr_none++;
1573 continue;
1376 } 1574 }
1377 if (!shmem_charge(mapping->host, 1)) { 1575
1378 result = SCAN_FAIL; 1576 if (xa_is_value(page) || !PageUptodate(page)) {
1577 xas_unlock_irq(&xas);
1578 /* swap in or instantiate fallocated page */
1579 if (shmem_getpage(mapping->host, index, &page,
1580 SGP_NOHUGE)) {
1581 result = SCAN_FAIL;
1582 goto xa_unlocked;
1583 }
1584 } else if (trylock_page(page)) {
1585 get_page(page);
1586 xas_unlock_irq(&xas);
1587 } else {
1588 result = SCAN_PAGE_LOCK;
1379 goto xa_locked; 1589 goto xa_locked;
1380 } 1590 }
1381 xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); 1591 } else { /* !is_shmem */
1382 nr_none++; 1592 if (!page || xa_is_value(page)) {
1383 continue; 1593 xas_unlock_irq(&xas);
1384 } 1594 page_cache_sync_readahead(mapping, &file->f_ra,
1385 1595 file, index,
1386 if (xa_is_value(page) || !PageUptodate(page)) { 1596 PAGE_SIZE);
1387 xas_unlock_irq(&xas); 1597 /* drain pagevecs to help isolate_lru_page() */
1388 /* swap in or instantiate fallocated page */ 1598 lru_add_drain();
1389 if (shmem_getpage(mapping->host, index, &page, 1599 page = find_lock_page(mapping, index);
1390 SGP_NOHUGE)) { 1600 if (unlikely(page == NULL)) {
1601 result = SCAN_FAIL;
1602 goto xa_unlocked;
1603 }
1604 } else if (!PageUptodate(page)) {
1605 xas_unlock_irq(&xas);
1606 wait_on_page_locked(page);
1607 if (!trylock_page(page)) {
1608 result = SCAN_PAGE_LOCK;
1609 goto xa_unlocked;
1610 }
1611 get_page(page);
1612 } else if (PageDirty(page)) {
1391 result = SCAN_FAIL; 1613 result = SCAN_FAIL;
1392 goto xa_unlocked; 1614 goto xa_locked;
1615 } else if (trylock_page(page)) {
1616 get_page(page);
1617 xas_unlock_irq(&xas);
1618 } else {
1619 result = SCAN_PAGE_LOCK;
1620 goto xa_locked;
1393 } 1621 }
1394 } else if (trylock_page(page)) {
1395 get_page(page);
1396 xas_unlock_irq(&xas);
1397 } else {
1398 result = SCAN_PAGE_LOCK;
1399 goto xa_locked;
1400 } 1622 }
1401 1623
1402 /* 1624 /*
@@ -1425,6 +1647,12 @@ static void collapse_shmem(struct mm_struct *mm,
1425 goto out_unlock; 1647 goto out_unlock;
1426 } 1648 }
1427 1649
1650 if (page_has_private(page) &&
1651 !try_to_release_page(page, GFP_KERNEL)) {
1652 result = SCAN_PAGE_HAS_PRIVATE;
1653 goto out_unlock;
1654 }
1655
1428 if (page_mapped(page)) 1656 if (page_mapped(page))
1429 unmap_mapping_pages(mapping, index, 1, false); 1657 unmap_mapping_pages(mapping, index, 1, false);
1430 1658
@@ -1454,7 +1682,7 @@ static void collapse_shmem(struct mm_struct *mm,
1454 list_add_tail(&page->lru, &pagelist); 1682 list_add_tail(&page->lru, &pagelist);
1455 1683
1456 /* Finally, replace with the new page. */ 1684 /* Finally, replace with the new page. */
1457 xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); 1685 xas_store(&xas, new_page);
1458 continue; 1686 continue;
1459out_unlock: 1687out_unlock:
1460 unlock_page(page); 1688 unlock_page(page);
@@ -1462,12 +1690,20 @@ out_unlock:
1462 goto xa_unlocked; 1690 goto xa_unlocked;
1463 } 1691 }
1464 1692
1465 __inc_node_page_state(new_page, NR_SHMEM_THPS); 1693 if (is_shmem)
1694 __inc_node_page_state(new_page, NR_SHMEM_THPS);
1695 else {
1696 __inc_node_page_state(new_page, NR_FILE_THPS);
1697 filemap_nr_thps_inc(mapping);
1698 }
1699
1466 if (nr_none) { 1700 if (nr_none) {
1467 struct zone *zone = page_zone(new_page); 1701 struct zone *zone = page_zone(new_page);
1468 1702
1469 __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none); 1703 __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
1470 __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none); 1704 if (is_shmem)
1705 __mod_node_page_state(zone->zone_pgdat,
1706 NR_SHMEM, nr_none);
1471 } 1707 }
1472 1708
1473xa_locked: 1709xa_locked:
@@ -1505,10 +1741,15 @@ xa_unlocked:
1505 1741
1506 SetPageUptodate(new_page); 1742 SetPageUptodate(new_page);
1507 page_ref_add(new_page, HPAGE_PMD_NR - 1); 1743 page_ref_add(new_page, HPAGE_PMD_NR - 1);
1508 set_page_dirty(new_page);
1509 mem_cgroup_commit_charge(new_page, memcg, false, true); 1744 mem_cgroup_commit_charge(new_page, memcg, false, true);
1745
1746 if (is_shmem) {
1747 set_page_dirty(new_page);
1748 lru_cache_add_anon(new_page);
1749 } else {
1750 lru_cache_add_file(new_page);
1751 }
1510 count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1); 1752 count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
1511 lru_cache_add_anon(new_page);
1512 1753
1513 /* 1754 /*
1514 * Remove pte page tables, so we can re-fault the page as huge. 1755 * Remove pte page tables, so we can re-fault the page as huge.
@@ -1523,7 +1764,9 @@ xa_unlocked:
1523 /* Something went wrong: roll back page cache changes */ 1764 /* Something went wrong: roll back page cache changes */
1524 xas_lock_irq(&xas); 1765 xas_lock_irq(&xas);
1525 mapping->nrpages -= nr_none; 1766 mapping->nrpages -= nr_none;
1526 shmem_uncharge(mapping->host, nr_none); 1767
1768 if (is_shmem)
1769 shmem_uncharge(mapping->host, nr_none);
1527 1770
1528 xas_set(&xas, start); 1771 xas_set(&xas, start);
1529 xas_for_each(&xas, page, end - 1) { 1772 xas_for_each(&xas, page, end - 1) {
@@ -1563,11 +1806,11 @@ out:
1563 /* TODO: tracepoints */ 1806 /* TODO: tracepoints */
1564} 1807}
1565 1808
1566static void khugepaged_scan_shmem(struct mm_struct *mm, 1809static void khugepaged_scan_file(struct mm_struct *mm,
1567 struct address_space *mapping, 1810 struct file *file, pgoff_t start, struct page **hpage)
1568 pgoff_t start, struct page **hpage)
1569{ 1811{
1570 struct page *page = NULL; 1812 struct page *page = NULL;
1813 struct address_space *mapping = file->f_mapping;
1571 XA_STATE(xas, &mapping->i_pages, start); 1814 XA_STATE(xas, &mapping->i_pages, start);
1572 int present, swap; 1815 int present, swap;
1573 int node = NUMA_NO_NODE; 1816 int node = NUMA_NO_NODE;
@@ -1606,7 +1849,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
1606 break; 1849 break;
1607 } 1850 }
1608 1851
1609 if (page_count(page) != 1 + page_mapcount(page)) { 1852 if (page_count(page) !=
1853 1 + page_mapcount(page) + page_has_private(page)) {
1610 result = SCAN_PAGE_COUNT; 1854 result = SCAN_PAGE_COUNT;
1611 break; 1855 break;
1612 } 1856 }
@@ -1631,19 +1875,23 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
1631 result = SCAN_EXCEED_NONE_PTE; 1875 result = SCAN_EXCEED_NONE_PTE;
1632 } else { 1876 } else {
1633 node = khugepaged_find_target_node(); 1877 node = khugepaged_find_target_node();
1634 collapse_shmem(mm, mapping, start, hpage, node); 1878 collapse_file(mm, file, start, hpage, node);
1635 } 1879 }
1636 } 1880 }
1637 1881
1638 /* TODO: tracepoints */ 1882 /* TODO: tracepoints */
1639} 1883}
1640#else 1884#else
1641static void khugepaged_scan_shmem(struct mm_struct *mm, 1885static void khugepaged_scan_file(struct mm_struct *mm,
1642 struct address_space *mapping, 1886 struct file *file, pgoff_t start, struct page **hpage)
1643 pgoff_t start, struct page **hpage)
1644{ 1887{
1645 BUILD_BUG(); 1888 BUILD_BUG();
1646} 1889}
1890
1891static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
1892{
1893 return 0;
1894}
1647#endif 1895#endif
1648 1896
1649static unsigned int khugepaged_scan_mm_slot(unsigned int pages, 1897static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
@@ -1668,6 +1916,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
1668 khugepaged_scan.mm_slot = mm_slot; 1916 khugepaged_scan.mm_slot = mm_slot;
1669 } 1917 }
1670 spin_unlock(&khugepaged_mm_lock); 1918 spin_unlock(&khugepaged_mm_lock);
1919 khugepaged_collapse_pte_mapped_thps(mm_slot);
1671 1920
1672 mm = mm_slot->mm; 1921 mm = mm_slot->mm;
1673 /* 1922 /*
@@ -1713,17 +1962,18 @@ skip:
1713 VM_BUG_ON(khugepaged_scan.address < hstart || 1962 VM_BUG_ON(khugepaged_scan.address < hstart ||
1714 khugepaged_scan.address + HPAGE_PMD_SIZE > 1963 khugepaged_scan.address + HPAGE_PMD_SIZE >
1715 hend); 1964 hend);
1716 if (shmem_file(vma->vm_file)) { 1965 if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
1717 struct file *file; 1966 struct file *file;
1718 pgoff_t pgoff = linear_page_index(vma, 1967 pgoff_t pgoff = linear_page_index(vma,
1719 khugepaged_scan.address); 1968 khugepaged_scan.address);
1720 if (!shmem_huge_enabled(vma)) 1969
1970 if (shmem_file(vma->vm_file)
1971 && !shmem_huge_enabled(vma))
1721 goto skip; 1972 goto skip;
1722 file = get_file(vma->vm_file); 1973 file = get_file(vma->vm_file);
1723 up_read(&mm->mmap_sem); 1974 up_read(&mm->mmap_sem);
1724 ret = 1; 1975 ret = 1;
1725 khugepaged_scan_shmem(mm, file->f_mapping, 1976 khugepaged_scan_file(mm, file, pgoff, hpage);
1726 pgoff, hpage);
1727 fput(file); 1977 fput(file);
1728 } else { 1978 } else {
1729 ret = khugepaged_scan_pmd(mm, vma, 1979 ret = khugepaged_scan_pmd(mm, vma,
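
Illustration (not part of the patch): the khugepaged_scan_file() hunk above widens the reference check so that fs-private data (e.g. buffer heads) counts as one expected reference on a file page. The following standalone userspace C sketch uses invented names — refs_fully_accounted() is not a kernel function — to show the accounting a collapse candidate must satisfy.

#include <stdbool.h>
#include <stdio.h>

/* refs_fully_accounted() is an invented helper, not a kernel function. */
static bool refs_fully_accounted(int page_count, int mapcount, bool has_private)
{
        /* page cache ref + page table mappings + optional fs-private ref */
        return page_count == 1 + mapcount + (has_private ? 1 : 0);
}

int main(void)
{
        /* clean page-cache page mapped by two tasks, no private data */
        printf("%d\n", refs_fully_accounted(3, 2, false));   /* 1: candidate */
        /* private data present but not reflected in the count: mismatch */
        printf("%d\n", refs_fully_accounted(3, 2, true));    /* 0: SCAN_PAGE_COUNT */
        /* an extra, unexplained reference also disqualifies the page */
        printf("%d\n", refs_fully_accounted(4, 2, false));   /* 0: SCAN_PAGE_COUNT */
        return 0;
}
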
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index f6e602918dac..03a8d84badad 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -168,6 +168,8 @@ struct kmemleak_object {
168#define OBJECT_REPORTED (1 << 1) 168#define OBJECT_REPORTED (1 << 1)
169/* flag set to not scan the object */ 169/* flag set to not scan the object */
170#define OBJECT_NO_SCAN (1 << 2) 170#define OBJECT_NO_SCAN (1 << 2)
171/* flag set to fully scan the object when scan_area allocation failed */
172#define OBJECT_FULL_SCAN (1 << 3)
171 173
172#define HEX_PREFIX " " 174#define HEX_PREFIX " "
173/* number of bytes to print per line; must be 16 or 32 */ 175/* number of bytes to print per line; must be 16 or 32 */
@@ -183,6 +185,10 @@ struct kmemleak_object {
183static LIST_HEAD(object_list); 185static LIST_HEAD(object_list);
184/* the list of gray-colored objects (see color_gray comment below) */ 186/* the list of gray-colored objects (see color_gray comment below) */
185static LIST_HEAD(gray_list); 187static LIST_HEAD(gray_list);
188/* memory pool allocation */
189static struct kmemleak_object mem_pool[CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE];
190static int mem_pool_free_count = ARRAY_SIZE(mem_pool);
191static LIST_HEAD(mem_pool_free_list);
186/* search tree for object boundaries */ 192/* search tree for object boundaries */
187static struct rb_root object_tree_root = RB_ROOT; 193static struct rb_root object_tree_root = RB_ROOT;
188/* rw_lock protecting the access to object_list and object_tree_root */ 194/* rw_lock protecting the access to object_list and object_tree_root */
@@ -193,13 +199,11 @@ static struct kmem_cache *object_cache;
193static struct kmem_cache *scan_area_cache; 199static struct kmem_cache *scan_area_cache;
194 200
195/* set if tracing memory operations is enabled */ 201/* set if tracing memory operations is enabled */
196static int kmemleak_enabled; 202static int kmemleak_enabled = 1;
197/* same as above but only for the kmemleak_free() callback */ 203/* same as above but only for the kmemleak_free() callback */
198static int kmemleak_free_enabled; 204static int kmemleak_free_enabled = 1;
199/* set in the late_initcall if there were no errors */ 205/* set in the late_initcall if there were no errors */
200static int kmemleak_initialized; 206static int kmemleak_initialized;
201/* enables or disables early logging of the memory operations */
202static int kmemleak_early_log = 1;
203/* set if a kmemleak warning was issued */ 207/* set if a kmemleak warning was issued */
204static int kmemleak_warning; 208static int kmemleak_warning;
205/* set if a fatal kmemleak error has occurred */ 209/* set if a fatal kmemleak error has occurred */
@@ -227,49 +231,6 @@ static bool kmemleak_found_leaks;
227static bool kmemleak_verbose; 231static bool kmemleak_verbose;
228module_param_named(verbose, kmemleak_verbose, bool, 0600); 232module_param_named(verbose, kmemleak_verbose, bool, 0600);
229 233
230/*
231 * Early object allocation/freeing logging. Kmemleak is initialized after the
232 * kernel allocator. However, both the kernel allocator and kmemleak may
233 * allocate memory blocks which need to be tracked. Kmemleak defines an
234 * arbitrary buffer to hold the allocation/freeing information before it is
235 * fully initialized.
236 */
237
238/* kmemleak operation type for early logging */
239enum {
240 KMEMLEAK_ALLOC,
241 KMEMLEAK_ALLOC_PERCPU,
242 KMEMLEAK_FREE,
243 KMEMLEAK_FREE_PART,
244 KMEMLEAK_FREE_PERCPU,
245 KMEMLEAK_NOT_LEAK,
246 KMEMLEAK_IGNORE,
247 KMEMLEAK_SCAN_AREA,
248 KMEMLEAK_NO_SCAN,
249 KMEMLEAK_SET_EXCESS_REF
250};
251
252/*
253 * Structure holding the information passed to kmemleak callbacks during the
254 * early logging.
255 */
256struct early_log {
257 int op_type; /* kmemleak operation type */
258 int min_count; /* minimum reference count */
259 const void *ptr; /* allocated/freed memory block */
260 union {
261 size_t size; /* memory block size */
262 unsigned long excess_ref; /* surplus reference passing */
263 };
264 unsigned long trace[MAX_TRACE]; /* stack trace */
265 unsigned int trace_len; /* stack trace length */
266};
267
268/* early logging buffer and current position */
269static struct early_log
270 early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
271static int crt_early_log __initdata;
272
273static void kmemleak_disable(void); 234static void kmemleak_disable(void);
274 235
275/* 236/*
@@ -450,6 +411,54 @@ static int get_object(struct kmemleak_object *object)
450} 411}
451 412
452/* 413/*
414 * Memory pool allocation and freeing. kmemleak_lock must not be held.
415 */
416static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
417{
418 unsigned long flags;
419 struct kmemleak_object *object;
420
421 /* try the slab allocator first */
422 if (object_cache) {
423 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
424 if (object)
425 return object;
426 }
427
428 /* slab allocation failed, try the memory pool */
429 write_lock_irqsave(&kmemleak_lock, flags);
430 object = list_first_entry_or_null(&mem_pool_free_list,
431 typeof(*object), object_list);
432 if (object)
433 list_del(&object->object_list);
434 else if (mem_pool_free_count)
435 object = &mem_pool[--mem_pool_free_count];
436 else
437 pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
438 write_unlock_irqrestore(&kmemleak_lock, flags);
439
440 return object;
441}
442
443/*
444 * Return the object to either the slab allocator or the memory pool.
445 */
446static void mem_pool_free(struct kmemleak_object *object)
447{
448 unsigned long flags;
449
450 if (object < mem_pool || object >= mem_pool + ARRAY_SIZE(mem_pool)) {
451 kmem_cache_free(object_cache, object);
452 return;
453 }
454
455 /* add the object to the memory pool free list */
456 write_lock_irqsave(&kmemleak_lock, flags);
457 list_add(&object->object_list, &mem_pool_free_list);
458 write_unlock_irqrestore(&kmemleak_lock, flags);
459}
460
461/*
453 * RCU callback to free a kmemleak_object. 462 * RCU callback to free a kmemleak_object.
454 */ 463 */
455static void free_object_rcu(struct rcu_head *rcu) 464static void free_object_rcu(struct rcu_head *rcu)
@@ -467,7 +476,7 @@ static void free_object_rcu(struct rcu_head *rcu)
467 hlist_del(&area->node); 476 hlist_del(&area->node);
468 kmem_cache_free(scan_area_cache, area); 477 kmem_cache_free(scan_area_cache, area);
469 } 478 }
470 kmem_cache_free(object_cache, object); 479 mem_pool_free(object);
471} 480}
472 481
473/* 482/*
@@ -485,7 +494,15 @@ static void put_object(struct kmemleak_object *object)
485 /* should only get here after delete_object was called */ 494 /* should only get here after delete_object was called */
486 WARN_ON(object->flags & OBJECT_ALLOCATED); 495 WARN_ON(object->flags & OBJECT_ALLOCATED);
487 496
488 call_rcu(&object->rcu, free_object_rcu); 497 /*
498 * It may be too early for the RCU callbacks, however, there is no
499 * concurrent object_list traversal when !object_cache and all objects
500 * came from the memory pool. Free the object directly.
501 */
502 if (object_cache)
503 call_rcu(&object->rcu, free_object_rcu);
504 else
505 free_object_rcu(&object->rcu);
489} 506}
490 507
491/* 508/*
@@ -550,7 +567,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
550 struct rb_node **link, *rb_parent; 567 struct rb_node **link, *rb_parent;
551 unsigned long untagged_ptr; 568 unsigned long untagged_ptr;
552 569
553 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); 570 object = mem_pool_alloc(gfp);
554 if (!object) { 571 if (!object) {
555 pr_warn("Cannot allocate a kmemleak_object structure\n"); 572 pr_warn("Cannot allocate a kmemleak_object structure\n");
556 kmemleak_disable(); 573 kmemleak_disable();
@@ -689,9 +706,7 @@ static void delete_object_part(unsigned long ptr, size_t size)
689 /* 706 /*
690 * Create one or two objects that may result from the memory block 707 * Create one or two objects that may result from the memory block
691 * split. Note that partial freeing is only done by free_bootmem() and 708 * split. Note that partial freeing is only done by free_bootmem() and
692 * this happens before kmemleak_init() is called. The path below is 709 * this happens before kmemleak_init() is called.
693 * only executed during early log recording in kmemleak_init(), so
694 * GFP_KERNEL is enough.
695 */ 710 */
696 start = object->pointer; 711 start = object->pointer;
697 end = object->pointer + object->size; 712 end = object->pointer + object->size;
@@ -763,7 +778,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
763{ 778{
764 unsigned long flags; 779 unsigned long flags;
765 struct kmemleak_object *object; 780 struct kmemleak_object *object;
766 struct kmemleak_scan_area *area; 781 struct kmemleak_scan_area *area = NULL;
767 782
768 object = find_and_get_object(ptr, 1); 783 object = find_and_get_object(ptr, 1);
769 if (!object) { 784 if (!object) {
@@ -772,13 +787,16 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
772 return; 787 return;
773 } 788 }
774 789
775 area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); 790 if (scan_area_cache)
776 if (!area) { 791 area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
777 pr_warn("Cannot allocate a scan area\n");
778 goto out;
779 }
780 792
781 spin_lock_irqsave(&object->lock, flags); 793 spin_lock_irqsave(&object->lock, flags);
794 if (!area) {
795 pr_warn_once("Cannot allocate a scan area, scanning the full object\n");
796 /* mark the object for full scan to avoid false positives */
797 object->flags |= OBJECT_FULL_SCAN;
798 goto out_unlock;
799 }
782 if (size == SIZE_MAX) { 800 if (size == SIZE_MAX) {
783 size = object->pointer + object->size - ptr; 801 size = object->pointer + object->size - ptr;
784 } else if (ptr + size > object->pointer + object->size) { 802 } else if (ptr + size > object->pointer + object->size) {
@@ -795,7 +813,6 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
795 hlist_add_head(&area->node, &object->area_list); 813 hlist_add_head(&area->node, &object->area_list);
796out_unlock: 814out_unlock:
797 spin_unlock_irqrestore(&object->lock, flags); 815 spin_unlock_irqrestore(&object->lock, flags);
798out:
799 put_object(object); 816 put_object(object);
800} 817}
801 818
@@ -845,86 +862,6 @@ static void object_no_scan(unsigned long ptr)
845 put_object(object); 862 put_object(object);
846} 863}
847 864
848/*
849 * Log an early kmemleak_* call to the early_log buffer. These calls will be
850 * processed later once kmemleak is fully initialized.
851 */
852static void __init log_early(int op_type, const void *ptr, size_t size,
853 int min_count)
854{
855 unsigned long flags;
856 struct early_log *log;
857
858 if (kmemleak_error) {
859 /* kmemleak stopped recording, just count the requests */
860 crt_early_log++;
861 return;
862 }
863
864 if (crt_early_log >= ARRAY_SIZE(early_log)) {
865 crt_early_log++;
866 kmemleak_disable();
867 return;
868 }
869
870 /*
871 * There is no need for locking since the kernel is still in UP mode
872 * at this stage. Disabling the IRQs is enough.
873 */
874 local_irq_save(flags);
875 log = &early_log[crt_early_log];
876 log->op_type = op_type;
877 log->ptr = ptr;
878 log->size = size;
879 log->min_count = min_count;
880 log->trace_len = __save_stack_trace(log->trace);
881 crt_early_log++;
882 local_irq_restore(flags);
883}
884
885/*
886 * Log an early allocated block and populate the stack trace.
887 */
888static void early_alloc(struct early_log *log)
889{
890 struct kmemleak_object *object;
891 unsigned long flags;
892 int i;
893
894 if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr))
895 return;
896
897 /*
898 * RCU locking needed to ensure object is not freed via put_object().
899 */
900 rcu_read_lock();
901 object = create_object((unsigned long)log->ptr, log->size,
902 log->min_count, GFP_ATOMIC);
903 if (!object)
904 goto out;
905 spin_lock_irqsave(&object->lock, flags);
906 for (i = 0; i < log->trace_len; i++)
907 object->trace[i] = log->trace[i];
908 object->trace_len = log->trace_len;
909 spin_unlock_irqrestore(&object->lock, flags);
910out:
911 rcu_read_unlock();
912}
913
914/*
915 * Log an early allocated block and populate the stack trace.
916 */
917static void early_alloc_percpu(struct early_log *log)
918{
919 unsigned int cpu;
920 const void __percpu *ptr = log->ptr;
921
922 for_each_possible_cpu(cpu) {
923 log->ptr = per_cpu_ptr(ptr, cpu);
924 early_alloc(log);
925 }
926}
927
928/** 865/**
929 * kmemleak_alloc - register a newly allocated object 866 * kmemleak_alloc - register a newly allocated object
930 * @ptr: pointer to beginning of the object 867 * @ptr: pointer to beginning of the object
@@ -946,8 +883,6 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
946 883
947 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 884 if (kmemleak_enabled && ptr && !IS_ERR(ptr))
948 create_object((unsigned long)ptr, size, min_count, gfp); 885 create_object((unsigned long)ptr, size, min_count, gfp);
949 else if (kmemleak_early_log)
950 log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
951} 886}
952EXPORT_SYMBOL_GPL(kmemleak_alloc); 887EXPORT_SYMBOL_GPL(kmemleak_alloc);
953 888
@@ -975,8 +910,6 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
975 for_each_possible_cpu(cpu) 910 for_each_possible_cpu(cpu)
976 create_object((unsigned long)per_cpu_ptr(ptr, cpu), 911 create_object((unsigned long)per_cpu_ptr(ptr, cpu),
977 size, 0, gfp); 912 size, 0, gfp);
978 else if (kmemleak_early_log)
979 log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
980} 913}
981EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); 914EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
982 915
@@ -1001,11 +934,6 @@ void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp
1001 create_object((unsigned long)area->addr, size, 2, gfp); 934 create_object((unsigned long)area->addr, size, 2, gfp);
1002 object_set_excess_ref((unsigned long)area, 935 object_set_excess_ref((unsigned long)area,
1003 (unsigned long)area->addr); 936 (unsigned long)area->addr);
1004 } else if (kmemleak_early_log) {
1005 log_early(KMEMLEAK_ALLOC, area->addr, size, 2);
1006 /* reusing early_log.size for storing area->addr */
1007 log_early(KMEMLEAK_SET_EXCESS_REF,
1008 area, (unsigned long)area->addr, 0);
1009 } 937 }
1010} 938}
1011EXPORT_SYMBOL_GPL(kmemleak_vmalloc); 939EXPORT_SYMBOL_GPL(kmemleak_vmalloc);
@@ -1023,8 +951,6 @@ void __ref kmemleak_free(const void *ptr)
1023 951
1024 if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) 952 if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
1025 delete_object_full((unsigned long)ptr); 953 delete_object_full((unsigned long)ptr);
1026 else if (kmemleak_early_log)
1027 log_early(KMEMLEAK_FREE, ptr, 0, 0);
1028} 954}
1029EXPORT_SYMBOL_GPL(kmemleak_free); 955EXPORT_SYMBOL_GPL(kmemleak_free);
1030 956
@@ -1043,8 +969,6 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
1043 969
1044 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 970 if (kmemleak_enabled && ptr && !IS_ERR(ptr))
1045 delete_object_part((unsigned long)ptr, size); 971 delete_object_part((unsigned long)ptr, size);
1046 else if (kmemleak_early_log)
1047 log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
1048} 972}
1049EXPORT_SYMBOL_GPL(kmemleak_free_part); 973EXPORT_SYMBOL_GPL(kmemleak_free_part);
1050 974
@@ -1065,8 +989,6 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr)
1065 for_each_possible_cpu(cpu) 989 for_each_possible_cpu(cpu)
1066 delete_object_full((unsigned long)per_cpu_ptr(ptr, 990 delete_object_full((unsigned long)per_cpu_ptr(ptr,
1067 cpu)); 991 cpu));
1068 else if (kmemleak_early_log)
1069 log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0);
1070} 992}
1071EXPORT_SYMBOL_GPL(kmemleak_free_percpu); 993EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
1072 994
@@ -1117,8 +1039,6 @@ void __ref kmemleak_not_leak(const void *ptr)
1117 1039
1118 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 1040 if (kmemleak_enabled && ptr && !IS_ERR(ptr))
1119 make_gray_object((unsigned long)ptr); 1041 make_gray_object((unsigned long)ptr);
1120 else if (kmemleak_early_log)
1121 log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
1122} 1042}
1123EXPORT_SYMBOL(kmemleak_not_leak); 1043EXPORT_SYMBOL(kmemleak_not_leak);
1124 1044
@@ -1137,8 +1057,6 @@ void __ref kmemleak_ignore(const void *ptr)
1137 1057
1138 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 1058 if (kmemleak_enabled && ptr && !IS_ERR(ptr))
1139 make_black_object((unsigned long)ptr); 1059 make_black_object((unsigned long)ptr);
1140 else if (kmemleak_early_log)
1141 log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
1142} 1060}
1143EXPORT_SYMBOL(kmemleak_ignore); 1061EXPORT_SYMBOL(kmemleak_ignore);
1144 1062
@@ -1159,8 +1077,6 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
1159 1077
1160 if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) 1078 if (kmemleak_enabled && ptr && size && !IS_ERR(ptr))
1161 add_scan_area((unsigned long)ptr, size, gfp); 1079 add_scan_area((unsigned long)ptr, size, gfp);
1162 else if (kmemleak_early_log)
1163 log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
1164} 1080}
1165EXPORT_SYMBOL(kmemleak_scan_area); 1081EXPORT_SYMBOL(kmemleak_scan_area);
1166 1082
@@ -1179,8 +1095,6 @@ void __ref kmemleak_no_scan(const void *ptr)
1179 1095
1180 if (kmemleak_enabled && ptr && !IS_ERR(ptr)) 1096 if (kmemleak_enabled && ptr && !IS_ERR(ptr))
1181 object_no_scan((unsigned long)ptr); 1097 object_no_scan((unsigned long)ptr);
1182 else if (kmemleak_early_log)
1183 log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
1184} 1098}
1185EXPORT_SYMBOL(kmemleak_no_scan); 1099EXPORT_SYMBOL(kmemleak_no_scan);
1186 1100
@@ -1408,7 +1322,8 @@ static void scan_object(struct kmemleak_object *object)
1408 if (!(object->flags & OBJECT_ALLOCATED)) 1322 if (!(object->flags & OBJECT_ALLOCATED))
1409 /* already freed object */ 1323 /* already freed object */
1410 goto out; 1324 goto out;
1411 if (hlist_empty(&object->area_list)) { 1325 if (hlist_empty(&object->area_list) ||
1326 object->flags & OBJECT_FULL_SCAN) {
1412 void *start = (void *)object->pointer; 1327 void *start = (void *)object->pointer;
1413 void *end = (void *)(object->pointer + object->size); 1328 void *end = (void *)(object->pointer + object->size);
1414 void *next; 1329 void *next;
@@ -1966,7 +1881,6 @@ static void kmemleak_disable(void)
1966 1881
1967 /* stop any memory operation tracing */ 1882 /* stop any memory operation tracing */
1968 kmemleak_enabled = 0; 1883 kmemleak_enabled = 0;
1969 kmemleak_early_log = 0;
1970 1884
1971 /* check whether it is too early for a kernel thread */ 1885 /* check whether it is too early for a kernel thread */
1972 if (kmemleak_initialized) 1886 if (kmemleak_initialized)
@@ -1994,20 +1908,11 @@ static int __init kmemleak_boot_config(char *str)
1994} 1908}
1995early_param("kmemleak", kmemleak_boot_config); 1909early_param("kmemleak", kmemleak_boot_config);
1996 1910
1997static void __init print_log_trace(struct early_log *log)
1998{
1999 pr_notice("Early log backtrace:\n");
2000 stack_trace_print(log->trace, log->trace_len, 2);
2001}
2002
2003/* 1911/*
2004 * Kmemleak initialization. 1912 * Kmemleak initialization.
2005 */ 1913 */
2006void __init kmemleak_init(void) 1914void __init kmemleak_init(void)
2007{ 1915{
2008 int i;
2009 unsigned long flags;
2010
2011#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF 1916#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
2012 if (!kmemleak_skip_disable) { 1917 if (!kmemleak_skip_disable) {
2013 kmemleak_disable(); 1918 kmemleak_disable();
@@ -2015,28 +1920,15 @@ void __init kmemleak_init(void)
2015 } 1920 }
2016#endif 1921#endif
2017 1922
1923 if (kmemleak_error)
1924 return;
1925
2018 jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); 1926 jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
2019 jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); 1927 jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
2020 1928
2021 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); 1929 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
2022 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); 1930 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
2023 1931
2024 if (crt_early_log > ARRAY_SIZE(early_log))
2025 pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n",
2026 crt_early_log);
2027
2028 /* the kernel is still in UP mode, so disabling the IRQs is enough */
2029 local_irq_save(flags);
2030 kmemleak_early_log = 0;
2031 if (kmemleak_error) {
2032 local_irq_restore(flags);
2033 return;
2034 } else {
2035 kmemleak_enabled = 1;
2036 kmemleak_free_enabled = 1;
2037 }
2038 local_irq_restore(flags);
2039
2040 /* register the data/bss sections */ 1932 /* register the data/bss sections */
2041 create_object((unsigned long)_sdata, _edata - _sdata, 1933 create_object((unsigned long)_sdata, _edata - _sdata,
2042 KMEMLEAK_GREY, GFP_ATOMIC); 1934 KMEMLEAK_GREY, GFP_ATOMIC);
@@ -2047,57 +1939,6 @@ void __init kmemleak_init(void)
2047 create_object((unsigned long)__start_ro_after_init, 1939 create_object((unsigned long)__start_ro_after_init,
2048 __end_ro_after_init - __start_ro_after_init, 1940 __end_ro_after_init - __start_ro_after_init,
2049 KMEMLEAK_GREY, GFP_ATOMIC); 1941 KMEMLEAK_GREY, GFP_ATOMIC);
2050
2051 /*
2052 * This is the point where tracking allocations is safe. Automatic
2053 * scanning is started during the late initcall. Add the early logged
2054 * callbacks to the kmemleak infrastructure.
2055 */
2056 for (i = 0; i < crt_early_log; i++) {
2057 struct early_log *log = &early_log[i];
2058
2059 switch (log->op_type) {
2060 case KMEMLEAK_ALLOC:
2061 early_alloc(log);
2062 break;
2063 case KMEMLEAK_ALLOC_PERCPU:
2064 early_alloc_percpu(log);
2065 break;
2066 case KMEMLEAK_FREE:
2067 kmemleak_free(log->ptr);
2068 break;
2069 case KMEMLEAK_FREE_PART:
2070 kmemleak_free_part(log->ptr, log->size);
2071 break;
2072 case KMEMLEAK_FREE_PERCPU:
2073 kmemleak_free_percpu(log->ptr);
2074 break;
2075 case KMEMLEAK_NOT_LEAK:
2076 kmemleak_not_leak(log->ptr);
2077 break;
2078 case KMEMLEAK_IGNORE:
2079 kmemleak_ignore(log->ptr);
2080 break;
2081 case KMEMLEAK_SCAN_AREA:
2082 kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
2083 break;
2084 case KMEMLEAK_NO_SCAN:
2085 kmemleak_no_scan(log->ptr);
2086 break;
2087 case KMEMLEAK_SET_EXCESS_REF:
2088 object_set_excess_ref((unsigned long)log->ptr,
2089 log->excess_ref);
2090 break;
2091 default:
2092 kmemleak_warn("Unknown early log operation: %d\n",
2093 log->op_type);
2094 }
2095
2096 if (kmemleak_warning) {
2097 print_log_trace(log);
2098 kmemleak_warning = 0;
2099 }
2100 }
2101} 1942}
2102 1943
2103/* 1944/*
@@ -2126,7 +1967,8 @@ static int __init kmemleak_late_init(void)
2126 mutex_unlock(&scan_mutex); 1967 mutex_unlock(&scan_mutex);
2127 } 1968 }
2128 1969
2129 pr_info("Kernel memory leak detector initialized\n"); 1970 pr_info("Kernel memory leak detector initialized (mem pool available: %d)\n",
1971 mem_pool_free_count);
2130 1972
2131 return 0; 1973 return 0;
2132} 1974}
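
Illustration (not part of the patch): the kmemleak changes above replace the early-log buffer with a static memory pool that backs kmemleak_object allocation before, or whenever, the slab cache is unavailable. Below is a hedged userspace sketch of that strategy — try the regular allocator first, then hand out entries from a fixed pool and recycle them through a free list. The struct and function names are invented for the example; use_heap stands in for "object_cache is available".

#include <stdio.h>
#include <stdlib.h>

#define POOL_SIZE 4

struct obj { struct obj *next; int data; };

static struct obj pool[POOL_SIZE];
static int pool_free_count = POOL_SIZE;
static struct obj *pool_free_list;

static struct obj *obj_alloc(int use_heap)
{
        struct obj *o;

        if (use_heap) {                        /* the "slab" path */
                o = malloc(sizeof(*o));
                if (o)
                        return o;
        }
        if (pool_free_list) {                  /* recycled pool entries first */
                o = pool_free_list;
                pool_free_list = o->next;
                return o;
        }
        if (pool_free_count)                   /* then untouched pool slots */
                return &pool[--pool_free_count];
        fprintf(stderr, "pool empty, would disable tracking\n");
        return NULL;
}

static void obj_free(struct obj *o)
{
        /* same range test the kernel uses to tell pool entries from slab ones */
        if (o < pool || o >= pool + POOL_SIZE) {
                free(o);
                return;
        }
        o->next = pool_free_list;              /* return it to the pool free list */
        pool_free_list = o;
}

int main(void)
{
        struct obj *early = obj_alloc(0);      /* before the "slab" exists */
        struct obj *late  = obj_alloc(1);      /* normal path */

        obj_free(early);
        obj_free(late);
        return 0;
}
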
diff --git a/mm/ksm.c b/mm/ksm.c
index 3dc4346411e4..dbee2eb4dd05 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1029,24 +1029,6 @@ static u32 calc_checksum(struct page *page)
1029 return checksum; 1029 return checksum;
1030} 1030}
1031 1031
1032static int memcmp_pages(struct page *page1, struct page *page2)
1033{
1034 char *addr1, *addr2;
1035 int ret;
1036
1037 addr1 = kmap_atomic(page1);
1038 addr2 = kmap_atomic(page2);
1039 ret = memcmp(addr1, addr2, PAGE_SIZE);
1040 kunmap_atomic(addr2);
1041 kunmap_atomic(addr1);
1042 return ret;
1043}
1044
1045static inline int pages_identical(struct page *page1, struct page *page2)
1046{
1047 return !memcmp_pages(page1, page2);
1048}
1049
1050static int write_protect_page(struct vm_area_struct *vma, struct page *page, 1032static int write_protect_page(struct vm_area_struct *vma, struct page *page,
1051 pte_t *orig_pte) 1033 pte_t *orig_pte)
1052{ 1034{
diff --git a/mm/madvise.c b/mm/madvise.c
index 88babcc384b9..68ab988ad433 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -107,28 +107,14 @@ static long madvise_behavior(struct vm_area_struct *vma,
107 case MADV_MERGEABLE: 107 case MADV_MERGEABLE:
108 case MADV_UNMERGEABLE: 108 case MADV_UNMERGEABLE:
109 error = ksm_madvise(vma, start, end, behavior, &new_flags); 109 error = ksm_madvise(vma, start, end, behavior, &new_flags);
110 if (error) { 110 if (error)
111 /* 111 goto out_convert_errno;
112 * madvise() returns EAGAIN if kernel resources, such as
113 * slab, are temporarily unavailable.
114 */
115 if (error == -ENOMEM)
116 error = -EAGAIN;
117 goto out;
118 }
119 break; 112 break;
120 case MADV_HUGEPAGE: 113 case MADV_HUGEPAGE:
121 case MADV_NOHUGEPAGE: 114 case MADV_NOHUGEPAGE:
122 error = hugepage_madvise(vma, &new_flags, behavior); 115 error = hugepage_madvise(vma, &new_flags, behavior);
123 if (error) { 116 if (error)
124 /* 117 goto out_convert_errno;
125 * madvise() returns EAGAIN if kernel resources, such as
126 * slab, are temporarily unavailable.
127 */
128 if (error == -ENOMEM)
129 error = -EAGAIN;
130 goto out;
131 }
132 break; 118 break;
133 } 119 }
134 120
@@ -154,15 +140,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
154 goto out; 140 goto out;
155 } 141 }
156 error = __split_vma(mm, vma, start, 1); 142 error = __split_vma(mm, vma, start, 1);
157 if (error) { 143 if (error)
158 /* 144 goto out_convert_errno;
159 * madvise() returns EAGAIN if kernel resources, such as
160 * slab, are temporarily unavailable.
161 */
162 if (error == -ENOMEM)
163 error = -EAGAIN;
164 goto out;
165 }
166 } 145 }
167 146
168 if (end != vma->vm_end) { 147 if (end != vma->vm_end) {
@@ -171,15 +150,8 @@ static long madvise_behavior(struct vm_area_struct *vma,
171 goto out; 150 goto out;
172 } 151 }
173 error = __split_vma(mm, vma, end, 0); 152 error = __split_vma(mm, vma, end, 0);
174 if (error) { 153 if (error)
175 /* 154 goto out_convert_errno;
176 * madvise() returns EAGAIN if kernel resources, such as
177 * slab, are temporarily unavailable.
178 */
179 if (error == -ENOMEM)
180 error = -EAGAIN;
181 goto out;
182 }
183 } 155 }
184 156
185success: 157success:
@@ -187,6 +159,14 @@ success:
187 * vm_flags is protected by the mmap_sem held in write mode. 159 * vm_flags is protected by the mmap_sem held in write mode.
188 */ 160 */
189 vma->vm_flags = new_flags; 161 vma->vm_flags = new_flags;
162
163out_convert_errno:
164 /*
165 * madvise() returns EAGAIN if kernel resources, such as
166 * slab, are temporarily unavailable.
167 */
168 if (error == -ENOMEM)
169 error = -EAGAIN;
190out: 170out:
191 return error; 171 return error;
192} 172}
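
Illustration (not part of the patch): the madvise_behavior() refactor above funnels every failure path through one label that converts -ENOMEM to -EAGAIN instead of repeating the conversion at each call site. A minimal sketch of the same pattern, with made-up step1()/step2() helpers standing in for ksm_madvise(), hugepage_madvise() and __split_vma():

#include <errno.h>
#include <stdio.h>

/* step1() and step2() are invented helpers for this example. */
static int step1(void) { return -ENOMEM; }
static int step2(void) { return 0; }

static int do_behavior(void)
{
        int error;

        error = step1();
        if (error)
                goto out_convert_errno;
        error = step2();
        if (error)
                goto out_convert_errno;
        return 0;

out_convert_errno:
        /*
         * Report EAGAIN when kernel resources, such as slab, are only
         * temporarily unavailable.
         */
        if (error == -ENOMEM)
                error = -EAGAIN;
        return error;
}

int main(void)
{
        printf("%d\n", do_behavior());  /* -EAGAIN, i.e. -11 on Linux */
        return 0;
}
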
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f3c15bb07cce..2156ef775d04 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -57,6 +57,7 @@
57#include <linux/lockdep.h> 57#include <linux/lockdep.h>
58#include <linux/file.h> 58#include <linux/file.h>
59#include <linux/tracehook.h> 59#include <linux/tracehook.h>
60#include <linux/psi.h>
60#include <linux/seq_buf.h> 61#include <linux/seq_buf.h>
61#include "internal.h" 62#include "internal.h"
62#include <net/sock.h> 63#include <net/sock.h>
@@ -317,6 +318,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
317EXPORT_SYMBOL(memcg_kmem_enabled_key); 318EXPORT_SYMBOL(memcg_kmem_enabled_key);
318 319
319struct workqueue_struct *memcg_kmem_cache_wq; 320struct workqueue_struct *memcg_kmem_cache_wq;
321#endif
320 322
321static int memcg_shrinker_map_size; 323static int memcg_shrinker_map_size;
322static DEFINE_MUTEX(memcg_shrinker_map_mutex); 324static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -440,14 +442,6 @@ void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
440 } 442 }
441} 443}
442 444
443#else /* CONFIG_MEMCG_KMEM */
444static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
445{
446 return 0;
447}
448static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
449#endif /* CONFIG_MEMCG_KMEM */
450
451/** 445/**
452 * mem_cgroup_css_from_page - css of the memcg associated with a page 446 * mem_cgroup_css_from_page - css of the memcg associated with a page
453 * @page: page of interest 447 * @page: page of interest
@@ -2270,21 +2264,22 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
2270 for_each_online_cpu(cpu) { 2264 for_each_online_cpu(cpu) {
2271 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2265 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2272 struct mem_cgroup *memcg; 2266 struct mem_cgroup *memcg;
2267 bool flush = false;
2273 2268
2269 rcu_read_lock();
2274 memcg = stock->cached; 2270 memcg = stock->cached;
2275 if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css)) 2271 if (memcg && stock->nr_pages &&
2276 continue; 2272 mem_cgroup_is_descendant(memcg, root_memcg))
2277 if (!mem_cgroup_is_descendant(memcg, root_memcg)) { 2273 flush = true;
2278 css_put(&memcg->css); 2274 rcu_read_unlock();
2279 continue; 2275
2280 } 2276 if (flush &&
2281 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2277 !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2282 if (cpu == curcpu) 2278 if (cpu == curcpu)
2283 drain_local_stock(&stock->work); 2279 drain_local_stock(&stock->work);
2284 else 2280 else
2285 schedule_work_on(cpu, &stock->work); 2281 schedule_work_on(cpu, &stock->work);
2286 } 2282 }
2287 css_put(&memcg->css);
2288 } 2283 }
2289 put_cpu(); 2284 put_cpu();
2290 mutex_unlock(&percpu_charge_mutex); 2285 mutex_unlock(&percpu_charge_mutex);
@@ -2359,11 +2354,67 @@ static void high_work_func(struct work_struct *work)
2359} 2354}
2360 2355
2361/* 2356/*
2357 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
2358 * enough to still cause a significant slowdown in most cases, while still
2359 * allowing diagnostics and tracing to proceed without becoming stuck.
2360 */
2361#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2362
2363/*
2364 * When calculating the delay, we use these either side of the exponentiation to
2365 * maintain precision and scale to a reasonable number of jiffies (see the table
 2366 * below).
2367 *
2368 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
2369 * overage ratio to a delay.
 2370 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
2371 * proposed penalty in order to reduce to a reasonable number of jiffies, and
2372 * to produce a reasonable delay curve.
2373 *
2374 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
2375 * reasonable delay curve compared to precision-adjusted overage, not
2376 * penalising heavily at first, but still making sure that growth beyond the
2377 * limit penalises misbehaviour cgroups by slowing them down exponentially. For
2378 * example, with a high of 100 megabytes:
2379 *
2380 * +-------+------------------------+
2381 * | usage | time to allocate in ms |
2382 * +-------+------------------------+
2383 * | 100M | 0 |
2384 * | 101M | 6 |
2385 * | 102M | 25 |
2386 * | 103M | 57 |
2387 * | 104M | 102 |
2388 * | 105M | 159 |
2389 * | 106M | 230 |
2390 * | 107M | 313 |
2391 * | 108M | 409 |
2392 * | 109M | 518 |
2393 * | 110M | 639 |
2394 * | 111M | 774 |
2395 * | 112M | 921 |
2396 * | 113M | 1081 |
2397 * | 114M | 1254 |
2398 * | 115M | 1439 |
2399 * | 116M | 1638 |
2400 * | 117M | 1849 |
2401 * | 118M | 2000 |
2402 * | 119M | 2000 |
2403 * | 120M | 2000 |
2404 * +-------+------------------------+
2405 */
2406 #define MEMCG_DELAY_PRECISION_SHIFT 20
2407 #define MEMCG_DELAY_SCALING_SHIFT 14
2408
2409/*
2362 * Scheduled by try_charge() to be executed from the userland return path 2410 * Scheduled by try_charge() to be executed from the userland return path
2363 * and reclaims memory over the high limit. 2411 * and reclaims memory over the high limit.
2364 */ 2412 */
2365void mem_cgroup_handle_over_high(void) 2413void mem_cgroup_handle_over_high(void)
2366{ 2414{
2415 unsigned long usage, high, clamped_high;
2416 unsigned long pflags;
2417 unsigned long penalty_jiffies, overage;
2367 unsigned int nr_pages = current->memcg_nr_pages_over_high; 2418 unsigned int nr_pages = current->memcg_nr_pages_over_high;
2368 struct mem_cgroup *memcg; 2419 struct mem_cgroup *memcg;
2369 2420
@@ -2372,8 +2423,75 @@ void mem_cgroup_handle_over_high(void)
2372 2423
2373 memcg = get_mem_cgroup_from_mm(current->mm); 2424 memcg = get_mem_cgroup_from_mm(current->mm);
2374 reclaim_high(memcg, nr_pages, GFP_KERNEL); 2425 reclaim_high(memcg, nr_pages, GFP_KERNEL);
2375 css_put(&memcg->css);
2376 current->memcg_nr_pages_over_high = 0; 2426 current->memcg_nr_pages_over_high = 0;
2427
2428 /*
2429 * memory.high is breached and reclaim is unable to keep up. Throttle
2430 * allocators proactively to slow down excessive growth.
2431 *
2432 * We use overage compared to memory.high to calculate the number of
2433 * jiffies to sleep (penalty_jiffies). Ideally this value should be
2434 * fairly lenient on small overages, and increasingly harsh when the
2435 * memcg in question makes it clear that it has no intention of stopping
2436 * its crazy behaviour, so we exponentially increase the delay based on
2437 * overage amount.
2438 */
2439
2440 usage = page_counter_read(&memcg->memory);
2441 high = READ_ONCE(memcg->high);
2442
2443 if (usage <= high)
2444 goto out;
2445
2446 /*
2447 * Prevent division by 0 in overage calculation by acting as if it was a
2448 * threshold of 1 page
2449 */
2450 clamped_high = max(high, 1UL);
2451
2452 overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
2453 clamped_high);
2454
2455 penalty_jiffies = ((u64)overage * overage * HZ)
2456 >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
2457
2458 /*
2459 * Factor in the task's own contribution to the overage, such that four
2460 * N-sized allocations are throttled approximately the same as one
2461 * 4N-sized allocation.
2462 *
2463 * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
 2464 * larger the current charge batch is than that.
2465 */
2466 penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2467
2468 /*
2469 * Clamp the max delay per usermode return so as to still keep the
2470 * application moving forwards and also permit diagnostics, albeit
2471 * extremely slowly.
2472 */
2473 penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2474
2475 /*
2476 * Don't sleep if the amount of jiffies this memcg owes us is so low
2477 * that it's not even worth doing, in an attempt to be nice to those who
2478 * go only a small amount over their memory.high value and maybe haven't
2479 * been aggressively reclaimed enough yet.
2480 */
2481 if (penalty_jiffies <= HZ / 100)
2482 goto out;
2483
2484 /*
2485 * If we exit early, we're guaranteed to die (since
2486 * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2487 * need to account for any ill-begotten jiffies to pay them off later.
2488 */
2489 psi_memstall_enter(&pflags);
2490 schedule_timeout_killable(penalty_jiffies);
2491 psi_memstall_leave(&pflags);
2492
2493out:
2494 css_put(&memcg->css);
2377} 2495}
2378 2496
2379static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2497static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
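
Illustration (not part of the patch): the delay table in the comment above can be reproduced with a few lines of userspace C. This sketch assumes HZ=1000 and 4 KiB pages, and deliberately omits the per-task nr_pages/MEMCG_CHARGE_BATCH scaling and the HZ/100 cutoff applied by the kernel; it is an illustration of the curve, not the kernel implementation. It prints 0, 159, 639, 1439 and 2000 ms at 5 MiB steps over a 100 MiB high limit, matching the table.

#include <stdint.h>
#include <stdio.h>

#define HZ              1000UL  /* assumed */
#define PRECISION_SHIFT 20      /* MEMCG_DELAY_PRECISION_SHIFT */
#define SCALING_SHIFT   14      /* MEMCG_DELAY_SCALING_SHIFT */
#define MAX_DELAY       (2 * HZ)

static unsigned long penalty_jiffies(unsigned long usage, unsigned long high)
{
        unsigned long clamped_high = high ? high : 1;   /* avoid division by 0 */
        uint64_t overage, penalty;

        if (usage <= high)
                return 0;

        overage = ((uint64_t)(usage - high) << PRECISION_SHIFT) / clamped_high;
        penalty = (overage * overage * HZ) >> (PRECISION_SHIFT + SCALING_SHIFT);
        return penalty > MAX_DELAY ? MAX_DELAY : penalty;
}

int main(void)
{
        unsigned long high = 100UL << 8;        /* 100 MiB in 4 KiB pages */

        /* with HZ == 1000, one jiffy is one millisecond */
        for (unsigned long mb = 100; mb <= 120; mb += 5)
                printf("%3luM -> %4lu ms\n", mb, penalty_jiffies(mb << 8, high));
        return 0;
}
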
@@ -3512,6 +3630,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3512 ret = mem_cgroup_resize_max(memcg, nr_pages, true); 3630 ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3513 break; 3631 break;
3514 case _KMEM: 3632 case _KMEM:
3633 pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
3634 "Please report your usecase to linux-mm@kvack.org if you "
3635 "depend on this functionality.\n");
3515 ret = memcg_update_kmem_max(memcg, nr_pages); 3636 ret = memcg_update_kmem_max(memcg, nr_pages);
3516 break; 3637 break;
3517 case _TCP: 3638 case _TCP:
@@ -4805,11 +4926,6 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4805 } 4926 }
4806} 4927}
4807 4928
4808static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4809{
4810 mem_cgroup_id_get_many(memcg, 1);
4811}
4812
4813static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) 4929static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
4814{ 4930{
4815 mem_cgroup_id_put_many(memcg, 1); 4931 mem_cgroup_id_put_many(memcg, 1);
@@ -4955,6 +5071,11 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
4955 memcg->cgwb_frn[i].done = 5071 memcg->cgwb_frn[i].done =
4956 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); 5072 __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
4957#endif 5073#endif
5074#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5075 spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5076 INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5077 memcg->deferred_split_queue.split_queue_len = 0;
5078#endif
4958 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); 5079 idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
4959 return memcg; 5080 return memcg;
4960fail: 5081fail:
@@ -5333,6 +5454,14 @@ static int mem_cgroup_move_account(struct page *page,
5333 __mod_memcg_state(to, NR_WRITEBACK, nr_pages); 5454 __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
5334 } 5455 }
5335 5456
5457#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5458 if (compound && !list_empty(page_deferred_list(page))) {
5459 spin_lock(&from->deferred_split_queue.split_queue_lock);
5460 list_del_init(page_deferred_list(page));
5461 from->deferred_split_queue.split_queue_len--;
5462 spin_unlock(&from->deferred_split_queue.split_queue_lock);
5463 }
5464#endif
5336 /* 5465 /*
5337 * It is safe to change page->mem_cgroup here because the page 5466 * It is safe to change page->mem_cgroup here because the page
5338 * is referenced, charged, and isolated - we can't race with 5467 * is referenced, charged, and isolated - we can't race with
@@ -5341,6 +5470,17 @@ static int mem_cgroup_move_account(struct page *page,
5341 5470
5342 /* caller should have done css_get */ 5471 /* caller should have done css_get */
5343 page->mem_cgroup = to; 5472 page->mem_cgroup = to;
5473
5474#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5475 if (compound && list_empty(page_deferred_list(page))) {
5476 spin_lock(&to->deferred_split_queue.split_queue_lock);
5477 list_add_tail(page_deferred_list(page),
5478 &to->deferred_split_queue.split_queue);
5479 to->deferred_split_queue.split_queue_len++;
5480 spin_unlock(&to->deferred_split_queue.split_queue_lock);
5481 }
5482#endif
5483
5344 spin_unlock_irqrestore(&from->move_lock, flags); 5484 spin_unlock_irqrestore(&from->move_lock, flags);
5345 5485
5346 ret = 0; 5486 ret = 0;
@@ -6511,7 +6651,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
6511 unsigned int nr_pages = 1; 6651 unsigned int nr_pages = 1;
6512 6652
6513 if (PageTransHuge(page)) { 6653 if (PageTransHuge(page)) {
6514 nr_pages <<= compound_order(page); 6654 nr_pages = compound_nr(page);
6515 ug->nr_huge += nr_pages; 6655 ug->nr_huge += nr_pages;
6516 } 6656 }
6517 if (PageAnon(page)) 6657 if (PageAnon(page))
@@ -6523,7 +6663,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
6523 } 6663 }
6524 ug->pgpgout++; 6664 ug->pgpgout++;
6525 } else { 6665 } else {
6526 ug->nr_kmem += 1 << compound_order(page); 6666 ug->nr_kmem += compound_nr(page);
6527 __ClearPageKmemcg(page); 6667 __ClearPageKmemcg(page);
6528 } 6668 }
6529 6669
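
Illustration (not part of the patch): mem_cgroup_move_account() above now moves a compound page between the source and destination memcg's deferred split queues, each step under that queue's own split_queue_lock. The following userspace sketch shows the same unlink-then-requeue bookkeeping; the struct and function names are invented and the locking is reduced to a comment.

#include <stdio.h>

struct node { struct node *prev, *next; };
struct group { struct node queue; long len; };

static void list_init(struct node *n) { n->prev = n->next = n; }
static int list_is_empty(const struct node *n) { return n->next == n; }

static void list_del_init(struct node *n)
{
        n->prev->next = n->next;
        n->next->prev = n->prev;
        list_init(n);
}

static void list_add_tail(struct node *n, struct node *head)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

/* Drop the entry from the source queue, then (after ownership changes)
 * queue it on the destination; the kernel takes the respective
 * split_queue_lock around each step.
 */
static void move_deferred_entry(struct node *entry, struct group *from,
                                struct group *to)
{
        if (!list_is_empty(entry)) {
                list_del_init(entry);
                from->len--;
        }
        if (list_is_empty(entry)) {
                list_add_tail(entry, &to->queue);
                to->len++;
        }
}

int main(void)
{
        struct group from, to;
        struct node thp_entry;

        list_init(&from.queue); from.len = 0;
        list_init(&to.queue);   to.len = 0;
        list_init(&thp_entry);

        list_add_tail(&thp_entry, &from.queue); /* already queued for split */
        from.len++;

        move_deferred_entry(&thp_entry, &from, &to);
        printf("from.len=%ld to.len=%ld\n", from.len, to.len);  /* 0 1 */
        return 0;
}
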
diff --git a/mm/memfd.c b/mm/memfd.c
index 650e65a46b9c..2647c898990c 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas)
39 xas_for_each(xas, page, ULONG_MAX) { 39 xas_for_each(xas, page, ULONG_MAX) {
40 if (xa_is_value(page)) 40 if (xa_is_value(page))
41 continue; 41 continue;
42 page = find_subpage(page, xas->xa_index);
42 if (page_count(page) - page_mapcount(page) > 1) 43 if (page_count(page) - page_mapcount(page) > 1)
43 xas_set_mark(xas, MEMFD_TAG_PINNED); 44 xas_set_mark(xas, MEMFD_TAG_PINNED);
44 45
@@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping)
88 bool clear = true; 89 bool clear = true;
89 if (xa_is_value(page)) 90 if (xa_is_value(page))
90 continue; 91 continue;
92 page = find_subpage(page, xas.xa_index);
91 if (page_count(page) - page_mapcount(page) != 1) { 93 if (page_count(page) - page_mapcount(page) != 1) {
92 /* 94 /*
93 * On the last scan, we clean up all those tags 95 * On the last scan, we clean up all those tags
diff --git a/mm/memory.c b/mm/memory.c
index b1dff75640b7..b1ca51a079f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -518,7 +518,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
518 (long long)pte_val(pte), (long long)pmd_val(*pmd)); 518 (long long)pte_val(pte), (long long)pmd_val(*pmd));
519 if (page) 519 if (page)
520 dump_page(page, "bad pte"); 520 dump_page(page, "bad pte");
521 pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", 521 pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
522 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); 522 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
523 pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n", 523 pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
524 vma->vm_file, 524 vma->vm_file,
@@ -1026,6 +1026,9 @@ again:
1026 if (pte_none(ptent)) 1026 if (pte_none(ptent))
1027 continue; 1027 continue;
1028 1028
1029 if (need_resched())
1030 break;
1031
1029 if (pte_present(ptent)) { 1032 if (pte_present(ptent)) {
1030 struct page *page; 1033 struct page *page;
1031 1034
@@ -1093,7 +1096,6 @@ again:
1093 if (unlikely(details)) 1096 if (unlikely(details))
1094 continue; 1097 continue;
1095 1098
1096 entry = pte_to_swp_entry(ptent);
1097 if (!non_swap_entry(entry)) 1099 if (!non_swap_entry(entry))
1098 rss[MM_SWAPENTS]--; 1100 rss[MM_SWAPENTS]--;
1099 else if (is_migration_entry(entry)) { 1101 else if (is_migration_entry(entry)) {
@@ -1124,8 +1126,11 @@ again:
1124 if (force_flush) { 1126 if (force_flush) {
1125 force_flush = 0; 1127 force_flush = 0;
1126 tlb_flush_mmu(tlb); 1128 tlb_flush_mmu(tlb);
1127 if (addr != end) 1129 }
1128 goto again; 1130
1131 if (addr != end) {
1132 cond_resched();
1133 goto again;
1129 } 1134 }
1130 1135
1131 return addr; 1136 return addr;
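
Illustration (not part of the patch): the zap_pte_range() hunks above let the loop bail out when a reschedule is pending, flush the batched work, call cond_resched(), and resume from the interrupted address. A runnable userspace analogue of that pattern, with invented helpers standing in for need_resched() and tlb_flush_mmu():

#include <stdbool.h>
#include <stdio.h>

static bool need_yield(unsigned long batched)
{
        return batched && (batched % 4 == 0);   /* pretend a resched is pending */
}

static void flush_batch(unsigned long *batched)
{
        if (*batched)
                printf("flushing %lu batched entries\n", *batched);
        *batched = 0;
}

static void process_range(unsigned long addr, unsigned long end)
{
        unsigned long batched = 0;

again:
        for (; addr < end; addr++) {
                if (need_yield(batched))
                        break;          /* stop filling the batch */
                batched++;              /* "zap" one entry */
        }

        flush_batch(&batched);          /* always flush what was collected */

        if (addr != end) {
                /* cond_resched() would run here in the kernel */
                goto again;
        }
}

int main(void)
{
        process_range(0, 10);
        return 0;
}
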
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c73f09913165..b1be791f772d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -632,33 +632,30 @@ static void generic_online_page(struct page *page, unsigned int order)
632#endif 632#endif
633} 633}
634 634
635static int online_pages_blocks(unsigned long start, unsigned long nr_pages)
636{
637 unsigned long end = start + nr_pages;
638 int order, onlined_pages = 0;
639
640 while (start < end) {
641 order = min(MAX_ORDER - 1,
642 get_order(PFN_PHYS(end) - PFN_PHYS(start)));
643 (*online_page_callback)(pfn_to_page(start), order);
644
645 onlined_pages += (1UL << order);
646 start += (1UL << order);
647 }
648 return onlined_pages;
649}
650
651static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 635static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
652 void *arg) 636 void *arg)
653{ 637{
654 unsigned long onlined_pages = *(unsigned long *)arg; 638 const unsigned long end_pfn = start_pfn + nr_pages;
639 unsigned long pfn;
640 int order;
655 641
656 if (PageReserved(pfn_to_page(start_pfn))) 642 /*
657 onlined_pages += online_pages_blocks(start_pfn, nr_pages); 643 * Online the pages. The callback might decide to keep some pages
644 * PG_reserved (to add them to the buddy later), but we still account
645 * them as being online/belonging to this zone ("present").
646 */
647 for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
648 order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
649 /* __free_pages_core() wants pfns to be aligned to the order */
650 if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
651 order = 0;
652 (*online_page_callback)(pfn_to_page(pfn), order);
653 }
658 654
659 online_mem_sections(start_pfn, start_pfn + nr_pages); 655 /* mark all involved sections as online */
656 online_mem_sections(start_pfn, end_pfn);
660 657
661 *(unsigned long *)arg = onlined_pages; 658 *(unsigned long *)arg += nr_pages;
662 return 0; 659 return 0;
663} 660}
664 661
@@ -714,8 +711,13 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
714 pgdat->node_start_pfn = start_pfn; 711 pgdat->node_start_pfn = start_pfn;
715 712
716 pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; 713 pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
717}
718 714
715}
716/*
717 * Associate the pfn range with the given zone, initializing the memmaps
718 * and resizing the pgdat/zone data to span the added pages. After this
719 * call, all affected pages are PG_reserved.
720 */
719void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, 721void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
720 unsigned long nr_pages, struct vmem_altmap *altmap) 722 unsigned long nr_pages, struct vmem_altmap *altmap)
721{ 723{
@@ -804,20 +806,6 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
804 return default_zone_for_pfn(nid, start_pfn, nr_pages); 806 return default_zone_for_pfn(nid, start_pfn, nr_pages);
805} 807}
806 808
807/*
808 * Associates the given pfn range with the given node and the zone appropriate
809 * for the given online type.
810 */
811static struct zone * __meminit move_pfn_range(int online_type, int nid,
812 unsigned long start_pfn, unsigned long nr_pages)
813{
814 struct zone *zone;
815
816 zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
817 move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL);
818 return zone;
819}
820
821int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 809int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
822{ 810{
823 unsigned long flags; 811 unsigned long flags;
@@ -840,7 +828,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
840 put_device(&mem->dev); 828 put_device(&mem->dev);
841 829
842 /* associate pfn range with the zone */ 830 /* associate pfn range with the zone */
843 zone = move_pfn_range(online_type, nid, pfn, nr_pages); 831 zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
832 move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
844 833
845 arg.start_pfn = pfn; 834 arg.start_pfn = pfn;
846 arg.nr_pages = nr_pages; 835 arg.nr_pages = nr_pages;
@@ -864,6 +853,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
864 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 853 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
865 online_pages_range); 854 online_pages_range);
866 if (ret) { 855 if (ret) {
856 /* not a single memory resource was applicable */
867 if (need_zonelists_rebuild) 857 if (need_zonelists_rebuild)
868 zone_pcp_reset(zone); 858 zone_pcp_reset(zone);
869 goto failed_addition; 859 goto failed_addition;
@@ -877,27 +867,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
877 867
878 shuffle_zone(zone); 868 shuffle_zone(zone);
879 869
880 if (onlined_pages) { 870 node_states_set_node(nid, &arg);
881 node_states_set_node(nid, &arg); 871 if (need_zonelists_rebuild)
882 if (need_zonelists_rebuild) 872 build_all_zonelists(NULL);
883 build_all_zonelists(NULL); 873 else
884 else 874 zone_pcp_update(zone);
885 zone_pcp_update(zone);
886 }
887 875
888 init_per_zone_wmark_min(); 876 init_per_zone_wmark_min();
889 877
890 if (onlined_pages) { 878 kswapd_run(nid);
891 kswapd_run(nid); 879 kcompactd_run(nid);
892 kcompactd_run(nid);
893 }
894 880
895 vm_total_pages = nr_free_pagecache_pages(); 881 vm_total_pages = nr_free_pagecache_pages();
896 882
897 writeback_set_ratelimit(); 883 writeback_set_ratelimit();
898 884
899 if (onlined_pages) 885 memory_notify(MEM_ONLINE, &arg);
900 memory_notify(MEM_ONLINE, &arg);
901 mem_hotplug_done(); 886 mem_hotplug_done();
902 return 0; 887 return 0;
903 888
@@ -933,8 +918,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
933 if (!pgdat) 918 if (!pgdat)
934 return NULL; 919 return NULL;
935 920
921 pgdat->per_cpu_nodestats =
922 alloc_percpu(struct per_cpu_nodestat);
936 arch_refresh_nodedata(nid, pgdat); 923 arch_refresh_nodedata(nid, pgdat);
937 } else { 924 } else {
925 int cpu;
938 /* 926 /*
939 * Reset the nr_zones, order and classzone_idx before reuse. 927 * Reset the nr_zones, order and classzone_idx before reuse.
940 * Note that kswapd will init kswapd_classzone_idx properly 928 * Note that kswapd will init kswapd_classzone_idx properly
@@ -943,6 +931,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
943 pgdat->nr_zones = 0; 931 pgdat->nr_zones = 0;
944 pgdat->kswapd_order = 0; 932 pgdat->kswapd_order = 0;
945 pgdat->kswapd_classzone_idx = 0; 933 pgdat->kswapd_classzone_idx = 0;
934 for_each_online_cpu(cpu) {
935 struct per_cpu_nodestat *p;
936
937 p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
938 memset(p, 0, sizeof(*p));
939 }
946 } 940 }
947 941
948 /* we can use NODE_DATA(nid) from here */ 942 /* we can use NODE_DATA(nid) from here */
@@ -952,7 +946,6 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
952 946
953 /* init node's zones as empty zones, we don't have any present pages.*/ 947 /* init node's zones as empty zones, we don't have any present pages.*/
954 free_area_init_core_hotplug(nid); 948 free_area_init_core_hotplug(nid);
955 pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
956 949
957 /* 950 /*
958 * The node we allocated has no zone fallback lists. For avoiding 951 * The node we allocated has no zone fallback lists. For avoiding
@@ -1309,7 +1302,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1309 head = compound_head(page); 1302 head = compound_head(page);
1310 if (page_huge_active(head)) 1303 if (page_huge_active(head))
1311 return pfn; 1304 return pfn;
1312 skip = (1 << compound_order(head)) - (page - head); 1305 skip = compound_nr(head) - (page - head);
1313 pfn += skip - 1; 1306 pfn += skip - 1;
1314 } 1307 }
1315 return 0; 1308 return 0;
@@ -1347,7 +1340,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1347 1340
1348 if (PageHuge(page)) { 1341 if (PageHuge(page)) {
1349 struct page *head = compound_head(page); 1342 struct page *head = compound_head(page);
1350 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; 1343 pfn = page_to_pfn(head) + compound_nr(head) - 1;
1351 isolate_huge_page(head, &source); 1344 isolate_huge_page(head, &source);
1352 continue; 1345 continue;
1353 } else if (PageTransHuge(page)) 1346 } else if (PageTransHuge(page))
@@ -1662,7 +1655,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
1662 phys_addr_t beginpa, endpa; 1655 phys_addr_t beginpa, endpa;
1663 1656
1664 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); 1657 beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
1665 endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; 1658 endpa = beginpa + memory_block_size_bytes() - 1;
1666 pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", 1659 pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
1667 &beginpa, &endpa); 1660 &beginpa, &endpa);
1668 1661
@@ -1800,7 +1793,7 @@ void __remove_memory(int nid, u64 start, u64 size)
1800{ 1793{
1801 1794
1802 /* 1795 /*
1803 * trigger BUG() is some memory is not offlined prior to calling this 1796 * trigger BUG() if some memory is not offlined prior to calling this
1804 * function 1797 * function
1805 */ 1798 */
1806 if (try_remove_memory(nid, start, size)) 1799 if (try_remove_memory(nid, start, size))
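
Several hunks in this file (scan_movable_pages(), do_migrate_range()) and in later files (page_alloc, rmap, shmem, util) replace open-coded "1 << compound_order(head)" with compound_nr(head). A minimal userspace sketch of the skip arithmetic those loops rely on, assuming compound_nr() simply names the 1UL << compound_order() expression it replaces here; the _demo helper and the example numbers are invented for illustration, and all struct page plumbing is omitted:

#include <stdio.h>

/* Local stand-in: assumes compound_nr(head) == 1UL << compound_order(head). */
static unsigned long compound_nr_demo(unsigned int order)
{
	return 1UL << order;
}

int main(void)
{
	unsigned int order = 9;		/* 512 subpages, e.g. a 2MB page with 4K base pages */
	unsigned long offset = 37;	/* the scan landed 37 subpages past the head */
	unsigned long nr = compound_nr_demo(order);
	unsigned long skip = nr - offset;

	/* The kernel loops advance by skip - 1 and let the loop increment do the rest. */
	printf("compound page spans %lu subpages; skip %lu to clear its tail\n", nr, skip);
	return 0;
}

The helper only names arithmetic that was already there, so the conversions above are readability changes rather than behavioural ones.
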
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f000771558d8..464406e8da91 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1512,10 +1512,6 @@ static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
1512 if (nodes_empty(*new)) 1512 if (nodes_empty(*new))
1513 goto out_put; 1513 goto out_put;
1514 1514
1515 nodes_and(*new, *new, node_states[N_MEMORY]);
1516 if (nodes_empty(*new))
1517 goto out_put;
1518
1519 err = security_task_movememory(task); 1515 err = security_task_movememory(task);
1520 if (err) 1516 if (err)
1521 goto out_put; 1517 goto out_put;
diff --git a/mm/migrate.c b/mm/migrate.c
index 9f4ed4e985c1..73d476d690b1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -460,7 +460,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
460 460
461 for (i = 1; i < HPAGE_PMD_NR; i++) { 461 for (i = 1; i < HPAGE_PMD_NR; i++) {
462 xas_next(&xas); 462 xas_next(&xas);
463 xas_store(&xas, newpage + i); 463 xas_store(&xas, newpage);
464 } 464 }
465 } 465 }
466 466
@@ -1892,7 +1892,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
1892 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page); 1892 VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
1893 1893
1894 /* Avoid migrating to a node that is nearly full */ 1894 /* Avoid migrating to a node that is nearly full */
1895 if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page))) 1895 if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
1896 return 0; 1896 return 0;
1897 1897
1898 if (isolate_lru_page(page)) 1898 if (isolate_lru_page(page))
@@ -2218,17 +2218,15 @@ again:
2218 pte_t pte; 2218 pte_t pte;
2219 2219
2220 pte = *ptep; 2220 pte = *ptep;
2221 pfn = pte_pfn(pte);
2222 2221
2223 if (pte_none(pte)) { 2222 if (pte_none(pte)) {
2224 mpfn = MIGRATE_PFN_MIGRATE; 2223 mpfn = MIGRATE_PFN_MIGRATE;
2225 migrate->cpages++; 2224 migrate->cpages++;
2226 pfn = 0;
2227 goto next; 2225 goto next;
2228 } 2226 }
2229 2227
2230 if (!pte_present(pte)) { 2228 if (!pte_present(pte)) {
2231 mpfn = pfn = 0; 2229 mpfn = 0;
2232 2230
2233 /* 2231 /*
2234 * Only care about unaddressable device page special 2232 * Only care about unaddressable device page special
@@ -2245,10 +2243,10 @@ again:
2245 if (is_write_device_private_entry(entry)) 2243 if (is_write_device_private_entry(entry))
2246 mpfn |= MIGRATE_PFN_WRITE; 2244 mpfn |= MIGRATE_PFN_WRITE;
2247 } else { 2245 } else {
2246 pfn = pte_pfn(pte);
2248 if (is_zero_pfn(pfn)) { 2247 if (is_zero_pfn(pfn)) {
2249 mpfn = MIGRATE_PFN_MIGRATE; 2248 mpfn = MIGRATE_PFN_MIGRATE;
2250 migrate->cpages++; 2249 migrate->cpages++;
2251 pfn = 0;
2252 goto next; 2250 goto next;
2253 } 2251 }
2254 page = vm_normal_page(migrate->vma, addr, pte); 2252 page = vm_normal_page(migrate->vma, addr, pte);
@@ -2258,10 +2256,9 @@ again:
2258 2256
2259 /* FIXME support THP */ 2257 /* FIXME support THP */
2260 if (!page || !page->mapping || PageTransCompound(page)) { 2258 if (!page || !page->mapping || PageTransCompound(page)) {
2261 mpfn = pfn = 0; 2259 mpfn = 0;
2262 goto next; 2260 goto next;
2263 } 2261 }
2264 pfn = page_to_pfn(page);
2265 2262
2266 /* 2263 /*
2267 * By getting a reference on the page we pin it and that blocks 2264 * By getting a reference on the page we pin it and that blocks
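
The xas_store(&xas, newpage) change in migrate_page_move_mapping() (and the matching hunks in shmem_add_to_page_cache() and add_to_swap_cache() further down) stores the head page in every slot a THP occupies, instead of page + i; the subpage is recovered from the index on lookup. A userspace analogy with a plain array standing in for the XArray; nothing here is kernel API, and the struct and names are invented for the sketch:

#include <assert.h>
#include <stdio.h>

struct page_demo { int id; };

#define NR_SLOTS 8

int main(void)
{
	struct page_demo head = { .id = 100 };	/* head of an order-3 compound page */
	struct page_demo *slots[NR_SLOTS];
	unsigned long base = 0, i;

	/* Old scheme stored head + i per slot; the new one stores head everywhere. */
	for (i = 0; i < NR_SLOTS; i++)
		slots[base + i] = &head;

	/* Lookup: the subpage offset comes from the slot index, not from the pointer. */
	for (i = 0; i < NR_SLOTS; i++) {
		unsigned long subpage = (base + i) - base;

		assert(slots[base + i] == &head);
		printf("index %lu -> head id %d, subpage %lu\n",
		       base + i, slots[base + i]->id, subpage);
	}
	return 0;
}
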
diff --git a/mm/mmap.c b/mm/mmap.c
index 6bc21fca20bc..f1e8c7f93e04 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1358,6 +1358,9 @@ static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
1358 if (S_ISBLK(inode->i_mode)) 1358 if (S_ISBLK(inode->i_mode))
1359 return MAX_LFS_FILESIZE; 1359 return MAX_LFS_FILESIZE;
1360 1360
1361 if (S_ISSOCK(inode->i_mode))
1362 return MAX_LFS_FILESIZE;
1363
1361 /* Special "we do even unsigned file positions" case */ 1364 /* Special "we do even unsigned file positions" case */
1362 if (file->f_mode & FMODE_UNSIGNED_OFFSET) 1365 if (file->f_mode & FMODE_UNSIGNED_OFFSET)
1363 return 0; 1366 return 0;
@@ -2274,12 +2277,9 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
2274 if (vma) { 2277 if (vma) {
2275 *pprev = vma->vm_prev; 2278 *pprev = vma->vm_prev;
2276 } else { 2279 } else {
2277 struct rb_node *rb_node = mm->mm_rb.rb_node; 2280 struct rb_node *rb_node = rb_last(&mm->mm_rb);
2278 *pprev = NULL; 2281
2279 while (rb_node) { 2282 *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
2280 *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
2281 rb_node = rb_node->rb_right;
2282 }
2283 } 2283 }
2284 return vma; 2284 return vma;
2285} 2285}
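
Two independent changes here: file_mmap_size_max() now treats sockets like block devices (full MAX_LFS_FILESIZE), and find_vma_prev() asks the rbtree for its rightmost node with rb_last() instead of hand-walking ->rb_right. What rb_last() amounts to, sketched on an ordinary binary search tree in userspace (not the kernel rbtree API; the node type and keys are invented):

#include <stdio.h>
#include <stddef.h>

struct node {
	long key;
	struct node *left, *right;
};

static struct node *tree_last(struct node *root)
{
	if (!root)
		return NULL;
	while (root->right)		/* keep descending to the right */
		root = root->right;
	return root;
}

int main(void)
{
	struct node n1 = { .key = 10 }, n2 = { .key = 30 }, n3 = { .key = 20 };
	struct node root = { .key = 15, .left = &n1, .right = &n3 };
	struct node *last;

	n3.right = &n2;			/* 15 -> right 20 -> right 30 */

	last = tree_last(&root);
	printf("rightmost key: %ld\n", last ? last->key : -1);	/* prints 30 */
	return 0;
}

In find_vma_prev() the rightmost vm_area_struct is exactly the previous VMA when addr lies above every mapping, which is the case the else branch handles.
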
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 8c943a6e1696..7d70e5c78f97 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -271,8 +271,6 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
271 271
272 tlb_flush_mmu(tlb); 272 tlb_flush_mmu(tlb);
273 273
274 /* keep the page table cache within bounds */
275 check_pgt_cache();
276#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER 274#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
277 tlb_batch_list_free(tlb); 275 tlb_batch_list_free(tlb);
278#endif 276#endif
diff --git a/mm/nommu.c b/mm/nommu.c
index fed1b6e9c89b..99b7ec318824 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -108,7 +108,7 @@ unsigned int kobjsize(const void *objp)
108 * The ksize() function is only guaranteed to work for pointers 108 * The ksize() function is only guaranteed to work for pointers
109 * returned by kmalloc(). So handle arbitrary pointers here. 109 * returned by kmalloc(). So handle arbitrary pointers here.
110 */ 110 */
111 return PAGE_SIZE << compound_order(page); 111 return page_size(page);
112} 112}
113 113
114/** 114/**
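
kobjsize() is one of several places in this series (page_vma_mapped, rmap, slob, slub below) that switch from PAGE_SIZE << compound_order(page) to the new page_size() helper. A tiny arithmetic-only sketch, assuming page_size() stands for exactly the shift it replaces here; the 4096-byte PAGE_SIZE and the _demo names are assumptions of the sketch, not kernel definitions:

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE_DEMO 4096UL

static unsigned long page_size_demo(unsigned int order)
{
	return PAGE_SIZE_DEMO << order;
}

int main(void)
{
	unsigned int orders[] = { 0, 4, 9 };

	for (size_t i = 0; i < sizeof(orders) / sizeof(orders[0]); i++)
		printf("order %u -> %lu bytes\n", orders[i], page_size_demo(orders[i]));
	return 0;
}
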
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eda2e2a0bdc6..c1d9496b4c43 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -73,7 +73,7 @@ static inline bool is_memcg_oom(struct oom_control *oc)
73/** 73/**
74 * oom_cpuset_eligible() - check task eligiblity for kill 74 * oom_cpuset_eligible() - check task eligiblity for kill
75 * @start: task struct of which task to consider 75 * @start: task struct of which task to consider
76 * @mask: nodemask passed to page allocator for mempolicy ooms 76 * @oc: pointer to struct oom_control
77 * 77 *
78 * Task eligibility is determined by whether or not a candidate task, @tsk, 78 * Task eligibility is determined by whether or not a candidate task, @tsk,
79 * shares the same mempolicy nodes as current if it is bound by such a policy 79 * shares the same mempolicy nodes as current if it is bound by such a policy
@@ -287,7 +287,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
287 !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) { 287 !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
288 oc->totalpages = total_swap_pages; 288 oc->totalpages = total_swap_pages;
289 for_each_node_mask(nid, *oc->nodemask) 289 for_each_node_mask(nid, *oc->nodemask)
290 oc->totalpages += node_spanned_pages(nid); 290 oc->totalpages += node_present_pages(nid);
291 return CONSTRAINT_MEMORY_POLICY; 291 return CONSTRAINT_MEMORY_POLICY;
292 } 292 }
293 293
@@ -300,7 +300,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
300 if (cpuset_limited) { 300 if (cpuset_limited) {
301 oc->totalpages = total_swap_pages; 301 oc->totalpages = total_swap_pages;
302 for_each_node_mask(nid, cpuset_current_mems_allowed) 302 for_each_node_mask(nid, cpuset_current_mems_allowed)
303 oc->totalpages += node_spanned_pages(nid); 303 oc->totalpages += node_present_pages(nid);
304 return CONSTRAINT_CPUSET; 304 return CONSTRAINT_CPUSET;
305 } 305 }
306 return CONSTRAINT_NONE; 306 return CONSTRAINT_NONE;
@@ -884,12 +884,13 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
884 */ 884 */
885 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID); 885 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
886 mark_oom_victim(victim); 886 mark_oom_victim(victim);
887 pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", 887 pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
888 message, task_pid_nr(victim), victim->comm, 888 message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
889 K(victim->mm->total_vm), 889 K(get_mm_counter(mm, MM_ANONPAGES)),
890 K(get_mm_counter(victim->mm, MM_ANONPAGES)), 890 K(get_mm_counter(mm, MM_FILEPAGES)),
891 K(get_mm_counter(victim->mm, MM_FILEPAGES)), 891 K(get_mm_counter(mm, MM_SHMEMPAGES)),
892 K(get_mm_counter(victim->mm, MM_SHMEMPAGES))); 892 from_kuid(&init_user_ns, task_uid(victim)),
893 mm_pgtables_bytes(mm), victim->signal->oom_score_adj);
893 task_unlock(victim); 894 task_unlock(victim);
894 895
895 /* 896 /*
@@ -1068,9 +1069,10 @@ bool out_of_memory(struct oom_control *oc)
1068 * The OOM killer does not compensate for IO-less reclaim. 1069 * The OOM killer does not compensate for IO-less reclaim.
1069 * pagefault_out_of_memory lost its gfp context so we have to 1070 * pagefault_out_of_memory lost its gfp context so we have to
1070 * make sure exclude 0 mask - all other users should have at least 1071 * make sure exclude 0 mask - all other users should have at least
1071 * ___GFP_DIRECT_RECLAIM to get here. 1072 * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
1073 * invoke the OOM killer even if it is a GFP_NOFS allocation.
1072 */ 1074 */
1073 if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) 1075 if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
1074 return true; 1076 return true;
1075 1077
1076 /* 1078 /*
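
The constrained_alloc() hunks switch the mempolicy and cpuset totals from node_spanned_pages() to node_present_pages(), so holes inside a node's pfn range no longer inflate oc->totalpages and thereby skew the OOM badness baseline. A made-up-numbers sketch of the difference; the values are arbitrary and only the relationship spanned = present + holes is the point:

#include <stdio.h>

int main(void)
{
	unsigned long start_pfn = 0x100000, end_pfn = 0x180000;
	unsigned long hole_pages = 0x20000;	/* pfns in the range with no backing memory */

	unsigned long spanned = end_pfn - start_pfn;	/* node_spanned_pages() style */
	unsigned long present = spanned - hole_pages;	/* node_present_pages() style */

	printf("spanned=%lu present=%lu (spanned overstates by %lu pages)\n",
	       spanned, present, spanned - present);
	return 0;
}
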
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ff5484fdbdf9..3334a769eb91 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -670,6 +670,7 @@ out:
670 670
671void free_compound_page(struct page *page) 671void free_compound_page(struct page *page)
672{ 672{
673 mem_cgroup_uncharge(page);
673 __free_pages_ok(page, compound_order(page)); 674 __free_pages_ok(page, compound_order(page));
674} 675}
675 676
@@ -3955,14 +3956,22 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3955 goto check_priority; 3956 goto check_priority;
3956 3957
3957 /* 3958 /*
3959 * compaction was skipped because there are not enough order-0 pages
3960 * to work with, so we retry only if it looks like reclaim can help.
3961 */
3962 if (compaction_needs_reclaim(compact_result)) {
3963 ret = compaction_zonelist_suitable(ac, order, alloc_flags);
3964 goto out;
3965 }
3966
3967 /*
3958 * make sure the compaction wasn't deferred or didn't bail out early 3968 * make sure the compaction wasn't deferred or didn't bail out early
3959 * due to locks contention before we declare that we should give up. 3969 * due to locks contention before we declare that we should give up.
3960 * But do not retry if the given zonelist is not suitable for 3970 * But the next retry should use a higher priority if allowed, so
3961 * compaction. 3971 * we don't just keep bailing out endlessly.
3962 */ 3972 */
3963 if (compaction_withdrawn(compact_result)) { 3973 if (compaction_withdrawn(compact_result)) {
3964 ret = compaction_zonelist_suitable(ac, order, alloc_flags); 3974 goto check_priority;
3965 goto out;
3966 } 3975 }
3967 3976
3968 /* 3977 /*
@@ -6638,9 +6647,11 @@ static unsigned long __init calc_memmap_size(unsigned long spanned_pages,
6638#ifdef CONFIG_TRANSPARENT_HUGEPAGE 6647#ifdef CONFIG_TRANSPARENT_HUGEPAGE
6639static void pgdat_init_split_queue(struct pglist_data *pgdat) 6648static void pgdat_init_split_queue(struct pglist_data *pgdat)
6640{ 6649{
6641 spin_lock_init(&pgdat->split_queue_lock); 6650 struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
6642 INIT_LIST_HEAD(&pgdat->split_queue); 6651
6643 pgdat->split_queue_len = 0; 6652 spin_lock_init(&ds_queue->split_queue_lock);
6653 INIT_LIST_HEAD(&ds_queue->split_queue);
6654 ds_queue->split_queue_len = 0;
6644} 6655}
6645#else 6656#else
6646static void pgdat_init_split_queue(struct pglist_data *pgdat) {} 6657static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
@@ -8196,7 +8207,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
8196 if (!hugepage_migration_supported(page_hstate(head))) 8207 if (!hugepage_migration_supported(page_hstate(head)))
8197 goto unmovable; 8208 goto unmovable;
8198 8209
8199 skip_pages = (1 << compound_order(head)) - (page - head); 8210 skip_pages = compound_nr(head) - (page - head);
8200 iter += skip_pages - 1; 8211 iter += skip_pages - 1;
8201 continue; 8212 continue;
8202 } 8213 }
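
The should_compact_retry() hunk separates two cases that used to share one branch: compaction skipped for lack of order-0 pages (retry only if reclaim looks able to help, via compaction_zonelist_suitable()) versus compaction withdrawn because it was deferred or contended (retry at a higher compaction priority). A simplified stand-alone model of that decision ladder; the enum values, the predicates and the priority convention are local stand-ins, not the kernel's definitions, and only the ordering of the checks mirrors the patch:

#include <stdbool.h>
#include <stdio.h>

enum result { NEEDS_RECLAIM, WITHDRAWN, FAILED };

/* Stand-in for compaction_zonelist_suitable(); always optimistic in the demo. */
static bool reclaim_could_help(void)
{
	return true;
}

static bool retry_decision(enum result r, int *priority)
{
	if (r == NEEDS_RECLAIM)
		/* Not enough order-0 pages: retry only if reclaim can free some. */
		return reclaim_could_help();

	if (r == WITHDRAWN) {
		/* Deferred or contended: retry at a higher priority if allowed. */
		if (*priority > 0) {
			(*priority)--;	/* lower number == higher priority here */
			return true;
		}
		return false;
	}

	return false;		/* genuinely failed at this priority */
}

int main(void)
{
	int prio = 2;

	printf("withdrawn -> retry=%d (prio now %d)\n",
	       retry_decision(WITHDRAWN, &prio), prio);
	printf("needs reclaim -> retry=%d\n",
	       retry_decision(NEEDS_RECLAIM, &prio));
	return 0;
}
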
diff --git a/mm/page_owner.c b/mm/page_owner.c
index addcbb2ae4e4..dee931184788 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -24,6 +24,9 @@ struct page_owner {
24 short last_migrate_reason; 24 short last_migrate_reason;
25 gfp_t gfp_mask; 25 gfp_t gfp_mask;
26 depot_stack_handle_t handle; 26 depot_stack_handle_t handle;
27#ifdef CONFIG_DEBUG_PAGEALLOC
28 depot_stack_handle_t free_handle;
29#endif
27}; 30};
28 31
29static bool page_owner_disabled = true; 32static bool page_owner_disabled = true;
@@ -102,19 +105,6 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext)
102 return (void *)page_ext + page_owner_ops.offset; 105 return (void *)page_ext + page_owner_ops.offset;
103} 106}
104 107
105void __reset_page_owner(struct page *page, unsigned int order)
106{
107 int i;
108 struct page_ext *page_ext;
109
110 for (i = 0; i < (1 << order); i++) {
111 page_ext = lookup_page_ext(page + i);
112 if (unlikely(!page_ext))
113 continue;
114 __clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
115 }
116}
117
118static inline bool check_recursive_alloc(unsigned long *entries, 108static inline bool check_recursive_alloc(unsigned long *entries,
119 unsigned int nr_entries, 109 unsigned int nr_entries,
120 unsigned long ip) 110 unsigned long ip)
@@ -154,18 +144,50 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags)
154 return handle; 144 return handle;
155} 145}
156 146
157static inline void __set_page_owner_handle(struct page_ext *page_ext, 147void __reset_page_owner(struct page *page, unsigned int order)
158 depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask)
159{ 148{
149 int i;
150 struct page_ext *page_ext;
151#ifdef CONFIG_DEBUG_PAGEALLOC
152 depot_stack_handle_t handle = 0;
160 struct page_owner *page_owner; 153 struct page_owner *page_owner;
161 154
162 page_owner = get_page_owner(page_ext); 155 if (debug_pagealloc_enabled())
163 page_owner->handle = handle; 156 handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
164 page_owner->order = order; 157#endif
165 page_owner->gfp_mask = gfp_mask;
166 page_owner->last_migrate_reason = -1;
167 158
168 __set_bit(PAGE_EXT_OWNER, &page_ext->flags); 159 for (i = 0; i < (1 << order); i++) {
160 page_ext = lookup_page_ext(page + i);
161 if (unlikely(!page_ext))
162 continue;
163 __clear_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags);
164#ifdef CONFIG_DEBUG_PAGEALLOC
165 if (debug_pagealloc_enabled()) {
166 page_owner = get_page_owner(page_ext);
167 page_owner->free_handle = handle;
168 }
169#endif
170 }
171}
172
173static inline void __set_page_owner_handle(struct page *page,
174 struct page_ext *page_ext, depot_stack_handle_t handle,
175 unsigned int order, gfp_t gfp_mask)
176{
177 struct page_owner *page_owner;
178 int i;
179
180 for (i = 0; i < (1 << order); i++) {
181 page_owner = get_page_owner(page_ext);
182 page_owner->handle = handle;
183 page_owner->order = order;
184 page_owner->gfp_mask = gfp_mask;
185 page_owner->last_migrate_reason = -1;
186 __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
187 __set_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags);
188
189 page_ext = lookup_page_ext(page + i);
190 }
169} 191}
170 192
171noinline void __set_page_owner(struct page *page, unsigned int order, 193noinline void __set_page_owner(struct page *page, unsigned int order,
@@ -178,7 +200,7 @@ noinline void __set_page_owner(struct page *page, unsigned int order,
178 return; 200 return;
179 201
180 handle = save_stack(gfp_mask); 202 handle = save_stack(gfp_mask);
181 __set_page_owner_handle(page_ext, handle, order, gfp_mask); 203 __set_page_owner_handle(page, page_ext, handle, order, gfp_mask);
182} 204}
183 205
184void __set_page_owner_migrate_reason(struct page *page, int reason) 206void __set_page_owner_migrate_reason(struct page *page, int reason)
@@ -204,8 +226,11 @@ void __split_page_owner(struct page *page, unsigned int order)
204 226
205 page_owner = get_page_owner(page_ext); 227 page_owner = get_page_owner(page_ext);
206 page_owner->order = 0; 228 page_owner->order = 0;
207 for (i = 1; i < (1 << order); i++) 229 for (i = 1; i < (1 << order); i++) {
208 __copy_page_owner(page, page + i); 230 page_ext = lookup_page_ext(page + i);
231 page_owner = get_page_owner(page_ext);
232 page_owner->order = 0;
233 }
209} 234}
210 235
211void __copy_page_owner(struct page *oldpage, struct page *newpage) 236void __copy_page_owner(struct page *oldpage, struct page *newpage)
@@ -235,6 +260,7 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
235 * the new page, which will be freed. 260 * the new page, which will be freed.
236 */ 261 */
237 __set_bit(PAGE_EXT_OWNER, &new_ext->flags); 262 __set_bit(PAGE_EXT_OWNER, &new_ext->flags);
263 __set_bit(PAGE_EXT_OWNER_ACTIVE, &new_ext->flags);
238} 264}
239 265
240void pagetypeinfo_showmixedcount_print(struct seq_file *m, 266void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -294,7 +320,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
294 if (unlikely(!page_ext)) 320 if (unlikely(!page_ext))
295 continue; 321 continue;
296 322
297 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) 323 if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
298 continue; 324 continue;
299 325
300 page_owner = get_page_owner(page_ext); 326 page_owner = get_page_owner(page_ext);
@@ -405,20 +431,36 @@ void __dump_page_owner(struct page *page)
405 mt = gfpflags_to_migratetype(gfp_mask); 431 mt = gfpflags_to_migratetype(gfp_mask);
406 432
407 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { 433 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
408 pr_alert("page_owner info is not active (free page?)\n"); 434 pr_alert("page_owner info is not present (never set?)\n");
409 return; 435 return;
410 } 436 }
411 437
438 if (test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
439 pr_alert("page_owner tracks the page as allocated\n");
440 else
441 pr_alert("page_owner tracks the page as freed\n");
442
443 pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
444 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
445
412 handle = READ_ONCE(page_owner->handle); 446 handle = READ_ONCE(page_owner->handle);
413 if (!handle) { 447 if (!handle) {
414 pr_alert("page_owner info is not active (free page?)\n"); 448 pr_alert("page_owner allocation stack trace missing\n");
415 return; 449 } else {
450 nr_entries = stack_depot_fetch(handle, &entries);
451 stack_trace_print(entries, nr_entries, 0);
416 } 452 }
417 453
418 nr_entries = stack_depot_fetch(handle, &entries); 454#ifdef CONFIG_DEBUG_PAGEALLOC
419 pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", 455 handle = READ_ONCE(page_owner->free_handle);
420 page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); 456 if (!handle) {
421 stack_trace_print(entries, nr_entries, 0); 457 pr_alert("page_owner free stack trace missing\n");
458 } else {
459 nr_entries = stack_depot_fetch(handle, &entries);
460 pr_alert("page last free stack trace:\n");
461 stack_trace_print(entries, nr_entries, 0);
462 }
463#endif
422 464
423 if (page_owner->last_migrate_reason != -1) 465 if (page_owner->last_migrate_reason != -1)
424 pr_alert("page has been migrated, last migrate reason: %s\n", 466 pr_alert("page has been migrated, last migrate reason: %s\n",
@@ -481,9 +523,23 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
481 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) 523 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
482 continue; 524 continue;
483 525
526 /*
527 * Although we do have the info about past allocation of free
528 * pages, it's not relevant for current memory usage.
529 */
530 if (!test_bit(PAGE_EXT_OWNER_ACTIVE, &page_ext->flags))
531 continue;
532
484 page_owner = get_page_owner(page_ext); 533 page_owner = get_page_owner(page_ext);
485 534
486 /* 535 /*
536 * Don't print "tail" pages of high-order allocations as that
537 * would inflate the stats.
538 */
539 if (!IS_ALIGNED(pfn, 1 << page_owner->order))
540 continue;
541
542 /*
487 * Access to page_ext->handle isn't synchronous so we should 543 * Access to page_ext->handle isn't synchronous so we should
488 * be careful to access it. 544 * be careful to access it.
489 */ 545 */
@@ -562,7 +618,8 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
562 continue; 618 continue;
563 619
564 /* Found early allocated page */ 620 /* Found early allocated page */
565 __set_page_owner_handle(page_ext, early_handle, 0, 0); 621 __set_page_owner_handle(page, page_ext, early_handle,
622 0, 0);
566 count++; 623 count++;
567 } 624 }
568 cond_resched(); 625 cond_resched();
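
After this patch page_owner keeps two bits per page: PAGE_EXT_OWNER is set at the first allocation and never cleared, so the last allocation stack stays reportable after a free, while PAGE_EXT_OWNER_ACTIVE says whether the page is currently allocated; with CONFIG_DEBUG_PAGEALLOC a free-time stack is additionally stored in free_handle, and read_page_owner() now skips freed pages and non-head pfns. A userspace model of the two-bit lifecycle with plain flags; the bit values and _demo helpers are invented, only the state transitions and the printed states mirror the hunks above:

#include <stdio.h>

#define OWNER		(1u << 0)	/* owner info was recorded at least once */
#define OWNER_ACTIVE	(1u << 1)	/* page is currently allocated */

static void alloc_page_demo(unsigned int *flags)
{
	*flags |= OWNER | OWNER_ACTIVE;
}

static void free_page_demo(unsigned int *flags)
{
	*flags &= ~OWNER_ACTIVE;	/* OWNER deliberately survives the free */
}

static void dump(unsigned int flags)
{
	if (!(flags & OWNER))
		printf("page_owner info is not present (never set?)\n");
	else if (flags & OWNER_ACTIVE)
		printf("page_owner tracks the page as allocated\n");
	else
		printf("page_owner tracks the page as freed\n");
}

int main(void)
{
	unsigned int flags = 0;

	dump(flags);			/* never allocated */
	alloc_page_demo(&flags);
	dump(flags);			/* allocated */
	free_page_demo(&flags);
	dump(flags);			/* freed, allocation info still available */
	return 0;
}
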
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 21d4f97cb49b..34b9181ee5d1 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -101,7 +101,7 @@ static void unpoison_page(struct page *page)
101 /* 101 /*
102 * Page poisoning when enabled poisons each and every page 102 * Page poisoning when enabled poisons each and every page
103 * that is freed to buddy. Thus no extra check is done to 103 * that is freed to buddy. Thus no extra check is done to
104 * see if a page was posioned. 104 * see if a page was poisoned.
105 */ 105 */
106 check_poison_mem(addr, PAGE_SIZE); 106 check_poison_mem(addr, PAGE_SIZE);
107 kunmap_atomic(addr); 107 kunmap_atomic(addr);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 11df03e71288..eff4b4520c8d 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -153,8 +153,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
153 153
154 if (unlikely(PageHuge(pvmw->page))) { 154 if (unlikely(PageHuge(pvmw->page))) {
155 /* when pud is not present, pte will be NULL */ 155 /* when pud is not present, pte will be NULL */
156 pvmw->pte = huge_pte_offset(mm, pvmw->address, 156 pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
157 PAGE_SIZE << compound_order(page));
158 if (!pvmw->pte) 157 if (!pvmw->pte)
159 return false; 158 return false;
160 159
diff --git a/mm/quicklist.c b/mm/quicklist.c
deleted file mode 100644
index 5e98ac78e410..000000000000
--- a/mm/quicklist.c
+++ /dev/null
@@ -1,103 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Quicklist support.
4 *
5 * Quicklists are light weight lists of pages that have a defined state
6 * on alloc and free. Pages must be in the quicklist specific defined state
7 * (zero by default) when the page is freed. It seems that the initial idea
8 * for such lists first came from Dave Miller and then various other people
9 * improved on it.
10 *
11 * Copyright (C) 2007 SGI,
12 * Christoph Lameter <cl@linux.com>
13 * Generalized, added support for multiple lists and
14 * constructors / destructors.
15 */
16#include <linux/kernel.h>
17
18#include <linux/gfp.h>
19#include <linux/mm.h>
20#include <linux/mmzone.h>
21#include <linux/quicklist.h>
22
23DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
24
25#define FRACTION_OF_NODE_MEM 16
26
27static unsigned long max_pages(unsigned long min_pages)
28{
29 unsigned long node_free_pages, max;
30 int node = numa_node_id();
31 struct zone *zones = NODE_DATA(node)->node_zones;
32 int num_cpus_on_node;
33
34 node_free_pages =
35#ifdef CONFIG_ZONE_DMA
36 zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) +
37#endif
38#ifdef CONFIG_ZONE_DMA32
39 zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) +
40#endif
41 zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
42
43 max = node_free_pages / FRACTION_OF_NODE_MEM;
44
45 num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
46 max /= num_cpus_on_node;
47
48 return max(max, min_pages);
49}
50
51static long min_pages_to_free(struct quicklist *q,
52 unsigned long min_pages, long max_free)
53{
54 long pages_to_free;
55
56 pages_to_free = q->nr_pages - max_pages(min_pages);
57
58 return min(pages_to_free, max_free);
59}
60
61/*
62 * Trim down the number of pages in the quicklist
63 */
64void quicklist_trim(int nr, void (*dtor)(void *),
65 unsigned long min_pages, unsigned long max_free)
66{
67 long pages_to_free;
68 struct quicklist *q;
69
70 q = &get_cpu_var(quicklist)[nr];
71 if (q->nr_pages > min_pages) {
72 pages_to_free = min_pages_to_free(q, min_pages, max_free);
73
74 while (pages_to_free > 0) {
75 /*
76 * We pass a gfp_t of 0 to quicklist_alloc here
77 * because we will never call into the page allocator.
78 */
79 void *p = quicklist_alloc(nr, 0, NULL);
80
81 if (dtor)
82 dtor(p);
83 free_page((unsigned long)p);
84 pages_to_free--;
85 }
86 }
87 put_cpu_var(quicklist);
88}
89
90unsigned long quicklist_total_size(void)
91{
92 unsigned long count = 0;
93 int cpu;
94 struct quicklist *ql, *q;
95
96 for_each_online_cpu(cpu) {
97 ql = per_cpu(quicklist, cpu);
98 for (q = ql; q < ql + CONFIG_NR_QUICK; q++)
99 count += q->nr_pages;
100 }
101 return count;
102}
103
diff --git a/mm/rmap.c b/mm/rmap.c
index 003377e24232..d9a23bb773bf 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -898,15 +898,13 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
898 */ 898 */
899 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 899 mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
900 0, vma, vma->vm_mm, address, 900 0, vma, vma->vm_mm, address,
901 min(vma->vm_end, address + 901 min(vma->vm_end, address + page_size(page)));
902 (PAGE_SIZE << compound_order(page))));
903 mmu_notifier_invalidate_range_start(&range); 902 mmu_notifier_invalidate_range_start(&range);
904 903
905 while (page_vma_mapped_walk(&pvmw)) { 904 while (page_vma_mapped_walk(&pvmw)) {
906 unsigned long cstart;
907 int ret = 0; 905 int ret = 0;
908 906
909 cstart = address = pvmw.address; 907 address = pvmw.address;
910 if (pvmw.pte) { 908 if (pvmw.pte) {
911 pte_t entry; 909 pte_t entry;
912 pte_t *pte = pvmw.pte; 910 pte_t *pte = pvmw.pte;
@@ -933,7 +931,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
933 entry = pmd_wrprotect(entry); 931 entry = pmd_wrprotect(entry);
934 entry = pmd_mkclean(entry); 932 entry = pmd_mkclean(entry);
935 set_pmd_at(vma->vm_mm, address, pmd, entry); 933 set_pmd_at(vma->vm_mm, address, pmd, entry);
936 cstart &= PMD_MASK;
937 ret = 1; 934 ret = 1;
938#else 935#else
939 /* unexpected pmd-mapped page? */ 936 /* unexpected pmd-mapped page? */
@@ -1192,8 +1189,10 @@ void page_add_file_rmap(struct page *page, bool compound)
1192 } 1189 }
1193 if (!atomic_inc_and_test(compound_mapcount_ptr(page))) 1190 if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
1194 goto out; 1191 goto out;
1195 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 1192 if (PageSwapBacked(page))
1196 __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); 1193 __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
1194 else
1195 __inc_node_page_state(page, NR_FILE_PMDMAPPED);
1197 } else { 1196 } else {
1198 if (PageTransCompound(page) && page_mapping(page)) { 1197 if (PageTransCompound(page) && page_mapping(page)) {
1199 VM_WARN_ON_ONCE(!PageLocked(page)); 1198 VM_WARN_ON_ONCE(!PageLocked(page));
@@ -1232,8 +1231,10 @@ static void page_remove_file_rmap(struct page *page, bool compound)
1232 } 1231 }
1233 if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) 1232 if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1234 goto out; 1233 goto out;
1235 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 1234 if (PageSwapBacked(page))
1236 __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); 1235 __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
1236 else
1237 __dec_node_page_state(page, NR_FILE_PMDMAPPED);
1237 } else { 1238 } else {
1238 if (!atomic_add_negative(-1, &page->_mapcount)) 1239 if (!atomic_add_negative(-1, &page->_mapcount))
1239 goto out; 1240 goto out;
@@ -1374,8 +1375,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1374 */ 1375 */
1375 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, 1376 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1376 address, 1377 address,
1377 min(vma->vm_end, address + 1378 min(vma->vm_end, address + page_size(page)));
1378 (PAGE_SIZE << compound_order(page))));
1379 if (PageHuge(page)) { 1379 if (PageHuge(page)) {
1380 /* 1380 /*
1381 * If sharing is possible, start and end will be adjusted 1381 * If sharing is possible, start and end will be adjusted
@@ -1524,8 +1524,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1524 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { 1524 if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
1525 pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); 1525 pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
1526 if (PageHuge(page)) { 1526 if (PageHuge(page)) {
1527 int nr = 1 << compound_order(page); 1527 hugetlb_count_sub(compound_nr(page), mm);
1528 hugetlb_count_sub(nr, mm);
1529 set_huge_swap_pte_at(mm, address, 1528 set_huge_swap_pte_at(mm, address,
1530 pvmw.pte, pteval, 1529 pvmw.pte, pteval,
1531 vma_mmu_pagesize(vma)); 1530 vma_mmu_pagesize(vma));
diff --git a/mm/shmem.c b/mm/shmem.c
index 0f7fd4a85db6..30ce722c23fa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -609,7 +609,7 @@ static int shmem_add_to_page_cache(struct page *page,
609{ 609{
610 XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); 610 XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
611 unsigned long i = 0; 611 unsigned long i = 0;
612 unsigned long nr = 1UL << compound_order(page); 612 unsigned long nr = compound_nr(page);
613 613
614 VM_BUG_ON_PAGE(PageTail(page), page); 614 VM_BUG_ON_PAGE(PageTail(page), page);
615 VM_BUG_ON_PAGE(index != round_down(index, nr), page); 615 VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -631,7 +631,7 @@ static int shmem_add_to_page_cache(struct page *page,
631 if (xas_error(&xas)) 631 if (xas_error(&xas))
632 goto unlock; 632 goto unlock;
633next: 633next:
634 xas_store(&xas, page + i); 634 xas_store(&xas, page);
635 if (++i < nr) { 635 if (++i < nr) {
636 xas_next(&xas); 636 xas_next(&xas);
637 goto next; 637 goto next;
@@ -1734,7 +1734,7 @@ unlock:
1734 * vm. If we swap it in we mark it dirty since we also free the swap 1734 * vm. If we swap it in we mark it dirty since we also free the swap
1735 * entry since a page cannot live in both the swap and page cache. 1735 * entry since a page cannot live in both the swap and page cache.
1736 * 1736 *
1737 * fault_mm and fault_type are only supplied by shmem_fault: 1737 * vmf and fault_type are only supplied by shmem_fault:
1738 * otherwise they are NULL. 1738 * otherwise they are NULL.
1739 */ 1739 */
1740static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 1740static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
@@ -1884,7 +1884,7 @@ alloc_nohuge:
1884 lru_cache_add_anon(page); 1884 lru_cache_add_anon(page);
1885 1885
1886 spin_lock_irq(&info->lock); 1886 spin_lock_irq(&info->lock);
1887 info->alloced += 1 << compound_order(page); 1887 info->alloced += compound_nr(page);
1888 inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); 1888 inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
1889 shmem_recalc_inode(inode); 1889 shmem_recalc_inode(inode);
1890 spin_unlock_irq(&info->lock); 1890 spin_unlock_irq(&info->lock);
@@ -1925,7 +1925,7 @@ clear:
1925 struct page *head = compound_head(page); 1925 struct page *head = compound_head(page);
1926 int i; 1926 int i;
1927 1927
1928 for (i = 0; i < (1 << compound_order(head)); i++) { 1928 for (i = 0; i < compound_nr(head); i++) {
1929 clear_highpage(head + i); 1929 clear_highpage(head + i);
1930 flush_dcache_page(head + i); 1930 flush_dcache_page(head + i);
1931 } 1931 }
@@ -1952,7 +1952,7 @@ clear:
1952 * Error recovery. 1952 * Error recovery.
1953 */ 1953 */
1954unacct: 1954unacct:
1955 shmem_inode_unacct_blocks(inode, 1 << compound_order(page)); 1955 shmem_inode_unacct_blocks(inode, compound_nr(page));
1956 1956
1957 if (PageTransHuge(page)) { 1957 if (PageTransHuge(page)) {
1958 unlock_page(page); 1958 unlock_page(page);
diff --git a/mm/slab.h b/mm/slab.h
index 9057b8056b07..68e455f2b698 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -30,6 +30,69 @@ struct kmem_cache {
30 struct list_head list; /* List of all slab caches on the system */ 30 struct list_head list; /* List of all slab caches on the system */
31}; 31};
32 32
33#else /* !CONFIG_SLOB */
34
35struct memcg_cache_array {
36 struct rcu_head rcu;
37 struct kmem_cache *entries[0];
38};
39
40/*
41 * This is the main placeholder for memcg-related information in kmem caches.
42 * Both the root cache and the child caches will have it. For the root cache,
43 * this will hold a dynamically allocated array large enough to hold
44 * information about the currently limited memcgs in the system. To allow the
45 * array to be accessed without taking any locks, on relocation we free the old
46 * version only after a grace period.
47 *
48 * Root and child caches hold different metadata.
49 *
50 * @root_cache: Common to root and child caches. NULL for root, pointer to
51 * the root cache for children.
52 *
53 * The following fields are specific to root caches.
54 *
55 * @memcg_caches: kmemcg ID indexed table of child caches. This table is
56 * used to index child cachces during allocation and cleared
57 * early during shutdown.
58 *
59 * @root_caches_node: List node for slab_root_caches list.
60 *
61 * @children: List of all child caches. While the child caches are also
62 * reachable through @memcg_caches, a child cache remains on
63 * this list until it is actually destroyed.
64 *
65 * The following fields are specific to child caches.
66 *
67 * @memcg: Pointer to the memcg this cache belongs to.
68 *
69 * @children_node: List node for @root_cache->children list.
70 *
71 * @kmem_caches_node: List node for @memcg->kmem_caches list.
72 */
73struct memcg_cache_params {
74 struct kmem_cache *root_cache;
75 union {
76 struct {
77 struct memcg_cache_array __rcu *memcg_caches;
78 struct list_head __root_caches_node;
79 struct list_head children;
80 bool dying;
81 };
82 struct {
83 struct mem_cgroup *memcg;
84 struct list_head children_node;
85 struct list_head kmem_caches_node;
86 struct percpu_ref refcnt;
87
88 void (*work_fn)(struct kmem_cache *);
89 union {
90 struct rcu_head rcu_head;
91 struct work_struct work;
92 };
93 };
94 };
95};
33#endif /* CONFIG_SLOB */ 96#endif /* CONFIG_SLOB */
34 97
35#ifdef CONFIG_SLAB 98#ifdef CONFIG_SLAB
@@ -174,6 +237,7 @@ int __kmem_cache_shrink(struct kmem_cache *);
174void __kmemcg_cache_deactivate(struct kmem_cache *s); 237void __kmemcg_cache_deactivate(struct kmem_cache *s);
175void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s); 238void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
176void slab_kmem_cache_release(struct kmem_cache *); 239void slab_kmem_cache_release(struct kmem_cache *);
240void kmem_cache_shrink_all(struct kmem_cache *s);
177 241
178struct seq_file; 242struct seq_file;
179struct file; 243struct file;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 807490fe217a..6491c3a41805 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -981,6 +981,43 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
981} 981}
982EXPORT_SYMBOL(kmem_cache_shrink); 982EXPORT_SYMBOL(kmem_cache_shrink);
983 983
984/**
985 * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache
986 * @s: The cache pointer
987 */
988void kmem_cache_shrink_all(struct kmem_cache *s)
989{
990 struct kmem_cache *c;
991
992 if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) {
993 kmem_cache_shrink(s);
994 return;
995 }
996
997 get_online_cpus();
998 get_online_mems();
999 kasan_cache_shrink(s);
1000 __kmem_cache_shrink(s);
1001
1002 /*
1003 * We have to take the slab_mutex to protect from the memcg list
1004 * modification.
1005 */
1006 mutex_lock(&slab_mutex);
1007 for_each_memcg_cache(c, s) {
1008 /*
1009 * Don't need to shrink deactivated memcg caches.
1010 */
1011 if (s->flags & SLAB_DEACTIVATED)
1012 continue;
1013 kasan_cache_shrink(c);
1014 __kmem_cache_shrink(c);
1015 }
1016 mutex_unlock(&slab_mutex);
1017 put_online_mems();
1018 put_online_cpus();
1019}
1020
984bool slab_is_available(void) 1021bool slab_is_available(void)
985{ 1022{
986 return slab_state >= UP; 1023 return slab_state >= UP;
diff --git a/mm/slob.c b/mm/slob.c
index 7f421d0ca9ab..cf377beab962 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -539,7 +539,7 @@ size_t __ksize(const void *block)
539 539
540 sp = virt_to_page(block); 540 sp = virt_to_page(block);
541 if (unlikely(!PageSlab(sp))) 541 if (unlikely(!PageSlab(sp)))
542 return PAGE_SIZE << compound_order(sp); 542 return page_size(sp);
543 543
544 align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 544 align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
545 m = (unsigned int *)(block - align); 545 m = (unsigned int *)(block - align);
diff --git a/mm/slub.c b/mm/slub.c
index 8834563cdb4b..42c1b3af3c98 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -829,7 +829,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
829 return 1; 829 return 1;
830 830
831 start = page_address(page); 831 start = page_address(page);
832 length = PAGE_SIZE << compound_order(page); 832 length = page_size(page);
833 end = start + length; 833 end = start + length;
834 remainder = length % s->size; 834 remainder = length % s->size;
835 if (!remainder) 835 if (!remainder)
@@ -1074,13 +1074,14 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
1074 init_tracking(s, object); 1074 init_tracking(s, object);
1075} 1075}
1076 1076
1077static void setup_page_debug(struct kmem_cache *s, void *addr, int order) 1077static
1078void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
1078{ 1079{
1079 if (!(s->flags & SLAB_POISON)) 1080 if (!(s->flags & SLAB_POISON))
1080 return; 1081 return;
1081 1082
1082 metadata_access_enable(); 1083 metadata_access_enable();
1083 memset(addr, POISON_INUSE, PAGE_SIZE << order); 1084 memset(addr, POISON_INUSE, page_size(page));
1084 metadata_access_disable(); 1085 metadata_access_disable();
1085} 1086}
1086 1087
@@ -1340,8 +1341,8 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
1340#else /* !CONFIG_SLUB_DEBUG */ 1341#else /* !CONFIG_SLUB_DEBUG */
1341static inline void setup_object_debug(struct kmem_cache *s, 1342static inline void setup_object_debug(struct kmem_cache *s,
1342 struct page *page, void *object) {} 1343 struct page *page, void *object) {}
1343static inline void setup_page_debug(struct kmem_cache *s, 1344static inline
1344 void *addr, int order) {} 1345void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {}
1345 1346
1346static inline int alloc_debug_processing(struct kmem_cache *s, 1347static inline int alloc_debug_processing(struct kmem_cache *s,
1347 struct page *page, void *object, unsigned long addr) { return 0; } 1348 struct page *page, void *object, unsigned long addr) { return 0; }
@@ -1639,7 +1640,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1639 struct kmem_cache_order_objects oo = s->oo; 1640 struct kmem_cache_order_objects oo = s->oo;
1640 gfp_t alloc_gfp; 1641 gfp_t alloc_gfp;
1641 void *start, *p, *next; 1642 void *start, *p, *next;
1642 int idx, order; 1643 int idx;
1643 bool shuffle; 1644 bool shuffle;
1644 1645
1645 flags &= gfp_allowed_mask; 1646 flags &= gfp_allowed_mask;
@@ -1673,7 +1674,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1673 1674
1674 page->objects = oo_objects(oo); 1675 page->objects = oo_objects(oo);
1675 1676
1676 order = compound_order(page);
1677 page->slab_cache = s; 1677 page->slab_cache = s;
1678 __SetPageSlab(page); 1678 __SetPageSlab(page);
1679 if (page_is_pfmemalloc(page)) 1679 if (page_is_pfmemalloc(page))
@@ -1683,7 +1683,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1683 1683
1684 start = page_address(page); 1684 start = page_address(page);
1685 1685
1686 setup_page_debug(s, start, order); 1686 setup_page_debug(s, page, start);
1687 1687
1688 shuffle = shuffle_freelist(s, page); 1688 shuffle = shuffle_freelist(s, page);
1689 1689
@@ -2004,6 +2004,7 @@ static inline unsigned long next_tid(unsigned long tid)
2004 return tid + TID_STEP; 2004 return tid + TID_STEP;
2005} 2005}
2006 2006
2007#ifdef SLUB_DEBUG_CMPXCHG
2007static inline unsigned int tid_to_cpu(unsigned long tid) 2008static inline unsigned int tid_to_cpu(unsigned long tid)
2008{ 2009{
2009 return tid % TID_STEP; 2010 return tid % TID_STEP;
@@ -2013,6 +2014,7 @@ static inline unsigned long tid_to_event(unsigned long tid)
2013{ 2014{
2014 return tid / TID_STEP; 2015 return tid / TID_STEP;
2015} 2016}
2017#endif
2016 2018
2017static inline unsigned int init_tid(int cpu) 2019static inline unsigned int init_tid(int cpu)
2018{ 2020{
@@ -3930,7 +3932,7 @@ size_t __ksize(const void *object)
3930 3932
3931 if (unlikely(!PageSlab(page))) { 3933 if (unlikely(!PageSlab(page))) {
3932 WARN_ON(!PageCompound(page)); 3934 WARN_ON(!PageCompound(page));
3933 return PAGE_SIZE << compound_order(page); 3935 return page_size(page);
3934 } 3936 }
3935 3937
3936 return slab_ksize(page->slab_cache); 3938 return slab_ksize(page->slab_cache);
@@ -5298,7 +5300,7 @@ static ssize_t shrink_store(struct kmem_cache *s,
5298 const char *buf, size_t length) 5300 const char *buf, size_t length)
5299{ 5301{
5300 if (buf[0] == '1') 5302 if (buf[0] == '1')
5301 kmem_cache_shrink(s); 5303 kmem_cache_shrink_all(s);
5302 else 5304 else
5303 return -EINVAL; 5305 return -EINVAL;
5304 return length; 5306 return length;
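
Besides the page_size()/setup_page_debug() cleanups and routing the sysfs "shrink" attribute through kmem_cache_shrink_all() so memcg child caches get shrunk too, this hunk wraps tid_to_cpu() and tid_to_event() in SLUB_DEBUG_CMPXCHG, since only the cmpxchg-failure diagnostics decode them. A sketch of the transaction-id encoding those helpers decode, assuming the preemptible layout where tids start at the CPU number and advance in a power-of-two step no smaller than the CPU count; the step of 64 and the _tid helper copies are purely for the demo:

#include <stdio.h>

#define TID_STEP 64UL	/* stand-in for the power-of-two step derived from NR_CPUS */

static unsigned long init_tid(unsigned int cpu)      { return cpu; }
static unsigned long next_tid(unsigned long tid)     { return tid + TID_STEP; }
static unsigned int  tid_to_cpu(unsigned long tid)   { return tid % TID_STEP; }
static unsigned long tid_to_event(unsigned long tid) { return tid / TID_STEP; }

int main(void)
{
	unsigned long tid = init_tid(5);	/* CPU 5 */

	tid = next_tid(next_tid(tid));		/* two transactions later */
	printf("tid=%lu -> cpu=%u event=%lu\n",
	       tid, tid_to_cpu(tid), tid_to_event(tid));	/* cpu=5 event=2 */
	return 0;
}
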
diff --git a/mm/sparse.c b/mm/sparse.c
index 72f010d9bff5..bf32de9e666b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -11,6 +11,8 @@
11#include <linux/export.h> 11#include <linux/export.h>
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/swap.h>
15#include <linux/swapops.h>
14 16
15#include "internal.h" 17#include "internal.h"
16#include <asm/dma.h> 18#include <asm/dma.h>
@@ -470,6 +472,12 @@ struct page __init *__populate_section_memmap(unsigned long pfn,
470static void *sparsemap_buf __meminitdata; 472static void *sparsemap_buf __meminitdata;
471static void *sparsemap_buf_end __meminitdata; 473static void *sparsemap_buf_end __meminitdata;
472 474
475static inline void __meminit sparse_buffer_free(unsigned long size)
476{
477 WARN_ON(!sparsemap_buf || size == 0);
478 memblock_free_early(__pa(sparsemap_buf), size);
479}
480
473static void __init sparse_buffer_init(unsigned long size, int nid) 481static void __init sparse_buffer_init(unsigned long size, int nid)
474{ 482{
475 phys_addr_t addr = __pa(MAX_DMA_ADDRESS); 483 phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
@@ -486,7 +494,7 @@ static void __init sparse_buffer_fini(void)
486 unsigned long size = sparsemap_buf_end - sparsemap_buf; 494 unsigned long size = sparsemap_buf_end - sparsemap_buf;
487 495
488 if (sparsemap_buf && size > 0) 496 if (sparsemap_buf && size > 0)
489 memblock_free_early(__pa(sparsemap_buf), size); 497 sparse_buffer_free(size);
490 sparsemap_buf = NULL; 498 sparsemap_buf = NULL;
491} 499}
492 500
@@ -495,11 +503,15 @@ void * __meminit sparse_buffer_alloc(unsigned long size)
495 void *ptr = NULL; 503 void *ptr = NULL;
496 504
497 if (sparsemap_buf) { 505 if (sparsemap_buf) {
498 ptr = PTR_ALIGN(sparsemap_buf, size); 506 ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
499 if (ptr + size > sparsemap_buf_end) 507 if (ptr + size > sparsemap_buf_end)
500 ptr = NULL; 508 ptr = NULL;
501 else 509 else {
510 /* Free redundant aligned space */
511 if ((unsigned long)(ptr - sparsemap_buf) > 0)
512 sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
502 sparsemap_buf = ptr + size; 513 sparsemap_buf = ptr + size;
514 }
503 } 515 }
504 return ptr; 516 return ptr;
505} 517}
@@ -867,7 +879,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
867 */ 879 */
868 page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); 880 page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
869 881
870 ms = __pfn_to_section(start_pfn); 882 ms = __nr_to_section(section_nr);
871 set_section_nid(section_nr, nid); 883 set_section_nid(section_nr, nid);
872 section_mark_present(ms); 884 section_mark_present(ms);
873 885
@@ -884,9 +896,6 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
884{ 896{
885 int i; 897 int i;
886 898
887 if (!memmap)
888 return;
889
890 /* 899 /*
891 * A further optimization is to have per section refcounted 900 * A further optimization is to have per section refcounted
892 * num_poisoned_pages. But that would need more space per memmap, so 901 * num_poisoned_pages. But that would need more space per memmap, so
@@ -898,7 +907,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
898 907
899 for (i = 0; i < nr_pages; i++) { 908 for (i = 0; i < nr_pages; i++) {
900 if (PageHWPoison(&memmap[i])) { 909 if (PageHWPoison(&memmap[i])) {
901 atomic_long_sub(1, &num_poisoned_pages); 910 num_poisoned_pages_dec();
902 ClearPageHWPoison(&memmap[i]); 911 ClearPageHWPoison(&memmap[i]);
903 } 912 }
904 } 913 }
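
sparse_buffer_alloc() now aligns its cursor with roundup(), which also behaves for sizes that are not powers of two, and hands the padding created by alignment back via the new sparse_buffer_free() instead of leaking it. A userspace bump-allocator model of that change; malloc'd memory and a running counter stand in for the memblock buffer and memblock_free_early(), and the sizes are arbitrary:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

static char *buf, *buf_end;
static size_t freed_padding;		/* stands in for sparse_buffer_free() */

static void *buffer_alloc(size_t size)
{
	uintptr_t cur = (uintptr_t)buf;
	uintptr_t aligned = ROUNDUP(cur, (uintptr_t)size);
	char *ptr = buf + (aligned - cur);

	if (ptr + size > buf_end)
		return NULL;		/* caller falls back to a fresh allocation */

	if (ptr != buf)			/* give the alignment gap back, don't leak it */
		freed_padding += (size_t)(ptr - buf);
	buf = ptr + size;
	return ptr;
}

int main(void)
{
	char *base = malloc(4096);
	void *a, *b;

	if (!base)
		return 1;
	buf = base;
	buf_end = base + 4096;

	a = buffer_alloc(384);		/* a size that is not a power of two */
	b = buffer_alloc(384);
	printf("a=%p b=%p padding returned=%zu\n", a, b, freed_padding);

	free(base);
	return 0;
}
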
diff --git a/mm/swap.c b/mm/swap.c
index ae300397dfda..784dc1620620 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -71,12 +71,12 @@ static void __page_cache_release(struct page *page)
71 spin_unlock_irqrestore(&pgdat->lru_lock, flags); 71 spin_unlock_irqrestore(&pgdat->lru_lock, flags);
72 } 72 }
73 __ClearPageWaiters(page); 73 __ClearPageWaiters(page);
74 mem_cgroup_uncharge(page);
75} 74}
76 75
77static void __put_single_page(struct page *page) 76static void __put_single_page(struct page *page)
78{ 77{
79 __page_cache_release(page); 78 __page_cache_release(page);
79 mem_cgroup_uncharge(page);
80 free_unref_page(page); 80 free_unref_page(page);
81} 81}
82 82
@@ -515,7 +515,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
515 del_page_from_lru_list(page, lruvec, lru + active); 515 del_page_from_lru_list(page, lruvec, lru + active);
516 ClearPageActive(page); 516 ClearPageActive(page);
517 ClearPageReferenced(page); 517 ClearPageReferenced(page);
518 add_page_to_lru_list(page, lruvec, lru);
519 518
520 if (PageWriteback(page) || PageDirty(page)) { 519 if (PageWriteback(page) || PageDirty(page)) {
521 /* 520 /*
@@ -523,13 +522,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
523 * It can make readahead confusing. But race window 522 * It can make readahead confusing. But race window
524 * is _really_ small and it's non-critical problem. 523 * is _really_ small and it's non-critical problem.
525 */ 524 */
525 add_page_to_lru_list(page, lruvec, lru);
526 SetPageReclaim(page); 526 SetPageReclaim(page);
527 } else { 527 } else {
528 /* 528 /*
529 * The page's writeback ends up during pagevec 529 * The page's writeback ends up during pagevec
530 * We moves tha page into tail of inactive. 530 * We moves tha page into tail of inactive.
531 */ 531 */
532 list_move_tail(&page->lru, &lruvec->lists[lru]); 532 add_page_to_lru_list_tail(page, lruvec, lru);
533 __count_vm_event(PGROTATED); 533 __count_vm_event(PGROTATED);
534 } 534 }
535 535
@@ -844,17 +844,15 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
844 get_page(page_tail); 844 get_page(page_tail);
845 list_add_tail(&page_tail->lru, list); 845 list_add_tail(&page_tail->lru, list);
846 } else { 846 } else {
847 struct list_head *list_head;
848 /* 847 /*
849 * Head page has not yet been counted, as an hpage, 848 * Head page has not yet been counted, as an hpage,
850 * so we must account for each subpage individually. 849 * so we must account for each subpage individually.
851 * 850 *
852 * Use the standard add function to put page_tail on the list, 851 * Put page_tail on the list at the correct position
853 * but then correct its position so they all end up in order. 852 * so they all end up in order.
854 */ 853 */
855 add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail)); 854 add_page_to_lru_list_tail(page_tail, lruvec,
856 list_head = page_tail->lru.prev; 855 page_lru(page_tail));
857 list_move_tail(&page_tail->lru, list_head);
858 } 856 }
859 857
860 if (!PageUnevictable(page)) 858 if (!PageUnevictable(page))
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8368621a0fc7..8e7ce9a9bc5e 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -116,7 +116,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
116 struct address_space *address_space = swap_address_space(entry); 116 struct address_space *address_space = swap_address_space(entry);
117 pgoff_t idx = swp_offset(entry); 117 pgoff_t idx = swp_offset(entry);
118 XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); 118 XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
119 unsigned long i, nr = 1UL << compound_order(page); 119 unsigned long i, nr = compound_nr(page);
120 120
121 VM_BUG_ON_PAGE(!PageLocked(page), page); 121 VM_BUG_ON_PAGE(!PageLocked(page), page);
122 VM_BUG_ON_PAGE(PageSwapCache(page), page); 122 VM_BUG_ON_PAGE(PageSwapCache(page), page);
@@ -133,7 +133,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
133 for (i = 0; i < nr; i++) { 133 for (i = 0; i < nr; i++) {
134 VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); 134 VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
135 set_page_private(page + i, entry.val + i); 135 set_page_private(page + i, entry.val + i);
136 xas_store(&xas, page + i); 136 xas_store(&xas, page);
137 xas_next(&xas); 137 xas_next(&xas);
138 } 138 }
139 address_space->nrpages += nr; 139 address_space->nrpages += nr;
@@ -168,7 +168,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
168 168
169 for (i = 0; i < nr; i++) { 169 for (i = 0; i < nr; i++) {
170 void *entry = xas_store(&xas, NULL); 170 void *entry = xas_store(&xas, NULL);
171 VM_BUG_ON_PAGE(entry != page + i, entry); 171 VM_BUG_ON_PAGE(entry != page, entry);
172 set_page_private(page + i, 0); 172 set_page_private(page + i, 0);
173 xas_next(&xas); 173 xas_next(&xas);
174 } 174 }
diff --git a/mm/util.c b/mm/util.c
index e6351a80f248..3ad6db9a722e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -16,6 +16,13 @@
16#include <linux/hugetlb.h> 16#include <linux/hugetlb.h>
17#include <linux/vmalloc.h> 17#include <linux/vmalloc.h>
18#include <linux/userfaultfd_k.h> 18#include <linux/userfaultfd_k.h>
19#include <linux/elf.h>
20#include <linux/elf-randomize.h>
21#include <linux/personality.h>
22#include <linux/random.h>
23#include <linux/processor.h>
24#include <linux/sizes.h>
25#include <linux/compat.h>
19 26
20#include <linux/uaccess.h> 27#include <linux/uaccess.h>
21 28
@@ -293,7 +300,105 @@ int vma_is_stack_for_current(struct vm_area_struct *vma)
293 return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); 300 return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
294} 301}
295 302
296#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 303#ifndef STACK_RND_MASK
304#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
305#endif
306
307unsigned long randomize_stack_top(unsigned long stack_top)
308{
309 unsigned long random_variable = 0;
310
311 if (current->flags & PF_RANDOMIZE) {
312 random_variable = get_random_long();
313 random_variable &= STACK_RND_MASK;
314 random_variable <<= PAGE_SHIFT;
315 }
316#ifdef CONFIG_STACK_GROWSUP
317 return PAGE_ALIGN(stack_top) + random_variable;
318#else
319 return PAGE_ALIGN(stack_top) - random_variable;
320#endif
321}
322
323#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
324unsigned long arch_randomize_brk(struct mm_struct *mm)
325{
326 /* Is the current task 32bit ? */
327 if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
328 return randomize_page(mm->brk, SZ_32M);
329
330 return randomize_page(mm->brk, SZ_1G);
331}
332
333unsigned long arch_mmap_rnd(void)
334{
335 unsigned long rnd;
336
337#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
338 if (is_compat_task())
339 rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
340 else
341#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
342 rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
343
344 return rnd << PAGE_SHIFT;
345}
346
347static int mmap_is_legacy(struct rlimit *rlim_stack)
348{
349 if (current->personality & ADDR_COMPAT_LAYOUT)
350 return 1;
351
352 if (rlim_stack->rlim_cur == RLIM_INFINITY)
353 return 1;
354
355 return sysctl_legacy_va_layout;
356}
357
358/*
359 * Leave enough space between the mmap area and the stack to honour ulimit in
360 * the face of randomisation.
361 */
362#define MIN_GAP (SZ_128M)
363#define MAX_GAP (STACK_TOP / 6 * 5)
364
365static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
366{
367 unsigned long gap = rlim_stack->rlim_cur;
368 unsigned long pad = stack_guard_gap;
369
370 /* Account for stack randomization if necessary */
371 if (current->flags & PF_RANDOMIZE)
372 pad += (STACK_RND_MASK << PAGE_SHIFT);
373
374 /* Values close to RLIM_INFINITY can overflow. */
375 if (gap + pad > gap)
376 gap += pad;
377
378 if (gap < MIN_GAP)
379 gap = MIN_GAP;
380 else if (gap > MAX_GAP)
381 gap = MAX_GAP;
382
383 return PAGE_ALIGN(STACK_TOP - gap - rnd);
384}
385
386void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
387{
388 unsigned long random_factor = 0UL;
389
390 if (current->flags & PF_RANDOMIZE)
391 random_factor = arch_mmap_rnd();
392
393 if (mmap_is_legacy(rlim_stack)) {
394 mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
395 mm->get_unmapped_area = arch_get_unmapped_area;
396 } else {
397 mm->mmap_base = mmap_base(random_factor, rlim_stack);
398 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
399 }
400}
401#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
297void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) 402void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
298{ 403{
299 mm->mmap_base = TASK_UNMAPPED_BASE; 404 mm->mmap_base = TASK_UNMAPPED_BASE;
@@ -521,7 +626,7 @@ bool page_mapped(struct page *page)
521 return true; 626 return true;
522 if (PageHuge(page)) 627 if (PageHuge(page))
523 return false; 628 return false;
524 for (i = 0; i < (1 << compound_order(page)); i++) { 629 for (i = 0; i < compound_nr(page); i++) {
525 if (atomic_read(&page[i]._mapcount) >= 0) 630 if (atomic_read(&page[i]._mapcount) >= 0)
526 return true; 631 return true;
527 } 632 }
@@ -783,3 +888,16 @@ out_mm:
783out: 888out:
784 return res; 889 return res;
785} 890}
891
892int memcmp_pages(struct page *page1, struct page *page2)
893{
894 char *addr1, *addr2;
895 int ret;
896
897 addr1 = kmap_atomic(page1);
898 addr2 = kmap_atomic(page2);
899 ret = memcmp(addr1, addr2, PAGE_SIZE);
900 kunmap_atomic(addr2);
901 kunmap_atomic(addr1);
902 return ret;
903}
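
The block added to mm/util.c under CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT is the architecture-generic top-down layout: randomize_stack_top(), arch_randomize_brk(), arch_mmap_rnd(), mmap_base() and arch_pick_mmap_layout(), plus the small memcmp_pages() helper at the end. An arithmetic-only model of mmap_base(): pad the stack rlimit with the guard gap and the worst-case stack randomization, clamp the result between MIN_GAP and MAX_GAP, then place the base below STACK_TOP. Every constant below is an example value for the sketch, not any particular architecture's configuration:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_ALIGN(x)	(((x) + (1UL << PAGE_SHIFT) - 1) & ~((1UL << PAGE_SHIFT) - 1))
#define STACK_TOP	0x7ffffffff000UL	/* demo value */
#define STACK_RND_MASK	0x3ffffUL		/* demo value */
#define MIN_GAP		(128UL << 20)
#define MAX_GAP		(STACK_TOP / 6 * 5)

static unsigned long mmap_base_demo(unsigned long rnd, unsigned long rlim_stack,
				    unsigned long guard_gap, int randomize)
{
	unsigned long gap = rlim_stack;
	unsigned long pad = guard_gap;

	if (randomize)
		pad += STACK_RND_MASK << PAGE_SHIFT;	/* worst-case stack jitter */

	if (gap + pad > gap)		/* values near RLIM_INFINITY can overflow */
		gap += pad;

	if (gap < MIN_GAP)
		gap = MIN_GAP;
	else if (gap > MAX_GAP)
		gap = MAX_GAP;

	return PAGE_ALIGN(STACK_TOP - gap - rnd);
}

int main(void)
{
	/* 8MB stack rlimit, 1MB guard gap, 16MB of mmap randomization. */
	unsigned long base = mmap_base_demo(16UL << 20, 8UL << 20,
					    256UL << PAGE_SHIFT, 1);

	printf("mmap base: 0x%lx\n", base);
	return 0;
}
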
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c1246d77cf75..fcadd3e25c0c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -329,8 +329,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
329#define DEBUG_AUGMENT_PROPAGATE_CHECK 0 329#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
330#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 330#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
331 331
332#define VM_LAZY_FREE 0x02
333#define VM_VM_AREA 0x04
334 332
335static DEFINE_SPINLOCK(vmap_area_lock); 333static DEFINE_SPINLOCK(vmap_area_lock);
336/* Export for kexec only */ 334/* Export for kexec only */
@@ -1116,7 +1114,7 @@ retry:
1116 1114
1117 va->va_start = addr; 1115 va->va_start = addr;
1118 va->va_end = addr + size; 1116 va->va_end = addr + size;
1119 va->flags = 0; 1117 va->vm = NULL;
1120 insert_vmap_area(va, &vmap_area_root, &vmap_area_list); 1118 insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
1121 1119
1122 spin_unlock(&vmap_area_lock); 1120 spin_unlock(&vmap_area_lock);
@@ -1282,7 +1280,14 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
1282 llist_for_each_entry_safe(va, n_va, valist, purge_list) { 1280 llist_for_each_entry_safe(va, n_va, valist, purge_list) {
1283 unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; 1281 unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
1284 1282
1285 __free_vmap_area(va); 1283 /*
1284 * Finally insert or merge lazily-freed area. It is
1285 * detached and there is no need to "unlink" it from
1286 * anything.
1287 */
1288 merge_or_add_vmap_area(va,
1289 &free_vmap_area_root, &free_vmap_area_list);
1290
1286 atomic_long_sub(nr, &vmap_lazy_nr); 1291 atomic_long_sub(nr, &vmap_lazy_nr);
1287 1292
1288 if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) 1293 if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
@@ -1324,6 +1329,10 @@ static void free_vmap_area_noflush(struct vmap_area *va)
1324{ 1329{
1325 unsigned long nr_lazy; 1330 unsigned long nr_lazy;
1326 1331
1332 spin_lock(&vmap_area_lock);
1333 unlink_va(va, &vmap_area_root);
1334 spin_unlock(&vmap_area_lock);
1335
1327 nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> 1336 nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
1328 PAGE_SHIFT, &vmap_lazy_nr); 1337 PAGE_SHIFT, &vmap_lazy_nr);
1329 1338
@@ -1918,7 +1927,6 @@ void __init vmalloc_init(void)
1918 if (WARN_ON_ONCE(!va)) 1927 if (WARN_ON_ONCE(!va))
1919 continue; 1928 continue;
1920 1929
1921 va->flags = VM_VM_AREA;
1922 va->va_start = (unsigned long)tmp->addr; 1930 va->va_start = (unsigned long)tmp->addr;
1923 va->va_end = va->va_start + tmp->size; 1931 va->va_end = va->va_start + tmp->size;
1924 va->vm = tmp; 1932 va->vm = tmp;
@@ -2016,7 +2024,6 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
2016 vm->size = va->va_end - va->va_start; 2024 vm->size = va->va_end - va->va_start;
2017 vm->caller = caller; 2025 vm->caller = caller;
2018 va->vm = vm; 2026 va->vm = vm;
2019 va->flags |= VM_VM_AREA;
2020 spin_unlock(&vmap_area_lock); 2027 spin_unlock(&vmap_area_lock);
2021} 2028}
2022 2029
@@ -2121,10 +2128,10 @@ struct vm_struct *find_vm_area(const void *addr)
2121 struct vmap_area *va; 2128 struct vmap_area *va;
2122 2129
2123 va = find_vmap_area((unsigned long)addr); 2130 va = find_vmap_area((unsigned long)addr);
2124 if (va && va->flags & VM_VM_AREA) 2131 if (!va)
2125 return va->vm; 2132 return NULL;
2126 2133
2127 return NULL; 2134 return va->vm;
2128} 2135}
2129 2136
2130/** 2137/**
@@ -2143,14 +2150,12 @@ struct vm_struct *remove_vm_area(const void *addr)
2143 2150
2144 might_sleep(); 2151 might_sleep();
2145 2152
2146 va = find_vmap_area((unsigned long)addr); 2153 spin_lock(&vmap_area_lock);
2147 if (va && va->flags & VM_VM_AREA) { 2154 va = __find_vmap_area((unsigned long)addr);
2155 if (va && va->vm) {
2148 struct vm_struct *vm = va->vm; 2156 struct vm_struct *vm = va->vm;
2149 2157
2150 spin_lock(&vmap_area_lock);
2151 va->vm = NULL; 2158 va->vm = NULL;
2152 va->flags &= ~VM_VM_AREA;
2153 va->flags |= VM_LAZY_FREE;
2154 spin_unlock(&vmap_area_lock); 2159 spin_unlock(&vmap_area_lock);
2155 2160
2156 kasan_free_shadow(vm); 2161 kasan_free_shadow(vm);
@@ -2158,6 +2163,8 @@ struct vm_struct *remove_vm_area(const void *addr)
2158 2163
2159 return vm; 2164 return vm;
2160 } 2165 }
2166
2167 spin_unlock(&vmap_area_lock);
2161 return NULL; 2168 return NULL;
2162} 2169}
2163 2170
@@ -2402,7 +2409,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
2402 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; 2409 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
2403 array_size = (nr_pages * sizeof(struct page *)); 2410 array_size = (nr_pages * sizeof(struct page *));
2404 2411
2405 area->nr_pages = nr_pages;
2406 /* Please note that the recursion is strictly bounded. */ 2412 /* Please note that the recursion is strictly bounded. */
2407 if (array_size > PAGE_SIZE) { 2413 if (array_size > PAGE_SIZE) {
2408 pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask, 2414 pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
@@ -2410,13 +2416,16 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
2410 } else { 2416 } else {
2411 pages = kmalloc_node(array_size, nested_gfp, node); 2417 pages = kmalloc_node(array_size, nested_gfp, node);
2412 } 2418 }
2413 area->pages = pages; 2419
2414 if (!area->pages) { 2420 if (!pages) {
2415 remove_vm_area(area->addr); 2421 remove_vm_area(area->addr);
2416 kfree(area); 2422 kfree(area);
2417 return NULL; 2423 return NULL;
2418 } 2424 }
2419 2425
2426 area->pages = pages;
2427 area->nr_pages = nr_pages;
2428
2420 for (i = 0; i < area->nr_pages; i++) { 2429 for (i = 0; i < area->nr_pages; i++) {
2421 struct page *page; 2430 struct page *page;
2422 2431
@@ -2851,7 +2860,7 @@ long vread(char *buf, char *addr, unsigned long count)
2851 if (!count) 2860 if (!count)
2852 break; 2861 break;
2853 2862
2854 if (!(va->flags & VM_VM_AREA)) 2863 if (!va->vm)
2855 continue; 2864 continue;
2856 2865
2857 vm = va->vm; 2866 vm = va->vm;
@@ -2931,7 +2940,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
2931 if (!count) 2940 if (!count)
2932 break; 2941 break;
2933 2942
2934 if (!(va->flags & VM_VM_AREA)) 2943 if (!va->vm)
2935 continue; 2944 continue;
2936 2945
2937 vm = va->vm; 2946 vm = va->vm;
@@ -3450,6 +3459,22 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
3450 } 3459 }
3451} 3460}
3452 3461
3462static void show_purge_info(struct seq_file *m)
3463{
3464 struct llist_node *head;
3465 struct vmap_area *va;
3466
3467 head = READ_ONCE(vmap_purge_list.first);
3468 if (head == NULL)
3469 return;
3470
3471 llist_for_each_entry(va, head, purge_list) {
3472 seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
3473 (void *)va->va_start, (void *)va->va_end,
3474 va->va_end - va->va_start);
3475 }
3476}
3477
3453static int s_show(struct seq_file *m, void *p) 3478static int s_show(struct seq_file *m, void *p)
3454{ 3479{
3455 struct vmap_area *va; 3480 struct vmap_area *va;
@@ -3458,14 +3483,13 @@ static int s_show(struct seq_file *m, void *p)
3458 va = list_entry(p, struct vmap_area, list); 3483 va = list_entry(p, struct vmap_area, list);
3459 3484
3460 /* 3485 /*
3461 * s_show can encounter race with remove_vm_area, !VM_VM_AREA on 3486 * s_show can encounter race with remove_vm_area, !vm on behalf
3462 * behalf of vmap area is being tear down or vm_map_ram allocation. 3487 * of vmap area is being torn down or vm_map_ram allocation.
3463 */ 3488 */
3464 if (!(va->flags & VM_VM_AREA)) { 3489 if (!va->vm) {
3465 seq_printf(m, "0x%pK-0x%pK %7ld %s\n", 3490 seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
3466 (void *)va->va_start, (void *)va->va_end, 3491 (void *)va->va_start, (void *)va->va_end,
3467 va->va_end - va->va_start, 3492 va->va_end - va->va_start);
3468 va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
3469 3493
3470 return 0; 3494 return 0;
3471 } 3495 }
@@ -3504,6 +3528,16 @@ static int s_show(struct seq_file *m, void *p)
3504 3528
3505 show_numa_info(m, v); 3529 show_numa_info(m, v);
3506 seq_putc(m, '\n'); 3530 seq_putc(m, '\n');
3531
3532 /*
3533 * As a final step, dump "unpurged" areas. Note that
3534 * the entire "/proc/vmallocinfo" output will not be
3535 * address-sorted, because the purge list is not
3536 * sorted.
3537 */
3538 if (list_is_last(&va->list, &vmap_area_list))
3539 show_purge_info(m);
3540
3507 return 0; 3541 return 0;
3508} 3542}
3509 3543
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a6c5d0b28321..4911754c93b7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -171,11 +171,22 @@ int vm_swappiness = 60;
171 */ 171 */
172unsigned long vm_total_pages; 172unsigned long vm_total_pages;
173 173
174static void set_task_reclaim_state(struct task_struct *task,
175 struct reclaim_state *rs)
176{
177 /* Check for an overwrite */
178 WARN_ON_ONCE(rs && task->reclaim_state);
179
180 /* Check for the nulling of an already-nulled member */
181 WARN_ON_ONCE(!rs && !task->reclaim_state);
182
183 task->reclaim_state = rs;
184}
185
174static LIST_HEAD(shrinker_list); 186static LIST_HEAD(shrinker_list);
175static DECLARE_RWSEM(shrinker_rwsem); 187static DECLARE_RWSEM(shrinker_rwsem);
176 188
177#ifdef CONFIG_MEMCG_KMEM 189#ifdef CONFIG_MEMCG
178
179/* 190/*
180 * We allow subsystems to populate their shrinker-related 191 * We allow subsystems to populate their shrinker-related
181 * LRU lists before register_shrinker_prepared() is called 192 * LRU lists before register_shrinker_prepared() is called
@@ -227,30 +238,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
227 idr_remove(&shrinker_idr, id); 238 idr_remove(&shrinker_idr, id);
228 up_write(&shrinker_rwsem); 239 up_write(&shrinker_rwsem);
229} 240}
230#else /* CONFIG_MEMCG_KMEM */
231static int prealloc_memcg_shrinker(struct shrinker *shrinker)
232{
233 return 0;
234}
235
236static void unregister_memcg_shrinker(struct shrinker *shrinker)
237{
238}
239#endif /* CONFIG_MEMCG_KMEM */
240
241static void set_task_reclaim_state(struct task_struct *task,
242 struct reclaim_state *rs)
243{
244 /* Check for an overwrite */
245 WARN_ON_ONCE(rs && task->reclaim_state);
246
247 /* Check for the nulling of an already-nulled member */
248 WARN_ON_ONCE(!rs && !task->reclaim_state);
249 241
250 task->reclaim_state = rs;
251}
252
253#ifdef CONFIG_MEMCG
254static bool global_reclaim(struct scan_control *sc) 242static bool global_reclaim(struct scan_control *sc)
255{ 243{
256 return !sc->target_mem_cgroup; 244 return !sc->target_mem_cgroup;
@@ -305,6 +293,15 @@ static bool memcg_congested(pg_data_t *pgdat,
305 293
306} 294}
307#else 295#else
296static int prealloc_memcg_shrinker(struct shrinker *shrinker)
297{
298 return 0;
299}
300
301static void unregister_memcg_shrinker(struct shrinker *shrinker)
302{
303}
304
308static bool global_reclaim(struct scan_control *sc) 305static bool global_reclaim(struct scan_control *sc)
309{ 306{
310 return true; 307 return true;
@@ -591,7 +588,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
591 return freed; 588 return freed;
592} 589}
593 590
594#ifdef CONFIG_MEMCG_KMEM 591#ifdef CONFIG_MEMCG
595static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, 592static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
596 struct mem_cgroup *memcg, int priority) 593 struct mem_cgroup *memcg, int priority)
597{ 594{
@@ -599,7 +596,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
599 unsigned long ret, freed = 0; 596 unsigned long ret, freed = 0;
600 int i; 597 int i;
601 598
602 if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)) 599 if (!mem_cgroup_online(memcg))
603 return 0; 600 return 0;
604 601
605 if (!down_read_trylock(&shrinker_rwsem)) 602 if (!down_read_trylock(&shrinker_rwsem))
@@ -625,6 +622,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
625 continue; 622 continue;
626 } 623 }
627 624
625 /* Call non-slab shrinkers even though kmem is disabled */
626 if (!memcg_kmem_enabled() &&
627 !(shrinker->flags & SHRINKER_NONSLAB))
628 continue;
629
628 ret = do_shrink_slab(&sc, shrinker, priority); 630 ret = do_shrink_slab(&sc, shrinker, priority);
629 if (ret == SHRINK_EMPTY) { 631 if (ret == SHRINK_EMPTY) {
630 clear_bit(i, map->map); 632 clear_bit(i, map->map);
@@ -661,13 +663,13 @@ unlock:
661 up_read(&shrinker_rwsem); 663 up_read(&shrinker_rwsem);
662 return freed; 664 return freed;
663} 665}
664#else /* CONFIG_MEMCG_KMEM */ 666#else /* CONFIG_MEMCG */
665static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, 667static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
666 struct mem_cgroup *memcg, int priority) 668 struct mem_cgroup *memcg, int priority)
667{ 669{
668 return 0; 670 return 0;
669} 671}
670#endif /* CONFIG_MEMCG_KMEM */ 672#endif /* CONFIG_MEMCG */
671 673
672/** 674/**
673 * shrink_slab - shrink slab caches 675 * shrink_slab - shrink slab caches
@@ -1149,7 +1151,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1149 1151
1150 VM_BUG_ON_PAGE(PageActive(page), page); 1152 VM_BUG_ON_PAGE(PageActive(page), page);
1151 1153
1152 nr_pages = 1 << compound_order(page); 1154 nr_pages = compound_nr(page);
1153 1155
1154 /* Account the number of base pages even though THP */ 1156 /* Account the number of base pages even though THP */
1155 sc->nr_scanned += nr_pages; 1157 sc->nr_scanned += nr_pages;
@@ -1487,10 +1489,9 @@ free_it:
1487 * Is there need to periodically free_page_list? It would 1489 * Is there need to periodically free_page_list? It would
1488 * appear not as the counts should be low 1490 * appear not as the counts should be low
1489 */ 1491 */
1490 if (unlikely(PageTransHuge(page))) { 1492 if (unlikely(PageTransHuge(page)))
1491 mem_cgroup_uncharge(page);
1492 (*get_compound_page_dtor(page))(page); 1493 (*get_compound_page_dtor(page))(page);
1493 } else 1494 else
1494 list_add(&page->lru, &free_pages); 1495 list_add(&page->lru, &free_pages);
1495 continue; 1496 continue;
1496 1497
@@ -1705,7 +1706,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1705 1706
1706 VM_BUG_ON_PAGE(!PageLRU(page), page); 1707 VM_BUG_ON_PAGE(!PageLRU(page), page);
1707 1708
1708 nr_pages = 1 << compound_order(page); 1709 nr_pages = compound_nr(page);
1709 total_scan += nr_pages; 1710 total_scan += nr_pages;
1710 1711
1711 if (page_zonenum(page) > sc->reclaim_idx) { 1712 if (page_zonenum(page) > sc->reclaim_idx) {
@@ -1911,7 +1912,6 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
1911 1912
1912 if (unlikely(PageCompound(page))) { 1913 if (unlikely(PageCompound(page))) {
1913 spin_unlock_irq(&pgdat->lru_lock); 1914 spin_unlock_irq(&pgdat->lru_lock);
1914 mem_cgroup_uncharge(page);
1915 (*get_compound_page_dtor(page))(page); 1915 (*get_compound_page_dtor(page))(page);
1916 spin_lock_irq(&pgdat->lru_lock); 1916 spin_lock_irq(&pgdat->lru_lock);
1917 } else 1917 } else
@@ -2586,7 +2586,6 @@ static bool in_reclaim_compaction(struct scan_control *sc)
2586 */ 2586 */
2587static inline bool should_continue_reclaim(struct pglist_data *pgdat, 2587static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2588 unsigned long nr_reclaimed, 2588 unsigned long nr_reclaimed,
2589 unsigned long nr_scanned,
2590 struct scan_control *sc) 2589 struct scan_control *sc)
2591{ 2590{
2592 unsigned long pages_for_compaction; 2591 unsigned long pages_for_compaction;
@@ -2597,40 +2596,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2597 if (!in_reclaim_compaction(sc)) 2596 if (!in_reclaim_compaction(sc))
2598 return false; 2597 return false;
2599 2598
2600 /* Consider stopping depending on scan and reclaim activity */
2601 if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
2602 /*
2603 * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the
2604 * full LRU list has been scanned and we are still failing
2605 * to reclaim pages. This full LRU scan is potentially
2606 * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed
2607 */
2608 if (!nr_reclaimed && !nr_scanned)
2609 return false;
2610 } else {
2611 /*
2612 * For non-__GFP_RETRY_MAYFAIL allocations which can presumably
2613 * fail without consequence, stop if we failed to reclaim
2614 * any pages from the last SWAP_CLUSTER_MAX number of
2615 * pages that were scanned. This will return to the
2616 * caller faster at the risk reclaim/compaction and
2617 * the resulting allocation attempt fails
2618 */
2619 if (!nr_reclaimed)
2620 return false;
2621 }
2622
2623 /* 2599 /*
2624 * If we have not reclaimed enough pages for compaction and the 2600 * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
2625 * inactive lists are large enough, continue reclaiming 2601 * number of pages that were scanned. This will return to the caller
2602 * with the risk that reclaim/compaction and the resulting allocation
2603 * attempt fail. In the past we have tried harder for __GFP_RETRY_MAYFAIL
2604 * allocations through requiring that the full LRU list has been scanned
2605 * first, by assuming that zero delta of sc->nr_scanned means full LRU
2606 * scan, but that approximation was wrong, and there were corner cases
2607 * where a non-zero number of pages was always scanned.
2626 */ 2608 */
2627 pages_for_compaction = compact_gap(sc->order); 2609 if (!nr_reclaimed)
2628 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE); 2610 return false;
2629 if (get_nr_swap_pages() > 0)
2630 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
2631 if (sc->nr_reclaimed < pages_for_compaction &&
2632 inactive_lru_pages > pages_for_compaction)
2633 return true;
2634 2611
2635 /* If compaction would go ahead or the allocation would succeed, stop */ 2612 /* If compaction would go ahead or the allocation would succeed, stop */
2636 for (z = 0; z <= sc->reclaim_idx; z++) { 2613 for (z = 0; z <= sc->reclaim_idx; z++) {
@@ -2647,7 +2624,17 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2647 ; 2624 ;
2648 } 2625 }
2649 } 2626 }
2650 return true; 2627
2628 /*
2629 * If we have not reclaimed enough pages for compaction and the
2630 * inactive lists are large enough, continue reclaiming
2631 */
2632 pages_for_compaction = compact_gap(sc->order);
2633 inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
2634 if (get_nr_swap_pages() > 0)
2635 inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
2636
2637 return inactive_lru_pages > pages_for_compaction;
2651} 2638}
2652 2639
2653static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) 2640static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
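
The rewritten should_continue_reclaim() above reduces to three checks: give up if the last pass reclaimed nothing, stop if compaction is already viable for some zone, and otherwise continue only while the inactive lists still exceed the compaction gap. The standalone sketch below restates that decision with the in_reclaim_compaction() precondition and the per-zone compaction check collapsed into plain inputs; it is a restatement for illustration, not the kernel function.

/* Condensed restatement of the new should_continue_reclaim() policy.
 * All inputs are plain numbers here; compaction_ready stands in for the
 * per-zone compaction_suitable()/watermark check in the real function. */
#include <stdbool.h>
#include <stdio.h>

static bool should_continue_reclaim_sketch(unsigned long nr_reclaimed,
                                           bool compaction_ready,
                                           unsigned long inactive_lru_pages,
                                           unsigned long pages_for_compaction)
{
        /* Last pass reclaimed nothing: stop rather than loop forever. */
        if (!nr_reclaimed)
                return false;

        /* Compaction could already run (or the allocation would succeed). */
        if (compaction_ready)
                return false;

        /* Keep reclaiming only while enough inactive memory remains to
         * make another round worthwhile for compaction. */
        return inactive_lru_pages > pages_for_compaction;
}

int main(void)
{
        printf("%d\n", should_continue_reclaim_sketch(0, false, 1000, 100));   /* 0 */
        printf("%d\n", should_continue_reclaim_sketch(32, true, 1000, 100));   /* 0 */
        printf("%d\n", should_continue_reclaim_sketch(32, false, 1000, 100));  /* 1 */
        return 0;
}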
@@ -2664,10 +2651,6 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2664 2651
2665 do { 2652 do {
2666 struct mem_cgroup *root = sc->target_mem_cgroup; 2653 struct mem_cgroup *root = sc->target_mem_cgroup;
2667 struct mem_cgroup_reclaim_cookie reclaim = {
2668 .pgdat = pgdat,
2669 .priority = sc->priority,
2670 };
2671 unsigned long node_lru_pages = 0; 2654 unsigned long node_lru_pages = 0;
2672 struct mem_cgroup *memcg; 2655 struct mem_cgroup *memcg;
2673 2656
@@ -2676,7 +2659,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2676 nr_reclaimed = sc->nr_reclaimed; 2659 nr_reclaimed = sc->nr_reclaimed;
2677 nr_scanned = sc->nr_scanned; 2660 nr_scanned = sc->nr_scanned;
2678 2661
2679 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2662 memcg = mem_cgroup_iter(root, NULL, NULL);
2680 do { 2663 do {
2681 unsigned long lru_pages; 2664 unsigned long lru_pages;
2682 unsigned long reclaimed; 2665 unsigned long reclaimed;
@@ -2719,21 +2702,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2719 sc->nr_scanned - scanned, 2702 sc->nr_scanned - scanned,
2720 sc->nr_reclaimed - reclaimed); 2703 sc->nr_reclaimed - reclaimed);
2721 2704
2722 /* 2705 } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
2723 * Kswapd have to scan all memory cgroups to fulfill
2724 * the overall scan target for the node.
2725 *
2726 * Limit reclaim, on the other hand, only cares about
2727 * nr_to_reclaim pages to be reclaimed and it will
2728 * retry with decreasing priority if one round over the
2729 * whole hierarchy is not sufficient.
2730 */
2731 if (!current_is_kswapd() &&
2732 sc->nr_reclaimed >= sc->nr_to_reclaim) {
2733 mem_cgroup_iter_break(root, memcg);
2734 break;
2735 }
2736 } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
2737 2706
2738 if (reclaim_state) { 2707 if (reclaim_state) {
2739 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2708 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -2810,7 +2779,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2810 wait_iff_congested(BLK_RW_ASYNC, HZ/10); 2779 wait_iff_congested(BLK_RW_ASYNC, HZ/10);
2811 2780
2812 } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, 2781 } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
2813 sc->nr_scanned - nr_scanned, sc)); 2782 sc));
2814 2783
2815 /* 2784 /*
2816 * Kswapd gives up on balancing particular nodes after too 2785 * Kswapd gives up on balancing particular nodes after too
diff --git a/mm/vmstat.c b/mm/vmstat.c
index fd7e16ca6996..6afc892a148a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1158,6 +1158,8 @@ const char * const vmstat_text[] = {
1158 "nr_shmem", 1158 "nr_shmem",
1159 "nr_shmem_hugepages", 1159 "nr_shmem_hugepages",
1160 "nr_shmem_pmdmapped", 1160 "nr_shmem_pmdmapped",
1161 "nr_file_hugepages",
1162 "nr_file_pmdmapped",
1161 "nr_anon_transparent_hugepages", 1163 "nr_anon_transparent_hugepages",
1162 "nr_unstable", 1164 "nr_unstable",
1163 "nr_vmscan_write", 1165 "nr_vmscan_write",
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 75b7962439ff..05bdf90646e7 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -41,7 +41,6 @@
41#include <linux/workqueue.h> 41#include <linux/workqueue.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/spinlock.h> 43#include <linux/spinlock.h>
44#include <linux/wait.h>
45#include <linux/zpool.h> 44#include <linux/zpool.h>
46#include <linux/magic.h> 45#include <linux/magic.h>
47 46
@@ -146,8 +145,6 @@ struct z3fold_header {
146 * @release_wq: workqueue for safe page release 145 * @release_wq: workqueue for safe page release
147 * @work: work_struct for safe page release 146 * @work: work_struct for safe page release
148 * @inode: inode for z3fold pseudo filesystem 147 * @inode: inode for z3fold pseudo filesystem
149 * @destroying: bool to stop migration once we start destruction
150 * @isolated: int to count the number of pages currently in isolation
151 * 148 *
152 * This structure is allocated at pool creation time and maintains metadata 149 * This structure is allocated at pool creation time and maintains metadata
153 * pertaining to a particular z3fold pool. 150 * pertaining to a particular z3fold pool.
@@ -166,11 +163,8 @@ struct z3fold_pool {
166 const struct zpool_ops *zpool_ops; 163 const struct zpool_ops *zpool_ops;
167 struct workqueue_struct *compact_wq; 164 struct workqueue_struct *compact_wq;
168 struct workqueue_struct *release_wq; 165 struct workqueue_struct *release_wq;
169 struct wait_queue_head isolate_wait;
170 struct work_struct work; 166 struct work_struct work;
171 struct inode *inode; 167 struct inode *inode;
172 bool destroying;
173 int isolated;
174}; 168};
175 169
176/* 170/*
@@ -301,14 +295,11 @@ static void z3fold_unregister_migration(struct z3fold_pool *pool)
301 } 295 }
302 296
303/* Initializes the z3fold header of a newly allocated z3fold page */ 297/* Initializes the z3fold header of a newly allocated z3fold page */
304static struct z3fold_header *init_z3fold_page(struct page *page, 298static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
305 struct z3fold_pool *pool, gfp_t gfp) 299 struct z3fold_pool *pool, gfp_t gfp)
306{ 300{
307 struct z3fold_header *zhdr = page_address(page); 301 struct z3fold_header *zhdr = page_address(page);
308 struct z3fold_buddy_slots *slots = alloc_slots(pool, gfp); 302 struct z3fold_buddy_slots *slots;
309
310 if (!slots)
311 return NULL;
312 303
313 INIT_LIST_HEAD(&page->lru); 304 INIT_LIST_HEAD(&page->lru);
314 clear_bit(PAGE_HEADLESS, &page->private); 305 clear_bit(PAGE_HEADLESS, &page->private);
@@ -316,6 +307,12 @@ static struct z3fold_header *init_z3fold_page(struct page *page,
316 clear_bit(NEEDS_COMPACTING, &page->private); 307 clear_bit(NEEDS_COMPACTING, &page->private);
317 clear_bit(PAGE_STALE, &page->private); 308 clear_bit(PAGE_STALE, &page->private);
318 clear_bit(PAGE_CLAIMED, &page->private); 309 clear_bit(PAGE_CLAIMED, &page->private);
310 if (headless)
311 return zhdr;
312
313 slots = alloc_slots(pool, gfp);
314 if (!slots)
315 return NULL;
319 316
320 spin_lock_init(&zhdr->page_lock); 317 spin_lock_init(&zhdr->page_lock);
321 kref_init(&zhdr->refcount); 318 kref_init(&zhdr->refcount);
@@ -372,9 +369,10 @@ static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
372 * Encodes the handle of a particular buddy within a z3fold page 369 * Encodes the handle of a particular buddy within a z3fold page
373 * Pool lock should be held as this function accesses first_num 370 * Pool lock should be held as this function accesses first_num
374 */ 371 */
375static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) 372static unsigned long __encode_handle(struct z3fold_header *zhdr,
373 struct z3fold_buddy_slots *slots,
374 enum buddy bud)
376{ 375{
377 struct z3fold_buddy_slots *slots;
378 unsigned long h = (unsigned long)zhdr; 376 unsigned long h = (unsigned long)zhdr;
379 int idx = 0; 377 int idx = 0;
380 378
@@ -391,11 +389,15 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
391 if (bud == LAST) 389 if (bud == LAST)
392 h |= (zhdr->last_chunks << BUDDY_SHIFT); 390 h |= (zhdr->last_chunks << BUDDY_SHIFT);
393 391
394 slots = zhdr->slots;
395 slots->slot[idx] = h; 392 slots->slot[idx] = h;
396 return (unsigned long)&slots->slot[idx]; 393 return (unsigned long)&slots->slot[idx];
397} 394}
398 395
396static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
397{
398 return __encode_handle(zhdr, zhdr->slots, bud);
399}
400
399/* Returns the z3fold page where a given handle is stored */ 401/* Returns the z3fold page where a given handle is stored */
400static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) 402static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
401{ 403{
@@ -630,6 +632,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
630 } 632 }
631 633
632 if (unlikely(PageIsolated(page) || 634 if (unlikely(PageIsolated(page) ||
635 test_bit(PAGE_CLAIMED, &page->private) ||
633 test_bit(PAGE_STALE, &page->private))) { 636 test_bit(PAGE_STALE, &page->private))) {
634 z3fold_page_unlock(zhdr); 637 z3fold_page_unlock(zhdr);
635 return; 638 return;
@@ -775,7 +778,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
775 goto out_c; 778 goto out_c;
776 spin_lock_init(&pool->lock); 779 spin_lock_init(&pool->lock);
777 spin_lock_init(&pool->stale_lock); 780 spin_lock_init(&pool->stale_lock);
778 init_waitqueue_head(&pool->isolate_wait);
779 pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); 781 pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
780 if (!pool->unbuddied) 782 if (!pool->unbuddied)
781 goto out_pool; 783 goto out_pool;
@@ -815,15 +817,6 @@ out:
815 return NULL; 817 return NULL;
816} 818}
817 819
818static bool pool_isolated_are_drained(struct z3fold_pool *pool)
819{
820 bool ret;
821
822 spin_lock(&pool->lock);
823 ret = pool->isolated == 0;
824 spin_unlock(&pool->lock);
825 return ret;
826}
827/** 820/**
828 * z3fold_destroy_pool() - destroys an existing z3fold pool 821 * z3fold_destroy_pool() - destroys an existing z3fold pool
829 * @pool: the z3fold pool to be destroyed 822 * @pool: the z3fold pool to be destroyed
@@ -833,22 +826,6 @@ static bool pool_isolated_are_drained(struct z3fold_pool *pool)
833static void z3fold_destroy_pool(struct z3fold_pool *pool) 826static void z3fold_destroy_pool(struct z3fold_pool *pool)
834{ 827{
835 kmem_cache_destroy(pool->c_handle); 828 kmem_cache_destroy(pool->c_handle);
836 /*
837 * We set pool-> destroying under lock to ensure that
838 * z3fold_page_isolate() sees any changes to destroying. This way we
839 * avoid the need for any memory barriers.
840 */
841
842 spin_lock(&pool->lock);
843 pool->destroying = true;
844 spin_unlock(&pool->lock);
845
846 /*
847 * We need to ensure that no pages are being migrated while we destroy
848 * these workqueues, as migration can queue work on either of the
849 * workqueues.
850 */
851 wait_event(pool->isolate_wait, !pool_isolated_are_drained(pool));
852 829
853 /* 830 /*
854 * We need to destroy pool->compact_wq before pool->release_wq, 831 * We need to destroy pool->compact_wq before pool->release_wq,
@@ -956,7 +933,7 @@ retry:
956 if (!page) 933 if (!page)
957 return -ENOMEM; 934 return -ENOMEM;
958 935
959 zhdr = init_z3fold_page(page, pool, gfp); 936 zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
960 if (!zhdr) { 937 if (!zhdr) {
961 __free_page(page); 938 __free_page(page);
962 return -ENOMEM; 939 return -ENOMEM;
@@ -1132,6 +1109,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
1132 struct z3fold_header *zhdr = NULL; 1109 struct z3fold_header *zhdr = NULL;
1133 struct page *page = NULL; 1110 struct page *page = NULL;
1134 struct list_head *pos; 1111 struct list_head *pos;
1112 struct z3fold_buddy_slots slots;
1135 unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; 1113 unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
1136 1114
1137 spin_lock(&pool->lock); 1115 spin_lock(&pool->lock);
@@ -1150,16 +1128,22 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
1150 /* this bit could have been set by free, in which case 1128 /* this bit could have been set by free, in which case
1151 * we pass over to the next page in the pool. 1129 * we pass over to the next page in the pool.
1152 */ 1130 */
1153 if (test_and_set_bit(PAGE_CLAIMED, &page->private)) 1131 if (test_and_set_bit(PAGE_CLAIMED, &page->private)) {
1132 page = NULL;
1154 continue; 1133 continue;
1134 }
1155 1135
1156 if (unlikely(PageIsolated(page))) 1136 if (unlikely(PageIsolated(page))) {
1137 clear_bit(PAGE_CLAIMED, &page->private);
1138 page = NULL;
1157 continue; 1139 continue;
1140 }
1141 zhdr = page_address(page);
1158 if (test_bit(PAGE_HEADLESS, &page->private)) 1142 if (test_bit(PAGE_HEADLESS, &page->private))
1159 break; 1143 break;
1160 1144
1161 zhdr = page_address(page);
1162 if (!z3fold_page_trylock(zhdr)) { 1145 if (!z3fold_page_trylock(zhdr)) {
1146 clear_bit(PAGE_CLAIMED, &page->private);
1163 zhdr = NULL; 1147 zhdr = NULL;
1164 continue; /* can't evict at this point */ 1148 continue; /* can't evict at this point */
1165 } 1149 }
@@ -1177,26 +1161,30 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
1177 1161
1178 if (!test_bit(PAGE_HEADLESS, &page->private)) { 1162 if (!test_bit(PAGE_HEADLESS, &page->private)) {
1179 /* 1163 /*
1180 * We need encode the handles before unlocking, since 1164 * We need to encode the handles before unlocking, and
1181 * we can race with free that will set 1165 * use our local slots structure because z3fold_free
1182 * (first|last)_chunks to 0 1166 * can zero out zhdr->slots and we can't do much
1167 * about that
1183 */ 1168 */
1184 first_handle = 0; 1169 first_handle = 0;
1185 last_handle = 0; 1170 last_handle = 0;
1186 middle_handle = 0; 1171 middle_handle = 0;
1187 if (zhdr->first_chunks) 1172 if (zhdr->first_chunks)
1188 first_handle = encode_handle(zhdr, FIRST); 1173 first_handle = __encode_handle(zhdr, &slots,
1174 FIRST);
1189 if (zhdr->middle_chunks) 1175 if (zhdr->middle_chunks)
1190 middle_handle = encode_handle(zhdr, MIDDLE); 1176 middle_handle = __encode_handle(zhdr, &slots,
1177 MIDDLE);
1191 if (zhdr->last_chunks) 1178 if (zhdr->last_chunks)
1192 last_handle = encode_handle(zhdr, LAST); 1179 last_handle = __encode_handle(zhdr, &slots,
1180 LAST);
1193 /* 1181 /*
1194 * it's safe to unlock here because we hold a 1182 * it's safe to unlock here because we hold a
1195 * reference to this page 1183 * reference to this page
1196 */ 1184 */
1197 z3fold_page_unlock(zhdr); 1185 z3fold_page_unlock(zhdr);
1198 } else { 1186 } else {
1199 first_handle = encode_handle(zhdr, HEADLESS); 1187 first_handle = __encode_handle(zhdr, &slots, HEADLESS);
1200 last_handle = middle_handle = 0; 1188 last_handle = middle_handle = 0;
1201 } 1189 }
1202 1190
@@ -1226,9 +1214,9 @@ next:
1226 spin_lock(&pool->lock); 1214 spin_lock(&pool->lock);
1227 list_add(&page->lru, &pool->lru); 1215 list_add(&page->lru, &pool->lru);
1228 spin_unlock(&pool->lock); 1216 spin_unlock(&pool->lock);
1217 clear_bit(PAGE_CLAIMED, &page->private);
1229 } else { 1218 } else {
1230 z3fold_page_lock(zhdr); 1219 z3fold_page_lock(zhdr);
1231 clear_bit(PAGE_CLAIMED, &page->private);
1232 if (kref_put(&zhdr->refcount, 1220 if (kref_put(&zhdr->refcount,
1233 release_z3fold_page_locked)) { 1221 release_z3fold_page_locked)) {
1234 atomic64_dec(&pool->pages_nr); 1222 atomic64_dec(&pool->pages_nr);
@@ -1243,6 +1231,7 @@ next:
1243 list_add(&page->lru, &pool->lru); 1231 list_add(&page->lru, &pool->lru);
1244 spin_unlock(&pool->lock); 1232 spin_unlock(&pool->lock);
1245 z3fold_page_unlock(zhdr); 1233 z3fold_page_unlock(zhdr);
1234 clear_bit(PAGE_CLAIMED, &page->private);
1246 } 1235 }
1247 1236
1248 /* We started off locked so we need to lock the pool back */ 1237 /* We started off locked so we need to lock the pool back */
@@ -1339,28 +1328,6 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
1339 return atomic64_read(&pool->pages_nr); 1328 return atomic64_read(&pool->pages_nr);
1340} 1329}
1341 1330
1342/*
1343 * z3fold_dec_isolated() expects to be called while pool->lock is held.
1344 */
1345static void z3fold_dec_isolated(struct z3fold_pool *pool)
1346{
1347 assert_spin_locked(&pool->lock);
1348 VM_BUG_ON(pool->isolated <= 0);
1349 pool->isolated--;
1350
1351 /*
1352 * If we have no more isolated pages, we have to see if
1353 * z3fold_destroy_pool() is waiting for a signal.
1354 */
1355 if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait))
1356 wake_up_all(&pool->isolate_wait);
1357}
1358
1359static void z3fold_inc_isolated(struct z3fold_pool *pool)
1360{
1361 pool->isolated++;
1362}
1363
1364static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) 1331static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
1365{ 1332{
1366 struct z3fold_header *zhdr; 1333 struct z3fold_header *zhdr;
@@ -1369,7 +1336,8 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
1369 VM_BUG_ON_PAGE(!PageMovable(page), page); 1336 VM_BUG_ON_PAGE(!PageMovable(page), page);
1370 VM_BUG_ON_PAGE(PageIsolated(page), page); 1337 VM_BUG_ON_PAGE(PageIsolated(page), page);
1371 1338
1372 if (test_bit(PAGE_HEADLESS, &page->private)) 1339 if (test_bit(PAGE_HEADLESS, &page->private) ||
1340 test_bit(PAGE_CLAIMED, &page->private))
1373 return false; 1341 return false;
1374 1342
1375 zhdr = page_address(page); 1343 zhdr = page_address(page);
@@ -1387,34 +1355,6 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
1387 spin_lock(&pool->lock); 1355 spin_lock(&pool->lock);
1388 if (!list_empty(&page->lru)) 1356 if (!list_empty(&page->lru))
1389 list_del(&page->lru); 1357 list_del(&page->lru);
1390 /*
1391 * We need to check for destruction while holding pool->lock, as
1392 * otherwise destruction could see 0 isolated pages, and
1393 * proceed.
1394 */
1395 if (unlikely(pool->destroying)) {
1396 spin_unlock(&pool->lock);
1397 /*
1398 * If this page isn't stale, somebody else holds a
1399 * reference to it. Let't drop our refcount so that they
1400 * can call the release logic.
1401 */
1402 if (unlikely(kref_put(&zhdr->refcount,
1403 release_z3fold_page_locked))) {
1404 /*
1405 * If we get here we have kref problems, so we
1406 * should freak out.
1407 */
1408 WARN(1, "Z3fold is experiencing kref problems\n");
1409 z3fold_page_unlock(zhdr);
1410 return false;
1411 }
1412 z3fold_page_unlock(zhdr);
1413 return false;
1414 }
1415
1416
1417 z3fold_inc_isolated(pool);
1418 spin_unlock(&pool->lock); 1358 spin_unlock(&pool->lock);
1419 z3fold_page_unlock(zhdr); 1359 z3fold_page_unlock(zhdr);
1420 return true; 1360 return true;
@@ -1483,10 +1423,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
1483 1423
1484 queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); 1424 queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
1485 1425
1486 spin_lock(&pool->lock);
1487 z3fold_dec_isolated(pool);
1488 spin_unlock(&pool->lock);
1489
1490 page_mapcount_reset(page); 1426 page_mapcount_reset(page);
1491 put_page(page); 1427 put_page(page);
1492 return 0; 1428 return 0;
@@ -1506,14 +1442,10 @@ static void z3fold_page_putback(struct page *page)
1506 INIT_LIST_HEAD(&page->lru); 1442 INIT_LIST_HEAD(&page->lru);
1507 if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { 1443 if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
1508 atomic64_dec(&pool->pages_nr); 1444 atomic64_dec(&pool->pages_nr);
1509 spin_lock(&pool->lock);
1510 z3fold_dec_isolated(pool);
1511 spin_unlock(&pool->lock);
1512 return; 1445 return;
1513 } 1446 }
1514 spin_lock(&pool->lock); 1447 spin_lock(&pool->lock);
1515 list_add(&page->lru, &pool->lru); 1448 list_add(&page->lru, &pool->lru);
1516 z3fold_dec_isolated(pool);
1517 spin_unlock(&pool->lock); 1449 spin_unlock(&pool->lock);
1518 z3fold_page_unlock(zhdr); 1450 z3fold_page_unlock(zhdr);
1519} 1451}
diff --git a/mm/zpool.c b/mm/zpool.c
index a2dd9107857d..863669212070 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -239,6 +239,22 @@ const char *zpool_get_type(struct zpool *zpool)
239} 239}
240 240
241/** 241/**
242 * zpool_malloc_support_movable() - Check if the zpool supports
243 * allocating movable memory
244 * @zpool: The zpool to check
245 *
246 * This returns whether the zpool supports allocating movable memory.
247 *
248 * Implementations must guarantee this to be thread-safe.
249 *
250 * Returns: true if the zpool supports allocating movable memory, false if not
251 */
252bool zpool_malloc_support_movable(struct zpool *zpool)
253{
254 return zpool->driver->malloc_support_movable;
255}
256
257/**
242 * zpool_malloc() - Allocate memory 258 * zpool_malloc() - Allocate memory
243 * @zpool: The zpool to allocate from. 259 * @zpool: The zpool to allocate from.
244 * @size: The amount of memory to allocate. 260 * @size: The amount of memory to allocate.
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index e98bb6ab4f7e..2b2b9aae8a3c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -443,15 +443,16 @@ static u64 zs_zpool_total_size(void *pool)
443} 443}
444 444
445static struct zpool_driver zs_zpool_driver = { 445static struct zpool_driver zs_zpool_driver = {
446 .type = "zsmalloc", 446 .type = "zsmalloc",
447 .owner = THIS_MODULE, 447 .owner = THIS_MODULE,
448 .create = zs_zpool_create, 448 .create = zs_zpool_create,
449 .destroy = zs_zpool_destroy, 449 .destroy = zs_zpool_destroy,
450 .malloc = zs_zpool_malloc, 450 .malloc_support_movable = true,
451 .free = zs_zpool_free, 451 .malloc = zs_zpool_malloc,
452 .map = zs_zpool_map, 452 .free = zs_zpool_free,
453 .unmap = zs_zpool_unmap, 453 .map = zs_zpool_map,
454 .total_size = zs_zpool_total_size, 454 .unmap = zs_zpool_unmap,
455 .total_size = zs_zpool_total_size,
455}; 456};
456 457
457MODULE_ALIAS("zpool-zsmalloc"); 458MODULE_ALIAS("zpool-zsmalloc");
@@ -476,10 +477,6 @@ static inline int get_zspage_inuse(struct zspage *zspage)
476 return zspage->inuse; 477 return zspage->inuse;
477} 478}
478 479
479static inline void set_zspage_inuse(struct zspage *zspage, int val)
480{
481 zspage->inuse = val;
482}
483 480
484static inline void mod_zspage_inuse(struct zspage *zspage, int val) 481static inline void mod_zspage_inuse(struct zspage *zspage, int val)
485{ 482{
diff --git a/mm/zswap.c b/mm/zswap.c
index 0e22744a76cb..46a322316e52 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -856,7 +856,6 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
856 /* extract swpentry from data */ 856 /* extract swpentry from data */
857 zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); 857 zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
858 swpentry = zhdr->swpentry; /* here */ 858 swpentry = zhdr->swpentry; /* here */
859 zpool_unmap_handle(pool, handle);
860 tree = zswap_trees[swp_type(swpentry)]; 859 tree = zswap_trees[swp_type(swpentry)];
861 offset = swp_offset(swpentry); 860 offset = swp_offset(swpentry);
862 861
@@ -866,6 +865,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
866 if (!entry) { 865 if (!entry) {
867 /* entry was invalidated */ 866 /* entry was invalidated */
868 spin_unlock(&tree->lock); 867 spin_unlock(&tree->lock);
868 zpool_unmap_handle(pool, handle);
869 return 0; 869 return 0;
870 } 870 }
871 spin_unlock(&tree->lock); 871 spin_unlock(&tree->lock);
@@ -886,15 +886,13 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
886 case ZSWAP_SWAPCACHE_NEW: /* page is locked */ 886 case ZSWAP_SWAPCACHE_NEW: /* page is locked */
887 /* decompress */ 887 /* decompress */
888 dlen = PAGE_SIZE; 888 dlen = PAGE_SIZE;
889 src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, 889 src = (u8 *)zhdr + sizeof(struct zswap_header);
890 ZPOOL_MM_RO) + sizeof(struct zswap_header);
891 dst = kmap_atomic(page); 890 dst = kmap_atomic(page);
892 tfm = *get_cpu_ptr(entry->pool->tfm); 891 tfm = *get_cpu_ptr(entry->pool->tfm);
893 ret = crypto_comp_decompress(tfm, src, entry->length, 892 ret = crypto_comp_decompress(tfm, src, entry->length,
894 dst, &dlen); 893 dst, &dlen);
895 put_cpu_ptr(entry->pool->tfm); 894 put_cpu_ptr(entry->pool->tfm);
896 kunmap_atomic(dst); 895 kunmap_atomic(dst);
897 zpool_unmap_handle(entry->pool->zpool, entry->handle);
898 BUG_ON(ret); 896 BUG_ON(ret);
899 BUG_ON(dlen != PAGE_SIZE); 897 BUG_ON(dlen != PAGE_SIZE);
900 898
@@ -940,6 +938,7 @@ fail:
940 spin_unlock(&tree->lock); 938 spin_unlock(&tree->lock);
941 939
942end: 940end:
941 zpool_unmap_handle(pool, handle);
943 return ret; 942 return ret;
944} 943}
945 944
@@ -997,6 +996,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
997 char *buf; 996 char *buf;
998 u8 *src, *dst; 997 u8 *src, *dst;
999 struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; 998 struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
999 gfp_t gfp;
1000 1000
1001 /* THP isn't supported */ 1001 /* THP isn't supported */
1002 if (PageTransHuge(page)) { 1002 if (PageTransHuge(page)) {
@@ -1070,9 +1070,10 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
1070 1070
1071 /* store */ 1071 /* store */
1072 hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0; 1072 hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
1073 ret = zpool_malloc(entry->pool->zpool, hlen + dlen, 1073 gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
1074 __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, 1074 if (zpool_malloc_support_movable(entry->pool->zpool))
1075 &handle); 1075 gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
1076 ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
1076 if (ret == -ENOSPC) { 1077 if (ret == -ENOSPC) {
1077 zswap_reject_compress_poor++; 1078 zswap_reject_compress_poor++;
1078 goto put_dstmem; 1079 goto put_dstmem;
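
This zswap hunk is where the new zpool_malloc_support_movable() hook from mm/zpool.c is consumed: the store path adds __GFP_HIGHMEM | __GFP_MOVABLE only when the underlying zpool driver advertises movable allocations (as zsmalloc now does via .malloc_support_movable = true). The standalone sketch below mimics that flag composition; the gfp bit values and the predicate name are made-up stand-ins, not the kernel's definitions.

/* Standalone sketch of how zswap builds its allocation flags depending on
 * whether the zpool driver reports movable-memory support.  The gfp_t bit
 * values below are invented for illustration only. */
#include <stdbool.h>
#include <stdio.h>

typedef unsigned int gfp_t;

#define __GFP_NORETRY         0x01u
#define __GFP_NOWARN          0x02u
#define __GFP_KSWAPD_RECLAIM  0x04u
#define __GFP_HIGHMEM         0x08u
#define __GFP_MOVABLE         0x10u

/* Stand-in for zpool_malloc_support_movable(pool). */
static bool pool_supports_movable(bool driver_flag)
{
        return driver_flag;
}

static gfp_t zswap_store_gfp(bool driver_supports_movable)
{
        gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;

        if (pool_supports_movable(driver_supports_movable))
                gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
        return gfp;
}

int main(void)
{
        printf("driver with movable support:    %#x\n", zswap_store_gfp(true));
        printf("driver without movable support: %#x\n", zswap_store_gfp(false));
        return 0;
}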
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 947b8ff0227e..bba3104f128f 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -206,14 +206,7 @@ static int xdp_umem_map_pages(struct xdp_umem *umem)
206 206
207static void xdp_umem_unpin_pages(struct xdp_umem *umem) 207static void xdp_umem_unpin_pages(struct xdp_umem *umem)
208{ 208{
209 unsigned int i; 209 put_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
210
211 for (i = 0; i < umem->npgs; i++) {
212 struct page *page = umem->pgs[i];
213
214 set_page_dirty_lock(page);
215 put_page(page);
216 }
217 210
218 kfree(umem->pgs); 211 kfree(umem->pgs);
219 umem->pgs = NULL; 212 umem->pgs = NULL;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index c2f1af3b6a7c..fa8fbb8fa3c8 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -977,7 +977,7 @@ static int xsk_mmap(struct file *file, struct socket *sock,
977 /* Matches the smp_wmb() in xsk_init_queue */ 977 /* Matches the smp_wmb() in xsk_init_queue */
978 smp_rmb(); 978 smp_rmb();
979 qpg = virt_to_head_page(q->ring); 979 qpg = virt_to_head_page(q->ring);
980 if (size > (PAGE_SIZE << compound_order(qpg))) 980 if (size > page_size(qpg))
981 return -EINVAL; 981 return -EINVAL;
982 982
983 pfn = virt_to_phys(q->ring) >> PAGE_SHIFT; 983 pfn = virt_to_phys(q->ring) >> PAGE_SHIFT;
diff --git a/usr/Makefile b/usr/Makefile
index 6a89eb019275..e6f7cb2f81db 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -11,6 +11,9 @@ datafile_y = initramfs_data.cpio$(suffix_y)
11datafile_d_y = .$(datafile_y).d 11datafile_d_y = .$(datafile_y).d
12AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/$(datafile_y)" 12AFLAGS_initramfs_data.o += -DINITRAMFS_IMAGE="usr/$(datafile_y)"
13 13
14# clean rules do not have CONFIG_INITRAMFS_COMPRESSION. So clean up after all
15# possible compression formats.
16clean-files += initramfs_data.cpio*
14 17
15# Generate builtin.o based on initramfs_data.o 18# Generate builtin.o based on initramfs_data.o
16obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data.o 19obj-$(CONFIG_BLK_DEV_INITRD) := initramfs_data.o