author    Linus Torvalds <torvalds@linux-foundation.org>  2016-07-26 22:55:54 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2016-07-26 22:55:54 -0400
commit    0e06f5c0deeef0332a5da2ecb8f1fcf3e024d958 (patch)
tree      e0f0af4aadf10c713c5cf1b65356844b3c9b3215
parent    f7816ad0f878dacd5f0120476f9b836ccf8699ea (diff)
parent    8f19b0c058d93a678a99dd6fec03af2e769943f2 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:

 - a few misc bits
 - ocfs2
 - most(?) of MM

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (125 commits)
  thp: fix comments of __pmd_trans_huge_lock()
  cgroup: remove unnecessary 0 check from css_from_id()
  cgroup: fix idr leak for the first cgroup root
  mm: memcontrol: fix documentation for compound parameter
  mm: memcontrol: remove BUG_ON in uncharge_list
  mm: fix build warnings in <linux/compaction.h>
  mm, thp: convert from optimistic swapin collapsing to conservative
  mm, thp: fix comment inconsistency for swapin readahead functions
  thp: update Documentation/{vm/transhuge,filesystems/proc}.txt
  shmem: split huge pages beyond i_size under memory pressure
  thp: introduce CONFIG_TRANSPARENT_HUGE_PAGECACHE
  khugepaged: add support of collapse for tmpfs/shmem pages
  shmem: make shmem_inode_info::lock irq-safe
  khugepaged: move up_read(mmap_sem) out of khugepaged_alloc_page()
  thp: extract khugepaged from mm/huge_memory.c
  shmem, thp: respect MADV_{NO,}HUGEPAGE for file mappings
  shmem: add huge pages support
  shmem: get_unmapped_area align huge page
  shmem: prepare huge= mount option and sysfs knob
  mm, rmap: account shmem thp pages
  ...
-rw-r--r--  Documentation/blockdev/zram.txt | 82
-rw-r--r--  Documentation/filesystems/Locking | 14
-rw-r--r--  Documentation/filesystems/dax.txt | 6
-rw-r--r--  Documentation/filesystems/proc.txt | 9
-rw-r--r--  Documentation/filesystems/vfs.txt | 11
-rw-r--r--  Documentation/vm/page_migration | 108
-rw-r--r--  Documentation/vm/transhuge.txt | 128
-rw-r--r--  Documentation/vm/unevictable-lru.txt | 21
-rw-r--r--  Makefile | 69
-rw-r--r--  arch/alpha/mm/fault.c | 2
-rw-r--r--  arch/arc/mm/fault.c | 2
-rw-r--r--  arch/arm/include/asm/pgalloc.h | 2
-rw-r--r--  arch/arm/include/asm/tlb.h | 29
-rw-r--r--  arch/arm/mm/fault.c | 2
-rw-r--r--  arch/arm/mm/pgd.c | 2
-rw-r--r--  arch/arm64/mm/fault.c | 2
-rw-r--r--  arch/avr32/mm/fault.c | 2
-rw-r--r--  arch/cris/mm/fault.c | 2
-rw-r--r--  arch/frv/mm/fault.c | 2
-rw-r--r--  arch/hexagon/mm/vm_fault.c | 2
-rw-r--r--  arch/ia64/include/asm/tlb.h | 31
-rw-r--r--  arch/ia64/mm/fault.c | 2
-rw-r--r--  arch/m32r/kernel/m32r_ksyms.c | 3
-rw-r--r--  arch/m32r/lib/Makefile | 4
-rw-r--r--  arch/m32r/lib/libgcc.h | 23
-rw-r--r--  arch/m32r/lib/ucmpdi2.c | 17
-rw-r--r--  arch/m32r/mm/fault.c | 2
-rw-r--r--  arch/m68k/mm/fault.c | 2
-rw-r--r--  arch/metag/mm/fault.c | 2
-rw-r--r--  arch/microblaze/mm/fault.c | 2
-rw-r--r--  arch/mips/mm/fault.c | 2
-rw-r--r--  arch/mn10300/mm/fault.c | 2
-rw-r--r--  arch/nios2/mm/fault.c | 2
-rw-r--r--  arch/openrisc/mm/fault.c | 2
-rw-r--r--  arch/parisc/mm/fault.c | 2
-rw-r--r--  arch/powerpc/include/asm/pgtable.h | 6
-rw-r--r--  arch/powerpc/mm/copro_fault.c | 2
-rw-r--r--  arch/powerpc/mm/fault.c | 2
-rw-r--r--  arch/s390/include/asm/tlb.h | 22
-rw-r--r--  arch/s390/mm/fault.c | 2
-rw-r--r--  arch/score/mm/fault.c | 2
-rw-r--r--  arch/sh/include/asm/tlb.h | 20
-rw-r--r--  arch/sh/mm/fault.c | 2
-rw-r--r--  arch/sparc/mm/fault_32.c | 4
-rw-r--r--  arch/sparc/mm/fault_64.c | 2
-rw-r--r--  arch/tile/mm/fault.c | 2
-rw-r--r--  arch/um/include/asm/tlb.h | 20
-rw-r--r--  arch/um/kernel/trap.c | 2
-rw-r--r--  arch/unicore32/mm/fault.c | 2
-rw-r--r--  arch/x86/Makefile | 8
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 12
-rw-r--r--  arch/x86/mm/fault.c | 2
-rw-r--r--  arch/x86/mm/pgtable.c | 10
-rw-r--r--  arch/xtensa/mm/fault.c | 2
-rw-r--r--  drivers/base/memory.c | 28
-rw-r--r--  drivers/base/node.c | 13
-rw-r--r--  drivers/block/zram/Kconfig | 15
-rw-r--r--  drivers/block/zram/Makefile | 4
-rw-r--r--  drivers/block/zram/zcomp.c | 150
-rw-r--r--  drivers/block/zram/zcomp.h | 36
-rw-r--r--  drivers/block/zram/zcomp_lz4.c | 56
-rw-r--r--  drivers/block/zram/zcomp_lz4.h | 17
-rw-r--r--  drivers/block/zram/zcomp_lzo.c | 56
-rw-r--r--  drivers/block/zram/zcomp_lzo.h | 17
-rw-r--r--  drivers/block/zram/zram_drv.c | 48
-rw-r--r--  drivers/block/zram/zram_drv.h | 5
-rw-r--r--  drivers/char/mem.c | 24
-rw-r--r--  drivers/iommu/amd_iommu_v2.c | 3
-rw-r--r--  drivers/iommu/intel-svm.c | 2
-rw-r--r--  drivers/tty/sysrq.c | 1
-rw-r--r--  drivers/video/fbdev/core/fbmon.c | 1
-rw-r--r--  drivers/virtio/virtio_balloon.c | 52
-rw-r--r--  drivers/xen/xen-selfballoon.c | 4
-rw-r--r--  fs/btrfs/extent_io.c | 3
-rw-r--r--  fs/cifs/file.c | 2
-rw-r--r--  fs/dax.c | 73
-rw-r--r--  fs/ext2/file.c | 4
-rw-r--r--  fs/ext4/file.c | 4
-rw-r--r--  fs/ext4/readpage.c | 2
-rw-r--r--  fs/f2fs/data.c | 3
-rw-r--r--  fs/fs-writeback.c | 111
-rw-r--r--  fs/inode.c | 2
-rw-r--r--  fs/mpage.c | 4
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 26
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.h | 1
-rw-r--r--  fs/ocfs2/dlmglue.c | 13
-rw-r--r--  fs/ocfs2/inode.h | 7
-rw-r--r--  fs/ocfs2/journal.c | 41
-rw-r--r--  fs/ocfs2/stackglue.c | 2
-rw-r--r--  fs/ocfs2/super.c | 1
-rw-r--r--  fs/orangefs/inode.c | 2
-rw-r--r--  fs/pipe.c | 32
-rw-r--r--  fs/proc/meminfo.c | 7
-rw-r--r--  fs/proc/task_mmu.c | 10
-rw-r--r--  fs/super.c | 2
-rw-r--r--  fs/userfaultfd.c | 22
-rw-r--r--  fs/xfs/xfs_file.c | 6
-rw-r--r--  include/asm-generic/tlb.h | 59
-rw-r--r--  include/linux/balloon_compaction.h | 51
-rw-r--r--  include/linux/compaction.h | 1
-rw-r--r--  include/linux/dax.h | 5
-rw-r--r--  include/linux/debugobjects.h | 2
-rw-r--r--  include/linux/frontswap.h | 34
-rw-r--r--  include/linux/fs.h | 6
-rw-r--r--  include/linux/gfp.h | 10
-rw-r--r--  include/linux/huge_mm.h | 36
-rw-r--r--  include/linux/khugepaged.h | 5
-rw-r--r--  include/linux/ksm.h | 3
-rw-r--r--  include/linux/memblock.h | 20
-rw-r--r--  include/linux/memcontrol.h | 103
-rw-r--r--  include/linux/memory_hotplug.h | 2
-rw-r--r--  include/linux/migrate.h | 17
-rw-r--r--  include/linux/mm.h | 53
-rw-r--r--  include/linux/mm_types.h | 73
-rw-r--r--  include/linux/mmdebug.h | 2
-rw-r--r--  include/linux/mmzone.h | 8
-rw-r--r--  include/linux/oom.h | 10
-rw-r--r--  include/linux/page-flags.h | 128
-rw-r--r--  include/linux/page_ext.h | 4
-rw-r--r--  include/linux/page_owner.h | 12
-rw-r--r--  include/linux/pagemap.h | 6
-rw-r--r--  include/linux/radix-tree.h | 1
-rw-r--r--  include/linux/rmap.h | 2
-rw-r--r--  include/linux/shmem_fs.h | 45
-rw-r--r--  include/linux/slab.h | 2
-rw-r--r--  include/linux/slab_def.h | 2
-rw-r--r--  include/linux/slub_def.h | 5
-rw-r--r--  include/linux/userfaultfd_k.h | 8
-rw-r--r--  include/linux/vm_event_item.h | 7
-rw-r--r--  include/linux/writeback.h | 3
-rw-r--r--  include/trace/events/huge_memory.h | 50
-rw-r--r--  include/trace/events/writeback.h | 22
-rw-r--r--  include/uapi/linux/magic.h | 2
-rw-r--r--  init/Kconfig | 4
-rw-r--r--  ipc/shm.c | 10
-rw-r--r--  kernel/fork.c | 6
-rw-r--r--  lib/Kconfig.debug | 1
-rw-r--r--  lib/dma-debug.c | 2
-rw-r--r--  lib/radix-tree.c | 84
-rw-r--r--  mm/Kconfig | 8
-rw-r--r--  mm/Makefile | 2
-rw-r--r--  mm/balloon_compaction.c | 94
-rw-r--r--  mm/compaction.c | 123
-rw-r--r--  mm/filemap.c | 217
-rw-r--r--  mm/frontswap.c | 35
-rw-r--r--  mm/gup.c | 9
-rw-r--r--  mm/huge_memory.c | 1909
-rw-r--r--  mm/hugetlb.c | 54
-rw-r--r--  mm/internal.h | 4
-rw-r--r--  mm/khugepaged.c | 1922
-rw-r--r--  mm/ksm.c | 9
-rw-r--r--  mm/memblock.c | 3
-rw-r--r--  mm/memcontrol.c | 134
-rw-r--r--  mm/memory.c | 885
-rw-r--r--  mm/memory_hotplug.c | 70
-rw-r--r--  mm/mempolicy.c | 4
-rw-r--r--  mm/migrate.c | 262
-rw-r--r--  mm/mmap.c | 26
-rw-r--r--  mm/mprotect.c | 2
-rw-r--r--  mm/mremap.c | 3
-rw-r--r--  mm/nommu.c | 3
-rw-r--r--  mm/oom_kill.c | 65
-rw-r--r--  mm/page-writeback.c | 19
-rw-r--r--  mm/page_alloc.c | 164
-rw-r--r--  mm/page_isolation.c | 13
-rw-r--r--  mm/page_owner.c | 157
-rw-r--r--  mm/readahead.c | 13
-rw-r--r--  mm/rmap.c | 78
-rw-r--r--  mm/shmem.c | 918
-rw-r--r--  mm/slab.c | 90
-rw-r--r--  mm/slab.h | 30
-rw-r--r--  mm/slab_common.c | 49
-rw-r--r--  mm/slub.c | 145
-rw-r--r--  mm/swap.c | 2
-rw-r--r--  mm/swapfile.c | 2
-rw-r--r--  mm/truncate.c | 28
-rw-r--r--  mm/util.c | 12
-rw-r--r--  mm/vmalloc.c | 6
-rw-r--r--  mm/vmscan.c | 8
-rw-r--r--  mm/vmstat.c | 8
-rw-r--r--  mm/zsmalloc.c | 1350
-rw-r--r--  net/unix/af_unix.c | 1
-rwxr-xr-x  scripts/bloat-o-meter | 4
-rwxr-xr-x  scripts/tags.sh | 3
-rw-r--r--  tools/vm/page_owner_sort.c | 9
186 files changed, 7363 insertions, 4134 deletions
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 13100fb3c26d..0535ae1f73e5 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -59,23 +59,23 @@ num_devices parameter is optional and tells zram how many devices should be
59pre-created. Default: 1. 59pre-created. Default: 1.
60 60
612) Set max number of compression streams 612) Set max number of compression streams
62 Regardless the value passed to this attribute, ZRAM will always 62Regardless the value passed to this attribute, ZRAM will always
63 allocate multiple compression streams - one per online CPUs - thus 63allocate multiple compression streams - one per online CPUs - thus
64 allowing several concurrent compression operations. The number of 64allowing several concurrent compression operations. The number of
65 allocated compression streams goes down when some of the CPUs 65allocated compression streams goes down when some of the CPUs
66 become offline. There is no single-compression-stream mode anymore, 66become offline. There is no single-compression-stream mode anymore,
67 unless you are running a UP system or has only 1 CPU online. 67unless you are running a UP system or has only 1 CPU online.
68 68
69 To find out how many streams are currently available: 69To find out how many streams are currently available:
70 cat /sys/block/zram0/max_comp_streams 70 cat /sys/block/zram0/max_comp_streams
71 71
723) Select compression algorithm 723) Select compression algorithm
73 Using comp_algorithm device attribute one can see available and 73Using comp_algorithm device attribute one can see available and
74 currently selected (shown in square brackets) compression algorithms, 74currently selected (shown in square brackets) compression algorithms,
75 change selected compression algorithm (once the device is initialised 75change selected compression algorithm (once the device is initialised
76 there is no way to change compression algorithm). 76there is no way to change compression algorithm).
77 77
78 Examples: 78Examples:
79 #show supported compression algorithms 79 #show supported compression algorithms
80 cat /sys/block/zram0/comp_algorithm 80 cat /sys/block/zram0/comp_algorithm
81 lzo [lz4] 81 lzo [lz4]
@@ -83,17 +83,27 @@ pre-created. Default: 1.
83 #select lzo compression algorithm 83 #select lzo compression algorithm
84 echo lzo > /sys/block/zram0/comp_algorithm 84 echo lzo > /sys/block/zram0/comp_algorithm
85 85
86For the time being, the `comp_algorithm' content does not necessarily
87show every compression algorithm supported by the kernel. We keep this
88list primarily to simplify device configuration and one can configure
89a new device with a compression algorithm that is not listed in
90`comp_algorithm'. The thing is that, internally, ZRAM uses Crypto API
91and, if some of the algorithms were built as modules, it's impossible
92to list all of them using, for instance, /proc/crypto or any other
93method. This, however, has an advantage of permitting the usage of
94custom crypto compression modules (implementing S/W or H/W compression).
95
864) Set Disksize 964) Set Disksize
87 Set disk size by writing the value to sysfs node 'disksize'. 97Set disk size by writing the value to sysfs node 'disksize'.
88 The value can be either in bytes or you can use mem suffixes. 98The value can be either in bytes or you can use mem suffixes.
89 Examples: 99Examples:
90 # Initialize /dev/zram0 with 50MB disksize 100 # Initialize /dev/zram0 with 50MB disksize
91 echo $((50*1024*1024)) > /sys/block/zram0/disksize 101 echo $((50*1024*1024)) > /sys/block/zram0/disksize
92 102
93 # Using mem suffixes 103 # Using mem suffixes
94 echo 256K > /sys/block/zram0/disksize 104 echo 256K > /sys/block/zram0/disksize
95 echo 512M > /sys/block/zram0/disksize 105 echo 512M > /sys/block/zram0/disksize
96 echo 1G > /sys/block/zram0/disksize 106 echo 1G > /sys/block/zram0/disksize
97 107
98Note: 108Note:
99There is little point creating a zram of greater than twice the size of memory 109There is little point creating a zram of greater than twice the size of memory
@@ -101,20 +111,20 @@ since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the
101size of the disk when not in use so a huge zram is wasteful. 111size of the disk when not in use so a huge zram is wasteful.
102 112
1035) Set memory limit: Optional 1135) Set memory limit: Optional
104 Set memory limit by writing the value to sysfs node 'mem_limit'. 114Set memory limit by writing the value to sysfs node 'mem_limit'.
105 The value can be either in bytes or you can use mem suffixes. 115The value can be either in bytes or you can use mem suffixes.
106 In addition, you could change the value in runtime. 116In addition, you could change the value in runtime.
107 Examples: 117Examples:
108 # limit /dev/zram0 with 50MB memory 118 # limit /dev/zram0 with 50MB memory
109 echo $((50*1024*1024)) > /sys/block/zram0/mem_limit 119 echo $((50*1024*1024)) > /sys/block/zram0/mem_limit
110 120
111 # Using mem suffixes 121 # Using mem suffixes
112 echo 256K > /sys/block/zram0/mem_limit 122 echo 256K > /sys/block/zram0/mem_limit
113 echo 512M > /sys/block/zram0/mem_limit 123 echo 512M > /sys/block/zram0/mem_limit
114 echo 1G > /sys/block/zram0/mem_limit 124 echo 1G > /sys/block/zram0/mem_limit
115 125
116 # To disable memory limit 126 # To disable memory limit
117 echo 0 > /sys/block/zram0/mem_limit 127 echo 0 > /sys/block/zram0/mem_limit
118 128
1196) Activate: 1296) Activate:
120 mkswap /dev/zram0 130 mkswap /dev/zram0
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 75eea7ce3d7c..5a7386e38e2d 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -195,7 +195,9 @@ prototypes:
195 int (*releasepage) (struct page *, int); 195 int (*releasepage) (struct page *, int);
196 void (*freepage)(struct page *); 196 void (*freepage)(struct page *);
197 int (*direct_IO)(struct kiocb *, struct iov_iter *iter); 197 int (*direct_IO)(struct kiocb *, struct iov_iter *iter);
198 bool (*isolate_page) (struct page *, isolate_mode_t);
198 int (*migratepage)(struct address_space *, struct page *, struct page *); 199 int (*migratepage)(struct address_space *, struct page *, struct page *);
200 void (*putback_page) (struct page *);
199 int (*launder_page)(struct page *); 201 int (*launder_page)(struct page *);
200 int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); 202 int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
201 int (*error_remove_page)(struct address_space *, struct page *); 203 int (*error_remove_page)(struct address_space *, struct page *);
@@ -219,7 +221,9 @@ invalidatepage: yes
219releasepage: yes 221releasepage: yes
220freepage: yes 222freepage: yes
221direct_IO: 223direct_IO:
224isolate_page: yes
222migratepage: yes (both) 225migratepage: yes (both)
226putback_page: yes
223launder_page: yes 227launder_page: yes
224is_partially_uptodate: yes 228is_partially_uptodate: yes
225error_remove_page: yes 229error_remove_page: yes
@@ -544,13 +548,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
544locked. The VM will unlock the page. 548locked. The VM will unlock the page.
545 549
546 ->map_pages() is called when VM asks to map easy accessible pages. 550 ->map_pages() is called when VM asks to map easy accessible pages.
547Filesystem should find and map pages associated with offsets from "pgoff" 551Filesystem should find and map pages associated with offsets from "start_pgoff"
548till "max_pgoff". ->map_pages() is called with page table locked and must 552till "end_pgoff". ->map_pages() is called with page table locked and must
549not block. If it's not possible to reach a page without blocking, 553not block. If it's not possible to reach a page without blocking,
550filesystem should skip it. Filesystem should use do_set_pte() to setup 554filesystem should skip it. Filesystem should use do_set_pte() to setup
551page table entry. Pointer to entry associated with offset "pgoff" is 555page table entry. Pointer to entry associated with the page is passed in
552passed in "pte" field in vm_fault structure. Pointers to entries for other 556"pte" field in fault_env structure. Pointers to entries for other offsets
553offsets should be calculated relative to "pte". 557should be calculated relative to "pte".
554 558
555 ->page_mkwrite() is called when a previously read-only pte is 559 ->page_mkwrite() is called when a previously read-only pte is
556about to become writeable. The filesystem again must ensure that there are 560about to become writeable. The filesystem again must ensure that there are
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index ce4587d257d2..0c16a22521a8 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -49,6 +49,7 @@ These block devices may be used for inspiration:
49- axonram: Axon DDR2 device driver 49- axonram: Axon DDR2 device driver
50- brd: RAM backed block device driver 50- brd: RAM backed block device driver
51- dcssblk: s390 dcss block device driver 51- dcssblk: s390 dcss block device driver
52- pmem: NVDIMM persistent memory driver
52 53
53 54
54Implementation Tips for Filesystem Writers 55Implementation Tips for Filesystem Writers
@@ -75,8 +76,9 @@ calls to get_block() (for example by a page-fault racing with a read()
75or a write()) work correctly. 76or a write()) work correctly.
76 77
77These filesystems may be used for inspiration: 78These filesystems may be used for inspiration:
78- ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt 79- ext2: see Documentation/filesystems/ext2.txt
79- ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt 80- ext4: see Documentation/filesystems/ext4.txt
81- xfs: see Documentation/filesystems/xfs.txt
80 82
81 83
82Handling Media Errors 84Handling Media Errors
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 5b61eeae3f6e..68080ad6a75e 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -436,6 +436,7 @@ Private_Dirty: 0 kB
436Referenced: 892 kB 436Referenced: 892 kB
437Anonymous: 0 kB 437Anonymous: 0 kB
438AnonHugePages: 0 kB 438AnonHugePages: 0 kB
439ShmemPmdMapped: 0 kB
439Shared_Hugetlb: 0 kB 440Shared_Hugetlb: 0 kB
440Private_Hugetlb: 0 kB 441Private_Hugetlb: 0 kB
441Swap: 0 kB 442Swap: 0 kB
@@ -464,6 +465,8 @@ accessed.
464a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE 465a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE
465and a page is modified, the file page is replaced by a private anonymous copy. 466and a page is modified, the file page is replaced by a private anonymous copy.
466"AnonHugePages" shows the ammount of memory backed by transparent hugepage. 467"AnonHugePages" shows the ammount of memory backed by transparent hugepage.
468"ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by
469huge pages.
467"Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by 470"Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by
468hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical 471hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical
469reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field. 472reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field.
@@ -868,6 +871,9 @@ VmallocTotal: 112216 kB
868VmallocUsed: 428 kB 871VmallocUsed: 428 kB
869VmallocChunk: 111088 kB 872VmallocChunk: 111088 kB
870AnonHugePages: 49152 kB 873AnonHugePages: 49152 kB
874ShmemHugePages: 0 kB
875ShmemPmdMapped: 0 kB
876
871 877
872 MemTotal: Total usable ram (i.e. physical ram minus a few reserved 878 MemTotal: Total usable ram (i.e. physical ram minus a few reserved
873 bits and the kernel binary code) 879 bits and the kernel binary code)
@@ -912,6 +918,9 @@ MemAvailable: An estimate of how much memory is available for starting new
912AnonHugePages: Non-file backed huge pages mapped into userspace page tables 918AnonHugePages: Non-file backed huge pages mapped into userspace page tables
913 Mapped: files which have been mmaped, such as libraries 919 Mapped: files which have been mmaped, such as libraries
914 Shmem: Total memory used by shared memory (shmem) and tmpfs 920 Shmem: Total memory used by shared memory (shmem) and tmpfs
921ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated
922 with huge pages
923ShmemPmdMapped: Shared memory mapped into userspace with huge pages
915 Slab: in-kernel data structures cache 924 Slab: in-kernel data structures cache
916SReclaimable: Part of Slab, that might be reclaimed, such as caches 925SReclaimable: Part of Slab, that might be reclaimed, such as caches
917 SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure 926 SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index c61a223ef3ff..900360cbcdae 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -592,9 +592,14 @@ struct address_space_operations {
592 int (*releasepage) (struct page *, int); 592 int (*releasepage) (struct page *, int);
593 void (*freepage)(struct page *); 593 void (*freepage)(struct page *);
594 ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); 594 ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
595 /* isolate a page for migration */
596 bool (*isolate_page) (struct page *, isolate_mode_t);
595 /* migrate the contents of a page to the specified target */ 597 /* migrate the contents of a page to the specified target */
596 int (*migratepage) (struct page *, struct page *); 598 int (*migratepage) (struct page *, struct page *);
599 /* put migration-failed page back to right list */
600 void (*putback_page) (struct page *);
597 int (*launder_page) (struct page *); 601 int (*launder_page) (struct page *);
602
598 int (*is_partially_uptodate) (struct page *, unsigned long, 603 int (*is_partially_uptodate) (struct page *, unsigned long,
599 unsigned long); 604 unsigned long);
600 void (*is_dirty_writeback) (struct page *, bool *, bool *); 605 void (*is_dirty_writeback) (struct page *, bool *, bool *);
@@ -747,6 +752,10 @@ struct address_space_operations {
747 and transfer data directly between the storage and the 752 and transfer data directly between the storage and the
748 application's address space. 753 application's address space.
749 754
755 isolate_page: Called by the VM when isolating a movable non-lru page.
756 If page is successfully isolated, VM marks the page as PG_isolated
757 via __SetPageIsolated.
758
750 migrate_page: This is used to compact the physical memory usage. 759 migrate_page: This is used to compact the physical memory usage.
751 If the VM wants to relocate a page (maybe off a memory card 760 If the VM wants to relocate a page (maybe off a memory card
752 that is signalling imminent failure) it will pass a new page 761 that is signalling imminent failure) it will pass a new page
@@ -754,6 +763,8 @@ struct address_space_operations {
754 transfer any private data across and update any references 763 transfer any private data across and update any references
755 that it has to the page. 764 that it has to the page.
756 765
766 putback_page: Called by the VM when isolated page's migration fails.
767
757 launder_page: Called before freeing a page - it writes back the dirty page. To 768 launder_page: Called before freeing a page - it writes back the dirty page. To
758 prevent redirtying the page, it is kept locked during the whole 769 prevent redirtying the page, it is kept locked during the whole
759 operation. 770 operation.
diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration
index fea5c0864170..94bd9c11c4e0 100644
--- a/Documentation/vm/page_migration
+++ b/Documentation/vm/page_migration
@@ -142,5 +142,111 @@ Steps:
14220. The new page is moved to the LRU and can be scanned by the swapper 14220. The new page is moved to the LRU and can be scanned by the swapper
143 etc again. 143 etc again.
144 144
145Christoph Lameter, May 8, 2006. 145C. Non-LRU page migration
146-------------------------
147
148Although original migration aimed for reducing the latency of memory access
149for NUMA, compaction who want to create high-order page is also main customer.
150
151Current problem of the implementation is that it is designed to migrate only
152*LRU* pages. However, there are potential non-lru pages which can be migrated
153in drivers, for example, zsmalloc, virtio-balloon pages.
154
155For virtio-balloon pages, some parts of migration code path have been hooked
156up and added virtio-balloon specific functions to intercept migration logics.
157It's too specific to a driver so other drivers who want to make their pages
158movable would have to add own specific hooks in migration path.
159
160To overclome the problem, VM supports non-LRU page migration which provides
161generic functions for non-LRU movable pages without driver specific hooks
162migration path.
163
164If a driver want to make own pages movable, it should define three functions
165which are function pointers of struct address_space_operations.
166
1671. bool (*isolate_page) (struct page *page, isolate_mode_t mode);
168
169What VM expects on isolate_page function of driver is to return *true*
170if driver isolates page successfully. On returing true, VM marks the page
171as PG_isolated so concurrent isolation in several CPUs skip the page
172for isolation. If a driver cannot isolate the page, it should return *false*.
173
174Once page is successfully isolated, VM uses page.lru fields so driver
175shouldn't expect to preserve values in that fields.
176
1772. int (*migratepage) (struct address_space *mapping,
178 struct page *newpage, struct page *oldpage, enum migrate_mode);
179
180After isolation, VM calls migratepage of driver with isolated page.
181The function of migratepage is to move content of the old page to new page
182and set up fields of struct page newpage. Keep in mind that you should
183indicate to the VM the oldpage is no longer movable via __ClearPageMovable()
184under page_lock if you migrated the oldpage successfully and returns
185MIGRATEPAGE_SUCCESS. If driver cannot migrate the page at the moment, driver
186can return -EAGAIN. On -EAGAIN, VM will retry page migration in a short time
187because VM interprets -EAGAIN as "temporal migration failure". On returning
188any error except -EAGAIN, VM will give up the page migration without retrying
189in this time.
190
191Driver shouldn't touch page.lru field VM using in the functions.
192
1933. void (*putback_page)(struct page *);
194
195If migration fails on isolated page, VM should return the isolated page
196to the driver so VM calls driver's putback_page with migration failed page.
197In this function, driver should put the isolated page back to the own data
198structure.
146 199
2004. non-lru movable page flags
201
202There are two page flags for supporting non-lru movable page.
203
204* PG_movable
205
206Driver should use the below function to make page movable under page_lock.
207
208 void __SetPageMovable(struct page *page, struct address_space *mapping)
209
210It needs argument of address_space for registering migration family functions
211which will be called by VM. Exactly speaking, PG_movable is not a real flag of
212struct page. Rather than, VM reuses page->mapping's lower bits to represent it.
213
214 #define PAGE_MAPPING_MOVABLE 0x2
215 page->mapping = page->mapping | PAGE_MAPPING_MOVABLE;
216
217so driver shouldn't access page->mapping directly. Instead, driver should
218use page_mapping which mask off the low two bits of page->mapping under
219page lock so it can get right struct address_space.
220
221For testing of non-lru movable page, VM supports __PageMovable function.
222However, it doesn't guarantee to identify non-lru movable page because
223page->mapping field is unified with other variables in struct page.
224As well, if driver releases the page after isolation by VM, page->mapping
225doesn't have stable value although it has PAGE_MAPPING_MOVABLE
226(Look at __ClearPageMovable). But __PageMovable is cheap to catch whether
227page is LRU or non-lru movable once the page has been isolated. Because
228LRU pages never can have PAGE_MAPPING_MOVABLE in page->mapping. It is also
229good for just peeking to test non-lru movable pages before more expensive
230checking with lock_page in pfn scanning to select victim.
231
232For guaranteeing non-lru movable page, VM provides PageMovable function.
233Unlike __PageMovable, PageMovable functions validates page->mapping and
234mapping->a_ops->isolate_page under lock_page. The lock_page prevents sudden
235destroying of page->mapping.
236
237Driver using __SetPageMovable should clear the flag via __ClearMovablePage
238under page_lock before the releasing the page.
239
240* PG_isolated
241
242To prevent concurrent isolation among several CPUs, VM marks isolated page
243as PG_isolated under lock_page. So if a CPU encounters PG_isolated non-lru
244movable page, it can skip it. Driver doesn't need to manipulate the flag
245because VM will set/clear it automatically. Keep in mind that if driver
246sees PG_isolated page, it means the page have been isolated by VM so it
247shouldn't touch page.lru field.
248PG_isolated is alias with PG_reclaim flag so driver shouldn't use the flag
249for own purpose.
250
251Christoph Lameter, May 8, 2006.
252Minchan Kim, Mar 28, 2016.
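
The Documentation/vm/page_migration text added above spells out the driver-side
contract for non-LRU page migration: three address_space_operations hooks plus
the __SetPageMovable()/__ClearPageMovable() helpers. A minimal sketch of a driver
wiring these up is shown below. It is not part of the patch; the "mydrv_" names
are hypothetical, and only the hook prototypes and the helpers named in the text
above are taken from it.

/*
 * Hypothetical sketch: a driver providing the three non-LRU migration
 * hooks described above.  All "mydrv_" names are illustrative.
 */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/migrate.h>
#include <linux/page-flags.h>

static bool mydrv_isolate_page(struct page *page, isolate_mode_t mode)
{
	/*
	 * Detach the page from the driver's own bookkeeping; from here on
	 * the VM owns page->lru and marks the page PG_isolated itself.
	 */
	return true;	/* true == isolated successfully, false == skip it */
}

static int mydrv_migratepage(struct address_space *mapping,
			     struct page *newpage, struct page *oldpage,
			     enum migrate_mode mode)
{
	/* Copy contents and driver metadata from oldpage to newpage here. */

	/* Once migration succeeded, the old page is no longer movable. */
	__ClearPageMovable(oldpage);
	return MIGRATEPAGE_SUCCESS;	/* or -EAGAIN for a temporary failure */
}

static void mydrv_putback_page(struct page *page)
{
	/* Migration failed: put the isolated page back on the driver's lists. */
}

static const struct address_space_operations mydrv_aops = {
	.isolate_page	= mydrv_isolate_page,
	.migratepage	= mydrv_migratepage,
	.putback_page	= mydrv_putback_page,
};

/*
 * When a page becomes driver-managed and movable, under lock_page():
 *	__SetPageMovable(page, mapping_backed_by_mydrv_aops);
 */

The VM only invokes these hooks on pages the driver has previously marked with
__SetPageMovable(), so a driver decides per page whether it participates in
compaction at all.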
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 7c871d6beb63..2ec6adb5a4ce 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -9,8 +9,8 @@ using huge pages for the backing of virtual memory with huge pages
9that supports the automatic promotion and demotion of page sizes and 9that supports the automatic promotion and demotion of page sizes and
10without the shortcomings of hugetlbfs. 10without the shortcomings of hugetlbfs.
11 11
12Currently it only works for anonymous memory mappings but in the 12Currently it only works for anonymous memory mappings and tmpfs/shmem.
13future it can expand over the pagecache layer starting with tmpfs. 13But in the future it can expand to other filesystems.
14 14
15The reason applications are running faster is because of two 15The reason applications are running faster is because of two
16factors. The first factor is almost completely irrelevant and it's not 16factors. The first factor is almost completely irrelevant and it's not
@@ -57,10 +57,6 @@ miss is going to run faster.
57 feature that applies to all dynamic high order allocations in the 57 feature that applies to all dynamic high order allocations in the
58 kernel) 58 kernel)
59 59
60- this initial support only offers the feature in the anonymous memory
61 regions but it'd be ideal to move it to tmpfs and the pagecache
62 later
63
64Transparent Hugepage Support maximizes the usefulness of free memory 60Transparent Hugepage Support maximizes the usefulness of free memory
65if compared to the reservation approach of hugetlbfs by allowing all 61if compared to the reservation approach of hugetlbfs by allowing all
66unused memory to be used as cache or other movable (or even unmovable 62unused memory to be used as cache or other movable (or even unmovable
@@ -94,21 +90,21 @@ madvise(MADV_HUGEPAGE) on their critical mmapped regions.
94 90
95== sysfs == 91== sysfs ==
96 92
97Transparent Hugepage Support can be entirely disabled (mostly for 93Transparent Hugepage Support for anonymous memory can be entirely disabled
98debugging purposes) or only enabled inside MADV_HUGEPAGE regions (to 94(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
99avoid the risk of consuming more memory resources) or enabled system 95regions (to avoid the risk of consuming more memory resources) or enabled
100wide. This can be achieved with one of: 96system wide. This can be achieved with one of:
101 97
102echo always >/sys/kernel/mm/transparent_hugepage/enabled 98echo always >/sys/kernel/mm/transparent_hugepage/enabled
103echo madvise >/sys/kernel/mm/transparent_hugepage/enabled 99echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
104echo never >/sys/kernel/mm/transparent_hugepage/enabled 100echo never >/sys/kernel/mm/transparent_hugepage/enabled
105 101
106It's also possible to limit defrag efforts in the VM to generate 102It's also possible to limit defrag efforts in the VM to generate
107hugepages in case they're not immediately free to madvise regions or 103anonymous hugepages in case they're not immediately free to madvise
108to never try to defrag memory and simply fallback to regular pages 104regions or to never try to defrag memory and simply fallback to regular
109unless hugepages are immediately available. Clearly if we spend CPU 105pages unless hugepages are immediately available. Clearly if we spend CPU
110time to defrag memory, we would expect to gain even more by the fact 106time to defrag memory, we would expect to gain even more by the fact we
111we use hugepages later instead of regular pages. This isn't always 107use hugepages later instead of regular pages. This isn't always
112guaranteed, but it may be more likely in case the allocation is for a 108guaranteed, but it may be more likely in case the allocation is for a
113MADV_HUGEPAGE region. 109MADV_HUGEPAGE region.
114 110
@@ -133,9 +129,9 @@ that are have used madvise(MADV_HUGEPAGE). This is the default behaviour.
133 129
134"never" should be self-explanatory. 130"never" should be self-explanatory.
135 131
136By default kernel tries to use huge zero page on read page fault. 132By default kernel tries to use huge zero page on read page fault to
137It's possible to disable huge zero page by writing 0 or enable it 133anonymous mapping. It's possible to disable huge zero page by writing 0
138back by writing 1: 134or enable it back by writing 1:
139 135
140echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page 136echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
141echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page 137echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
@@ -204,21 +200,67 @@ Support by passing the parameter "transparent_hugepage=always" or
204"transparent_hugepage=madvise" or "transparent_hugepage=never" 200"transparent_hugepage=madvise" or "transparent_hugepage=never"
205(without "") to the kernel command line. 201(without "") to the kernel command line.
206 202
203== Hugepages in tmpfs/shmem ==
204
205You can control hugepage allocation policy in tmpfs with mount option
206"huge=". It can have following values:
207
208 - "always":
209 Attempt to allocate huge pages every time we need a new page;
210
211 - "never":
212 Do not allocate huge pages;
213
214 - "within_size":
215 Only allocate huge page if it will be fully within i_size.
216 Also respect fadvise()/madvise() hints;
217
218 - "advise:
219 Only allocate huge pages if requested with fadvise()/madvise();
220
221The default policy is "never".
222
223"mount -o remount,huge= /mountpoint" works fine after mount: remounting
224huge=never will not attempt to break up huge pages at all, just stop more
225from being allocated.
226
227There's also sysfs knob to control hugepage allocation policy for internal
228shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount
229is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
230MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
231
232In addition to policies listed above, shmem_enabled allows two further
233values:
234
235 - "deny":
236 For use in emergencies, to force the huge option off from
237 all mounts;
238 - "force":
239 Force the huge option on for all - very useful for testing;
240
207== Need of application restart == 241== Need of application restart ==
208 242
209The transparent_hugepage/enabled values only affect future 243The transparent_hugepage/enabled values and tmpfs mount option only affect
210behavior. So to make them effective you need to restart any 244future behavior. So to make them effective you need to restart any
211application that could have been using hugepages. This also applies to 245application that could have been using hugepages. This also applies to the
212the regions registered in khugepaged. 246regions registered in khugepaged.
213 247
214== Monitoring usage == 248== Monitoring usage ==
215 249
216The number of transparent huge pages currently used by the system is 250The number of anonymous transparent huge pages currently used by the
217available by reading the AnonHugePages field in /proc/meminfo. To 251system is available by reading the AnonHugePages field in /proc/meminfo.
218identify what applications are using transparent huge pages, it is 252To identify what applications are using anonymous transparent huge pages,
219necessary to read /proc/PID/smaps and count the AnonHugePages fields 253it is necessary to read /proc/PID/smaps and count the AnonHugePages fields
220for each mapping. Note that reading the smaps file is expensive and 254for each mapping.
221reading it frequently will incur overhead. 255
256The number of file transparent huge pages mapped to userspace is available
257by reading ShmemPmdMapped and ShmemHugePages fields in /proc/meminfo.
258To identify what applications are mapping file transparent huge pages, it
259is necessary to read /proc/PID/smaps and count the FileHugeMapped fields
260for each mapping.
261
262Note that reading the smaps file is expensive and reading it
263frequently will incur overhead.
222 264
223There are a number of counters in /proc/vmstat that may be used to 265There are a number of counters in /proc/vmstat that may be used to
224monitor how successfully the system is providing huge pages for use. 266monitor how successfully the system is providing huge pages for use.
@@ -238,6 +280,12 @@ thp_collapse_alloc_failed is incremented if khugepaged found a range
238 of pages that should be collapsed into one huge page but failed 280 of pages that should be collapsed into one huge page but failed
239 the allocation. 281 the allocation.
240 282
283thp_file_alloc is incremented every time a file huge page is successfully
284i allocated.
285
286thp_file_mapped is incremented every time a file huge page is mapped into
287 user address space.
288
241thp_split_page is incremented every time a huge page is split into base 289thp_split_page is incremented every time a huge page is split into base
242 pages. This can happen for a variety of reasons but a common 290 pages. This can happen for a variety of reasons but a common
243 reason is that a huge page is old and is being reclaimed. 291 reason is that a huge page is old and is being reclaimed.
@@ -403,19 +451,27 @@ pages:
403 on relevant sub-page of the compound page. 451 on relevant sub-page of the compound page.
404 452
405 - map/unmap of the whole compound page accounted in compound_mapcount 453 - map/unmap of the whole compound page accounted in compound_mapcount
406 (stored in first tail page). 454 (stored in first tail page). For file huge pages, we also increment
455 ->_mapcount of all sub-pages in order to have race-free detection of
456 last unmap of subpages.
407 457
408PageDoubleMap() indicates that ->_mapcount in all subpages is offset up by one. 458PageDoubleMap() indicates that the page is *possibly* mapped with PTEs.
409This additional reference is required to get race-free detection of unmap of 459
410subpages when we have them mapped with both PMDs and PTEs. 460For anonymous pages PageDoubleMap() also indicates ->_mapcount in all
461subpages is offset up by one. This additional reference is required to
462get race-free detection of unmap of subpages when we have them mapped with
463both PMDs and PTEs.
411 464
412This is optimization required to lower overhead of per-subpage mapcount 465This is optimization required to lower overhead of per-subpage mapcount
413tracking. The alternative is alter ->_mapcount in all subpages on each 466tracking. The alternative is alter ->_mapcount in all subpages on each
414map/unmap of the whole compound page. 467map/unmap of the whole compound page.
415 468
416We set PG_double_map when a PMD of the page got split for the first time, 469For anonymous pages, we set PG_double_map when a PMD of the page got split
417but still have PMD mapping. The additional references go away with last 470for the first time, but still have PMD mapping. The additional references
418compound_mapcount. 471go away with last compound_mapcount.
472
473File pages get PG_double_map set on first map of the page with PTE and
474goes away when the page gets evicted from page cache.
419 475
420split_huge_page internally has to distribute the refcounts in the head 476split_huge_page internally has to distribute the refcounts in the head
421page to the tail pages before clearing all PG_head/tail bits from the page 477page to the tail pages before clearing all PG_head/tail bits from the page
@@ -427,7 +483,7 @@ sum of mapcount of all sub-pages plus one (split_huge_page caller must
427have reference for head page). 483have reference for head page).
428 484
429split_huge_page uses migration entries to stabilize page->_refcount and 485split_huge_page uses migration entries to stabilize page->_refcount and
430page->_mapcount. 486page->_mapcount of anonymous pages. File pages just got unmapped.
431 487
432We safe against physical memory scanners too: the only legitimate way 488We safe against physical memory scanners too: the only legitimate way
433scanner can get reference to a page is get_page_unless_zero(). 489scanner can get reference to a page is get_page_unless_zero().
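
The "Hugepages in tmpfs/shmem" section added to transhuge.txt above introduces
the "huge=" tmpfs mount option. As an illustration only (not part of the patch;
the mount point, the size= value and the choice of huge=always are example
assumptions), a mount request using that option could look like:

/* Illustrative userspace snippet: mount a tmpfs with huge page policy. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Equivalent to: mount -t tmpfs -o huge=always,size=1G tmpfs /mnt/huge-tmpfs */
	if (mount("tmpfs", "/mnt/huge-tmpfs", "tmpfs", 0,
		  "huge=always,size=1G") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}

As the documentation above notes, the policy can be changed later with
"mount -o remount,huge=never /mountpoint", which stops further huge page
allocations without breaking up pages that already exist.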
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt
index fa3b527086fa..0026a8d33fc0 100644
--- a/Documentation/vm/unevictable-lru.txt
+++ b/Documentation/vm/unevictable-lru.txt
@@ -461,6 +461,27 @@ unevictable LRU is enabled, the work of compaction is mostly handled by
461the page migration code and the same work flow as described in MIGRATING 461the page migration code and the same work flow as described in MIGRATING
462MLOCKED PAGES will apply. 462MLOCKED PAGES will apply.
463 463
464MLOCKING TRANSPARENT HUGE PAGES
465-------------------------------
466
467A transparent huge page is represented by a single entry on an LRU list.
468Therefore, we can only make unevictable an entire compound page, not
469individual subpages.
470
471If a user tries to mlock() part of a huge page, we want the rest of the
472page to be reclaimable.
473
474We cannot just split the page on partial mlock() as split_huge_page() can
475fail and new intermittent failure mode for the syscall is undesirable.
476
477We handle this by keeping PTE-mapped huge pages on normal LRU lists: the
478PMD on border of VM_LOCKED VMA will be split into PTE table.
479
480This way the huge page is accessible for vmscan. Under memory pressure the
481page will be split, subpages which belong to VM_LOCKED VMAs will be moved
482to unevictable LRU and the rest can be reclaimed.
483
484See also comment in follow_trans_huge_pmd().
464 485
465mmap(MAP_LOCKED) SYSTEM CALL HANDLING 486mmap(MAP_LOCKED) SYSTEM CALL HANDLING
466------------------------------------- 487-------------------------------------
diff --git a/Makefile b/Makefile
index 6cd4d62cb1c1..776b6a3c40f1 100644
--- a/Makefile
+++ b/Makefile
@@ -647,41 +647,28 @@ ifneq ($(CONFIG_FRAME_WARN),0)
647KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN}) 647KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN})
648endif 648endif
649 649
650# Handle stack protector mode. 650# This selects the stack protector compiler flag. Testing it is delayed
651# 651# until after .config has been reprocessed, in the prepare-compiler-check
652# Since kbuild can potentially perform two passes (first with the old 652# target.
653# .config values and then with updated .config values), we cannot error out
654# if a desired compiler option is unsupported. If we were to error, kbuild
655# could never get to the second pass and actually notice that we changed
656# the option to something that was supported.
657#
658# Additionally, we don't want to fallback and/or silently change which compiler
659# flags will be used, since that leads to producing kernels with different
660# security feature characteristics depending on the compiler used. ("But I
661# selected CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!")
662#
663# The middle ground is to warn here so that the failed option is obvious, but
664# to let the build fail with bad compiler flags so that we can't produce a
665# kernel when there is a CONFIG and compiler mismatch.
666#
667ifdef CONFIG_CC_STACKPROTECTOR_REGULAR 653ifdef CONFIG_CC_STACKPROTECTOR_REGULAR
668 stackp-flag := -fstack-protector 654 stackp-flag := -fstack-protector
669 ifeq ($(call cc-option, $(stackp-flag)),) 655 stackp-name := REGULAR
670 $(warning Cannot use CONFIG_CC_STACKPROTECTOR_REGULAR: \
671 -fstack-protector not supported by compiler)
672 endif
673else 656else
674ifdef CONFIG_CC_STACKPROTECTOR_STRONG 657ifdef CONFIG_CC_STACKPROTECTOR_STRONG
675 stackp-flag := -fstack-protector-strong 658 stackp-flag := -fstack-protector-strong
676 ifeq ($(call cc-option, $(stackp-flag)),) 659 stackp-name := STRONG
677 $(warning Cannot use CONFIG_CC_STACKPROTECTOR_STRONG: \
678 -fstack-protector-strong not supported by compiler)
679 endif
680else 660else
681 # Force off for distro compilers that enable stack protector by default. 661 # Force off for distro compilers that enable stack protector by default.
682 stackp-flag := $(call cc-option, -fno-stack-protector) 662 stackp-flag := $(call cc-option, -fno-stack-protector)
683endif 663endif
684endif 664endif
665# Find arch-specific stack protector compiler sanity-checking script.
666ifdef CONFIG_CC_STACKPROTECTOR
667 stackp-path := $(srctree)/scripts/gcc-$(ARCH)_$(BITS)-has-stack-protector.sh
668 ifneq ($(wildcard $(stackp-path)),)
669 stackp-check := $(stackp-path)
670 endif
671endif
685KBUILD_CFLAGS += $(stackp-flag) 672KBUILD_CFLAGS += $(stackp-flag)
686 673
687ifdef CONFIG_KCOV 674ifdef CONFIG_KCOV
@@ -1017,8 +1004,10 @@ ifneq ($(KBUILD_SRC),)
1017 fi; 1004 fi;
1018endif 1005endif
1019 1006
1020# prepare2 creates a makefile if using a separate output directory 1007# prepare2 creates a makefile if using a separate output directory.
1021prepare2: prepare3 outputmakefile asm-generic 1008# From this point forward, .config has been reprocessed, so any rules
1009# that need to depend on updated CONFIG_* values can be checked here.
1010prepare2: prepare3 prepare-compiler-check outputmakefile asm-generic
1022 1011
1023prepare1: prepare2 $(version_h) include/generated/utsrelease.h \ 1012prepare1: prepare2 $(version_h) include/generated/utsrelease.h \
1024 include/config/auto.conf 1013 include/config/auto.conf
@@ -1049,6 +1038,32 @@ endif
1049PHONY += prepare-objtool 1038PHONY += prepare-objtool
1050prepare-objtool: $(objtool_target) 1039prepare-objtool: $(objtool_target)
1051 1040
1041# Check for CONFIG flags that require compiler support. Abort the build
1042# after .config has been processed, but before the kernel build starts.
1043#
1044# For security-sensitive CONFIG options, we don't want to fallback and/or
1045# silently change which compiler flags will be used, since that leads to
1046# producing kernels with different security feature characteristics
1047# depending on the compiler used. (For example, "But I selected
1048# CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!")
1049PHONY += prepare-compiler-check
1050prepare-compiler-check: FORCE
1051# Make sure compiler supports requested stack protector flag.
1052ifdef stackp-name
1053 ifeq ($(call cc-option, $(stackp-flag)),)
1054 @echo Cannot use CONFIG_CC_STACKPROTECTOR_$(stackp-name): \
1055 $(stackp-flag) not supported by compiler >&2 && exit 1
1056 endif
1057endif
1058# Make sure compiler does not have buggy stack-protector support.
1059ifdef stackp-check
1060 ifneq ($(shell $(CONFIG_SHELL) $(stackp-check) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y)
1061 @echo Cannot use CONFIG_CC_STACKPROTECTOR_$(stackp-name): \
1062 $(stackp-flag) available but compiler is broken >&2 && exit 1
1063 endif
1064endif
1065 @:
1066
1052# Generate some files 1067# Generate some files
1053# --------------------------------------------------------------------------- 1068# ---------------------------------------------------------------------------
1054 1069
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
index 4a905bd667e2..83e9eee57a55 100644
--- a/arch/alpha/mm/fault.c
+++ b/arch/alpha/mm/fault.c
@@ -147,7 +147,7 @@ retry:
147 /* If for any reason at all we couldn't handle the fault, 147 /* If for any reason at all we couldn't handle the fault,
148 make sure we exit gracefully rather than endlessly redo 148 make sure we exit gracefully rather than endlessly redo
149 the fault. */ 149 the fault. */
150 fault = handle_mm_fault(mm, vma, address, flags); 150 fault = handle_mm_fault(vma, address, flags);
151 151
152 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 152 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
153 return; 153 return;
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
index af63f4a13e60..e94e5aa33985 100644
--- a/arch/arc/mm/fault.c
+++ b/arch/arc/mm/fault.c
@@ -137,7 +137,7 @@ good_area:
137 * make sure we exit gracefully rather than endlessly redo 137 * make sure we exit gracefully rather than endlessly redo
138 * the fault. 138 * the fault.
139 */ 139 */
140 fault = handle_mm_fault(mm, vma, address, flags); 140 fault = handle_mm_fault(vma, address, flags);
141 141
142 /* If Pagefault was interrupted by SIGKILL, exit page fault "early" */ 142 /* If Pagefault was interrupted by SIGKILL, exit page fault "early" */
143 if (unlikely(fatal_signal_pending(current))) { 143 if (unlikely(fatal_signal_pending(current))) {
diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h
index 20febb368844..b2902a5cd780 100644
--- a/arch/arm/include/asm/pgalloc.h
+++ b/arch/arm/include/asm/pgalloc.h
@@ -57,7 +57,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
57extern pgd_t *pgd_alloc(struct mm_struct *mm); 57extern pgd_t *pgd_alloc(struct mm_struct *mm);
58extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); 58extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
59 59
60#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) 60#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
61 61
62static inline void clean_pte_table(pte_t *pte) 62static inline void clean_pte_table(pte_t *pte)
63{ 63{
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index 3cadb726ec88..1e25cd80589e 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -209,17 +209,38 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
209 tlb_flush(tlb); 209 tlb_flush(tlb);
210} 210}
211 211
212static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) 212static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
213{ 213{
214 if (tlb->nr == tlb->max)
215 return true;
214 tlb->pages[tlb->nr++] = page; 216 tlb->pages[tlb->nr++] = page;
215 VM_BUG_ON(tlb->nr > tlb->max); 217 return false;
216 return tlb->max - tlb->nr;
217} 218}
218 219
219static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) 220static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
220{ 221{
221 if (!__tlb_remove_page(tlb, page)) 222 if (__tlb_remove_page(tlb, page)) {
222 tlb_flush_mmu(tlb); 223 tlb_flush_mmu(tlb);
224 __tlb_remove_page(tlb, page);
225 }
226}
227
228static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
229 struct page *page, int page_size)
230{
231 return __tlb_remove_page(tlb, page);
232}
233
234static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
235 struct page *page)
236{
237 return __tlb_remove_page(tlb, page);
238}
239
240static inline void tlb_remove_page_size(struct mmu_gather *tlb,
241 struct page *page, int page_size)
242{
243 return tlb_remove_page(tlb, page);
223} 244}
224 245
225static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, 246static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index ad5841856007..3a2e678b8d30 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -243,7 +243,7 @@ good_area:
243 goto out; 243 goto out;
244 } 244 }
245 245
246 return handle_mm_fault(mm, vma, addr & PAGE_MASK, flags); 246 return handle_mm_fault(vma, addr & PAGE_MASK, flags);
247 247
248check_stack: 248check_stack:
249 /* Don't allow expansion below FIRST_USER_ADDRESS */ 249 /* Don't allow expansion below FIRST_USER_ADDRESS */
diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index b8d477321730..c1c1a5c67da1 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -23,7 +23,7 @@
23#define __pgd_alloc() kmalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL) 23#define __pgd_alloc() kmalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL)
24#define __pgd_free(pgd) kfree(pgd) 24#define __pgd_free(pgd) kfree(pgd)
25#else 25#else
26#define __pgd_alloc() (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_REPEAT, 2) 26#define __pgd_alloc() (pgd_t *)__get_free_pages(GFP_KERNEL, 2)
27#define __pgd_free(pgd) free_pages((unsigned long)pgd, 2) 27#define __pgd_free(pgd) free_pages((unsigned long)pgd, 2)
28#endif 28#endif
29 29
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index b1166d1e5955..031820d989a8 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -233,7 +233,7 @@ good_area:
233 goto out; 233 goto out;
234 } 234 }
235 235
236 return handle_mm_fault(mm, vma, addr & PAGE_MASK, mm_flags); 236 return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags);
237 237
238check_stack: 238check_stack:
239 if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr)) 239 if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr))
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
index c03533937a9f..a4b7edac8f10 100644
--- a/arch/avr32/mm/fault.c
+++ b/arch/avr32/mm/fault.c
@@ -134,7 +134,7 @@ good_area:
134 * sure we exit gracefully rather than endlessly redo the 134 * sure we exit gracefully rather than endlessly redo the
135 * fault. 135 * fault.
136 */ 136 */
137 fault = handle_mm_fault(mm, vma, address, flags); 137 fault = handle_mm_fault(vma, address, flags);
138 138
139 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 139 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
140 return; 140 return;
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
index 3066d40a6db1..112ef26c7f2e 100644
--- a/arch/cris/mm/fault.c
+++ b/arch/cris/mm/fault.c
@@ -168,7 +168,7 @@ retry:
168 * the fault. 168 * the fault.
169 */ 169 */
170 170
171 fault = handle_mm_fault(mm, vma, address, flags); 171 fault = handle_mm_fault(vma, address, flags);
172 172
173 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 173 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
174 return; 174 return;
diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c
index 61d99767fe16..614a46c413d2 100644
--- a/arch/frv/mm/fault.c
+++ b/arch/frv/mm/fault.c
@@ -164,7 +164,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
164 * make sure we exit gracefully rather than endlessly redo 164 * make sure we exit gracefully rather than endlessly redo
165 * the fault. 165 * the fault.
166 */ 166 */
167 fault = handle_mm_fault(mm, vma, ear0, flags); 167 fault = handle_mm_fault(vma, ear0, flags);
168 if (unlikely(fault & VM_FAULT_ERROR)) { 168 if (unlikely(fault & VM_FAULT_ERROR)) {
169 if (fault & VM_FAULT_OOM) 169 if (fault & VM_FAULT_OOM)
170 goto out_of_memory; 170 goto out_of_memory;
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
index 8704c9320032..bd7c251e2bce 100644
--- a/arch/hexagon/mm/vm_fault.c
+++ b/arch/hexagon/mm/vm_fault.c
@@ -101,7 +101,7 @@ good_area:
101 break; 101 break;
102 } 102 }
103 103
104 fault = handle_mm_fault(mm, vma, address, flags); 104 fault = handle_mm_fault(vma, address, flags);
105 105
106 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 106 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
107 return; 107 return;
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h
index 39d64e0df1de..77e541cf0e5d 100644
--- a/arch/ia64/include/asm/tlb.h
+++ b/arch/ia64/include/asm/tlb.h
@@ -205,17 +205,18 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
205 * must be delayed until after the TLB has been flushed (see comments at the beginning of 205 * must be delayed until after the TLB has been flushed (see comments at the beginning of
206 * this file). 206 * this file).
207 */ 207 */
208static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) 208static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
209{ 209{
210 if (tlb->nr == tlb->max)
211 return true;
212
210 tlb->need_flush = 1; 213 tlb->need_flush = 1;
211 214
212 if (!tlb->nr && tlb->pages == tlb->local) 215 if (!tlb->nr && tlb->pages == tlb->local)
213 __tlb_alloc_page(tlb); 216 __tlb_alloc_page(tlb);
214 217
215 tlb->pages[tlb->nr++] = page; 218 tlb->pages[tlb->nr++] = page;
216 VM_BUG_ON(tlb->nr > tlb->max); 219 return false;
217
218 return tlb->max - tlb->nr;
219} 220}
220 221
221static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) 222static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
@@ -235,8 +236,28 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb)
235 236
236static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) 237static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
237{ 238{
238 if (!__tlb_remove_page(tlb, page)) 239 if (__tlb_remove_page(tlb, page)) {
239 tlb_flush_mmu(tlb); 240 tlb_flush_mmu(tlb);
241 __tlb_remove_page(tlb, page);
242 }
243}
244
245static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
246 struct page *page, int page_size)
247{
248 return __tlb_remove_page(tlb, page);
249}
250
251static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
252 struct page *page)
253{
254 return __tlb_remove_page(tlb, page);
255}
256
257static inline void tlb_remove_page_size(struct mmu_gather *tlb,
258 struct page *page, int page_size)
259{
260 return tlb_remove_page(tlb, page);
240} 261}
241 262
242/* 263/*
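The ia64 hunk above captures the new __tlb_remove_page() contract: instead of returning the remaining capacity, it returns true when the batch is full, and the caller flushes and retries the same page. A self-contained model of that pattern, with made-up types and a fixed batch size:

/* Hedged sketch with stand-in types; mirrors the flush-and-retry pattern. */
#include <stdbool.h>
#include <stddef.h>

struct gather {
	void *pages[8];
	size_t nr, max;			/* max would be 8 here */
};

static bool gather_add(struct gather *g, void *page)
{
	if (g->nr == g->max)
		return true;		/* batch full: caller must flush first */
	g->pages[g->nr++] = page;
	return false;			/* page queued */
}

static void gather_flush(struct gather *g)
{
	g->nr = 0;			/* free the queued pages, then reset */
}

static void gather_remove(struct gather *g, void *page)
{
	if (gather_add(g, page)) {	/* same shape as tlb_remove_page() above */
		gather_flush(g);
		gather_add(g, page);
	}
}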
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 70b40d1205a6..fa6ad95e992e 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -159,7 +159,7 @@ retry:
159 * sure we exit gracefully rather than endlessly redo the 159 * sure we exit gracefully rather than endlessly redo the
160 * fault. 160 * fault.
161 */ 161 */
162 fault = handle_mm_fault(mm, vma, address, flags); 162 fault = handle_mm_fault(vma, address, flags);
163 163
164 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 164 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
165 return; 165 return;
diff --git a/arch/m32r/kernel/m32r_ksyms.c b/arch/m32r/kernel/m32r_ksyms.c
index b727e693c805..23f26f4adfff 100644
--- a/arch/m32r/kernel/m32r_ksyms.c
+++ b/arch/m32r/kernel/m32r_ksyms.c
@@ -41,6 +41,9 @@ EXPORT_SYMBOL(cpu_data);
41EXPORT_SYMBOL(smp_flush_tlb_page); 41EXPORT_SYMBOL(smp_flush_tlb_page);
42#endif 42#endif
43 43
44extern int __ucmpdi2(unsigned long long a, unsigned long long b);
45EXPORT_SYMBOL(__ucmpdi2);
46
44/* compiler generated symbol */ 47/* compiler generated symbol */
45extern void __ashldi3(void); 48extern void __ashldi3(void);
46extern void __ashrdi3(void); 49extern void __ashrdi3(void);
diff --git a/arch/m32r/lib/Makefile b/arch/m32r/lib/Makefile
index d16b4e40d1ae..5889eb9610b5 100644
--- a/arch/m32r/lib/Makefile
+++ b/arch/m32r/lib/Makefile
@@ -3,5 +3,5 @@
3# 3#
4 4
5lib-y := checksum.o ashxdi3.o memset.o memcpy.o \ 5lib-y := checksum.o ashxdi3.o memset.o memcpy.o \
6 delay.o strlen.o usercopy.o csum_partial_copy.o 6 delay.o strlen.o usercopy.o csum_partial_copy.o \
7 7 ucmpdi2.o
diff --git a/arch/m32r/lib/libgcc.h b/arch/m32r/lib/libgcc.h
new file mode 100644
index 000000000000..267aa435bc35
--- /dev/null
+++ b/arch/m32r/lib/libgcc.h
@@ -0,0 +1,23 @@
1#ifndef __ASM_LIBGCC_H
2#define __ASM_LIBGCC_H
3
4#include <asm/byteorder.h>
5
6#ifdef __BIG_ENDIAN
7struct DWstruct {
8 int high, low;
9};
10#elif defined(__LITTLE_ENDIAN)
11struct DWstruct {
12 int low, high;
13};
14#else
15#error I feel sick.
16#endif
17
18typedef union {
19 struct DWstruct s;
20 long long ll;
21} DWunion;
22
23#endif /* __ASM_LIBGCC_H */
diff --git a/arch/m32r/lib/ucmpdi2.c b/arch/m32r/lib/ucmpdi2.c
new file mode 100644
index 000000000000..9d3c682c89b5
--- /dev/null
+++ b/arch/m32r/lib/ucmpdi2.c
@@ -0,0 +1,17 @@
1#include "libgcc.h"
2
3int __ucmpdi2(unsigned long long a, unsigned long long b)
4{
5 const DWunion au = {.ll = a};
6 const DWunion bu = {.ll = b};
7
8 if ((unsigned int)au.s.high < (unsigned int)bu.s.high)
9 return 0;
10 else if ((unsigned int)au.s.high > (unsigned int)bu.s.high)
11 return 2;
12 if ((unsigned int)au.s.low < (unsigned int)bu.s.low)
13 return 0;
14 else if ((unsigned int)au.s.low > (unsigned int)bu.s.low)
15 return 2;
16 return 1;
17}
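__ucmpdi2() follows the usual libgcc convention for unsigned 64-bit comparison helpers: it returns 0 when a < b, 1 when the values are equal, and 2 when a > b. A quick userspace check of that same contract (an assumed-equivalent reimplementation, not the kernel build):

#include <assert.h>

static int ucmpdi2(unsigned long long a, unsigned long long b)
{
	return (a > b) - (a < b) + 1;	/* 0: a < b, 1: equal, 2: a > b */
}

int main(void)
{
	assert(ucmpdi2(1ULL, 2ULL) == 0);
	assert(ucmpdi2(5ULL, 5ULL) == 1);
	assert(ucmpdi2(1ULL << 40, 1ULL) == 2);
	return 0;
}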
diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c
index 8f9875b7933d..a3785d3644c2 100644
--- a/arch/m32r/mm/fault.c
+++ b/arch/m32r/mm/fault.c
@@ -196,7 +196,7 @@ good_area:
196 */ 196 */
197 addr = (address & PAGE_MASK); 197 addr = (address & PAGE_MASK);
198 set_thread_fault_code(error_code); 198 set_thread_fault_code(error_code);
199 fault = handle_mm_fault(mm, vma, addr, flags); 199 fault = handle_mm_fault(vma, addr, flags);
200 if (unlikely(fault & VM_FAULT_ERROR)) { 200 if (unlikely(fault & VM_FAULT_ERROR)) {
201 if (fault & VM_FAULT_OOM) 201 if (fault & VM_FAULT_OOM)
202 goto out_of_memory; 202 goto out_of_memory;
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
index 6a94cdd0c830..bd66a0b20c6b 100644
--- a/arch/m68k/mm/fault.c
+++ b/arch/m68k/mm/fault.c
@@ -136,7 +136,7 @@ good_area:
136 * the fault. 136 * the fault.
137 */ 137 */
138 138
139 fault = handle_mm_fault(mm, vma, address, flags); 139 fault = handle_mm_fault(vma, address, flags);
140 pr_debug("handle_mm_fault returns %d\n", fault); 140 pr_debug("handle_mm_fault returns %d\n", fault);
141 141
142 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 142 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c
index f57edca63609..372783a67dda 100644
--- a/arch/metag/mm/fault.c
+++ b/arch/metag/mm/fault.c
@@ -133,7 +133,7 @@ good_area:
133 * make sure we exit gracefully rather than endlessly redo 133 * make sure we exit gracefully rather than endlessly redo
134 * the fault. 134 * the fault.
135 */ 135 */
136 fault = handle_mm_fault(mm, vma, address, flags); 136 fault = handle_mm_fault(vma, address, flags);
137 137
138 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 138 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
139 return 0; 139 return 0;
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
index 177dfc003643..abb678ccde6f 100644
--- a/arch/microblaze/mm/fault.c
+++ b/arch/microblaze/mm/fault.c
@@ -216,7 +216,7 @@ good_area:
216 * make sure we exit gracefully rather than endlessly redo 216 * make sure we exit gracefully rather than endlessly redo
217 * the fault. 217 * the fault.
218 */ 218 */
219 fault = handle_mm_fault(mm, vma, address, flags); 219 fault = handle_mm_fault(vma, address, flags);
220 220
221 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 221 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
222 return; 222 return;
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
index 4b88fa031891..9560ad731120 100644
--- a/arch/mips/mm/fault.c
+++ b/arch/mips/mm/fault.c
@@ -153,7 +153,7 @@ good_area:
153 * make sure we exit gracefully rather than endlessly redo 153 * make sure we exit gracefully rather than endlessly redo
154 * the fault. 154 * the fault.
155 */ 155 */
156 fault = handle_mm_fault(mm, vma, address, flags); 156 fault = handle_mm_fault(vma, address, flags);
157 157
158 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 158 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
159 return; 159 return;
diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
index 4a1d181ed32f..f23781d6bbb3 100644
--- a/arch/mn10300/mm/fault.c
+++ b/arch/mn10300/mm/fault.c
@@ -254,7 +254,7 @@ good_area:
254 * make sure we exit gracefully rather than endlessly redo 254 * make sure we exit gracefully rather than endlessly redo
255 * the fault. 255 * the fault.
256 */ 256 */
257 fault = handle_mm_fault(mm, vma, address, flags); 257 fault = handle_mm_fault(vma, address, flags);
258 258
259 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 259 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
260 return; 260 return;
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c
index b51878b0c6b8..affc4eb3f89e 100644
--- a/arch/nios2/mm/fault.c
+++ b/arch/nios2/mm/fault.c
@@ -131,7 +131,7 @@ good_area:
131 * make sure we exit gracefully rather than endlessly redo 131 * make sure we exit gracefully rather than endlessly redo
132 * the fault. 132 * the fault.
133 */ 133 */
134 fault = handle_mm_fault(mm, vma, address, flags); 134 fault = handle_mm_fault(vma, address, flags);
135 135
136 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 136 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
137 return; 137 return;
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
index 230ac20ae794..e94cd225e816 100644
--- a/arch/openrisc/mm/fault.c
+++ b/arch/openrisc/mm/fault.c
@@ -163,7 +163,7 @@ good_area:
163 * the fault. 163 * the fault.
164 */ 164 */
165 165
166 fault = handle_mm_fault(mm, vma, address, flags); 166 fault = handle_mm_fault(vma, address, flags);
167 167
168 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 168 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
169 return; 169 return;
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index 16dbe81c97c9..163af2c31d76 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -239,7 +239,7 @@ good_area:
239 * fault. 239 * fault.
240 */ 240 */
241 241
242 fault = handle_mm_fault(mm, vma, address, flags); 242 fault = handle_mm_fault(vma, address, flags);
243 243
244 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 244 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
245 return; 245 return;
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h
index ee09e99097f0..9bd87f269d6d 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -71,10 +71,8 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
71static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, 71static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
72 bool *is_thp, unsigned *shift) 72 bool *is_thp, unsigned *shift)
73{ 73{
74 if (!arch_irqs_disabled()) { 74 VM_WARN(!arch_irqs_disabled(),
75 pr_info("%s called with irq enabled\n", __func__); 75 "%s called with irq enabled\n", __func__);
76 dump_stack();
77 }
78 return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift); 76 return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift);
79} 77}
80 78
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c
index 6527882ce05e..bb0354222b11 100644
--- a/arch/powerpc/mm/copro_fault.c
+++ b/arch/powerpc/mm/copro_fault.c
@@ -75,7 +75,7 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea,
75 } 75 }
76 76
77 ret = 0; 77 ret = 0;
78 *flt = handle_mm_fault(mm, vma, ea, is_write ? FAULT_FLAG_WRITE : 0); 78 *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0);
79 if (unlikely(*flt & VM_FAULT_ERROR)) { 79 if (unlikely(*flt & VM_FAULT_ERROR)) {
80 if (*flt & VM_FAULT_OOM) { 80 if (*flt & VM_FAULT_OOM) {
81 ret = -ENOMEM; 81 ret = -ENOMEM;
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index a67c6d781c52..a4db22f65021 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -429,7 +429,7 @@ good_area:
429 * make sure we exit gracefully rather than endlessly redo 429 * make sure we exit gracefully rather than endlessly redo
430 * the fault. 430 * the fault.
431 */ 431 */
432 fault = handle_mm_fault(mm, vma, address, flags); 432 fault = handle_mm_fault(vma, address, flags);
433 if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { 433 if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
434 if (fault & VM_FAULT_SIGSEGV) 434 if (fault & VM_FAULT_SIGSEGV)
435 goto bad_area; 435 goto bad_area;
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 7a92e69c50bc..15711de10403 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -87,10 +87,10 @@ static inline void tlb_finish_mmu(struct mmu_gather *tlb,
87 * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page 87 * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page
88 * has already been freed, so just do free_page_and_swap_cache. 88 * has already been freed, so just do free_page_and_swap_cache.
89 */ 89 */
90static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) 90static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
91{ 91{
92 free_page_and_swap_cache(page); 92 free_page_and_swap_cache(page);
93 return 1; /* avoid calling tlb_flush_mmu */ 93 return false; /* avoid calling tlb_flush_mmu */
94} 94}
95 95
96static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) 96static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
@@ -98,6 +98,24 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
98 free_page_and_swap_cache(page); 98 free_page_and_swap_cache(page);
99} 99}
100 100
101static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
102 struct page *page, int page_size)
103{
104 return __tlb_remove_page(tlb, page);
105}
106
107static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
108 struct page *page)
109{
110 return __tlb_remove_page(tlb, page);
111}
112
113static inline void tlb_remove_page_size(struct mmu_gather *tlb,
114 struct page *page, int page_size)
115{
116 return tlb_remove_page(tlb, page);
117}
118
101/* 119/*
102 * pte_free_tlb frees a pte table and clears the CRSTE for the 120 * pte_free_tlb frees a pte table and clears the CRSTE for the
103 * page table from the tlb. 121 * page table from the tlb.
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 6ad7eff84c82..25783dc3c813 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -456,7 +456,7 @@ retry:
456 * make sure we exit gracefully rather than endlessly redo 456 * make sure we exit gracefully rather than endlessly redo
457 * the fault. 457 * the fault.
458 */ 458 */
459 fault = handle_mm_fault(mm, vma, address, flags); 459 fault = handle_mm_fault(vma, address, flags);
460 /* No reason to continue if interrupted by SIGKILL. */ 460 /* No reason to continue if interrupted by SIGKILL. */
461 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { 461 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
462 fault = VM_FAULT_SIGNAL; 462 fault = VM_FAULT_SIGNAL;
diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c
index 37a6c2e0e969..995b71e4db4b 100644
--- a/arch/score/mm/fault.c
+++ b/arch/score/mm/fault.c
@@ -111,7 +111,7 @@ good_area:
111 * make sure we exit gracefully rather than endlessly redo 111 * make sure we exit gracefully rather than endlessly redo
112 * the fault. 112 * the fault.
113 */ 113 */
114 fault = handle_mm_fault(mm, vma, address, flags); 114 fault = handle_mm_fault(vma, address, flags);
115 if (unlikely(fault & VM_FAULT_ERROR)) { 115 if (unlikely(fault & VM_FAULT_ERROR)) {
116 if (fault & VM_FAULT_OOM) 116 if (fault & VM_FAULT_OOM)
117 goto out_of_memory; 117 goto out_of_memory;
diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h
index 62f80d2a9df9..025cdb1032f6 100644
--- a/arch/sh/include/asm/tlb.h
+++ b/arch/sh/include/asm/tlb.h
@@ -101,7 +101,7 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb)
101static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) 101static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
102{ 102{
103 free_page_and_swap_cache(page); 103 free_page_and_swap_cache(page);
104 return 1; /* avoid calling tlb_flush_mmu */ 104 return false; /* avoid calling tlb_flush_mmu */
105} 105}
106 106
107static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) 107static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
@@ -109,6 +109,24 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
109 __tlb_remove_page(tlb, page); 109 __tlb_remove_page(tlb, page);
110} 110}
111 111
112static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
113 struct page *page, int page_size)
114{
115 return __tlb_remove_page(tlb, page);
116}
117
118static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
119 struct page *page)
120{
121 return __tlb_remove_page(tlb, page);
122}
123
124static inline void tlb_remove_page_size(struct mmu_gather *tlb,
125 struct page *page, int page_size)
126{
127 return tlb_remove_page(tlb, page);
128}
129
112#define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) 130#define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep)
113#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) 131#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp)
114#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) 132#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp)
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index 79d8276377d1..9bf876780cef 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -487,7 +487,7 @@ good_area:
487 * make sure we exit gracefully rather than endlessly redo 487 * make sure we exit gracefully rather than endlessly redo
488 * the fault. 488 * the fault.
489 */ 489 */
490 fault = handle_mm_fault(mm, vma, address, flags); 490 fault = handle_mm_fault(vma, address, flags);
491 491
492 if (unlikely(fault & (VM_FAULT_RETRY | VM_FAULT_ERROR))) 492 if (unlikely(fault & (VM_FAULT_RETRY | VM_FAULT_ERROR)))
493 if (mm_fault_error(regs, error_code, address, fault)) 493 if (mm_fault_error(regs, error_code, address, fault))
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
index b6c559cbd64d..4714061d6cd3 100644
--- a/arch/sparc/mm/fault_32.c
+++ b/arch/sparc/mm/fault_32.c
@@ -241,7 +241,7 @@ good_area:
241 * make sure we exit gracefully rather than endlessly redo 241 * make sure we exit gracefully rather than endlessly redo
242 * the fault. 242 * the fault.
243 */ 243 */
244 fault = handle_mm_fault(mm, vma, address, flags); 244 fault = handle_mm_fault(vma, address, flags);
245 245
246 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 246 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
247 return; 247 return;
@@ -411,7 +411,7 @@ good_area:
411 if (!(vma->vm_flags & (VM_READ | VM_EXEC))) 411 if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
412 goto bad_area; 412 goto bad_area;
413 } 413 }
414 switch (handle_mm_fault(mm, vma, address, flags)) { 414 switch (handle_mm_fault(vma, address, flags)) {
415 case VM_FAULT_SIGBUS: 415 case VM_FAULT_SIGBUS:
416 case VM_FAULT_OOM: 416 case VM_FAULT_OOM:
417 goto do_sigbus; 417 goto do_sigbus;
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
index cb841a33da59..6c43b924a7a2 100644
--- a/arch/sparc/mm/fault_64.c
+++ b/arch/sparc/mm/fault_64.c
@@ -436,7 +436,7 @@ good_area:
436 goto bad_area; 436 goto bad_area;
437 } 437 }
438 438
439 fault = handle_mm_fault(mm, vma, address, flags); 439 fault = handle_mm_fault(vma, address, flags);
440 440
441 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 441 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
442 goto exit_exception; 442 goto exit_exception;
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 26734214818c..beba986589e5 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -434,7 +434,7 @@ good_area:
434 * make sure we exit gracefully rather than endlessly redo 434 * make sure we exit gracefully rather than endlessly redo
435 * the fault. 435 * the fault.
436 */ 436 */
437 fault = handle_mm_fault(mm, vma, address, flags); 437 fault = handle_mm_fault(vma, address, flags);
438 438
439 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 439 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
440 return 0; 440 return 0;
diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h
index 16eb63fac57d..821ff0acfe17 100644
--- a/arch/um/include/asm/tlb.h
+++ b/arch/um/include/asm/tlb.h
@@ -102,7 +102,7 @@ static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
102{ 102{
103 tlb->need_flush = 1; 103 tlb->need_flush = 1;
104 free_page_and_swap_cache(page); 104 free_page_and_swap_cache(page);
105 return 1; /* avoid calling tlb_flush_mmu */ 105 return false; /* avoid calling tlb_flush_mmu */
106} 106}
107 107
108static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) 108static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
@@ -110,6 +110,24 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
110 __tlb_remove_page(tlb, page); 110 __tlb_remove_page(tlb, page);
111} 111}
112 112
113static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
114 struct page *page, int page_size)
115{
116 return __tlb_remove_page(tlb, page);
117}
118
119static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
120 struct page *page)
121{
122 return __tlb_remove_page(tlb, page);
123}
124
125static inline void tlb_remove_page_size(struct mmu_gather *tlb,
126 struct page *page, int page_size)
127{
128 return tlb_remove_page(tlb, page);
129}
130
113/** 131/**
114 * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. 132 * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation.
115 * 133 *
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 98783dd0fa2e..ad8f206ab5e8 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -73,7 +73,7 @@ good_area:
73 do { 73 do {
74 int fault; 74 int fault;
75 75
76 fault = handle_mm_fault(mm, vma, address, flags); 76 fault = handle_mm_fault(vma, address, flags);
77 77
78 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 78 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
79 goto out_nosemaphore; 79 goto out_nosemaphore;
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
index 2ec3d3adcefc..6c7f70bcaae3 100644
--- a/arch/unicore32/mm/fault.c
+++ b/arch/unicore32/mm/fault.c
@@ -194,7 +194,7 @@ good_area:
194 * If for any reason at all we couldn't handle the fault, make 194 * If for any reason at all we couldn't handle the fault, make
195 * sure we exit gracefully rather than endlessly redo the fault. 195 * sure we exit gracefully rather than endlessly redo the fault.
196 */ 196 */
197 fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, flags); 197 fault = handle_mm_fault(vma, addr & PAGE_MASK, flags);
198 return fault; 198 return fault;
199 199
200check_stack: 200check_stack:
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 6fce7f096b88..830ed391e7ef 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -126,14 +126,6 @@ else
126 KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args) 126 KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args)
127endif 127endif
128 128
129# Make sure compiler does not have buggy stack-protector support.
130ifdef CONFIG_CC_STACKPROTECTOR
131 cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
132 ifneq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y)
133 $(warning stack-protector enabled but compiler support broken)
134 endif
135endif
136
137ifdef CONFIG_X86_X32 129ifdef CONFIG_X86_X32
138 x32_ld_ok := $(call try-run,\ 130 x32_ld_ok := $(call try-run,\
139 /bin/echo -e '1: .quad 1b' | \ 131 /bin/echo -e '1: .quad 1b' | \
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 574c23cf761a..b6d425999f99 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -81,7 +81,11 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
81static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) 81static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
82{ 82{
83 struct page *page; 83 struct page *page;
84 page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0); 84 gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO;
85
86 if (mm == &init_mm)
87 gfp &= ~__GFP_ACCOUNT;
88 page = alloc_pages(gfp, 0);
85 if (!page) 89 if (!page)
86 return NULL; 90 return NULL;
87 if (!pgtable_pmd_page_ctor(page)) { 91 if (!pgtable_pmd_page_ctor(page)) {
@@ -125,7 +129,11 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
125 129
126static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) 130static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
127{ 131{
128 return (pud_t *)get_zeroed_page(GFP_KERNEL); 132 gfp_t gfp = GFP_KERNEL_ACCOUNT;
133
134 if (mm == &init_mm)
135 gfp &= ~__GFP_ACCOUNT;
136 return (pud_t *)get_zeroed_page(gfp);
129} 137}
130 138
131static inline void pud_free(struct mm_struct *mm, pud_t *pud) 139static inline void pud_free(struct mm_struct *mm, pud_t *pud)
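Both x86 allocators above apply the same rule: user page tables are charged to the memory cgroup via GFP_KERNEL_ACCOUNT, while page tables for init_mm strip __GFP_ACCOUNT so kernel mappings are never charged. A sketch of just that decision, using placeholder flag values rather than the real kernel constants:

/* Hedged sketch; flag values are invented for illustration only. */
#define SKETCH_GFP_KERNEL	0x01u
#define SKETCH_GFP_ACCOUNT	0x02u
#define SKETCH_GFP_KERNEL_ACCOUNT (SKETCH_GFP_KERNEL | SKETCH_GFP_ACCOUNT)

static unsigned int pgtable_gfp(int is_init_mm)
{
	unsigned int gfp = SKETCH_GFP_KERNEL_ACCOUNT;

	if (is_init_mm)
		gfp &= ~SKETCH_GFP_ACCOUNT;	/* never charge kernel page tables */
	return gfp;
}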
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index d22161ab941d..dc8023060456 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1353,7 +1353,7 @@ good_area:
1353 * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if 1353 * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
1354 * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. 1354 * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
1355 */ 1355 */
1356 fault = handle_mm_fault(mm, vma, address, flags); 1356 fault = handle_mm_fault(vma, address, flags);
1357 major |= fault & VM_FAULT_MAJOR; 1357 major |= fault & VM_FAULT_MAJOR;
1358 1358
1359 /* 1359 /*
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index aa0ff4b02a96..3feec5af4e67 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -6,7 +6,7 @@
6#include <asm/fixmap.h> 6#include <asm/fixmap.h>
7#include <asm/mtrr.h> 7#include <asm/mtrr.h>
8 8
9#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO 9#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO)
10 10
11#ifdef CONFIG_HIGHPTE 11#ifdef CONFIG_HIGHPTE
12#define PGALLOC_USER_GFP __GFP_HIGHMEM 12#define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -18,7 +18,7 @@ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
18 18
19pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 19pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
20{ 20{
21 return (pte_t *)__get_free_page(PGALLOC_GFP); 21 return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT);
22} 22}
23 23
24pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) 24pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -207,9 +207,13 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
207{ 207{
208 int i; 208 int i;
209 bool failed = false; 209 bool failed = false;
210 gfp_t gfp = PGALLOC_GFP;
211
212 if (mm == &init_mm)
213 gfp &= ~__GFP_ACCOUNT;
210 214
211 for(i = 0; i < PREALLOCATED_PMDS; i++) { 215 for(i = 0; i < PREALLOCATED_PMDS; i++) {
212 pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP); 216 pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
213 if (!pmd) 217 if (!pmd)
214 failed = true; 218 failed = true;
215 if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { 219 if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
index 7f4a1fdb1502..2725e08ef353 100644
--- a/arch/xtensa/mm/fault.c
+++ b/arch/xtensa/mm/fault.c
@@ -110,7 +110,7 @@ good_area:
110 * make sure we exit gracefully rather than endlessly redo 110 * make sure we exit gracefully rather than endlessly redo
111 * the fault. 111 * the fault.
112 */ 112 */
113 fault = handle_mm_fault(mm, vma, address, flags); 113 fault = handle_mm_fault(vma, address, flags);
114 114
115 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) 115 if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
116 return; 116 return;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index f46dba8b7092..dc75de9059cd 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -391,6 +391,7 @@ static ssize_t show_valid_zones(struct device *dev,
391 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; 391 unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
392 struct page *first_page; 392 struct page *first_page;
393 struct zone *zone; 393 struct zone *zone;
394 int zone_shift = 0;
394 395
395 start_pfn = section_nr_to_pfn(mem->start_section_nr); 396 start_pfn = section_nr_to_pfn(mem->start_section_nr);
396 end_pfn = start_pfn + nr_pages; 397 end_pfn = start_pfn + nr_pages;
@@ -402,21 +403,26 @@ static ssize_t show_valid_zones(struct device *dev,
402 403
403 zone = page_zone(first_page); 404 zone = page_zone(first_page);
404 405
405 if (zone_idx(zone) == ZONE_MOVABLE - 1) { 406 /* MMOP_ONLINE_KEEP */
406 /*The mem block is the last memoryblock of this zone.*/ 407 sprintf(buf, "%s", zone->name);
407 if (end_pfn == zone_end_pfn(zone)) 408
408 return sprintf(buf, "%s %s\n", 409 /* MMOP_ONLINE_KERNEL */
409 zone->name, (zone + 1)->name); 410 zone_shift = zone_can_shift(start_pfn, nr_pages, ZONE_NORMAL);
411 if (zone_shift) {
412 strcat(buf, " ");
413 strcat(buf, (zone + zone_shift)->name);
410 } 414 }
411 415
412 if (zone_idx(zone) == ZONE_MOVABLE) { 416 /* MMOP_ONLINE_MOVABLE */
413 /*The mem block is the first memoryblock of ZONE_MOVABLE.*/ 417 zone_shift = zone_can_shift(start_pfn, nr_pages, ZONE_MOVABLE);
414 if (start_pfn == zone->zone_start_pfn) 418 if (zone_shift) {
415 return sprintf(buf, "%s %s\n", 419 strcat(buf, " ");
416 zone->name, (zone - 1)->name); 420 strcat(buf, (zone + zone_shift)->name);
417 } 421 }
418 422
419 return sprintf(buf, "%s\n", zone->name); 423 strcat(buf, "\n");
424
425 return strlen(buf);
420} 426}
421static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL); 427static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL);
422#endif 428#endif
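With the rewrite above, valid_zones reports the block's current zone first and then any zone it could shift into for MMOP_ONLINE_KERNEL or MMOP_ONLINE_MOVABLE. A small userspace reader of that attribute (the memory block number is an arbitrary example):

#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/sys/devices/system/memory/memory32/valid_zones", "r");

	if (!f)
		return 1;
	if (fgets(line, sizeof(line), f))
		printf("valid zones: %s", line);	/* e.g. "Normal Movable" */
	fclose(f);
	return 0;
}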
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 560751bad294..51c7db2c4ee2 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -113,6 +113,8 @@ static ssize_t node_read_meminfo(struct device *dev,
113 "Node %d SUnreclaim: %8lu kB\n" 113 "Node %d SUnreclaim: %8lu kB\n"
114#ifdef CONFIG_TRANSPARENT_HUGEPAGE 114#ifdef CONFIG_TRANSPARENT_HUGEPAGE
115 "Node %d AnonHugePages: %8lu kB\n" 115 "Node %d AnonHugePages: %8lu kB\n"
116 "Node %d ShmemHugePages: %8lu kB\n"
117 "Node %d ShmemPmdMapped: %8lu kB\n"
116#endif 118#endif
117 , 119 ,
118 nid, K(node_page_state(nid, NR_FILE_DIRTY)), 120 nid, K(node_page_state(nid, NR_FILE_DIRTY)),
@@ -131,10 +133,13 @@ static ssize_t node_read_meminfo(struct device *dev,
131 node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), 133 node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
132 nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)), 134 nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)),
133#ifdef CONFIG_TRANSPARENT_HUGEPAGE 135#ifdef CONFIG_TRANSPARENT_HUGEPAGE
134 nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)) 136 nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)),
135 , nid, 137 nid, K(node_page_state(nid, NR_ANON_THPS) *
136 K(node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) * 138 HPAGE_PMD_NR),
137 HPAGE_PMD_NR)); 139 nid, K(node_page_state(nid, NR_SHMEM_THPS) *
140 HPAGE_PMD_NR),
141 nid, K(node_page_state(nid, NR_SHMEM_PMDMAPPED) *
142 HPAGE_PMD_NR));
138#else 143#else
139 nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); 144 nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)));
140#endif 145#endif
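The per-node meminfo now exposes shmem THP usage alongside anonymous THP. A userspace snippet that picks out the new lines (the node number is an example):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/devices/system/node/node0/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "ShmemHugePages") ||
		    strstr(line, "ShmemPmdMapped"))
			fputs(line, stdout);
	fclose(f);
	return 0;
}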
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
index 386ba3d1a6ee..b8ecba6dcd3b 100644
--- a/drivers/block/zram/Kconfig
+++ b/drivers/block/zram/Kconfig
@@ -1,8 +1,7 @@
1config ZRAM 1config ZRAM
2 tristate "Compressed RAM block device support" 2 tristate "Compressed RAM block device support"
3 depends on BLOCK && SYSFS && ZSMALLOC 3 depends on BLOCK && SYSFS && ZSMALLOC && CRYPTO
4 select LZO_COMPRESS 4 select CRYPTO_LZO
5 select LZO_DECOMPRESS
6 default n 5 default n
7 help 6 help
8 Creates virtual block devices called /dev/zramX (X = 0, 1, ...). 7 Creates virtual block devices called /dev/zramX (X = 0, 1, ...).
@@ -14,13 +13,3 @@ config ZRAM
14 disks and maybe many more. 13 disks and maybe many more.
15 14
16 See zram.txt for more information. 15 See zram.txt for more information.
17
18config ZRAM_LZ4_COMPRESS
19 bool "Enable LZ4 algorithm support"
20 depends on ZRAM
21 select LZ4_COMPRESS
22 select LZ4_DECOMPRESS
23 default n
24 help
25 This option enables LZ4 compression algorithm support. Compression
26 algorithm can be changed using `comp_algorithm' device attribute. \ No newline at end of file
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
index be0763ff57a2..9e2b79e9a990 100644
--- a/drivers/block/zram/Makefile
+++ b/drivers/block/zram/Makefile
@@ -1,5 +1,3 @@
1zram-y := zcomp_lzo.o zcomp.o zram_drv.o 1zram-y := zcomp.o zram_drv.o
2
3zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o
4 2
5obj-$(CONFIG_ZRAM) += zram.o 3obj-$(CONFIG_ZRAM) += zram.o
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index b51a816d766b..4b5cd3a7b2b6 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -14,108 +14,150 @@
14#include <linux/wait.h> 14#include <linux/wait.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/crypto.h>
17 18
18#include "zcomp.h" 19#include "zcomp.h"
19#include "zcomp_lzo.h"
20#ifdef CONFIG_ZRAM_LZ4_COMPRESS
21#include "zcomp_lz4.h"
22#endif
23 20
24static struct zcomp_backend *backends[] = { 21static const char * const backends[] = {
25 &zcomp_lzo, 22 "lzo",
26#ifdef CONFIG_ZRAM_LZ4_COMPRESS 23#if IS_ENABLED(CONFIG_CRYPTO_LZ4)
27 &zcomp_lz4, 24 "lz4",
25#endif
26#if IS_ENABLED(CONFIG_CRYPTO_DEFLATE)
27 "deflate",
28#endif
29#if IS_ENABLED(CONFIG_CRYPTO_LZ4HC)
30 "lz4hc",
31#endif
32#if IS_ENABLED(CONFIG_CRYPTO_842)
33 "842",
28#endif 34#endif
29 NULL 35 NULL
30}; 36};
31 37
32static struct zcomp_backend *find_backend(const char *compress) 38static void zcomp_strm_free(struct zcomp_strm *zstrm)
33{
34 int i = 0;
35 while (backends[i]) {
36 if (sysfs_streq(compress, backends[i]->name))
37 break;
38 i++;
39 }
40 return backends[i];
41}
42
43static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm)
44{ 39{
45 if (zstrm->private) 40 if (!IS_ERR_OR_NULL(zstrm->tfm))
46 comp->backend->destroy(zstrm->private); 41 crypto_free_comp(zstrm->tfm);
47 free_pages((unsigned long)zstrm->buffer, 1); 42 free_pages((unsigned long)zstrm->buffer, 1);
48 kfree(zstrm); 43 kfree(zstrm);
49} 44}
50 45
51/* 46/*
52 * allocate new zcomp_strm structure with ->private initialized by 47 * allocate new zcomp_strm structure with ->tfm initialized by
53 * backend, return NULL on error 48 * backend, return NULL on error
54 */ 49 */
55static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) 50static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp)
56{ 51{
57 struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), flags); 52 struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL);
58 if (!zstrm) 53 if (!zstrm)
59 return NULL; 54 return NULL;
60 55
61 zstrm->private = comp->backend->create(flags); 56 zstrm->tfm = crypto_alloc_comp(comp->name, 0, 0);
62 /* 57 /*
63 * allocate 2 pages. 1 for compressed data, plus 1 extra for the 58 * allocate 2 pages. 1 for compressed data, plus 1 extra for the
64 * case when compressed size is larger than the original one 59 * case when compressed size is larger than the original one
65 */ 60 */
66 zstrm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1); 61 zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
67 if (!zstrm->private || !zstrm->buffer) { 62 if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) {
68 zcomp_strm_free(comp, zstrm); 63 zcomp_strm_free(zstrm);
69 zstrm = NULL; 64 zstrm = NULL;
70 } 65 }
71 return zstrm; 66 return zstrm;
72} 67}
73 68
69bool zcomp_available_algorithm(const char *comp)
70{
71 int i = 0;
72
73 while (backends[i]) {
74 if (sysfs_streq(comp, backends[i]))
75 return true;
76 i++;
77 }
78
79 /*
80 * Crypto does not ignore a trailing newline character,
81 * so make sure the string you supply does not contain
82 * one.
83 * This also means that we permit zcomp initialisation
84 * with any compression algorithm known to the crypto API.
85 */
86 return crypto_has_comp(comp, 0, 0) == 1;
87}
88
74/* show available compressors */ 89/* show available compressors */
75ssize_t zcomp_available_show(const char *comp, char *buf) 90ssize_t zcomp_available_show(const char *comp, char *buf)
76{ 91{
92 bool known_algorithm = false;
77 ssize_t sz = 0; 93 ssize_t sz = 0;
78 int i = 0; 94 int i = 0;
79 95
80 while (backends[i]) { 96 for (; backends[i]; i++) {
81 if (!strcmp(comp, backends[i]->name)) 97 if (!strcmp(comp, backends[i])) {
98 known_algorithm = true;
82 sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, 99 sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
83 "[%s] ", backends[i]->name); 100 "[%s] ", backends[i]);
84 else 101 } else {
85 sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, 102 sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
86 "%s ", backends[i]->name); 103 "%s ", backends[i]);
87 i++; 104 }
88 } 105 }
106
107 /*
108 * Out-of-tree module known to crypto api or a missing
109 * entry in `backends'.
110 */
111 if (!known_algorithm && crypto_has_comp(comp, 0, 0) == 1)
112 sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
113 "[%s] ", comp);
114
89 sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); 115 sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n");
90 return sz; 116 return sz;
91} 117}
92 118
93bool zcomp_available_algorithm(const char *comp) 119struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
94{
95 return find_backend(comp) != NULL;
96}
97
98struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
99{ 120{
100 return *get_cpu_ptr(comp->stream); 121 return *get_cpu_ptr(comp->stream);
101} 122}
102 123
103void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) 124void zcomp_stream_put(struct zcomp *comp)
104{ 125{
105 put_cpu_ptr(comp->stream); 126 put_cpu_ptr(comp->stream);
106} 127}
107 128
108int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, 129int zcomp_compress(struct zcomp_strm *zstrm,
109 const unsigned char *src, size_t *dst_len) 130 const void *src, unsigned int *dst_len)
110{ 131{
111 return comp->backend->compress(src, zstrm->buffer, dst_len, 132 /*
112 zstrm->private); 133 * Our dst memory (zstrm->buffer) is always `2 * PAGE_SIZE' sized
134 * because we can sometimes end up with compressed data that is bigger
135 * than the original, for various reasons: for example, compression
136 * algorithms tend to add some padding to the compressed buffer. Speaking
137 * of padding, comp algorithm `842' pads the compressed length to a
138 * multiple of 8 and returns -ENOSPC when the dst memory is not big
139 * enough, which is not something that ZRAM wants to see. We can handle
140 * the `compressed_size > PAGE_SIZE' case easily in ZRAM, but when we
141 * receive an -ERRNO from the compressing backend we can't help it
142 * anymore. To make `842' happy we need to tell it the exact size of
143 * the dst buffer; zram_drv will take care of the case where the
144 * compressed buffer is too big.
145 */
146 *dst_len = PAGE_SIZE * 2;
147
148 return crypto_comp_compress(zstrm->tfm,
149 src, PAGE_SIZE,
150 zstrm->buffer, dst_len);
113} 151}
114 152
115int zcomp_decompress(struct zcomp *comp, const unsigned char *src, 153int zcomp_decompress(struct zcomp_strm *zstrm,
116 size_t src_len, unsigned char *dst) 154 const void *src, unsigned int src_len, void *dst)
117{ 155{
118 return comp->backend->decompress(src, src_len, dst); 156 unsigned int dst_len = PAGE_SIZE;
157
158 return crypto_comp_decompress(zstrm->tfm,
159 src, src_len,
160 dst, &dst_len);
119} 161}
120 162
121static int __zcomp_cpu_notifier(struct zcomp *comp, 163static int __zcomp_cpu_notifier(struct zcomp *comp,
@@ -127,7 +169,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp,
127 case CPU_UP_PREPARE: 169 case CPU_UP_PREPARE:
128 if (WARN_ON(*per_cpu_ptr(comp->stream, cpu))) 170 if (WARN_ON(*per_cpu_ptr(comp->stream, cpu)))
129 break; 171 break;
130 zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); 172 zstrm = zcomp_strm_alloc(comp);
131 if (IS_ERR_OR_NULL(zstrm)) { 173 if (IS_ERR_OR_NULL(zstrm)) {
132 pr_err("Can't allocate a compression stream\n"); 174 pr_err("Can't allocate a compression stream\n");
133 return NOTIFY_BAD; 175 return NOTIFY_BAD;
@@ -138,7 +180,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp,
138 case CPU_UP_CANCELED: 180 case CPU_UP_CANCELED:
139 zstrm = *per_cpu_ptr(comp->stream, cpu); 181 zstrm = *per_cpu_ptr(comp->stream, cpu);
140 if (!IS_ERR_OR_NULL(zstrm)) 182 if (!IS_ERR_OR_NULL(zstrm))
141 zcomp_strm_free(comp, zstrm); 183 zcomp_strm_free(zstrm);
142 *per_cpu_ptr(comp->stream, cpu) = NULL; 184 *per_cpu_ptr(comp->stream, cpu) = NULL;
143 break; 185 break;
144 default: 186 default:
@@ -209,18 +251,16 @@ void zcomp_destroy(struct zcomp *comp)
209struct zcomp *zcomp_create(const char *compress) 251struct zcomp *zcomp_create(const char *compress)
210{ 252{
211 struct zcomp *comp; 253 struct zcomp *comp;
212 struct zcomp_backend *backend;
213 int error; 254 int error;
214 255
215 backend = find_backend(compress); 256 if (!zcomp_available_algorithm(compress))
216 if (!backend)
217 return ERR_PTR(-EINVAL); 257 return ERR_PTR(-EINVAL);
218 258
219 comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); 259 comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL);
220 if (!comp) 260 if (!comp)
221 return ERR_PTR(-ENOMEM); 261 return ERR_PTR(-ENOMEM);
222 262
223 comp->backend = backend; 263 comp->name = compress;
224 error = zcomp_init(comp); 264 error = zcomp_init(comp);
225 if (error) { 265 if (error) {
226 kfree(comp); 266 kfree(comp);
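zcomp now drives the crypto API's comp interface directly; crypto_alloc_comp(), crypto_comp_compress() and crypto_free_comp() seen above are the whole cycle. A trimmed kernel-context sketch of one compression pass, with error handling reduced to the minimum and "lzo" used as the example algorithm name:

/* Hedged sketch of the per-page cycle; not the zram data path itself. */
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/mm.h>

static int compress_one_page(const void *src, void *dst, unsigned int *dst_len)
{
	struct crypto_comp *tfm = crypto_alloc_comp("lzo", 0, 0);
	int ret;

	if (IS_ERR_OR_NULL(tfm))
		return -EINVAL;
	*dst_len = 2 * PAGE_SIZE;	/* same headroom as zstrm->buffer */
	ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, dst_len);
	crypto_free_comp(tfm);
	return ret;
}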
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
index ffd88cb747fe..478cac2ed465 100644
--- a/drivers/block/zram/zcomp.h
+++ b/drivers/block/zram/zcomp.h
@@ -13,33 +13,15 @@
13struct zcomp_strm { 13struct zcomp_strm {
14 /* compression/decompression buffer */ 14 /* compression/decompression buffer */
15 void *buffer; 15 void *buffer;
16 /* 16 struct crypto_comp *tfm;
17 * The private data of the compression stream, only compression
18 * stream backend can touch this (e.g. compression algorithm
19 * working memory)
20 */
21 void *private;
22};
23
24/* static compression backend */
25struct zcomp_backend {
26 int (*compress)(const unsigned char *src, unsigned char *dst,
27 size_t *dst_len, void *private);
28
29 int (*decompress)(const unsigned char *src, size_t src_len,
30 unsigned char *dst);
31
32 void *(*create)(gfp_t flags);
33 void (*destroy)(void *private);
34
35 const char *name;
36}; 17};
37 18
38/* dynamic per-device compression frontend */ 19/* dynamic per-device compression frontend */
39struct zcomp { 20struct zcomp {
40 struct zcomp_strm * __percpu *stream; 21 struct zcomp_strm * __percpu *stream;
41 struct zcomp_backend *backend;
42 struct notifier_block notifier; 22 struct notifier_block notifier;
23
24 const char *name;
43}; 25};
44 26
45ssize_t zcomp_available_show(const char *comp, char *buf); 27ssize_t zcomp_available_show(const char *comp, char *buf);
@@ -48,14 +30,14 @@ bool zcomp_available_algorithm(const char *comp);
48struct zcomp *zcomp_create(const char *comp); 30struct zcomp *zcomp_create(const char *comp);
49void zcomp_destroy(struct zcomp *comp); 31void zcomp_destroy(struct zcomp *comp);
50 32
51struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); 33struct zcomp_strm *zcomp_stream_get(struct zcomp *comp);
52void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm); 34void zcomp_stream_put(struct zcomp *comp);
53 35
54int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, 36int zcomp_compress(struct zcomp_strm *zstrm,
55 const unsigned char *src, size_t *dst_len); 37 const void *src, unsigned int *dst_len);
56 38
57int zcomp_decompress(struct zcomp *comp, const unsigned char *src, 39int zcomp_decompress(struct zcomp_strm *zstrm,
58 size_t src_len, unsigned char *dst); 40 const void *src, unsigned int src_len, void *dst);
59 41
60bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); 42bool zcomp_set_max_streams(struct zcomp *comp, int num_strm);
61#endif /* _ZCOMP_H_ */ 43#endif /* _ZCOMP_H_ */
diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c
deleted file mode 100644
index 0110086accba..000000000000
--- a/drivers/block/zram/zcomp_lz4.c
+++ /dev/null
@@ -1,56 +0,0 @@
1/*
2 * Copyright (C) 2014 Sergey Senozhatsky.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9
10#include <linux/kernel.h>
11#include <linux/slab.h>
12#include <linux/lz4.h>
13#include <linux/vmalloc.h>
14#include <linux/mm.h>
15
16#include "zcomp_lz4.h"
17
18static void *zcomp_lz4_create(gfp_t flags)
19{
20 void *ret;
21
22 ret = kmalloc(LZ4_MEM_COMPRESS, flags);
23 if (!ret)
24 ret = __vmalloc(LZ4_MEM_COMPRESS,
25 flags | __GFP_HIGHMEM,
26 PAGE_KERNEL);
27 return ret;
28}
29
30static void zcomp_lz4_destroy(void *private)
31{
32 kvfree(private);
33}
34
35static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst,
36 size_t *dst_len, void *private)
37{
38 /* return : Success if return 0 */
39 return lz4_compress(src, PAGE_SIZE, dst, dst_len, private);
40}
41
42static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len,
43 unsigned char *dst)
44{
45 size_t dst_len = PAGE_SIZE;
46 /* return : Success if return 0 */
47 return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len);
48}
49
50struct zcomp_backend zcomp_lz4 = {
51 .compress = zcomp_lz4_compress,
52 .decompress = zcomp_lz4_decompress,
53 .create = zcomp_lz4_create,
54 .destroy = zcomp_lz4_destroy,
55 .name = "lz4",
56};
diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h
deleted file mode 100644
index 60613fb29dd8..000000000000
--- a/drivers/block/zram/zcomp_lz4.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) 2014 Sergey Senozhatsky.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9
10#ifndef _ZCOMP_LZ4_H_
11#define _ZCOMP_LZ4_H_
12
13#include "zcomp.h"
14
15extern struct zcomp_backend zcomp_lz4;
16
17#endif /* _ZCOMP_LZ4_H_ */
diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c
deleted file mode 100644
index ed7a1f0549ec..000000000000
--- a/drivers/block/zram/zcomp_lzo.c
+++ /dev/null
@@ -1,56 +0,0 @@
1/*
2 * Copyright (C) 2014 Sergey Senozhatsky.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9
10#include <linux/kernel.h>
11#include <linux/slab.h>
12#include <linux/lzo.h>
13#include <linux/vmalloc.h>
14#include <linux/mm.h>
15
16#include "zcomp_lzo.h"
17
18static void *lzo_create(gfp_t flags)
19{
20 void *ret;
21
22 ret = kmalloc(LZO1X_MEM_COMPRESS, flags);
23 if (!ret)
24 ret = __vmalloc(LZO1X_MEM_COMPRESS,
25 flags | __GFP_HIGHMEM,
26 PAGE_KERNEL);
27 return ret;
28}
29
30static void lzo_destroy(void *private)
31{
32 kvfree(private);
33}
34
35static int lzo_compress(const unsigned char *src, unsigned char *dst,
36 size_t *dst_len, void *private)
37{
38 int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private);
39 return ret == LZO_E_OK ? 0 : ret;
40}
41
42static int lzo_decompress(const unsigned char *src, size_t src_len,
43 unsigned char *dst)
44{
45 size_t dst_len = PAGE_SIZE;
46 int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len);
47 return ret == LZO_E_OK ? 0 : ret;
48}
49
50struct zcomp_backend zcomp_lzo = {
51 .compress = lzo_compress,
52 .decompress = lzo_decompress,
53 .create = lzo_create,
54 .destroy = lzo_destroy,
55 .name = "lzo",
56};
diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h
deleted file mode 100644
index 128c5807fa14..000000000000
--- a/drivers/block/zram/zcomp_lzo.h
+++ /dev/null
@@ -1,17 +0,0 @@
1/*
2 * Copyright (C) 2014 Sergey Senozhatsky.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 */
9
10#ifndef _ZCOMP_LZO_H_
11#define _ZCOMP_LZO_H_
12
13#include "zcomp.h"
14
15extern struct zcomp_backend zcomp_lzo;
16
17#endif /* _ZCOMP_LZO_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index e5e5d19f2172..7454cf188c8e 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -342,9 +342,16 @@ static ssize_t comp_algorithm_store(struct device *dev,
342 struct device_attribute *attr, const char *buf, size_t len) 342 struct device_attribute *attr, const char *buf, size_t len)
343{ 343{
344 struct zram *zram = dev_to_zram(dev); 344 struct zram *zram = dev_to_zram(dev);
345 char compressor[CRYPTO_MAX_ALG_NAME];
345 size_t sz; 346 size_t sz;
346 347
347 if (!zcomp_available_algorithm(buf)) 348 strlcpy(compressor, buf, sizeof(compressor));
349 /* ignore trailing newline */
350 sz = strlen(compressor);
351 if (sz > 0 && compressor[sz - 1] == '\n')
352 compressor[sz - 1] = 0x00;
353
354 if (!zcomp_available_algorithm(compressor))
348 return -EINVAL; 355 return -EINVAL;
349 356
350 down_write(&zram->init_lock); 357 down_write(&zram->init_lock);
@@ -353,13 +360,8 @@ static ssize_t comp_algorithm_store(struct device *dev,
353 pr_info("Can't change algorithm for initialized device\n"); 360 pr_info("Can't change algorithm for initialized device\n");
354 return -EBUSY; 361 return -EBUSY;
355 } 362 }
356 strlcpy(zram->compressor, buf, sizeof(zram->compressor));
357
358 /* ignore trailing newline */
359 sz = strlen(zram->compressor);
360 if (sz > 0 && zram->compressor[sz - 1] == '\n')
361 zram->compressor[sz - 1] = 0x00;
362 363
364 strlcpy(zram->compressor, compressor, sizeof(compressor));
363 up_write(&zram->init_lock); 365 up_write(&zram->init_lock);
364 return len; 366 return len;
365} 367}
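Because the store handler now strips the trailing newline before the crypto lookup, a plain echo-style write of the algorithm name works. A userspace illustration (the device path is an example):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/block/zram0/comp_algorithm", O_WRONLY);

	if (fd < 0)
		return 1;
	/* trailing newline no longer causes -EINVAL */
	if (write(fd, "lz4\n", 4) != 4) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}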
@@ -563,7 +565,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
563 unsigned char *cmem; 565 unsigned char *cmem;
564 struct zram_meta *meta = zram->meta; 566 struct zram_meta *meta = zram->meta;
565 unsigned long handle; 567 unsigned long handle;
566 size_t size; 568 unsigned int size;
567 569
568 bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); 570 bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
569 handle = meta->table[index].handle; 571 handle = meta->table[index].handle;
@@ -576,10 +578,14 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
576 } 578 }
577 579
578 cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); 580 cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
579 if (size == PAGE_SIZE) 581 if (size == PAGE_SIZE) {
580 copy_page(mem, cmem); 582 copy_page(mem, cmem);
581 else 583 } else {
582 ret = zcomp_decompress(zram->comp, cmem, size, mem); 584 struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp);
585
586 ret = zcomp_decompress(zstrm, cmem, size, mem);
587 zcomp_stream_put(zram->comp);
588 }
583 zs_unmap_object(meta->mem_pool, handle); 589 zs_unmap_object(meta->mem_pool, handle);
584 bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); 590 bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
585 591
@@ -646,7 +652,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
646 int offset) 652 int offset)
647{ 653{
648 int ret = 0; 654 int ret = 0;
649 size_t clen; 655 unsigned int clen;
650 unsigned long handle = 0; 656 unsigned long handle = 0;
651 struct page *page; 657 struct page *page;
652 unsigned char *user_mem, *cmem, *src, *uncmem = NULL; 658 unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
@@ -695,8 +701,8 @@ compress_again:
695 goto out; 701 goto out;
696 } 702 }
697 703
698 zstrm = zcomp_strm_find(zram->comp); 704 zstrm = zcomp_stream_get(zram->comp);
699 ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); 705 ret = zcomp_compress(zstrm, uncmem, &clen);
700 if (!is_partial_io(bvec)) { 706 if (!is_partial_io(bvec)) {
701 kunmap_atomic(user_mem); 707 kunmap_atomic(user_mem);
702 user_mem = NULL; 708 user_mem = NULL;
@@ -732,19 +738,21 @@ compress_again:
732 handle = zs_malloc(meta->mem_pool, clen, 738 handle = zs_malloc(meta->mem_pool, clen,
733 __GFP_KSWAPD_RECLAIM | 739 __GFP_KSWAPD_RECLAIM |
734 __GFP_NOWARN | 740 __GFP_NOWARN |
735 __GFP_HIGHMEM); 741 __GFP_HIGHMEM |
742 __GFP_MOVABLE);
736 if (!handle) { 743 if (!handle) {
737 zcomp_strm_release(zram->comp, zstrm); 744 zcomp_stream_put(zram->comp);
738 zstrm = NULL; 745 zstrm = NULL;
739 746
740 atomic64_inc(&zram->stats.writestall); 747 atomic64_inc(&zram->stats.writestall);
741 748
742 handle = zs_malloc(meta->mem_pool, clen, 749 handle = zs_malloc(meta->mem_pool, clen,
743 GFP_NOIO | __GFP_HIGHMEM); 750 GFP_NOIO | __GFP_HIGHMEM |
751 __GFP_MOVABLE);
744 if (handle) 752 if (handle)
745 goto compress_again; 753 goto compress_again;
746 754
747 pr_err("Error allocating memory for compressed page: %u, size=%zu\n", 755 pr_err("Error allocating memory for compressed page: %u, size=%u\n",
748 index, clen); 756 index, clen);
749 ret = -ENOMEM; 757 ret = -ENOMEM;
750 goto out; 758 goto out;
@@ -769,7 +777,7 @@ compress_again:
769 memcpy(cmem, src, clen); 777 memcpy(cmem, src, clen);
770 } 778 }
771 779
772 zcomp_strm_release(zram->comp, zstrm); 780 zcomp_stream_put(zram->comp);
773 zstrm = NULL; 781 zstrm = NULL;
774 zs_unmap_object(meta->mem_pool, handle); 782 zs_unmap_object(meta->mem_pool, handle);
775 783
@@ -789,7 +797,7 @@ compress_again:
789 atomic64_inc(&zram->stats.pages_stored); 797 atomic64_inc(&zram->stats.pages_stored);
790out: 798out:
791 if (zstrm) 799 if (zstrm)
792 zcomp_strm_release(zram->comp, zstrm); 800 zcomp_stream_put(zram->comp);
793 if (is_partial_io(bvec)) 801 if (is_partial_io(bvec))
794 kfree(uncmem); 802 kfree(uncmem);
795 return ret; 803 return ret;
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 3f5bf66a27e4..74fcf10da374 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -15,8 +15,9 @@
15#ifndef _ZRAM_DRV_H_ 15#ifndef _ZRAM_DRV_H_
16#define _ZRAM_DRV_H_ 16#define _ZRAM_DRV_H_
17 17
18#include <linux/spinlock.h> 18#include <linux/rwsem.h>
19#include <linux/zsmalloc.h> 19#include <linux/zsmalloc.h>
20#include <linux/crypto.h>
20 21
21#include "zcomp.h" 22#include "zcomp.h"
22 23
@@ -113,7 +114,7 @@ struct zram {
113 * we can store in a disk. 114 * we can store in a disk.
114 */ 115 */
115 u64 disksize; /* bytes */ 116 u64 disksize; /* bytes */
116 char compressor[10]; 117 char compressor[CRYPTO_MAX_ALG_NAME];
117 /* 118 /*
118 * zram is claimed so open request will be failed 119 * zram is claimed so open request will be failed
119 */ 120 */
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index d633974e7f8b..a33163dbb913 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -22,6 +22,7 @@
22#include <linux/device.h> 22#include <linux/device.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/backing-dev.h> 24#include <linux/backing-dev.h>
25#include <linux/shmem_fs.h>
25#include <linux/splice.h> 26#include <linux/splice.h>
26#include <linux/pfn.h> 27#include <linux/pfn.h>
27#include <linux/export.h> 28#include <linux/export.h>
@@ -657,6 +658,28 @@ static int mmap_zero(struct file *file, struct vm_area_struct *vma)
657 return 0; 658 return 0;
658} 659}
659 660
661static unsigned long get_unmapped_area_zero(struct file *file,
662 unsigned long addr, unsigned long len,
663 unsigned long pgoff, unsigned long flags)
664{
665#ifdef CONFIG_MMU
666 if (flags & MAP_SHARED) {
667 /*
668 * mmap_zero() will call shmem_zero_setup() to create a file,
669 * so use shmem's get_unmapped_area in case it can be huge;
670 * and pass NULL for file as in mmap.c's get_unmapped_area(),
671 * so as not to confuse shmem with our handle on "/dev/zero".
672 */
673 return shmem_get_unmapped_area(NULL, addr, len, pgoff, flags);
674 }
675
676 /* Otherwise flags & MAP_PRIVATE: with no shmem object beneath it */
677 return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
678#else
679 return -ENOSYS;
680#endif
681}
682
660static ssize_t write_full(struct file *file, const char __user *buf, 683static ssize_t write_full(struct file *file, const char __user *buf,
661 size_t count, loff_t *ppos) 684 size_t count, loff_t *ppos)
662{ 685{
@@ -764,6 +787,7 @@ static const struct file_operations zero_fops = {
764 .read_iter = read_iter_zero, 787 .read_iter = read_iter_zero,
765 .write_iter = write_iter_zero, 788 .write_iter = write_iter_zero,
766 .mmap = mmap_zero, 789 .mmap = mmap_zero,
790 .get_unmapped_area = get_unmapped_area_zero,
767#ifndef CONFIG_MMU 791#ifndef CONFIG_MMU
768 .mmap_capabilities = zero_mmap_capabilities, 792 .mmap_capabilities = zero_mmap_capabilities,
769#endif 793#endif
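
Editor's note: with the new .get_unmapped_area hook, a MAP_SHARED mapping of /dev/zero is laid out by shmem's helper and can therefore be huge-page aligned when transparent huge pagecache is enabled. A user-space illustration of the path this affects (error handling trimmed):

    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/dev/zero", O_RDWR);
            /* MAP_SHARED goes through shmem_zero_setup(), so the address
             * chosen by get_unmapped_area_zero() may be PMD-aligned. */
            void *p = mmap(NULL, 4UL << 20, PROT_READ | PROT_WRITE,
                           MAP_SHARED, fd, 0);
            munmap(p, 4UL << 20);
            close(fd);
            return 0;
    }
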
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 56999d2fac07..fbdaf81ae925 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -538,8 +538,7 @@ static void do_fault(struct work_struct *work)
538 if (access_error(vma, fault)) 538 if (access_error(vma, fault))
539 goto out; 539 goto out;
540 540
541 ret = handle_mm_fault(mm, vma, address, flags); 541 ret = handle_mm_fault(vma, address, flags);
542
543out: 542out:
544 up_read(&mm->mmap_sem); 543 up_read(&mm->mmap_sem);
545 544
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index d9939fa9b588..8ebb3530afa7 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -583,7 +583,7 @@ static irqreturn_t prq_event_thread(int irq, void *d)
583 if (access_error(vma, req)) 583 if (access_error(vma, req))
584 goto invalid; 584 goto invalid;
585 585
586 ret = handle_mm_fault(svm->mm, vma, address, 586 ret = handle_mm_fault(vma, address,
587 req->wr_req ? FAULT_FLAG_WRITE : 0); 587 req->wr_req ? FAULT_FLAG_WRITE : 0);
588 if (ret & VM_FAULT_ERROR) 588 if (ret & VM_FAULT_ERROR)
589 goto invalid; 589 goto invalid;
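
Editor's note: both IOMMU drivers are adjusted for the new handle_mm_fault() signature, which drops the explicit mm argument; the mm is taken from vma->vm_mm instead. A minimal sketch of a caller under the new convention, with locking mirroring what the drivers above already do:

    #include <linux/mm.h>

    static int fault_in(struct mm_struct *mm, unsigned long address, bool write)
    {
            struct vm_area_struct *vma;
            int ret = VM_FAULT_SIGBUS;

            down_read(&mm->mmap_sem);
            vma = find_vma(mm, address);
            if (vma && vma->vm_start <= address)
                    /* mm is no longer passed explicitly */
                    ret = handle_mm_fault(vma, address,
                                          write ? FAULT_FLAG_WRITE : 0);
            up_read(&mm->mmap_sem);
            return ret;
    }
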
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index e5139402e7f8..52bbd27e93ae 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -363,6 +363,7 @@ static void moom_callback(struct work_struct *ignored)
363 struct oom_control oc = { 363 struct oom_control oc = {
364 .zonelist = node_zonelist(first_memory_node, gfp_mask), 364 .zonelist = node_zonelist(first_memory_node, gfp_mask),
365 .nodemask = NULL, 365 .nodemask = NULL,
366 .memcg = NULL,
366 .gfp_mask = gfp_mask, 367 .gfp_mask = gfp_mask,
367 .order = -1, 368 .order = -1,
368 }; 369 };
diff --git a/drivers/video/fbdev/core/fbmon.c b/drivers/video/fbdev/core/fbmon.c
index 47c3191ec313..62c0cf79674f 100644
--- a/drivers/video/fbdev/core/fbmon.c
+++ b/drivers/video/fbdev/core/fbmon.c
@@ -1496,7 +1496,6 @@ int fb_parse_edid(unsigned char *edid, struct fb_var_screeninfo *var)
1496} 1496}
1497void fb_edid_to_monspecs(unsigned char *edid, struct fb_monspecs *specs) 1497void fb_edid_to_monspecs(unsigned char *edid, struct fb_monspecs *specs)
1498{ 1498{
1499 specs = NULL;
1500} 1499}
1501void fb_edid_add_monspecs(unsigned char *edid, struct fb_monspecs *specs) 1500void fb_edid_add_monspecs(unsigned char *edid, struct fb_monspecs *specs)
1502{ 1501{
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 476c0e3a7150..888d5f8322ce 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -30,6 +30,7 @@
30#include <linux/oom.h> 30#include <linux/oom.h>
31#include <linux/wait.h> 31#include <linux/wait.h>
32#include <linux/mm.h> 32#include <linux/mm.h>
33#include <linux/mount.h>
33 34
34/* 35/*
35 * Balloon device works in 4K page units. So each page is pointed to by 36 * Balloon device works in 4K page units. So each page is pointed to by
@@ -45,6 +46,10 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
45module_param(oom_pages, int, S_IRUSR | S_IWUSR); 46module_param(oom_pages, int, S_IRUSR | S_IWUSR);
46MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); 47MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
47 48
49#ifdef CONFIG_BALLOON_COMPACTION
50static struct vfsmount *balloon_mnt;
51#endif
52
48struct virtio_balloon { 53struct virtio_balloon {
49 struct virtio_device *vdev; 54 struct virtio_device *vdev;
50 struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; 55 struct virtqueue *inflate_vq, *deflate_vq, *stats_vq;
@@ -490,6 +495,24 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info,
490 495
491 return MIGRATEPAGE_SUCCESS; 496 return MIGRATEPAGE_SUCCESS;
492} 497}
498
499static struct dentry *balloon_mount(struct file_system_type *fs_type,
500 int flags, const char *dev_name, void *data)
501{
502 static const struct dentry_operations ops = {
503 .d_dname = simple_dname,
504 };
505
506 return mount_pseudo(fs_type, "balloon-kvm:", NULL, &ops,
507 BALLOON_KVM_MAGIC);
508}
509
510static struct file_system_type balloon_fs = {
511 .name = "balloon-kvm",
512 .mount = balloon_mount,
513 .kill_sb = kill_anon_super,
514};
515
493#endif /* CONFIG_BALLOON_COMPACTION */ 516#endif /* CONFIG_BALLOON_COMPACTION */
494 517
495static int virtballoon_probe(struct virtio_device *vdev) 518static int virtballoon_probe(struct virtio_device *vdev)
@@ -519,9 +542,6 @@ static int virtballoon_probe(struct virtio_device *vdev)
519 vb->vdev = vdev; 542 vb->vdev = vdev;
520 543
521 balloon_devinfo_init(&vb->vb_dev_info); 544 balloon_devinfo_init(&vb->vb_dev_info);
522#ifdef CONFIG_BALLOON_COMPACTION
523 vb->vb_dev_info.migratepage = virtballoon_migratepage;
524#endif
525 545
526 err = init_vqs(vb); 546 err = init_vqs(vb);
527 if (err) 547 if (err)
@@ -531,13 +551,33 @@ static int virtballoon_probe(struct virtio_device *vdev)
531 vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY; 551 vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
532 err = register_oom_notifier(&vb->nb); 552 err = register_oom_notifier(&vb->nb);
533 if (err < 0) 553 if (err < 0)
534 goto out_oom_notify; 554 goto out_del_vqs;
555
556#ifdef CONFIG_BALLOON_COMPACTION
557 balloon_mnt = kern_mount(&balloon_fs);
558 if (IS_ERR(balloon_mnt)) {
559 err = PTR_ERR(balloon_mnt);
560 unregister_oom_notifier(&vb->nb);
561 goto out_del_vqs;
562 }
563
564 vb->vb_dev_info.migratepage = virtballoon_migratepage;
565 vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb);
566 if (IS_ERR(vb->vb_dev_info.inode)) {
567 err = PTR_ERR(vb->vb_dev_info.inode);
568 kern_unmount(balloon_mnt);
569 unregister_oom_notifier(&vb->nb);
570 vb->vb_dev_info.inode = NULL;
571 goto out_del_vqs;
572 }
573 vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
574#endif
535 575
536 virtio_device_ready(vdev); 576 virtio_device_ready(vdev);
537 577
538 return 0; 578 return 0;
539 579
540out_oom_notify: 580out_del_vqs:
541 vdev->config->del_vqs(vdev); 581 vdev->config->del_vqs(vdev);
542out_free_vb: 582out_free_vb:
543 kfree(vb); 583 kfree(vb);
@@ -571,6 +611,8 @@ static void virtballoon_remove(struct virtio_device *vdev)
571 cancel_work_sync(&vb->update_balloon_stats_work); 611 cancel_work_sync(&vb->update_balloon_stats_work);
572 612
573 remove_common(vb); 613 remove_common(vb);
614 if (vb->vb_dev_info.inode)
615 iput(vb->vb_dev_info.inode);
574 kfree(vb); 616 kfree(vb);
575} 617}
576 618
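
Editor's note: the balloon now backs its pages with an inode on a private pseudo filesystem so the generic migration code can treat them as movable via balloon_aops (declared in the balloon_compaction.h hunk at the end of this section). The pattern, stripped to its essentials and with hypothetical names (the real probe path above also interleaves OOM-notifier cleanup), is roughly:

    #include <linux/fs.h>
    #include <linux/mount.h>
    #include <linux/balloon_compaction.h>

    static struct vfsmount *example_mnt;

    static int example_setup(struct balloon_dev_info *info,
                             struct file_system_type *fs)
    {
            example_mnt = kern_mount(fs);
            if (IS_ERR(example_mnt))
                    return PTR_ERR(example_mnt);

            info->inode = alloc_anon_inode(example_mnt->mnt_sb);
            if (IS_ERR(info->inode)) {
                    kern_unmount(example_mnt);
                    return PTR_ERR(info->inode);
            }
            /* address_space ops drive isolate/migrate/putback for
             * the balloon's pages */
            info->inode->i_mapping->a_ops = &balloon_aops;
            return 0;
    }
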
diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c
index 53a085fca00c..66620713242a 100644
--- a/drivers/xen/xen-selfballoon.c
+++ b/drivers/xen/xen-selfballoon.c
@@ -195,7 +195,7 @@ static void selfballoon_process(struct work_struct *work)
195 MB2PAGES(selfballoon_reserved_mb); 195 MB2PAGES(selfballoon_reserved_mb);
196#ifdef CONFIG_FRONTSWAP 196#ifdef CONFIG_FRONTSWAP
197 /* allow space for frontswap pages to be repatriated */ 197 /* allow space for frontswap pages to be repatriated */
198 if (frontswap_selfshrinking && frontswap_enabled) 198 if (frontswap_selfshrinking)
199 goal_pages += frontswap_curr_pages(); 199 goal_pages += frontswap_curr_pages();
200#endif 200#endif
201 if (cur_pages > goal_pages) 201 if (cur_pages > goal_pages)
@@ -230,7 +230,7 @@ static void selfballoon_process(struct work_struct *work)
230 reset_timer = true; 230 reset_timer = true;
231 } 231 }
232#ifdef CONFIG_FRONTSWAP 232#ifdef CONFIG_FRONTSWAP
233 if (frontswap_selfshrinking && frontswap_enabled) { 233 if (frontswap_selfshrinking) {
234 frontswap_selfshrink(); 234 frontswap_selfshrink();
235 reset_timer = true; 235 reset_timer = true;
236 } 236 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 27c214941004..cee4cb99b8ce 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4178,7 +4178,8 @@ int extent_readpages(struct extent_io_tree *tree,
4178 prefetchw(&page->flags); 4178 prefetchw(&page->flags);
4179 list_del(&page->lru); 4179 list_del(&page->lru);
4180 if (add_to_page_cache_lru(page, mapping, 4180 if (add_to_page_cache_lru(page, mapping,
4181 page->index, GFP_NOFS)) { 4181 page->index,
4182 readahead_gfp_mask(mapping))) {
4182 put_page(page); 4183 put_page(page);
4183 continue; 4184 continue;
4184 } 4185 }
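
Editor's note: this and the similar cifs, ext4, f2fs, mpage and orangefs hunks below replace ad-hoc GFP_KERNEL/GFP_NOFS choices with a common readahead_gfp_mask() helper, so readahead allocations fail fast rather than applying heavy reclaim pressure. The helper itself is not part of these hunks; judging from the call sites it is presumably along these lines:

    /* Presumed shape of the helper (added to linux/pagemap.h by this
     * series): start from the mapping's own mask and make readahead
     * allocations best-effort. */
    static inline gfp_t readahead_gfp_mask(struct address_space *x)
    {
            return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN;
    }
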
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d4890b6dc22d..579e41b350a2 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3366,7 +3366,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
3366 struct page *page, *tpage; 3366 struct page *page, *tpage;
3367 unsigned int expected_index; 3367 unsigned int expected_index;
3368 int rc; 3368 int rc;
3369 gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); 3369 gfp_t gfp = readahead_gfp_mask(mapping);
3370 3370
3371 INIT_LIST_HEAD(tmplist); 3371 INIT_LIST_HEAD(tmplist);
3372 3372
diff --git a/fs/dax.c b/fs/dax.c
index e207f8f9b700..432b9e6dd63b 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -819,16 +819,16 @@ static int dax_insert_mapping(struct address_space *mapping,
819} 819}
820 820
821/** 821/**
822 * __dax_fault - handle a page fault on a DAX file 822 * dax_fault - handle a page fault on a DAX file
823 * @vma: The virtual memory area where the fault occurred 823 * @vma: The virtual memory area where the fault occurred
824 * @vmf: The description of the fault 824 * @vmf: The description of the fault
825 * @get_block: The filesystem method used to translate file offsets to blocks 825 * @get_block: The filesystem method used to translate file offsets to blocks
826 * 826 *
827 * When a page fault occurs, filesystems may call this helper in their 827 * When a page fault occurs, filesystems may call this helper in their
828 * fault handler for DAX files. __dax_fault() assumes the caller has done all 828 * fault handler for DAX files. dax_fault() assumes the caller has done all
829 * the necessary locking for the page fault to proceed successfully. 829 * the necessary locking for the page fault to proceed successfully.
830 */ 830 */
831int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, 831int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
832 get_block_t get_block) 832 get_block_t get_block)
833{ 833{
834 struct file *file = vma->vm_file; 834 struct file *file = vma->vm_file;
@@ -913,33 +913,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
913 return VM_FAULT_SIGBUS | major; 913 return VM_FAULT_SIGBUS | major;
914 return VM_FAULT_NOPAGE | major; 914 return VM_FAULT_NOPAGE | major;
915} 915}
916EXPORT_SYMBOL(__dax_fault);
917
918/**
919 * dax_fault - handle a page fault on a DAX file
920 * @vma: The virtual memory area where the fault occurred
921 * @vmf: The description of the fault
922 * @get_block: The filesystem method used to translate file offsets to blocks
923 *
924 * When a page fault occurs, filesystems may call this helper in their
925 * fault handler for DAX files.
926 */
927int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
928 get_block_t get_block)
929{
930 int result;
931 struct super_block *sb = file_inode(vma->vm_file)->i_sb;
932
933 if (vmf->flags & FAULT_FLAG_WRITE) {
934 sb_start_pagefault(sb);
935 file_update_time(vma->vm_file);
936 }
937 result = __dax_fault(vma, vmf, get_block);
938 if (vmf->flags & FAULT_FLAG_WRITE)
939 sb_end_pagefault(sb);
940
941 return result;
942}
943EXPORT_SYMBOL_GPL(dax_fault); 916EXPORT_SYMBOL_GPL(dax_fault);
944 917
945#if defined(CONFIG_TRANSPARENT_HUGEPAGE) 918#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
@@ -967,7 +940,16 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address,
967 940
968#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") 941#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd")
969 942
970int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, 943/**
944 * dax_pmd_fault - handle a PMD fault on a DAX file
945 * @vma: The virtual memory area where the fault occurred
946 * @vmf: The description of the fault
947 * @get_block: The filesystem method used to translate file offsets to blocks
948 *
949 * When a page fault occurs, filesystems may call this helper in their
950 * pmd_fault handler for DAX files.
951 */
952int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
971 pmd_t *pmd, unsigned int flags, get_block_t get_block) 953 pmd_t *pmd, unsigned int flags, get_block_t get_block)
972{ 954{
973 struct file *file = vma->vm_file; 955 struct file *file = vma->vm_file;
@@ -1119,7 +1101,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
1119 * 1101 *
1120 * The PMD path doesn't have an equivalent to 1102 * The PMD path doesn't have an equivalent to
1121 * dax_pfn_mkwrite(), though, so for a read followed by a 1103 * dax_pfn_mkwrite(), though, so for a read followed by a
1122 * write we traverse all the way through __dax_pmd_fault() 1104 * write we traverse all the way through dax_pmd_fault()
1123 * twice. This means we can just skip inserting a radix tree 1105 * twice. This means we can just skip inserting a radix tree
1124 * entry completely on the initial read and just wait until 1106 * entry completely on the initial read and just wait until
1125 * the write to insert a dirty entry. 1107 * the write to insert a dirty entry.
@@ -1148,33 +1130,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
1148 result = VM_FAULT_FALLBACK; 1130 result = VM_FAULT_FALLBACK;
1149 goto out; 1131 goto out;
1150} 1132}
1151EXPORT_SYMBOL_GPL(__dax_pmd_fault);
1152
1153/**
1154 * dax_pmd_fault - handle a PMD fault on a DAX file
1155 * @vma: The virtual memory area where the fault occurred
1156 * @vmf: The description of the fault
1157 * @get_block: The filesystem method used to translate file offsets to blocks
1158 *
1159 * When a page fault occurs, filesystems may call this helper in their
1160 * pmd_fault handler for DAX files.
1161 */
1162int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
1163 pmd_t *pmd, unsigned int flags, get_block_t get_block)
1164{
1165 int result;
1166 struct super_block *sb = file_inode(vma->vm_file)->i_sb;
1167
1168 if (flags & FAULT_FLAG_WRITE) {
1169 sb_start_pagefault(sb);
1170 file_update_time(vma->vm_file);
1171 }
1172 result = __dax_pmd_fault(vma, address, pmd, flags, get_block);
1173 if (flags & FAULT_FLAG_WRITE)
1174 sb_end_pagefault(sb);
1175
1176 return result;
1177}
1178EXPORT_SYMBOL_GPL(dax_pmd_fault); 1133EXPORT_SYMBOL_GPL(dax_pmd_fault);
1179#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 1134#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1180 1135
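
Editor's note: the __dax_fault()/dax_fault() and __dax_pmd_fault()/dax_pmd_fault() pairs collapse into single entry points; the sb_start_pagefault()/file_update_time() bookkeeping the removed wrappers used to do is now the caller's job, as the ext2, ext4 and xfs hunks below show. A minimal ->fault handler under the new contract ("my_get_block" is a placeholder for the filesystem's get_block_t) looks roughly like:

    static int my_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
    {
            struct super_block *sb = file_inode(vma->vm_file)->i_sb;
            int ret;

            if (vmf->flags & FAULT_FLAG_WRITE) {
                    sb_start_pagefault(sb);
                    file_update_time(vma->vm_file);
            }
            ret = dax_fault(vma, vmf, my_get_block);
            if (vmf->flags & FAULT_FLAG_WRITE)
                    sb_end_pagefault(sb);
            return ret;
    }
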
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 868c02317b05..5efeefe17abb 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
51 } 51 }
52 down_read(&ei->dax_sem); 52 down_read(&ei->dax_sem);
53 53
54 ret = __dax_fault(vma, vmf, ext2_get_block); 54 ret = dax_fault(vma, vmf, ext2_get_block);
55 55
56 up_read(&ei->dax_sem); 56 up_read(&ei->dax_sem);
57 if (vmf->flags & FAULT_FLAG_WRITE) 57 if (vmf->flags & FAULT_FLAG_WRITE)
@@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
72 } 72 }
73 down_read(&ei->dax_sem); 73 down_read(&ei->dax_sem);
74 74
75 ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); 75 ret = dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block);
76 76
77 up_read(&ei->dax_sem); 77 up_read(&ei->dax_sem);
78 if (flags & FAULT_FLAG_WRITE) 78 if (flags & FAULT_FLAG_WRITE)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 4f615cdd22ca..261ac3734c58 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -202,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
202 if (IS_ERR(handle)) 202 if (IS_ERR(handle))
203 result = VM_FAULT_SIGBUS; 203 result = VM_FAULT_SIGBUS;
204 else 204 else
205 result = __dax_fault(vma, vmf, ext4_dax_get_block); 205 result = dax_fault(vma, vmf, ext4_dax_get_block);
206 206
207 if (write) { 207 if (write) {
208 if (!IS_ERR(handle)) 208 if (!IS_ERR(handle))
@@ -237,7 +237,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
237 if (IS_ERR(handle)) 237 if (IS_ERR(handle))
238 result = VM_FAULT_SIGBUS; 238 result = VM_FAULT_SIGBUS;
239 else 239 else
240 result = __dax_pmd_fault(vma, addr, pmd, flags, 240 result = dax_pmd_fault(vma, addr, pmd, flags,
241 ext4_dax_get_block); 241 ext4_dax_get_block);
242 242
243 if (write) { 243 if (write) {
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index bfc7f4d30643..a81b829d56de 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -130,7 +130,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
130 page = list_entry(pages->prev, struct page, lru); 130 page = list_entry(pages->prev, struct page, lru);
131 list_del(&page->lru); 131 list_del(&page->lru);
132 if (add_to_page_cache_lru(page, mapping, page->index, 132 if (add_to_page_cache_lru(page, mapping, page->index,
133 mapping_gfp_constraint(mapping, GFP_KERNEL))) 133 readahead_gfp_mask(mapping)))
134 goto next_page; 134 goto next_page;
135 } 135 }
136 136
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 8769e8349dff..ded224518978 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1002,7 +1002,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
1002 page = list_entry(pages->prev, struct page, lru); 1002 page = list_entry(pages->prev, struct page, lru);
1003 list_del(&page->lru); 1003 list_del(&page->lru);
1004 if (add_to_page_cache_lru(page, mapping, 1004 if (add_to_page_cache_lru(page, mapping,
1005 page->index, GFP_KERNEL)) 1005 page->index,
1006 readahead_gfp_mask(mapping)))
1006 goto next_page; 1007 goto next_page;
1007 } 1008 }
1008 1009
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index fe7e83a45eff..6f9c9f6f5157 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -981,6 +981,42 @@ void inode_io_list_del(struct inode *inode)
981} 981}
982 982
983/* 983/*
984 * mark an inode as under writeback on the sb
985 */
986void sb_mark_inode_writeback(struct inode *inode)
987{
988 struct super_block *sb = inode->i_sb;
989 unsigned long flags;
990
991 if (list_empty(&inode->i_wb_list)) {
992 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
993 if (list_empty(&inode->i_wb_list)) {
994 list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
995 trace_sb_mark_inode_writeback(inode);
996 }
997 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
998 }
999}
1000
1001/*
1002 * clear an inode as under writeback on the sb
1003 */
1004void sb_clear_inode_writeback(struct inode *inode)
1005{
1006 struct super_block *sb = inode->i_sb;
1007 unsigned long flags;
1008
1009 if (!list_empty(&inode->i_wb_list)) {
1010 spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
1011 if (!list_empty(&inode->i_wb_list)) {
1012 list_del_init(&inode->i_wb_list);
1013 trace_sb_clear_inode_writeback(inode);
1014 }
1015 spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
1016 }
1017}
1018
1019/*
984 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the 1020 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
985 * furthest end of its superblock's dirty-inode list. 1021 * furthest end of its superblock's dirty-inode list.
986 * 1022 *
@@ -2154,7 +2190,7 @@ EXPORT_SYMBOL(__mark_inode_dirty);
2154 */ 2190 */
2155static void wait_sb_inodes(struct super_block *sb) 2191static void wait_sb_inodes(struct super_block *sb)
2156{ 2192{
2157 struct inode *inode, *old_inode = NULL; 2193 LIST_HEAD(sync_list);
2158 2194
2159 /* 2195 /*
2160 * We need to be protected against the filesystem going from 2196 * We need to be protected against the filesystem going from
@@ -2163,38 +2199,60 @@ static void wait_sb_inodes(struct super_block *sb)
2163 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 2199 WARN_ON(!rwsem_is_locked(&sb->s_umount));
2164 2200
2165 mutex_lock(&sb->s_sync_lock); 2201 mutex_lock(&sb->s_sync_lock);
2166 spin_lock(&sb->s_inode_list_lock);
2167 2202
2168 /* 2203 /*
2169 * Data integrity sync. Must wait for all pages under writeback, 2204 * Splice the writeback list onto a temporary list to avoid waiting on
2170 * because there may have been pages dirtied before our sync 2205 * inodes that have started writeback after this point.
2171 * call, but which had writeout started before we write it out. 2206 *
2172 * In which case, the inode may not be on the dirty list, but 2207 * Use rcu_read_lock() to keep the inodes around until we have a
2173 * we still have to wait for that writeout. 2208 * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as
2209 * the local list because inodes can be dropped from either by writeback
2210 * completion.
2211 */
2212 rcu_read_lock();
2213 spin_lock_irq(&sb->s_inode_wblist_lock);
2214 list_splice_init(&sb->s_inodes_wb, &sync_list);
2215
2216 /*
2217 * Data integrity sync. Must wait for all pages under writeback, because
2218 * there may have been pages dirtied before our sync call, but which had
2219 * writeout started before we write it out. In which case, the inode
2220 * may not be on the dirty list, but we still have to wait for that
2221 * writeout.
2174 */ 2222 */
2175 list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { 2223 while (!list_empty(&sync_list)) {
2224 struct inode *inode = list_first_entry(&sync_list, struct inode,
2225 i_wb_list);
2176 struct address_space *mapping = inode->i_mapping; 2226 struct address_space *mapping = inode->i_mapping;
2177 2227
2228 /*
2229 * Move each inode back to the wb list before we drop the lock
2230 * to preserve consistency between i_wb_list and the mapping
 2231 * writeback tag. Writeback completion is responsible for removing
2232 * the inode from either list once the writeback tag is cleared.
2233 */
2234 list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);
2235
2236 /*
2237 * The mapping can appear untagged while still on-list since we
2238 * do not have the mapping lock. Skip it here, wb completion
2239 * will remove it.
2240 */
2241 if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
2242 continue;
2243
2244 spin_unlock_irq(&sb->s_inode_wblist_lock);
2245
2178 spin_lock(&inode->i_lock); 2246 spin_lock(&inode->i_lock);
2179 if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || 2247 if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
2180 (mapping->nrpages == 0)) {
2181 spin_unlock(&inode->i_lock); 2248 spin_unlock(&inode->i_lock);
2249
2250 spin_lock_irq(&sb->s_inode_wblist_lock);
2182 continue; 2251 continue;
2183 } 2252 }
2184 __iget(inode); 2253 __iget(inode);
2185 spin_unlock(&inode->i_lock); 2254 spin_unlock(&inode->i_lock);
2186 spin_unlock(&sb->s_inode_list_lock); 2255 rcu_read_unlock();
2187
2188 /*
2189 * We hold a reference to 'inode' so it couldn't have been
2190 * removed from s_inodes list while we dropped the
2191 * s_inode_list_lock. We cannot iput the inode now as we can
2192 * be holding the last reference and we cannot iput it under
2193 * s_inode_list_lock. So we keep the reference and iput it
2194 * later.
2195 */
2196 iput(old_inode);
2197 old_inode = inode;
2198 2256
2199 /* 2257 /*
2200 * We keep the error status of individual mapping so that 2258 * We keep the error status of individual mapping so that
@@ -2205,10 +2263,13 @@ static void wait_sb_inodes(struct super_block *sb)
2205 2263
2206 cond_resched(); 2264 cond_resched();
2207 2265
2208 spin_lock(&sb->s_inode_list_lock); 2266 iput(inode);
2267
2268 rcu_read_lock();
2269 spin_lock_irq(&sb->s_inode_wblist_lock);
2209 } 2270 }
2210 spin_unlock(&sb->s_inode_list_lock); 2271 spin_unlock_irq(&sb->s_inode_wblist_lock);
2211 iput(old_inode); 2272 rcu_read_unlock();
2212 mutex_unlock(&sb->s_sync_lock); 2273 mutex_unlock(&sb->s_sync_lock);
2213} 2274}
2214 2275
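
Editor's note: wait_sb_inodes() now walks a dedicated per-superblock list of inodes that actually have pages under writeback (sb->s_inodes_wb) instead of every inode on the superblock, so sync no longer scans huge numbers of clean inodes. The new sb_mark_inode_writeback()/sb_clear_inode_writeback() helpers are intended to be driven from the points where a mapping gains and loses its PAGECACHE_TAG_WRITEBACK tag; those call sites live in mm/, outside this section, but the contract they imply is roughly:

    #include <linux/fs.h>

    /* Illustration only, not the real call sites: add the inode to the
     * sb writeback list when its mapping first becomes tagged, remove
     * it once the last page under writeback has completed. */
    static void example_writeback_started(struct inode *inode)
    {
            sb_mark_inode_writeback(inode);
    }

    static void example_writeback_done(struct inode *inode)
    {
            if (!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))
                    sb_clear_inode_writeback(inode);
    }
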
diff --git a/fs/inode.c b/fs/inode.c
index 4ccbc21b30ce..e171f7b5f9e4 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -365,6 +365,7 @@ void inode_init_once(struct inode *inode)
365 INIT_HLIST_NODE(&inode->i_hash); 365 INIT_HLIST_NODE(&inode->i_hash);
366 INIT_LIST_HEAD(&inode->i_devices); 366 INIT_LIST_HEAD(&inode->i_devices);
367 INIT_LIST_HEAD(&inode->i_io_list); 367 INIT_LIST_HEAD(&inode->i_io_list);
368 INIT_LIST_HEAD(&inode->i_wb_list);
368 INIT_LIST_HEAD(&inode->i_lru); 369 INIT_LIST_HEAD(&inode->i_lru);
369 address_space_init_once(&inode->i_data); 370 address_space_init_once(&inode->i_data);
370 i_size_ordered_init(inode); 371 i_size_ordered_init(inode);
@@ -507,6 +508,7 @@ void clear_inode(struct inode *inode)
507 BUG_ON(!list_empty(&inode->i_data.private_list)); 508 BUG_ON(!list_empty(&inode->i_data.private_list));
508 BUG_ON(!(inode->i_state & I_FREEING)); 509 BUG_ON(!(inode->i_state & I_FREEING));
509 BUG_ON(inode->i_state & I_CLEAR); 510 BUG_ON(inode->i_state & I_CLEAR);
511 BUG_ON(!list_empty(&inode->i_wb_list));
510 /* don't need i_lock here, no concurrent mods to i_state */ 512 /* don't need i_lock here, no concurrent mods to i_state */
511 inode->i_state = I_FREEING | I_CLEAR; 513 inode->i_state = I_FREEING | I_CLEAR;
512} 514}
diff --git a/fs/mpage.c b/fs/mpage.c
index 37b28280ad04..2ca1f39c8cba 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -72,6 +72,8 @@ mpage_alloc(struct block_device *bdev,
72{ 72{
73 struct bio *bio; 73 struct bio *bio;
74 74
75 /* Restrict the given (page cache) mask for slab allocations */
76 gfp_flags &= GFP_KERNEL;
75 bio = bio_alloc(gfp_flags, nr_vecs); 77 bio = bio_alloc(gfp_flags, nr_vecs);
76 78
77 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 79 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
@@ -363,7 +365,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
363 sector_t last_block_in_bio = 0; 365 sector_t last_block_in_bio = 0;
364 struct buffer_head map_bh; 366 struct buffer_head map_bh;
365 unsigned long first_logical_block = 0; 367 unsigned long first_logical_block = 0;
366 gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); 368 gfp_t gfp = readahead_gfp_mask(mapping);
367 369
368 map_bh.b_state = 0; 370 map_bh.b_state = 0;
369 map_bh.b_size = 0; 371 map_bh.b_size = 0;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 4238eb28889f..1d67fcbf7160 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1618,16 +1618,12 @@ static void o2net_start_connect(struct work_struct *work)
1618 1618
1619 /* watch for racing with tearing a node down */ 1619 /* watch for racing with tearing a node down */
1620 node = o2nm_get_node_by_num(o2net_num_from_nn(nn)); 1620 node = o2nm_get_node_by_num(o2net_num_from_nn(nn));
1621 if (node == NULL) { 1621 if (node == NULL)
1622 ret = 0;
1623 goto out; 1622 goto out;
1624 }
1625 1623
1626 mynode = o2nm_get_node_by_num(o2nm_this_node()); 1624 mynode = o2nm_get_node_by_num(o2nm_this_node());
1627 if (mynode == NULL) { 1625 if (mynode == NULL)
1628 ret = 0;
1629 goto out; 1626 goto out;
1630 }
1631 1627
1632 spin_lock(&nn->nn_lock); 1628 spin_lock(&nn->nn_lock);
1633 /* 1629 /*
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 825136070d2c..e7b760deefae 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -347,26 +347,6 @@ static struct dentry *dlm_debugfs_root;
347#define DLM_DEBUGFS_PURGE_LIST "purge_list" 347#define DLM_DEBUGFS_PURGE_LIST "purge_list"
348 348
349/* begin - utils funcs */ 349/* begin - utils funcs */
350static void dlm_debug_free(struct kref *kref)
351{
352 struct dlm_debug_ctxt *dc;
353
354 dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt);
355
356 kfree(dc);
357}
358
359static void dlm_debug_put(struct dlm_debug_ctxt *dc)
360{
361 if (dc)
362 kref_put(&dc->debug_refcnt, dlm_debug_free);
363}
364
365static void dlm_debug_get(struct dlm_debug_ctxt *dc)
366{
367 kref_get(&dc->debug_refcnt);
368}
369
370static int debug_release(struct inode *inode, struct file *file) 350static int debug_release(struct inode *inode, struct file *file)
371{ 351{
372 free_page((unsigned long)file->private_data); 352 free_page((unsigned long)file->private_data);
@@ -932,11 +912,9 @@ int dlm_debug_init(struct dlm_ctxt *dlm)
932 goto bail; 912 goto bail;
933 } 913 }
934 914
935 dlm_debug_get(dc);
936 return 0; 915 return 0;
937 916
938bail: 917bail:
939 dlm_debug_shutdown(dlm);
940 return -ENOMEM; 918 return -ENOMEM;
941} 919}
942 920
@@ -949,7 +927,8 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm)
949 debugfs_remove(dc->debug_mle_dentry); 927 debugfs_remove(dc->debug_mle_dentry);
950 debugfs_remove(dc->debug_lockres_dentry); 928 debugfs_remove(dc->debug_lockres_dentry);
951 debugfs_remove(dc->debug_state_dentry); 929 debugfs_remove(dc->debug_state_dentry);
952 dlm_debug_put(dc); 930 kfree(dc);
931 dc = NULL;
953 } 932 }
954} 933}
955 934
@@ -969,7 +948,6 @@ int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
969 mlog_errno(-ENOMEM); 948 mlog_errno(-ENOMEM);
970 goto bail; 949 goto bail;
971 } 950 }
972 kref_init(&dlm->dlm_debug_ctxt->debug_refcnt);
973 951
974 return 0; 952 return 0;
975bail: 953bail:
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
index 1f27c4812d1a..5ced5482e7d3 100644
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -30,7 +30,6 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle);
30#ifdef CONFIG_DEBUG_FS 30#ifdef CONFIG_DEBUG_FS
31 31
32struct dlm_debug_ctxt { 32struct dlm_debug_ctxt {
33 struct kref debug_refcnt;
34 struct dentry *debug_state_dentry; 33 struct dentry *debug_state_dentry;
35 struct dentry *debug_lockres_dentry; 34 struct dentry *debug_lockres_dentry;
36 struct dentry *debug_mle_dentry; 35 struct dentry *debug_mle_dentry;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1eaa9100c889..83d576f6a287 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1635,7 +1635,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1635 int ret; 1635 int ret;
1636 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1636 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1637 1637
1638 BUG_ON(!inode);
1639 BUG_ON(!ocfs2_inode_is_new(inode)); 1638 BUG_ON(!ocfs2_inode_is_new(inode));
1640 1639
1641 mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); 1640 mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -1665,10 +1664,8 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1665 } 1664 }
1666 1665
1667 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); 1666 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
1668 if (ret) { 1667 if (ret)
1669 mlog_errno(ret); 1668 mlog_errno(ret);
1670 goto bail;
1671 }
1672 1669
1673bail: 1670bail:
1674 return ret; 1671 return ret;
@@ -1680,8 +1677,6 @@ int ocfs2_rw_lock(struct inode *inode, int write)
1680 struct ocfs2_lock_res *lockres; 1677 struct ocfs2_lock_res *lockres;
1681 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1678 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1682 1679
1683 BUG_ON(!inode);
1684
1685 mlog(0, "inode %llu take %s RW lock\n", 1680 mlog(0, "inode %llu take %s RW lock\n",
1686 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1681 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1687 write ? "EXMODE" : "PRMODE"); 1682 write ? "EXMODE" : "PRMODE");
@@ -1724,8 +1719,6 @@ int ocfs2_open_lock(struct inode *inode)
1724 struct ocfs2_lock_res *lockres; 1719 struct ocfs2_lock_res *lockres;
1725 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1720 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1726 1721
1727 BUG_ON(!inode);
1728
1729 mlog(0, "inode %llu take PRMODE open lock\n", 1722 mlog(0, "inode %llu take PRMODE open lock\n",
1730 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1723 (unsigned long long)OCFS2_I(inode)->ip_blkno);
1731 1724
@@ -1749,8 +1742,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
1749 struct ocfs2_lock_res *lockres; 1742 struct ocfs2_lock_res *lockres;
1750 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1743 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1751 1744
1752 BUG_ON(!inode);
1753
1754 mlog(0, "inode %llu try to take %s open lock\n", 1745 mlog(0, "inode %llu try to take %s open lock\n",
1755 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1746 (unsigned long long)OCFS2_I(inode)->ip_blkno,
1756 write ? "EXMODE" : "PRMODE"); 1747 write ? "EXMODE" : "PRMODE");
@@ -2328,8 +2319,6 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
2328 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2319 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2329 struct buffer_head *local_bh = NULL; 2320 struct buffer_head *local_bh = NULL;
2330 2321
2331 BUG_ON(!inode);
2332
2333 mlog(0, "inode %llu, take %s META lock\n", 2322 mlog(0, "inode %llu, take %s META lock\n",
2334 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2323 (unsigned long long)OCFS2_I(inode)->ip_blkno,
2335 ex ? "EXMODE" : "PRMODE"); 2324 ex ? "EXMODE" : "PRMODE");
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index d8f3fc8d2551..50cc55047443 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -145,22 +145,15 @@ int ocfs2_drop_inode(struct inode *inode);
145struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); 145struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff);
146struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, 146struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
147 int sysfile_type); 147 int sysfile_type);
148int ocfs2_inode_init_private(struct inode *inode);
149int ocfs2_inode_revalidate(struct dentry *dentry); 148int ocfs2_inode_revalidate(struct dentry *dentry);
150void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 149void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
151 int create_ino); 150 int create_ino);
152void ocfs2_read_inode(struct inode *inode);
153void ocfs2_read_inode2(struct inode *inode, void *opaque);
154ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
155 size_t size, loff_t *offp);
156void ocfs2_sync_blockdev(struct super_block *sb); 151void ocfs2_sync_blockdev(struct super_block *sb);
157void ocfs2_refresh_inode(struct inode *inode, 152void ocfs2_refresh_inode(struct inode *inode,
158 struct ocfs2_dinode *fe); 153 struct ocfs2_dinode *fe);
159int ocfs2_mark_inode_dirty(handle_t *handle, 154int ocfs2_mark_inode_dirty(handle_t *handle,
160 struct inode *inode, 155 struct inode *inode,
161 struct buffer_head *bh); 156 struct buffer_head *bh);
162struct buffer_head *ocfs2_bread(struct inode *inode,
163 int block, int *err, int reada);
164 157
165void ocfs2_set_inode_flags(struct inode *inode); 158void ocfs2_set_inode_flags(struct inode *inode);
166void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); 159void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index e607419cdfa4..a244f14c6b87 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1159,10 +1159,8 @@ static int ocfs2_force_read_journal(struct inode *inode)
1159 int status = 0; 1159 int status = 0;
1160 int i; 1160 int i;
1161 u64 v_blkno, p_blkno, p_blocks, num_blocks; 1161 u64 v_blkno, p_blkno, p_blocks, num_blocks;
1162#define CONCURRENT_JOURNAL_FILL 32ULL 1162 struct buffer_head *bh = NULL;
1163 struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; 1163 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1164
1165 memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
1166 1164
1167 num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 1165 num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
1168 v_blkno = 0; 1166 v_blkno = 0;
@@ -1174,29 +1172,32 @@ static int ocfs2_force_read_journal(struct inode *inode)
1174 goto bail; 1172 goto bail;
1175 } 1173 }
1176 1174
1177 if (p_blocks > CONCURRENT_JOURNAL_FILL) 1175 for (i = 0; i < p_blocks; i++, p_blkno++) {
1178 p_blocks = CONCURRENT_JOURNAL_FILL; 1176 bh = __find_get_block(osb->sb->s_bdev, p_blkno,
1179 1177 osb->sb->s_blocksize);
1180 /* We are reading journal data which should not 1178 /* block not cached. */
1181 * be put in the uptodate cache */ 1179 if (!bh)
1182 status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb), 1180 continue;
1183 p_blkno, p_blocks, bhs); 1181
1184 if (status < 0) { 1182 brelse(bh);
1185 mlog_errno(status); 1183 bh = NULL;
1186 goto bail; 1184 /* We are reading journal data which should not
1187 } 1185 * be put in the uptodate cache.
1186 */
1187 status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh);
1188 if (status < 0) {
1189 mlog_errno(status);
1190 goto bail;
1191 }
1188 1192
1189 for(i = 0; i < p_blocks; i++) { 1193 brelse(bh);
1190 brelse(bhs[i]); 1194 bh = NULL;
1191 bhs[i] = NULL;
1192 } 1195 }
1193 1196
1194 v_blkno += p_blocks; 1197 v_blkno += p_blocks;
1195 } 1198 }
1196 1199
1197bail: 1200bail:
1198 for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
1199 brelse(bhs[i]);
1200 return status; 1201 return status;
1201} 1202}
1202 1203
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 13219ed73e1d..52c07346bea3 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -735,8 +735,6 @@ static void __exit ocfs2_stack_glue_exit(void)
735{ 735{
736 memset(&locking_max_version, 0, 736 memset(&locking_max_version, 0,
737 sizeof(struct ocfs2_protocol_version)); 737 sizeof(struct ocfs2_protocol_version));
738 locking_max_version.pv_major = 0;
739 locking_max_version.pv_minor = 0;
740 ocfs2_sysfs_exit(); 738 ocfs2_sysfs_exit();
741 if (ocfs2_table_header) 739 if (ocfs2_table_header)
742 unregister_sysctl_table(ocfs2_table_header); 740 unregister_sysctl_table(ocfs2_table_header);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3971146228d3..603b28d6f008 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2072,7 +2072,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
2072 osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); 2072 osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash);
2073 2073
2074 osb->sb = sb; 2074 osb->sb = sb;
2075 /* Save off for ocfs2_rw_direct */
2076 osb->s_sectsize_bits = blksize_bits(sector_size); 2075 osb->s_sectsize_bits = blksize_bits(sector_size);
2077 BUG_ON(!osb->s_sectsize_bits); 2076 BUG_ON(!osb->s_sectsize_bits);
2078 2077
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index a44caabb0fc2..8f2fa94cc4f6 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -80,7 +80,7 @@ static int orangefs_readpages(struct file *file,
80 if (!add_to_page_cache(page, 80 if (!add_to_page_cache(page,
81 mapping, 81 mapping,
82 page->index, 82 page->index,
83 GFP_KERNEL)) { 83 readahead_gfp_mask(mapping))) {
84 ret = read_one_page(page); 84 ret = read_one_page(page);
85 gossip_debug(GOSSIP_INODE_DEBUG, 85 gossip_debug(GOSSIP_INODE_DEBUG,
86 "failure adding page to cache, read_one_page returned: %d\n", 86 "failure adding page to cache, read_one_page returned: %d\n",
diff --git a/fs/pipe.c b/fs/pipe.c
index 0d3f5165cb0b..4b32928f5426 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,6 +21,7 @@
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/syscalls.h> 22#include <linux/syscalls.h>
23#include <linux/fcntl.h> 23#include <linux/fcntl.h>
24#include <linux/memcontrol.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include <asm/ioctls.h> 27#include <asm/ioctls.h>
@@ -137,6 +138,22 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
137 put_page(page); 138 put_page(page);
138} 139}
139 140
141static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
142 struct pipe_buffer *buf)
143{
144 struct page *page = buf->page;
145
146 if (page_count(page) == 1) {
147 if (memcg_kmem_enabled()) {
148 memcg_kmem_uncharge(page, 0);
149 __ClearPageKmemcg(page);
150 }
151 __SetPageLocked(page);
152 return 0;
153 }
154 return 1;
155}
156
140/** 157/**
141 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer 158 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
142 * @pipe: the pipe that the buffer belongs to 159 * @pipe: the pipe that the buffer belongs to
@@ -219,7 +236,7 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
219 .can_merge = 1, 236 .can_merge = 1,
220 .confirm = generic_pipe_buf_confirm, 237 .confirm = generic_pipe_buf_confirm,
221 .release = anon_pipe_buf_release, 238 .release = anon_pipe_buf_release,
222 .steal = generic_pipe_buf_steal, 239 .steal = anon_pipe_buf_steal,
223 .get = generic_pipe_buf_get, 240 .get = generic_pipe_buf_get,
224}; 241};
225 242
@@ -227,7 +244,7 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = {
227 .can_merge = 0, 244 .can_merge = 0,
228 .confirm = generic_pipe_buf_confirm, 245 .confirm = generic_pipe_buf_confirm,
229 .release = anon_pipe_buf_release, 246 .release = anon_pipe_buf_release,
230 .steal = generic_pipe_buf_steal, 247 .steal = anon_pipe_buf_steal,
231 .get = generic_pipe_buf_get, 248 .get = generic_pipe_buf_get,
232}; 249};
233 250
@@ -405,7 +422,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
405 int copied; 422 int copied;
406 423
407 if (!page) { 424 if (!page) {
408 page = alloc_page(GFP_HIGHUSER); 425 page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
409 if (unlikely(!page)) { 426 if (unlikely(!page)) {
410 ret = ret ? : -ENOMEM; 427 ret = ret ? : -ENOMEM;
411 break; 428 break;
@@ -611,7 +628,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
611{ 628{
612 struct pipe_inode_info *pipe; 629 struct pipe_inode_info *pipe;
613 630
614 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); 631 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
615 if (pipe) { 632 if (pipe) {
616 unsigned long pipe_bufs = PIPE_DEF_BUFFERS; 633 unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
617 struct user_struct *user = get_current_user(); 634 struct user_struct *user = get_current_user();
@@ -619,7 +636,9 @@ struct pipe_inode_info *alloc_pipe_info(void)
619 if (!too_many_pipe_buffers_hard(user)) { 636 if (!too_many_pipe_buffers_hard(user)) {
620 if (too_many_pipe_buffers_soft(user)) 637 if (too_many_pipe_buffers_soft(user))
621 pipe_bufs = 1; 638 pipe_bufs = 1;
622 pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL); 639 pipe->bufs = kcalloc(pipe_bufs,
640 sizeof(struct pipe_buffer),
641 GFP_KERNEL_ACCOUNT);
623 } 642 }
624 643
625 if (pipe->bufs) { 644 if (pipe->bufs) {
@@ -1010,7 +1029,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
1010 if (nr_pages < pipe->nrbufs) 1029 if (nr_pages < pipe->nrbufs)
1011 return -EBUSY; 1030 return -EBUSY;
1012 1031
1013 bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN); 1032 bufs = kcalloc(nr_pages, sizeof(*bufs),
1033 GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
1014 if (unlikely(!bufs)) 1034 if (unlikely(!bufs))
1015 return -ENOMEM; 1035 return -ENOMEM;
1016 1036
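
Editor's note: pipe buffers and their data pages are now charged to the allocating task's memory cgroup: the kernel-side metadata uses GFP_KERNEL_ACCOUNT (which, as I understand it, is GFP_KERNEL | __GFP_ACCOUNT) and data pages add __GFP_ACCOUNT explicitly, while anon_pipe_buf_steal() uncharges a page before ownership moves to the page cache so it is not accounted twice. The allocation side of that pattern in isolation:

    #include <linux/gfp.h>
    #include <linux/slab.h>

    /* Sketch: both allocations are attributed to the caller's memcg and
     * count against its kernel-memory limit; they are uncharged
     * automatically when freed. */
    static void *accounted_alloc_example(void)
    {
            struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);

            if (page)
                    __free_page(page);

            return kzalloc(64, GFP_KERNEL_ACCOUNT);
    }
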
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 83720460c5bc..cf301a9ef512 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -105,6 +105,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
105#endif 105#endif
106#ifdef CONFIG_TRANSPARENT_HUGEPAGE 106#ifdef CONFIG_TRANSPARENT_HUGEPAGE
107 "AnonHugePages: %8lu kB\n" 107 "AnonHugePages: %8lu kB\n"
108 "ShmemHugePages: %8lu kB\n"
109 "ShmemPmdMapped: %8lu kB\n"
108#endif 110#endif
109#ifdef CONFIG_CMA 111#ifdef CONFIG_CMA
110 "CmaTotal: %8lu kB\n" 112 "CmaTotal: %8lu kB\n"
@@ -162,8 +164,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
162 , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) 164 , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
163#endif 165#endif
164#ifdef CONFIG_TRANSPARENT_HUGEPAGE 166#ifdef CONFIG_TRANSPARENT_HUGEPAGE
165 , K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * 167 , K(global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
166 HPAGE_PMD_NR) 168 , K(global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
169 , K(global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
167#endif 170#endif
168#ifdef CONFIG_CMA 171#ifdef CONFIG_CMA
169 , K(totalcma_pages) 172 , K(totalcma_pages)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4648c7f63ae2..187d84ef9de9 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -448,6 +448,7 @@ struct mem_size_stats {
448 unsigned long referenced; 448 unsigned long referenced;
449 unsigned long anonymous; 449 unsigned long anonymous;
450 unsigned long anonymous_thp; 450 unsigned long anonymous_thp;
451 unsigned long shmem_thp;
451 unsigned long swap; 452 unsigned long swap;
452 unsigned long shared_hugetlb; 453 unsigned long shared_hugetlb;
453 unsigned long private_hugetlb; 454 unsigned long private_hugetlb;
@@ -576,7 +577,12 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
576 page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); 577 page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
577 if (IS_ERR_OR_NULL(page)) 578 if (IS_ERR_OR_NULL(page))
578 return; 579 return;
579 mss->anonymous_thp += HPAGE_PMD_SIZE; 580 if (PageAnon(page))
581 mss->anonymous_thp += HPAGE_PMD_SIZE;
582 else if (PageSwapBacked(page))
583 mss->shmem_thp += HPAGE_PMD_SIZE;
584 else
585 VM_BUG_ON_PAGE(1, page);
580 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); 586 smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
581} 587}
582#else 588#else
@@ -770,6 +776,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
770 "Referenced: %8lu kB\n" 776 "Referenced: %8lu kB\n"
771 "Anonymous: %8lu kB\n" 777 "Anonymous: %8lu kB\n"
772 "AnonHugePages: %8lu kB\n" 778 "AnonHugePages: %8lu kB\n"
779 "ShmemPmdMapped: %8lu kB\n"
773 "Shared_Hugetlb: %8lu kB\n" 780 "Shared_Hugetlb: %8lu kB\n"
774 "Private_Hugetlb: %7lu kB\n" 781 "Private_Hugetlb: %7lu kB\n"
775 "Swap: %8lu kB\n" 782 "Swap: %8lu kB\n"
@@ -787,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
787 mss.referenced >> 10, 794 mss.referenced >> 10,
788 mss.anonymous >> 10, 795 mss.anonymous >> 10,
789 mss.anonymous_thp >> 10, 796 mss.anonymous_thp >> 10,
797 mss.shmem_thp >> 10,
790 mss.shared_hugetlb >> 10, 798 mss.shared_hugetlb >> 10,
791 mss.private_hugetlb >> 10, 799 mss.private_hugetlb >> 10,
792 mss.swap >> 10, 800 mss.swap >> 10,
diff --git a/fs/super.c b/fs/super.c
index d78b9847e6cb..5806ffd45563 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -206,6 +206,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
206 mutex_init(&s->s_sync_lock); 206 mutex_init(&s->s_sync_lock);
207 INIT_LIST_HEAD(&s->s_inodes); 207 INIT_LIST_HEAD(&s->s_inodes);
208 spin_lock_init(&s->s_inode_list_lock); 208 spin_lock_init(&s->s_inode_list_lock);
209 INIT_LIST_HEAD(&s->s_inodes_wb);
210 spin_lock_init(&s->s_inode_wblist_lock);
209 211
210 if (list_lru_init_memcg(&s->s_dentry_lru)) 212 if (list_lru_init_memcg(&s->s_dentry_lru))
211 goto fail; 213 goto fail;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 2d97952e341a..85959d8324df 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -257,10 +257,9 @@ out:
257 * fatal_signal_pending()s, and the mmap_sem must be released before 257 * fatal_signal_pending()s, and the mmap_sem must be released before
258 * returning it. 258 * returning it.
259 */ 259 */
260int handle_userfault(struct vm_area_struct *vma, unsigned long address, 260int handle_userfault(struct fault_env *fe, unsigned long reason)
261 unsigned int flags, unsigned long reason)
262{ 261{
263 struct mm_struct *mm = vma->vm_mm; 262 struct mm_struct *mm = fe->vma->vm_mm;
264 struct userfaultfd_ctx *ctx; 263 struct userfaultfd_ctx *ctx;
265 struct userfaultfd_wait_queue uwq; 264 struct userfaultfd_wait_queue uwq;
266 int ret; 265 int ret;
@@ -269,7 +268,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
269 BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 268 BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
270 269
271 ret = VM_FAULT_SIGBUS; 270 ret = VM_FAULT_SIGBUS;
272 ctx = vma->vm_userfaultfd_ctx.ctx; 271 ctx = fe->vma->vm_userfaultfd_ctx.ctx;
273 if (!ctx) 272 if (!ctx)
274 goto out; 273 goto out;
275 274
@@ -302,17 +301,17 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
302 * without first stopping userland access to the memory. For 301 * without first stopping userland access to the memory. For
303 * VM_UFFD_MISSING userfaults this is enough for now. 302 * VM_UFFD_MISSING userfaults this is enough for now.
304 */ 303 */
305 if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) { 304 if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) {
306 /* 305 /*
307 * Validate the invariant that nowait must allow retry 306 * Validate the invariant that nowait must allow retry
308 * to be sure not to return SIGBUS erroneously on 307 * to be sure not to return SIGBUS erroneously on
309 * nowait invocations. 308 * nowait invocations.
310 */ 309 */
311 BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT); 310 BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT);
312#ifdef CONFIG_DEBUG_VM 311#ifdef CONFIG_DEBUG_VM
313 if (printk_ratelimit()) { 312 if (printk_ratelimit()) {
314 printk(KERN_WARNING 313 printk(KERN_WARNING
315 "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags); 314 "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags);
316 dump_stack(); 315 dump_stack();
317 } 316 }
318#endif 317#endif
@@ -324,7 +323,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
324 * and wait. 323 * and wait.
325 */ 324 */
326 ret = VM_FAULT_RETRY; 325 ret = VM_FAULT_RETRY;
327 if (flags & FAULT_FLAG_RETRY_NOWAIT) 326 if (fe->flags & FAULT_FLAG_RETRY_NOWAIT)
328 goto out; 327 goto out;
329 328
330 /* take the reference before dropping the mmap_sem */ 329 /* take the reference before dropping the mmap_sem */
@@ -332,10 +331,11 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
332 331
333 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); 332 init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
334 uwq.wq.private = current; 333 uwq.wq.private = current;
335 uwq.msg = userfault_msg(address, flags, reason); 334 uwq.msg = userfault_msg(fe->address, fe->flags, reason);
336 uwq.ctx = ctx; 335 uwq.ctx = ctx;
337 336
338 return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == 337 return_to_userland =
338 (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) ==
339 (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); 339 (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE);
340 340
341 spin_lock(&ctx->fault_pending_wqh.lock); 341 spin_lock(&ctx->fault_pending_wqh.lock);
@@ -353,7 +353,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
353 TASK_KILLABLE); 353 TASK_KILLABLE);
354 spin_unlock(&ctx->fault_pending_wqh.lock); 354 spin_unlock(&ctx->fault_pending_wqh.lock);
355 355
356 must_wait = userfaultfd_must_wait(ctx, address, flags, reason); 356 must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason);
357 up_read(&mm->mmap_sem); 357 up_read(&mm->mmap_sem);
358 358
359 if (likely(must_wait && !ACCESS_ONCE(ctx->released) && 359 if (likely(must_wait && !ACCESS_ONCE(ctx->released) &&
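
Editor's note: handle_userfault() now receives the fault state bundled in a struct fault_env instead of separate vma/address/flags arguments. Only the vma, address and flags members are visible in this hunk; per my reading of the series, the real structure also bundles the page-table walk state used elsewhere in the fault path. A hedged sketch of the shape implied here:

    #include <linux/mm_types.h>

    /* Fields evidenced by the hunk above; the actual struct fault_env
     * (introduced by the fault-path rework in this series) is believed
     * to also carry pmd/pte/ptl state for the generic fault code. */
    struct fault_env_sketch {
            struct vm_area_struct *vma;     /* fe->vma */
            unsigned long address;          /* fe->address */
            unsigned int flags;             /* fe->flags, FAULT_FLAG_* */
    };
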
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 47fc63295422..1b3dc9dd8861 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1551,7 +1551,7 @@ xfs_filemap_page_mkwrite(
1551 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1551 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1552 1552
1553 if (IS_DAX(inode)) { 1553 if (IS_DAX(inode)) {
1554 ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); 1554 ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
1555 } else { 1555 } else {
1556 ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); 1556 ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
1557 ret = block_page_mkwrite_return(ret); 1557 ret = block_page_mkwrite_return(ret);
@@ -1585,7 +1585,7 @@ xfs_filemap_fault(
1585 * changes to xfs_get_blocks_direct() to map unwritten extent 1585 * changes to xfs_get_blocks_direct() to map unwritten extent
1586 * ioend for conversion on read-only mappings. 1586 * ioend for conversion on read-only mappings.
1587 */ 1587 */
1588 ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault); 1588 ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault);
1589 } else 1589 } else
1590 ret = filemap_fault(vma, vmf); 1590 ret = filemap_fault(vma, vmf);
1591 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1591 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1622,7 +1622,7 @@ xfs_filemap_pmd_fault(
1622 } 1622 }
1623 1623
1624 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1624 xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1625 ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); 1625 ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault);
1626 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); 1626 xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1627 1627
1628 if (flags & FAULT_FLAG_WRITE) 1628 if (flags & FAULT_FLAG_WRITE)
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 9dbb739cafa0..c6d667187608 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -107,6 +107,12 @@ struct mmu_gather {
107 struct mmu_gather_batch local; 107 struct mmu_gather_batch local;
108 struct page *__pages[MMU_GATHER_BUNDLE]; 108 struct page *__pages[MMU_GATHER_BUNDLE];
109 unsigned int batch_count; 109 unsigned int batch_count;
110 /*
111 * __tlb_adjust_range will track the new addr here,
 112 * so that we can adjust the range after the flush
113 */
114 unsigned long addr;
115 int page_size;
110}; 116};
111 117
112#define HAVE_GENERIC_MMU_GATHER 118#define HAVE_GENERIC_MMU_GATHER
@@ -115,23 +121,20 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
115void tlb_flush_mmu(struct mmu_gather *tlb); 121void tlb_flush_mmu(struct mmu_gather *tlb);
116void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, 122void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start,
117 unsigned long end); 123 unsigned long end);
118int __tlb_remove_page(struct mmu_gather *tlb, struct page *page); 124extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
119 125 int page_size);
120/* tlb_remove_page
121 * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when
122 * required.
123 */
124static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
125{
126 if (!__tlb_remove_page(tlb, page))
127 tlb_flush_mmu(tlb);
128}
129 126
130static inline void __tlb_adjust_range(struct mmu_gather *tlb, 127static inline void __tlb_adjust_range(struct mmu_gather *tlb,
131 unsigned long address) 128 unsigned long address)
132{ 129{
133 tlb->start = min(tlb->start, address); 130 tlb->start = min(tlb->start, address);
134 tlb->end = max(tlb->end, address + PAGE_SIZE); 131 tlb->end = max(tlb->end, address + PAGE_SIZE);
132 /*
133 * Track the last address with which we adjusted the range. This
134 * will be used later to adjust again after a mmu_flush due to
135 * failed __tlb_remove_page
136 */
137 tlb->addr = address;
135} 138}
136 139
137static inline void __tlb_reset_range(struct mmu_gather *tlb) 140static inline void __tlb_reset_range(struct mmu_gather *tlb)
@@ -144,6 +147,40 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
144 } 147 }
145} 148}
146 149
150static inline void tlb_remove_page_size(struct mmu_gather *tlb,
151 struct page *page, int page_size)
152{
153 if (__tlb_remove_page_size(tlb, page, page_size)) {
154 tlb_flush_mmu(tlb);
155 tlb->page_size = page_size;
156 __tlb_adjust_range(tlb, tlb->addr);
157 __tlb_remove_page_size(tlb, page, page_size);
158 }
159}
160
 161static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
162{
163 return __tlb_remove_page_size(tlb, page, PAGE_SIZE);
164}
165
166/* tlb_remove_page
167 * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when
168 * required.
169 */
170static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
171{
172 return tlb_remove_page_size(tlb, page, PAGE_SIZE);
173}
174
175static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page)
176{
177 /* active->nr should be zero when we call this */
178 VM_BUG_ON_PAGE(tlb->active->nr, page);
179 tlb->page_size = PAGE_SIZE;
180 __tlb_adjust_range(tlb, tlb->addr);
181 return __tlb_remove_page(tlb, page);
182}
183
147/* 184/*
148 * In the case of tlb vma handling, we can optimise these away in the 185 * In the case of tlb vma handling, we can optimise these away in the
149 * case where we're doing a full MM flush. When we're doing a munmap, 186 * case where we're doing a full MM flush. When we're doing a munmap,
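The hunk above replaces the old int-returning __tlb_remove_page() with a size-aware interface. A minimal sketch of how an unmap path could use it follows; example_zap_range() is a hypothetical caller (and assumes CONFIG_TRANSPARENT_HUGEPAGE for HPAGE_PMD_SIZE), but tlb_gather_mmu(), tlb_remove_page(), tlb_remove_page_size() and tlb_finish_mmu() are the interfaces declared in this header.

/* Illustrative sketch only -- not part of this patch. */
#include <asm/tlb.h>
#include <linux/huge_mm.h>      /* HPAGE_PMD_SIZE */

static void example_zap_range(struct mm_struct *mm,
                              struct page *small_page, struct page *huge_page,
                              unsigned long start, unsigned long end)
{
        struct mmu_gather tlb;

        tlb_gather_mmu(&tlb, mm, start, end);

        /* Base pages keep the old call; PAGE_SIZE is implied. */
        tlb_remove_page(&tlb, small_page);

        /*
         * Huge pages pass their real size, so a pending batch of
         * differently sized pages is flushed first (see
         * tlb_remove_page_size() above).
         */
        tlb_remove_page_size(&tlb, huge_page, HPAGE_PMD_SIZE);

        tlb_finish_mmu(&tlb, start, end);
}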
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h
index 9b0a15d06a4f..79542b2698ec 100644
--- a/include/linux/balloon_compaction.h
+++ b/include/linux/balloon_compaction.h
@@ -48,6 +48,7 @@
48#include <linux/migrate.h> 48#include <linux/migrate.h>
49#include <linux/gfp.h> 49#include <linux/gfp.h>
50#include <linux/err.h> 50#include <linux/err.h>
51#include <linux/fs.h>
51 52
52/* 53/*
53 * Balloon device information descriptor. 54 * Balloon device information descriptor.
@@ -62,6 +63,7 @@ struct balloon_dev_info {
62 struct list_head pages; /* Pages enqueued & handled to Host */ 63 struct list_head pages; /* Pages enqueued & handled to Host */
63 int (*migratepage)(struct balloon_dev_info *, struct page *newpage, 64 int (*migratepage)(struct balloon_dev_info *, struct page *newpage,
64 struct page *page, enum migrate_mode mode); 65 struct page *page, enum migrate_mode mode);
66 struct inode *inode;
65}; 67};
66 68
67extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info); 69extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info);
@@ -73,45 +75,19 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
73 spin_lock_init(&balloon->pages_lock); 75 spin_lock_init(&balloon->pages_lock);
74 INIT_LIST_HEAD(&balloon->pages); 76 INIT_LIST_HEAD(&balloon->pages);
75 balloon->migratepage = NULL; 77 balloon->migratepage = NULL;
78 balloon->inode = NULL;
76} 79}
77 80
78#ifdef CONFIG_BALLOON_COMPACTION 81#ifdef CONFIG_BALLOON_COMPACTION
79extern bool balloon_page_isolate(struct page *page); 82extern const struct address_space_operations balloon_aops;
83extern bool balloon_page_isolate(struct page *page,
84 isolate_mode_t mode);
80extern void balloon_page_putback(struct page *page); 85extern void balloon_page_putback(struct page *page);
81extern int balloon_page_migrate(struct page *newpage, 86extern int balloon_page_migrate(struct address_space *mapping,
87 struct page *newpage,
82 struct page *page, enum migrate_mode mode); 88 struct page *page, enum migrate_mode mode);
83 89
84/* 90/*
85 * __is_movable_balloon_page - helper to perform @page PageBalloon tests
86 */
87static inline bool __is_movable_balloon_page(struct page *page)
88{
89 return PageBalloon(page);
90}
91
92/*
93 * balloon_page_movable - test PageBalloon to identify balloon pages
94 * and PagePrivate to check that the page is not
95 * isolated and can be moved by compaction/migration.
96 *
97 * As we might return false positives in the case of a balloon page being just
98 * released under us, this need to be re-tested later, under the page lock.
99 */
100static inline bool balloon_page_movable(struct page *page)
101{
102 return PageBalloon(page) && PagePrivate(page);
103}
104
105/*
106 * isolated_balloon_page - identify an isolated balloon page on private
107 * compaction/migration page lists.
108 */
109static inline bool isolated_balloon_page(struct page *page)
110{
111 return PageBalloon(page);
112}
113
114/*
115 * balloon_page_insert - insert a page into the balloon's page list and make 91 * balloon_page_insert - insert a page into the balloon's page list and make
116 * the page->private assignment accordingly. 92 * the page->private assignment accordingly.
117 * @balloon : pointer to balloon device 93 * @balloon : pointer to balloon device
@@ -124,7 +100,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
124 struct page *page) 100 struct page *page)
125{ 101{
126 __SetPageBalloon(page); 102 __SetPageBalloon(page);
127 SetPagePrivate(page); 103 __SetPageMovable(page, balloon->inode->i_mapping);
128 set_page_private(page, (unsigned long)balloon); 104 set_page_private(page, (unsigned long)balloon);
129 list_add(&page->lru, &balloon->pages); 105 list_add(&page->lru, &balloon->pages);
130} 106}
@@ -140,11 +116,14 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon,
140static inline void balloon_page_delete(struct page *page) 116static inline void balloon_page_delete(struct page *page)
141{ 117{
142 __ClearPageBalloon(page); 118 __ClearPageBalloon(page);
119 __ClearPageMovable(page);
143 set_page_private(page, 0); 120 set_page_private(page, 0);
144 if (PagePrivate(page)) { 121 /*
145	 ClearPagePrivate(page); 122	 * Do not touch the page.lru field once @page has been isolated,
123	 * because the VM is using that field.
124 */
125 if (!PageIsolated(page))
146 list_del(&page->lru); 126 list_del(&page->lru);
147 }
148} 127}
149 128
150/* 129/*
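With the new inode field and the isolate_mode_t-aware balloon_page_isolate() above, a balloon driver is expected to hand compaction an address_space through which its pages are marked movable. A rough sketch of the setup side follows; example_balloon_setup() and the way the inode is obtained are hypothetical, only the balloon_dev_info fields and balloon_devinfo_init() come from this header.

/* Illustrative sketch; how the inode is created (e.g. from a driver-private
 * pseudo filesystem whose a_ops point at balloon_aops) is driver specific. */
#include <linux/balloon_compaction.h>

static int example_balloon_migratepage(struct balloon_dev_info *info,
                                       struct page *newpage, struct page *page,
                                       enum migrate_mode mode);

static struct balloon_dev_info example_balloon;

static void example_balloon_setup(struct inode *inode)
{
        balloon_devinfo_init(&example_balloon);
        example_balloon.migratepage = example_balloon_migratepage;
        /*
         * New in this series: balloon_page_insert() tags enqueued pages
         * with __SetPageMovable(page, inode->i_mapping), so the device
         * must provide the inode up front.
         */
        example_balloon.inode = inode;
}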
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index a58c852a268f..1a02dab16646 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -212,6 +212,7 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_i
212#endif /* CONFIG_COMPACTION */ 212#endif /* CONFIG_COMPACTION */
213 213
214#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) 214#if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
215struct node;
215extern int compaction_register_node(struct node *node); 216extern int compaction_register_node(struct node *node);
216extern void compaction_unregister_node(struct node *node); 217extern void compaction_unregister_node(struct node *node);
217 218
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 43d5f0b799c7..9c6dc7704043 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -14,7 +14,6 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
14int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); 14int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
15int dax_truncate_page(struct inode *, loff_t from, get_block_t); 15int dax_truncate_page(struct inode *, loff_t from, get_block_t);
16int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); 16int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
17int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
18int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); 17int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
19void dax_wake_mapping_entry_waiter(struct address_space *mapping, 18void dax_wake_mapping_entry_waiter(struct address_space *mapping,
20 pgoff_t index, bool wake_all); 19 pgoff_t index, bool wake_all);
@@ -46,19 +45,15 @@ static inline int __dax_zero_page_range(struct block_device *bdev,
46#if defined(CONFIG_TRANSPARENT_HUGEPAGE) 45#if defined(CONFIG_TRANSPARENT_HUGEPAGE)
47int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, 46int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
48 unsigned int flags, get_block_t); 47 unsigned int flags, get_block_t);
49int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *,
50 unsigned int flags, get_block_t);
51#else 48#else
52static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, 49static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
53 pmd_t *pmd, unsigned int flags, get_block_t gb) 50 pmd_t *pmd, unsigned int flags, get_block_t gb)
54{ 51{
55 return VM_FAULT_FALLBACK; 52 return VM_FAULT_FALLBACK;
56} 53}
57#define __dax_pmd_fault dax_pmd_fault
58#endif 54#endif
59int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); 55int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
60#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) 56#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb)
61#define __dax_mkwrite(vma, vmf, gb) __dax_fault(vma, vmf, gb)
62 57
63static inline bool vma_is_dax(struct vm_area_struct *vma) 58static inline bool vma_is_dax(struct vm_area_struct *vma)
64{ 59{
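With __dax_pmd_fault() folded into dax_pmd_fault(), a filesystem's ->pmd_fault handler now calls the single entry point directly under its own locking, as the xfs_filemap_pmd_fault() hunk near the top of this diff shows. A hedged sketch of that pattern; example_fs_pmd_fault() and example_get_block() are placeholder names, not part of the patch.

#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>

/* hypothetical get_block_t implementation provided by the filesystem */
static int example_get_block(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create);

static int example_fs_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t *pmd, unsigned int flags)
{
        int ret;

        /* take the fs-private lock that serialises against truncate */
        ret = dax_pmd_fault(vma, addr, pmd, flags, example_get_block);
        /* drop that lock again */

        return ret;
}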
diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h
index 46056cb161fc..d82bf1994485 100644
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -38,7 +38,7 @@ struct debug_obj {
38 * @name: name of the object type 38 * @name: name of the object type
39 * @debug_hint: function returning address, which have associated 39 * @debug_hint: function returning address, which have associated
40 * kernel symbol, to allow identify the object 40 * kernel symbol, to allow identify the object
41 * @is_static_object return true if the obj is static, otherwise return false 41 * @is_static_object: return true if the obj is static, otherwise return false
42 * @fixup_init: fixup function, which is called when the init check 42 * @fixup_init: fixup function, which is called when the init check
43 * fails. All fixup functions must return true if fixup 43 * fails. All fixup functions must return true if fixup
44 * was successful, otherwise return false 44 * was successful, otherwise return false
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h
index e65ef959546c..c46d2aa16d81 100644
--- a/include/linux/frontswap.h
+++ b/include/linux/frontswap.h
@@ -4,6 +4,7 @@
4#include <linux/swap.h> 4#include <linux/swap.h>
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/bitops.h> 6#include <linux/bitops.h>
7#include <linux/jump_label.h>
7 8
8struct frontswap_ops { 9struct frontswap_ops {
9 void (*init)(unsigned); /* this swap type was just swapon'ed */ 10 void (*init)(unsigned); /* this swap type was just swapon'ed */
@@ -14,7 +15,6 @@ struct frontswap_ops {
14 struct frontswap_ops *next; /* private pointer to next ops */ 15 struct frontswap_ops *next; /* private pointer to next ops */
15}; 16};
16 17
17extern bool frontswap_enabled;
18extern void frontswap_register_ops(struct frontswap_ops *ops); 18extern void frontswap_register_ops(struct frontswap_ops *ops);
19extern void frontswap_shrink(unsigned long); 19extern void frontswap_shrink(unsigned long);
20extern unsigned long frontswap_curr_pages(void); 20extern unsigned long frontswap_curr_pages(void);
@@ -30,7 +30,12 @@ extern void __frontswap_invalidate_page(unsigned, pgoff_t);
30extern void __frontswap_invalidate_area(unsigned); 30extern void __frontswap_invalidate_area(unsigned);
31 31
32#ifdef CONFIG_FRONTSWAP 32#ifdef CONFIG_FRONTSWAP
33#define frontswap_enabled (1) 33extern struct static_key_false frontswap_enabled_key;
34
35static inline bool frontswap_enabled(void)
36{
37 return static_branch_unlikely(&frontswap_enabled_key);
38}
34 39
35static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) 40static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
36{ 41{
@@ -50,7 +55,10 @@ static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
50#else 55#else
51/* all inline routines become no-ops and all externs are ignored */ 56/* all inline routines become no-ops and all externs are ignored */
52 57
53#define frontswap_enabled (0) 58static inline bool frontswap_enabled(void)
59{
60 return false;
61}
54 62
55static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) 63static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset)
56{ 64{
@@ -70,37 +78,35 @@ static inline unsigned long *frontswap_map_get(struct swap_info_struct *p)
70 78
71static inline int frontswap_store(struct page *page) 79static inline int frontswap_store(struct page *page)
72{ 80{
73 int ret = -1; 81 if (frontswap_enabled())
82 return __frontswap_store(page);
74 83
75 if (frontswap_enabled) 84 return -1;
76 ret = __frontswap_store(page);
77 return ret;
78} 85}
79 86
80static inline int frontswap_load(struct page *page) 87static inline int frontswap_load(struct page *page)
81{ 88{
82 int ret = -1; 89 if (frontswap_enabled())
90 return __frontswap_load(page);
83 91
84 if (frontswap_enabled) 92 return -1;
85 ret = __frontswap_load(page);
86 return ret;
87} 93}
88 94
89static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset) 95static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset)
90{ 96{
91 if (frontswap_enabled) 97 if (frontswap_enabled())
92 __frontswap_invalidate_page(type, offset); 98 __frontswap_invalidate_page(type, offset);
93} 99}
94 100
95static inline void frontswap_invalidate_area(unsigned type) 101static inline void frontswap_invalidate_area(unsigned type)
96{ 102{
97 if (frontswap_enabled) 103 if (frontswap_enabled())
98 __frontswap_invalidate_area(type); 104 __frontswap_invalidate_area(type);
99} 105}
100 106
101static inline void frontswap_init(unsigned type, unsigned long *map) 107static inline void frontswap_init(unsigned type, unsigned long *map)
102{ 108{
103 if (frontswap_enabled) 109 if (frontswap_enabled())
104 __frontswap_init(type, map); 110 __frontswap_init(type, map);
105} 111}
106 112
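The hunk above turns frontswap_enabled from a plain global into a static key, so the store/load/invalidate fast paths compile to a patched-out branch until a backend registers. The backing definition lives in mm/frontswap.c, which is not part of this hunk; the sketch below is therefore an assumption about what that side looks like, with example_frontswap_register_ops() standing in for the real registration function.

#include <linux/frontswap.h>
#include <linux/jump_label.h>

/* assumed to mirror mm/frontswap.c after this change */
DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);

static void example_frontswap_register_ops(struct frontswap_ops *ops)
{
        /* ... chain @ops onto the backend list as before ... */

        /*
         * Enable the branch so that frontswap_enabled() in the header
         * above starts returning true on the hot paths.
         */
        static_branch_inc(&frontswap_enabled_key);
}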
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dc488662ce0b..f2a69f20926f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -398,6 +398,8 @@ struct address_space_operations {
398 */ 398 */
399 int (*migratepage) (struct address_space *, 399 int (*migratepage) (struct address_space *,
400 struct page *, struct page *, enum migrate_mode); 400 struct page *, struct page *, enum migrate_mode);
401 bool (*isolate_page)(struct page *, isolate_mode_t);
402 void (*putback_page)(struct page *);
401 int (*launder_page) (struct page *); 403 int (*launder_page) (struct page *);
402 int (*is_partially_uptodate) (struct page *, unsigned long, 404 int (*is_partially_uptodate) (struct page *, unsigned long,
403 unsigned long); 405 unsigned long);
@@ -661,6 +663,7 @@ struct inode {
661#endif 663#endif
662 struct list_head i_lru; /* inode LRU list */ 664 struct list_head i_lru; /* inode LRU list */
663 struct list_head i_sb_list; 665 struct list_head i_sb_list;
666 struct list_head i_wb_list; /* backing dev writeback list */
664 union { 667 union {
665 struct hlist_head i_dentry; 668 struct hlist_head i_dentry;
666 struct rcu_head i_rcu; 669 struct rcu_head i_rcu;
@@ -1444,6 +1447,9 @@ struct super_block {
1444 /* s_inode_list_lock protects s_inodes */ 1447 /* s_inode_list_lock protects s_inodes */
1445 spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; 1448 spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp;
1446 struct list_head s_inodes; /* all inodes */ 1449 struct list_head s_inodes; /* all inodes */
1450
1451 spinlock_t s_inode_wblist_lock;
1452 struct list_head s_inodes_wb; /* writeback inodes */
1447}; 1453};
1448 1454
1449extern struct timespec current_fs_time(struct super_block *sb); 1455extern struct timespec current_fs_time(struct super_block *sb);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 570383a41853..c29e9d347bc6 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -78,8 +78,7 @@ struct vm_area_struct;
78 * __GFP_THISNODE forces the allocation to be satisified from the requested 78 * __GFP_THISNODE forces the allocation to be satisified from the requested
79 * node with no fallbacks or placement policy enforcements. 79 * node with no fallbacks or placement policy enforcements.
80 * 80 *
81 * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg (only relevant 81 * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
82 * to kmem allocations).
83 */ 82 */
84#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) 83#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
85#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) 84#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE)
@@ -486,10 +485,6 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
486#define alloc_page_vma_node(gfp_mask, vma, addr, node) \ 485#define alloc_page_vma_node(gfp_mask, vma, addr, node) \
487 alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) 486 alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
488 487
489extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order);
490extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask,
491 unsigned int order);
492
493extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); 488extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
494extern unsigned long get_zeroed_page(gfp_t gfp_mask); 489extern unsigned long get_zeroed_page(gfp_t gfp_mask);
495 490
@@ -513,9 +508,6 @@ extern void *__alloc_page_frag(struct page_frag_cache *nc,
513 unsigned int fragsz, gfp_t gfp_mask); 508 unsigned int fragsz, gfp_t gfp_mask);
514extern void __free_page_frag(void *addr); 509extern void __free_page_frag(void *addr);
515 510
516extern void __free_kmem_pages(struct page *page, unsigned int order);
517extern void free_kmem_pages(unsigned long addr, unsigned int order);
518
519#define __free_page(page) __free_pages((page), 0) 511#define __free_page(page) __free_pages((page), 0)
520#define free_page(addr) free_pages((addr), 0) 512#define free_page(addr) free_pages((addr), 0)
521 513
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index f0a7a0320300..92ce91c03cd0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -1,20 +1,12 @@
1#ifndef _LINUX_HUGE_MM_H 1#ifndef _LINUX_HUGE_MM_H
2#define _LINUX_HUGE_MM_H 2#define _LINUX_HUGE_MM_H
3 3
4extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, 4extern int do_huge_pmd_anonymous_page(struct fault_env *fe);
5 struct vm_area_struct *vma,
6 unsigned long address, pmd_t *pmd,
7 unsigned int flags);
8extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, 5extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
9 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, 6 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
10 struct vm_area_struct *vma); 7 struct vm_area_struct *vma);
11extern void huge_pmd_set_accessed(struct mm_struct *mm, 8extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd);
12 struct vm_area_struct *vma, 9extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd);
13 unsigned long address, pmd_t *pmd,
14 pmd_t orig_pmd, int dirty);
15extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
16 unsigned long address, pmd_t *pmd,
17 pmd_t orig_pmd);
18extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, 10extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
19 unsigned long addr, 11 unsigned long addr,
20 pmd_t *pmd, 12 pmd_t *pmd,
@@ -49,6 +41,18 @@ enum transparent_hugepage_flag {
49#endif 41#endif
50}; 42};
51 43
44struct kobject;
45struct kobj_attribute;
46
47extern ssize_t single_hugepage_flag_store(struct kobject *kobj,
48 struct kobj_attribute *attr,
49 const char *buf, size_t count,
50 enum transparent_hugepage_flag flag);
51extern ssize_t single_hugepage_flag_show(struct kobject *kobj,
52 struct kobj_attribute *attr, char *buf,
53 enum transparent_hugepage_flag flag);
54extern struct kobj_attribute shmem_enabled_attr;
55
52#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) 56#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
53#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) 57#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
54 58
@@ -134,8 +138,7 @@ static inline int hpage_nr_pages(struct page *page)
134 return 1; 138 return 1;
135} 139}
136 140
137extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 141extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd);
138 unsigned long addr, pmd_t pmd, pmd_t *pmdp);
139 142
140extern struct page *huge_zero_page; 143extern struct page *huge_zero_page;
141 144
@@ -152,6 +155,8 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
152struct page *get_huge_zero_page(void); 155struct page *get_huge_zero_page(void);
153void put_huge_zero_page(void); 156void put_huge_zero_page(void);
154 157
158#define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))
159
155#else /* CONFIG_TRANSPARENT_HUGEPAGE */ 160#else /* CONFIG_TRANSPARENT_HUGEPAGE */
156#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) 161#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
157#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) 162#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -161,6 +166,8 @@ void put_huge_zero_page(void);
161 166
162#define transparent_hugepage_enabled(__vma) 0 167#define transparent_hugepage_enabled(__vma) 0
163 168
169static inline void prep_transhuge_page(struct page *page) {}
170
164#define transparent_hugepage_flags 0UL 171#define transparent_hugepage_flags 0UL
165static inline int 172static inline int
166split_huge_page_to_list(struct page *page, struct list_head *list) 173split_huge_page_to_list(struct page *page, struct list_head *list)
@@ -196,8 +203,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
196 return NULL; 203 return NULL;
197} 204}
198 205
199static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 206static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd)
200 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
201{ 207{
202 return 0; 208 return 0;
203} 209}
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index eeb307985715..1e032a1ddb3e 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -4,6 +4,11 @@
4#include <linux/sched.h> /* MMF_VM_HUGEPAGE */ 4#include <linux/sched.h> /* MMF_VM_HUGEPAGE */
5 5
6#ifdef CONFIG_TRANSPARENT_HUGEPAGE 6#ifdef CONFIG_TRANSPARENT_HUGEPAGE
7extern struct attribute_group khugepaged_attr_group;
8
9extern int khugepaged_init(void);
10extern void khugepaged_destroy(void);
11extern int start_stop_khugepaged(void);
7extern int __khugepaged_enter(struct mm_struct *mm); 12extern int __khugepaged_enter(struct mm_struct *mm);
8extern void __khugepaged_exit(struct mm_struct *mm); 13extern void __khugepaged_exit(struct mm_struct *mm);
9extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, 14extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 7ae216a39c9e..481c8c4627ca 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -43,8 +43,7 @@ static inline struct stable_node *page_stable_node(struct page *page)
43static inline void set_page_stable_node(struct page *page, 43static inline void set_page_stable_node(struct page *page,
44 struct stable_node *stable_node) 44 struct stable_node *stable_node)
45{ 45{
46 page->mapping = (void *)stable_node + 46 page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM);
47 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
48} 47}
49 48
50/* 49/*
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 3106ac1c895e..6c14b6179727 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -73,8 +73,8 @@ extern bool movable_node_enabled;
73 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) 73 if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
74 74
75phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, 75phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
76 phys_addr_t start, phys_addr_t end, 76 phys_addr_t start, phys_addr_t end,
77 int nid, ulong flags); 77 int nid, ulong flags);
78phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, 78phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
79 phys_addr_t size, phys_addr_t align); 79 phys_addr_t size, phys_addr_t align);
80phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); 80phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
@@ -110,7 +110,7 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
110 phys_addr_t *out_end, int *out_nid); 110 phys_addr_t *out_end, int *out_nid);
111 111
112void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, 112void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
113 phys_addr_t *out_end); 113 phys_addr_t *out_end);
114 114
115/** 115/**
116 * for_each_mem_range - iterate through memblock areas from type_a and not 116 * for_each_mem_range - iterate through memblock areas from type_a and not
@@ -148,7 +148,7 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
148 p_start, p_end, p_nid) \ 148 p_start, p_end, p_nid) \
149 for (i = (u64)ULLONG_MAX, \ 149 for (i = (u64)ULLONG_MAX, \
150 __next_mem_range_rev(&i, nid, flags, type_a, type_b,\ 150 __next_mem_range_rev(&i, nid, flags, type_a, type_b,\
151 p_start, p_end, p_nid); \ 151 p_start, p_end, p_nid); \
152 i != (u64)ULLONG_MAX; \ 152 i != (u64)ULLONG_MAX; \
153 __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ 153 __next_mem_range_rev(&i, nid, flags, type_a, type_b, \
154 p_start, p_end, p_nid)) 154 p_start, p_end, p_nid))
@@ -163,8 +163,7 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
163 * is initialized. 163 * is initialized.
164 */ 164 */
165#define for_each_reserved_mem_region(i, p_start, p_end) \ 165#define for_each_reserved_mem_region(i, p_start, p_end) \
166 for (i = 0UL, \ 166 for (i = 0UL, __next_reserved_mem_region(&i, p_start, p_end); \
167 __next_reserved_mem_region(&i, p_start, p_end); \
168 i != (u64)ULLONG_MAX; \ 167 i != (u64)ULLONG_MAX; \
169 __next_reserved_mem_region(&i, p_start, p_end)) 168 __next_reserved_mem_region(&i, p_start, p_end))
170 169
@@ -403,15 +402,14 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
403} 402}
404 403
405#define for_each_memblock(memblock_type, region) \ 404#define for_each_memblock(memblock_type, region) \
406 for (region = memblock.memblock_type.regions; \ 405 for (region = memblock.memblock_type.regions; \
407 region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ 406 region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \
408 region++) 407 region++)
409 408
410#define for_each_memblock_type(memblock_type, rgn) \ 409#define for_each_memblock_type(memblock_type, rgn) \
411 idx = 0; \ 410 for (idx = 0, rgn = &memblock_type->regions[0]; \
412 rgn = &memblock_type->regions[idx]; \ 411 idx < memblock_type->cnt; \
413 for (idx = 0; idx < memblock_type->cnt; \ 412 idx++, rgn = &memblock_type->regions[idx])
414 idx++,rgn = &memblock_type->regions[idx])
415 413
416#ifdef CONFIG_MEMTEST 414#ifdef CONFIG_MEMTEST
417extern void early_memtest(phys_addr_t start, phys_addr_t end); 415extern void early_memtest(phys_addr_t start, phys_addr_t end);
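The for_each_memblock_type() rewrite above is more than cosmetic: the old definition expanded to two statements ahead of the for loop, so using the macro as the sole body of an if or an outer loop silently detached those statements. The new version is a single for statement and behaves like any other iterator macro. A small illustrative caller (example_count_regions() is hypothetical; idx and rgn must be in scope, as the macro expects):

#include <linux/memblock.h>

static unsigned long example_count_regions(struct memblock_type *type)
{
        struct memblock_region *rgn;
        unsigned long n = 0;
        int idx;

        /* safe even without braces now that the macro is one statement */
        for_each_memblock_type(type, rgn)
                n++;

        return n;
}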
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 56e6069d2452..71aff733a497 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -749,6 +749,13 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
749} 749}
750#endif 750#endif
751 751
752struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep);
753void memcg_kmem_put_cache(struct kmem_cache *cachep);
754int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
755 struct mem_cgroup *memcg);
756int memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
757void memcg_kmem_uncharge(struct page *page, int order);
758
752#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) 759#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB)
753extern struct static_key_false memcg_kmem_enabled_key; 760extern struct static_key_false memcg_kmem_enabled_key;
754 761
@@ -770,22 +777,6 @@ static inline bool memcg_kmem_enabled(void)
770} 777}
771 778
772/* 779/*
773 * In general, we'll do everything in our power to not incur in any overhead
774 * for non-memcg users for the kmem functions. Not even a function call, if we
775 * can avoid it.
776 *
777 * Therefore, we'll inline all those functions so that in the best case, we'll
778 * see that kmemcg is off for everybody and proceed quickly. If it is on,
779 * we'll still do most of the flag checking inline. We check a lot of
780 * conditions, but because they are pretty simple, they are expected to be
781 * fast.
782 */
783int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
784 struct mem_cgroup *memcg);
785int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order);
786void __memcg_kmem_uncharge(struct page *page, int order);
787
788/*
789 * helper for accessing a memcg's index. It will be used as an index in the 780 * helper for accessing a memcg's index. It will be used as an index in the
790 * child cache array in kmem_cache, and also to derive its name. This function 781 * child cache array in kmem_cache, and also to derive its name. This function
791 * will return -1 when this is not a kmem-limited memcg. 782 * will return -1 when this is not a kmem-limited memcg.
@@ -795,67 +786,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg)
795 return memcg ? memcg->kmemcg_id : -1; 786 return memcg ? memcg->kmemcg_id : -1;
796} 787}
797 788
798struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
799void __memcg_kmem_put_cache(struct kmem_cache *cachep);
800
801static inline bool __memcg_kmem_bypass(void)
802{
803 if (!memcg_kmem_enabled())
804 return true;
805 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
806 return true;
807 return false;
808}
809
810/**
811 * memcg_kmem_charge: charge a kmem page
812 * @page: page to charge
813 * @gfp: reclaim mode
814 * @order: allocation order
815 *
816 * Returns 0 on success, an error code on failure.
817 */
818static __always_inline int memcg_kmem_charge(struct page *page,
819 gfp_t gfp, int order)
820{
821 if (__memcg_kmem_bypass())
822 return 0;
823 if (!(gfp & __GFP_ACCOUNT))
824 return 0;
825 return __memcg_kmem_charge(page, gfp, order);
826}
827
828/**
829 * memcg_kmem_uncharge: uncharge a kmem page
830 * @page: page to uncharge
831 * @order: allocation order
832 */
833static __always_inline void memcg_kmem_uncharge(struct page *page, int order)
834{
835 if (memcg_kmem_enabled())
836 __memcg_kmem_uncharge(page, order);
837}
838
839/**
840 * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
841 * @cachep: the original global kmem cache
842 *
843 * All memory allocated from a per-memcg cache is charged to the owner memcg.
844 */
845static __always_inline struct kmem_cache *
846memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
847{
848 if (__memcg_kmem_bypass())
849 return cachep;
850 return __memcg_kmem_get_cache(cachep, gfp);
851}
852
853static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
854{
855 if (memcg_kmem_enabled())
856 __memcg_kmem_put_cache(cachep);
857}
858
859/** 789/**
860 * memcg_kmem_update_page_stat - update kmem page state statistics 790 * memcg_kmem_update_page_stat - update kmem page state statistics
861 * @page: the page 791 * @page: the page
@@ -878,15 +808,6 @@ static inline bool memcg_kmem_enabled(void)
878 return false; 808 return false;
879} 809}
880 810
881static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
882{
883 return 0;
884}
885
886static inline void memcg_kmem_uncharge(struct page *page, int order)
887{
888}
889
890static inline int memcg_cache_id(struct mem_cgroup *memcg) 811static inline int memcg_cache_id(struct mem_cgroup *memcg)
891{ 812{
892 return -1; 813 return -1;
@@ -900,16 +821,6 @@ static inline void memcg_put_cache_ids(void)
900{ 821{
901} 822}
902 823
903static inline struct kmem_cache *
904memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
905{
906 return cachep;
907}
908
909static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
910{
911}
912
913static inline void memcg_kmem_update_page_stat(struct page *page, 824static inline void memcg_kmem_update_page_stat(struct page *page,
914 enum mem_cgroup_stat_index idx, int val) 825 enum mem_cgroup_stat_index idx, int val)
915{ 826{
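The hunk above un-inlines the kmem charging API: callers now see plain declarations of memcg_kmem_charge()/memcg_kmem_uncharge() and memcg_kmem_get_cache()/memcg_kmem_put_cache(), with the bypass and __GFP_ACCOUNT checks moving into mm/memcontrol.c (not shown here). A hedged sketch of the expected caller pairing, with example_alloc_accounted() as a hypothetical stand-in for the page allocator paths:

#include <linux/gfp.h>
#include <linux/memcontrol.h>

static struct page *example_alloc_accounted(gfp_t gfp, unsigned int order)
{
        struct page *page = alloc_pages(gfp | __GFP_ACCOUNT, order);

        if (page && memcg_kmem_enabled() &&
            memcg_kmem_charge(page, gfp, order)) {
                __free_pages(page, order);
                page = NULL;
        }
        return page;
}

/* the matching free path is expected to call memcg_kmem_uncharge(page, order) */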
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 5145620ba48a..01033fadea47 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -284,5 +284,7 @@ extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
284 unsigned long map_offset); 284 unsigned long map_offset);
285extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, 285extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
286 unsigned long pnum); 286 unsigned long pnum);
287extern int zone_can_shift(unsigned long pfn, unsigned long nr_pages,
288 enum zone_type target);
287 289
288#endif /* __LINUX_MEMORY_HOTPLUG_H */ 290#endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 9b50325e4ddf..ae8d475a9385 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -37,6 +37,8 @@ extern int migrate_page(struct address_space *,
37 struct page *, struct page *, enum migrate_mode); 37 struct page *, struct page *, enum migrate_mode);
38extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, 38extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
39 unsigned long private, enum migrate_mode mode, int reason); 39 unsigned long private, enum migrate_mode mode, int reason);
40extern bool isolate_movable_page(struct page *page, isolate_mode_t mode);
41extern void putback_movable_page(struct page *page);
40 42
41extern int migrate_prep(void); 43extern int migrate_prep(void);
42extern int migrate_prep_local(void); 44extern int migrate_prep_local(void);
@@ -69,6 +71,21 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
69 71
70#endif /* CONFIG_MIGRATION */ 72#endif /* CONFIG_MIGRATION */
71 73
74#ifdef CONFIG_COMPACTION
75extern int PageMovable(struct page *page);
76extern void __SetPageMovable(struct page *page, struct address_space *mapping);
77extern void __ClearPageMovable(struct page *page);
78#else
79static inline int PageMovable(struct page *page) { return 0; };
80static inline void __SetPageMovable(struct page *page,
81 struct address_space *mapping)
82{
83}
84static inline void __ClearPageMovable(struct page *page)
85{
86}
87#endif
88
72#ifdef CONFIG_NUMA_BALANCING 89#ifdef CONFIG_NUMA_BALANCING
73extern bool pmd_trans_migrating(pmd_t pmd); 90extern bool pmd_trans_migrating(pmd_t pmd);
74extern int migrate_misplaced_page(struct page *page, 91extern int migrate_misplaced_page(struct page *page,
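PageMovable()/__SetPageMovable() above, together with the isolate_page and putback_page hooks added to struct address_space_operations in the fs.h hunk earlier, define the driver-side contract for non-LRU movable pages; isolate_movable_page() and putback_movable_page() are the VM-side entry points. A hedged sketch of a driver wiring this up (all example_* names are hypothetical):

#include <linux/fs.h>
#include <linux/migrate.h>

static bool example_isolate_page(struct page *page, isolate_mode_t mode);
static void example_putback_page(struct page *page);
static int example_migratepage(struct address_space *mapping,
                               struct page *newpage, struct page *page,
                               enum migrate_mode mode);

static const struct address_space_operations example_movable_aops = {
        .isolate_page   = example_isolate_page,
        .putback_page   = example_putback_page,
        .migratepage    = example_migratepage,
};

static void example_make_page_movable(struct page *page,
                                      struct address_space *mapping)
{
        /* mapping->a_ops is assumed to be example_movable_aops */
        __SetPageMovable(page, mapping);
}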
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ece042dfe23c..192c1bbe5fcd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -309,10 +309,34 @@ struct vm_fault {
309 * VM_FAULT_DAX_LOCKED and fill in 309 * VM_FAULT_DAX_LOCKED and fill in
310 * entry here. 310 * entry here.
311 */ 311 */
312 /* for ->map_pages() only */ 312};
313 pgoff_t max_pgoff; /* map pages for offset from pgoff till 313
314 * max_pgoff inclusive */ 314/*
315 pte_t *pte; /* pte entry associated with ->pgoff */ 315 * Page fault context: passes though page fault handler instead of endless list
316 * of function arguments.
317 */
318struct fault_env {
319 struct vm_area_struct *vma; /* Target VMA */
320 unsigned long address; /* Faulting virtual address */
321 unsigned int flags; /* FAULT_FLAG_xxx flags */
322 pmd_t *pmd; /* Pointer to pmd entry matching
323 * the 'address'
324 */
325 pte_t *pte; /* Pointer to pte entry matching
326 * the 'address'. NULL if the page
327 * table hasn't been allocated.
328 */
329 spinlock_t *ptl; /* Page table lock.
330 * Protects pte page table if 'pte'
331 * is not NULL, otherwise pmd.
332 */
333 pgtable_t prealloc_pte; /* Pre-allocated pte page table.
334 * vm_ops->map_pages() calls
335 * alloc_set_pte() from atomic context.
336 * do_fault_around() pre-allocates
337 * page table to avoid allocation from
338 * atomic context.
339 */
316}; 340};
317 341
318/* 342/*
@@ -327,7 +351,8 @@ struct vm_operations_struct {
327 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); 351 int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
328 int (*pmd_fault)(struct vm_area_struct *, unsigned long address, 352 int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
329 pmd_t *, unsigned int flags); 353 pmd_t *, unsigned int flags);
330 void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); 354 void (*map_pages)(struct fault_env *fe,
355 pgoff_t start_pgoff, pgoff_t end_pgoff);
331 356
332 /* notification that a previously read-only page is about to become 357 /* notification that a previously read-only page is about to become
333 * writable, if an error is returned it will cause a SIGBUS */ 358 * writable, if an error is returned it will cause a SIGBUS */
@@ -537,7 +562,6 @@ void __put_page(struct page *page);
537void put_pages_list(struct list_head *pages); 562void put_pages_list(struct list_head *pages);
538 563
539void split_page(struct page *page, unsigned int order); 564void split_page(struct page *page, unsigned int order);
540int split_free_page(struct page *page);
541 565
542/* 566/*
543 * Compound pages have a destructor function. Provide a 567 * Compound pages have a destructor function. Provide a
@@ -601,8 +625,8 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
601 return pte; 625 return pte;
602} 626}
603 627
604void do_set_pte(struct vm_area_struct *vma, unsigned long address, 628int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
605 struct page *page, pte_t *pte, bool write, bool anon); 629 struct page *page);
606#endif 630#endif
607 631
608/* 632/*
@@ -1035,6 +1059,7 @@ static inline pgoff_t page_file_index(struct page *page)
1035} 1059}
1036 1060
1037bool page_mapped(struct page *page); 1061bool page_mapped(struct page *page);
1062struct address_space *page_mapping(struct page *page);
1038 1063
1039/* 1064/*
1040 * Return true only if the page has been allocated with 1065 * Return true only if the page has been allocated with
@@ -1215,15 +1240,14 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page);
1215int invalidate_inode_page(struct page *page); 1240int invalidate_inode_page(struct page *page);
1216 1241
1217#ifdef CONFIG_MMU 1242#ifdef CONFIG_MMU
1218extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 1243extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
1219 unsigned long address, unsigned int flags); 1244 unsigned int flags);
1220extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, 1245extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
1221 unsigned long address, unsigned int fault_flags, 1246 unsigned long address, unsigned int fault_flags,
1222 bool *unlocked); 1247 bool *unlocked);
1223#else 1248#else
1224static inline int handle_mm_fault(struct mm_struct *mm, 1249static inline int handle_mm_fault(struct vm_area_struct *vma,
1225 struct vm_area_struct *vma, unsigned long address, 1250 unsigned long address, unsigned int flags)
1226 unsigned int flags)
1227{ 1251{
1228 /* should never happen if there's no MMU */ 1252 /* should never happen if there's no MMU */
1229 BUG(); 1253 BUG();
@@ -2063,7 +2087,8 @@ extern void truncate_inode_pages_final(struct address_space *);
2063 2087
2064/* generic vm_area_ops exported for stackable file systems */ 2088/* generic vm_area_ops exported for stackable file systems */
2065extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); 2089extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
2066extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf); 2090extern void filemap_map_pages(struct fault_env *fe,
2091 pgoff_t start_pgoff, pgoff_t end_pgoff);
2067extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2092extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2068 2093
2069/* mm/page-writeback.c */ 2094/* mm/page-writeback.c */
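The handle_mm_fault() change above drops the redundant mm argument, so the many arch fault handlers updated elsewhere in this series now pass only the VMA, address and flags. A minimal sketch of the call site after the change; example_arch_fault() is illustrative and not taken from any real architecture (real handlers also deal with VM_GROWSDOWN, retries and signal handling).

#include <linux/mm.h>
#include <linux/sched.h>

static int example_arch_fault(unsigned long address, unsigned int flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        int fault;

        down_read(&mm->mmap_sem);
        vma = find_vma(mm, address);
        if (!vma || vma->vm_start > address) {
                up_read(&mm->mmap_sem);
                return VM_FAULT_SIGSEGV;
        }

        /* The mm argument is gone; it is taken from vma->vm_mm internally. */
        fault = handle_mm_fault(vma, address, flags);

        up_read(&mm->mmap_sem);
        return fault;
}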
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 917f2b6a0cde..79472b22d23f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -60,51 +60,52 @@ struct page {
60 }; 60 };
61 61
62 /* Second double word */ 62 /* Second double word */
63 struct { 63 union {
64 union { 64 pgoff_t index; /* Our offset within mapping. */
65 pgoff_t index; /* Our offset within mapping. */ 65 void *freelist; /* sl[aou]b first free object */
66 void *freelist; /* sl[aou]b first free object */ 66 /* page_deferred_list().prev -- second tail page */
67 /* page_deferred_list().prev -- second tail page */ 67 };
68 };
69 68
70 union { 69 union {
71#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 70#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
72 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) 71 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
73 /* Used for cmpxchg_double in slub */ 72 /* Used for cmpxchg_double in slub */
74 unsigned long counters; 73 unsigned long counters;
75#else 74#else
76 /* 75 /*
77 * Keep _refcount separate from slub cmpxchg_double 76 * Keep _refcount separate from slub cmpxchg_double data.
78 * data. As the rest of the double word is protected by 77 * As the rest of the double word is protected by slab_lock
79 * slab_lock but _refcount is not. 78 * but _refcount is not.
80 */ 79 */
81 unsigned counters; 80 unsigned counters;
82#endif 81#endif
82 struct {
83 83
84 struct { 84 union {
85
86 union {
87 /*
88 * Count of ptes mapped in mms, to show
89 * when page is mapped & limit reverse
90 * map searches.
91 */
92 atomic_t _mapcount;
93
94 struct { /* SLUB */
95 unsigned inuse:16;
96 unsigned objects:15;
97 unsigned frozen:1;
98 };
99 int units; /* SLOB */
100 };
101 /* 85 /*
102 * Usage count, *USE WRAPPER FUNCTION* 86 * Count of ptes mapped in mms, to show when
103 * when manual accounting. See page_ref.h 87 * page is mapped & limit reverse map searches.
88 *
89 * Extra information about page type may be
90 * stored here for pages that are never mapped,
91 * in which case the value MUST BE <= -2.
92 * See page-flags.h for more details.
104 */ 93 */
105 atomic_t _refcount; 94 atomic_t _mapcount;
95
96 unsigned int active; /* SLAB */
97 struct { /* SLUB */
98 unsigned inuse:16;
99 unsigned objects:15;
100 unsigned frozen:1;
101 };
102 int units; /* SLOB */
106 }; 103 };
107 unsigned int active; /* SLAB */ 104 /*
105 * Usage count, *USE WRAPPER FUNCTION* when manual
106 * accounting. See page_ref.h
107 */
108 atomic_t _refcount;
108 }; 109 };
109 }; 110 };
110 111
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h
index de7be78c6f0e..451a811f48f2 100644
--- a/include/linux/mmdebug.h
+++ b/include/linux/mmdebug.h
@@ -39,6 +39,7 @@ void dump_mm(const struct mm_struct *mm);
39#define VM_WARN_ON(cond) WARN_ON(cond) 39#define VM_WARN_ON(cond) WARN_ON(cond)
40#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) 40#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond)
41#define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) 41#define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format)
42#define VM_WARN(cond, format...) WARN(cond, format)
42#else 43#else
43#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) 44#define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
44#define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) 45#define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond)
@@ -47,6 +48,7 @@ void dump_mm(const struct mm_struct *mm);
47#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) 48#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
48#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) 49#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
49#define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) 50#define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
51#define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
50#endif 52#endif
51 53
52#ifdef CONFIG_DEBUG_VIRTUAL 54#ifdef CONFIG_DEBUG_VIRTUAL
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 02069c23486d..19425e988bdc 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -140,6 +140,9 @@ enum zone_stat_item {
140 NR_DIRTIED, /* page dirtyings since bootup */ 140 NR_DIRTIED, /* page dirtyings since bootup */
141 NR_WRITTEN, /* page writings since bootup */ 141 NR_WRITTEN, /* page writings since bootup */
142 NR_PAGES_SCANNED, /* pages scanned since last reclaim */ 142 NR_PAGES_SCANNED, /* pages scanned since last reclaim */
143#if IS_ENABLED(CONFIG_ZSMALLOC)
144 NR_ZSPAGES, /* allocated in zsmalloc */
145#endif
143#ifdef CONFIG_NUMA 146#ifdef CONFIG_NUMA
144 NUMA_HIT, /* allocated in intended node */ 147 NUMA_HIT, /* allocated in intended node */
145 NUMA_MISS, /* allocated in non intended node */ 148 NUMA_MISS, /* allocated in non intended node */
@@ -151,7 +154,9 @@ enum zone_stat_item {
151 WORKINGSET_REFAULT, 154 WORKINGSET_REFAULT,
152 WORKINGSET_ACTIVATE, 155 WORKINGSET_ACTIVATE,
153 WORKINGSET_NODERECLAIM, 156 WORKINGSET_NODERECLAIM,
154 NR_ANON_TRANSPARENT_HUGEPAGES, 157 NR_ANON_THPS,
158 NR_SHMEM_THPS,
159 NR_SHMEM_PMDMAPPED,
155 NR_FREE_CMA_PAGES, 160 NR_FREE_CMA_PAGES,
156 NR_VM_ZONE_STAT_ITEMS }; 161 NR_VM_ZONE_STAT_ITEMS };
157 162
@@ -524,7 +529,6 @@ struct zone {
524 529
525enum zone_flags { 530enum zone_flags {
526 ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ 531 ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */
527 ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */
528 ZONE_CONGESTED, /* zone has many dirty pages backed by 532 ZONE_CONGESTED, /* zone has many dirty pages backed by
529 * a congested BDI 533 * a congested BDI
530 */ 534 */
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 83469522690a..606137b3b778 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -23,6 +23,9 @@ struct oom_control {
23 /* Used to determine mempolicy */ 23 /* Used to determine mempolicy */
24 nodemask_t *nodemask; 24 nodemask_t *nodemask;
25 25
26 /* Memory cgroup in which oom is invoked, or NULL for global oom */
27 struct mem_cgroup *memcg;
28
26 /* Used to determine cpuset and node locality requirement */ 29 /* Used to determine cpuset and node locality requirement */
27 const gfp_t gfp_mask; 30 const gfp_t gfp_mask;
28 31
@@ -83,14 +86,13 @@ extern unsigned long oom_badness(struct task_struct *p,
83 86
84extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, 87extern void oom_kill_process(struct oom_control *oc, struct task_struct *p,
85 unsigned int points, unsigned long totalpages, 88 unsigned int points, unsigned long totalpages,
86 struct mem_cgroup *memcg, const char *message); 89 const char *message);
87 90
88extern void check_panic_on_oom(struct oom_control *oc, 91extern void check_panic_on_oom(struct oom_control *oc,
89 enum oom_constraint constraint, 92 enum oom_constraint constraint);
90 struct mem_cgroup *memcg);
91 93
92extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, 94extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
93 struct task_struct *task, unsigned long totalpages); 95 struct task_struct *task);
94 96
95extern bool out_of_memory(struct oom_control *oc); 97extern bool out_of_memory(struct oom_control *oc);
96 98
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index e5a32445f930..74e4dda91238 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -129,6 +129,9 @@ enum pageflags {
129 129
130 /* Compound pages. Stored in first tail page's flags */ 130 /* Compound pages. Stored in first tail page's flags */
131 PG_double_map = PG_private_2, 131 PG_double_map = PG_private_2,
132
133 /* non-lru isolated movable page */
134 PG_isolated = PG_reclaim,
132}; 135};
133 136
134#ifndef __GENERATING_BOUNDS_H 137#ifndef __GENERATING_BOUNDS_H
@@ -292,11 +295,11 @@ PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY)
292 */ 295 */
293TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) 296TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND)
294 TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) 297 TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND)
295PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND) 298PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL)
296 299
297/* PG_readahead is only used for reads; PG_reclaim is only for writes */ 300/* PG_readahead is only used for reads; PG_reclaim is only for writes */
298PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND) 301PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
299 TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND) 302 TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
300PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) 303PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND)
301 TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) 304 TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND)
302 305
@@ -357,29 +360,37 @@ PAGEFLAG(Idle, idle, PF_ANY)
357 * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. 360 * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h.
358 * 361 *
359 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, 362 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
360 * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit; 363 * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON
361 * and then page->mapping points, not to an anon_vma, but to a private 364 * bit; and then page->mapping points, not to an anon_vma, but to a private
362 * structure which KSM associates with that merged page. See ksm.h. 365 * structure which KSM associates with that merged page. See ksm.h.
363 * 366 *
364 * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used. 367 * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable
 368 * page, and then page->mapping points to a struct address_space.
365 * 369 *
366 * Please note that, confusingly, "page_mapping" refers to the inode 370 * Please note that, confusingly, "page_mapping" refers to the inode
367 * address_space which maps the page from disk; whereas "page_mapped" 371 * address_space which maps the page from disk; whereas "page_mapped"
368 * refers to user virtual address space into which the page is mapped. 372 * refers to user virtual address space into which the page is mapped.
369 */ 373 */
370#define PAGE_MAPPING_ANON 1 374#define PAGE_MAPPING_ANON 0x1
371#define PAGE_MAPPING_KSM 2 375#define PAGE_MAPPING_MOVABLE 0x2
372#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) 376#define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
377#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
373 378
374static __always_inline int PageAnonHead(struct page *page) 379static __always_inline int PageMappingFlags(struct page *page)
375{ 380{
376 return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; 381 return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0;
377} 382}
378 383
379static __always_inline int PageAnon(struct page *page) 384static __always_inline int PageAnon(struct page *page)
380{ 385{
381 page = compound_head(page); 386 page = compound_head(page);
382 return PageAnonHead(page); 387 return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
388}
389
390static __always_inline int __PageMovable(struct page *page)
391{
392 return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
393 PAGE_MAPPING_MOVABLE;
383} 394}
384 395
385#ifdef CONFIG_KSM 396#ifdef CONFIG_KSM
@@ -393,7 +404,7 @@ static __always_inline int PageKsm(struct page *page)
393{ 404{
394 page = compound_head(page); 405 page = compound_head(page);
395 return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == 406 return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
396 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 407 PAGE_MAPPING_KSM;
397} 408}
398#else 409#else
399TESTPAGEFLAG_FALSE(Ksm) 410TESTPAGEFLAG_FALSE(Ksm)
@@ -570,6 +581,17 @@ static inline int PageDoubleMap(struct page *page)
570 return PageHead(page) && test_bit(PG_double_map, &page[1].flags); 581 return PageHead(page) && test_bit(PG_double_map, &page[1].flags);
571} 582}
572 583
584static inline void SetPageDoubleMap(struct page *page)
585{
586 VM_BUG_ON_PAGE(!PageHead(page), page);
587 set_bit(PG_double_map, &page[1].flags);
588}
589
590static inline void ClearPageDoubleMap(struct page *page)
591{
592 VM_BUG_ON_PAGE(!PageHead(page), page);
593 clear_bit(PG_double_map, &page[1].flags);
594}
573static inline int TestSetPageDoubleMap(struct page *page) 595static inline int TestSetPageDoubleMap(struct page *page)
574{ 596{
575 VM_BUG_ON_PAGE(!PageHead(page), page); 597 VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -587,59 +609,59 @@ TESTPAGEFLAG_FALSE(TransHuge)
587TESTPAGEFLAG_FALSE(TransCompound) 609TESTPAGEFLAG_FALSE(TransCompound)
588TESTPAGEFLAG_FALSE(TransCompoundMap) 610TESTPAGEFLAG_FALSE(TransCompoundMap)
589TESTPAGEFLAG_FALSE(TransTail) 611TESTPAGEFLAG_FALSE(TransTail)
590TESTPAGEFLAG_FALSE(DoubleMap) 612PAGEFLAG_FALSE(DoubleMap)
591 TESTSETFLAG_FALSE(DoubleMap) 613 TESTSETFLAG_FALSE(DoubleMap)
592 TESTCLEARFLAG_FALSE(DoubleMap) 614 TESTCLEARFLAG_FALSE(DoubleMap)
593#endif 615#endif
594 616
595/* 617/*
618 * For pages that are never mapped to userspace, page->mapcount may be
619 * used for storing extra information about page type. Any value used
 620 * for this purpose must be <= -2, but it's better to start not too close
621 * to -2 so that an underflow of the page_mapcount() won't be mistaken
622 * for a special page.
623 */
624#define PAGE_MAPCOUNT_OPS(uname, lname) \
625static __always_inline int Page##uname(struct page *page) \
626{ \
627 return atomic_read(&page->_mapcount) == \
628 PAGE_##lname##_MAPCOUNT_VALUE; \
629} \
630static __always_inline void __SetPage##uname(struct page *page) \
631{ \
632 VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); \
633 atomic_set(&page->_mapcount, PAGE_##lname##_MAPCOUNT_VALUE); \
634} \
635static __always_inline void __ClearPage##uname(struct page *page) \
636{ \
637 VM_BUG_ON_PAGE(!Page##uname(page), page); \
638 atomic_set(&page->_mapcount, -1); \
639}
640
641/*
596 * PageBuddy() indicates that the page is free and in the buddy system 642 * PageBuddy() indicates that the page is free and in the buddy system
597 * (see mm/page_alloc.c). 643 * (see mm/page_alloc.c).
598 *
599 * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to
600 * -2 so that an underflow of the page_mapcount() won't be mistaken
601 * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very
602 * efficiently by most CPU architectures.
603 */ 644 */
604#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) 645#define PAGE_BUDDY_MAPCOUNT_VALUE (-128)
605 646PAGE_MAPCOUNT_OPS(Buddy, BUDDY)
606static inline int PageBuddy(struct page *page)
607{
608 return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE;
609}
610 647
611static inline void __SetPageBuddy(struct page *page) 648/*
612{ 649 * PageBalloon() is set on pages that are on the balloon page list
613 VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); 650 * (see mm/balloon_compaction.c).
614 atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE); 651 */
615} 652#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)
653PAGE_MAPCOUNT_OPS(Balloon, BALLOON)
616 654
617static inline void __ClearPageBuddy(struct page *page) 655/*
618{ 656 * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on
619 VM_BUG_ON_PAGE(!PageBuddy(page), page); 657 * pages allocated with __GFP_ACCOUNT. It gets cleared on page free.
620 atomic_set(&page->_mapcount, -1); 658 */
621} 659#define PAGE_KMEMCG_MAPCOUNT_VALUE (-512)
660PAGE_MAPCOUNT_OPS(Kmemcg, KMEMCG)
622 661
623extern bool is_free_buddy_page(struct page *page); 662extern bool is_free_buddy_page(struct page *page);
624 663
625#define PAGE_BALLOON_MAPCOUNT_VALUE (-256) 664__PAGEFLAG(Isolated, isolated, PF_ANY);
626
627static inline int PageBalloon(struct page *page)
628{
629 return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE;
630}
631
632static inline void __SetPageBalloon(struct page *page)
633{
634 VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
635 atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE);
636}
637
638static inline void __ClearPageBalloon(struct page *page)
639{
640 VM_BUG_ON_PAGE(!PageBalloon(page), page);
641 atomic_set(&page->_mapcount, -1);
642}
643 665
644/* 666/*
645 * If network-based swap is enabled, sl*b must keep track of whether pages 667 * If network-based swap is enabled, sl*b must keep track of whether pages
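The PAGE_MAPCOUNT_OPS() macro above generates the test/set/clear helpers that the hand-written PageBuddy/PageBalloon code used to provide, keyed on special _mapcount values, while __PageMovable() instead keys on the low bits of page->mapping. A short illustrative pairing of the generated helpers (example_mark_special() is hypothetical and assumes the page starts with _mapcount == -1):

#include <linux/bug.h>
#include <linux/page-flags.h>

static void example_mark_special(struct page *page)
{
        /*
         * PAGE_MAPCOUNT_OPS(Balloon, BALLOON) expands to PageBalloon(),
         * __SetPageBalloon() and __ClearPageBalloon(), all comparing
         * _mapcount against PAGE_BALLOON_MAPCOUNT_VALUE (-256).
         */
        __SetPageBalloon(page);
        WARN_ON(!PageBalloon(page));
        __ClearPageBalloon(page);
}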
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index e1fe7cf5bddf..03f2a3e7d76d 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -3,6 +3,7 @@
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <linux/stacktrace.h> 5#include <linux/stacktrace.h>
6#include <linux/stackdepot.h>
6 7
7struct pglist_data; 8struct pglist_data;
8struct page_ext_operations { 9struct page_ext_operations {
@@ -44,9 +45,8 @@ struct page_ext {
44#ifdef CONFIG_PAGE_OWNER 45#ifdef CONFIG_PAGE_OWNER
45 unsigned int order; 46 unsigned int order;
46 gfp_t gfp_mask; 47 gfp_t gfp_mask;
47 unsigned int nr_entries;
48 int last_migrate_reason; 48 int last_migrate_reason;
49 unsigned long trace_entries[8]; 49 depot_stack_handle_t handle;
50#endif 50#endif
51}; 51};
52 52
diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index 46f1b939948c..30583ab0ffb1 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -10,7 +10,7 @@ extern struct page_ext_operations page_owner_ops;
10extern void __reset_page_owner(struct page *page, unsigned int order); 10extern void __reset_page_owner(struct page *page, unsigned int order);
11extern void __set_page_owner(struct page *page, 11extern void __set_page_owner(struct page *page,
12 unsigned int order, gfp_t gfp_mask); 12 unsigned int order, gfp_t gfp_mask);
13extern gfp_t __get_page_owner_gfp(struct page *page); 13extern void __split_page_owner(struct page *page, unsigned int order);
14extern void __copy_page_owner(struct page *oldpage, struct page *newpage); 14extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
15extern void __set_page_owner_migrate_reason(struct page *page, int reason); 15extern void __set_page_owner_migrate_reason(struct page *page, int reason);
16extern void __dump_page_owner(struct page *page); 16extern void __dump_page_owner(struct page *page);
@@ -28,12 +28,10 @@ static inline void set_page_owner(struct page *page,
28 __set_page_owner(page, order, gfp_mask); 28 __set_page_owner(page, order, gfp_mask);
29} 29}
30 30
31static inline gfp_t get_page_owner_gfp(struct page *page) 31static inline void split_page_owner(struct page *page, unsigned int order)
32{ 32{
33 if (static_branch_unlikely(&page_owner_inited)) 33 if (static_branch_unlikely(&page_owner_inited))
34 return __get_page_owner_gfp(page); 34 __split_page_owner(page, order);
35 else
36 return 0;
37} 35}
38static inline void copy_page_owner(struct page *oldpage, struct page *newpage) 36static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
39{ 37{
@@ -58,9 +56,9 @@ static inline void set_page_owner(struct page *page,
58 unsigned int order, gfp_t gfp_mask) 56 unsigned int order, gfp_t gfp_mask)
59{ 57{
60} 58}
61static inline gfp_t get_page_owner_gfp(struct page *page) 59static inline void split_page_owner(struct page *page,
60 unsigned int order)
62{ 61{
63 return 0;
64} 62}
65static inline void copy_page_owner(struct page *oldpage, struct page *newpage) 63static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
66{ 64{
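Illustration (not part of this commit): a caller that breaks a high-order page into order-0 pages used to re-derive the owner gfp mask and tag every tail page; the new interface is a single call. A hedged sketch of such a split path (example_split() is hypothetical; the real caller is split_page()):

static void example_split(struct page *page, unsigned int order)
{
	unsigned int i;

	/* was: gfp = get_page_owner_gfp(page);
	 *      set_page_owner(page + i, 0, gfp) for each tail page */
	split_page_owner(page, order);

	/* hypothetical follow-up: make each tail page a standalone order-0 page */
	for (i = 1; i < (1U << order); i++)
		set_page_count(page + i, 1);
}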
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 97354102794d..81363b834900 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -209,10 +209,10 @@ static inline struct page *page_cache_alloc_cold(struct address_space *x)
209 return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); 209 return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD);
210} 210}
211 211
212static inline struct page *page_cache_alloc_readahead(struct address_space *x) 212static inline gfp_t readahead_gfp_mask(struct address_space *x)
213{ 213{
214 return __page_cache_alloc(mapping_gfp_mask(x) | 214 return mapping_gfp_mask(x) |
215 __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN); 215 __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN;
216} 216}
217 217
218typedef int filler_t(void *, struct page *); 218typedef int filler_t(void *, struct page *);
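Illustration (not part of this commit): instead of the removed page_cache_alloc_readahead() wrapper, readahead code now fetches the gfp mask once and allocates pages directly. A minimal hedged sketch of such a loop body:

	gfp_t gfp_mask = readahead_gfp_mask(mapping);
	struct page *page;

	page = __page_cache_alloc(gfp_mask);
	if (!page)
		return -ENOMEM;
	/* ... then add_to_page_cache_lru(page, mapping, offset, gfp_mask) ... */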
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index eca6f626c16e..cbfee507c839 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -291,6 +291,7 @@ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
291 unsigned long first_index, unsigned int max_items); 291 unsigned long first_index, unsigned int max_items);
292int radix_tree_preload(gfp_t gfp_mask); 292int radix_tree_preload(gfp_t gfp_mask);
293int radix_tree_maybe_preload(gfp_t gfp_mask); 293int radix_tree_maybe_preload(gfp_t gfp_mask);
294int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order);
294void radix_tree_init(void); 295void radix_tree_init(void);
295void *radix_tree_tag_set(struct radix_tree_root *root, 296void *radix_tree_tag_set(struct radix_tree_root *root,
296 unsigned long index, unsigned int tag); 297 unsigned long index, unsigned int tag);
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 2b0fad83683f..b46bb5620a76 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -165,7 +165,7 @@ void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
165 unsigned long, int); 165 unsigned long, int);
166void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, 166void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
167 unsigned long, bool); 167 unsigned long, bool);
168void page_add_file_rmap(struct page *); 168void page_add_file_rmap(struct page *, bool);
169void page_remove_rmap(struct page *, bool); 169void page_remove_rmap(struct page *, bool);
170 170
171void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, 171void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 4d4780c00d34..ff078e7043b6 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -16,8 +16,9 @@ struct shmem_inode_info {
16 unsigned long flags; 16 unsigned long flags;
17 unsigned long alloced; /* data pages alloced to file */ 17 unsigned long alloced; /* data pages alloced to file */
18 unsigned long swapped; /* subtotal assigned to swap */ 18 unsigned long swapped; /* subtotal assigned to swap */
19 struct shared_policy policy; /* NUMA memory alloc policy */ 19 struct list_head shrinklist; /* shrinkable hpage inodes */
20 struct list_head swaplist; /* chain of maybes on swap */ 20 struct list_head swaplist; /* chain of maybes on swap */
21 struct shared_policy policy; /* NUMA memory alloc policy */
21 struct simple_xattrs xattrs; /* list of xattrs */ 22 struct simple_xattrs xattrs; /* list of xattrs */
22 struct inode vfs_inode; 23 struct inode vfs_inode;
23}; 24};
@@ -28,10 +29,14 @@ struct shmem_sb_info {
28 unsigned long max_inodes; /* How many inodes are allowed */ 29 unsigned long max_inodes; /* How many inodes are allowed */
29 unsigned long free_inodes; /* How many are left for allocation */ 30 unsigned long free_inodes; /* How many are left for allocation */
30 spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ 31 spinlock_t stat_lock; /* Serialize shmem_sb_info changes */
32 umode_t mode; /* Mount mode for root directory */
33 unsigned char huge; /* Whether to try for hugepages */
31 kuid_t uid; /* Mount uid for root directory */ 34 kuid_t uid; /* Mount uid for root directory */
32 kgid_t gid; /* Mount gid for root directory */ 35 kgid_t gid; /* Mount gid for root directory */
33 umode_t mode; /* Mount mode for root directory */
34 struct mempolicy *mpol; /* default memory policy for mappings */ 36 struct mempolicy *mpol; /* default memory policy for mappings */
37 spinlock_t shrinklist_lock; /* Protects shrinklist */
38 struct list_head shrinklist; /* List of shinkable inodes */
39 unsigned long shrinklist_len; /* Length of shrinklist */
35}; 40};
36 41
37static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) 42static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
@@ -49,6 +54,8 @@ extern struct file *shmem_file_setup(const char *name,
49extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, 54extern struct file *shmem_kernel_file_setup(const char *name, loff_t size,
50 unsigned long flags); 55 unsigned long flags);
51extern int shmem_zero_setup(struct vm_area_struct *); 56extern int shmem_zero_setup(struct vm_area_struct *);
57extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
58 unsigned long len, unsigned long pgoff, unsigned long flags);
52extern int shmem_lock(struct file *file, int lock, struct user_struct *user); 59extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
53extern bool shmem_mapping(struct address_space *mapping); 60extern bool shmem_mapping(struct address_space *mapping);
54extern void shmem_unlock_mapping(struct address_space *mapping); 61extern void shmem_unlock_mapping(struct address_space *mapping);
@@ -61,6 +68,19 @@ extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
61extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, 68extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
62 pgoff_t start, pgoff_t end); 69 pgoff_t start, pgoff_t end);
63 70
71/* Flag allocation requirements to shmem_getpage */
72enum sgp_type {
73 SGP_READ, /* don't exceed i_size, don't allocate page */
74 SGP_CACHE, /* don't exceed i_size, may allocate page */
75 SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */
76 SGP_HUGE, /* like SGP_CACHE, huge pages preferred */
77 SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
78 SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
79};
80
81extern int shmem_getpage(struct inode *inode, pgoff_t index,
82 struct page **pagep, enum sgp_type sgp);
83
64static inline struct page *shmem_read_mapping_page( 84static inline struct page *shmem_read_mapping_page(
65 struct address_space *mapping, pgoff_t index) 85 struct address_space *mapping, pgoff_t index)
66{ 86{
@@ -68,6 +88,18 @@ static inline struct page *shmem_read_mapping_page(
68 mapping_gfp_mask(mapping)); 88 mapping_gfp_mask(mapping));
69} 89}
70 90
91static inline bool shmem_file(struct file *file)
92{
93 if (!IS_ENABLED(CONFIG_SHMEM))
94 return false;
95 if (!file || !file->f_mapping)
96 return false;
97 return shmem_mapping(file->f_mapping);
98}
99
100extern bool shmem_charge(struct inode *inode, long pages);
101extern void shmem_uncharge(struct inode *inode, long pages);
102
71#ifdef CONFIG_TMPFS 103#ifdef CONFIG_TMPFS
72 104
73extern int shmem_add_seals(struct file *file, unsigned int seals); 105extern int shmem_add_seals(struct file *file, unsigned int seals);
@@ -83,4 +115,13 @@ static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a)
83 115
84#endif 116#endif
85 117
118#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
119extern bool shmem_huge_enabled(struct vm_area_struct *vma);
120#else
121static inline bool shmem_huge_enabled(struct vm_area_struct *vma)
122{
123 return false;
124}
125#endif
126
86#endif 127#endif
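Illustration (not part of this commit): a hypothetical user of the new shmem helpers — verify the file is tmpfs-backed, then look up or allocate the page at a given index, preferring a huge page:

static int example_shmem_lookup(struct file *file, pgoff_t index,
				struct page **pagep)
{
	if (!shmem_file(file))
		return -EINVAL;

	/* SGP_HUGE: like SGP_CACHE, but huge pages preferred (see enum above) */
	return shmem_getpage(file_inode(file), index, pagep, SGP_HUGE);
}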
diff --git a/include/linux/slab.h b/include/linux/slab.h
index aeb3e6d00a66..1a4ea551aae5 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -565,6 +565,8 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
565{ 565{
566 if (size != 0 && n > SIZE_MAX / size) 566 if (size != 0 && n > SIZE_MAX / size)
567 return NULL; 567 return NULL;
568 if (__builtin_constant_p(n) && __builtin_constant_p(size))
569 return kmalloc(n * size, flags);
568 return __kmalloc(n * size, flags); 570 return __kmalloc(n * size, flags);
569} 571}
570 572
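Illustration (not part of this commit): when both arguments are compile-time constants, kmalloc_array() now folds into kmalloc() with a constant size, so the allocation hits the constant-size fast path. A minimal usage sketch (example_entry is hypothetical):

struct example_entry {
	u64 key;
	u64 val;
};

static struct example_entry *example_alloc_table(void)
{
	/* 16 and sizeof(*tbl) are constants, so this compiles down to
	 * kmalloc(16 * sizeof(struct example_entry), GFP_KERNEL) */
	struct example_entry *tbl;

	tbl = kmalloc_array(16, sizeof(*tbl), GFP_KERNEL);
	return tbl;	/* may be NULL */
}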
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 8694f7a5d92b..339ba027ade9 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -81,7 +81,7 @@ struct kmem_cache {
81#endif 81#endif
82 82
83#ifdef CONFIG_SLAB_FREELIST_RANDOM 83#ifdef CONFIG_SLAB_FREELIST_RANDOM
84 void *random_seq; 84 unsigned int *random_seq;
85#endif 85#endif
86 86
87 struct kmem_cache_node *node[MAX_NUMNODES]; 87 struct kmem_cache_node *node[MAX_NUMNODES];
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index d1faa019c02a..5624c1f3eb0a 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -99,6 +99,11 @@ struct kmem_cache {
99 */ 99 */
100 int remote_node_defrag_ratio; 100 int remote_node_defrag_ratio;
101#endif 101#endif
102
103#ifdef CONFIG_SLAB_FREELIST_RANDOM
104 unsigned int *random_seq;
105#endif
106
102 struct kmem_cache_node *node[MAX_NUMNODES]; 107 struct kmem_cache_node *node[MAX_NUMNODES];
103}; 108};
104 109
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 587480ad41b7..dd66a952e8cd 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -27,8 +27,7 @@
27#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) 27#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
28#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) 28#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)
29 29
30extern int handle_userfault(struct vm_area_struct *vma, unsigned long address, 30extern int handle_userfault(struct fault_env *fe, unsigned long reason);
31 unsigned int flags, unsigned long reason);
32 31
33extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, 32extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
34 unsigned long src_start, unsigned long len); 33 unsigned long src_start, unsigned long len);
@@ -56,10 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
56#else /* CONFIG_USERFAULTFD */ 55#else /* CONFIG_USERFAULTFD */
57 56
58/* mm helpers */ 57/* mm helpers */
59static inline int handle_userfault(struct vm_area_struct *vma, 58static inline int handle_userfault(struct fault_env *fe, unsigned long reason)
60 unsigned long address,
61 unsigned int flags,
62 unsigned long reason)
63{ 59{
64 return VM_FAULT_SIGBUS; 60 return VM_FAULT_SIGBUS;
65} 61}
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index ec084321fe09..42604173f122 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -70,6 +70,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
70 THP_FAULT_FALLBACK, 70 THP_FAULT_FALLBACK,
71 THP_COLLAPSE_ALLOC, 71 THP_COLLAPSE_ALLOC,
72 THP_COLLAPSE_ALLOC_FAILED, 72 THP_COLLAPSE_ALLOC_FAILED,
73 THP_FILE_ALLOC,
74 THP_FILE_MAPPED,
73 THP_SPLIT_PAGE, 75 THP_SPLIT_PAGE,
74 THP_SPLIT_PAGE_FAILED, 76 THP_SPLIT_PAGE_FAILED,
75 THP_DEFERRED_SPLIT_PAGE, 77 THP_DEFERRED_SPLIT_PAGE,
@@ -100,4 +102,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
100 NR_VM_EVENT_ITEMS 102 NR_VM_EVENT_ITEMS
101}; 103};
102 104
105#ifndef CONFIG_TRANSPARENT_HUGEPAGE
106#define THP_FILE_ALLOC ({ BUILD_BUG(); 0; })
107#define THP_FILE_MAPPED ({ BUILD_BUG(); 0; })
108#endif
109
103#endif /* VM_EVENT_ITEM_H_INCLUDED */ 110#endif /* VM_EVENT_ITEM_H_INCLUDED */
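Illustration (not part of this commit): with THP compiled out, THP_FILE_ALLOC and THP_FILE_MAPPED expand to ({ BUILD_BUG(); 0; }), so any use must live in code the compiler can eliminate when CONFIG_TRANSPARENT_HUGEPAGE=n. A hedged sketch of the expected pattern (is_huge is a hypothetical flag from the caller):

static void example_account_file_thp(bool is_huge)
{
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && is_huge)
		count_vm_event(THP_FILE_ALLOC);	/* branch (and BUILD_BUG) vanishes
						 * when huge pagecache is disabled */
}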
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d0b5ca5d4e08..717e6149e753 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -384,4 +384,7 @@ void tag_pages_for_writeback(struct address_space *mapping,
384 384
385void account_page_redirty(struct page *page); 385void account_page_redirty(struct page *page);
386 386
387void sb_mark_inode_writeback(struct inode *inode);
388void sb_clear_inode_writeback(struct inode *inode);
389
387#endif /* WRITEBACK_H */ 390#endif /* WRITEBACK_H */
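Illustration (not part of this commit), and only a guess at the intended pairing: sb_mark_inode_writeback() appears meant to be called when an inode gains its first page under writeback, and sb_clear_inode_writeback() when the last one completes, so sync can walk a per-superblock list instead of every inode. A trivial hedged sketch:

static void example_writeback_started(struct inode *inode)
{
	/* assumption: first page of this inode just went under writeback */
	sb_mark_inode_writeback(inode);
}

static void example_writeback_finished(struct inode *inode)
{
	/* assumption: last writeback page of this inode just completed */
	sb_clear_inode_writeback(inode);
}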
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
index 551ba4acde4d..04f58acda8e8 100644
--- a/include/trace/events/huge_memory.h
+++ b/include/trace/events/huge_memory.h
@@ -13,7 +13,7 @@
13 EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ 13 EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \
14 EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ 14 EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \
15 EM( SCAN_PAGE_RO, "no_writable_page") \ 15 EM( SCAN_PAGE_RO, "no_writable_page") \
16 EM( SCAN_NO_REFERENCED_PAGE, "no_referenced_page") \ 16 EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \
17 EM( SCAN_PAGE_NULL, "page_null") \ 17 EM( SCAN_PAGE_NULL, "page_null") \
18 EM( SCAN_SCAN_ABORT, "scan_aborted") \ 18 EM( SCAN_SCAN_ABORT, "scan_aborted") \
19 EM( SCAN_PAGE_COUNT, "not_suitable_page_count") \ 19 EM( SCAN_PAGE_COUNT, "not_suitable_page_count") \
@@ -28,7 +28,9 @@
28 EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \ 28 EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \
29 EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\ 29 EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\
30 EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ 30 EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \
31 EMe( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") 31 EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \
32 EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \
33 EMe(SCAN_TRUNCATED, "truncated") \
32 34
33#undef EM 35#undef EM
34#undef EMe 36#undef EMe
@@ -45,17 +47,18 @@ SCAN_STATUS
45TRACE_EVENT(mm_khugepaged_scan_pmd, 47TRACE_EVENT(mm_khugepaged_scan_pmd,
46 48
47 TP_PROTO(struct mm_struct *mm, struct page *page, bool writable, 49 TP_PROTO(struct mm_struct *mm, struct page *page, bool writable,
48 bool referenced, int none_or_zero, int status), 50 int referenced, int none_or_zero, int status, int unmapped),
49 51
50 TP_ARGS(mm, page, writable, referenced, none_or_zero, status), 52 TP_ARGS(mm, page, writable, referenced, none_or_zero, status, unmapped),
51 53
52 TP_STRUCT__entry( 54 TP_STRUCT__entry(
53 __field(struct mm_struct *, mm) 55 __field(struct mm_struct *, mm)
54 __field(unsigned long, pfn) 56 __field(unsigned long, pfn)
55 __field(bool, writable) 57 __field(bool, writable)
56 __field(bool, referenced) 58 __field(int, referenced)
57 __field(int, none_or_zero) 59 __field(int, none_or_zero)
58 __field(int, status) 60 __field(int, status)
61 __field(int, unmapped)
59 ), 62 ),
60 63
61 TP_fast_assign( 64 TP_fast_assign(
@@ -65,15 +68,17 @@ TRACE_EVENT(mm_khugepaged_scan_pmd,
65 __entry->referenced = referenced; 68 __entry->referenced = referenced;
66 __entry->none_or_zero = none_or_zero; 69 __entry->none_or_zero = none_or_zero;
67 __entry->status = status; 70 __entry->status = status;
71 __entry->unmapped = unmapped;
68 ), 72 ),
69 73
70 TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s", 74 TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d",
71 __entry->mm, 75 __entry->mm,
72 __entry->pfn, 76 __entry->pfn,
73 __entry->writable, 77 __entry->writable,
74 __entry->referenced, 78 __entry->referenced,
75 __entry->none_or_zero, 79 __entry->none_or_zero,
76 __print_symbolic(__entry->status, SCAN_STATUS)) 80 __print_symbolic(__entry->status, SCAN_STATUS),
81 __entry->unmapped)
77); 82);
78 83
79TRACE_EVENT(mm_collapse_huge_page, 84TRACE_EVENT(mm_collapse_huge_page,
@@ -103,14 +108,14 @@ TRACE_EVENT(mm_collapse_huge_page,
103TRACE_EVENT(mm_collapse_huge_page_isolate, 108TRACE_EVENT(mm_collapse_huge_page_isolate,
104 109
105 TP_PROTO(struct page *page, int none_or_zero, 110 TP_PROTO(struct page *page, int none_or_zero,
106 bool referenced, bool writable, int status), 111 int referenced, bool writable, int status),
107 112
108 TP_ARGS(page, none_or_zero, referenced, writable, status), 113 TP_ARGS(page, none_or_zero, referenced, writable, status),
109 114
110 TP_STRUCT__entry( 115 TP_STRUCT__entry(
111 __field(unsigned long, pfn) 116 __field(unsigned long, pfn)
112 __field(int, none_or_zero) 117 __field(int, none_or_zero)
113 __field(bool, referenced) 118 __field(int, referenced)
114 __field(bool, writable) 119 __field(bool, writable)
115 __field(int, status) 120 __field(int, status)
116 ), 121 ),
@@ -131,5 +136,32 @@ TRACE_EVENT(mm_collapse_huge_page_isolate,
131 __print_symbolic(__entry->status, SCAN_STATUS)) 136 __print_symbolic(__entry->status, SCAN_STATUS))
132); 137);
133 138
139TRACE_EVENT(mm_collapse_huge_page_swapin,
140
141 TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret),
142
143 TP_ARGS(mm, swapped_in, referenced, ret),
144
145 TP_STRUCT__entry(
146 __field(struct mm_struct *, mm)
147 __field(int, swapped_in)
148 __field(int, referenced)
149 __field(int, ret)
150 ),
151
152 TP_fast_assign(
153 __entry->mm = mm;
154 __entry->swapped_in = swapped_in;
155 __entry->referenced = referenced;
156 __entry->ret = ret;
157 ),
158
159 TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d",
160 __entry->mm,
161 __entry->swapped_in,
162 __entry->referenced,
163 __entry->ret)
164);
165
134#endif /* __HUGE_MEMORY_H */ 166#endif /* __HUGE_MEMORY_H */
135#include <trace/define_trace.h> 167#include <trace/define_trace.h>
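Illustration (not part of this commit): emitting the new swap-in event from a collapse helper looks roughly like this (the wrapper is hypothetical; the real caller lives in khugepaged):

static bool example_trace_swapin(struct mm_struct *mm, int swapped_in,
				 int referenced, bool ok)
{
	trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, ok ? 1 : 0);
	return ok;
}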
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 73614ce1d204..531f5811ff6b 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -696,7 +696,7 @@ DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
696 TP_ARGS(inode, wbc, nr_to_write) 696 TP_ARGS(inode, wbc, nr_to_write)
697); 697);
698 698
699DECLARE_EVENT_CLASS(writeback_lazytime_template, 699DECLARE_EVENT_CLASS(writeback_inode_template,
700 TP_PROTO(struct inode *inode), 700 TP_PROTO(struct inode *inode),
701 701
702 TP_ARGS(inode), 702 TP_ARGS(inode),
@@ -723,25 +723,39 @@ DECLARE_EVENT_CLASS(writeback_lazytime_template,
723 show_inode_state(__entry->state), __entry->mode) 723 show_inode_state(__entry->state), __entry->mode)
724); 724);
725 725
726DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime, 726DEFINE_EVENT(writeback_inode_template, writeback_lazytime,
727 TP_PROTO(struct inode *inode), 727 TP_PROTO(struct inode *inode),
728 728
729 TP_ARGS(inode) 729 TP_ARGS(inode)
730); 730);
731 731
732DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime_iput, 732DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput,
733 TP_PROTO(struct inode *inode), 733 TP_PROTO(struct inode *inode),
734 734
735 TP_ARGS(inode) 735 TP_ARGS(inode)
736); 736);
737 737
738DEFINE_EVENT(writeback_lazytime_template, writeback_dirty_inode_enqueue, 738DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue,
739 739
740 TP_PROTO(struct inode *inode), 740 TP_PROTO(struct inode *inode),
741 741
742 TP_ARGS(inode) 742 TP_ARGS(inode)
743); 743);
744 744
745/*
746 * Inode writeback list tracking.
747 */
748
749DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback,
750 TP_PROTO(struct inode *inode),
751 TP_ARGS(inode)
752);
753
754DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback,
755 TP_PROTO(struct inode *inode),
756 TP_ARGS(inode)
757);
758
745#endif /* _TRACE_WRITEBACK_H */ 759#endif /* _TRACE_WRITEBACK_H */
746 760
747/* This part must be outside protection */ 761/* This part must be outside protection */
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 546b38886e11..e398beac67b8 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -80,5 +80,7 @@
80#define BPF_FS_MAGIC 0xcafe4a11 80#define BPF_FS_MAGIC 0xcafe4a11
81/* Since UDF 2.01 is ISO 13346 based... */ 81/* Since UDF 2.01 is ISO 13346 based... */
82#define UDF_SUPER_MAGIC 0x15013346 82#define UDF_SUPER_MAGIC 0x15013346
83#define BALLOON_KVM_MAGIC 0x13661366
84#define ZSMALLOC_MAGIC 0x58295829
83 85
84#endif /* __LINUX_MAGIC_H__ */ 86#endif /* __LINUX_MAGIC_H__ */
diff --git a/init/Kconfig b/init/Kconfig
index 557bdf10cd44..504057925ee9 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1786,10 +1786,10 @@ endchoice
1786 1786
1787config SLAB_FREELIST_RANDOM 1787config SLAB_FREELIST_RANDOM
1788 default n 1788 default n
1789 depends on SLAB 1789 depends on SLAB || SLUB
1790 bool "SLAB freelist randomization" 1790 bool "SLAB freelist randomization"
1791 help 1791 help
1792 Randomizes the freelist order used on creating new SLABs. This 1792 Randomizes the freelist order used on creating new pages. This
1793 security feature reduces the predictability of the kernel slab 1793 security feature reduces the predictability of the kernel slab
1794 allocator against heap overflows. 1794 allocator against heap overflows.
1795 1795
diff --git a/ipc/shm.c b/ipc/shm.c
index 13282510bc0d..dbac8860c721 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -476,13 +476,15 @@ static const struct file_operations shm_file_operations = {
476 .mmap = shm_mmap, 476 .mmap = shm_mmap,
477 .fsync = shm_fsync, 477 .fsync = shm_fsync,
478 .release = shm_release, 478 .release = shm_release,
479#ifndef CONFIG_MMU
480 .get_unmapped_area = shm_get_unmapped_area, 479 .get_unmapped_area = shm_get_unmapped_area,
481#endif
482 .llseek = noop_llseek, 480 .llseek = noop_llseek,
483 .fallocate = shm_fallocate, 481 .fallocate = shm_fallocate,
484}; 482};
485 483
484/*
485 * shm_file_operations_huge is now identical to shm_file_operations,
486 * but we keep it distinct for the sake of is_file_shm_hugepages().
487 */
486static const struct file_operations shm_file_operations_huge = { 488static const struct file_operations shm_file_operations_huge = {
487 .mmap = shm_mmap, 489 .mmap = shm_mmap,
488 .fsync = shm_fsync, 490 .fsync = shm_fsync,
@@ -764,10 +766,10 @@ static void shm_add_rss_swap(struct shmid_kernel *shp,
764 } else { 766 } else {
765#ifdef CONFIG_SHMEM 767#ifdef CONFIG_SHMEM
766 struct shmem_inode_info *info = SHMEM_I(inode); 768 struct shmem_inode_info *info = SHMEM_I(inode);
767 spin_lock(&info->lock); 769 spin_lock_irq(&info->lock);
768 *rss_add += inode->i_mapping->nrpages; 770 *rss_add += inode->i_mapping->nrpages;
769 *swp_add += info->swapped; 771 *swp_add += info->swapped;
770 spin_unlock(&info->lock); 772 spin_unlock_irq(&info->lock);
771#else 773#else
772 *rss_add += inode->i_mapping->nrpages; 774 *rss_add += inode->i_mapping->nrpages;
773#endif 775#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 4a7ec0c6c88c..de21f25e0d2c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -162,8 +162,8 @@ void __weak arch_release_thread_stack(unsigned long *stack)
162static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, 162static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
163 int node) 163 int node)
164{ 164{
165 struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, 165 struct page *page = alloc_pages_node(node, THREADINFO_GFP,
166 THREAD_SIZE_ORDER); 166 THREAD_SIZE_ORDER);
167 167
168 if (page) 168 if (page)
169 memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, 169 memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
@@ -178,7 +178,7 @@ static inline void free_thread_stack(unsigned long *stack)
178 178
179 memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, 179 memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,
180 -(1 << THREAD_SIZE_ORDER)); 180 -(1 << THREAD_SIZE_ORDER));
181 __free_kmem_pages(page, THREAD_SIZE_ORDER); 181 __free_pages(page, THREAD_SIZE_ORDER);
182} 182}
183# else 183# else
184static struct kmem_cache *thread_stack_cache; 184static struct kmem_cache *thread_stack_cache;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 805b7048a1bd..f07842e2d69f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -244,6 +244,7 @@ config PAGE_OWNER
244 depends on DEBUG_KERNEL && STACKTRACE_SUPPORT 244 depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
245 select DEBUG_FS 245 select DEBUG_FS
246 select STACKTRACE 246 select STACKTRACE
247 select STACKDEPOT
247 select PAGE_EXTENSION 248 select PAGE_EXTENSION
248 help 249 help
249 This keeps track of what call chain is the owner of a page, may 250 This keeps track of what call chain is the owner of a page, may
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 51a76af25c66..fcfa1939ac41 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -253,6 +253,7 @@ static int hash_fn(struct dma_debug_entry *entry)
253 */ 253 */
254static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, 254static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry,
255 unsigned long *flags) 255 unsigned long *flags)
256 __acquires(&dma_entry_hash[idx].lock)
256{ 257{
257 int idx = hash_fn(entry); 258 int idx = hash_fn(entry);
258 unsigned long __flags; 259 unsigned long __flags;
@@ -267,6 +268,7 @@ static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry,
267 */ 268 */
268static void put_hash_bucket(struct hash_bucket *bucket, 269static void put_hash_bucket(struct hash_bucket *bucket,
269 unsigned long *flags) 270 unsigned long *flags)
271 __releases(&bucket->lock)
270{ 272{
271 unsigned long __flags = *flags; 273 unsigned long __flags = *flags;
272 274
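Illustration (not part of this commit): __acquires()/__releases() are sparse lock-context annotations; they tell sparse that a function returns with a lock held (or drops one it did not take), silencing context-imbalance warnings. A minimal hedged example of the same pattern:

static void example_lock_bucket(spinlock_t *lock, unsigned long *flags)
	__acquires(lock)
{
	unsigned long f;

	spin_lock_irqsave(lock, f);
	*flags = f;
}

static void example_unlock_bucket(spinlock_t *lock, unsigned long *flags)
	__releases(lock)
{
	spin_unlock_irqrestore(lock, *flags);
}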
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 8b7d8459bb9d..61b8fb529cef 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -38,6 +38,9 @@
38#include <linux/preempt.h> /* in_interrupt() */ 38#include <linux/preempt.h> /* in_interrupt() */
39 39
40 40
41/* Number of nodes in fully populated tree of given height */
42static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
43
41/* 44/*
42 * Radix tree node cache. 45 * Radix tree node cache.
43 */ 46 */
@@ -342,7 +345,7 @@ radix_tree_node_free(struct radix_tree_node *node)
342 * To make use of this facility, the radix tree must be initialised without 345 * To make use of this facility, the radix tree must be initialised without
343 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). 346 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
344 */ 347 */
345static int __radix_tree_preload(gfp_t gfp_mask) 348static int __radix_tree_preload(gfp_t gfp_mask, int nr)
346{ 349{
347 struct radix_tree_preload *rtp; 350 struct radix_tree_preload *rtp;
348 struct radix_tree_node *node; 351 struct radix_tree_node *node;
@@ -350,14 +353,14 @@ static int __radix_tree_preload(gfp_t gfp_mask)
350 353
351 preempt_disable(); 354 preempt_disable();
352 rtp = this_cpu_ptr(&radix_tree_preloads); 355 rtp = this_cpu_ptr(&radix_tree_preloads);
353 while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { 356 while (rtp->nr < nr) {
354 preempt_enable(); 357 preempt_enable();
355 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); 358 node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
356 if (node == NULL) 359 if (node == NULL)
357 goto out; 360 goto out;
358 preempt_disable(); 361 preempt_disable();
359 rtp = this_cpu_ptr(&radix_tree_preloads); 362 rtp = this_cpu_ptr(&radix_tree_preloads);
360 if (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { 363 if (rtp->nr < nr) {
361 node->private_data = rtp->nodes; 364 node->private_data = rtp->nodes;
362 rtp->nodes = node; 365 rtp->nodes = node;
363 rtp->nr++; 366 rtp->nr++;
@@ -383,7 +386,7 @@ int radix_tree_preload(gfp_t gfp_mask)
383{ 386{
384 /* Warn on non-sensical use... */ 387 /* Warn on non-sensical use... */
385 WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); 388 WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
386 return __radix_tree_preload(gfp_mask); 389 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
387} 390}
388EXPORT_SYMBOL(radix_tree_preload); 391EXPORT_SYMBOL(radix_tree_preload);
389 392
@@ -395,7 +398,7 @@ EXPORT_SYMBOL(radix_tree_preload);
395int radix_tree_maybe_preload(gfp_t gfp_mask) 398int radix_tree_maybe_preload(gfp_t gfp_mask)
396{ 399{
397 if (gfpflags_allow_blocking(gfp_mask)) 400 if (gfpflags_allow_blocking(gfp_mask))
398 return __radix_tree_preload(gfp_mask); 401 return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
399 /* Preloading doesn't help anything with this gfp mask, skip it */ 402 /* Preloading doesn't help anything with this gfp mask, skip it */
400 preempt_disable(); 403 preempt_disable();
401 return 0; 404 return 0;
@@ -403,6 +406,51 @@ int radix_tree_maybe_preload(gfp_t gfp_mask)
403EXPORT_SYMBOL(radix_tree_maybe_preload); 406EXPORT_SYMBOL(radix_tree_maybe_preload);
404 407
405/* 408/*
409 * The same as function above, but preload number of nodes required to insert
410 * (1 << order) continuous naturally-aligned elements.
411 */
412int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
413{
414 unsigned long nr_subtrees;
415 int nr_nodes, subtree_height;
416
417 /* Preloading doesn't help anything with this gfp mask, skip it */
418 if (!gfpflags_allow_blocking(gfp_mask)) {
419 preempt_disable();
420 return 0;
421 }
422
423 /*
424 * Calculate number and height of fully populated subtrees it takes to
425 * store (1 << order) elements.
426 */
427 nr_subtrees = 1 << order;
428 for (subtree_height = 0; nr_subtrees > RADIX_TREE_MAP_SIZE;
429 subtree_height++)
430 nr_subtrees >>= RADIX_TREE_MAP_SHIFT;
431
432 /*
433 * The worst case is zero height tree with a single item at index 0 and
434 * then inserting items starting at ULONG_MAX - (1 << order).
435 *
436 * This requires RADIX_TREE_MAX_PATH nodes to build branch from root to
437 * 0-index item.
438 */
439 nr_nodes = RADIX_TREE_MAX_PATH;
440
441 /* Plus branch to fully populated subtrees. */
442 nr_nodes += RADIX_TREE_MAX_PATH - subtree_height;
443
444 /* Root node is shared. */
445 nr_nodes--;
446
447 /* Plus nodes required to build subtrees. */
448 nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height];
449
450 return __radix_tree_preload(gfp_mask, nr_nodes);
451}
452
453/*
406 * The maximum index which can be stored in a radix tree 454 * The maximum index which can be stored in a radix tree
407 */ 455 */
408static inline unsigned long shift_maxindex(unsigned int shift) 456static inline unsigned long shift_maxindex(unsigned int shift)
@@ -1571,6 +1619,31 @@ radix_tree_node_ctor(void *arg)
1571 INIT_LIST_HEAD(&node->private_list); 1619 INIT_LIST_HEAD(&node->private_list);
1572} 1620}
1573 1621
1622static __init unsigned long __maxindex(unsigned int height)
1623{
1624 unsigned int width = height * RADIX_TREE_MAP_SHIFT;
1625 int shift = RADIX_TREE_INDEX_BITS - width;
1626
1627 if (shift < 0)
1628 return ~0UL;
1629 if (shift >= BITS_PER_LONG)
1630 return 0UL;
1631 return ~0UL >> shift;
1632}
1633
1634static __init void radix_tree_init_maxnodes(void)
1635{
1636 unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1];
1637 unsigned int i, j;
1638
1639 for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
1640 height_to_maxindex[i] = __maxindex(i);
1641 for (i = 0; i < ARRAY_SIZE(height_to_maxnodes); i++) {
1642 for (j = i; j > 0; j--)
1643 height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1;
1644 }
1645}
1646
1574static int radix_tree_callback(struct notifier_block *nfb, 1647static int radix_tree_callback(struct notifier_block *nfb,
1575 unsigned long action, void *hcpu) 1648 unsigned long action, void *hcpu)
1576{ 1649{
@@ -1597,5 +1670,6 @@ void __init radix_tree_init(void)
1597 sizeof(struct radix_tree_node), 0, 1670 sizeof(struct radix_tree_node), 0,
1598 SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, 1671 SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
1599 radix_tree_node_ctor); 1672 radix_tree_node_ctor);
1673 radix_tree_init_maxnodes();
1600 hotcpu_notifier(radix_tree_callback, 0); 1674 hotcpu_notifier(radix_tree_callback, 0);
1601} 1675}
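Illustration (not part of this commit): a hedged sketch of how the order-aware preload is meant to be used before inserting a naturally aligned huge-page entry spanning 1 << order slots (the real caller is the huge-tmpfs page-cache insertion path):

	int err;

	err = radix_tree_maybe_preload_order(GFP_KERNEL, HPAGE_PMD_ORDER);
	if (err)
		return err;

	/* ... insert the (1 << HPAGE_PMD_ORDER) slots under tree_lock ... */

	radix_tree_preload_end();	/* re-enables preemption */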
diff --git a/mm/Kconfig b/mm/Kconfig
index 3e2daef3c946..3c81803b00a3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -440,6 +440,14 @@ choice
440endchoice 440endchoice
441 441
442# 442#
443# We don't deposit page tables on file THP mapping,
444# but Power makes use of them to address MMU quirk.
445#
446config TRANSPARENT_HUGE_PAGECACHE
447 def_bool y
448 depends on TRANSPARENT_HUGEPAGE && !PPC
449
450#
443# UP and nommu archs use km based percpu allocator 451# UP and nommu archs use km based percpu allocator
444# 452#
445config NEED_PER_CPU_KM 453config NEED_PER_CPU_KM
diff --git a/mm/Makefile b/mm/Makefile
index 78c6f7dedb83..fc059666c760 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -74,7 +74,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
74obj-$(CONFIG_MEMTEST) += memtest.o 74obj-$(CONFIG_MEMTEST) += memtest.o
75obj-$(CONFIG_MIGRATION) += migrate.o 75obj-$(CONFIG_MIGRATION) += migrate.o
76obj-$(CONFIG_QUICKLIST) += quicklist.o 76obj-$(CONFIG_QUICKLIST) += quicklist.o
77obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 77obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
78obj-$(CONFIG_PAGE_COUNTER) += page_counter.o 78obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
79obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o 79obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
80obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o 80obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 57b3e9bd6bc5..da91df50ba31 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -70,7 +70,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
70 */ 70 */
71 if (trylock_page(page)) { 71 if (trylock_page(page)) {
72#ifdef CONFIG_BALLOON_COMPACTION 72#ifdef CONFIG_BALLOON_COMPACTION
73 if (!PagePrivate(page)) { 73 if (PageIsolated(page)) {
74 /* raced with isolation */ 74 /* raced with isolation */
75 unlock_page(page); 75 unlock_page(page);
76 continue; 76 continue;
@@ -106,110 +106,50 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue);
106 106
107#ifdef CONFIG_BALLOON_COMPACTION 107#ifdef CONFIG_BALLOON_COMPACTION
108 108
109static inline void __isolate_balloon_page(struct page *page) 109bool balloon_page_isolate(struct page *page, isolate_mode_t mode)
110
110{ 111{
111 struct balloon_dev_info *b_dev_info = balloon_page_device(page); 112 struct balloon_dev_info *b_dev_info = balloon_page_device(page);
112 unsigned long flags; 113 unsigned long flags;
113 114
114 spin_lock_irqsave(&b_dev_info->pages_lock, flags); 115 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
115 ClearPagePrivate(page);
116 list_del(&page->lru); 116 list_del(&page->lru);
117 b_dev_info->isolated_pages++; 117 b_dev_info->isolated_pages++;
118 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); 118 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
119
120 return true;
119} 121}
120 122
121static inline void __putback_balloon_page(struct page *page) 123void balloon_page_putback(struct page *page)
122{ 124{
123 struct balloon_dev_info *b_dev_info = balloon_page_device(page); 125 struct balloon_dev_info *b_dev_info = balloon_page_device(page);
124 unsigned long flags; 126 unsigned long flags;
125 127
126 spin_lock_irqsave(&b_dev_info->pages_lock, flags); 128 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
127 SetPagePrivate(page);
128 list_add(&page->lru, &b_dev_info->pages); 129 list_add(&page->lru, &b_dev_info->pages);
129 b_dev_info->isolated_pages--; 130 b_dev_info->isolated_pages--;
130 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); 131 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
131} 132}
132 133
133/* __isolate_lru_page() counterpart for a ballooned page */
134bool balloon_page_isolate(struct page *page)
135{
136 /*
137 * Avoid burning cycles with pages that are yet under __free_pages(),
138 * or just got freed under us.
139 *
140 * In case we 'win' a race for a balloon page being freed under us and
141 * raise its refcount preventing __free_pages() from doing its job
142 * the put_page() at the end of this block will take care of
143 * release this page, thus avoiding a nasty leakage.
144 */
145 if (likely(get_page_unless_zero(page))) {
146 /*
147 * As balloon pages are not isolated from LRU lists, concurrent
148 * compaction threads can race against page migration functions
149 * as well as race against the balloon driver releasing a page.
150 *
151 * In order to avoid having an already isolated balloon page
152 * being (wrongly) re-isolated while it is under migration,
153 * or to avoid attempting to isolate pages being released by
154 * the balloon driver, lets be sure we have the page lock
155 * before proceeding with the balloon page isolation steps.
156 */
157 if (likely(trylock_page(page))) {
158 /*
159 * A ballooned page, by default, has PagePrivate set.
160 * Prevent concurrent compaction threads from isolating
161 * an already isolated balloon page by clearing it.
162 */
163 if (balloon_page_movable(page)) {
164 __isolate_balloon_page(page);
165 unlock_page(page);
166 return true;
167 }
168 unlock_page(page);
169 }
170 put_page(page);
171 }
172 return false;
173}
174
175/* putback_lru_page() counterpart for a ballooned page */
176void balloon_page_putback(struct page *page)
177{
178 /*
179 * 'lock_page()' stabilizes the page and prevents races against
180 * concurrent isolation threads attempting to re-isolate it.
181 */
182 lock_page(page);
183
184 if (__is_movable_balloon_page(page)) {
185 __putback_balloon_page(page);
186 /* drop the extra ref count taken for page isolation */
187 put_page(page);
188 } else {
189 WARN_ON(1);
190 dump_page(page, "not movable balloon page");
191 }
192 unlock_page(page);
193}
194 134
195/* move_to_new_page() counterpart for a ballooned page */ 135/* move_to_new_page() counterpart for a ballooned page */
196int balloon_page_migrate(struct page *newpage, 136int balloon_page_migrate(struct address_space *mapping,
197 struct page *page, enum migrate_mode mode) 137 struct page *newpage, struct page *page,
138 enum migrate_mode mode)
198{ 139{
199 struct balloon_dev_info *balloon = balloon_page_device(page); 140 struct balloon_dev_info *balloon = balloon_page_device(page);
200 int rc = -EAGAIN;
201 141
202 VM_BUG_ON_PAGE(!PageLocked(page), page); 142 VM_BUG_ON_PAGE(!PageLocked(page), page);
203 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 143 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
204 144
205 if (WARN_ON(!__is_movable_balloon_page(page))) { 145 return balloon->migratepage(balloon, newpage, page, mode);
206 dump_page(page, "not movable balloon page"); 146}
207 return rc;
208 }
209 147
210 if (balloon && balloon->migratepage) 148const struct address_space_operations balloon_aops = {
211 rc = balloon->migratepage(balloon, newpage, page, mode); 149 .migratepage = balloon_page_migrate,
150 .isolate_page = balloon_page_isolate,
151 .putback_page = balloon_page_putback,
152};
153EXPORT_SYMBOL_GPL(balloon_aops);
212 154
213 return rc;
214}
215#endif /* CONFIG_BALLOON_COMPACTION */ 155#endif /* CONFIG_BALLOON_COMPACTION */
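Illustration (not part of this commit): under the new scheme a balloon driver no longer marks pages with PagePrivate; instead it supplies a migratepage callback and points the pages' mapping at an inode whose a_ops is the exported balloon_aops. A hedged driver-side sketch (all example_* names are hypothetical, and the inode field on balloon_dev_info is assumed to be added elsewhere in this series):

static int example_balloon_migratepage(struct balloon_dev_info *b_dev_info,
				       struct page *newpage, struct page *page,
				       enum migrate_mode mode)
{
	/* device-specific inflate/deflate and copy work goes here */
	return MIGRATEPAGE_SUCCESS;
}

static void example_balloon_setup(struct balloon_dev_info *b_dev_info,
				  struct inode *inode)
{
	balloon_devinfo_init(b_dev_info);
	b_dev_info->migratepage = example_balloon_migratepage;
	b_dev_info->inode = inode;			/* inode from the driver's pseudo fs */
	inode->i_mapping->a_ops = &balloon_aops;
}

Pages inserted into the balloon are then expected to be marked movable against that mapping (see the PageIsolated()/__PageMovable() checks used by compaction below).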
diff --git a/mm/compaction.c b/mm/compaction.c
index 7bc04778f84d..64df5fe052db 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -15,11 +15,11 @@
15#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
16#include <linux/sysctl.h> 16#include <linux/sysctl.h>
17#include <linux/sysfs.h> 17#include <linux/sysfs.h>
18#include <linux/balloon_compaction.h>
19#include <linux/page-isolation.h> 18#include <linux/page-isolation.h>
20#include <linux/kasan.h> 19#include <linux/kasan.h>
21#include <linux/kthread.h> 20#include <linux/kthread.h>
22#include <linux/freezer.h> 21#include <linux/freezer.h>
22#include <linux/page_owner.h>
23#include "internal.h" 23#include "internal.h"
24 24
25#ifdef CONFIG_COMPACTION 25#ifdef CONFIG_COMPACTION
@@ -65,13 +65,27 @@ static unsigned long release_freepages(struct list_head *freelist)
65 65
66static void map_pages(struct list_head *list) 66static void map_pages(struct list_head *list)
67{ 67{
68 struct page *page; 68 unsigned int i, order, nr_pages;
69 struct page *page, *next;
70 LIST_HEAD(tmp_list);
71
72 list_for_each_entry_safe(page, next, list, lru) {
73 list_del(&page->lru);
69 74
70 list_for_each_entry(page, list, lru) { 75 order = page_private(page);
71 arch_alloc_page(page, 0); 76 nr_pages = 1 << order;
72 kernel_map_pages(page, 1, 1); 77
73 kasan_alloc_pages(page, 0); 78 post_alloc_hook(page, order, __GFP_MOVABLE);
79 if (order)
80 split_page(page, order);
81
82 for (i = 0; i < nr_pages; i++) {
83 list_add(&page->lru, &tmp_list);
84 page++;
85 }
74 } 86 }
87
88 list_splice(&tmp_list, list);
75} 89}
76 90
77static inline bool migrate_async_suitable(int migratetype) 91static inline bool migrate_async_suitable(int migratetype)
@@ -81,6 +95,44 @@ static inline bool migrate_async_suitable(int migratetype)
81 95
82#ifdef CONFIG_COMPACTION 96#ifdef CONFIG_COMPACTION
83 97
98int PageMovable(struct page *page)
99{
100 struct address_space *mapping;
101
102 VM_BUG_ON_PAGE(!PageLocked(page), page);
103 if (!__PageMovable(page))
104 return 0;
105
106 mapping = page_mapping(page);
107 if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
108 return 1;
109
110 return 0;
111}
112EXPORT_SYMBOL(PageMovable);
113
114void __SetPageMovable(struct page *page, struct address_space *mapping)
115{
116 VM_BUG_ON_PAGE(!PageLocked(page), page);
117 VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
118 page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
119}
120EXPORT_SYMBOL(__SetPageMovable);
121
122void __ClearPageMovable(struct page *page)
123{
124 VM_BUG_ON_PAGE(!PageLocked(page), page);
125 VM_BUG_ON_PAGE(!PageMovable(page), page);
126 /*
127 * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE
128 * flag so that VM can catch up released page by driver after isolation.
129 * With it, VM migration doesn't try to put it back.
130 */
131 page->mapping = (void *)((unsigned long)page->mapping &
132 PAGE_MAPPING_MOVABLE);
133}
134EXPORT_SYMBOL(__ClearPageMovable);
135
84/* Do not skip compaction more than 64 times */ 136/* Do not skip compaction more than 64 times */
85#define COMPACT_MAX_DEFER_SHIFT 6 137#define COMPACT_MAX_DEFER_SHIFT 6
86 138
@@ -368,12 +420,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
368 unsigned long flags = 0; 420 unsigned long flags = 0;
369 bool locked = false; 421 bool locked = false;
370 unsigned long blockpfn = *start_pfn; 422 unsigned long blockpfn = *start_pfn;
423 unsigned int order;
371 424
372 cursor = pfn_to_page(blockpfn); 425 cursor = pfn_to_page(blockpfn);
373 426
374 /* Isolate free pages. */ 427 /* Isolate free pages. */
375 for (; blockpfn < end_pfn; blockpfn++, cursor++) { 428 for (; blockpfn < end_pfn; blockpfn++, cursor++) {
376 int isolated, i; 429 int isolated;
377 struct page *page = cursor; 430 struct page *page = cursor;
378 431
379 /* 432 /*
@@ -439,17 +492,17 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
439 goto isolate_fail; 492 goto isolate_fail;
440 } 493 }
441 494
442 /* Found a free page, break it into order-0 pages */ 495 /* Found a free page, will break it into order-0 pages */
443 isolated = split_free_page(page); 496 order = page_order(page);
497 isolated = __isolate_free_page(page, order);
444 if (!isolated) 498 if (!isolated)
445 break; 499 break;
500 set_page_private(page, order);
446 501
447 total_isolated += isolated; 502 total_isolated += isolated;
448 cc->nr_freepages += isolated; 503 cc->nr_freepages += isolated;
449 for (i = 0; i < isolated; i++) { 504 list_add_tail(&page->lru, freelist);
450 list_add(&page->lru, freelist); 505
451 page++;
452 }
453 if (!strict && cc->nr_migratepages <= cc->nr_freepages) { 506 if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
454 blockpfn += isolated; 507 blockpfn += isolated;
455 break; 508 break;
@@ -568,7 +621,7 @@ isolate_freepages_range(struct compact_control *cc,
568 */ 621 */
569 } 622 }
570 623
571 /* split_free_page does not map the pages */ 624 /* __isolate_free_page() does not map the pages */
572 map_pages(&freelist); 625 map_pages(&freelist);
573 626
574 if (pfn < end_pfn) { 627 if (pfn < end_pfn) {
@@ -670,7 +723,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
670 723
671 /* Time to isolate some pages for migration */ 724 /* Time to isolate some pages for migration */
672 for (; low_pfn < end_pfn; low_pfn++) { 725 for (; low_pfn < end_pfn; low_pfn++) {
673 bool is_lru;
674 726
675 if (skip_on_failure && low_pfn >= next_skip_pfn) { 727 if (skip_on_failure && low_pfn >= next_skip_pfn) {
676 /* 728 /*
@@ -733,21 +785,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
733 } 785 }
734 786
735 /* 787 /*
736 * Check may be lockless but that's ok as we recheck later.
737 * It's possible to migrate LRU pages and balloon pages
738 * Skip any other type of page
739 */
740 is_lru = PageLRU(page);
741 if (!is_lru) {
742 if (unlikely(balloon_page_movable(page))) {
743 if (balloon_page_isolate(page)) {
744 /* Successfully isolated */
745 goto isolate_success;
746 }
747 }
748 }
749
750 /*
751 * Regardless of being on LRU, compound pages such as THP and 788 * Regardless of being on LRU, compound pages such as THP and
752 * hugetlbfs are not to be compacted. We can potentially save 789 * hugetlbfs are not to be compacted. We can potentially save
753 * a lot of iterations if we skip them at once. The check is 790 * a lot of iterations if we skip them at once. The check is
@@ -763,8 +800,30 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
763 goto isolate_fail; 800 goto isolate_fail;
764 } 801 }
765 802
766 if (!is_lru) 803 /*
804 * Check may be lockless but that's ok as we recheck later.
805 * It's possible to migrate LRU and non-lru movable pages.
806 * Skip any other type of page
807 */
808 if (!PageLRU(page)) {
809 /*
810 * __PageMovable can return false positive so we need
811 * to verify it under page_lock.
812 */
813 if (unlikely(__PageMovable(page)) &&
814 !PageIsolated(page)) {
815 if (locked) {
816 spin_unlock_irqrestore(&zone->lru_lock,
817 flags);
818 locked = false;
819 }
820
821 if (isolate_movable_page(page, isolate_mode))
822 goto isolate_success;
823 }
824
767 goto isolate_fail; 825 goto isolate_fail;
826 }
768 827
769 /* 828 /*
770 * Migration will fail if an anonymous page is pinned in memory, 829 * Migration will fail if an anonymous page is pinned in memory,
@@ -1059,7 +1118,7 @@ static void isolate_freepages(struct compact_control *cc)
1059 } 1118 }
1060 } 1119 }
1061 1120
1062 /* split_free_page does not map the pages */ 1121 /* __isolate_free_page() does not map the pages */
1063 map_pages(freelist); 1122 map_pages(freelist);
1064 1123
1065 /* 1124 /*
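Illustration (not part of this commit): the driver-side contract for non-LRU movable pages — provide isolate_page/putback_page/migratepage in the pages' address_space_operations and mark each page with __SetPageMovable() while holding the page lock. A hedged sketch (all example_* names are hypothetical):

static bool example_isolate(struct page *page, isolate_mode_t mode);
static void example_putback(struct page *page);
static int example_migrate(struct address_space *mapping, struct page *newpage,
			   struct page *page, enum migrate_mode mode);

static const struct address_space_operations example_movable_aops = {
	.isolate_page	= example_isolate,
	.putback_page	= example_putback,
	.migratepage	= example_migrate,
};

static void example_mark_movable(struct page *page,
				 struct address_space *mapping)
{
	/* assumption: mapping->a_ops == &example_movable_aops */
	lock_page(page);
	__SetPageMovable(page, mapping);
	unlock_page(page);
}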
diff --git a/mm/filemap.c b/mm/filemap.c
index 20f3b1f33f0e..e90c1543ec2d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -114,14 +114,14 @@ static void page_cache_tree_delete(struct address_space *mapping,
114 struct page *page, void *shadow) 114 struct page *page, void *shadow)
115{ 115{
116 struct radix_tree_node *node; 116 struct radix_tree_node *node;
117 int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
117 118
118 VM_BUG_ON(!PageLocked(page)); 119 VM_BUG_ON_PAGE(!PageLocked(page), page);
119 120 VM_BUG_ON_PAGE(PageTail(page), page);
120 node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index, 121 VM_BUG_ON_PAGE(nr != 1 && shadow, page);
121 shadow);
122 122
123 if (shadow) { 123 if (shadow) {
124 mapping->nrexceptional++; 124 mapping->nrexceptional += nr;
125 /* 125 /*
126 * Make sure the nrexceptional update is committed before 126 * Make sure the nrexceptional update is committed before
127 * the nrpages update so that final truncate racing 127 * the nrpages update so that final truncate racing
@@ -130,31 +130,38 @@ static void page_cache_tree_delete(struct address_space *mapping,
130 */ 130 */
131 smp_wmb(); 131 smp_wmb();
132 } 132 }
133 mapping->nrpages--; 133 mapping->nrpages -= nr;
134 134
135 if (!node) 135 for (i = 0; i < nr; i++) {
136 return; 136 node = radix_tree_replace_clear_tags(&mapping->page_tree,
137 137 page->index + i, shadow);
138 workingset_node_pages_dec(node); 138 if (!node) {
139 if (shadow) 139 VM_BUG_ON_PAGE(nr != 1, page);
140 workingset_node_shadows_inc(node);
141 else
142 if (__radix_tree_delete_node(&mapping->page_tree, node))
143 return; 140 return;
141 }
144 142
145 /* 143 workingset_node_pages_dec(node);
146 * Track node that only contains shadow entries. DAX mappings contain 144 if (shadow)
147 * no shadow entries and may contain other exceptional entries so skip 145 workingset_node_shadows_inc(node);
148 * those. 146 else
149 * 147 if (__radix_tree_delete_node(&mapping->page_tree, node))
150 * Avoid acquiring the list_lru lock if already tracked. The 148 continue;
151 * list_empty() test is safe as node->private_list is 149
152 * protected by mapping->tree_lock. 150 /*
153 */ 151 * Track node that only contains shadow entries. DAX mappings
154 if (!dax_mapping(mapping) && !workingset_node_pages(node) && 152 * contain no shadow entries and may contain other exceptional
155 list_empty(&node->private_list)) { 153 * entries so skip those.
156 node->private_data = mapping; 154 *
157 list_lru_add(&workingset_shadow_nodes, &node->private_list); 155 * Avoid acquiring the list_lru lock if already tracked.
156 * The list_empty() test is safe as node->private_list is
157 * protected by mapping->tree_lock.
158 */
159 if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
160 list_empty(&node->private_list)) {
161 node->private_data = mapping;
162 list_lru_add(&workingset_shadow_nodes,
163 &node->private_list);
164 }
158 } 165 }
159} 166}
160 167
@@ -166,6 +173,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
166void __delete_from_page_cache(struct page *page, void *shadow) 173void __delete_from_page_cache(struct page *page, void *shadow)
167{ 174{
168 struct address_space *mapping = page->mapping; 175 struct address_space *mapping = page->mapping;
176 int nr = hpage_nr_pages(page);
169 177
170 trace_mm_filemap_delete_from_page_cache(page); 178 trace_mm_filemap_delete_from_page_cache(page);
171 /* 179 /*
@@ -178,6 +186,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
178 else 186 else
179 cleancache_invalidate_page(mapping, page); 187 cleancache_invalidate_page(mapping, page);
180 188
189 VM_BUG_ON_PAGE(PageTail(page), page);
181 VM_BUG_ON_PAGE(page_mapped(page), page); 190 VM_BUG_ON_PAGE(page_mapped(page), page);
182 if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { 191 if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) {
183 int mapcount; 192 int mapcount;
@@ -209,9 +218,14 @@ void __delete_from_page_cache(struct page *page, void *shadow)
209 218
210 /* hugetlb pages do not participate in page cache accounting. */ 219 /* hugetlb pages do not participate in page cache accounting. */
211 if (!PageHuge(page)) 220 if (!PageHuge(page))
212 __dec_zone_page_state(page, NR_FILE_PAGES); 221 __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, -nr);
213 if (PageSwapBacked(page)) 222 if (PageSwapBacked(page)) {
214 __dec_zone_page_state(page, NR_SHMEM); 223 __mod_zone_page_state(page_zone(page), NR_SHMEM, -nr);
224 if (PageTransHuge(page))
225 __dec_zone_page_state(page, NR_SHMEM_THPS);
226 } else {
227 VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page);
228 }
215 229
216 /* 230 /*
217 * At this point page must be either written or cleaned by truncate. 231 * At this point page must be either written or cleaned by truncate.
@@ -235,9 +249,8 @@ void __delete_from_page_cache(struct page *page, void *shadow)
235 */ 249 */
236void delete_from_page_cache(struct page *page) 250void delete_from_page_cache(struct page *page)
237{ 251{
238 struct address_space *mapping = page->mapping; 252 struct address_space *mapping = page_mapping(page);
239 unsigned long flags; 253 unsigned long flags;
240
241 void (*freepage)(struct page *); 254 void (*freepage)(struct page *);
242 255
243 BUG_ON(!PageLocked(page)); 256 BUG_ON(!PageLocked(page));
@@ -250,7 +263,13 @@ void delete_from_page_cache(struct page *page)
250 263
251 if (freepage) 264 if (freepage)
252 freepage(page); 265 freepage(page);
253 put_page(page); 266
267 if (PageTransHuge(page) && !PageHuge(page)) {
268 page_ref_sub(page, HPAGE_PMD_NR);
269 VM_BUG_ON_PAGE(page_count(page) <= 0, page);
270 } else {
271 put_page(page);
272 }
254} 273}
255EXPORT_SYMBOL(delete_from_page_cache); 274EXPORT_SYMBOL(delete_from_page_cache);
256 275
@@ -1053,7 +1072,7 @@ EXPORT_SYMBOL(page_cache_prev_hole);
1053struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) 1072struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
1054{ 1073{
1055 void **pagep; 1074 void **pagep;
1056 struct page *page; 1075 struct page *head, *page;
1057 1076
1058 rcu_read_lock(); 1077 rcu_read_lock();
1059repeat: 1078repeat:
@@ -1073,16 +1092,24 @@ repeat:
1073 */ 1092 */
1074 goto out; 1093 goto out;
1075 } 1094 }
1076 if (!page_cache_get_speculative(page)) 1095
1096 head = compound_head(page);
1097 if (!page_cache_get_speculative(head))
1077 goto repeat; 1098 goto repeat;
1078 1099
1100 /* The page was split under us? */
1101 if (compound_head(page) != head) {
1102 put_page(head);
1103 goto repeat;
1104 }
1105
1079 /* 1106 /*
1080 * Has the page moved? 1107 * Has the page moved?
1081 * This is part of the lockless pagecache protocol. See 1108 * This is part of the lockless pagecache protocol. See
1082 * include/linux/pagemap.h for details. 1109 * include/linux/pagemap.h for details.
1083 */ 1110 */
1084 if (unlikely(page != *pagep)) { 1111 if (unlikely(page != *pagep)) {
1085 put_page(page); 1112 put_page(head);
1086 goto repeat; 1113 goto repeat;
1087 } 1114 }
1088 } 1115 }
@@ -1118,12 +1145,12 @@ repeat:
1118 if (page && !radix_tree_exception(page)) { 1145 if (page && !radix_tree_exception(page)) {
1119 lock_page(page); 1146 lock_page(page);
1120 /* Has the page been truncated? */ 1147 /* Has the page been truncated? */
1121 if (unlikely(page->mapping != mapping)) { 1148 if (unlikely(page_mapping(page) != mapping)) {
1122 unlock_page(page); 1149 unlock_page(page);
1123 put_page(page); 1150 put_page(page);
1124 goto repeat; 1151 goto repeat;
1125 } 1152 }
1126 VM_BUG_ON_PAGE(page->index != offset, page); 1153 VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
1127 } 1154 }
1128 return page; 1155 return page;
1129} 1156}
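find_get_entry() and find_lock_entry() above, and every gang lookup below (find_get_entries, find_get_pages, find_get_pages_contig, find_get_pages_tag, find_get_entries_tag, filemap_map_pages), gain the same three extra steps so that lockless lookup stays safe when a slot points into a huge page. A condensed, non-literal sketch of the pattern:

        struct page *head, *page;
repeat:
        page = radix_tree_deref_slot(slot);
        /* ... NULL/exception handling unchanged ... */

        /* pin the compound head, not the (possibly tail) page */
        head = compound_head(page);
        if (!page_cache_get_speculative(head))
                goto repeat;

        /* the THP was split while we held no reference: retry */
        if (compound_head(page) != head) {
                put_page(head);
                goto repeat;
        }

        /* slot reused for another page (lockless pagecache protocol) */
        if (unlikely(page != *slot)) {
                put_page(head);
                goto repeat;
        }

The reference is always taken and dropped on the head page; the page the slot referenced, possibly a tail, is what the function goes on to use.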
@@ -1255,7 +1282,7 @@ unsigned find_get_entries(struct address_space *mapping,
1255 1282
1256 rcu_read_lock(); 1283 rcu_read_lock();
1257 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { 1284 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
1258 struct page *page; 1285 struct page *head, *page;
1259repeat: 1286repeat:
1260 page = radix_tree_deref_slot(slot); 1287 page = radix_tree_deref_slot(slot);
1261 if (unlikely(!page)) 1288 if (unlikely(!page))
@@ -1272,12 +1299,20 @@ repeat:
1272 */ 1299 */
1273 goto export; 1300 goto export;
1274 } 1301 }
1275 if (!page_cache_get_speculative(page)) 1302
1303 head = compound_head(page);
1304 if (!page_cache_get_speculative(head))
1276 goto repeat; 1305 goto repeat;
1277 1306
1307 /* The page was split under us? */
1308 if (compound_head(page) != head) {
1309 put_page(head);
1310 goto repeat;
1311 }
1312
1278 /* Has the page moved? */ 1313 /* Has the page moved? */
1279 if (unlikely(page != *slot)) { 1314 if (unlikely(page != *slot)) {
1280 put_page(page); 1315 put_page(head);
1281 goto repeat; 1316 goto repeat;
1282 } 1317 }
1283export: 1318export:
@@ -1318,7 +1353,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
1318 1353
1319 rcu_read_lock(); 1354 rcu_read_lock();
1320 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { 1355 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
1321 struct page *page; 1356 struct page *head, *page;
1322repeat: 1357repeat:
1323 page = radix_tree_deref_slot(slot); 1358 page = radix_tree_deref_slot(slot);
1324 if (unlikely(!page)) 1359 if (unlikely(!page))
@@ -1337,12 +1372,19 @@ repeat:
1337 continue; 1372 continue;
1338 } 1373 }
1339 1374
1340 if (!page_cache_get_speculative(page)) 1375 head = compound_head(page);
1376 if (!page_cache_get_speculative(head))
1341 goto repeat; 1377 goto repeat;
1342 1378
1379 /* The page was split under us? */
1380 if (compound_head(page) != head) {
1381 put_page(head);
1382 goto repeat;
1383 }
1384
1343 /* Has the page moved? */ 1385 /* Has the page moved? */
1344 if (unlikely(page != *slot)) { 1386 if (unlikely(page != *slot)) {
1345 put_page(page); 1387 put_page(head);
1346 goto repeat; 1388 goto repeat;
1347 } 1389 }
1348 1390
@@ -1379,7 +1421,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
1379 1421
1380 rcu_read_lock(); 1422 rcu_read_lock();
1381 radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { 1423 radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) {
1382 struct page *page; 1424 struct page *head, *page;
1383repeat: 1425repeat:
1384 page = radix_tree_deref_slot(slot); 1426 page = radix_tree_deref_slot(slot);
1385 /* The hole, there no reason to continue */ 1427 /* The hole, there no reason to continue */
@@ -1399,12 +1441,19 @@ repeat:
1399 break; 1441 break;
1400 } 1442 }
1401 1443
1402 if (!page_cache_get_speculative(page)) 1444 head = compound_head(page);
1445 if (!page_cache_get_speculative(head))
1403 goto repeat; 1446 goto repeat;
1404 1447
1448 /* The page was split under us? */
1449 if (compound_head(page) != head) {
1450 put_page(head);
1451 goto repeat;
1452 }
1453
1405 /* Has the page moved? */ 1454 /* Has the page moved? */
1406 if (unlikely(page != *slot)) { 1455 if (unlikely(page != *slot)) {
1407 put_page(page); 1456 put_page(head);
1408 goto repeat; 1457 goto repeat;
1409 } 1458 }
1410 1459
@@ -1413,7 +1462,7 @@ repeat:
1413 * otherwise we can get both false positives and false 1462 * otherwise we can get both false positives and false
1414 * negatives, which is just confusing to the caller. 1463 * negatives, which is just confusing to the caller.
1415 */ 1464 */
1416 if (page->mapping == NULL || page->index != iter.index) { 1465 if (page->mapping == NULL || page_to_pgoff(page) != iter.index) {
1417 put_page(page); 1466 put_page(page);
1418 break; 1467 break;
1419 } 1468 }
@@ -1451,7 +1500,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
1451 rcu_read_lock(); 1500 rcu_read_lock();
1452 radix_tree_for_each_tagged(slot, &mapping->page_tree, 1501 radix_tree_for_each_tagged(slot, &mapping->page_tree,
1453 &iter, *index, tag) { 1502 &iter, *index, tag) {
1454 struct page *page; 1503 struct page *head, *page;
1455repeat: 1504repeat:
1456 page = radix_tree_deref_slot(slot); 1505 page = radix_tree_deref_slot(slot);
1457 if (unlikely(!page)) 1506 if (unlikely(!page))
@@ -1476,12 +1525,19 @@ repeat:
1476 continue; 1525 continue;
1477 } 1526 }
1478 1527
1479 if (!page_cache_get_speculative(page)) 1528 head = compound_head(page);
1529 if (!page_cache_get_speculative(head))
1480 goto repeat; 1530 goto repeat;
1481 1531
1532 /* The page was split under us? */
1533 if (compound_head(page) != head) {
1534 put_page(head);
1535 goto repeat;
1536 }
1537
1482 /* Has the page moved? */ 1538 /* Has the page moved? */
1483 if (unlikely(page != *slot)) { 1539 if (unlikely(page != *slot)) {
1484 put_page(page); 1540 put_page(head);
1485 goto repeat; 1541 goto repeat;
1486 } 1542 }
1487 1543
@@ -1525,7 +1581,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
1525 rcu_read_lock(); 1581 rcu_read_lock();
1526 radix_tree_for_each_tagged(slot, &mapping->page_tree, 1582 radix_tree_for_each_tagged(slot, &mapping->page_tree,
1527 &iter, start, tag) { 1583 &iter, start, tag) {
1528 struct page *page; 1584 struct page *head, *page;
1529repeat: 1585repeat:
1530 page = radix_tree_deref_slot(slot); 1586 page = radix_tree_deref_slot(slot);
1531 if (unlikely(!page)) 1587 if (unlikely(!page))
@@ -1543,12 +1599,20 @@ repeat:
1543 */ 1599 */
1544 goto export; 1600 goto export;
1545 } 1601 }
1546 if (!page_cache_get_speculative(page)) 1602
1603 head = compound_head(page);
1604 if (!page_cache_get_speculative(head))
1547 goto repeat; 1605 goto repeat;
1548 1606
1607 /* The page was split under us? */
1608 if (compound_head(page) != head) {
1609 put_page(head);
1610 goto repeat;
1611 }
1612
1549 /* Has the page moved? */ 1613 /* Has the page moved? */
1550 if (unlikely(page != *slot)) { 1614 if (unlikely(page != *slot)) {
1551 put_page(page); 1615 put_page(head);
1552 goto repeat; 1616 goto repeat;
1553 } 1617 }
1554export: 1618export:
@@ -2128,21 +2192,21 @@ page_not_uptodate:
2128} 2192}
2129EXPORT_SYMBOL(filemap_fault); 2193EXPORT_SYMBOL(filemap_fault);
2130 2194
2131void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) 2195void filemap_map_pages(struct fault_env *fe,
2196 pgoff_t start_pgoff, pgoff_t end_pgoff)
2132{ 2197{
2133 struct radix_tree_iter iter; 2198 struct radix_tree_iter iter;
2134 void **slot; 2199 void **slot;
2135 struct file *file = vma->vm_file; 2200 struct file *file = fe->vma->vm_file;
2136 struct address_space *mapping = file->f_mapping; 2201 struct address_space *mapping = file->f_mapping;
2202 pgoff_t last_pgoff = start_pgoff;
2137 loff_t size; 2203 loff_t size;
2138 struct page *page; 2204 struct page *head, *page;
2139 unsigned long address = (unsigned long) vmf->virtual_address;
2140 unsigned long addr;
2141 pte_t *pte;
2142 2205
2143 rcu_read_lock(); 2206 rcu_read_lock();
2144 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { 2207 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
2145 if (iter.index > vmf->max_pgoff) 2208 start_pgoff) {
2209 if (iter.index > end_pgoff)
2146 break; 2210 break;
2147repeat: 2211repeat:
2148 page = radix_tree_deref_slot(slot); 2212 page = radix_tree_deref_slot(slot);
@@ -2156,12 +2220,19 @@ repeat:
2156 goto next; 2220 goto next;
2157 } 2221 }
2158 2222
2159 if (!page_cache_get_speculative(page)) 2223 head = compound_head(page);
2224 if (!page_cache_get_speculative(head))
2160 goto repeat; 2225 goto repeat;
2161 2226
2227 /* The page was split under us? */
2228 if (compound_head(page) != head) {
2229 put_page(head);
2230 goto repeat;
2231 }
2232
2162 /* Has the page moved? */ 2233 /* Has the page moved? */
2163 if (unlikely(page != *slot)) { 2234 if (unlikely(page != *slot)) {
2164 put_page(page); 2235 put_page(head);
2165 goto repeat; 2236 goto repeat;
2166 } 2237 }
2167 2238
@@ -2179,14 +2250,15 @@ repeat:
2179 if (page->index >= size >> PAGE_SHIFT) 2250 if (page->index >= size >> PAGE_SHIFT)
2180 goto unlock; 2251 goto unlock;
2181 2252
2182 pte = vmf->pte + page->index - vmf->pgoff;
2183 if (!pte_none(*pte))
2184 goto unlock;
2185
2186 if (file->f_ra.mmap_miss > 0) 2253 if (file->f_ra.mmap_miss > 0)
2187 file->f_ra.mmap_miss--; 2254 file->f_ra.mmap_miss--;
2188 addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; 2255
2189 do_set_pte(vma, addr, page, pte, false, false); 2256 fe->address += (iter.index - last_pgoff) << PAGE_SHIFT;
2257 if (fe->pte)
2258 fe->pte += iter.index - last_pgoff;
2259 last_pgoff = iter.index;
2260 if (alloc_set_pte(fe, NULL, page))
2261 goto unlock;
2190 unlock_page(page); 2262 unlock_page(page);
2191 goto next; 2263 goto next;
2192unlock: 2264unlock:
@@ -2194,7 +2266,10 @@ unlock:
2194skip: 2266skip:
2195 put_page(page); 2267 put_page(page);
2196next: 2268next:
2197 if (iter.index == vmf->max_pgoff) 2269 /* Huge page is mapped? No need to proceed. */
2270 if (pmd_trans_huge(*fe->pmd))
2271 break;
2272 if (iter.index == end_pgoff)
2198 break; 2273 break;
2199 } 2274 }
2200 rcu_read_unlock(); 2275 rcu_read_unlock();
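filemap_map_pages() now takes the common struct fault_env plus an explicit pgoff range instead of a struct vm_fault with a precomputed pte pointer, and it no longer writes ptes itself: it positions fe->address and fe->pte for each slot and lets alloc_set_pte() install the mapping, which for a pmd-mappable page can be a huge mapping, after which the pmd_trans_huge() test ends the loop. A hedged sketch of the caller side (the real caller is do_fault_around() in mm/memory.c, which is not part of the hunks shown here):

        /* assumption: the ->map_pages vm operation now has the same
         * three-argument form as filemap_map_pages() above */
        if (vma->vm_ops->map_pages)
                vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);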
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 27a9924caf61..fec8b5044040 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -20,6 +20,8 @@
20#include <linux/frontswap.h> 20#include <linux/frontswap.h>
21#include <linux/swapfile.h> 21#include <linux/swapfile.h>
22 22
23DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);
24
23/* 25/*
24 * frontswap_ops are added by frontswap_register_ops, and provide the 26 * frontswap_ops are added by frontswap_register_ops, and provide the
25 * frontswap "backend" implementation functions. Multiple implementations 27 * frontswap "backend" implementation functions. Multiple implementations
@@ -139,6 +141,8 @@ void frontswap_register_ops(struct frontswap_ops *ops)
139 ops->next = frontswap_ops; 141 ops->next = frontswap_ops;
140 } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next); 142 } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
141 143
144 static_branch_inc(&frontswap_enabled_key);
145
142 spin_lock(&swap_lock); 146 spin_lock(&swap_lock);
143 plist_for_each_entry(si, &swap_active_head, list) { 147 plist_for_each_entry(si, &swap_active_head, list) {
144 if (si->frontswap_map) 148 if (si->frontswap_map)
@@ -189,7 +193,7 @@ void __frontswap_init(unsigned type, unsigned long *map)
189 struct swap_info_struct *sis = swap_info[type]; 193 struct swap_info_struct *sis = swap_info[type];
190 struct frontswap_ops *ops; 194 struct frontswap_ops *ops;
191 195
192 BUG_ON(sis == NULL); 196 VM_BUG_ON(sis == NULL);
193 197
194 /* 198 /*
195 * p->frontswap is a bitmap that we MUST have to figure out which page 199 * p->frontswap is a bitmap that we MUST have to figure out which page
@@ -248,15 +252,9 @@ int __frontswap_store(struct page *page)
248 pgoff_t offset = swp_offset(entry); 252 pgoff_t offset = swp_offset(entry);
249 struct frontswap_ops *ops; 253 struct frontswap_ops *ops;
250 254
251 /* 255 VM_BUG_ON(!frontswap_ops);
252 * Return if no backend registed. 256 VM_BUG_ON(!PageLocked(page));
253 * Don't need to inc frontswap_failed_stores here. 257 VM_BUG_ON(sis == NULL);
254 */
255 if (!frontswap_ops)
256 return -1;
257
258 BUG_ON(!PageLocked(page));
259 BUG_ON(sis == NULL);
260 258
261 /* 259 /*
262 * If a dup, we must remove the old page first; we can't leave the 260 * If a dup, we must remove the old page first; we can't leave the
@@ -303,11 +301,10 @@ int __frontswap_load(struct page *page)
303 pgoff_t offset = swp_offset(entry); 301 pgoff_t offset = swp_offset(entry);
304 struct frontswap_ops *ops; 302 struct frontswap_ops *ops;
305 303
306 if (!frontswap_ops) 304 VM_BUG_ON(!frontswap_ops);
307 return -1; 305 VM_BUG_ON(!PageLocked(page));
306 VM_BUG_ON(sis == NULL);
308 307
309 BUG_ON(!PageLocked(page));
310 BUG_ON(sis == NULL);
311 if (!__frontswap_test(sis, offset)) 308 if (!__frontswap_test(sis, offset))
312 return -1; 309 return -1;
313 310
@@ -337,10 +334,9 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
337 struct swap_info_struct *sis = swap_info[type]; 334 struct swap_info_struct *sis = swap_info[type];
338 struct frontswap_ops *ops; 335 struct frontswap_ops *ops;
339 336
340 if (!frontswap_ops) 337 VM_BUG_ON(!frontswap_ops);
341 return; 338 VM_BUG_ON(sis == NULL);
342 339
343 BUG_ON(sis == NULL);
344 if (!__frontswap_test(sis, offset)) 340 if (!__frontswap_test(sis, offset))
345 return; 341 return;
346 342
@@ -360,10 +356,9 @@ void __frontswap_invalidate_area(unsigned type)
360 struct swap_info_struct *sis = swap_info[type]; 356 struct swap_info_struct *sis = swap_info[type];
361 struct frontswap_ops *ops; 357 struct frontswap_ops *ops;
362 358
363 if (!frontswap_ops) 359 VM_BUG_ON(!frontswap_ops);
364 return; 360 VM_BUG_ON(sis == NULL);
365 361
366 BUG_ON(sis == NULL);
367 if (sis->frontswap_map == NULL) 362 if (sis->frontswap_map == NULL)
368 return; 363 return;
369 364
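With the new static key, the frontswap hooks on the swap paths cost only a patched-out branch while no backend has ever registered, and the old runtime if (!frontswap_ops) guards inside these functions become VM_BUG_ON() assertions: callers are expected to have tested the key first. A hedged sketch of the caller-side guard, which lives in include/linux/frontswap.h and is not part of the hunks shown here:

        /* assumption: helper name and placement per the frontswap header */
        static inline bool frontswap_enabled(void)
        {
                return static_branch_unlikely(&frontswap_enabled_key);
        }

        /* the swap code then does roughly: */
        if (frontswap_enabled())
                ret = __frontswap_store(page);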
diff --git a/mm/gup.c b/mm/gup.c
index c057784c8444..547741f5f7a7 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -279,6 +279,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
279 spin_unlock(ptl); 279 spin_unlock(ptl);
280 ret = 0; 280 ret = 0;
281 split_huge_pmd(vma, pmd, address); 281 split_huge_pmd(vma, pmd, address);
282 if (pmd_trans_unstable(pmd))
283 ret = -EBUSY;
282 } else { 284 } else {
283 get_page(page); 285 get_page(page);
284 spin_unlock(ptl); 286 spin_unlock(ptl);
@@ -286,6 +288,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
286 ret = split_huge_page(page); 288 ret = split_huge_page(page);
287 unlock_page(page); 289 unlock_page(page);
288 put_page(page); 290 put_page(page);
291 if (pmd_none(*pmd))
292 return no_page_table(vma, flags);
289 } 293 }
290 294
291 return ret ? ERR_PTR(ret) : 295 return ret ? ERR_PTR(ret) :
@@ -350,7 +354,6 @@ unmap:
350static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, 354static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
351 unsigned long address, unsigned int *flags, int *nonblocking) 355 unsigned long address, unsigned int *flags, int *nonblocking)
352{ 356{
353 struct mm_struct *mm = vma->vm_mm;
354 unsigned int fault_flags = 0; 357 unsigned int fault_flags = 0;
355 int ret; 358 int ret;
356 359
@@ -375,7 +378,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
375 fault_flags |= FAULT_FLAG_TRIED; 378 fault_flags |= FAULT_FLAG_TRIED;
376 } 379 }
377 380
378 ret = handle_mm_fault(mm, vma, address, fault_flags); 381 ret = handle_mm_fault(vma, address, fault_flags);
379 if (ret & VM_FAULT_ERROR) { 382 if (ret & VM_FAULT_ERROR) {
380 if (ret & VM_FAULT_OOM) 383 if (ret & VM_FAULT_OOM)
381 return -ENOMEM; 384 return -ENOMEM;
@@ -690,7 +693,7 @@ retry:
690 if (!vma_permits_fault(vma, fault_flags)) 693 if (!vma_permits_fault(vma, fault_flags))
691 return -EFAULT; 694 return -EFAULT;
692 695
693 ret = handle_mm_fault(mm, vma, address, fault_flags); 696 ret = handle_mm_fault(vma, address, fault_flags);
694 major |= ret & VM_FAULT_MAJOR; 697 major |= ret & VM_FAULT_MAJOR;
695 if (ret & VM_FAULT_ERROR) { 698 if (ret & VM_FAULT_ERROR) {
696 if (ret & VM_FAULT_OOM) 699 if (ret & VM_FAULT_OOM)
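Both call sites above reflect a tree-wide prototype change: handle_mm_fault() drops its mm_struct argument and derives the mm from vma->vm_mm internally, which is also why faultin_page() no longer keeps a local mm variable. The shape of the change, with the prototype itself living in include/linux/mm.h outside this diff:

        /* before */
        ret = handle_mm_fault(mm, vma, address, fault_flags);
        /* after  */
        ret = handle_mm_fault(vma, address, fault_flags);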
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 343a2b7e57aa..3647334c2ef9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -18,7 +18,6 @@
18#include <linux/mm_inline.h> 18#include <linux/mm_inline.h>
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/dax.h> 20#include <linux/dax.h>
21#include <linux/kthread.h>
22#include <linux/khugepaged.h> 21#include <linux/khugepaged.h>
23#include <linux/freezer.h> 22#include <linux/freezer.h>
24#include <linux/pfn_t.h> 23#include <linux/pfn_t.h>
@@ -30,39 +29,12 @@
30#include <linux/hashtable.h> 29#include <linux/hashtable.h>
31#include <linux/userfaultfd_k.h> 30#include <linux/userfaultfd_k.h>
32#include <linux/page_idle.h> 31#include <linux/page_idle.h>
32#include <linux/shmem_fs.h>
33 33
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/pgalloc.h> 35#include <asm/pgalloc.h>
36#include "internal.h" 36#include "internal.h"
37 37
38enum scan_result {
39 SCAN_FAIL,
40 SCAN_SUCCEED,
41 SCAN_PMD_NULL,
42 SCAN_EXCEED_NONE_PTE,
43 SCAN_PTE_NON_PRESENT,
44 SCAN_PAGE_RO,
45 SCAN_NO_REFERENCED_PAGE,
46 SCAN_PAGE_NULL,
47 SCAN_SCAN_ABORT,
48 SCAN_PAGE_COUNT,
49 SCAN_PAGE_LRU,
50 SCAN_PAGE_LOCK,
51 SCAN_PAGE_ANON,
52 SCAN_PAGE_COMPOUND,
53 SCAN_ANY_PROCESS,
54 SCAN_VMA_NULL,
55 SCAN_VMA_CHECK,
56 SCAN_ADDRESS_RANGE,
57 SCAN_SWAP_CACHE_PAGE,
58 SCAN_DEL_PAGE_LRU,
59 SCAN_ALLOC_HUGE_PAGE_FAIL,
60 SCAN_CGROUP_CHARGE_FAIL
61};
62
63#define CREATE_TRACE_POINTS
64#include <trace/events/huge_memory.h>
65
66/* 38/*
67 * By default transparent hugepage support is disabled in order that avoid 39 * By default transparent hugepage support is disabled in order that avoid
68 * to risk increase the memory footprint of applications without a guaranteed 40 * to risk increase the memory footprint of applications without a guaranteed
@@ -82,127 +54,8 @@ unsigned long transparent_hugepage_flags __read_mostly =
82 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| 54 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
83 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 55 (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
84 56
85/* default scan 8*512 pte (or vmas) every 30 second */
86static unsigned int khugepaged_pages_to_scan __read_mostly;
87static unsigned int khugepaged_pages_collapsed;
88static unsigned int khugepaged_full_scans;
89static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
90/* during fragmentation poll the hugepage allocator once every minute */
91static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
92static unsigned long khugepaged_sleep_expire;
93static struct task_struct *khugepaged_thread __read_mostly;
94static DEFINE_MUTEX(khugepaged_mutex);
95static DEFINE_SPINLOCK(khugepaged_mm_lock);
96static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
97/*
98 * default collapse hugepages if there is at least one pte mapped like
99 * it would have happened if the vma was large enough during page
100 * fault.
101 */
102static unsigned int khugepaged_max_ptes_none __read_mostly;
103
104static int khugepaged(void *none);
105static int khugepaged_slab_init(void);
106static void khugepaged_slab_exit(void);
107
108#define MM_SLOTS_HASH_BITS 10
109static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
110
111static struct kmem_cache *mm_slot_cache __read_mostly;
112
113/**
114 * struct mm_slot - hash lookup from mm to mm_slot
115 * @hash: hash collision list
116 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
117 * @mm: the mm that this information is valid for
118 */
119struct mm_slot {
120 struct hlist_node hash;
121 struct list_head mm_node;
122 struct mm_struct *mm;
123};
124
125/**
126 * struct khugepaged_scan - cursor for scanning
127 * @mm_head: the head of the mm list to scan
128 * @mm_slot: the current mm_slot we are scanning
129 * @address: the next address inside that to be scanned
130 *
131 * There is only the one khugepaged_scan instance of this cursor structure.
132 */
133struct khugepaged_scan {
134 struct list_head mm_head;
135 struct mm_slot *mm_slot;
136 unsigned long address;
137};
138static struct khugepaged_scan khugepaged_scan = {
139 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
140};
141
142static struct shrinker deferred_split_shrinker; 57static struct shrinker deferred_split_shrinker;
143 58
144static void set_recommended_min_free_kbytes(void)
145{
146 struct zone *zone;
147 int nr_zones = 0;
148 unsigned long recommended_min;
149
150 for_each_populated_zone(zone)
151 nr_zones++;
152
153 /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
154 recommended_min = pageblock_nr_pages * nr_zones * 2;
155
156 /*
157 * Make sure that on average at least two pageblocks are almost free
158 * of another type, one for a migratetype to fall back to and a
159 * second to avoid subsequent fallbacks of other types There are 3
160 * MIGRATE_TYPES we care about.
161 */
162 recommended_min += pageblock_nr_pages * nr_zones *
163 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
164
165 /* don't ever allow to reserve more than 5% of the lowmem */
166 recommended_min = min(recommended_min,
167 (unsigned long) nr_free_buffer_pages() / 20);
168 recommended_min <<= (PAGE_SHIFT-10);
169
170 if (recommended_min > min_free_kbytes) {
171 if (user_min_free_kbytes >= 0)
172 pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
173 min_free_kbytes, recommended_min);
174
175 min_free_kbytes = recommended_min;
176 }
177 setup_per_zone_wmarks();
178}
179
180static int start_stop_khugepaged(void)
181{
182 int err = 0;
183 if (khugepaged_enabled()) {
184 if (!khugepaged_thread)
185 khugepaged_thread = kthread_run(khugepaged, NULL,
186 "khugepaged");
187 if (IS_ERR(khugepaged_thread)) {
188 pr_err("khugepaged: kthread_run(khugepaged) failed\n");
189 err = PTR_ERR(khugepaged_thread);
190 khugepaged_thread = NULL;
191 goto fail;
192 }
193
194 if (!list_empty(&khugepaged_scan.mm_head))
195 wake_up_interruptible(&khugepaged_wait);
196
197 set_recommended_min_free_kbytes();
198 } else if (khugepaged_thread) {
199 kthread_stop(khugepaged_thread);
200 khugepaged_thread = NULL;
201 }
202fail:
203 return err;
204}
205
206static atomic_t huge_zero_refcount; 59static atomic_t huge_zero_refcount;
207struct page *huge_zero_page __read_mostly; 60struct page *huge_zero_page __read_mostly;
208 61
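The removal hunks above take all of khugepaged's state out of mm/huge_memory.c: the scan_result enum and tracepoints, the tunables and their sysfs knobs, the mm_slot hash, the scan cursor, the watermark tuning and the start/stop logic. As the later hugepage_init() hunks in this file show, the entry points become khugepaged_init() and khugepaged_destroy(), so the code is being moved rather than deleted; in the upstream series it lands in a separate mm/khugepaged.c. The visible wiring is just:

        /* from the hugepage_init() hunks further down in this file */
        err = khugepaged_init();        /* was khugepaged_slab_init()  */
        ...
        khugepaged_destroy();           /* was khugepaged_slab_exit() */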
@@ -328,12 +181,7 @@ static ssize_t enabled_store(struct kobject *kobj,
328 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); 181 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
329 182
330 if (ret > 0) { 183 if (ret > 0) {
331 int err; 184 int err = start_stop_khugepaged();
332
333 mutex_lock(&khugepaged_mutex);
334 err = start_stop_khugepaged();
335 mutex_unlock(&khugepaged_mutex);
336
337 if (err) 185 if (err)
338 ret = err; 186 ret = err;
339 } 187 }
@@ -343,7 +191,7 @@ static ssize_t enabled_store(struct kobject *kobj,
343static struct kobj_attribute enabled_attr = 191static struct kobj_attribute enabled_attr =
344 __ATTR(enabled, 0644, enabled_show, enabled_store); 192 __ATTR(enabled, 0644, enabled_show, enabled_store);
345 193
346static ssize_t single_flag_show(struct kobject *kobj, 194ssize_t single_hugepage_flag_show(struct kobject *kobj,
347 struct kobj_attribute *attr, char *buf, 195 struct kobj_attribute *attr, char *buf,
348 enum transparent_hugepage_flag flag) 196 enum transparent_hugepage_flag flag)
349{ 197{
@@ -351,7 +199,7 @@ static ssize_t single_flag_show(struct kobject *kobj,
351 !!test_bit(flag, &transparent_hugepage_flags)); 199 !!test_bit(flag, &transparent_hugepage_flags));
352} 200}
353 201
354static ssize_t single_flag_store(struct kobject *kobj, 202ssize_t single_hugepage_flag_store(struct kobject *kobj,
355 struct kobj_attribute *attr, 203 struct kobj_attribute *attr,
356 const char *buf, size_t count, 204 const char *buf, size_t count,
357 enum transparent_hugepage_flag flag) 205 enum transparent_hugepage_flag flag)
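single_flag_show() and single_flag_store() are renamed to single_hugepage_flag_show() and single_hugepage_flag_store() and lose their static qualifier, so other compilation units (presumably the relocated khugepaged code) can reuse them for their own sysfs flags. The matching declarations would be expected in a shared header that is not part of these hunks; their signatures, as visible above:

        ssize_t single_hugepage_flag_show(struct kobject *kobj,
                        struct kobj_attribute *attr, char *buf,
                        enum transparent_hugepage_flag flag);
        ssize_t single_hugepage_flag_store(struct kobject *kobj,
                        struct kobj_attribute *attr,
                        const char *buf, size_t count,
                        enum transparent_hugepage_flag flag);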
@@ -406,13 +254,13 @@ static struct kobj_attribute defrag_attr =
406static ssize_t use_zero_page_show(struct kobject *kobj, 254static ssize_t use_zero_page_show(struct kobject *kobj,
407 struct kobj_attribute *attr, char *buf) 255 struct kobj_attribute *attr, char *buf)
408{ 256{
409 return single_flag_show(kobj, attr, buf, 257 return single_hugepage_flag_show(kobj, attr, buf,
410 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 258 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
411} 259}
412static ssize_t use_zero_page_store(struct kobject *kobj, 260static ssize_t use_zero_page_store(struct kobject *kobj,
413 struct kobj_attribute *attr, const char *buf, size_t count) 261 struct kobj_attribute *attr, const char *buf, size_t count)
414{ 262{
415 return single_flag_store(kobj, attr, buf, count, 263 return single_hugepage_flag_store(kobj, attr, buf, count,
416 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); 264 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
417} 265}
418static struct kobj_attribute use_zero_page_attr = 266static struct kobj_attribute use_zero_page_attr =
@@ -421,14 +269,14 @@ static struct kobj_attribute use_zero_page_attr =
421static ssize_t debug_cow_show(struct kobject *kobj, 269static ssize_t debug_cow_show(struct kobject *kobj,
422 struct kobj_attribute *attr, char *buf) 270 struct kobj_attribute *attr, char *buf)
423{ 271{
424 return single_flag_show(kobj, attr, buf, 272 return single_hugepage_flag_show(kobj, attr, buf,
425 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); 273 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
426} 274}
427static ssize_t debug_cow_store(struct kobject *kobj, 275static ssize_t debug_cow_store(struct kobject *kobj,
428 struct kobj_attribute *attr, 276 struct kobj_attribute *attr,
429 const char *buf, size_t count) 277 const char *buf, size_t count)
430{ 278{
431 return single_flag_store(kobj, attr, buf, count, 279 return single_hugepage_flag_store(kobj, attr, buf, count,
432 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); 280 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
433} 281}
434static struct kobj_attribute debug_cow_attr = 282static struct kobj_attribute debug_cow_attr =
@@ -439,6 +287,9 @@ static struct attribute *hugepage_attr[] = {
439 &enabled_attr.attr, 287 &enabled_attr.attr,
440 &defrag_attr.attr, 288 &defrag_attr.attr,
441 &use_zero_page_attr.attr, 289 &use_zero_page_attr.attr,
290#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
291 &shmem_enabled_attr.attr,
292#endif
442#ifdef CONFIG_DEBUG_VM 293#ifdef CONFIG_DEBUG_VM
443 &debug_cow_attr.attr, 294 &debug_cow_attr.attr,
444#endif 295#endif
@@ -449,171 +300,6 @@ static struct attribute_group hugepage_attr_group = {
449 .attrs = hugepage_attr, 300 .attrs = hugepage_attr,
450}; 301};
451 302
452static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
453 struct kobj_attribute *attr,
454 char *buf)
455{
456 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
457}
458
459static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
460 struct kobj_attribute *attr,
461 const char *buf, size_t count)
462{
463 unsigned long msecs;
464 int err;
465
466 err = kstrtoul(buf, 10, &msecs);
467 if (err || msecs > UINT_MAX)
468 return -EINVAL;
469
470 khugepaged_scan_sleep_millisecs = msecs;
471 khugepaged_sleep_expire = 0;
472 wake_up_interruptible(&khugepaged_wait);
473
474 return count;
475}
476static struct kobj_attribute scan_sleep_millisecs_attr =
477 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
478 scan_sleep_millisecs_store);
479
480static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
481 struct kobj_attribute *attr,
482 char *buf)
483{
484 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
485}
486
487static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
488 struct kobj_attribute *attr,
489 const char *buf, size_t count)
490{
491 unsigned long msecs;
492 int err;
493
494 err = kstrtoul(buf, 10, &msecs);
495 if (err || msecs > UINT_MAX)
496 return -EINVAL;
497
498 khugepaged_alloc_sleep_millisecs = msecs;
499 khugepaged_sleep_expire = 0;
500 wake_up_interruptible(&khugepaged_wait);
501
502 return count;
503}
504static struct kobj_attribute alloc_sleep_millisecs_attr =
505 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
506 alloc_sleep_millisecs_store);
507
508static ssize_t pages_to_scan_show(struct kobject *kobj,
509 struct kobj_attribute *attr,
510 char *buf)
511{
512 return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
513}
514static ssize_t pages_to_scan_store(struct kobject *kobj,
515 struct kobj_attribute *attr,
516 const char *buf, size_t count)
517{
518 int err;
519 unsigned long pages;
520
521 err = kstrtoul(buf, 10, &pages);
522 if (err || !pages || pages > UINT_MAX)
523 return -EINVAL;
524
525 khugepaged_pages_to_scan = pages;
526
527 return count;
528}
529static struct kobj_attribute pages_to_scan_attr =
530 __ATTR(pages_to_scan, 0644, pages_to_scan_show,
531 pages_to_scan_store);
532
533static ssize_t pages_collapsed_show(struct kobject *kobj,
534 struct kobj_attribute *attr,
535 char *buf)
536{
537 return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
538}
539static struct kobj_attribute pages_collapsed_attr =
540 __ATTR_RO(pages_collapsed);
541
542static ssize_t full_scans_show(struct kobject *kobj,
543 struct kobj_attribute *attr,
544 char *buf)
545{
546 return sprintf(buf, "%u\n", khugepaged_full_scans);
547}
548static struct kobj_attribute full_scans_attr =
549 __ATTR_RO(full_scans);
550
551static ssize_t khugepaged_defrag_show(struct kobject *kobj,
552 struct kobj_attribute *attr, char *buf)
553{
554 return single_flag_show(kobj, attr, buf,
555 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
556}
557static ssize_t khugepaged_defrag_store(struct kobject *kobj,
558 struct kobj_attribute *attr,
559 const char *buf, size_t count)
560{
561 return single_flag_store(kobj, attr, buf, count,
562 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
563}
564static struct kobj_attribute khugepaged_defrag_attr =
565 __ATTR(defrag, 0644, khugepaged_defrag_show,
566 khugepaged_defrag_store);
567
568/*
569 * max_ptes_none controls if khugepaged should collapse hugepages over
570 * any unmapped ptes in turn potentially increasing the memory
571 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
572 * reduce the available free memory in the system as it
573 * runs. Increasing max_ptes_none will instead potentially reduce the
574 * free memory in the system during the khugepaged scan.
575 */
576static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
577 struct kobj_attribute *attr,
578 char *buf)
579{
580 return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
581}
582static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
583 struct kobj_attribute *attr,
584 const char *buf, size_t count)
585{
586 int err;
587 unsigned long max_ptes_none;
588
589 err = kstrtoul(buf, 10, &max_ptes_none);
590 if (err || max_ptes_none > HPAGE_PMD_NR-1)
591 return -EINVAL;
592
593 khugepaged_max_ptes_none = max_ptes_none;
594
595 return count;
596}
597static struct kobj_attribute khugepaged_max_ptes_none_attr =
598 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
599 khugepaged_max_ptes_none_store);
600
601static struct attribute *khugepaged_attr[] = {
602 &khugepaged_defrag_attr.attr,
603 &khugepaged_max_ptes_none_attr.attr,
604 &pages_to_scan_attr.attr,
605 &pages_collapsed_attr.attr,
606 &full_scans_attr.attr,
607 &scan_sleep_millisecs_attr.attr,
608 &alloc_sleep_millisecs_attr.attr,
609 NULL,
610};
611
612static struct attribute_group khugepaged_attr_group = {
613 .attrs = khugepaged_attr,
614 .name = "khugepaged",
615};
616
617static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) 303static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
618{ 304{
619 int err; 305 int err;
@@ -672,8 +358,6 @@ static int __init hugepage_init(void)
672 return -EINVAL; 358 return -EINVAL;
673 } 359 }
674 360
675 khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
676 khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
677 /* 361 /*
678 * hugepages can't be allocated by the buddy allocator 362 * hugepages can't be allocated by the buddy allocator
679 */ 363 */
@@ -688,7 +372,7 @@ static int __init hugepage_init(void)
688 if (err) 372 if (err)
689 goto err_sysfs; 373 goto err_sysfs;
690 374
691 err = khugepaged_slab_init(); 375 err = khugepaged_init();
692 if (err) 376 if (err)
693 goto err_slab; 377 goto err_slab;
694 378
@@ -719,7 +403,7 @@ err_khugepaged:
719err_split_shrinker: 403err_split_shrinker:
720 unregister_shrinker(&huge_zero_page_shrinker); 404 unregister_shrinker(&huge_zero_page_shrinker);
721err_hzp_shrinker: 405err_hzp_shrinker:
722 khugepaged_slab_exit(); 406 khugepaged_destroy();
723err_slab: 407err_slab:
724 hugepage_exit_sysfs(hugepage_kobj); 408 hugepage_exit_sysfs(hugepage_kobj);
725err_sysfs: 409err_sysfs:
@@ -765,11 +449,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
765 return pmd; 449 return pmd;
766} 450}
767 451
768static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
769{
770 return pmd_mkhuge(mk_pmd(page, prot));
771}
772
773static inline struct list_head *page_deferred_list(struct page *page) 452static inline struct list_head *page_deferred_list(struct page *page)
774{ 453{
775 /* 454 /*
@@ -790,26 +469,23 @@ void prep_transhuge_page(struct page *page)
790 set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); 469 set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
791} 470}
792 471
793static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 472static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page,
794 struct vm_area_struct *vma, 473 gfp_t gfp)
795 unsigned long address, pmd_t *pmd,
796 struct page *page, gfp_t gfp,
797 unsigned int flags)
798{ 474{
475 struct vm_area_struct *vma = fe->vma;
799 struct mem_cgroup *memcg; 476 struct mem_cgroup *memcg;
800 pgtable_t pgtable; 477 pgtable_t pgtable;
801 spinlock_t *ptl; 478 unsigned long haddr = fe->address & HPAGE_PMD_MASK;
802 unsigned long haddr = address & HPAGE_PMD_MASK;
803 479
804 VM_BUG_ON_PAGE(!PageCompound(page), page); 480 VM_BUG_ON_PAGE(!PageCompound(page), page);
805 481
806 if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) { 482 if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) {
807 put_page(page); 483 put_page(page);
808 count_vm_event(THP_FAULT_FALLBACK); 484 count_vm_event(THP_FAULT_FALLBACK);
809 return VM_FAULT_FALLBACK; 485 return VM_FAULT_FALLBACK;
810 } 486 }
811 487
812 pgtable = pte_alloc_one(mm, haddr); 488 pgtable = pte_alloc_one(vma->vm_mm, haddr);
813 if (unlikely(!pgtable)) { 489 if (unlikely(!pgtable)) {
814 mem_cgroup_cancel_charge(page, memcg, true); 490 mem_cgroup_cancel_charge(page, memcg, true);
815 put_page(page); 491 put_page(page);
@@ -824,12 +500,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
824 */ 500 */
825 __SetPageUptodate(page); 501 __SetPageUptodate(page);
826 502
827 ptl = pmd_lock(mm, pmd); 503 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
828 if (unlikely(!pmd_none(*pmd))) { 504 if (unlikely(!pmd_none(*fe->pmd))) {
829 spin_unlock(ptl); 505 spin_unlock(fe->ptl);
830 mem_cgroup_cancel_charge(page, memcg, true); 506 mem_cgroup_cancel_charge(page, memcg, true);
831 put_page(page); 507 put_page(page);
832 pte_free(mm, pgtable); 508 pte_free(vma->vm_mm, pgtable);
833 } else { 509 } else {
834 pmd_t entry; 510 pmd_t entry;
835 511
@@ -837,12 +513,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
837 if (userfaultfd_missing(vma)) { 513 if (userfaultfd_missing(vma)) {
838 int ret; 514 int ret;
839 515
840 spin_unlock(ptl); 516 spin_unlock(fe->ptl);
841 mem_cgroup_cancel_charge(page, memcg, true); 517 mem_cgroup_cancel_charge(page, memcg, true);
842 put_page(page); 518 put_page(page);
843 pte_free(mm, pgtable); 519 pte_free(vma->vm_mm, pgtable);
844 ret = handle_userfault(vma, address, flags, 520 ret = handle_userfault(fe, VM_UFFD_MISSING);
845 VM_UFFD_MISSING);
846 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 521 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
847 return ret; 522 return ret;
848 } 523 }
@@ -852,11 +527,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
852 page_add_new_anon_rmap(page, vma, haddr, true); 527 page_add_new_anon_rmap(page, vma, haddr, true);
853 mem_cgroup_commit_charge(page, memcg, false, true); 528 mem_cgroup_commit_charge(page, memcg, false, true);
854 lru_cache_add_active_or_unevictable(page, vma); 529 lru_cache_add_active_or_unevictable(page, vma);
855 pgtable_trans_huge_deposit(mm, pmd, pgtable); 530 pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable);
856 set_pmd_at(mm, haddr, pmd, entry); 531 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
857 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 532 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
858 atomic_long_inc(&mm->nr_ptes); 533 atomic_long_inc(&vma->vm_mm->nr_ptes);
859 spin_unlock(ptl); 534 spin_unlock(fe->ptl);
860 count_vm_event(THP_FAULT_ALLOC); 535 count_vm_event(THP_FAULT_ALLOC);
861 } 536 }
862 537
@@ -883,12 +558,6 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
883 return GFP_TRANSHUGE | reclaim_flags; 558 return GFP_TRANSHUGE | reclaim_flags;
884} 559}
885 560
886/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
887static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
888{
889 return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0);
890}
891
892/* Caller must hold page table lock. */ 561/* Caller must hold page table lock. */
893static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 562static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
894 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 563 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
@@ -906,13 +575,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
906 return true; 575 return true;
907} 576}
908 577
909int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 578int do_huge_pmd_anonymous_page(struct fault_env *fe)
910 unsigned long address, pmd_t *pmd,
911 unsigned int flags)
912{ 579{
580 struct vm_area_struct *vma = fe->vma;
913 gfp_t gfp; 581 gfp_t gfp;
914 struct page *page; 582 struct page *page;
915 unsigned long haddr = address & HPAGE_PMD_MASK; 583 unsigned long haddr = fe->address & HPAGE_PMD_MASK;
916 584
917 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) 585 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
918 return VM_FAULT_FALLBACK; 586 return VM_FAULT_FALLBACK;
@@ -920,42 +588,40 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
920 return VM_FAULT_OOM; 588 return VM_FAULT_OOM;
921 if (unlikely(khugepaged_enter(vma, vma->vm_flags))) 589 if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
922 return VM_FAULT_OOM; 590 return VM_FAULT_OOM;
923 if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) && 591 if (!(fe->flags & FAULT_FLAG_WRITE) &&
592 !mm_forbids_zeropage(vma->vm_mm) &&
924 transparent_hugepage_use_zero_page()) { 593 transparent_hugepage_use_zero_page()) {
925 spinlock_t *ptl;
926 pgtable_t pgtable; 594 pgtable_t pgtable;
927 struct page *zero_page; 595 struct page *zero_page;
928 bool set; 596 bool set;
929 int ret; 597 int ret;
930 pgtable = pte_alloc_one(mm, haddr); 598 pgtable = pte_alloc_one(vma->vm_mm, haddr);
931 if (unlikely(!pgtable)) 599 if (unlikely(!pgtable))
932 return VM_FAULT_OOM; 600 return VM_FAULT_OOM;
933 zero_page = get_huge_zero_page(); 601 zero_page = get_huge_zero_page();
934 if (unlikely(!zero_page)) { 602 if (unlikely(!zero_page)) {
935 pte_free(mm, pgtable); 603 pte_free(vma->vm_mm, pgtable);
936 count_vm_event(THP_FAULT_FALLBACK); 604 count_vm_event(THP_FAULT_FALLBACK);
937 return VM_FAULT_FALLBACK; 605 return VM_FAULT_FALLBACK;
938 } 606 }
939 ptl = pmd_lock(mm, pmd); 607 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
940 ret = 0; 608 ret = 0;
941 set = false; 609 set = false;
942 if (pmd_none(*pmd)) { 610 if (pmd_none(*fe->pmd)) {
943 if (userfaultfd_missing(vma)) { 611 if (userfaultfd_missing(vma)) {
944 spin_unlock(ptl); 612 spin_unlock(fe->ptl);
945 ret = handle_userfault(vma, address, flags, 613 ret = handle_userfault(fe, VM_UFFD_MISSING);
946 VM_UFFD_MISSING);
947 VM_BUG_ON(ret & VM_FAULT_FALLBACK); 614 VM_BUG_ON(ret & VM_FAULT_FALLBACK);
948 } else { 615 } else {
949 set_huge_zero_page(pgtable, mm, vma, 616 set_huge_zero_page(pgtable, vma->vm_mm, vma,
950 haddr, pmd, 617 haddr, fe->pmd, zero_page);
951 zero_page); 618 spin_unlock(fe->ptl);
952 spin_unlock(ptl);
953 set = true; 619 set = true;
954 } 620 }
955 } else 621 } else
956 spin_unlock(ptl); 622 spin_unlock(fe->ptl);
957 if (!set) { 623 if (!set) {
958 pte_free(mm, pgtable); 624 pte_free(vma->vm_mm, pgtable);
959 put_huge_zero_page(); 625 put_huge_zero_page();
960 } 626 }
961 return ret; 627 return ret;
@@ -967,8 +633,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
967 return VM_FAULT_FALLBACK; 633 return VM_FAULT_FALLBACK;
968 } 634 }
969 prep_transhuge_page(page); 635 prep_transhuge_page(page);
970 return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, 636 return __do_huge_pmd_anonymous_page(fe, page, gfp);
971 flags);
972} 637}
973 638
974static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, 639static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
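From here on the huge-memory fault handlers take a single struct fault_env instead of the old (mm, vma, address, pmd, flags) argument list. Only some of its fields appear in these hunks; an inferred sketch of at least what the structure carries (the real definition is in include/linux/mm.h and may hold more):

        /* inferred from the fe-> uses in this file; not the literal definition */
        struct fault_env {
                struct vm_area_struct *vma;     /* faulting VMA          */
                unsigned long address;          /* faulting address      */
                unsigned int flags;             /* FAULT_FLAG_* bits     */
                pmd_t *pmd;                     /* pmd for the address   */
                pte_t *pte;                     /* pte, once mapped      */
                spinlock_t *ptl;                /* page-table lock       */
        };

The mm is reached as fe->vma->vm_mm, and the handlers now stash the page-table lock in fe->ptl rather than in a local variable.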
@@ -1080,14 +745,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1080 struct page *src_page; 745 struct page *src_page;
1081 pmd_t pmd; 746 pmd_t pmd;
1082 pgtable_t pgtable = NULL; 747 pgtable_t pgtable = NULL;
1083 int ret; 748 int ret = -ENOMEM;
1084 749
1085 if (!vma_is_dax(vma)) { 750 /* Skip if can be re-fill on fault */
1086 ret = -ENOMEM; 751 if (!vma_is_anonymous(vma))
1087 pgtable = pte_alloc_one(dst_mm, addr); 752 return 0;
1088 if (unlikely(!pgtable)) 753
1089 goto out; 754 pgtable = pte_alloc_one(dst_mm, addr);
1090 } 755 if (unlikely(!pgtable))
756 goto out;
1091 757
1092 dst_ptl = pmd_lock(dst_mm, dst_pmd); 758 dst_ptl = pmd_lock(dst_mm, dst_pmd);
1093 src_ptl = pmd_lockptr(src_mm, src_pmd); 759 src_ptl = pmd_lockptr(src_mm, src_pmd);
@@ -1095,7 +761,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1095 761
1096 ret = -EAGAIN; 762 ret = -EAGAIN;
1097 pmd = *src_pmd; 763 pmd = *src_pmd;
1098 if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) { 764 if (unlikely(!pmd_trans_huge(pmd))) {
1099 pte_free(dst_mm, pgtable); 765 pte_free(dst_mm, pgtable);
1100 goto out_unlock; 766 goto out_unlock;
1101 } 767 }
@@ -1118,16 +784,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1118 goto out_unlock; 784 goto out_unlock;
1119 } 785 }
1120 786
1121 if (!vma_is_dax(vma)) { 787 src_page = pmd_page(pmd);
1122 /* thp accounting separate from pmd_devmap accounting */ 788 VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
1123 src_page = pmd_page(pmd); 789 get_page(src_page);
1124 VM_BUG_ON_PAGE(!PageHead(src_page), src_page); 790 page_dup_rmap(src_page, true);
1125 get_page(src_page); 791 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1126 page_dup_rmap(src_page, true); 792 atomic_long_inc(&dst_mm->nr_ptes);
1127 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); 793 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1128 atomic_long_inc(&dst_mm->nr_ptes);
1129 pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1130 }
1131 794
1132 pmdp_set_wrprotect(src_mm, addr, src_pmd); 795 pmdp_set_wrprotect(src_mm, addr, src_pmd);
1133 pmd = pmd_mkold(pmd_wrprotect(pmd)); 796 pmd = pmd_mkold(pmd_wrprotect(pmd));
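copy_huge_pmd() now refuses to copy anything that is not an anonymous mapping: a file-backed or DAX huge PMD is simply left empty in the child and re-created on the first fault, which is why the vma_is_dax() special cases around the page-table deposit disappear. A condensed, non-literal view of the new prologue:

        int ret = -ENOMEM;

        /* file/DAX huge mappings are not duplicated at fork() time;
         * the child re-populates the PMD on its first fault */
        if (!vma_is_anonymous(vma))
                return 0;

        pgtable = pte_alloc_one(dst_mm, addr);
        if (unlikely(!pgtable))
                goto out;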
@@ -1141,38 +804,31 @@ out:
1141 return ret; 804 return ret;
1142} 805}
1143 806
1144void huge_pmd_set_accessed(struct mm_struct *mm, 807void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd)
1145 struct vm_area_struct *vma,
1146 unsigned long address,
1147 pmd_t *pmd, pmd_t orig_pmd,
1148 int dirty)
1149{ 808{
1150 spinlock_t *ptl;
1151 pmd_t entry; 809 pmd_t entry;
1152 unsigned long haddr; 810 unsigned long haddr;
1153 811
1154 ptl = pmd_lock(mm, pmd); 812 fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd);
1155 if (unlikely(!pmd_same(*pmd, orig_pmd))) 813 if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
1156 goto unlock; 814 goto unlock;
1157 815
1158 entry = pmd_mkyoung(orig_pmd); 816 entry = pmd_mkyoung(orig_pmd);
1159 haddr = address & HPAGE_PMD_MASK; 817 haddr = fe->address & HPAGE_PMD_MASK;
1160 if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) 818 if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry,
1161 update_mmu_cache_pmd(vma, address, pmd); 819 fe->flags & FAULT_FLAG_WRITE))
820 update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd);
1162 821
1163unlock: 822unlock:
1164 spin_unlock(ptl); 823 spin_unlock(fe->ptl);
1165} 824}
1166 825
1167static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 826static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd,
1168 struct vm_area_struct *vma, 827 struct page *page)
1169 unsigned long address,
1170 pmd_t *pmd, pmd_t orig_pmd,
1171 struct page *page,
1172 unsigned long haddr)
1173{ 828{
829 struct vm_area_struct *vma = fe->vma;
830 unsigned long haddr = fe->address & HPAGE_PMD_MASK;
1174 struct mem_cgroup *memcg; 831 struct mem_cgroup *memcg;
1175 spinlock_t *ptl;
1176 pgtable_t pgtable; 832 pgtable_t pgtable;
1177 pmd_t _pmd; 833 pmd_t _pmd;
1178 int ret = 0, i; 834 int ret = 0, i;
@@ -1189,11 +845,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1189 845
1190 for (i = 0; i < HPAGE_PMD_NR; i++) { 846 for (i = 0; i < HPAGE_PMD_NR; i++) {
1191 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | 847 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
1192 __GFP_OTHER_NODE, 848 __GFP_OTHER_NODE, vma,
1193 vma, address, page_to_nid(page)); 849 fe->address, page_to_nid(page));
1194 if (unlikely(!pages[i] || 850 if (unlikely(!pages[i] ||
1195 mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, 851 mem_cgroup_try_charge(pages[i], vma->vm_mm,
1196 &memcg, false))) { 852 GFP_KERNEL, &memcg, false))) {
1197 if (pages[i]) 853 if (pages[i])
1198 put_page(pages[i]); 854 put_page(pages[i]);
1199 while (--i >= 0) { 855 while (--i >= 0) {
@@ -1219,41 +875,41 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
1219 875
1220 mmun_start = haddr; 876 mmun_start = haddr;
1221 mmun_end = haddr + HPAGE_PMD_SIZE; 877 mmun_end = haddr + HPAGE_PMD_SIZE;
1222 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 878 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
1223 879
1224 ptl = pmd_lock(mm, pmd); 880 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
1225 if (unlikely(!pmd_same(*pmd, orig_pmd))) 881 if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
1226 goto out_free_pages; 882 goto out_free_pages;
1227 VM_BUG_ON_PAGE(!PageHead(page), page); 883 VM_BUG_ON_PAGE(!PageHead(page), page);
1228 884
1229 pmdp_huge_clear_flush_notify(vma, haddr, pmd); 885 pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
1230 /* leave pmd empty until pte is filled */ 886 /* leave pmd empty until pte is filled */
1231 887
1232 pgtable = pgtable_trans_huge_withdraw(mm, pmd); 888 pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd);
1233 pmd_populate(mm, &_pmd, pgtable); 889 pmd_populate(vma->vm_mm, &_pmd, pgtable);
1234 890
1235 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 891 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1236 pte_t *pte, entry; 892 pte_t entry;
1237 entry = mk_pte(pages[i], vma->vm_page_prot); 893 entry = mk_pte(pages[i], vma->vm_page_prot);
1238 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 894 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1239 memcg = (void *)page_private(pages[i]); 895 memcg = (void *)page_private(pages[i]);
1240 set_page_private(pages[i], 0); 896 set_page_private(pages[i], 0);
1241 page_add_new_anon_rmap(pages[i], vma, haddr, false); 897 page_add_new_anon_rmap(pages[i], fe->vma, haddr, false);
1242 mem_cgroup_commit_charge(pages[i], memcg, false, false); 898 mem_cgroup_commit_charge(pages[i], memcg, false, false);
1243 lru_cache_add_active_or_unevictable(pages[i], vma); 899 lru_cache_add_active_or_unevictable(pages[i], vma);
1244 pte = pte_offset_map(&_pmd, haddr); 900 fe->pte = pte_offset_map(&_pmd, haddr);
1245 VM_BUG_ON(!pte_none(*pte)); 901 VM_BUG_ON(!pte_none(*fe->pte));
1246 set_pte_at(mm, haddr, pte, entry); 902 set_pte_at(vma->vm_mm, haddr, fe->pte, entry);
1247 pte_unmap(pte); 903 pte_unmap(fe->pte);
1248 } 904 }
1249 kfree(pages); 905 kfree(pages);
1250 906
1251 smp_wmb(); /* make pte visible before pmd */ 907 smp_wmb(); /* make pte visible before pmd */
1252 pmd_populate(mm, pmd, pgtable); 908 pmd_populate(vma->vm_mm, fe->pmd, pgtable);
1253 page_remove_rmap(page, true); 909 page_remove_rmap(page, true);
1254 spin_unlock(ptl); 910 spin_unlock(fe->ptl);
1255 911
1256 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 912 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
1257 913
1258 ret |= VM_FAULT_WRITE; 914 ret |= VM_FAULT_WRITE;
1259 put_page(page); 915 put_page(page);
@@ -1262,8 +918,8 @@ out:
1262 return ret; 918 return ret;
1263 919
1264out_free_pages: 920out_free_pages:
1265 spin_unlock(ptl); 921 spin_unlock(fe->ptl);
1266 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 922 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
1267 for (i = 0; i < HPAGE_PMD_NR; i++) { 923 for (i = 0; i < HPAGE_PMD_NR; i++) {
1268 memcg = (void *)page_private(pages[i]); 924 memcg = (void *)page_private(pages[i]);
1269 set_page_private(pages[i], 0); 925 set_page_private(pages[i], 0);
@@ -1274,25 +930,23 @@ out_free_pages:
1274 goto out; 930 goto out;
1275} 931}
1276 932
1277int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 933int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd)
1278 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
1279{ 934{
1280 spinlock_t *ptl; 935 struct vm_area_struct *vma = fe->vma;
1281 int ret = 0;
1282 struct page *page = NULL, *new_page; 936 struct page *page = NULL, *new_page;
1283 struct mem_cgroup *memcg; 937 struct mem_cgroup *memcg;
1284 unsigned long haddr; 938 unsigned long haddr = fe->address & HPAGE_PMD_MASK;
1285 unsigned long mmun_start; /* For mmu_notifiers */ 939 unsigned long mmun_start; /* For mmu_notifiers */
1286 unsigned long mmun_end; /* For mmu_notifiers */ 940 unsigned long mmun_end; /* For mmu_notifiers */
1287 gfp_t huge_gfp; /* for allocation and charge */ 941 gfp_t huge_gfp; /* for allocation and charge */
942 int ret = 0;
1288 943
1289 ptl = pmd_lockptr(mm, pmd); 944 fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd);
1290 VM_BUG_ON_VMA(!vma->anon_vma, vma); 945 VM_BUG_ON_VMA(!vma->anon_vma, vma);
1291 haddr = address & HPAGE_PMD_MASK;
1292 if (is_huge_zero_pmd(orig_pmd)) 946 if (is_huge_zero_pmd(orig_pmd))
1293 goto alloc; 947 goto alloc;
1294 spin_lock(ptl); 948 spin_lock(fe->ptl);
1295 if (unlikely(!pmd_same(*pmd, orig_pmd))) 949 if (unlikely(!pmd_same(*fe->pmd, orig_pmd)))
1296 goto out_unlock; 950 goto out_unlock;
1297 951
1298 page = pmd_page(orig_pmd); 952 page = pmd_page(orig_pmd);
@@ -1305,13 +959,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1305 pmd_t entry; 959 pmd_t entry;
1306 entry = pmd_mkyoung(orig_pmd); 960 entry = pmd_mkyoung(orig_pmd);
1307 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 961 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1308 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) 962 if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1))
1309 update_mmu_cache_pmd(vma, address, pmd); 963 update_mmu_cache_pmd(vma, fe->address, fe->pmd);
1310 ret |= VM_FAULT_WRITE; 964 ret |= VM_FAULT_WRITE;
1311 goto out_unlock; 965 goto out_unlock;
1312 } 966 }
1313 get_page(page); 967 get_page(page);
1314 spin_unlock(ptl); 968 spin_unlock(fe->ptl);
1315alloc: 969alloc:
1316 if (transparent_hugepage_enabled(vma) && 970 if (transparent_hugepage_enabled(vma) &&
1317 !transparent_hugepage_debug_cow()) { 971 !transparent_hugepage_debug_cow()) {
@@ -1324,13 +978,12 @@ alloc:
1324 prep_transhuge_page(new_page); 978 prep_transhuge_page(new_page);
1325 } else { 979 } else {
1326 if (!page) { 980 if (!page) {
1327 split_huge_pmd(vma, pmd, address); 981 split_huge_pmd(vma, fe->pmd, fe->address);
1328 ret |= VM_FAULT_FALLBACK; 982 ret |= VM_FAULT_FALLBACK;
1329 } else { 983 } else {
1330 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 984 ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page);
1331 pmd, orig_pmd, page, haddr);
1332 if (ret & VM_FAULT_OOM) { 985 if (ret & VM_FAULT_OOM) {
1333 split_huge_pmd(vma, pmd, address); 986 split_huge_pmd(vma, fe->pmd, fe->address);
1334 ret |= VM_FAULT_FALLBACK; 987 ret |= VM_FAULT_FALLBACK;
1335 } 988 }
1336 put_page(page); 989 put_page(page);
@@ -1339,14 +992,12 @@ alloc:
1339 goto out; 992 goto out;
1340 } 993 }
1341 994
1342 if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg, 995 if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm,
1343 true))) { 996 huge_gfp, &memcg, true))) {
1344 put_page(new_page); 997 put_page(new_page);
1345 if (page) { 998 split_huge_pmd(vma, fe->pmd, fe->address);
1346 split_huge_pmd(vma, pmd, address); 999 if (page)
1347 put_page(page); 1000 put_page(page);
1348 } else
1349 split_huge_pmd(vma, pmd, address);
1350 ret |= VM_FAULT_FALLBACK; 1001 ret |= VM_FAULT_FALLBACK;
1351 count_vm_event(THP_FAULT_FALLBACK); 1002 count_vm_event(THP_FAULT_FALLBACK);
1352 goto out; 1003 goto out;
@@ -1362,13 +1013,13 @@ alloc:
1362 1013
1363 mmun_start = haddr; 1014 mmun_start = haddr;
1364 mmun_end = haddr + HPAGE_PMD_SIZE; 1015 mmun_end = haddr + HPAGE_PMD_SIZE;
1365 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 1016 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
1366 1017
1367 spin_lock(ptl); 1018 spin_lock(fe->ptl);
1368 if (page) 1019 if (page)
1369 put_page(page); 1020 put_page(page);
1370 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 1021 if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) {
1371 spin_unlock(ptl); 1022 spin_unlock(fe->ptl);
1372 mem_cgroup_cancel_charge(new_page, memcg, true); 1023 mem_cgroup_cancel_charge(new_page, memcg, true);
1373 put_page(new_page); 1024 put_page(new_page);
1374 goto out_mn; 1025 goto out_mn;
@@ -1376,14 +1027,14 @@ alloc:
1376 pmd_t entry; 1027 pmd_t entry;
1377 entry = mk_huge_pmd(new_page, vma->vm_page_prot); 1028 entry = mk_huge_pmd(new_page, vma->vm_page_prot);
1378 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1029 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1379 pmdp_huge_clear_flush_notify(vma, haddr, pmd); 1030 pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd);
1380 page_add_new_anon_rmap(new_page, vma, haddr, true); 1031 page_add_new_anon_rmap(new_page, vma, haddr, true);
1381 mem_cgroup_commit_charge(new_page, memcg, false, true); 1032 mem_cgroup_commit_charge(new_page, memcg, false, true);
1382 lru_cache_add_active_or_unevictable(new_page, vma); 1033 lru_cache_add_active_or_unevictable(new_page, vma);
1383 set_pmd_at(mm, haddr, pmd, entry); 1034 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
1384 update_mmu_cache_pmd(vma, address, pmd); 1035 update_mmu_cache_pmd(vma, fe->address, fe->pmd);
1385 if (!page) { 1036 if (!page) {
1386 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); 1037 add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1387 put_huge_zero_page(); 1038 put_huge_zero_page();
1388 } else { 1039 } else {
1389 VM_BUG_ON_PAGE(!PageHead(page), page); 1040 VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -1392,13 +1043,13 @@ alloc:
1392 } 1043 }
1393 ret |= VM_FAULT_WRITE; 1044 ret |= VM_FAULT_WRITE;
1394 } 1045 }
1395 spin_unlock(ptl); 1046 spin_unlock(fe->ptl);
1396out_mn: 1047out_mn:
1397 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 1048 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
1398out: 1049out:
1399 return ret; 1050 return ret;
1400out_unlock: 1051out_unlock:
1401 spin_unlock(ptl); 1052 spin_unlock(fe->ptl);
1402 return ret; 1053 return ret;
1403} 1054}
1404 1055
@@ -1432,6 +1083,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1432 * We don't mlock() pte-mapped THPs. This way we can avoid 1083 * We don't mlock() pte-mapped THPs. This way we can avoid
1433 * leaking mlocked pages into non-VM_LOCKED VMAs. 1084 * leaking mlocked pages into non-VM_LOCKED VMAs.
1434 * 1085 *
1086 * For anon THP:
1087 *
1435 * In most cases the pmd is the only mapping of the page as we 1088 * In most cases the pmd is the only mapping of the page as we
1436 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for 1089 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
1437 * writable private mappings in populate_vma_page_range(). 1090 * writable private mappings in populate_vma_page_range().
@@ -1439,15 +1092,26 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1439 * The only scenario when we have the page shared here is if we 1092 * The only scenario when we have the page shared here is if we
1440 * mlocking read-only mapping shared over fork(). We skip 1093 * mlocking read-only mapping shared over fork(). We skip
1441 * mlocking such pages. 1094 * mlocking such pages.
1095 *
1096 * For file THP:
1097 *
1098 * We can expect PageDoubleMap() to be stable under page lock:
1099 * for file pages we set it in page_add_file_rmap(), which
1100 * requires page to be locked.
1442 */ 1101 */
1443 if (compound_mapcount(page) == 1 && !PageDoubleMap(page) && 1102
1444 page->mapping && trylock_page(page)) { 1103 if (PageAnon(page) && compound_mapcount(page) != 1)
1445 lru_add_drain(); 1104 goto skip_mlock;
1446 if (page->mapping) 1105 if (PageDoubleMap(page) || !page->mapping)
1447 mlock_vma_page(page); 1106 goto skip_mlock;
1448 unlock_page(page); 1107 if (!trylock_page(page))
1449 } 1108 goto skip_mlock;
1109 lru_add_drain();
1110 if (page->mapping && !PageDoubleMap(page))
1111 mlock_vma_page(page);
1112 unlock_page(page);
1450 } 1113 }
1114skip_mlock:
1451 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1115 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1452 VM_BUG_ON_PAGE(!PageCompound(page), page); 1116 VM_BUG_ON_PAGE(!PageCompound(page), page);
1453 if (flags & FOLL_GET) 1117 if (flags & FOLL_GET)
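The restructured mlock path above replaces one compound condition with a series of early bail-outs to a skip_mlock label, so the anon-only mapcount test and the new file-THP rules read as separate guards. A small user-space sketch of the same guard-clause flow, with the page state reduced to plain booleans; the fields are illustrative, not the kernel's struct page.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative page state; in the kernel these are PageAnon(), compound_mapcount(),
 * PageDoubleMap(), page->mapping and trylock_page(). */
struct fake_page {
	bool anon;
	int compound_mapcount;
	bool double_map;
	bool has_mapping;
	bool lock_free;		/* trylock would succeed */
};

static bool should_mlock(const struct fake_page *page)
{
	/* Anon THP: only mlock when the pmd is the sole mapping. */
	if (page->anon && page->compound_mapcount != 1)
		return false;
	/* Skip pages that are pte-mapped as well, or already truncated. */
	if (page->double_map || !page->has_mapping)
		return false;
	/* Never sleep on the page lock here; just skip on contention. */
	if (!page->lock_free)
		return false;
	return true;
}

int main(void)
{
	struct fake_page anon_shared = { .anon = true, .compound_mapcount = 2,
					 .has_mapping = true, .lock_free = true };
	struct fake_page file_thp = { .anon = false, .compound_mapcount = 3,
				      .has_mapping = true, .lock_free = true };

	printf("anon shared: %s\n", should_mlock(&anon_shared) ? "mlock" : "skip");
	printf("file THP:    %s\n", should_mlock(&file_thp) ? "mlock" : "skip");
	return 0;
}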
@@ -1458,13 +1122,12 @@ out:
1458} 1122}
1459 1123
1460/* NUMA hinting page fault entry point for trans huge pmds */ 1124/* NUMA hinting page fault entry point for trans huge pmds */
1461int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 1125int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
1462 unsigned long addr, pmd_t pmd, pmd_t *pmdp)
1463{ 1126{
1464 spinlock_t *ptl; 1127 struct vm_area_struct *vma = fe->vma;
1465 struct anon_vma *anon_vma = NULL; 1128 struct anon_vma *anon_vma = NULL;
1466 struct page *page; 1129 struct page *page;
1467 unsigned long haddr = addr & HPAGE_PMD_MASK; 1130 unsigned long haddr = fe->address & HPAGE_PMD_MASK;
1468 int page_nid = -1, this_nid = numa_node_id(); 1131 int page_nid = -1, this_nid = numa_node_id();
1469 int target_nid, last_cpupid = -1; 1132 int target_nid, last_cpupid = -1;
1470 bool page_locked; 1133 bool page_locked;
@@ -1475,8 +1138,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1475 /* A PROT_NONE fault should not end up here */ 1138 /* A PROT_NONE fault should not end up here */
1476 BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); 1139 BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
1477 1140
1478 ptl = pmd_lock(mm, pmdp); 1141 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
1479 if (unlikely(!pmd_same(pmd, *pmdp))) 1142 if (unlikely(!pmd_same(pmd, *fe->pmd)))
1480 goto out_unlock; 1143 goto out_unlock;
1481 1144
1482 /* 1145 /*
@@ -1484,9 +1147,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1484 * without disrupting NUMA hinting information. Do not relock and 1147 * without disrupting NUMA hinting information. Do not relock and
1485 * check_same as the page may no longer be mapped. 1148 * check_same as the page may no longer be mapped.
1486 */ 1149 */
1487 if (unlikely(pmd_trans_migrating(*pmdp))) { 1150 if (unlikely(pmd_trans_migrating(*fe->pmd))) {
1488 page = pmd_page(*pmdp); 1151 page = pmd_page(*fe->pmd);
1489 spin_unlock(ptl); 1152 spin_unlock(fe->ptl);
1490 wait_on_page_locked(page); 1153 wait_on_page_locked(page);
1491 goto out; 1154 goto out;
1492 } 1155 }
@@ -1519,7 +1182,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1519 1182
1520 /* Migration could have started since the pmd_trans_migrating check */ 1183 /* Migration could have started since the pmd_trans_migrating check */
1521 if (!page_locked) { 1184 if (!page_locked) {
1522 spin_unlock(ptl); 1185 spin_unlock(fe->ptl);
1523 wait_on_page_locked(page); 1186 wait_on_page_locked(page);
1524 page_nid = -1; 1187 page_nid = -1;
1525 goto out; 1188 goto out;
@@ -1530,12 +1193,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1530 * to serialise splits 1193
1531 */ 1194 */
1532 get_page(page); 1195 get_page(page);
1533 spin_unlock(ptl); 1196 spin_unlock(fe->ptl);
1534 anon_vma = page_lock_anon_vma_read(page); 1197 anon_vma = page_lock_anon_vma_read(page);
1535 1198
1536 /* Confirm the PMD did not change while page_table_lock was released */ 1199 /* Confirm the PMD did not change while page_table_lock was released */
1537 spin_lock(ptl); 1200 spin_lock(fe->ptl);
1538 if (unlikely(!pmd_same(pmd, *pmdp))) { 1201 if (unlikely(!pmd_same(pmd, *fe->pmd))) {
1539 unlock_page(page); 1202 unlock_page(page);
1540 put_page(page); 1203 put_page(page);
1541 page_nid = -1; 1204 page_nid = -1;
@@ -1553,9 +1216,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1553 * Migrate the THP to the requested node, returns with page unlocked 1216 * Migrate the THP to the requested node, returns with page unlocked
1554 * and access rights restored. 1217 * and access rights restored.
1555 */ 1218 */
1556 spin_unlock(ptl); 1219 spin_unlock(fe->ptl);
1557 migrated = migrate_misplaced_transhuge_page(mm, vma, 1220 migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
1558 pmdp, pmd, addr, page, target_nid); 1221 fe->pmd, pmd, fe->address, page, target_nid);
1559 if (migrated) { 1222 if (migrated) {
1560 flags |= TNF_MIGRATED; 1223 flags |= TNF_MIGRATED;
1561 page_nid = target_nid; 1224 page_nid = target_nid;
@@ -1570,18 +1233,18 @@ clear_pmdnuma:
1570 pmd = pmd_mkyoung(pmd); 1233 pmd = pmd_mkyoung(pmd);
1571 if (was_writable) 1234 if (was_writable)
1572 pmd = pmd_mkwrite(pmd); 1235 pmd = pmd_mkwrite(pmd);
1573 set_pmd_at(mm, haddr, pmdp, pmd); 1236 set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd);
1574 update_mmu_cache_pmd(vma, addr, pmdp); 1237 update_mmu_cache_pmd(vma, fe->address, fe->pmd);
1575 unlock_page(page); 1238 unlock_page(page);
1576out_unlock: 1239out_unlock:
1577 spin_unlock(ptl); 1240 spin_unlock(fe->ptl);
1578 1241
1579out: 1242out:
1580 if (anon_vma) 1243 if (anon_vma)
1581 page_unlock_anon_vma_read(anon_vma); 1244 page_unlock_anon_vma_read(anon_vma);
1582 1245
1583 if (page_nid != -1) 1246 if (page_nid != -1)
1584 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); 1247 task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags);
1585 1248
1586 return 0; 1249 return 0;
1587} 1250}
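do_huge_pmd_numa_page() above keeps repeating one pattern: drop fe->ptl before any sleeping operation (waiting on a page lock, taking the anon_vma lock, migrating), then re-take it and bail out unless pmd_same() still holds. A compact pthread-based sketch of that optimistic drop-and-recheck idiom; the "pmd" here is just a shared integer, purely for illustration.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;
static long pmd_val = 42;	/* stands in for *fe->pmd */

/* Returns 1 if the slow work was applied, 0 if we raced and must bail out. */
static int update_with_recheck(long expected)
{
	/* the caller has already dropped ptl; do the potentially slow part */
	usleep(1000);		/* stands in for wait_on_page_locked() or migration */

	pthread_mutex_lock(&ptl);
	if (pmd_val != expected) {	/* the pmd_same() style recheck */
		pthread_mutex_unlock(&ptl);
		return 0;
	}
	pmd_val = expected + 1;		/* nothing changed under us, safe to proceed */
	pthread_mutex_unlock(&ptl);
	return 1;
}

int main(void)
{
	pthread_mutex_lock(&ptl);
	long snapshot = pmd_val;	/* read under the lock */
	pthread_mutex_unlock(&ptl);

	printf("update %s\n", update_with_recheck(snapshot) ? "applied" : "lost the race");
	return 0;
}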
@@ -1684,12 +1347,18 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1684 struct page *page = pmd_page(orig_pmd); 1347 struct page *page = pmd_page(orig_pmd);
1685 page_remove_rmap(page, true); 1348 page_remove_rmap(page, true);
1686 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); 1349 VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
1687 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1688 VM_BUG_ON_PAGE(!PageHead(page), page); 1350 VM_BUG_ON_PAGE(!PageHead(page), page);
1689 pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); 1351 if (PageAnon(page)) {
1690 atomic_long_dec(&tlb->mm->nr_ptes); 1352 pgtable_t pgtable;
1353 pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
1354 pte_free(tlb->mm, pgtable);
1355 atomic_long_dec(&tlb->mm->nr_ptes);
1356 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1357 } else {
1358 add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
1359 }
1691 spin_unlock(ptl); 1360 spin_unlock(ptl);
1692 tlb_remove_page(tlb, page); 1361 tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
1693 } 1362 }
1694 return 1; 1363 return 1;
1695} 1364}
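The zap_huge_pmd() change above splits the teardown bookkeeping: an anon THP still has a deposited page table to withdraw and free, so nr_ptes and MM_ANONPAGES go down, while a file THP only needs MM_FILEPAGES adjusted. A toy model of that counter bookkeeping; the counter names mirror the diff, the structs are stand-ins.

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_NR 512	/* one 2 MiB huge page = 512 base pages on x86-64 */

struct mm_counters {
	long nr_ptes;
	long anon_pages;
	long file_pages;
};

static void zap_huge_pmd_accounting(struct mm_counters *mm, bool page_is_anon)
{
	if (page_is_anon) {
		/* anon THP: a pre-deposited page table is withdrawn and freed */
		mm->nr_ptes--;
		mm->anon_pages -= HPAGE_PMD_NR;
	} else {
		/* file THP: no deposited page table, only the rss counter moves */
		mm->file_pages -= HPAGE_PMD_NR;
	}
}

int main(void)
{
	struct mm_counters mm = { .nr_ptes = 10, .anon_pages = 2048, .file_pages = 1024 };

	zap_huge_pmd_accounting(&mm, true);
	zap_huge_pmd_accounting(&mm, false);
	printf("nr_ptes=%ld anon=%ld file=%ld\n",
	       mm.nr_ptes, mm.anon_pages, mm.file_pages);
	return 0;
}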
@@ -1779,7 +1448,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1779 entry = pmd_mkwrite(entry); 1448 entry = pmd_mkwrite(entry);
1780 ret = HPAGE_PMD_NR; 1449 ret = HPAGE_PMD_NR;
1781 set_pmd_at(mm, addr, pmd, entry); 1450 set_pmd_at(mm, addr, pmd, entry);
1782 BUG_ON(!preserve_write && pmd_write(entry)); 1451 BUG_ON(vma_is_anonymous(vma) && !preserve_write &&
1452 pmd_write(entry));
1783 } 1453 }
1784 spin_unlock(ptl); 1454 spin_unlock(ptl);
1785 } 1455 }
@@ -1788,10 +1458,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1788} 1458}
1789 1459
1790/* 1460/*
1791 * Returns true if a given pmd maps a thp, false otherwise. 1461 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
1792 * 1462 *
1793 * Note that if it returns true, this routine returns without unlocking page 1463 * Note that if it returns page table lock pointer, this routine returns without
1794 * table lock. So callers must unlock it. 1464 * unlocking page table lock. So callers must unlock it.
1795 */ 1465 */
1796spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) 1466spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1797{ 1467{
@@ -1803,1040 +1473,6 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1803 return NULL; 1473 return NULL;
1804} 1474}
1805 1475
1806#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE)
1807
1808int hugepage_madvise(struct vm_area_struct *vma,
1809 unsigned long *vm_flags, int advice)
1810{
1811 switch (advice) {
1812 case MADV_HUGEPAGE:
1813#ifdef CONFIG_S390
1814 /*
1815 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
1816 * can't handle this properly after s390_enable_sie, so we simply
1817 * ignore the madvise to prevent qemu from causing a SIGSEGV.
1818 */
1819 if (mm_has_pgste(vma->vm_mm))
1820 return 0;
1821#endif
1822 /*
1823 * Be somewhat over-protective like KSM for now!
1824 */
1825 if (*vm_flags & VM_NO_THP)
1826 return -EINVAL;
1827 *vm_flags &= ~VM_NOHUGEPAGE;
1828 *vm_flags |= VM_HUGEPAGE;
1829 /*
1830 * If the vma become good for khugepaged to scan,
1831 * register it here without waiting a page fault that
1832 * may not happen any time soon.
1833 */
1834 if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags)))
1835 return -ENOMEM;
1836 break;
1837 case MADV_NOHUGEPAGE:
1838 /*
1839 * Be somewhat over-protective like KSM for now!
1840 */
1841 if (*vm_flags & VM_NO_THP)
1842 return -EINVAL;
1843 *vm_flags &= ~VM_HUGEPAGE;
1844 *vm_flags |= VM_NOHUGEPAGE;
1845 /*
1846 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
1847 * this vma even if we leave the mm registered in khugepaged if
1848 * it got registered before VM_NOHUGEPAGE was set.
1849 */
1850 break;
1851 }
1852
1853 return 0;
1854}
1855
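hugepage_madvise() above, removed from this file as part of the khugepaged extraction, is the kernel side of MADV_HUGEPAGE and MADV_NOHUGEPAGE. From user space the interface is simply madvise(2) on an anonymous mapping; a minimal example follows, assuming a Linux system with transparent huge pages enabled (whether a huge page is actually used still depends on the /sys/kernel/mm/transparent_hugepage settings).

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 8UL << 20;	/* 8 MiB, room for several 2 MiB THPs */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Ask the kernel to back this range with transparent huge pages. */
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");

	/* Touch the memory so it is faulted in (and becomes eligible for khugepaged). */
	memset(p, 0, len);

	munmap(p, len);
	return 0;
}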
1856static int __init khugepaged_slab_init(void)
1857{
1858 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1859 sizeof(struct mm_slot),
1860 __alignof__(struct mm_slot), 0, NULL);
1861 if (!mm_slot_cache)
1862 return -ENOMEM;
1863
1864 return 0;
1865}
1866
1867static void __init khugepaged_slab_exit(void)
1868{
1869 kmem_cache_destroy(mm_slot_cache);
1870}
1871
1872static inline struct mm_slot *alloc_mm_slot(void)
1873{
1874 if (!mm_slot_cache) /* initialization failed */
1875 return NULL;
1876 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1877}
1878
1879static inline void free_mm_slot(struct mm_slot *mm_slot)
1880{
1881 kmem_cache_free(mm_slot_cache, mm_slot);
1882}
1883
1884static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1885{
1886 struct mm_slot *mm_slot;
1887
1888 hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
1889 if (mm == mm_slot->mm)
1890 return mm_slot;
1891
1892 return NULL;
1893}
1894
1895static void insert_to_mm_slots_hash(struct mm_struct *mm,
1896 struct mm_slot *mm_slot)
1897{
1898 mm_slot->mm = mm;
1899 hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
1900}
1901
1902static inline int khugepaged_test_exit(struct mm_struct *mm)
1903{
1904 return atomic_read(&mm->mm_users) == 0;
1905}
1906
1907int __khugepaged_enter(struct mm_struct *mm)
1908{
1909 struct mm_slot *mm_slot;
1910 int wakeup;
1911
1912 mm_slot = alloc_mm_slot();
1913 if (!mm_slot)
1914 return -ENOMEM;
1915
1916 /* __khugepaged_exit() must not run from under us */
1917 VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
1918 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1919 free_mm_slot(mm_slot);
1920 return 0;
1921 }
1922
1923 spin_lock(&khugepaged_mm_lock);
1924 insert_to_mm_slots_hash(mm, mm_slot);
1925 /*
1926 * Insert just behind the scanning cursor, to let the area settle
1927 * down a little.
1928 */
1929 wakeup = list_empty(&khugepaged_scan.mm_head);
1930 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1931 spin_unlock(&khugepaged_mm_lock);
1932
1933 atomic_inc(&mm->mm_count);
1934 if (wakeup)
1935 wake_up_interruptible(&khugepaged_wait);
1936
1937 return 0;
1938}
1939
1940int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
1941 unsigned long vm_flags)
1942{
1943 unsigned long hstart, hend;
1944 if (!vma->anon_vma)
1945 /*
1946 * Not yet faulted in so we will register later in the
1947 * page fault if needed.
1948 */
1949 return 0;
1950 if (vma->vm_ops || (vm_flags & VM_NO_THP))
1951 /* khugepaged not yet working on file or special mappings */
1952 return 0;
1953 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1954 hend = vma->vm_end & HPAGE_PMD_MASK;
1955 if (hstart < hend)
1956 return khugepaged_enter(vma, vm_flags);
1957 return 0;
1958}
1959
1960void __khugepaged_exit(struct mm_struct *mm)
1961{
1962 struct mm_slot *mm_slot;
1963 int free = 0;
1964
1965 spin_lock(&khugepaged_mm_lock);
1966 mm_slot = get_mm_slot(mm);
1967 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
1968 hash_del(&mm_slot->hash);
1969 list_del(&mm_slot->mm_node);
1970 free = 1;
1971 }
1972 spin_unlock(&khugepaged_mm_lock);
1973
1974 if (free) {
1975 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1976 free_mm_slot(mm_slot);
1977 mmdrop(mm);
1978 } else if (mm_slot) {
1979 /*
1980 * This is required to serialize against
1981 * khugepaged_test_exit() (which is guaranteed to run
1982 * under mmap sem read mode). Stop here (after we
1983 * return all pagetables will be destroyed) until
1984 * khugepaged has finished working on the pagetables
1985 * under the mmap_sem.
1986 */
1987 down_write(&mm->mmap_sem);
1988 up_write(&mm->mmap_sem);
1989 }
1990}
1991
1992static void release_pte_page(struct page *page)
1993{
1994 /* 0 stands for page_is_file_cache(page) == false */
1995 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
1996 unlock_page(page);
1997 putback_lru_page(page);
1998}
1999
2000static void release_pte_pages(pte_t *pte, pte_t *_pte)
2001{
2002 while (--_pte >= pte) {
2003 pte_t pteval = *_pte;
2004 if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
2005 release_pte_page(pte_page(pteval));
2006 }
2007}
2008
2009static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2010 unsigned long address,
2011 pte_t *pte)
2012{
2013 struct page *page = NULL;
2014 pte_t *_pte;
2015 int none_or_zero = 0, result = 0;
2016 bool referenced = false, writable = false;
2017
2018 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
2019 _pte++, address += PAGE_SIZE) {
2020 pte_t pteval = *_pte;
2021 if (pte_none(pteval) || (pte_present(pteval) &&
2022 is_zero_pfn(pte_pfn(pteval)))) {
2023 if (!userfaultfd_armed(vma) &&
2024 ++none_or_zero <= khugepaged_max_ptes_none) {
2025 continue;
2026 } else {
2027 result = SCAN_EXCEED_NONE_PTE;
2028 goto out;
2029 }
2030 }
2031 if (!pte_present(pteval)) {
2032 result = SCAN_PTE_NON_PRESENT;
2033 goto out;
2034 }
2035 page = vm_normal_page(vma, address, pteval);
2036 if (unlikely(!page)) {
2037 result = SCAN_PAGE_NULL;
2038 goto out;
2039 }
2040
2041 VM_BUG_ON_PAGE(PageCompound(page), page);
2042 VM_BUG_ON_PAGE(!PageAnon(page), page);
2043 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
2044
2045 /*
2046 * We can do it before isolate_lru_page because the
2047 * page can't be freed from under us. NOTE: PG_lock
2048 * is needed to serialize against split_huge_page
2049 * when invoked from the VM.
2050 */
2051 if (!trylock_page(page)) {
2052 result = SCAN_PAGE_LOCK;
2053 goto out;
2054 }
2055
2056 /*
2057 * cannot use mapcount: can't collapse if there's a gup pin.
2058 * The page must only be referenced by the scanned process
2059 * and page swap cache.
2060 */
2061 if (page_count(page) != 1 + !!PageSwapCache(page)) {
2062 unlock_page(page);
2063 result = SCAN_PAGE_COUNT;
2064 goto out;
2065 }
2066 if (pte_write(pteval)) {
2067 writable = true;
2068 } else {
2069 if (PageSwapCache(page) &&
2070 !reuse_swap_page(page, NULL)) {
2071 unlock_page(page);
2072 result = SCAN_SWAP_CACHE_PAGE;
2073 goto out;
2074 }
2075 /*
2076 * Page is not in the swap cache. It can be collapsed
2077 * into a THP.
2078 */
2079 }
2080
2081 /*
2082 * Isolate the page to avoid collapsing a hugepage
2083 * currently in use by the VM.
2084 */
2085 if (isolate_lru_page(page)) {
2086 unlock_page(page);
2087 result = SCAN_DEL_PAGE_LRU;
2088 goto out;
2089 }
2090 /* 0 stands for page_is_file_cache(page) == false */
2091 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
2092 VM_BUG_ON_PAGE(!PageLocked(page), page);
2093 VM_BUG_ON_PAGE(PageLRU(page), page);
2094
2095 /* If there is no mapped pte young don't collapse the page */
2096 if (pte_young(pteval) ||
2097 page_is_young(page) || PageReferenced(page) ||
2098 mmu_notifier_test_young(vma->vm_mm, address))
2099 referenced = true;
2100 }
2101 if (likely(writable)) {
2102 if (likely(referenced)) {
2103 result = SCAN_SUCCEED;
2104 trace_mm_collapse_huge_page_isolate(page, none_or_zero,
2105 referenced, writable, result);
2106 return 1;
2107 }
2108 } else {
2109 result = SCAN_PAGE_RO;
2110 }
2111
2112out:
2113 release_pte_pages(pte, _pte);
2114 trace_mm_collapse_huge_page_isolate(page, none_or_zero,
2115 referenced, writable, result);
2116 return 0;
2117}
2118
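The isolate loop above refuses to collapse a page whose refcount is anything other than 1 + !!PageSwapCache(page): one reference from this process's mapping plus, possibly, one from the swap cache; any extra reference is treated as a pin (for example from get_user_pages) that the collapse would break. The arithmetic in isolation, with made-up refcount values:

#include <stdbool.h>
#include <stdio.h>

/* One reference from the scanned process's page table, plus one if the page
 * sits in the swap cache. Anything beyond that is an unexpected pin. */
static bool safe_to_collapse(int page_count, bool in_swap_cache)
{
	int expected = 1 + (in_swap_cache ? 1 : 0);

	return page_count == expected;
}

int main(void)
{
	printf("count=1, no swap cache : %s\n", safe_to_collapse(1, false) ? "ok" : "bail");
	printf("count=2, in swap cache : %s\n", safe_to_collapse(2, true)  ? "ok" : "bail");
	printf("count=2, no swap cache : %s\n", safe_to_collapse(2, false) ? "ok" : "bail (gup pin?)");
	return 0;
}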
2119static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
2120 struct vm_area_struct *vma,
2121 unsigned long address,
2122 spinlock_t *ptl)
2123{
2124 pte_t *_pte;
2125 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
2126 pte_t pteval = *_pte;
2127 struct page *src_page;
2128
2129 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2130 clear_user_highpage(page, address);
2131 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
2132 if (is_zero_pfn(pte_pfn(pteval))) {
2133 /*
2134 * ptl mostly unnecessary.
2135 */
2136 spin_lock(ptl);
2137 /*
2138 * paravirt calls inside pte_clear here are
2139 * superfluous.
2140 */
2141 pte_clear(vma->vm_mm, address, _pte);
2142 spin_unlock(ptl);
2143 }
2144 } else {
2145 src_page = pte_page(pteval);
2146 copy_user_highpage(page, src_page, address, vma);
2147 VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
2148 release_pte_page(src_page);
2149 /*
2150 * ptl mostly unnecessary, but preempt has to
2151 * be disabled to update the per-cpu stats
2152 * inside page_remove_rmap().
2153 */
2154 spin_lock(ptl);
2155 /*
2156 * paravirt calls inside pte_clear here are
2157 * superfluous.
2158 */
2159 pte_clear(vma->vm_mm, address, _pte);
2160 page_remove_rmap(src_page, false);
2161 spin_unlock(ptl);
2162 free_page_and_swap_cache(src_page);
2163 }
2164
2165 address += PAGE_SIZE;
2166 page++;
2167 }
2168}
2169
2170static void khugepaged_alloc_sleep(void)
2171{
2172 DEFINE_WAIT(wait);
2173
2174 add_wait_queue(&khugepaged_wait, &wait);
2175 freezable_schedule_timeout_interruptible(
2176 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2177 remove_wait_queue(&khugepaged_wait, &wait);
2178}
2179
2180static int khugepaged_node_load[MAX_NUMNODES];
2181
2182static bool khugepaged_scan_abort(int nid)
2183{
2184 int i;
2185
2186 /*
2187 * If zone_reclaim_mode is disabled, then no extra effort is made to
2188 * allocate memory locally.
2189 */
2190 if (!zone_reclaim_mode)
2191 return false;
2192
2193 /* If there is a count for this node already, it must be acceptable */
2194 if (khugepaged_node_load[nid])
2195 return false;
2196
2197 for (i = 0; i < MAX_NUMNODES; i++) {
2198 if (!khugepaged_node_load[i])
2199 continue;
2200 if (node_distance(nid, i) > RECLAIM_DISTANCE)
2201 return true;
2202 }
2203 return false;
2204}
2205
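khugepaged_scan_abort() above only matters when zone_reclaim_mode is set: once some node already has hits recorded in khugepaged_node_load[], a page from a node farther than RECLAIM_DISTANCE from any counted node aborts the scan, so the eventual huge page is not allocated across a costly distance. A self-contained model with a tiny distance matrix; the matrix values and the threshold are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 3
#define RECLAIM_DISTANCE 30	/* same role as the kernel constant; value is illustrative */

static const int node_distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40 },
	{ 20, 10, 40 },
	{ 40, 40, 10 },
};

static int node_load[NR_NODES];	/* hits recorded so far, like khugepaged_node_load[] */

static bool scan_abort(int nid)
{
	if (node_load[nid])		/* node already accepted earlier in this scan */
		return false;

	for (int i = 0; i < NR_NODES; i++) {
		if (!node_load[i])
			continue;
		if (node_distance[nid][i] > RECLAIM_DISTANCE)
			return true;	/* too far from a node we already counted */
	}
	return false;
}

int main(void)
{
	node_load[0] = 5;		/* pretend node 0 already has hits */
	printf("node 1: %s\n", scan_abort(1) ? "abort" : "continue");	/* distance 20 */
	printf("node 2: %s\n", scan_abort(2) ? "abort" : "continue");	/* distance 40 */
	return 0;
}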
2206#ifdef CONFIG_NUMA
2207static int khugepaged_find_target_node(void)
2208{
2209 static int last_khugepaged_target_node = NUMA_NO_NODE;
2210 int nid, target_node = 0, max_value = 0;
2211
2212 /* find first node with max normal pages hit */
2213 for (nid = 0; nid < MAX_NUMNODES; nid++)
2214 if (khugepaged_node_load[nid] > max_value) {
2215 max_value = khugepaged_node_load[nid];
2216 target_node = nid;
2217 }
2218
2219 /* do some balance if several nodes have the same hit record */
2220 if (target_node <= last_khugepaged_target_node)
2221 for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
2222 nid++)
2223 if (max_value == khugepaged_node_load[nid]) {
2224 target_node = nid;
2225 break;
2226 }
2227
2228 last_khugepaged_target_node = target_node;
2229 return target_node;
2230}
2231
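khugepaged_find_target_node() above allocates the collapsed huge page on the node that contributed the most small pages, and uses the remembered last target to rotate among nodes that tie, so repeated collapses do not all land on the lowest-numbered node. A standalone sketch of that selection; the load array contents are example data.

#include <stdio.h>

#define NR_NODES 4

static int last_target = -1;	/* plays the role of last_khugepaged_target_node */

static int find_target_node(const int load[NR_NODES])
{
	int target = 0, max_value = 0;

	/* first node with the maximum hit count */
	for (int nid = 0; nid < NR_NODES; nid++) {
		if (load[nid] > max_value) {
			max_value = load[nid];
			target = nid;
		}
	}

	/* on a tie, prefer a node past the previous target to spread allocations */
	if (target <= last_target) {
		for (int nid = last_target + 1; nid < NR_NODES; nid++) {
			if (load[nid] == max_value) {
				target = nid;
				break;
			}
		}
	}

	last_target = target;
	return target;
}

int main(void)
{
	const int load[NR_NODES] = { 300, 300, 12, 300 };	/* three-way tie */

	printf("first pick : node %d\n", find_target_node(load));	/* 0 */
	printf("second pick: node %d\n", find_target_node(load));	/* 1 */
	printf("third pick : node %d\n", find_target_node(load));	/* 3 */
	return 0;
}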
2232static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2233{
2234 if (IS_ERR(*hpage)) {
2235 if (!*wait)
2236 return false;
2237
2238 *wait = false;
2239 *hpage = NULL;
2240 khugepaged_alloc_sleep();
2241 } else if (*hpage) {
2242 put_page(*hpage);
2243 *hpage = NULL;
2244 }
2245
2246 return true;
2247}
2248
2249static struct page *
2250khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2251 unsigned long address, int node)
2252{
2253 VM_BUG_ON_PAGE(*hpage, *hpage);
2254
2255 /*
2256 * Before allocating the hugepage, release the mmap_sem read lock.
2257 * The allocation can take potentially a long time if it involves
2258 * sync compaction, and we do not need to hold the mmap_sem during
2259 * that. We will recheck the vma after taking it again in write mode.
2260 */
2261 up_read(&mm->mmap_sem);
2262
2263 *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
2264 if (unlikely(!*hpage)) {
2265 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2266 *hpage = ERR_PTR(-ENOMEM);
2267 return NULL;
2268 }
2269
2270 prep_transhuge_page(*hpage);
2271 count_vm_event(THP_COLLAPSE_ALLOC);
2272 return *hpage;
2273}
2274#else
2275static int khugepaged_find_target_node(void)
2276{
2277 return 0;
2278}
2279
2280static inline struct page *alloc_khugepaged_hugepage(void)
2281{
2282 struct page *page;
2283
2284 page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
2285 HPAGE_PMD_ORDER);
2286 if (page)
2287 prep_transhuge_page(page);
2288 return page;
2289}
2290
2291static struct page *khugepaged_alloc_hugepage(bool *wait)
2292{
2293 struct page *hpage;
2294
2295 do {
2296 hpage = alloc_khugepaged_hugepage();
2297 if (!hpage) {
2298 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2299 if (!*wait)
2300 return NULL;
2301
2302 *wait = false;
2303 khugepaged_alloc_sleep();
2304 } else
2305 count_vm_event(THP_COLLAPSE_ALLOC);
2306 } while (unlikely(!hpage) && likely(khugepaged_enabled()));
2307
2308 return hpage;
2309}
2310
2311static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2312{
2313 if (!*hpage)
2314 *hpage = khugepaged_alloc_hugepage(wait);
2315
2316 if (unlikely(!*hpage))
2317 return false;
2318
2319 return true;
2320}
2321
2322static struct page *
2323khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2324 unsigned long address, int node)
2325{
2326 up_read(&mm->mmap_sem);
2327 VM_BUG_ON(!*hpage);
2328
2329 return *hpage;
2330}
2331#endif
2332
2333static bool hugepage_vma_check(struct vm_area_struct *vma)
2334{
2335 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
2336 (vma->vm_flags & VM_NOHUGEPAGE))
2337 return false;
2338 if (!vma->anon_vma || vma->vm_ops)
2339 return false;
2340 if (is_vma_temporary_stack(vma))
2341 return false;
2342 return !(vma->vm_flags & VM_NO_THP);
2343}
2344
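hugepage_vma_check() above is the gatekeeper deciding whether khugepaged may touch a VMA at all: it must be hinted (or THP set to "always"), not marked VM_NOHUGEPAGE, already-faulted anonymous memory with no vm_ops, not a temporary stack, and not in the VM_NO_THP class. A boolean model of that predicate, with the VMA reduced to a handful of flags; the fields are stand-ins, not the kernel's vm_area_struct.

#include <stdbool.h>
#include <stdio.h>

struct fake_vma {
	bool hinted_hugepage;	/* VM_HUGEPAGE set via MADV_HUGEPAGE */
	bool nohugepage;	/* VM_NOHUGEPAGE */
	bool has_anon_vma;	/* already faulted anonymous memory */
	bool has_vm_ops;	/* file or special mapping */
	bool temporary_stack;
	bool no_thp_class;	/* VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE */
};

static bool vma_eligible(const struct fake_vma *vma, bool thp_always)
{
	if ((!vma->hinted_hugepage && !thp_always) || vma->nohugepage)
		return false;
	if (!vma->has_anon_vma || vma->has_vm_ops)
		return false;
	if (vma->temporary_stack)
		return false;
	return !vma->no_thp_class;
}

int main(void)
{
	struct fake_vma anon = { .hinted_hugepage = true, .has_anon_vma = true };
	struct fake_vma file = { .hinted_hugepage = true, .has_anon_vma = true,
				 .has_vm_ops = true };

	printf("hinted anon VMA: %s\n", vma_eligible(&anon, false) ? "eligible" : "skip");
	printf("file-backed VMA: %s\n", vma_eligible(&file, false) ? "eligible" : "skip");
	return 0;
}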
2345static void collapse_huge_page(struct mm_struct *mm,
2346 unsigned long address,
2347 struct page **hpage,
2348 struct vm_area_struct *vma,
2349 int node)
2350{
2351 pmd_t *pmd, _pmd;
2352 pte_t *pte;
2353 pgtable_t pgtable;
2354 struct page *new_page;
2355 spinlock_t *pmd_ptl, *pte_ptl;
2356 int isolated = 0, result = 0;
2357 unsigned long hstart, hend;
2358 struct mem_cgroup *memcg;
2359 unsigned long mmun_start; /* For mmu_notifiers */
2360 unsigned long mmun_end; /* For mmu_notifiers */
2361 gfp_t gfp;
2362
2363 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2364
2365 /* Only allocate from the target node */
2366 gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE;
2367
2368 /* release the mmap_sem read lock. */
2369 new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
2370 if (!new_page) {
2371 result = SCAN_ALLOC_HUGE_PAGE_FAIL;
2372 goto out_nolock;
2373 }
2374
2375 if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
2376 result = SCAN_CGROUP_CHARGE_FAIL;
2377 goto out_nolock;
2378 }
2379
2380 /*
2381 * Prevent all access to pagetables with the exception of
2382 * gup_fast later handled by the ptep_clear_flush and the VM
2383 * handled by the anon_vma lock + PG_lock.
2384 */
2385 down_write(&mm->mmap_sem);
2386 if (unlikely(khugepaged_test_exit(mm))) {
2387 result = SCAN_ANY_PROCESS;
2388 goto out;
2389 }
2390
2391 vma = find_vma(mm, address);
2392 if (!vma) {
2393 result = SCAN_VMA_NULL;
2394 goto out;
2395 }
2396 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2397 hend = vma->vm_end & HPAGE_PMD_MASK;
2398 if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
2399 result = SCAN_ADDRESS_RANGE;
2400 goto out;
2401 }
2402 if (!hugepage_vma_check(vma)) {
2403 result = SCAN_VMA_CHECK;
2404 goto out;
2405 }
2406 pmd = mm_find_pmd(mm, address);
2407 if (!pmd) {
2408 result = SCAN_PMD_NULL;
2409 goto out;
2410 }
2411
2412 anon_vma_lock_write(vma->anon_vma);
2413
2414 pte = pte_offset_map(pmd, address);
2415 pte_ptl = pte_lockptr(mm, pmd);
2416
2417 mmun_start = address;
2418 mmun_end = address + HPAGE_PMD_SIZE;
2419 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2420 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
2421 /*
2422 * After this gup_fast can't run anymore. This also removes
2423 * any huge TLB entry from the CPU so we won't allow
2424 * huge and small TLB entries for the same virtual address
2425 * to avoid the risk of CPU bugs in that area.
2426 */
2427 _pmd = pmdp_collapse_flush(vma, address, pmd);
2428 spin_unlock(pmd_ptl);
2429 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2430
2431 spin_lock(pte_ptl);
2432 isolated = __collapse_huge_page_isolate(vma, address, pte);
2433 spin_unlock(pte_ptl);
2434
2435 if (unlikely(!isolated)) {
2436 pte_unmap(pte);
2437 spin_lock(pmd_ptl);
2438 BUG_ON(!pmd_none(*pmd));
2439 /*
2440 * We can only use set_pmd_at when establishing
2441 * hugepmds and never for establishing regular pmds that
2442 * points to regular pagetables. Use pmd_populate for that
2443 */
2444 pmd_populate(mm, pmd, pmd_pgtable(_pmd));
2445 spin_unlock(pmd_ptl);
2446 anon_vma_unlock_write(vma->anon_vma);
2447 result = SCAN_FAIL;
2448 goto out;
2449 }
2450
2451 /*
2452 * All pages are isolated and locked so anon_vma rmap
2453 * can't run anymore.
2454 */
2455 anon_vma_unlock_write(vma->anon_vma);
2456
2457 __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
2458 pte_unmap(pte);
2459 __SetPageUptodate(new_page);
2460 pgtable = pmd_pgtable(_pmd);
2461
2462 _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
2463 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
2464
2465 /*
2466 * spin_lock() below is not the equivalent of smp_wmb(), so
2467 * this is needed to avoid the copy_huge_page writes to become
2468 * visible after the set_pmd_at() write.
2469 */
2470 smp_wmb();
2471
2472 spin_lock(pmd_ptl);
2473 BUG_ON(!pmd_none(*pmd));
2474 page_add_new_anon_rmap(new_page, vma, address, true);
2475 mem_cgroup_commit_charge(new_page, memcg, false, true);
2476 lru_cache_add_active_or_unevictable(new_page, vma);
2477 pgtable_trans_huge_deposit(mm, pmd, pgtable);
2478 set_pmd_at(mm, address, pmd, _pmd);
2479 update_mmu_cache_pmd(vma, address, pmd);
2480 spin_unlock(pmd_ptl);
2481
2482 *hpage = NULL;
2483
2484 khugepaged_pages_collapsed++;
2485 result = SCAN_SUCCEED;
2486out_up_write:
2487 up_write(&mm->mmap_sem);
2488 trace_mm_collapse_huge_page(mm, isolated, result);
2489 return;
2490
2491out_nolock:
2492 trace_mm_collapse_huge_page(mm, isolated, result);
2493 return;
2494out:
2495 mem_cgroup_cancel_charge(new_page, memcg, true);
2496 goto out_up_write;
2497}
2498
2499static int khugepaged_scan_pmd(struct mm_struct *mm,
2500 struct vm_area_struct *vma,
2501 unsigned long address,
2502 struct page **hpage)
2503{
2504 pmd_t *pmd;
2505 pte_t *pte, *_pte;
2506 int ret = 0, none_or_zero = 0, result = 0;
2507 struct page *page = NULL;
2508 unsigned long _address;
2509 spinlock_t *ptl;
2510 int node = NUMA_NO_NODE;
2511 bool writable = false, referenced = false;
2512
2513 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2514
2515 pmd = mm_find_pmd(mm, address);
2516 if (!pmd) {
2517 result = SCAN_PMD_NULL;
2518 goto out;
2519 }
2520
2521 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
2522 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2523 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
2524 _pte++, _address += PAGE_SIZE) {
2525 pte_t pteval = *_pte;
2526 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2527 if (!userfaultfd_armed(vma) &&
2528 ++none_or_zero <= khugepaged_max_ptes_none) {
2529 continue;
2530 } else {
2531 result = SCAN_EXCEED_NONE_PTE;
2532 goto out_unmap;
2533 }
2534 }
2535 if (!pte_present(pteval)) {
2536 result = SCAN_PTE_NON_PRESENT;
2537 goto out_unmap;
2538 }
2539 if (pte_write(pteval))
2540 writable = true;
2541
2542 page = vm_normal_page(vma, _address, pteval);
2543 if (unlikely(!page)) {
2544 result = SCAN_PAGE_NULL;
2545 goto out_unmap;
2546 }
2547
2548 /* TODO: teach khugepaged to collapse THP mapped with pte */
2549 if (PageCompound(page)) {
2550 result = SCAN_PAGE_COMPOUND;
2551 goto out_unmap;
2552 }
2553
2554 /*
2555 * Record which node the original page is from and save this
2556 * information to khugepaged_node_load[].
2557 * Khugepaged will allocate the hugepage from the node that has the max
2558 * hit record.
2559 */
2560 node = page_to_nid(page);
2561 if (khugepaged_scan_abort(node)) {
2562 result = SCAN_SCAN_ABORT;
2563 goto out_unmap;
2564 }
2565 khugepaged_node_load[node]++;
2566 if (!PageLRU(page)) {
2567 result = SCAN_PAGE_LRU;
2568 goto out_unmap;
2569 }
2570 if (PageLocked(page)) {
2571 result = SCAN_PAGE_LOCK;
2572 goto out_unmap;
2573 }
2574 if (!PageAnon(page)) {
2575 result = SCAN_PAGE_ANON;
2576 goto out_unmap;
2577 }
2578
2579 /*
2580 * cannot use mapcount: can't collapse if there's a gup pin.
2581 * The page must only be referenced by the scanned process
2582 * and page swap cache.
2583 */
2584 if (page_count(page) != 1 + !!PageSwapCache(page)) {
2585 result = SCAN_PAGE_COUNT;
2586 goto out_unmap;
2587 }
2588 if (pte_young(pteval) ||
2589 page_is_young(page) || PageReferenced(page) ||
2590 mmu_notifier_test_young(vma->vm_mm, address))
2591 referenced = true;
2592 }
2593 if (writable) {
2594 if (referenced) {
2595 result = SCAN_SUCCEED;
2596 ret = 1;
2597 } else {
2598 result = SCAN_NO_REFERENCED_PAGE;
2599 }
2600 } else {
2601 result = SCAN_PAGE_RO;
2602 }
2603out_unmap:
2604 pte_unmap_unlock(pte, ptl);
2605 if (ret) {
2606 node = khugepaged_find_target_node();
2607 /* collapse_huge_page will return with the mmap_sem released */
2608 collapse_huge_page(mm, address, hpage, vma, node);
2609 }
2610out:
2611 trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
2612 none_or_zero, result);
2613 return ret;
2614}
2615
2616static void collect_mm_slot(struct mm_slot *mm_slot)
2617{
2618 struct mm_struct *mm = mm_slot->mm;
2619
2620 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2621
2622 if (khugepaged_test_exit(mm)) {
2623 /* free mm_slot */
2624 hash_del(&mm_slot->hash);
2625 list_del(&mm_slot->mm_node);
2626
2627 /*
2628 * Not strictly needed because the mm exited already.
2629 *
2630 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
2631 */
2632
2633 /* khugepaged_mm_lock actually not necessary for the below */
2634 free_mm_slot(mm_slot);
2635 mmdrop(mm);
2636 }
2637}
2638
2639static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2640 struct page **hpage)
2641 __releases(&khugepaged_mm_lock)
2642 __acquires(&khugepaged_mm_lock)
2643{
2644 struct mm_slot *mm_slot;
2645 struct mm_struct *mm;
2646 struct vm_area_struct *vma;
2647 int progress = 0;
2648
2649 VM_BUG_ON(!pages);
2650 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
2651
2652 if (khugepaged_scan.mm_slot)
2653 mm_slot = khugepaged_scan.mm_slot;
2654 else {
2655 mm_slot = list_entry(khugepaged_scan.mm_head.next,
2656 struct mm_slot, mm_node);
2657 khugepaged_scan.address = 0;
2658 khugepaged_scan.mm_slot = mm_slot;
2659 }
2660 spin_unlock(&khugepaged_mm_lock);
2661
2662 mm = mm_slot->mm;
2663 down_read(&mm->mmap_sem);
2664 if (unlikely(khugepaged_test_exit(mm)))
2665 vma = NULL;
2666 else
2667 vma = find_vma(mm, khugepaged_scan.address);
2668
2669 progress++;
2670 for (; vma; vma = vma->vm_next) {
2671 unsigned long hstart, hend;
2672
2673 cond_resched();
2674 if (unlikely(khugepaged_test_exit(mm))) {
2675 progress++;
2676 break;
2677 }
2678 if (!hugepage_vma_check(vma)) {
2679skip:
2680 progress++;
2681 continue;
2682 }
2683 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2684 hend = vma->vm_end & HPAGE_PMD_MASK;
2685 if (hstart >= hend)
2686 goto skip;
2687 if (khugepaged_scan.address > hend)
2688 goto skip;
2689 if (khugepaged_scan.address < hstart)
2690 khugepaged_scan.address = hstart;
2691 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2692
2693 while (khugepaged_scan.address < hend) {
2694 int ret;
2695 cond_resched();
2696 if (unlikely(khugepaged_test_exit(mm)))
2697 goto breakouterloop;
2698
2699 VM_BUG_ON(khugepaged_scan.address < hstart ||
2700 khugepaged_scan.address + HPAGE_PMD_SIZE >
2701 hend);
2702 ret = khugepaged_scan_pmd(mm, vma,
2703 khugepaged_scan.address,
2704 hpage);
2705 /* move to next address */
2706 khugepaged_scan.address += HPAGE_PMD_SIZE;
2707 progress += HPAGE_PMD_NR;
2708 if (ret)
2709 /* we released mmap_sem so break loop */
2710 goto breakouterloop_mmap_sem;
2711 if (progress >= pages)
2712 goto breakouterloop;
2713 }
2714 }
2715breakouterloop:
2716 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2717breakouterloop_mmap_sem:
2718
2719 spin_lock(&khugepaged_mm_lock);
2720 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2721 /*
2722 * Release the current mm_slot if this mm is about to die, or
2723 * if we scanned all vmas of this mm.
2724 */
2725 if (khugepaged_test_exit(mm) || !vma) {
2726 /*
2727 * Make sure that if mm_users is reaching zero while
2728 * khugepaged runs here, khugepaged_exit will find
2729 * mm_slot not pointing to the exiting mm.
2730 */
2731 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2732 khugepaged_scan.mm_slot = list_entry(
2733 mm_slot->mm_node.next,
2734 struct mm_slot, mm_node);
2735 khugepaged_scan.address = 0;
2736 } else {
2737 khugepaged_scan.mm_slot = NULL;
2738 khugepaged_full_scans++;
2739 }
2740
2741 collect_mm_slot(mm_slot);
2742 }
2743
2744 return progress;
2745}
2746
2747static int khugepaged_has_work(void)
2748{
2749 return !list_empty(&khugepaged_scan.mm_head) &&
2750 khugepaged_enabled();
2751}
2752
2753static int khugepaged_wait_event(void)
2754{
2755 return !list_empty(&khugepaged_scan.mm_head) ||
2756 kthread_should_stop();
2757}
2758
2759static void khugepaged_do_scan(void)
2760{
2761 struct page *hpage = NULL;
2762 unsigned int progress = 0, pass_through_head = 0;
2763 unsigned int pages = khugepaged_pages_to_scan;
2764 bool wait = true;
2765
2766 barrier(); /* write khugepaged_pages_to_scan to local stack */
2767
2768 while (progress < pages) {
2769 if (!khugepaged_prealloc_page(&hpage, &wait))
2770 break;
2771
2772 cond_resched();
2773
2774 if (unlikely(kthread_should_stop() || try_to_freeze()))
2775 break;
2776
2777 spin_lock(&khugepaged_mm_lock);
2778 if (!khugepaged_scan.mm_slot)
2779 pass_through_head++;
2780 if (khugepaged_has_work() &&
2781 pass_through_head < 2)
2782 progress += khugepaged_scan_mm_slot(pages - progress,
2783 &hpage);
2784 else
2785 progress = pages;
2786 spin_unlock(&khugepaged_mm_lock);
2787 }
2788
2789 if (!IS_ERR_OR_NULL(hpage))
2790 put_page(hpage);
2791}
2792
2793static bool khugepaged_should_wakeup(void)
2794{
2795 return kthread_should_stop() ||
2796 time_after_eq(jiffies, khugepaged_sleep_expire);
2797}
2798
2799static void khugepaged_wait_work(void)
2800{
2801 if (khugepaged_has_work()) {
2802 const unsigned long scan_sleep_jiffies =
2803 msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
2804
2805 if (!scan_sleep_jiffies)
2806 return;
2807
2808 khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
2809 wait_event_freezable_timeout(khugepaged_wait,
2810 khugepaged_should_wakeup(),
2811 scan_sleep_jiffies);
2812 return;
2813 }
2814
2815 if (khugepaged_enabled())
2816 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2817}
2818
2819static int khugepaged(void *none)
2820{
2821 struct mm_slot *mm_slot;
2822
2823 set_freezable();
2824 set_user_nice(current, MAX_NICE);
2825
2826 while (!kthread_should_stop()) {
2827 khugepaged_do_scan();
2828 khugepaged_wait_work();
2829 }
2830
2831 spin_lock(&khugepaged_mm_lock);
2832 mm_slot = khugepaged_scan.mm_slot;
2833 khugepaged_scan.mm_slot = NULL;
2834 if (mm_slot)
2835 collect_mm_slot(mm_slot);
2836 spin_unlock(&khugepaged_mm_lock);
2837 return 0;
2838}
2839
2840static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, 1476static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2841 unsigned long haddr, pmd_t *pmd) 1477 unsigned long haddr, pmd_t *pmd)
2842{ 1478{
@@ -2883,10 +1519,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2883 1519
2884 count_vm_event(THP_SPLIT_PMD); 1520 count_vm_event(THP_SPLIT_PMD);
2885 1521
2886 if (vma_is_dax(vma)) { 1522 if (!vma_is_anonymous(vma)) {
2887 pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); 1523 _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
2888 if (is_huge_zero_pmd(_pmd)) 1524 if (is_huge_zero_pmd(_pmd))
2889 put_huge_zero_page(); 1525 put_huge_zero_page();
1526 if (vma_is_dax(vma))
1527 return;
1528 page = pmd_page(_pmd);
1529 if (!PageReferenced(page) && pmd_young(_pmd))
1530 SetPageReferenced(page);
1531 page_remove_rmap(page, true);
1532 put_page(page);
1533 add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
2890 return; 1534 return;
2891 } else if (is_huge_zero_pmd(*pmd)) { 1535 } else if (is_huge_zero_pmd(*pmd)) {
2892 return __split_huge_zero_page_pmd(vma, haddr, pmd); 1536 return __split_huge_zero_page_pmd(vma, haddr, pmd);
@@ -2942,7 +1586,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2942 1586
2943 if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { 1587 if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
2944 /* Last compound_mapcount is gone. */ 1588 /* Last compound_mapcount is gone. */
2945 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1589 __dec_zone_page_state(page, NR_ANON_THPS);
2946 if (TestClearPageDoubleMap(page)) { 1590 if (TestClearPageDoubleMap(page)) {
2947 /* No need in mapcount reference anymore */ 1591 /* No need in mapcount reference anymore */
2948 for (i = 0; i < HPAGE_PMD_NR; i++) 1592 for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -3076,12 +1720,15 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
3076 1720
3077static void freeze_page(struct page *page) 1721static void freeze_page(struct page *page)
3078{ 1722{
3079 enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | 1723 enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
3080 TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED; 1724 TTU_RMAP_LOCKED;
3081 int i, ret; 1725 int i, ret;
3082 1726
3083 VM_BUG_ON_PAGE(!PageHead(page), page); 1727 VM_BUG_ON_PAGE(!PageHead(page), page);
3084 1728
1729 if (PageAnon(page))
1730 ttu_flags |= TTU_MIGRATION;
1731
3085 /* We only need TTU_SPLIT_HUGE_PMD once */ 1732 /* We only need TTU_SPLIT_HUGE_PMD once */
3086 ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); 1733 ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD);
3087 for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { 1734 for (i = 1; !ret && i < HPAGE_PMD_NR; i++) {
@@ -3091,7 +1738,7 @@ static void freeze_page(struct page *page)
3091 1738
3092 ret = try_to_unmap(page + i, ttu_flags); 1739 ret = try_to_unmap(page + i, ttu_flags);
3093 } 1740 }
3094 VM_BUG_ON(ret); 1741 VM_BUG_ON_PAGE(ret, page + i - 1);
3095} 1742}
3096 1743
3097static void unfreeze_page(struct page *page) 1744static void unfreeze_page(struct page *page)
@@ -3113,15 +1760,20 @@ static void __split_huge_page_tail(struct page *head, int tail,
3113 /* 1760 /*
3114 * tail_page->_refcount is zero and not changing from under us. But 1761 * tail_page->_refcount is zero and not changing from under us. But
3115 * get_page_unless_zero() may be running from under us on the 1762 * get_page_unless_zero() may be running from under us on the
3116 * tail_page. If we used atomic_set() below instead of atomic_inc(), we 1763 * tail_page. If we used atomic_set() below instead of atomic_inc() or
3117 * would then run atomic_set() concurrently with 1764 * atomic_add(), we would then run atomic_set() concurrently with
3118 * get_page_unless_zero(), and atomic_set() is implemented in C not 1765 * get_page_unless_zero(), and atomic_set() is implemented in C not
3119 * using locked ops. spin_unlock on x86 sometime uses locked ops 1766 * using locked ops. spin_unlock on x86 sometime uses locked ops
3120 * because of PPro errata 66, 92, so unless somebody can guarantee 1767 * because of PPro errata 66, 92, so unless somebody can guarantee
3121 * atomic_set() here would be safe on all archs (and not only on x86), 1768 * atomic_set() here would be safe on all archs (and not only on x86),
3122 * it's safer to use atomic_inc(). 1769 * it's safer to use atomic_inc()/atomic_add().
3123 */ 1770 */
3124 page_ref_inc(page_tail); 1771 if (PageAnon(head)) {
1772 page_ref_inc(page_tail);
1773 } else {
1774 /* Additional pin to radix tree */
1775 page_ref_add(page_tail, 2);
1776 }
3125 1777
3126 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1778 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
3127 page_tail->flags |= (head->flags & 1779 page_tail->flags |= (head->flags &
@@ -3157,25 +1809,46 @@ static void __split_huge_page_tail(struct page *head, int tail,
3157 lru_add_page_tail(head, page_tail, lruvec, list); 1809 lru_add_page_tail(head, page_tail, lruvec, list);
3158} 1810}
3159 1811
3160static void __split_huge_page(struct page *page, struct list_head *list) 1812static void __split_huge_page(struct page *page, struct list_head *list,
1813 unsigned long flags)
3161{ 1814{
3162 struct page *head = compound_head(page); 1815 struct page *head = compound_head(page);
3163 struct zone *zone = page_zone(head); 1816 struct zone *zone = page_zone(head);
3164 struct lruvec *lruvec; 1817 struct lruvec *lruvec;
1818 pgoff_t end = -1;
3165 int i; 1819 int i;
3166 1820
3167 /* prevent PageLRU to go away from under us, and freeze lru stats */
3168 spin_lock_irq(&zone->lru_lock);
3169 lruvec = mem_cgroup_page_lruvec(head, zone); 1821 lruvec = mem_cgroup_page_lruvec(head, zone);
3170 1822
3171 /* complete memcg works before add pages to LRU */ 1823 /* complete memcg works before add pages to LRU */
3172 mem_cgroup_split_huge_fixup(head); 1824 mem_cgroup_split_huge_fixup(head);
3173 1825
3174 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) 1826 if (!PageAnon(page))
1827 end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE);
1828
1829 for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
3175 __split_huge_page_tail(head, i, lruvec, list); 1830 __split_huge_page_tail(head, i, lruvec, list);
1831 /* Some pages can be beyond i_size: drop them from page cache */
1832 if (head[i].index >= end) {
1833 __ClearPageDirty(head + i);
1834 __delete_from_page_cache(head + i, NULL);
1835 if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
1836 shmem_uncharge(head->mapping->host, 1);
1837 put_page(head + i);
1838 }
1839 }
3176 1840
3177 ClearPageCompound(head); 1841 ClearPageCompound(head);
3178 spin_unlock_irq(&zone->lru_lock); 1842 /* See comment in __split_huge_page_tail() */
1843 if (PageAnon(head)) {
1844 page_ref_inc(head);
1845 } else {
1846 /* Additional pin to radix tree */
1847 page_ref_add(head, 2);
1848 spin_unlock(&head->mapping->tree_lock);
1849 }
1850
1851 spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
3179 1852
3180 unfreeze_page(head); 1853 unfreeze_page(head);
3181 1854
@@ -3198,18 +1871,22 @@ static void __split_huge_page(struct page *page, struct list_head *list)
3198 1871
3199int total_mapcount(struct page *page) 1872int total_mapcount(struct page *page)
3200{ 1873{
3201 int i, ret; 1874 int i, compound, ret;
3202 1875
3203 VM_BUG_ON_PAGE(PageTail(page), page); 1876 VM_BUG_ON_PAGE(PageTail(page), page);
3204 1877
3205 if (likely(!PageCompound(page))) 1878 if (likely(!PageCompound(page)))
3206 return atomic_read(&page->_mapcount) + 1; 1879 return atomic_read(&page->_mapcount) + 1;
3207 1880
3208 ret = compound_mapcount(page); 1881 compound = compound_mapcount(page);
3209 if (PageHuge(page)) 1882 if (PageHuge(page))
3210 return ret; 1883 return compound;
1884 ret = compound;
3211 for (i = 0; i < HPAGE_PMD_NR; i++) 1885 for (i = 0; i < HPAGE_PMD_NR; i++)
3212 ret += atomic_read(&page[i]._mapcount) + 1; 1886 ret += atomic_read(&page[i]._mapcount) + 1;
1887 /* File pages have compound_mapcount included in _mapcount */
1888 if (!PageAnon(page))
1889 return ret - compound * HPAGE_PMD_NR;
3213 if (PageDoubleMap(page)) 1890 if (PageDoubleMap(page))
3214 ret -= HPAGE_PMD_NR; 1891 ret -= HPAGE_PMD_NR;
3215 return ret; 1892 return ret;
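The total_mapcount() change above encodes the new accounting rule: for file THPs every subpage's _mapcount already includes the compound (pmd) mapping, so the sum over the 512 subpages double-counts it and compound * HPAGE_PMD_NR has to be subtracted back out; anon THPs instead rely on PageDoubleMap() to signal one extra reference per subpage. A worked version of that arithmetic with sample numbers, showing that both conventions report a total of 1 for a THP mapped once:

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_NR 512

/* compound: how many times the whole THP is mapped by a pmd.
 * per_page_sum: sum over all subpages of (_mapcount + 1).
 * For file THPs each subpage's count already includes the pmd mappings. */
static int total_mapcount(bool anon, bool double_map, int compound, int per_page_sum)
{
	int ret = compound + per_page_sum;

	if (!anon)
		return ret - compound * HPAGE_PMD_NR;
	if (double_map)
		ret -= HPAGE_PMD_NR;
	return ret;
}

int main(void)
{
	/* file THP mapped once by a pmd, no pte maps: each subpage reports 1 */
	printf("file THP, pmd-only : %d\n",
	       total_mapcount(false, false, 1, HPAGE_PMD_NR));	/* -> 1 */

	/* anon THP mapped only by a pmd: subpages contribute nothing */
	printf("anon THP, pmd-only : %d\n",
	       total_mapcount(true, false, 1, 0));		/* -> 1 */
	return 0;
}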
@@ -3296,36 +1973,54 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
3296{ 1973{
3297 struct page *head = compound_head(page); 1974 struct page *head = compound_head(page);
3298 struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); 1975 struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
3299 struct anon_vma *anon_vma; 1976 struct anon_vma *anon_vma = NULL;
3300 int count, mapcount, ret; 1977 struct address_space *mapping = NULL;
1978 int count, mapcount, extra_pins, ret;
3301 bool mlocked; 1979 bool mlocked;
3302 unsigned long flags; 1980 unsigned long flags;
3303 1981
3304 VM_BUG_ON_PAGE(is_huge_zero_page(page), page); 1982 VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
3305 VM_BUG_ON_PAGE(!PageAnon(page), page);
3306 VM_BUG_ON_PAGE(!PageLocked(page), page); 1983 VM_BUG_ON_PAGE(!PageLocked(page), page);
3307 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 1984 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
3308 VM_BUG_ON_PAGE(!PageCompound(page), page); 1985 VM_BUG_ON_PAGE(!PageCompound(page), page);
3309 1986
3310 /* 1987 if (PageAnon(head)) {
3311 * The caller does not necessarily hold an mmap_sem that would prevent 1988 /*
3312 * the anon_vma disappearing so we first take a reference to it 1989 * prevent the anon_vma disappearing so we first take a
3313 * and then lock the anon_vma for write. This is similar to 1990 * prevent the anon_vma disappearing so we first we take a
3314 * page_lock_anon_vma_read except the write lock is taken to serialise 1991 * reference to it and then lock the anon_vma for write. This
3315 * against parallel split or collapse operations. 1992 * is similar to page_lock_anon_vma_read except the write lock
3316 */ 1993 * is taken to serialise against parallel split or collapse
3317 anon_vma = page_get_anon_vma(head); 1994 * operations.
3318 if (!anon_vma) { 1995 */
3319 ret = -EBUSY; 1996 anon_vma = page_get_anon_vma(head);
3320 goto out; 1997 if (!anon_vma) {
1998 ret = -EBUSY;
1999 goto out;
2000 }
2001 extra_pins = 0;
2002 mapping = NULL;
2003 anon_vma_lock_write(anon_vma);
2004 } else {
2005 mapping = head->mapping;
2006
2007 /* Truncated ? */
2008 if (!mapping) {
2009 ret = -EBUSY;
2010 goto out;
2011 }
2012
2013 /* Additional pins from radix tree */
2014 extra_pins = HPAGE_PMD_NR;
2015 anon_vma = NULL;
2016 i_mmap_lock_read(mapping);
3321 } 2017 }
3322 anon_vma_lock_write(anon_vma);
3323 2018
3324 /* 2019 /*
3325 * Racy check if we can split the page, before freeze_page() will 2020 * Racy check if we can split the page, before freeze_page() will
3326 * split PMDs 2021 * split PMDs
3327 */ 2022 */
3328 if (total_mapcount(head) != page_count(head) - 1) { 2023 if (total_mapcount(head) != page_count(head) - extra_pins - 1) {
3329 ret = -EBUSY; 2024 ret = -EBUSY;
3330 goto out_unlock; 2025 goto out_unlock;
3331 } 2026 }
@@ -3338,35 +2033,62 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
3338 if (mlocked) 2033 if (mlocked)
3339 lru_add_drain(); 2034 lru_add_drain();
3340 2035
2036 /* prevent PageLRU to go away from under us, and freeze lru stats */
2037 spin_lock_irqsave(&page_zone(head)->lru_lock, flags);
2038
2039 if (mapping) {
2040 void **pslot;
2041
2042 spin_lock(&mapping->tree_lock);
2043 pslot = radix_tree_lookup_slot(&mapping->page_tree,
2044 page_index(head));
2045 /*
2046 * Check if the head page is present in radix tree.
2047 * We assume all tail are present too, if head is there.
2048 */
2049 if (radix_tree_deref_slot_protected(pslot,
2050 &mapping->tree_lock) != head)
2051 goto fail;
2052 }
2053
3341 /* Prevent deferred_split_scan() touching ->_refcount */ 2054 /* Prevent deferred_split_scan() touching ->_refcount */
3342 spin_lock_irqsave(&pgdata->split_queue_lock, flags); 2055 spin_lock(&pgdata->split_queue_lock);
3343 count = page_count(head); 2056 count = page_count(head);
3344 mapcount = total_mapcount(head); 2057 mapcount = total_mapcount(head);
3345 if (!mapcount && count == 1) { 2058 if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
3346 if (!list_empty(page_deferred_list(head))) { 2059 if (!list_empty(page_deferred_list(head))) {
3347 pgdata->split_queue_len--; 2060 pgdata->split_queue_len--;
3348 list_del(page_deferred_list(head)); 2061 list_del(page_deferred_list(head));
3349 } 2062 }
3350 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2063 if (mapping)
3351 __split_huge_page(page, list); 2064 __dec_zone_page_state(page, NR_SHMEM_THPS);
2065 spin_unlock(&pgdata->split_queue_lock);
2066 __split_huge_page(page, list, flags);
3352 ret = 0; 2067 ret = 0;
3353 } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
3354 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
3355 pr_alert("total_mapcount: %u, page_count(): %u\n",
3356 mapcount, count);
3357 if (PageTail(page))
3358 dump_page(head, NULL);
3359 dump_page(page, "total_mapcount(head) > 0");
3360 BUG();
3361 } else { 2068 } else {
3362 spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); 2069 if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
2070 pr_alert("total_mapcount: %u, page_count(): %u\n",
2071 mapcount, count);
2072 if (PageTail(page))
2073 dump_page(head, NULL);
2074 dump_page(page, "total_mapcount(head) > 0");
2075 BUG();
2076 }
2077 spin_unlock(&pgdata->split_queue_lock);
2078fail: if (mapping)
2079 spin_unlock(&mapping->tree_lock);
2080 spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
3363 unfreeze_page(head); 2081 unfreeze_page(head);
3364 ret = -EBUSY; 2082 ret = -EBUSY;
3365 } 2083 }
3366 2084
3367out_unlock: 2085out_unlock:
3368 anon_vma_unlock_write(anon_vma); 2086 if (anon_vma) {
3369 put_anon_vma(anon_vma); 2087 anon_vma_unlock_write(anon_vma);
2088 put_anon_vma(anon_vma);
2089 }
2090 if (mapping)
2091 i_mmap_unlock_read(mapping);
3370out: 2092out:
3371 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); 2093 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
3372 return ret; 2094 return ret;
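In the split path above, extra_pins captures how many page references are expected besides the mappings and the caller's own pin: zero for an anon THP, HPAGE_PMD_NR for a file THP because the radix tree holds one reference per subpage slot. The split only proceeds when nothing maps the page and the refcount can be frozen at exactly 1 + extra_pins, mirroring the page_ref_freeze(head, 1 + extra_pins) test. The expected-count check as a standalone sketch; the values are examples, not kernel data.

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_NR 512

/* A THP can be split only when nothing maps it and the only references left
 * are the caller's pin plus the expected "extra" pins (radix tree slots for
 * file pages, none for anon). Plain integers stand in for struct page state. */
static bool can_split(bool file_backed, int total_mapcount, int page_count)
{
	int extra_pins = file_backed ? HPAGE_PMD_NR : 0;

	if (total_mapcount)		/* still mapped somewhere: freeze would fail */
		return false;
	return page_count == 1 + extra_pins;
}

int main(void)
{
	printf("anon, unmapped, count 1  : %s\n", can_split(false, 0, 1) ? "split" : "busy");
	printf("anon, unmapped, count 2  : %s\n", can_split(false, 0, 2) ? "split" : "busy");
	printf("file, unmapped, count 513: %s\n",
	       can_split(true, 0, 1 + HPAGE_PMD_NR) ? "split" : "busy");
	return 0;
}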
@@ -3489,8 +2211,7 @@ static int split_huge_pages_set(void *data, u64 val)
3489 if (zone != page_zone(page)) 2211 if (zone != page_zone(page))
3490 goto next; 2212 goto next;
3491 2213
3492 if (!PageHead(page) || !PageAnon(page) || 2214 if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
3493 PageHuge(page))
3494 goto next; 2215 goto next;
3495 2216
3496 total++; 2217 total++;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cc2a99e9cbc8..abc1c5fb7222 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3179,7 +3179,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
3179 unsigned long start, unsigned long end, 3179 unsigned long start, unsigned long end,
3180 struct page *ref_page) 3180 struct page *ref_page)
3181{ 3181{
3182 int force_flush = 0;
3183 struct mm_struct *mm = vma->vm_mm; 3182 struct mm_struct *mm = vma->vm_mm;
3184 unsigned long address; 3183 unsigned long address;
3185 pte_t *ptep; 3184 pte_t *ptep;
@@ -3198,19 +3197,22 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
3198 tlb_start_vma(tlb, vma); 3197 tlb_start_vma(tlb, vma);
3199 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 3198 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
3200 address = start; 3199 address = start;
3201again:
3202 for (; address < end; address += sz) { 3200 for (; address < end; address += sz) {
3203 ptep = huge_pte_offset(mm, address); 3201 ptep = huge_pte_offset(mm, address);
3204 if (!ptep) 3202 if (!ptep)
3205 continue; 3203 continue;
3206 3204
3207 ptl = huge_pte_lock(h, mm, ptep); 3205 ptl = huge_pte_lock(h, mm, ptep);
3208 if (huge_pmd_unshare(mm, &address, ptep)) 3206 if (huge_pmd_unshare(mm, &address, ptep)) {
3209 goto unlock; 3207 spin_unlock(ptl);
3208 continue;
3209 }
3210 3210
3211 pte = huge_ptep_get(ptep); 3211 pte = huge_ptep_get(ptep);
3212 if (huge_pte_none(pte)) 3212 if (huge_pte_none(pte)) {
3213 goto unlock; 3213 spin_unlock(ptl);
3214 continue;
3215 }
3214 3216
3215 /* 3217 /*
3216 * Migrating hugepage or HWPoisoned hugepage is already 3218 * Migrating hugepage or HWPoisoned hugepage is already
@@ -3218,7 +3220,8 @@ again:
3218 */ 3220 */
3219 if (unlikely(!pte_present(pte))) { 3221 if (unlikely(!pte_present(pte))) {
3220 huge_pte_clear(mm, address, ptep); 3222 huge_pte_clear(mm, address, ptep);
3221 goto unlock; 3223 spin_unlock(ptl);
3224 continue;
3222 } 3225 }
3223 3226
3224 page = pte_page(pte); 3227 page = pte_page(pte);
@@ -3228,9 +3231,10 @@ again:
3228 * are about to unmap is the actual page of interest. 3231 * are about to unmap is the actual page of interest.
3229 */ 3232 */
3230 if (ref_page) { 3233 if (ref_page) {
3231 if (page != ref_page) 3234 if (page != ref_page) {
3232 goto unlock; 3235 spin_unlock(ptl);
3233 3236 continue;
3237 }
3234 /* 3238 /*
3235 * Mark the VMA as having unmapped its page so that 3239 * Mark the VMA as having unmapped its page so that
3236 * future faults in this VMA will fail rather than 3240 * future faults in this VMA will fail rather than
@@ -3246,30 +3250,14 @@ again:
3246 3250
3247 hugetlb_count_sub(pages_per_huge_page(h), mm); 3251 hugetlb_count_sub(pages_per_huge_page(h), mm);
3248 page_remove_rmap(page, true); 3252 page_remove_rmap(page, true);
3249 force_flush = !__tlb_remove_page(tlb, page); 3253
3250 if (force_flush) {
3251 address += sz;
3252 spin_unlock(ptl);
3253 break;
3254 }
3255 /* Bail out after unmapping reference page if supplied */
3256 if (ref_page) {
3257 spin_unlock(ptl);
3258 break;
3259 }
3260unlock:
3261 spin_unlock(ptl); 3254 spin_unlock(ptl);
3262 } 3255 tlb_remove_page_size(tlb, page, huge_page_size(h));
3263 /* 3256 /*
3264 * mmu_gather ran out of room to batch pages, we break out of 3257 * Bail out after unmapping reference page if supplied
3265 * the PTE lock to avoid doing the potential expensive TLB invalidate 3258 */
3266 * and page-free while holding it. 3259 if (ref_page)
3267 */ 3260 break;
3268 if (force_flush) {
3269 force_flush = 0;
3270 tlb_flush_mmu(tlb);
3271 if (address < end && !ref_page)
3272 goto again;
3273 } 3261 }
3274 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 3262 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
3275 tlb_end_vma(tlb, vma); 3263 tlb_end_vma(tlb, vma);
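The hugetlb hunk above drops the hand-rolled force_flush/tlb_flush_mmu retry and instead hands each huge page to tlb_remove_page_size(), letting the mmu_gather layer account the batch by the real page size and decide itself when to flush. A toy model of size-aware batching: pages accumulate until a byte budget is exceeded, then the batch is flushed; the budget, types and numbers here are invented for illustration.

#include <stdio.h>
#include <stddef.h>

#define BATCH_BUDGET (4UL << 20)	/* flush after ~4 MiB worth of pages (illustrative) */

struct gather {
	size_t batched_bytes;
	int flushes;
};

static void tlb_flush(struct gather *tlb)
{
	tlb->flushes++;
	tlb->batched_bytes = 0;
}

/* Like tlb_remove_page_size(): the caller reports how big the page really is,
 * so one 2 MiB huge page consumes as much of the budget as 512 small pages. */
static void remove_page_size(struct gather *tlb, size_t page_size)
{
	tlb->batched_bytes += page_size;
	if (tlb->batched_bytes >= BATCH_BUDGET)
		tlb_flush(tlb);
}

int main(void)
{
	struct gather tlb = { 0, 0 };

	for (int i = 0; i < 8; i++)
		remove_page_size(&tlb, 2UL << 20);	/* eight 2 MiB hugetlb pages */
	if (tlb.batched_bytes)
		tlb_flush(&tlb);			/* final flush at teardown */

	printf("flushes: %d\n", tlb.flushes);		/* 4 with a 4 MiB budget */
	return 0;
}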
diff --git a/mm/internal.h b/mm/internal.h
index 2524ec880e24..9b6a6c43ac39 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -36,6 +36,8 @@
36/* Do not use these with a slab allocator */ 36/* Do not use these with a slab allocator */
37#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) 37#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)
38 38
39int do_swap_page(struct fault_env *fe, pte_t orig_pte);
40
39void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, 41void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
40 unsigned long floor, unsigned long ceiling); 42 unsigned long floor, unsigned long ceiling);
41 43
@@ -150,6 +152,8 @@ extern int __isolate_free_page(struct page *page, unsigned int order);
150extern void __free_pages_bootmem(struct page *page, unsigned long pfn, 152extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
151 unsigned int order); 153 unsigned int order);
152extern void prep_compound_page(struct page *page, unsigned int order); 154extern void prep_compound_page(struct page *page, unsigned int order);
155extern void post_alloc_hook(struct page *page, unsigned int order,
156 gfp_t gfp_flags);
153extern int user_min_free_kbytes; 157extern int user_min_free_kbytes;
154 158
155#if defined CONFIG_COMPACTION || defined CONFIG_CMA 159#if defined CONFIG_COMPACTION || defined CONFIG_CMA
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
new file mode 100644
index 000000000000..7dbee698d6aa
--- /dev/null
+++ b/mm/khugepaged.c
@@ -0,0 +1,1922 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
3#include <linux/mm.h>
4#include <linux/sched.h>
5#include <linux/mmu_notifier.h>
6#include <linux/rmap.h>
7#include <linux/swap.h>
8#include <linux/mm_inline.h>
9#include <linux/kthread.h>
10#include <linux/khugepaged.h>
11#include <linux/freezer.h>
12#include <linux/mman.h>
13#include <linux/hashtable.h>
14#include <linux/userfaultfd_k.h>
15#include <linux/page_idle.h>
16#include <linux/swapops.h>
17#include <linux/shmem_fs.h>
18
19#include <asm/tlb.h>
20#include <asm/pgalloc.h>
21#include "internal.h"
22
23enum scan_result {
24 SCAN_FAIL,
25 SCAN_SUCCEED,
26 SCAN_PMD_NULL,
27 SCAN_EXCEED_NONE_PTE,
28 SCAN_PTE_NON_PRESENT,
29 SCAN_PAGE_RO,
30 SCAN_LACK_REFERENCED_PAGE,
31 SCAN_PAGE_NULL,
32 SCAN_SCAN_ABORT,
33 SCAN_PAGE_COUNT,
34 SCAN_PAGE_LRU,
35 SCAN_PAGE_LOCK,
36 SCAN_PAGE_ANON,
37 SCAN_PAGE_COMPOUND,
38 SCAN_ANY_PROCESS,
39 SCAN_VMA_NULL,
40 SCAN_VMA_CHECK,
41 SCAN_ADDRESS_RANGE,
42 SCAN_SWAP_CACHE_PAGE,
43 SCAN_DEL_PAGE_LRU,
44 SCAN_ALLOC_HUGE_PAGE_FAIL,
45 SCAN_CGROUP_CHARGE_FAIL,
46 SCAN_EXCEED_SWAP_PTE,
47 SCAN_TRUNCATED,
48};
49
50#define CREATE_TRACE_POINTS
51#include <trace/events/huge_memory.h>
52
 53/* default scan 8*512 ptes (or vmas) every 10 seconds */
54static unsigned int khugepaged_pages_to_scan __read_mostly;
55static unsigned int khugepaged_pages_collapsed;
56static unsigned int khugepaged_full_scans;
57static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
58/* during fragmentation poll the hugepage allocator once every minute */
59static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
60static unsigned long khugepaged_sleep_expire;
61static DEFINE_SPINLOCK(khugepaged_mm_lock);
62static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
63/*
 64 * By default, collapse hugepages if there is at least one pte mapped, as
 65 * would have happened if the vma had been large enough during the page
 66 * fault.
67 */
68static unsigned int khugepaged_max_ptes_none __read_mostly;
69static unsigned int khugepaged_max_ptes_swap __read_mostly;
70
71#define MM_SLOTS_HASH_BITS 10
72static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
73
74static struct kmem_cache *mm_slot_cache __read_mostly;
75
76/**
77 * struct mm_slot - hash lookup from mm to mm_slot
78 * @hash: hash collision list
79 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
80 * @mm: the mm that this information is valid for
81 */
82struct mm_slot {
83 struct hlist_node hash;
84 struct list_head mm_node;
85 struct mm_struct *mm;
86};
87
88/**
89 * struct khugepaged_scan - cursor for scanning
90 * @mm_head: the head of the mm list to scan
91 * @mm_slot: the current mm_slot we are scanning
92 * @address: the next address inside that to be scanned
93 *
94 * There is only the one khugepaged_scan instance of this cursor structure.
95 */
96struct khugepaged_scan {
97 struct list_head mm_head;
98 struct mm_slot *mm_slot;
99 unsigned long address;
100};
101
102static struct khugepaged_scan khugepaged_scan = {
103 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
104};
105
106static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
107 struct kobj_attribute *attr,
108 char *buf)
109{
110 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
111}
112
113static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
114 struct kobj_attribute *attr,
115 const char *buf, size_t count)
116{
117 unsigned long msecs;
118 int err;
119
120 err = kstrtoul(buf, 10, &msecs);
121 if (err || msecs > UINT_MAX)
122 return -EINVAL;
123
124 khugepaged_scan_sleep_millisecs = msecs;
125 khugepaged_sleep_expire = 0;
126 wake_up_interruptible(&khugepaged_wait);
127
128 return count;
129}
130static struct kobj_attribute scan_sleep_millisecs_attr =
131 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
132 scan_sleep_millisecs_store);
133
134static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
135 struct kobj_attribute *attr,
136 char *buf)
137{
138 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
139}
140
141static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
142 struct kobj_attribute *attr,
143 const char *buf, size_t count)
144{
145 unsigned long msecs;
146 int err;
147
148 err = kstrtoul(buf, 10, &msecs);
149 if (err || msecs > UINT_MAX)
150 return -EINVAL;
151
152 khugepaged_alloc_sleep_millisecs = msecs;
153 khugepaged_sleep_expire = 0;
154 wake_up_interruptible(&khugepaged_wait);
155
156 return count;
157}
158static struct kobj_attribute alloc_sleep_millisecs_attr =
159 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
160 alloc_sleep_millisecs_store);
161
162static ssize_t pages_to_scan_show(struct kobject *kobj,
163 struct kobj_attribute *attr,
164 char *buf)
165{
166 return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
167}
168static ssize_t pages_to_scan_store(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 const char *buf, size_t count)
171{
172 int err;
173 unsigned long pages;
174
175 err = kstrtoul(buf, 10, &pages);
176 if (err || !pages || pages > UINT_MAX)
177 return -EINVAL;
178
179 khugepaged_pages_to_scan = pages;
180
181 return count;
182}
183static struct kobj_attribute pages_to_scan_attr =
184 __ATTR(pages_to_scan, 0644, pages_to_scan_show,
185 pages_to_scan_store);
186
187static ssize_t pages_collapsed_show(struct kobject *kobj,
188 struct kobj_attribute *attr,
189 char *buf)
190{
191 return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
192}
193static struct kobj_attribute pages_collapsed_attr =
194 __ATTR_RO(pages_collapsed);
195
196static ssize_t full_scans_show(struct kobject *kobj,
197 struct kobj_attribute *attr,
198 char *buf)
199{
200 return sprintf(buf, "%u\n", khugepaged_full_scans);
201}
202static struct kobj_attribute full_scans_attr =
203 __ATTR_RO(full_scans);
204
205static ssize_t khugepaged_defrag_show(struct kobject *kobj,
206 struct kobj_attribute *attr, char *buf)
207{
208 return single_hugepage_flag_show(kobj, attr, buf,
209 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
210}
211static ssize_t khugepaged_defrag_store(struct kobject *kobj,
212 struct kobj_attribute *attr,
213 const char *buf, size_t count)
214{
215 return single_hugepage_flag_store(kobj, attr, buf, count,
216 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
217}
218static struct kobj_attribute khugepaged_defrag_attr =
219 __ATTR(defrag, 0644, khugepaged_defrag_show,
220 khugepaged_defrag_store);
221
222/*
223 * max_ptes_none controls whether khugepaged should collapse hugepages over
224 * any unmapped ptes, in turn potentially increasing the memory
225 * footprint of the vmas. When max_ptes_none is 0, khugepaged will not
226 * reduce the available free memory in the system as it
227 * runs. Increasing max_ptes_none will instead potentially reduce the
228 * free memory in the system during the khugepaged scan.
229 */
230static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
231 struct kobj_attribute *attr,
232 char *buf)
233{
234 return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
235}
236static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
237 struct kobj_attribute *attr,
238 const char *buf, size_t count)
239{
240 int err;
241 unsigned long max_ptes_none;
242
243 err = kstrtoul(buf, 10, &max_ptes_none);
244 if (err || max_ptes_none > HPAGE_PMD_NR-1)
245 return -EINVAL;
246
247 khugepaged_max_ptes_none = max_ptes_none;
248
249 return count;
250}
251static struct kobj_attribute khugepaged_max_ptes_none_attr =
252 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
253 khugepaged_max_ptes_none_store);
254
255static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
256 struct kobj_attribute *attr,
257 char *buf)
258{
259 return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
260}
261
262static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
263 struct kobj_attribute *attr,
264 const char *buf, size_t count)
265{
266 int err;
267 unsigned long max_ptes_swap;
268
269 err = kstrtoul(buf, 10, &max_ptes_swap);
270 if (err || max_ptes_swap > HPAGE_PMD_NR-1)
271 return -EINVAL;
272
273 khugepaged_max_ptes_swap = max_ptes_swap;
274
275 return count;
276}
277
278static struct kobj_attribute khugepaged_max_ptes_swap_attr =
279 __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
280 khugepaged_max_ptes_swap_store);
281
282static struct attribute *khugepaged_attr[] = {
283 &khugepaged_defrag_attr.attr,
284 &khugepaged_max_ptes_none_attr.attr,
285 &pages_to_scan_attr.attr,
286 &pages_collapsed_attr.attr,
287 &full_scans_attr.attr,
288 &scan_sleep_millisecs_attr.attr,
289 &alloc_sleep_millisecs_attr.attr,
290 &khugepaged_max_ptes_swap_attr.attr,
291 NULL,
292};
293
294struct attribute_group khugepaged_attr_group = {
295 .attrs = khugepaged_attr,
296 .name = "khugepaged",
297};
298
299#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
300
301int hugepage_madvise(struct vm_area_struct *vma,
302 unsigned long *vm_flags, int advice)
303{
304 switch (advice) {
305 case MADV_HUGEPAGE:
306#ifdef CONFIG_S390
307 /*
308 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
309 * can't handle this properly after s390_enable_sie, so we simply
310 * ignore the madvise to prevent qemu from causing a SIGSEGV.
311 */
312 if (mm_has_pgste(vma->vm_mm))
313 return 0;
314#endif
315 *vm_flags &= ~VM_NOHUGEPAGE;
316 *vm_flags |= VM_HUGEPAGE;
317 /*
318 * If the vma becomes good for khugepaged to scan,
319 * register it here without waiting for a page fault that
320 * may not happen any time soon.
321 */
322 if (!(*vm_flags & VM_NO_KHUGEPAGED) &&
323 khugepaged_enter_vma_merge(vma, *vm_flags))
324 return -ENOMEM;
325 break;
326 case MADV_NOHUGEPAGE:
327 *vm_flags &= ~VM_HUGEPAGE;
328 *vm_flags |= VM_NOHUGEPAGE;
329 /*
330 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
331 * this vma, even if the mm stays registered in khugepaged because
332 * it was registered before VM_NOHUGEPAGE was set.
333 */
334 break;
335 }
336
337 return 0;
338}
339
340int __init khugepaged_init(void)
341{
342 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
343 sizeof(struct mm_slot),
344 __alignof__(struct mm_slot), 0, NULL);
345 if (!mm_slot_cache)
346 return -ENOMEM;
347
348 khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
349 khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
350 khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
351
352 return 0;
353}
354
355void __init khugepaged_destroy(void)
356{
357 kmem_cache_destroy(mm_slot_cache);
358}
359
360static inline struct mm_slot *alloc_mm_slot(void)
361{
362 if (!mm_slot_cache) /* initialization failed */
363 return NULL;
364 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
365}
366
367static inline void free_mm_slot(struct mm_slot *mm_slot)
368{
369 kmem_cache_free(mm_slot_cache, mm_slot);
370}
371
372static struct mm_slot *get_mm_slot(struct mm_struct *mm)
373{
374 struct mm_slot *mm_slot;
375
376 hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm)
377 if (mm == mm_slot->mm)
378 return mm_slot;
379
380 return NULL;
381}
382
383static void insert_to_mm_slots_hash(struct mm_struct *mm,
384 struct mm_slot *mm_slot)
385{
386 mm_slot->mm = mm;
387 hash_add(mm_slots_hash, &mm_slot->hash, (long)mm);
388}
389
390static inline int khugepaged_test_exit(struct mm_struct *mm)
391{
392 return atomic_read(&mm->mm_users) == 0;
393}
394
395int __khugepaged_enter(struct mm_struct *mm)
396{
397 struct mm_slot *mm_slot;
398 int wakeup;
399
400 mm_slot = alloc_mm_slot();
401 if (!mm_slot)
402 return -ENOMEM;
403
404 /* __khugepaged_exit() must not run from under us */
405 VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
406 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
407 free_mm_slot(mm_slot);
408 return 0;
409 }
410
411 spin_lock(&khugepaged_mm_lock);
412 insert_to_mm_slots_hash(mm, mm_slot);
413 /*
414 * Insert just behind the scanning cursor, to let the area settle
415 * down a little.
416 */
417 wakeup = list_empty(&khugepaged_scan.mm_head);
418 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
419 spin_unlock(&khugepaged_mm_lock);
420
421 atomic_inc(&mm->mm_count);
422 if (wakeup)
423 wake_up_interruptible(&khugepaged_wait);
424
425 return 0;
426}
427
428int khugepaged_enter_vma_merge(struct vm_area_struct *vma,
429 unsigned long vm_flags)
430{
431 unsigned long hstart, hend;
432 if (!vma->anon_vma)
433 /*
434 * Not yet faulted in so we will register later in the
435 * page fault if needed.
436 */
437 return 0;
438 if (vma->vm_ops || (vm_flags & VM_NO_KHUGEPAGED))
439 /* khugepaged not yet working on file or special mappings */
440 return 0;
441 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
442 hend = vma->vm_end & HPAGE_PMD_MASK;
443 if (hstart < hend)
444 return khugepaged_enter(vma, vm_flags);
445 return 0;
446}
447
448void __khugepaged_exit(struct mm_struct *mm)
449{
450 struct mm_slot *mm_slot;
451 int free = 0;
452
453 spin_lock(&khugepaged_mm_lock);
454 mm_slot = get_mm_slot(mm);
455 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
456 hash_del(&mm_slot->hash);
457 list_del(&mm_slot->mm_node);
458 free = 1;
459 }
460 spin_unlock(&khugepaged_mm_lock);
461
462 if (free) {
463 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
464 free_mm_slot(mm_slot);
465 mmdrop(mm);
466 } else if (mm_slot) {
467 /*
468 * This is required to serialize against
469 * khugepaged_test_exit() (which is guaranteed to run
470 * under mmap_sem read mode). Stop here (after we
471 * return all pagetables will be destroyed) until
472 * khugepaged has finished working on the pagetables
473 * under the mmap_sem.
474 */
475 down_write(&mm->mmap_sem);
476 up_write(&mm->mmap_sem);
477 }
478}
479
480static void release_pte_page(struct page *page)
481{
482 /* 0 stands for page_is_file_cache(page) == false */
483 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
484 unlock_page(page);
485 putback_lru_page(page);
486}
487
488static void release_pte_pages(pte_t *pte, pte_t *_pte)
489{
490 while (--_pte >= pte) {
491 pte_t pteval = *_pte;
492 if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
493 release_pte_page(pte_page(pteval));
494 }
495}
496
497static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
498 unsigned long address,
499 pte_t *pte)
500{
501 struct page *page = NULL;
502 pte_t *_pte;
503 int none_or_zero = 0, result = 0, referenced = 0;
504 bool writable = false;
505
506 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
507 _pte++, address += PAGE_SIZE) {
508 pte_t pteval = *_pte;
509 if (pte_none(pteval) || (pte_present(pteval) &&
510 is_zero_pfn(pte_pfn(pteval)))) {
511 if (!userfaultfd_armed(vma) &&
512 ++none_or_zero <= khugepaged_max_ptes_none) {
513 continue;
514 } else {
515 result = SCAN_EXCEED_NONE_PTE;
516 goto out;
517 }
518 }
519 if (!pte_present(pteval)) {
520 result = SCAN_PTE_NON_PRESENT;
521 goto out;
522 }
523 page = vm_normal_page(vma, address, pteval);
524 if (unlikely(!page)) {
525 result = SCAN_PAGE_NULL;
526 goto out;
527 }
528
529 VM_BUG_ON_PAGE(PageCompound(page), page);
530 VM_BUG_ON_PAGE(!PageAnon(page), page);
531 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
532
533 /*
534 * We can do it before isolate_lru_page because the
535 * page can't be freed from under us. NOTE: PG_lock
536 * is needed to serialize against split_huge_page
537 * when invoked from the VM.
538 */
539 if (!trylock_page(page)) {
540 result = SCAN_PAGE_LOCK;
541 goto out;
542 }
543
544 /*
545 * cannot use mapcount: can't collapse if there's a gup pin.
546 * The page must only be referenced by the scanned process
547 * and page swap cache.
548 */
549 if (page_count(page) != 1 + !!PageSwapCache(page)) {
550 unlock_page(page);
551 result = SCAN_PAGE_COUNT;
552 goto out;
553 }
554 if (pte_write(pteval)) {
555 writable = true;
556 } else {
557 if (PageSwapCache(page) &&
558 !reuse_swap_page(page, NULL)) {
559 unlock_page(page);
560 result = SCAN_SWAP_CACHE_PAGE;
561 goto out;
562 }
563 /*
564 * Page is not in the swap cache. It can be collapsed
565 * into a THP.
566 */
567 }
568
569 /*
570 * Isolate the page to avoid collapsing a hugepage
571 * currently in use by the VM.
572 */
573 if (isolate_lru_page(page)) {
574 unlock_page(page);
575 result = SCAN_DEL_PAGE_LRU;
576 goto out;
577 }
578 /* 0 stands for page_is_file_cache(page) == false */
579 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
580 VM_BUG_ON_PAGE(!PageLocked(page), page);
581 VM_BUG_ON_PAGE(PageLRU(page), page);
582
583 /* There should be enough young ptes to collapse the page */
584 if (pte_young(pteval) ||
585 page_is_young(page) || PageReferenced(page) ||
586 mmu_notifier_test_young(vma->vm_mm, address))
587 referenced++;
588 }
589 if (likely(writable)) {
590 if (likely(referenced)) {
591 result = SCAN_SUCCEED;
592 trace_mm_collapse_huge_page_isolate(page, none_or_zero,
593 referenced, writable, result);
594 return 1;
595 }
596 } else {
597 result = SCAN_PAGE_RO;
598 }
599
600out:
601 release_pte_pages(pte, _pte);
602 trace_mm_collapse_huge_page_isolate(page, none_or_zero,
603 referenced, writable, result);
604 return 0;
605}
606
607static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
608 struct vm_area_struct *vma,
609 unsigned long address,
610 spinlock_t *ptl)
611{
612 pte_t *_pte;
613 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
614 pte_t pteval = *_pte;
615 struct page *src_page;
616
617 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
618 clear_user_highpage(page, address);
619 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
620 if (is_zero_pfn(pte_pfn(pteval))) {
621 /*
622 * ptl mostly unnecessary.
623 */
624 spin_lock(ptl);
625 /*
626 * paravirt calls inside pte_clear here are
627 * superfluous.
628 */
629 pte_clear(vma->vm_mm, address, _pte);
630 spin_unlock(ptl);
631 }
632 } else {
633 src_page = pte_page(pteval);
634 copy_user_highpage(page, src_page, address, vma);
635 VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
636 release_pte_page(src_page);
637 /*
638 * ptl mostly unnecessary, but preempt has to
639 * be disabled to update the per-cpu stats
640 * inside page_remove_rmap().
641 */
642 spin_lock(ptl);
643 /*
644 * paravirt calls inside pte_clear here are
645 * superfluous.
646 */
647 pte_clear(vma->vm_mm, address, _pte);
648 page_remove_rmap(src_page, false);
649 spin_unlock(ptl);
650 free_page_and_swap_cache(src_page);
651 }
652
653 address += PAGE_SIZE;
654 page++;
655 }
656}
657
658static void khugepaged_alloc_sleep(void)
659{
660 DEFINE_WAIT(wait);
661
662 add_wait_queue(&khugepaged_wait, &wait);
663 freezable_schedule_timeout_interruptible(
664 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
665 remove_wait_queue(&khugepaged_wait, &wait);
666}
667
668static int khugepaged_node_load[MAX_NUMNODES];
669
670static bool khugepaged_scan_abort(int nid)
671{
672 int i;
673
674 /*
675 * If zone_reclaim_mode is disabled, then no extra effort is made to
676 * allocate memory locally.
677 */
678 if (!zone_reclaim_mode)
679 return false;
680
681 /* If there is a count for this node already, it must be acceptable */
682 if (khugepaged_node_load[nid])
683 return false;
684
685 for (i = 0; i < MAX_NUMNODES; i++) {
686 if (!khugepaged_node_load[i])
687 continue;
688 if (node_distance(nid, i) > RECLAIM_DISTANCE)
689 return true;
690 }
691 return false;
692}
693
694/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
695static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
696{
697 return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0);
698}
699
700#ifdef CONFIG_NUMA
701static int khugepaged_find_target_node(void)
702{
703 static int last_khugepaged_target_node = NUMA_NO_NODE;
704 int nid, target_node = 0, max_value = 0;
705
706 /* find first node with max normal pages hit */
707 for (nid = 0; nid < MAX_NUMNODES; nid++)
708 if (khugepaged_node_load[nid] > max_value) {
709 max_value = khugepaged_node_load[nid];
710 target_node = nid;
711 }
712
713 /* do some balancing if several nodes have the same hit record */
714 if (target_node <= last_khugepaged_target_node)
715 for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
716 nid++)
717 if (max_value == khugepaged_node_load[nid]) {
718 target_node = nid;
719 break;
720 }
721
722 last_khugepaged_target_node = target_node;
723 return target_node;
724}
725
726static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
727{
728 if (IS_ERR(*hpage)) {
729 if (!*wait)
730 return false;
731
732 *wait = false;
733 *hpage = NULL;
734 khugepaged_alloc_sleep();
735 } else if (*hpage) {
736 put_page(*hpage);
737 *hpage = NULL;
738 }
739
740 return true;
741}
742
743static struct page *
744khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
745{
746 VM_BUG_ON_PAGE(*hpage, *hpage);
747
748 *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
749 if (unlikely(!*hpage)) {
750 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
751 *hpage = ERR_PTR(-ENOMEM);
752 return NULL;
753 }
754
755 prep_transhuge_page(*hpage);
756 count_vm_event(THP_COLLAPSE_ALLOC);
757 return *hpage;
758}
759#else
760static int khugepaged_find_target_node(void)
761{
762 return 0;
763}
764
765static inline struct page *alloc_khugepaged_hugepage(void)
766{
767 struct page *page;
768
769 page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(),
770 HPAGE_PMD_ORDER);
771 if (page)
772 prep_transhuge_page(page);
773 return page;
774}
775
776static struct page *khugepaged_alloc_hugepage(bool *wait)
777{
778 struct page *hpage;
779
780 do {
781 hpage = alloc_khugepaged_hugepage();
782 if (!hpage) {
783 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
784 if (!*wait)
785 return NULL;
786
787 *wait = false;
788 khugepaged_alloc_sleep();
789 } else
790 count_vm_event(THP_COLLAPSE_ALLOC);
791 } while (unlikely(!hpage) && likely(khugepaged_enabled()));
792
793 return hpage;
794}
795
796static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
797{
798 if (!*hpage)
799 *hpage = khugepaged_alloc_hugepage(wait);
800
801 if (unlikely(!*hpage))
802 return false;
803
804 return true;
805}
806
807static struct page *
808khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
809{
810 VM_BUG_ON(!*hpage);
811
812 return *hpage;
813}
814#endif
815
816static bool hugepage_vma_check(struct vm_area_struct *vma)
817{
818 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
819 (vma->vm_flags & VM_NOHUGEPAGE))
820 return false;
821 if (shmem_file(vma->vm_file)) {
822 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
823 return false;
824 return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
825 HPAGE_PMD_NR);
826 }
827 if (!vma->anon_vma || vma->vm_ops)
828 return false;
829 if (is_vma_temporary_stack(vma))
830 return false;
831 return !(vma->vm_flags & VM_NO_KHUGEPAGED);
832}
833
834/*
835 * If the mmap_sem was temporarily dropped, revalidate the vma
836 * after re-taking the mmap_sem.
837 * Return 0 on success, otherwise return a non-zero
838 * value (scan code).
839 */
840
841static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address)
842{
843 struct vm_area_struct *vma;
844 unsigned long hstart, hend;
845
846 if (unlikely(khugepaged_test_exit(mm)))
847 return SCAN_ANY_PROCESS;
848
849 vma = find_vma(mm, address);
850 if (!vma)
851 return SCAN_VMA_NULL;
852
853 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
854 hend = vma->vm_end & HPAGE_PMD_MASK;
855 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
856 return SCAN_ADDRESS_RANGE;
857 if (!hugepage_vma_check(vma))
858 return SCAN_VMA_CHECK;
859 return 0;
860}
861
862/*
863 * Bring missing pages in from swap, to complete THP collapse.
864 * Only done if khugepaged_scan_pmd believes it is worthwhile.
865 *
866 * Called and returns without pte mapped or spinlocks held,
867 * but with mmap_sem held to protect against vma changes.
868 */
869
870static bool __collapse_huge_page_swapin(struct mm_struct *mm,
871 struct vm_area_struct *vma,
872 unsigned long address, pmd_t *pmd,
873 int referenced)
874{
875 pte_t pteval;
876 int swapped_in = 0, ret = 0;
877 struct fault_env fe = {
878 .vma = vma,
879 .address = address,
880 .flags = FAULT_FLAG_ALLOW_RETRY,
881 .pmd = pmd,
882 };
883
884 fe.pte = pte_offset_map(pmd, address);
885 for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE;
886 fe.pte++, fe.address += PAGE_SIZE) {
887 pteval = *fe.pte;
888 if (!is_swap_pte(pteval))
889 continue;
890 swapped_in++;
891 /* we only decide to swap in if there are enough young ptes */
892 if (referenced < HPAGE_PMD_NR/2) {
893 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
894 return false;
895 }
896 ret = do_swap_page(&fe, pteval);
897
898 /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
899 if (ret & VM_FAULT_RETRY) {
900 down_read(&mm->mmap_sem);
901 if (hugepage_vma_revalidate(mm, address)) {
902 /* vma is no longer available, don't continue to swapin */
903 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
904 return false;
905 }
906 /* check if the pmd is still valid */
907 if (mm_find_pmd(mm, address) != pmd)
908 return false;
909 }
910 if (ret & VM_FAULT_ERROR) {
911 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
912 return false;
913 }
914 /* pte is unmapped now, we need to map it */
915 fe.pte = pte_offset_map(pmd, fe.address);
916 }
917 fe.pte--;
918 pte_unmap(fe.pte);
919 trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
920 return true;
921}
922
923static void collapse_huge_page(struct mm_struct *mm,
924 unsigned long address,
925 struct page **hpage,
926 struct vm_area_struct *vma,
927 int node, int referenced)
928{
929 pmd_t *pmd, _pmd;
930 pte_t *pte;
931 pgtable_t pgtable;
932 struct page *new_page;
933 spinlock_t *pmd_ptl, *pte_ptl;
934 int isolated = 0, result = 0;
935 struct mem_cgroup *memcg;
936 unsigned long mmun_start; /* For mmu_notifiers */
937 unsigned long mmun_end; /* For mmu_notifiers */
938 gfp_t gfp;
939
940 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
941
942 /* Only allocate from the target node */
943 gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE;
944
945 /*
946 * Before allocating the hugepage, release the mmap_sem read lock.
947 * The allocation can take potentially a long time if it involves
948 * sync compaction, and we do not need to hold the mmap_sem during
949 * that. We will recheck the vma after taking it again in write mode.
950 */
951 up_read(&mm->mmap_sem);
952 new_page = khugepaged_alloc_page(hpage, gfp, node);
953 if (!new_page) {
954 result = SCAN_ALLOC_HUGE_PAGE_FAIL;
955 goto out_nolock;
956 }
957
958 if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
959 result = SCAN_CGROUP_CHARGE_FAIL;
960 goto out_nolock;
961 }
962
963 down_read(&mm->mmap_sem);
964 result = hugepage_vma_revalidate(mm, address);
965 if (result) {
966 mem_cgroup_cancel_charge(new_page, memcg, true);
967 up_read(&mm->mmap_sem);
968 goto out_nolock;
969 }
970
971 pmd = mm_find_pmd(mm, address);
972 if (!pmd) {
973 result = SCAN_PMD_NULL;
974 mem_cgroup_cancel_charge(new_page, memcg, true);
975 up_read(&mm->mmap_sem);
976 goto out_nolock;
977 }
978
979 /*
980 * __collapse_huge_page_swapin always returns with mmap_sem locked.
981 * If it fails, we release mmap_sem and jump out_nolock.
982 * Continuing to collapse causes inconsistency.
983 */
984 if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) {
985 mem_cgroup_cancel_charge(new_page, memcg, true);
986 up_read(&mm->mmap_sem);
987 goto out_nolock;
988 }
989
990 up_read(&mm->mmap_sem);
991 /*
992 * Prevent all access to pagetables with the exception of
993 * gup_fast later handled by the ptep_clear_flush and the VM
994 * handled by the anon_vma lock + PG_lock.
995 */
996 down_write(&mm->mmap_sem);
997 result = hugepage_vma_revalidate(mm, address);
998 if (result)
999 goto out;
1000 /* check if the pmd is still valid */
1001 if (mm_find_pmd(mm, address) != pmd)
1002 goto out;
1003
1004 anon_vma_lock_write(vma->anon_vma);
1005
1006 pte = pte_offset_map(pmd, address);
1007 pte_ptl = pte_lockptr(mm, pmd);
1008
1009 mmun_start = address;
1010 mmun_end = address + HPAGE_PMD_SIZE;
1011 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1012 pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
1013 /*
1014 * After this gup_fast can't run anymore. This also removes
1015 * any huge TLB entry from the CPU so we won't allow
1016 * huge and small TLB entries for the same virtual address
1017 * to avoid the risk of CPU bugs in that area.
1018 */
1019 _pmd = pmdp_collapse_flush(vma, address, pmd);
1020 spin_unlock(pmd_ptl);
1021 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1022
1023 spin_lock(pte_ptl);
1024 isolated = __collapse_huge_page_isolate(vma, address, pte);
1025 spin_unlock(pte_ptl);
1026
1027 if (unlikely(!isolated)) {
1028 pte_unmap(pte);
1029 spin_lock(pmd_ptl);
1030 BUG_ON(!pmd_none(*pmd));
1031 /*
1032 * We can only use set_pmd_at when establishing
1033 * hugepmds and never for establishing regular pmds that
1034 * point to regular pagetables. Use pmd_populate for that.
1035 */
1036 pmd_populate(mm, pmd, pmd_pgtable(_pmd));
1037 spin_unlock(pmd_ptl);
1038 anon_vma_unlock_write(vma->anon_vma);
1039 result = SCAN_FAIL;
1040 goto out;
1041 }
1042
1043 /*
1044 * All pages are isolated and locked so anon_vma rmap
1045 * can't run anymore.
1046 */
1047 anon_vma_unlock_write(vma->anon_vma);
1048
1049 __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
1050 pte_unmap(pte);
1051 __SetPageUptodate(new_page);
1052 pgtable = pmd_pgtable(_pmd);
1053
1054 _pmd = mk_huge_pmd(new_page, vma->vm_page_prot);
1055 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1056
1057 /*
1058 * spin_lock() below is not the equivalent of smp_wmb(), so
1059 * this is needed to avoid the copy_huge_page writes becoming
1060 * visible after the set_pmd_at() write.
1061 */
1062 smp_wmb();
1063
1064 spin_lock(pmd_ptl);
1065 BUG_ON(!pmd_none(*pmd));
1066 page_add_new_anon_rmap(new_page, vma, address, true);
1067 mem_cgroup_commit_charge(new_page, memcg, false, true);
1068 lru_cache_add_active_or_unevictable(new_page, vma);
1069 pgtable_trans_huge_deposit(mm, pmd, pgtable);
1070 set_pmd_at(mm, address, pmd, _pmd);
1071 update_mmu_cache_pmd(vma, address, pmd);
1072 spin_unlock(pmd_ptl);
1073
1074 *hpage = NULL;
1075
1076 khugepaged_pages_collapsed++;
1077 result = SCAN_SUCCEED;
1078out_up_write:
1079 up_write(&mm->mmap_sem);
1080out_nolock:
1081 trace_mm_collapse_huge_page(mm, isolated, result);
1082 return;
1083out:
1084 mem_cgroup_cancel_charge(new_page, memcg, true);
1085 goto out_up_write;
1086}
1087
1088static int khugepaged_scan_pmd(struct mm_struct *mm,
1089 struct vm_area_struct *vma,
1090 unsigned long address,
1091 struct page **hpage)
1092{
1093 pmd_t *pmd;
1094 pte_t *pte, *_pte;
1095 int ret = 0, none_or_zero = 0, result = 0, referenced = 0;
1096 struct page *page = NULL;
1097 unsigned long _address;
1098 spinlock_t *ptl;
1099 int node = NUMA_NO_NODE, unmapped = 0;
1100 bool writable = false;
1101
1102 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1103
1104 pmd = mm_find_pmd(mm, address);
1105 if (!pmd) {
1106 result = SCAN_PMD_NULL;
1107 goto out;
1108 }
1109
1110 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
1111 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1112 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
1113 _pte++, _address += PAGE_SIZE) {
1114 pte_t pteval = *_pte;
1115 if (is_swap_pte(pteval)) {
1116 if (++unmapped <= khugepaged_max_ptes_swap) {
1117 continue;
1118 } else {
1119 result = SCAN_EXCEED_SWAP_PTE;
1120 goto out_unmap;
1121 }
1122 }
1123 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
1124 if (!userfaultfd_armed(vma) &&
1125 ++none_or_zero <= khugepaged_max_ptes_none) {
1126 continue;
1127 } else {
1128 result = SCAN_EXCEED_NONE_PTE;
1129 goto out_unmap;
1130 }
1131 }
1132 if (!pte_present(pteval)) {
1133 result = SCAN_PTE_NON_PRESENT;
1134 goto out_unmap;
1135 }
1136 if (pte_write(pteval))
1137 writable = true;
1138
1139 page = vm_normal_page(vma, _address, pteval);
1140 if (unlikely(!page)) {
1141 result = SCAN_PAGE_NULL;
1142 goto out_unmap;
1143 }
1144
1145 /* TODO: teach khugepaged to collapse THP mapped with pte */
1146 if (PageCompound(page)) {
1147 result = SCAN_PAGE_COMPOUND;
1148 goto out_unmap;
1149 }
1150
1151 /*
1152 * Record which node the original page is from and save this
1153 * information to khugepaged_node_load[].
1154 * Khugepaged will allocate the hugepage from the node that has
1155 * the max hit record.
1156 */
1157 node = page_to_nid(page);
1158 if (khugepaged_scan_abort(node)) {
1159 result = SCAN_SCAN_ABORT;
1160 goto out_unmap;
1161 }
1162 khugepaged_node_load[node]++;
1163 if (!PageLRU(page)) {
1164 result = SCAN_PAGE_LRU;
1165 goto out_unmap;
1166 }
1167 if (PageLocked(page)) {
1168 result = SCAN_PAGE_LOCK;
1169 goto out_unmap;
1170 }
1171 if (!PageAnon(page)) {
1172 result = SCAN_PAGE_ANON;
1173 goto out_unmap;
1174 }
1175
1176 /*
1177 * cannot use mapcount: can't collapse if there's a gup pin.
1178 * The page must only be referenced by the scanned process
1179 * and page swap cache.
1180 */
1181 if (page_count(page) != 1 + !!PageSwapCache(page)) {
1182 result = SCAN_PAGE_COUNT;
1183 goto out_unmap;
1184 }
1185 if (pte_young(pteval) ||
1186 page_is_young(page) || PageReferenced(page) ||
1187 mmu_notifier_test_young(vma->vm_mm, address))
1188 referenced++;
1189 }
1190 if (writable) {
1191 if (referenced) {
1192 result = SCAN_SUCCEED;
1193 ret = 1;
1194 } else {
1195 result = SCAN_LACK_REFERENCED_PAGE;
1196 }
1197 } else {
1198 result = SCAN_PAGE_RO;
1199 }
1200out_unmap:
1201 pte_unmap_unlock(pte, ptl);
1202 if (ret) {
1203 node = khugepaged_find_target_node();
1204 /* collapse_huge_page will return with the mmap_sem released */
1205 collapse_huge_page(mm, address, hpage, vma, node, referenced);
1206 }
1207out:
1208 trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
1209 none_or_zero, result, unmapped);
1210 return ret;
1211}
1212
1213static void collect_mm_slot(struct mm_slot *mm_slot)
1214{
1215 struct mm_struct *mm = mm_slot->mm;
1216
1217 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
1218
1219 if (khugepaged_test_exit(mm)) {
1220 /* free mm_slot */
1221 hash_del(&mm_slot->hash);
1222 list_del(&mm_slot->mm_node);
1223
1224 /*
1225 * Not strictly needed because the mm exited already.
1226 *
1227 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1228 */
1229
1230 /* khugepaged_mm_lock actually not necessary for the below */
1231 free_mm_slot(mm_slot);
1232 mmdrop(mm);
1233 }
1234}
1235
1236#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
1237static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
1238{
1239 struct vm_area_struct *vma;
1240 unsigned long addr;
1241 pmd_t *pmd, _pmd;
1242
1243 i_mmap_lock_write(mapping);
1244 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1245 /* probably overkill */
1246 if (vma->anon_vma)
1247 continue;
1248 addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
1249 if (addr & ~HPAGE_PMD_MASK)
1250 continue;
1251 if (vma->vm_end < addr + HPAGE_PMD_SIZE)
1252 continue;
1253 pmd = mm_find_pmd(vma->vm_mm, addr);
1254 if (!pmd)
1255 continue;
1256 /*
1257 * We need exclusive mmap_sem to retract page table.
1258 * If trylock fails we would end up with pte-mapped THP after
1259 * re-fault. Not ideal, but it's more important to not disturb
1260 * the system too much.
1261 */
1262 if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
1263 spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
1264 /* assume page table is clear */
1265 _pmd = pmdp_collapse_flush(vma, addr, pmd);
1266 spin_unlock(ptl);
1267 up_write(&vma->vm_mm->mmap_sem);
1268 atomic_long_dec(&vma->vm_mm->nr_ptes);
1269 pte_free(vma->vm_mm, pmd_pgtable(_pmd));
1270 }
1271 }
1272 i_mmap_unlock_write(mapping);
1273}
1274
1275/**
1276 * collapse_shmem - collapse small tmpfs/shmem pages into a huge one.
1277 *
1278 * Basic scheme is simple, details are more complex:
1279 * - allocate and freeze a new huge page;
1280 * - scan over the radix tree, replacing old pages with the new one
1281 * + swap in pages if necessary;
1282 * + fill in gaps;
1283 * + keep old pages around in case rollback is required;
1284 * - if replacing succeeds:
1285 * + copy data over;
1286 * + free old pages;
1287 * + unfreeze huge page;
1288 * - if replacing fails:
1289 * + put all pages back and unfreeze them;
1290 * + restore gaps in the radix-tree;
1291 * + free huge page;
1292 */
1293static void collapse_shmem(struct mm_struct *mm,
1294 struct address_space *mapping, pgoff_t start,
1295 struct page **hpage, int node)
1296{
1297 gfp_t gfp;
1298 struct page *page, *new_page, *tmp;
1299 struct mem_cgroup *memcg;
1300 pgoff_t index, end = start + HPAGE_PMD_NR;
1301 LIST_HEAD(pagelist);
1302 struct radix_tree_iter iter;
1303 void **slot;
1304 int nr_none = 0, result = SCAN_SUCCEED;
1305
1306 VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
1307
1308 /* Only allocate from the target node */
1309 gfp = alloc_hugepage_khugepaged_gfpmask() |
1310 __GFP_OTHER_NODE | __GFP_THISNODE;
1311
1312 new_page = khugepaged_alloc_page(hpage, gfp, node);
1313 if (!new_page) {
1314 result = SCAN_ALLOC_HUGE_PAGE_FAIL;
1315 goto out;
1316 }
1317
1318 if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
1319 result = SCAN_CGROUP_CHARGE_FAIL;
1320 goto out;
1321 }
1322
1323 new_page->index = start;
1324 new_page->mapping = mapping;
1325 __SetPageSwapBacked(new_page);
1326 __SetPageLocked(new_page);
1327 BUG_ON(!page_ref_freeze(new_page, 1));
1328
1329
1330 /*
1331 * At this point the new_page is 'frozen' (page_count() is zero), locked
1332 * and not up-to-date. It's safe to insert it into the radix tree, because
1333 * nobody would be able to map it or use it in any other way until we
1334 * unfreeze it.
1335 */
1336
1337 index = start;
1338 spin_lock_irq(&mapping->tree_lock);
1339 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
1340 int n = min(iter.index, end) - index;
1341
1342 /*
1343 * Handle holes in the radix tree: charge it from shmem and
1344 * insert relevant subpage of new_page into the radix-tree.
1345 */
1346 if (n && !shmem_charge(mapping->host, n)) {
1347 result = SCAN_FAIL;
1348 break;
1349 }
1350 nr_none += n;
1351 for (; index < min(iter.index, end); index++) {
1352 radix_tree_insert(&mapping->page_tree, index,
1353 new_page + (index % HPAGE_PMD_NR));
1354 }
1355
1356 /* We are done. */
1357 if (index >= end)
1358 break;
1359
1360 page = radix_tree_deref_slot_protected(slot,
1361 &mapping->tree_lock);
1362 if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) {
1363 spin_unlock_irq(&mapping->tree_lock);
1364 /* swap in or instantiate fallocated page */
1365 if (shmem_getpage(mapping->host, index, &page,
1366 SGP_NOHUGE)) {
1367 result = SCAN_FAIL;
1368 goto tree_unlocked;
1369 }
1370 spin_lock_irq(&mapping->tree_lock);
1371 } else if (trylock_page(page)) {
1372 get_page(page);
1373 } else {
1374 result = SCAN_PAGE_LOCK;
1375 break;
1376 }
1377
1378 /*
1379 * The page must be locked, so we can drop the tree_lock
1380 * without racing with truncate.
1381 */
1382 VM_BUG_ON_PAGE(!PageLocked(page), page);
1383 VM_BUG_ON_PAGE(!PageUptodate(page), page);
1384 VM_BUG_ON_PAGE(PageTransCompound(page), page);
1385
1386 if (page_mapping(page) != mapping) {
1387 result = SCAN_TRUNCATED;
1388 goto out_unlock;
1389 }
1390 spin_unlock_irq(&mapping->tree_lock);
1391
1392 if (isolate_lru_page(page)) {
1393 result = SCAN_DEL_PAGE_LRU;
1394 goto out_isolate_failed;
1395 }
1396
1397 if (page_mapped(page))
1398 unmap_mapping_range(mapping, index << PAGE_SHIFT,
1399 PAGE_SIZE, 0);
1400
1401 spin_lock_irq(&mapping->tree_lock);
1402
1403 VM_BUG_ON_PAGE(page_mapped(page), page);
1404
1405 /*
1406 * The page is expected to have page_count() == 3:
1407 * - we hold a pin on it;
1408 * - one reference from radix tree;
1409 * - one from isolate_lru_page;
1410 */
1411 if (!page_ref_freeze(page, 3)) {
1412 result = SCAN_PAGE_COUNT;
1413 goto out_lru;
1414 }
1415
1416 /*
1417 * Add the page to the list to be able to undo the collapse if
1418 * something goes wrong.
1419 */
1420 list_add_tail(&page->lru, &pagelist);
1421
1422 /* Finally, replace with the new page. */
1423 radix_tree_replace_slot(slot,
1424 new_page + (index % HPAGE_PMD_NR));
1425
1426 index++;
1427 continue;
1428out_lru:
1429 spin_unlock_irq(&mapping->tree_lock);
1430 putback_lru_page(page);
1431out_isolate_failed:
1432 unlock_page(page);
1433 put_page(page);
1434 goto tree_unlocked;
1435out_unlock:
1436 unlock_page(page);
1437 put_page(page);
1438 break;
1439 }
1440
1441 /*
1442 * Handle hole in radix tree at the end of the range.
1443 * This code only triggers if there's nothing in radix tree
1444 * beyond 'end'.
1445 */
1446 if (result == SCAN_SUCCEED && index < end) {
1447 int n = end - index;
1448
1449 if (!shmem_charge(mapping->host, n)) {
1450 result = SCAN_FAIL;
1451 goto tree_locked;
1452 }
1453
1454 for (; index < end; index++) {
1455 radix_tree_insert(&mapping->page_tree, index,
1456 new_page + (index % HPAGE_PMD_NR));
1457 }
1458 nr_none += n;
1459 }
1460
1461tree_locked:
1462 spin_unlock_irq(&mapping->tree_lock);
1463tree_unlocked:
1464
1465 if (result == SCAN_SUCCEED) {
1466 unsigned long flags;
1467 struct zone *zone = page_zone(new_page);
1468
1469 /*
1470 * Replacing the old pages with the new one has succeeded; now we need
1471 * to copy the content and free the old pages.
1472 */
1473 list_for_each_entry_safe(page, tmp, &pagelist, lru) {
1474 copy_highpage(new_page + (page->index % HPAGE_PMD_NR),
1475 page);
1476 list_del(&page->lru);
1477 unlock_page(page);
1478 page_ref_unfreeze(page, 1);
1479 page->mapping = NULL;
1480 ClearPageActive(page);
1481 ClearPageUnevictable(page);
1482 put_page(page);
1483 }
1484
1485 local_irq_save(flags);
1486 __inc_zone_page_state(new_page, NR_SHMEM_THPS);
1487 if (nr_none) {
1488 __mod_zone_page_state(zone, NR_FILE_PAGES, nr_none);
1489 __mod_zone_page_state(zone, NR_SHMEM, nr_none);
1490 }
1491 local_irq_restore(flags);
1492
1493 /*
1494 * Remove pte page tables, so we can re-fault
1495 * the page as huge.
1496 */
1497 retract_page_tables(mapping, start);
1498
1499 /* Everything is ready, let's unfreeze the new_page */
1500 set_page_dirty(new_page);
1501 SetPageUptodate(new_page);
1502 page_ref_unfreeze(new_page, HPAGE_PMD_NR);
1503 mem_cgroup_commit_charge(new_page, memcg, false, true);
1504 lru_cache_add_anon(new_page);
1505 unlock_page(new_page);
1506
1507 *hpage = NULL;
1508 } else {
1509 /* Something went wrong: roll back changes to the radix-tree */
1510 shmem_uncharge(mapping->host, nr_none);
1511 spin_lock_irq(&mapping->tree_lock);
1512 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
1513 start) {
1514 if (iter.index >= end)
1515 break;
1516 page = list_first_entry_or_null(&pagelist,
1517 struct page, lru);
1518 if (!page || iter.index < page->index) {
1519 if (!nr_none)
1520 break;
1521 /* Put holes back where they were */
1522 radix_tree_replace_slot(slot, NULL);
1523 nr_none--;
1524 continue;
1525 }
1526
1527 VM_BUG_ON_PAGE(page->index != iter.index, page);
1528
1529 /* Unfreeze the page. */
1530 list_del(&page->lru);
1531 page_ref_unfreeze(page, 2);
1532 radix_tree_replace_slot(slot, page);
1533 spin_unlock_irq(&mapping->tree_lock);
1534 putback_lru_page(page);
1535 unlock_page(page);
1536 spin_lock_irq(&mapping->tree_lock);
1537 }
1538 VM_BUG_ON(nr_none);
1539 spin_unlock_irq(&mapping->tree_lock);
1540
1541 /* Unfreeze new_page; the caller will take care of freeing it */
1542 page_ref_unfreeze(new_page, 1);
1543 mem_cgroup_cancel_charge(new_page, memcg, true);
1544 unlock_page(new_page);
1545 new_page->mapping = NULL;
1546 }
1547out:
1548 VM_BUG_ON(!list_empty(&pagelist));
1549 /* TODO: tracepoints */
1550}
1551
1552static void khugepaged_scan_shmem(struct mm_struct *mm,
1553 struct address_space *mapping,
1554 pgoff_t start, struct page **hpage)
1555{
1556 struct page *page = NULL;
1557 struct radix_tree_iter iter;
1558 void **slot;
1559 int present, swap;
1560 int node = NUMA_NO_NODE;
1561 int result = SCAN_SUCCEED;
1562
1563 present = 0;
1564 swap = 0;
1565 memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
1566 rcu_read_lock();
1567 radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
1568 if (iter.index >= start + HPAGE_PMD_NR)
1569 break;
1570
1571 page = radix_tree_deref_slot(slot);
1572 if (radix_tree_deref_retry(page)) {
1573 slot = radix_tree_iter_retry(&iter);
1574 continue;
1575 }
1576
1577 if (radix_tree_exception(page)) {
1578 if (++swap > khugepaged_max_ptes_swap) {
1579 result = SCAN_EXCEED_SWAP_PTE;
1580 break;
1581 }
1582 continue;
1583 }
1584
1585 if (PageTransCompound(page)) {
1586 result = SCAN_PAGE_COMPOUND;
1587 break;
1588 }
1589
1590 node = page_to_nid(page);
1591 if (khugepaged_scan_abort(node)) {
1592 result = SCAN_SCAN_ABORT;
1593 break;
1594 }
1595 khugepaged_node_load[node]++;
1596
1597 if (!PageLRU(page)) {
1598 result = SCAN_PAGE_LRU;
1599 break;
1600 }
1601
1602 if (page_count(page) != 1 + page_mapcount(page)) {
1603 result = SCAN_PAGE_COUNT;
1604 break;
1605 }
1606
1607 /*
1608 * We probably should check if the page is referenced here, but
1609 * nobody would transfer pte_young() to PageReferenced() for us.
1610 * And rmap walk here is just too costly...
1611 */
1612
1613 present++;
1614
1615 if (need_resched()) {
1616 cond_resched_rcu();
1617 slot = radix_tree_iter_next(&iter);
1618 }
1619 }
1620 rcu_read_unlock();
1621
1622 if (result == SCAN_SUCCEED) {
1623 if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
1624 result = SCAN_EXCEED_NONE_PTE;
1625 } else {
1626 node = khugepaged_find_target_node();
1627 collapse_shmem(mm, mapping, start, hpage, node);
1628 }
1629 }
1630
1631 /* TODO: tracepoints */
1632}
1633#else
1634static void khugepaged_scan_shmem(struct mm_struct *mm,
1635 struct address_space *mapping,
1636 pgoff_t start, struct page **hpage)
1637{
1638 BUILD_BUG();
1639}
1640#endif
1641
1642static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
1643 struct page **hpage)
1644 __releases(&khugepaged_mm_lock)
1645 __acquires(&khugepaged_mm_lock)
1646{
1647 struct mm_slot *mm_slot;
1648 struct mm_struct *mm;
1649 struct vm_area_struct *vma;
1650 int progress = 0;
1651
1652 VM_BUG_ON(!pages);
1653 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
1654
1655 if (khugepaged_scan.mm_slot)
1656 mm_slot = khugepaged_scan.mm_slot;
1657 else {
1658 mm_slot = list_entry(khugepaged_scan.mm_head.next,
1659 struct mm_slot, mm_node);
1660 khugepaged_scan.address = 0;
1661 khugepaged_scan.mm_slot = mm_slot;
1662 }
1663 spin_unlock(&khugepaged_mm_lock);
1664
1665 mm = mm_slot->mm;
1666 down_read(&mm->mmap_sem);
1667 if (unlikely(khugepaged_test_exit(mm)))
1668 vma = NULL;
1669 else
1670 vma = find_vma(mm, khugepaged_scan.address);
1671
1672 progress++;
1673 for (; vma; vma = vma->vm_next) {
1674 unsigned long hstart, hend;
1675
1676 cond_resched();
1677 if (unlikely(khugepaged_test_exit(mm))) {
1678 progress++;
1679 break;
1680 }
1681 if (!hugepage_vma_check(vma)) {
1682skip:
1683 progress++;
1684 continue;
1685 }
1686 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1687 hend = vma->vm_end & HPAGE_PMD_MASK;
1688 if (hstart >= hend)
1689 goto skip;
1690 if (khugepaged_scan.address > hend)
1691 goto skip;
1692 if (khugepaged_scan.address < hstart)
1693 khugepaged_scan.address = hstart;
1694 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
1695
1696 while (khugepaged_scan.address < hend) {
1697 int ret;
1698 cond_resched();
1699 if (unlikely(khugepaged_test_exit(mm)))
1700 goto breakouterloop;
1701
1702 VM_BUG_ON(khugepaged_scan.address < hstart ||
1703 khugepaged_scan.address + HPAGE_PMD_SIZE >
1704 hend);
1705 if (shmem_file(vma->vm_file)) {
1706 struct file *file;
1707 pgoff_t pgoff = linear_page_index(vma,
1708 khugepaged_scan.address);
1709 if (!shmem_huge_enabled(vma))
1710 goto skip;
1711 file = get_file(vma->vm_file);
1712 up_read(&mm->mmap_sem);
1713 ret = 1;
1714 khugepaged_scan_shmem(mm, file->f_mapping,
1715 pgoff, hpage);
1716 fput(file);
1717 } else {
1718 ret = khugepaged_scan_pmd(mm, vma,
1719 khugepaged_scan.address,
1720 hpage);
1721 }
1722 /* move to next address */
1723 khugepaged_scan.address += HPAGE_PMD_SIZE;
1724 progress += HPAGE_PMD_NR;
1725 if (ret)
1726 /* we released mmap_sem so break loop */
1727 goto breakouterloop_mmap_sem;
1728 if (progress >= pages)
1729 goto breakouterloop;
1730 }
1731 }
1732breakouterloop:
1733 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
1734breakouterloop_mmap_sem:
1735
1736 spin_lock(&khugepaged_mm_lock);
1737 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
1738 /*
1739 * Release the current mm_slot if this mm is about to die, or
1740 * if we scanned all vmas of this mm.
1741 */
1742 if (khugepaged_test_exit(mm) || !vma) {
1743 /*
1744 * Make sure that if mm_users is reaching zero while
1745 * khugepaged runs here, khugepaged_exit will find
1746 * mm_slot not pointing to the exiting mm.
1747 */
1748 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
1749 khugepaged_scan.mm_slot = list_entry(
1750 mm_slot->mm_node.next,
1751 struct mm_slot, mm_node);
1752 khugepaged_scan.address = 0;
1753 } else {
1754 khugepaged_scan.mm_slot = NULL;
1755 khugepaged_full_scans++;
1756 }
1757
1758 collect_mm_slot(mm_slot);
1759 }
1760
1761 return progress;
1762}
1763
1764static int khugepaged_has_work(void)
1765{
1766 return !list_empty(&khugepaged_scan.mm_head) &&
1767 khugepaged_enabled();
1768}
1769
1770static int khugepaged_wait_event(void)
1771{
1772 return !list_empty(&khugepaged_scan.mm_head) ||
1773 kthread_should_stop();
1774}
1775
1776static void khugepaged_do_scan(void)
1777{
1778 struct page *hpage = NULL;
1779 unsigned int progress = 0, pass_through_head = 0;
1780 unsigned int pages = khugepaged_pages_to_scan;
1781 bool wait = true;
1782
1783 barrier(); /* write khugepaged_pages_to_scan to local stack */
1784
1785 while (progress < pages) {
1786 if (!khugepaged_prealloc_page(&hpage, &wait))
1787 break;
1788
1789 cond_resched();
1790
1791 if (unlikely(kthread_should_stop() || try_to_freeze()))
1792 break;
1793
1794 spin_lock(&khugepaged_mm_lock);
1795 if (!khugepaged_scan.mm_slot)
1796 pass_through_head++;
1797 if (khugepaged_has_work() &&
1798 pass_through_head < 2)
1799 progress += khugepaged_scan_mm_slot(pages - progress,
1800 &hpage);
1801 else
1802 progress = pages;
1803 spin_unlock(&khugepaged_mm_lock);
1804 }
1805
1806 if (!IS_ERR_OR_NULL(hpage))
1807 put_page(hpage);
1808}
1809
1810static bool khugepaged_should_wakeup(void)
1811{
1812 return kthread_should_stop() ||
1813 time_after_eq(jiffies, khugepaged_sleep_expire);
1814}
1815
1816static void khugepaged_wait_work(void)
1817{
1818 if (khugepaged_has_work()) {
1819 const unsigned long scan_sleep_jiffies =
1820 msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
1821
1822 if (!scan_sleep_jiffies)
1823 return;
1824
1825 khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
1826 wait_event_freezable_timeout(khugepaged_wait,
1827 khugepaged_should_wakeup(),
1828 scan_sleep_jiffies);
1829 return;
1830 }
1831
1832 if (khugepaged_enabled())
1833 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
1834}
1835
1836static int khugepaged(void *none)
1837{
1838 struct mm_slot *mm_slot;
1839
1840 set_freezable();
1841 set_user_nice(current, MAX_NICE);
1842
1843 while (!kthread_should_stop()) {
1844 khugepaged_do_scan();
1845 khugepaged_wait_work();
1846 }
1847
1848 spin_lock(&khugepaged_mm_lock);
1849 mm_slot = khugepaged_scan.mm_slot;
1850 khugepaged_scan.mm_slot = NULL;
1851 if (mm_slot)
1852 collect_mm_slot(mm_slot);
1853 spin_unlock(&khugepaged_mm_lock);
1854 return 0;
1855}
1856
1857static void set_recommended_min_free_kbytes(void)
1858{
1859 struct zone *zone;
1860 int nr_zones = 0;
1861 unsigned long recommended_min;
1862
1863 for_each_populated_zone(zone)
1864 nr_zones++;
1865
1866 /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
1867 recommended_min = pageblock_nr_pages * nr_zones * 2;
1868
1869 /*
1870 * Make sure that on average at least two pageblocks are almost free
1871 * of another type, one for a migratetype to fall back to and a
1872 * second to avoid subsequent fallbacks of other types. There are 3
1873 * MIGRATE_TYPES we care about.
1874 */
1875 recommended_min += pageblock_nr_pages * nr_zones *
1876 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
1877
1878 /* don't ever allow reserving more than 5% of the lowmem */
1879 recommended_min = min(recommended_min,
1880 (unsigned long) nr_free_buffer_pages() / 20);
1881 recommended_min <<= (PAGE_SHIFT-10);
1882
1883 if (recommended_min > min_free_kbytes) {
1884 if (user_min_free_kbytes >= 0)
1885 pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
1886 min_free_kbytes, recommended_min);
1887
1888 min_free_kbytes = recommended_min;
1889 }
1890 setup_per_zone_wmarks();
1891}
1892
1893int start_stop_khugepaged(void)
1894{
1895 static struct task_struct *khugepaged_thread __read_mostly;
1896 static DEFINE_MUTEX(khugepaged_mutex);
1897 int err = 0;
1898
1899 mutex_lock(&khugepaged_mutex);
1900 if (khugepaged_enabled()) {
1901 if (!khugepaged_thread)
1902 khugepaged_thread = kthread_run(khugepaged, NULL,
1903 "khugepaged");
1904 if (IS_ERR(khugepaged_thread)) {
1905 pr_err("khugepaged: kthread_run(khugepaged) failed\n");
1906 err = PTR_ERR(khugepaged_thread);
1907 khugepaged_thread = NULL;
1908 goto fail;
1909 }
1910
1911 if (!list_empty(&khugepaged_scan.mm_head))
1912 wake_up_interruptible(&khugepaged_wait);
1913
1914 set_recommended_min_free_kbytes();
1915 } else if (khugepaged_thread) {
1916 kthread_stop(khugepaged_thread);
1917 khugepaged_thread = NULL;
1918 }
1919fail:
1920 mutex_unlock(&khugepaged_mutex);
1921 return err;
1922}
diff --git a/mm/ksm.c b/mm/ksm.c
index 4786b4150f62..73d43bafd9fb 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -376,9 +376,8 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
376 if (IS_ERR_OR_NULL(page)) 376 if (IS_ERR_OR_NULL(page))
377 break; 377 break;
378 if (PageKsm(page)) 378 if (PageKsm(page))
379 ret = handle_mm_fault(vma->vm_mm, vma, addr, 379 ret = handle_mm_fault(vma, addr,
380 FAULT_FLAG_WRITE | 380 FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
381 FAULT_FLAG_REMOTE);
382 else 381 else
383 ret = VM_FAULT_WRITE; 382 ret = VM_FAULT_WRITE;
384 put_page(page); 383 put_page(page);
@@ -532,8 +531,8 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
532 void *expected_mapping; 531 void *expected_mapping;
533 unsigned long kpfn; 532 unsigned long kpfn;
534 533
535 expected_mapping = (void *)stable_node + 534 expected_mapping = (void *)((unsigned long)stable_node |
536 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 535 PAGE_MAPPING_KSM);
537again: 536again:
538 kpfn = READ_ONCE(stable_node->kpfn); 537 kpfn = READ_ONCE(stable_node->kpfn);
539 page = pfn_to_page(kpfn); 538 page = pfn_to_page(kpfn);
diff --git a/mm/memblock.c b/mm/memblock.c
index ac1248933b31..ca099159b45a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -584,6 +584,9 @@ repeat:
584 nid, flags); 584 nid, flags);
585 } 585 }
586 586
587 if (!nr_new)
588 return 0;
589
587 /* 590 /*
588 * If this was the first round, resize array and repeat for actual 591 * If this was the first round, resize array and repeat for actual
589 * insertions; otherwise, merge and return. 592 * insertions; otherwise, merge and return.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5339c89dff63..f3a84c64f35c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1259,6 +1259,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1259 struct oom_control oc = { 1259 struct oom_control oc = {
1260 .zonelist = NULL, 1260 .zonelist = NULL,
1261 .nodemask = NULL, 1261 .nodemask = NULL,
1262 .memcg = memcg,
1262 .gfp_mask = gfp_mask, 1263 .gfp_mask = gfp_mask,
1263 .order = order, 1264 .order = order,
1264 }; 1265 };
@@ -1281,7 +1282,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1281 goto unlock; 1282 goto unlock;
1282 } 1283 }
1283 1284
1284 check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg); 1285 check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
1285 totalpages = mem_cgroup_get_limit(memcg) ? : 1; 1286 totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1286 for_each_mem_cgroup_tree(iter, memcg) { 1287 for_each_mem_cgroup_tree(iter, memcg) {
1287 struct css_task_iter it; 1288 struct css_task_iter it;
@@ -1289,7 +1290,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1289 1290
1290 css_task_iter_start(&iter->css, &it); 1291 css_task_iter_start(&iter->css, &it);
1291 while ((task = css_task_iter_next(&it))) { 1292 while ((task = css_task_iter_next(&it))) {
1292 switch (oom_scan_process_thread(&oc, task, totalpages)) { 1293 switch (oom_scan_process_thread(&oc, task)) {
1293 case OOM_SCAN_SELECT: 1294 case OOM_SCAN_SELECT:
1294 if (chosen) 1295 if (chosen)
1295 put_task_struct(chosen); 1296 put_task_struct(chosen);
@@ -1329,7 +1330,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1329 1330
1330 if (chosen) { 1331 if (chosen) {
1331 points = chosen_points * 1000 / totalpages; 1332 points = chosen_points * 1000 / totalpages;
1332 oom_kill_process(&oc, chosen, points, totalpages, memcg, 1333 oom_kill_process(&oc, chosen, points, totalpages,
1333 "Memory cgroup out of memory"); 1334 "Memory cgroup out of memory");
1334 } 1335 }
1335unlock: 1336unlock:
@@ -2272,20 +2273,30 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2272 current->memcg_kmem_skip_account = 0; 2273 current->memcg_kmem_skip_account = 0;
2273} 2274}
2274 2275
2275/* 2276static inline bool memcg_kmem_bypass(void)
2277{
2278 if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2279 return true;
2280 return false;
2281}
2282
2283/**
2284 * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2285 * @cachep: the original global kmem cache
2286 *
2276 * Return the kmem_cache we're supposed to use for a slab allocation. 2287 * Return the kmem_cache we're supposed to use for a slab allocation.
2277 * We try to use the current memcg's version of the cache. 2288 * We try to use the current memcg's version of the cache.
2278 * 2289 *
2279 * If the cache does not exist yet, if we are the first user of it, 2290 * If the cache does not exist yet, if we are the first user of it, we
2280 * we either create it immediately, if possible, or create it asynchronously 2291 * create it asynchronously in a workqueue and let the current allocation
2281 * in a workqueue. 2292 * go through with the original cache.
2282 * In the latter case, we will let the current allocation go through with
2283 * the original cache.
2284 * 2293 *
2285 * Can't be called in interrupt context or from kernel threads. 2294 * This function takes a reference to the cache it returns to assure it
2286 * This function needs to be called with rcu_read_lock() held. 2295 * won't get destroyed while we are working with it. Once the caller is
2296 * done with it, memcg_kmem_put_cache() must be called to release the
2297 * reference.
2287 */ 2298 */
2288struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) 2299struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2289{ 2300{
2290 struct mem_cgroup *memcg; 2301 struct mem_cgroup *memcg;
2291 struct kmem_cache *memcg_cachep; 2302 struct kmem_cache *memcg_cachep;
@@ -2293,10 +2304,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
2293 2304
2294 VM_BUG_ON(!is_root_cache(cachep)); 2305 VM_BUG_ON(!is_root_cache(cachep));
2295 2306
2296 if (cachep->flags & SLAB_ACCOUNT) 2307 if (memcg_kmem_bypass())
2297 gfp |= __GFP_ACCOUNT;
2298
2299 if (!(gfp & __GFP_ACCOUNT))
2300 return cachep; 2308 return cachep;
2301 2309
2302 if (current->memcg_kmem_skip_account) 2310 if (current->memcg_kmem_skip_account)
@@ -2329,14 +2337,27 @@ out:
2329 return cachep; 2337 return cachep;
2330} 2338}
2331 2339
2332void __memcg_kmem_put_cache(struct kmem_cache *cachep) 2340/**
2341 * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2342 * @cachep: the cache returned by memcg_kmem_get_cache
2343 */
2344void memcg_kmem_put_cache(struct kmem_cache *cachep)
2333{ 2345{
2334 if (!is_root_cache(cachep)) 2346 if (!is_root_cache(cachep))
2335 css_put(&cachep->memcg_params.memcg->css); 2347 css_put(&cachep->memcg_params.memcg->css);
2336} 2348}
2337 2349
2338int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, 2350/**
2339 struct mem_cgroup *memcg) 2351 * memcg_kmem_charge: charge a kmem page
2352 * @page: page to charge
2353 * @gfp: reclaim mode
2354 * @order: allocation order
2355 * @memcg: memory cgroup to charge
2356 *
2357 * Returns 0 on success, an error code on failure.
2358 */
2359int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2360 struct mem_cgroup *memcg)
2340{ 2361{
2341 unsigned int nr_pages = 1 << order; 2362 unsigned int nr_pages = 1 << order;
2342 struct page_counter *counter; 2363 struct page_counter *counter;
@@ -2357,19 +2378,34 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2357 return 0; 2378 return 0;
2358} 2379}
2359 2380
2360int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) 2381/**
2382 * memcg_kmem_charge: charge a kmem page to the current memory cgroup
2383 * @page: page to charge
2384 * @gfp: reclaim mode
2385 * @order: allocation order
2386 *
2387 * Returns 0 on success, an error code on failure.
2388 */
2389int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2361{ 2390{
2362 struct mem_cgroup *memcg; 2391 struct mem_cgroup *memcg;
2363 int ret = 0; 2392 int ret = 0;
2364 2393
2394 if (memcg_kmem_bypass())
2395 return 0;
2396
2365 memcg = get_mem_cgroup_from_mm(current->mm); 2397 memcg = get_mem_cgroup_from_mm(current->mm);
2366 if (!mem_cgroup_is_root(memcg)) 2398 if (!mem_cgroup_is_root(memcg))
2367 ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); 2399 ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2368 css_put(&memcg->css); 2400 css_put(&memcg->css);
2369 return ret; 2401 return ret;
2370} 2402}
2371 2403/**
2372void __memcg_kmem_uncharge(struct page *page, int order) 2404 * memcg_kmem_uncharge: uncharge a kmem page
2405 * @page: page to uncharge
2406 * @order: allocation order
2407 */
2408void memcg_kmem_uncharge(struct page *page, int order)
2373{ 2409{
2374 struct mem_cgroup *memcg = page->mem_cgroup; 2410 struct mem_cgroup *memcg = page->mem_cgroup;
2375 unsigned int nr_pages = 1 << order; 2411 unsigned int nr_pages = 1 << order;
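The renamed memcg_kmem_charge()/memcg_kmem_uncharge() pair above now does its own bypass check instead of deriving it from GFP flags. A loose userspace sketch of the charge/bypass/uncharge shape, with hypothetical names; the real code records on the page which cgroup was charged:

/*
 * Illustrative userspace sketch, not kernel code: allocation charges an
 * accounting counter unless the caller is in a bypassed context, and freeing
 * undoes the charge. Names and the bypass condition are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

static long charged_pages;
static bool irq_like_context;	/* stand-in for in_interrupt()/PF_KTHREAD */

static bool charge_bypass(void)
{
	return irq_like_context;
}

static int charge_pages(unsigned int nr_pages)
{
	if (charge_bypass())
		return 0;	/* allocation proceeds unaccounted */
	charged_pages += nr_pages;
	return 0;
}

static void uncharge_pages(unsigned int nr_pages)
{
	charged_pages -= nr_pages;
}

int main(void)
{
	unsigned int order = 2;

	charge_pages(1u << order);	/* order-2 allocation: 4 pages */
	printf("charged: %ld\n", charged_pages);
	uncharge_pages(1u << order);
	printf("charged: %ld\n", charged_pages);
	return 0;
}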
@@ -4409,7 +4445,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4409 4445
4410#ifdef CONFIG_SWAP 4446#ifdef CONFIG_SWAP
4411static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4447static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4412 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4448 pte_t ptent, swp_entry_t *entry)
4413{ 4449{
4414 struct page *page = NULL; 4450 struct page *page = NULL;
4415 swp_entry_t ent = pte_to_swp_entry(ptent); 4451 swp_entry_t ent = pte_to_swp_entry(ptent);
@@ -4428,7 +4464,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4428} 4464}
4429#else 4465#else
4430static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 4466static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4431 unsigned long addr, pte_t ptent, swp_entry_t *entry) 4467 pte_t ptent, swp_entry_t *entry)
4432{ 4468{
4433 return NULL; 4469 return NULL;
4434} 4470}
@@ -4471,7 +4507,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4471/** 4507/**
4472 * mem_cgroup_move_account - move account of the page 4508 * mem_cgroup_move_account - move account of the page
4473 * @page: the page 4509 * @page: the page
4474 * @nr_pages: number of regular pages (>1 for huge pages) 4510 * @compound: charge the page as compound or small page
4475 * @from: mem_cgroup which the page is moved from. 4511 * @from: mem_cgroup which the page is moved from.
4476 * @to: mem_cgroup which the page is moved to. @from != @to. 4512 * @to: mem_cgroup which the page is moved to. @from != @to.
4477 * 4513 *
@@ -4593,7 +4629,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4593 if (pte_present(ptent)) 4629 if (pte_present(ptent))
4594 page = mc_handle_present_pte(vma, addr, ptent); 4630 page = mc_handle_present_pte(vma, addr, ptent);
4595 else if (is_swap_pte(ptent)) 4631 else if (is_swap_pte(ptent))
4596 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 4632 page = mc_handle_swap_pte(vma, ptent, &ent);
4597 else if (pte_none(ptent)) 4633 else if (pte_none(ptent))
4598 page = mc_handle_file_pte(vma, addr, ptent, &ent); 4634 page = mc_handle_file_pte(vma, addr, ptent, &ent);
4599 4635
@@ -5333,6 +5369,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
5333 * @mm: mm context of the victim 5369 * @mm: mm context of the victim
5334 * @gfp_mask: reclaim mode 5370 * @gfp_mask: reclaim mode
5335 * @memcgp: charged memcg return 5371 * @memcgp: charged memcg return
5372 * @compound: charge the page as compound or small page
5336 * 5373 *
5337 * Try to charge @page to the memcg that @mm belongs to, reclaiming 5374 * Try to charge @page to the memcg that @mm belongs to, reclaiming
5338 * pages according to @gfp_mask if necessary. 5375 * pages according to @gfp_mask if necessary.
@@ -5395,6 +5432,7 @@ out:
5395 * @page: page to charge 5432 * @page: page to charge
5396 * @memcg: memcg to charge the page to 5433 * @memcg: memcg to charge the page to
5397 * @lrucare: page might be on LRU already 5434 * @lrucare: page might be on LRU already
5435 * @compound: charge the page as compound or small page
5398 * 5436 *
5399 * Finalize a charge transaction started by mem_cgroup_try_charge(), 5437 * Finalize a charge transaction started by mem_cgroup_try_charge(),
5400 * after page->mapping has been set up. This must happen atomically 5438 * after page->mapping has been set up. This must happen atomically
@@ -5446,6 +5484,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5446 * mem_cgroup_cancel_charge - cancel a page charge 5484 * mem_cgroup_cancel_charge - cancel a page charge
5447 * @page: page to charge 5485 * @page: page to charge
5448 * @memcg: memcg to charge the page to 5486 * @memcg: memcg to charge the page to
5487 * @compound: charge the page as compound or small page
5449 * 5488 *
5450 * Cancel a charge transaction started by mem_cgroup_try_charge(). 5489 * Cancel a charge transaction started by mem_cgroup_try_charge().
5451 */ 5490 */
@@ -5469,15 +5508,18 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
5469 5508
5470static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 5509static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
5471 unsigned long nr_anon, unsigned long nr_file, 5510 unsigned long nr_anon, unsigned long nr_file,
5472 unsigned long nr_huge, struct page *dummy_page) 5511 unsigned long nr_huge, unsigned long nr_kmem,
5512 struct page *dummy_page)
5473{ 5513{
5474 unsigned long nr_pages = nr_anon + nr_file; 5514 unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
5475 unsigned long flags; 5515 unsigned long flags;
5476 5516
5477 if (!mem_cgroup_is_root(memcg)) { 5517 if (!mem_cgroup_is_root(memcg)) {
5478 page_counter_uncharge(&memcg->memory, nr_pages); 5518 page_counter_uncharge(&memcg->memory, nr_pages);
5479 if (do_memsw_account()) 5519 if (do_memsw_account())
5480 page_counter_uncharge(&memcg->memsw, nr_pages); 5520 page_counter_uncharge(&memcg->memsw, nr_pages);
5521 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
5522 page_counter_uncharge(&memcg->kmem, nr_kmem);
5481 memcg_oom_recover(memcg); 5523 memcg_oom_recover(memcg);
5482 } 5524 }
5483 5525
@@ -5500,6 +5542,7 @@ static void uncharge_list(struct list_head *page_list)
5500 unsigned long nr_anon = 0; 5542 unsigned long nr_anon = 0;
5501 unsigned long nr_file = 0; 5543 unsigned long nr_file = 0;
5502 unsigned long nr_huge = 0; 5544 unsigned long nr_huge = 0;
5545 unsigned long nr_kmem = 0;
5503 unsigned long pgpgout = 0; 5546 unsigned long pgpgout = 0;
5504 struct list_head *next; 5547 struct list_head *next;
5505 struct page *page; 5548 struct page *page;
@@ -5510,8 +5553,6 @@ static void uncharge_list(struct list_head *page_list)
5510 */ 5553 */
5511 next = page_list->next; 5554 next = page_list->next;
5512 do { 5555 do {
5513 unsigned int nr_pages = 1;
5514
5515 page = list_entry(next, struct page, lru); 5556 page = list_entry(next, struct page, lru);
5516 next = page->lru.next; 5557 next = page->lru.next;
5517 5558
@@ -5530,31 +5571,34 @@ static void uncharge_list(struct list_head *page_list)
5530 if (memcg != page->mem_cgroup) { 5571 if (memcg != page->mem_cgroup) {
5531 if (memcg) { 5572 if (memcg) {
5532 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5573 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
5533 nr_huge, page); 5574 nr_huge, nr_kmem, page);
5534 pgpgout = nr_anon = nr_file = nr_huge = 0; 5575 pgpgout = nr_anon = nr_file =
5576 nr_huge = nr_kmem = 0;
5535 } 5577 }
5536 memcg = page->mem_cgroup; 5578 memcg = page->mem_cgroup;
5537 } 5579 }
5538 5580
5539 if (PageTransHuge(page)) { 5581 if (!PageKmemcg(page)) {
5540 nr_pages <<= compound_order(page); 5582 unsigned int nr_pages = 1;
5541 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
5542 nr_huge += nr_pages;
5543 }
5544 5583
5545 if (PageAnon(page)) 5584 if (PageTransHuge(page)) {
5546 nr_anon += nr_pages; 5585 nr_pages <<= compound_order(page);
5547 else 5586 nr_huge += nr_pages;
5548 nr_file += nr_pages; 5587 }
5588 if (PageAnon(page))
5589 nr_anon += nr_pages;
5590 else
5591 nr_file += nr_pages;
5592 pgpgout++;
5593 } else
5594 nr_kmem += 1 << compound_order(page);
5549 5595
5550 page->mem_cgroup = NULL; 5596 page->mem_cgroup = NULL;
5551
5552 pgpgout++;
5553 } while (next != page_list); 5597 } while (next != page_list);
5554 5598
5555 if (memcg) 5599 if (memcg)
5556 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5600 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
5557 nr_huge, page); 5601 nr_huge, nr_kmem, page);
5558} 5602}
5559 5603
5560/** 5604/**
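The uncharge_list()/uncharge_batch() hunks above accumulate per-type page counts for the current memcg and flush them whenever the owning memcg changes, with kmem pages now counted separately from anon and file pages. A small userspace sketch of that batching shape, with hypothetical names and types:

/*
 * Illustrative userspace sketch, not kernel code: counts are accumulated per
 * owner and flushed whenever the owner changes and once more at the end.
 * Names and types are hypothetical.
 */
#include <stdio.h>

struct item {
	int owner;
	int kind;		/* 0 = anon, 1 = file, 2 = kmem */
	unsigned long pages;
};

static void flush_batch(int owner, unsigned long anon, unsigned long file,
			unsigned long kmem)
{
	printf("owner %d: anon=%lu file=%lu kmem=%lu\n", owner, anon, file, kmem);
}

static void uncharge_all(const struct item *items, int n)
{
	unsigned long anon = 0, file = 0, kmem = 0;
	int owner = -1;

	for (int i = 0; i < n; i++) {
		if (owner != items[i].owner) {
			if (owner != -1) {
				flush_batch(owner, anon, file, kmem);
				anon = file = kmem = 0;
			}
			owner = items[i].owner;
		}
		switch (items[i].kind) {
		case 0: anon += items[i].pages; break;
		case 1: file += items[i].pages; break;
		default: kmem += items[i].pages; break;
		}
	}
	if (owner != -1)
		flush_batch(owner, anon, file, kmem);
}

int main(void)
{
	struct item items[] = {
		{ 1, 0, 1 }, { 1, 1, 4 }, { 2, 2, 8 },
	};

	uncharge_all(items, 3);
	return 0;
}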
diff --git a/mm/memory.c b/mm/memory.c
index 9e046819e619..4425b6059339 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -233,6 +233,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
233#ifdef CONFIG_HAVE_RCU_TABLE_FREE 233#ifdef CONFIG_HAVE_RCU_TABLE_FREE
234 tlb->batch = NULL; 234 tlb->batch = NULL;
235#endif 235#endif
236 tlb->page_size = 0;
236 237
237 __tlb_reset_range(tlb); 238 __tlb_reset_range(tlb);
238} 239}
@@ -292,23 +293,31 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e
292 * handling the additional races in SMP caused by other CPUs caching valid 293 * handling the additional races in SMP caused by other CPUs caching valid
293 * mappings in their TLBs. Returns the number of free page slots left. 294 * mappings in their TLBs. Returns the number of free page slots left.
294 * When out of page slots we must call tlb_flush_mmu(). 295 * When out of page slots we must call tlb_flush_mmu().
296 * Returns true if the caller should flush.
295 */ 297 */
296int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) 298bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
297{ 299{
298 struct mmu_gather_batch *batch; 300 struct mmu_gather_batch *batch;
299 301
300 VM_BUG_ON(!tlb->end); 302 VM_BUG_ON(!tlb->end);
301 303
304 if (!tlb->page_size)
305 tlb->page_size = page_size;
306 else {
307 if (page_size != tlb->page_size)
308 return true;
309 }
310
302 batch = tlb->active; 311 batch = tlb->active;
303 batch->pages[batch->nr++] = page;
304 if (batch->nr == batch->max) { 312 if (batch->nr == batch->max) {
305 if (!tlb_next_batch(tlb)) 313 if (!tlb_next_batch(tlb))
306 return 0; 314 return true;
307 batch = tlb->active; 315 batch = tlb->active;
308 } 316 }
309 VM_BUG_ON_PAGE(batch->nr > batch->max, page); 317 VM_BUG_ON_PAGE(batch->nr > batch->max, page);
310 318
311 return batch->max - batch->nr; 319 batch->pages[batch->nr++] = page;
320 return false;
312} 321}
313 322
314#endif /* HAVE_GENERIC_MMU_GATHER */ 323#endif /* HAVE_GENERIC_MMU_GATHER */
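__tlb_remove_page_size() above now returns a bool meaning "the caller must flush", either because the batch is full or because the page size changed mid-batch; the page is only queued when false is returned. A hedged userspace sketch of that contract, with hypothetical names:

/*
 * Illustrative userspace sketch, not kernel code: batch_add() returns true
 * when the caller must flush, either because the batch is full or because
 * the item size changed mid-batch. All names are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

#define BATCH_MAX 4

struct batch {
	int item_size;		/* 0 means "not decided yet" */
	int nr;
	void *items[BATCH_MAX];
};

/* Returns true if the caller should flush and retry the item. */
static bool batch_add(struct batch *b, void *item, int item_size)
{
	if (!b->item_size)
		b->item_size = item_size;
	else if (item_size != b->item_size)
		return true;	/* mixed sizes: flush before switching */

	if (b->nr == BATCH_MAX)
		return true;	/* no room left: flush first */

	b->items[b->nr++] = item;
	return false;
}

static void batch_flush(struct batch *b)
{
	printf("flushing %d item(s) of size %d\n", b->nr, b->item_size);
	b->nr = 0;
	b->item_size = 0;
}

int main(void)
{
	struct batch b = { 0 };
	int dummy;

	for (int i = 0; i < 6; i++) {
		if (batch_add(&b, &dummy, 4096)) {
			batch_flush(&b);
			batch_add(&b, &dummy, 4096);	/* retry after flush */
		}
	}
	batch_flush(&b);
	return 0;
}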
@@ -1109,6 +1118,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
1109 pte_t *start_pte; 1118 pte_t *start_pte;
1110 pte_t *pte; 1119 pte_t *pte;
1111 swp_entry_t entry; 1120 swp_entry_t entry;
1121 struct page *pending_page = NULL;
1112 1122
1113again: 1123again:
1114 init_rss_vec(rss); 1124 init_rss_vec(rss);
@@ -1132,7 +1142,7 @@ again:
1132 * unmap shared but keep private pages. 1142 * unmap shared but keep private pages.
1133 */ 1143 */
1134 if (details->check_mapping && 1144 if (details->check_mapping &&
1135 details->check_mapping != page->mapping) 1145 details->check_mapping != page_rmapping(page))
1136 continue; 1146 continue;
1137 } 1147 }
1138 ptent = ptep_get_and_clear_full(mm, addr, pte, 1148 ptent = ptep_get_and_clear_full(mm, addr, pte,
@@ -1160,8 +1170,9 @@ again:
1160 page_remove_rmap(page, false); 1170 page_remove_rmap(page, false);
1161 if (unlikely(page_mapcount(page) < 0)) 1171 if (unlikely(page_mapcount(page) < 0))
1162 print_bad_pte(vma, addr, ptent, page); 1172 print_bad_pte(vma, addr, ptent, page);
1163 if (unlikely(!__tlb_remove_page(tlb, page))) { 1173 if (unlikely(__tlb_remove_page(tlb, page))) {
1164 force_flush = 1; 1174 force_flush = 1;
1175 pending_page = page;
1165 addr += PAGE_SIZE; 1176 addr += PAGE_SIZE;
1166 break; 1177 break;
1167 } 1178 }
@@ -1202,7 +1213,11 @@ again:
1202 if (force_flush) { 1213 if (force_flush) {
1203 force_flush = 0; 1214 force_flush = 0;
1204 tlb_flush_mmu_free(tlb); 1215 tlb_flush_mmu_free(tlb);
1205 1216 if (pending_page) {
1217 /* remove the page with new size */
1218 __tlb_remove_pte_page(tlb, pending_page);
1219 pending_page = NULL;
1220 }
1206 if (addr != end) 1221 if (addr != end)
1207 goto again; 1222 goto again;
1208 } 1223 }
@@ -1479,7 +1494,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1479 /* Ok, finally just insert the thing.. */ 1494 /* Ok, finally just insert the thing.. */
1480 get_page(page); 1495 get_page(page);
1481 inc_mm_counter_fast(mm, mm_counter_file(page)); 1496 inc_mm_counter_fast(mm, mm_counter_file(page));
1482 page_add_file_rmap(page); 1497 page_add_file_rmap(page, false);
1483 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 1498 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1484 1499
1485 retval = 0; 1500 retval = 0;
@@ -2055,13 +2070,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2055 * case, all we need to do here is to mark the page as writable and update 2070 * case, all we need to do here is to mark the page as writable and update
2056 * any related book-keeping. 2071 * any related book-keeping.
2057 */ 2072 */
2058static inline int wp_page_reuse(struct mm_struct *mm, 2073static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
2059 struct vm_area_struct *vma, unsigned long address, 2074 struct page *page, int page_mkwrite, int dirty_shared)
2060 pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, 2075 __releases(fe->ptl)
2061 struct page *page, int page_mkwrite,
2062 int dirty_shared)
2063 __releases(ptl)
2064{ 2076{
2077 struct vm_area_struct *vma = fe->vma;
2065 pte_t entry; 2078 pte_t entry;
2066 /* 2079 /*
2067 * Clear the pages cpupid information as the existing 2080 * Clear the pages cpupid information as the existing
@@ -2071,12 +2084,12 @@ static inline int wp_page_reuse(struct mm_struct *mm,
2071 if (page) 2084 if (page)
2072 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); 2085 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2073 2086
2074 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2087 flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
2075 entry = pte_mkyoung(orig_pte); 2088 entry = pte_mkyoung(orig_pte);
2076 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2089 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2077 if (ptep_set_access_flags(vma, address, page_table, entry, 1)) 2090 if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1))
2078 update_mmu_cache(vma, address, page_table); 2091 update_mmu_cache(vma, fe->address, fe->pte);
2079 pte_unmap_unlock(page_table, ptl); 2092 pte_unmap_unlock(fe->pte, fe->ptl);
2080 2093
2081 if (dirty_shared) { 2094 if (dirty_shared) {
2082 struct address_space *mapping; 2095 struct address_space *mapping;
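The wp_page_reuse() hunk above is the first of many in this file that replace long (mm, vma, address, page_table, pmd, ptl, flags) argument lists with a single fault_env pointer. The sketch below lists the fields this diff actually dereferences through fe->; it is inferred from usage, not the kernel's definition, and the stub types and the prealloc_pte type are guesses:

/*
 * Sketch of the fault_env bundle as inferred from the fe-> uses in this diff;
 * not the kernel definition. Stub types stand in for the real kernel types so
 * the sketch compiles standalone.
 */
struct vm_area_struct_stub;		/* opaque here */
struct spinlock_stub;
typedef unsigned long pte_stub_t;
typedef unsigned long pmd_stub_t;

struct fault_env_sketch {
	struct vm_area_struct_stub *vma;	/* faulting VMA */
	unsigned long address;			/* faulting virtual address */
	unsigned int flags;			/* FAULT_FLAG_* bits */
	pmd_stub_t *pmd;			/* PMD of the faulting address */
	pte_stub_t *pte;			/* mapped and locked PTE, or NULL */
	struct spinlock_stub *ptl;		/* page table lock for the PTE */
	void *prealloc_pte;			/* preallocated page table, if any */
};

int main(void)
{
	struct fault_env_sketch fe = { 0 };

	(void)fe;
	return 0;
}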
@@ -2122,30 +2135,31 @@ static inline int wp_page_reuse(struct mm_struct *mm,
2122 * held to the old page, as well as updating the rmap. 2135 * held to the old page, as well as updating the rmap.
2123 * - In any case, unlock the PTL and drop the reference we took to the old page. 2136 * - In any case, unlock the PTL and drop the reference we took to the old page.
2124 */ 2137 */
2125static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, 2138static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
2126 unsigned long address, pte_t *page_table, pmd_t *pmd, 2139 struct page *old_page)
2127 pte_t orig_pte, struct page *old_page)
2128{ 2140{
2141 struct vm_area_struct *vma = fe->vma;
2142 struct mm_struct *mm = vma->vm_mm;
2129 struct page *new_page = NULL; 2143 struct page *new_page = NULL;
2130 spinlock_t *ptl = NULL;
2131 pte_t entry; 2144 pte_t entry;
2132 int page_copied = 0; 2145 int page_copied = 0;
2133 const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */ 2146 const unsigned long mmun_start = fe->address & PAGE_MASK;
2134 const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */ 2147 const unsigned long mmun_end = mmun_start + PAGE_SIZE;
2135 struct mem_cgroup *memcg; 2148 struct mem_cgroup *memcg;
2136 2149
2137 if (unlikely(anon_vma_prepare(vma))) 2150 if (unlikely(anon_vma_prepare(vma)))
2138 goto oom; 2151 goto oom;
2139 2152
2140 if (is_zero_pfn(pte_pfn(orig_pte))) { 2153 if (is_zero_pfn(pte_pfn(orig_pte))) {
2141 new_page = alloc_zeroed_user_highpage_movable(vma, address); 2154 new_page = alloc_zeroed_user_highpage_movable(vma, fe->address);
2142 if (!new_page) 2155 if (!new_page)
2143 goto oom; 2156 goto oom;
2144 } else { 2157 } else {
2145 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 2158 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
2159 fe->address);
2146 if (!new_page) 2160 if (!new_page)
2147 goto oom; 2161 goto oom;
2148 cow_user_page(new_page, old_page, address, vma); 2162 cow_user_page(new_page, old_page, fe->address, vma);
2149 } 2163 }
2150 2164
2151 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) 2165 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
@@ -2158,8 +2172,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2158 /* 2172 /*
2159 * Re-check the pte - we dropped the lock 2173 * Re-check the pte - we dropped the lock
2160 */ 2174 */
2161 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2175 fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
2162 if (likely(pte_same(*page_table, orig_pte))) { 2176 if (likely(pte_same(*fe->pte, orig_pte))) {
2163 if (old_page) { 2177 if (old_page) {
2164 if (!PageAnon(old_page)) { 2178 if (!PageAnon(old_page)) {
2165 dec_mm_counter_fast(mm, 2179 dec_mm_counter_fast(mm,
@@ -2169,7 +2183,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2169 } else { 2183 } else {
2170 inc_mm_counter_fast(mm, MM_ANONPAGES); 2184 inc_mm_counter_fast(mm, MM_ANONPAGES);
2171 } 2185 }
2172 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2186 flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
2173 entry = mk_pte(new_page, vma->vm_page_prot); 2187 entry = mk_pte(new_page, vma->vm_page_prot);
2174 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2188 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2175 /* 2189 /*
@@ -2178,8 +2192,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2178 * seen in the presence of one thread doing SMC and another 2192 * seen in the presence of one thread doing SMC and another
2179 * thread doing COW. 2193 * thread doing COW.
2180 */ 2194 */
2181 ptep_clear_flush_notify(vma, address, page_table); 2195 ptep_clear_flush_notify(vma, fe->address, fe->pte);
2182 page_add_new_anon_rmap(new_page, vma, address, false); 2196 page_add_new_anon_rmap(new_page, vma, fe->address, false);
2183 mem_cgroup_commit_charge(new_page, memcg, false, false); 2197 mem_cgroup_commit_charge(new_page, memcg, false, false);
2184 lru_cache_add_active_or_unevictable(new_page, vma); 2198 lru_cache_add_active_or_unevictable(new_page, vma);
2185 /* 2199 /*
@@ -2187,8 +2201,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2187 * mmu page tables (such as kvm shadow page tables), we want the 2201 * mmu page tables (such as kvm shadow page tables), we want the
2188 * new page to be mapped directly into the secondary page table. 2202 * new page to be mapped directly into the secondary page table.
2189 */ 2203 */
2190 set_pte_at_notify(mm, address, page_table, entry); 2204 set_pte_at_notify(mm, fe->address, fe->pte, entry);
2191 update_mmu_cache(vma, address, page_table); 2205 update_mmu_cache(vma, fe->address, fe->pte);
2192 if (old_page) { 2206 if (old_page) {
2193 /* 2207 /*
2194 * Only after switching the pte to the new page may 2208 * Only after switching the pte to the new page may
@@ -2225,7 +2239,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2225 if (new_page) 2239 if (new_page)
2226 put_page(new_page); 2240 put_page(new_page);
2227 2241
2228 pte_unmap_unlock(page_table, ptl); 2242 pte_unmap_unlock(fe->pte, fe->ptl);
2229 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2243 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2230 if (old_page) { 2244 if (old_page) {
2231 /* 2245 /*
@@ -2253,44 +2267,43 @@ oom:
2253 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED 2267 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
2254 * mapping 2268 * mapping
2255 */ 2269 */
2256static int wp_pfn_shared(struct mm_struct *mm, 2270static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte)
2257 struct vm_area_struct *vma, unsigned long address,
2258 pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
2259 pmd_t *pmd)
2260{ 2271{
2272 struct vm_area_struct *vma = fe->vma;
2273
2261 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { 2274 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2262 struct vm_fault vmf = { 2275 struct vm_fault vmf = {
2263 .page = NULL, 2276 .page = NULL,
2264 .pgoff = linear_page_index(vma, address), 2277 .pgoff = linear_page_index(vma, fe->address),
2265 .virtual_address = (void __user *)(address & PAGE_MASK), 2278 .virtual_address =
2279 (void __user *)(fe->address & PAGE_MASK),
2266 .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, 2280 .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
2267 }; 2281 };
2268 int ret; 2282 int ret;
2269 2283
2270 pte_unmap_unlock(page_table, ptl); 2284 pte_unmap_unlock(fe->pte, fe->ptl);
2271 ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); 2285 ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
2272 if (ret & VM_FAULT_ERROR) 2286 if (ret & VM_FAULT_ERROR)
2273 return ret; 2287 return ret;
2274 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2288 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2289 &fe->ptl);
2275 /* 2290 /*
2276 * We might have raced with another page fault while we 2291 * We might have raced with another page fault while we
2277 * released the pte_offset_map_lock. 2292 * released the pte_offset_map_lock.
2278 */ 2293 */
2279 if (!pte_same(*page_table, orig_pte)) { 2294 if (!pte_same(*fe->pte, orig_pte)) {
2280 pte_unmap_unlock(page_table, ptl); 2295 pte_unmap_unlock(fe->pte, fe->ptl);
2281 return 0; 2296 return 0;
2282 } 2297 }
2283 } 2298 }
2284 return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, 2299 return wp_page_reuse(fe, orig_pte, NULL, 0, 0);
2285 NULL, 0, 0);
2286} 2300}
2287 2301
2288static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, 2302static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
2289 unsigned long address, pte_t *page_table, 2303 struct page *old_page)
2290 pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, 2304 __releases(fe->ptl)
2291 struct page *old_page)
2292 __releases(ptl)
2293{ 2305{
2306 struct vm_area_struct *vma = fe->vma;
2294 int page_mkwrite = 0; 2307 int page_mkwrite = 0;
2295 2308
2296 get_page(old_page); 2309 get_page(old_page);
@@ -2298,8 +2311,8 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
2298 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 2311 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2299 int tmp; 2312 int tmp;
2300 2313
2301 pte_unmap_unlock(page_table, ptl); 2314 pte_unmap_unlock(fe->pte, fe->ptl);
2302 tmp = do_page_mkwrite(vma, old_page, address); 2315 tmp = do_page_mkwrite(vma, old_page, fe->address);
2303 if (unlikely(!tmp || (tmp & 2316 if (unlikely(!tmp || (tmp &
2304 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 2317 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2305 put_page(old_page); 2318 put_page(old_page);
@@ -2311,19 +2324,18 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
2311 * they did, we just return, as we can count on the 2324 * they did, we just return, as we can count on the
2312 * MMU to tell us if they didn't also make it writable. 2325 * MMU to tell us if they didn't also make it writable.
2313 */ 2326 */
2314 page_table = pte_offset_map_lock(mm, pmd, address, 2327 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2315 &ptl); 2328 &fe->ptl);
2316 if (!pte_same(*page_table, orig_pte)) { 2329 if (!pte_same(*fe->pte, orig_pte)) {
2317 unlock_page(old_page); 2330 unlock_page(old_page);
2318 pte_unmap_unlock(page_table, ptl); 2331 pte_unmap_unlock(fe->pte, fe->ptl);
2319 put_page(old_page); 2332 put_page(old_page);
2320 return 0; 2333 return 0;
2321 } 2334 }
2322 page_mkwrite = 1; 2335 page_mkwrite = 1;
2323 } 2336 }
2324 2337
2325 return wp_page_reuse(mm, vma, address, page_table, ptl, 2338 return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1);
2326 orig_pte, old_page, page_mkwrite, 1);
2327} 2339}
2328 2340
2329/* 2341/*
@@ -2344,14 +2356,13 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
2344 * but allow concurrent faults), with pte both mapped and locked. 2356 * but allow concurrent faults), with pte both mapped and locked.
2345 * We return with mmap_sem still held, but pte unmapped and unlocked. 2357 * We return with mmap_sem still held, but pte unmapped and unlocked.
2346 */ 2358 */
2347static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 2359static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
2348 unsigned long address, pte_t *page_table, pmd_t *pmd, 2360 __releases(fe->ptl)
2349 spinlock_t *ptl, pte_t orig_pte)
2350 __releases(ptl)
2351{ 2361{
2362 struct vm_area_struct *vma = fe->vma;
2352 struct page *old_page; 2363 struct page *old_page;
2353 2364
2354 old_page = vm_normal_page(vma, address, orig_pte); 2365 old_page = vm_normal_page(vma, fe->address, orig_pte);
2355 if (!old_page) { 2366 if (!old_page) {
2356 /* 2367 /*
2357 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a 2368 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
@@ -2362,12 +2373,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2362 */ 2373 */
2363 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2374 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2364 (VM_WRITE|VM_SHARED)) 2375 (VM_WRITE|VM_SHARED))
2365 return wp_pfn_shared(mm, vma, address, page_table, ptl, 2376 return wp_pfn_shared(fe, orig_pte);
2366 orig_pte, pmd);
2367 2377
2368 pte_unmap_unlock(page_table, ptl); 2378 pte_unmap_unlock(fe->pte, fe->ptl);
2369 return wp_page_copy(mm, vma, address, page_table, pmd, 2379 return wp_page_copy(fe, orig_pte, old_page);
2370 orig_pte, old_page);
2371 } 2380 }
2372 2381
2373 /* 2382 /*
@@ -2378,13 +2387,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2378 int total_mapcount; 2387 int total_mapcount;
2379 if (!trylock_page(old_page)) { 2388 if (!trylock_page(old_page)) {
2380 get_page(old_page); 2389 get_page(old_page);
2381 pte_unmap_unlock(page_table, ptl); 2390 pte_unmap_unlock(fe->pte, fe->ptl);
2382 lock_page(old_page); 2391 lock_page(old_page);
2383 page_table = pte_offset_map_lock(mm, pmd, address, 2392 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
2384 &ptl); 2393 fe->address, &fe->ptl);
2385 if (!pte_same(*page_table, orig_pte)) { 2394 if (!pte_same(*fe->pte, orig_pte)) {
2386 unlock_page(old_page); 2395 unlock_page(old_page);
2387 pte_unmap_unlock(page_table, ptl); 2396 pte_unmap_unlock(fe->pte, fe->ptl);
2388 put_page(old_page); 2397 put_page(old_page);
2389 return 0; 2398 return 0;
2390 } 2399 }
@@ -2402,14 +2411,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2402 page_move_anon_rmap(old_page, vma); 2411 page_move_anon_rmap(old_page, vma);
2403 } 2412 }
2404 unlock_page(old_page); 2413 unlock_page(old_page);
2405 return wp_page_reuse(mm, vma, address, page_table, ptl, 2414 return wp_page_reuse(fe, orig_pte, old_page, 0, 0);
2406 orig_pte, old_page, 0, 0);
2407 } 2415 }
2408 unlock_page(old_page); 2416 unlock_page(old_page);
2409 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2417 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2410 (VM_WRITE|VM_SHARED))) { 2418 (VM_WRITE|VM_SHARED))) {
2411 return wp_page_shared(mm, vma, address, page_table, pmd, 2419 return wp_page_shared(fe, orig_pte, old_page);
2412 ptl, orig_pte, old_page);
2413 } 2420 }
2414 2421
2415 /* 2422 /*
@@ -2417,9 +2424,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2417 */ 2424 */
2418 get_page(old_page); 2425 get_page(old_page);
2419 2426
2420 pte_unmap_unlock(page_table, ptl); 2427 pte_unmap_unlock(fe->pte, fe->ptl);
2421 return wp_page_copy(mm, vma, address, page_table, pmd, 2428 return wp_page_copy(fe, orig_pte, old_page);
2422 orig_pte, old_page);
2423} 2429}
2424 2430
2425static void unmap_mapping_range_vma(struct vm_area_struct *vma, 2431static void unmap_mapping_range_vma(struct vm_area_struct *vma,
@@ -2507,11 +2513,9 @@ EXPORT_SYMBOL(unmap_mapping_range);
2507 * We return with the mmap_sem locked or unlocked in the same cases 2513 * We return with the mmap_sem locked or unlocked in the same cases
2508 * as does filemap_fault(). 2514 * as does filemap_fault().
2509 */ 2515 */
2510static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, 2516int do_swap_page(struct fault_env *fe, pte_t orig_pte)
2511 unsigned long address, pte_t *page_table, pmd_t *pmd,
2512 unsigned int flags, pte_t orig_pte)
2513{ 2517{
2514 spinlock_t *ptl; 2518 struct vm_area_struct *vma = fe->vma;
2515 struct page *page, *swapcache; 2519 struct page *page, *swapcache;
2516 struct mem_cgroup *memcg; 2520 struct mem_cgroup *memcg;
2517 swp_entry_t entry; 2521 swp_entry_t entry;
@@ -2520,17 +2524,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2520 int exclusive = 0; 2524 int exclusive = 0;
2521 int ret = 0; 2525 int ret = 0;
2522 2526
2523 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2527 if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte))
2524 goto out; 2528 goto out;
2525 2529
2526 entry = pte_to_swp_entry(orig_pte); 2530 entry = pte_to_swp_entry(orig_pte);
2527 if (unlikely(non_swap_entry(entry))) { 2531 if (unlikely(non_swap_entry(entry))) {
2528 if (is_migration_entry(entry)) { 2532 if (is_migration_entry(entry)) {
2529 migration_entry_wait(mm, pmd, address); 2533 migration_entry_wait(vma->vm_mm, fe->pmd, fe->address);
2530 } else if (is_hwpoison_entry(entry)) { 2534 } else if (is_hwpoison_entry(entry)) {
2531 ret = VM_FAULT_HWPOISON; 2535 ret = VM_FAULT_HWPOISON;
2532 } else { 2536 } else {
2533 print_bad_pte(vma, address, orig_pte, NULL); 2537 print_bad_pte(vma, fe->address, orig_pte, NULL);
2534 ret = VM_FAULT_SIGBUS; 2538 ret = VM_FAULT_SIGBUS;
2535 } 2539 }
2536 goto out; 2540 goto out;
@@ -2539,14 +2543,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2539 page = lookup_swap_cache(entry); 2543 page = lookup_swap_cache(entry);
2540 if (!page) { 2544 if (!page) {
2541 page = swapin_readahead(entry, 2545 page = swapin_readahead(entry,
2542 GFP_HIGHUSER_MOVABLE, vma, address); 2546 GFP_HIGHUSER_MOVABLE, vma, fe->address);
2543 if (!page) { 2547 if (!page) {
2544 /* 2548 /*
2545 * Back out if somebody else faulted in this pte 2549 * Back out if somebody else faulted in this pte
2546 * while we released the pte lock. 2550 * while we released the pte lock.
2547 */ 2551 */
2548 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2552 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
2549 if (likely(pte_same(*page_table, orig_pte))) 2553 fe->address, &fe->ptl);
2554 if (likely(pte_same(*fe->pte, orig_pte)))
2550 ret = VM_FAULT_OOM; 2555 ret = VM_FAULT_OOM;
2551 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2556 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2552 goto unlock; 2557 goto unlock;
@@ -2555,7 +2560,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2555 /* Had to read the page from swap area: Major fault */ 2560 /* Had to read the page from swap area: Major fault */
2556 ret = VM_FAULT_MAJOR; 2561 ret = VM_FAULT_MAJOR;
2557 count_vm_event(PGMAJFAULT); 2562 count_vm_event(PGMAJFAULT);
2558 mem_cgroup_count_vm_event(mm, PGMAJFAULT); 2563 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
2559 } else if (PageHWPoison(page)) { 2564 } else if (PageHWPoison(page)) {
2560 /* 2565 /*
2561 * hwpoisoned dirty swapcache pages are kept for killing 2566 * hwpoisoned dirty swapcache pages are kept for killing
@@ -2568,7 +2573,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2568 } 2573 }
2569 2574
2570 swapcache = page; 2575 swapcache = page;
2571 locked = lock_page_or_retry(page, mm, flags); 2576 locked = lock_page_or_retry(page, vma->vm_mm, fe->flags);
2572 2577
2573 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2578 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2574 if (!locked) { 2579 if (!locked) {
@@ -2585,14 +2590,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2585 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) 2590 if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
2586 goto out_page; 2591 goto out_page;
2587 2592
2588 page = ksm_might_need_to_copy(page, vma, address); 2593 page = ksm_might_need_to_copy(page, vma, fe->address);
2589 if (unlikely(!page)) { 2594 if (unlikely(!page)) {
2590 ret = VM_FAULT_OOM; 2595 ret = VM_FAULT_OOM;
2591 page = swapcache; 2596 page = swapcache;
2592 goto out_page; 2597 goto out_page;
2593 } 2598 }
2594 2599
2595 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) { 2600 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
2601 &memcg, false)) {
2596 ret = VM_FAULT_OOM; 2602 ret = VM_FAULT_OOM;
2597 goto out_page; 2603 goto out_page;
2598 } 2604 }
@@ -2600,8 +2606,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2600 /* 2606 /*
2601 * Back out if somebody else already faulted in this pte. 2607 * Back out if somebody else already faulted in this pte.
2602 */ 2608 */
2603 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2609 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2604 if (unlikely(!pte_same(*page_table, orig_pte))) 2610 &fe->ptl);
2611 if (unlikely(!pte_same(*fe->pte, orig_pte)))
2605 goto out_nomap; 2612 goto out_nomap;
2606 2613
2607 if (unlikely(!PageUptodate(page))) { 2614 if (unlikely(!PageUptodate(page))) {
@@ -2619,24 +2626,24 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2619 * must be called after the swap_free(), or it will never succeed. 2626 * must be called after the swap_free(), or it will never succeed.
2620 */ 2627 */
2621 2628
2622 inc_mm_counter_fast(mm, MM_ANONPAGES); 2629 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2623 dec_mm_counter_fast(mm, MM_SWAPENTS); 2630 dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
2624 pte = mk_pte(page, vma->vm_page_prot); 2631 pte = mk_pte(page, vma->vm_page_prot);
2625 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { 2632 if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
2626 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2633 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2627 flags &= ~FAULT_FLAG_WRITE; 2634 fe->flags &= ~FAULT_FLAG_WRITE;
2628 ret |= VM_FAULT_WRITE; 2635 ret |= VM_FAULT_WRITE;
2629 exclusive = RMAP_EXCLUSIVE; 2636 exclusive = RMAP_EXCLUSIVE;
2630 } 2637 }
2631 flush_icache_page(vma, page); 2638 flush_icache_page(vma, page);
2632 if (pte_swp_soft_dirty(orig_pte)) 2639 if (pte_swp_soft_dirty(orig_pte))
2633 pte = pte_mksoft_dirty(pte); 2640 pte = pte_mksoft_dirty(pte);
2634 set_pte_at(mm, address, page_table, pte); 2641 set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
2635 if (page == swapcache) { 2642 if (page == swapcache) {
2636 do_page_add_anon_rmap(page, vma, address, exclusive); 2643 do_page_add_anon_rmap(page, vma, fe->address, exclusive);
2637 mem_cgroup_commit_charge(page, memcg, true, false); 2644 mem_cgroup_commit_charge(page, memcg, true, false);
2638 } else { /* ksm created a completely new copy */ 2645 } else { /* ksm created a completely new copy */
2639 page_add_new_anon_rmap(page, vma, address, false); 2646 page_add_new_anon_rmap(page, vma, fe->address, false);
2640 mem_cgroup_commit_charge(page, memcg, false, false); 2647 mem_cgroup_commit_charge(page, memcg, false, false);
2641 lru_cache_add_active_or_unevictable(page, vma); 2648 lru_cache_add_active_or_unevictable(page, vma);
2642 } 2649 }
@@ -2659,22 +2666,22 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2659 put_page(swapcache); 2666 put_page(swapcache);
2660 } 2667 }
2661 2668
2662 if (flags & FAULT_FLAG_WRITE) { 2669 if (fe->flags & FAULT_FLAG_WRITE) {
2663 ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); 2670 ret |= do_wp_page(fe, pte);
2664 if (ret & VM_FAULT_ERROR) 2671 if (ret & VM_FAULT_ERROR)
2665 ret &= VM_FAULT_ERROR; 2672 ret &= VM_FAULT_ERROR;
2666 goto out; 2673 goto out;
2667 } 2674 }
2668 2675
2669 /* No need to invalidate - it was non-present before */ 2676 /* No need to invalidate - it was non-present before */
2670 update_mmu_cache(vma, address, page_table); 2677 update_mmu_cache(vma, fe->address, fe->pte);
2671unlock: 2678unlock:
2672 pte_unmap_unlock(page_table, ptl); 2679 pte_unmap_unlock(fe->pte, fe->ptl);
2673out: 2680out:
2674 return ret; 2681 return ret;
2675out_nomap: 2682out_nomap:
2676 mem_cgroup_cancel_charge(page, memcg, false); 2683 mem_cgroup_cancel_charge(page, memcg, false);
2677 pte_unmap_unlock(page_table, ptl); 2684 pte_unmap_unlock(fe->pte, fe->ptl);
2678out_page: 2685out_page:
2679 unlock_page(page); 2686 unlock_page(page);
2680out_release: 2687out_release:
@@ -2725,37 +2732,51 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
2725 * but allow concurrent faults), and pte mapped but not yet locked. 2732 * but allow concurrent faults), and pte mapped but not yet locked.
2726 * We return with mmap_sem still held, but pte unmapped and unlocked. 2733 * We return with mmap_sem still held, but pte unmapped and unlocked.
2727 */ 2734 */
2728static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 2735static int do_anonymous_page(struct fault_env *fe)
2729 unsigned long address, pte_t *page_table, pmd_t *pmd,
2730 unsigned int flags)
2731{ 2736{
2737 struct vm_area_struct *vma = fe->vma;
2732 struct mem_cgroup *memcg; 2738 struct mem_cgroup *memcg;
2733 struct page *page; 2739 struct page *page;
2734 spinlock_t *ptl;
2735 pte_t entry; 2740 pte_t entry;
2736 2741
2737 pte_unmap(page_table);
2738
2739 /* File mapping without ->vm_ops ? */ 2742 /* File mapping without ->vm_ops ? */
2740 if (vma->vm_flags & VM_SHARED) 2743 if (vma->vm_flags & VM_SHARED)
2741 return VM_FAULT_SIGBUS; 2744 return VM_FAULT_SIGBUS;
2742 2745
2743 /* Check if we need to add a guard page to the stack */ 2746 /* Check if we need to add a guard page to the stack */
2744 if (check_stack_guard_page(vma, address) < 0) 2747 if (check_stack_guard_page(vma, fe->address) < 0)
2745 return VM_FAULT_SIGSEGV; 2748 return VM_FAULT_SIGSEGV;
2746 2749
2750 /*
2751 * Use pte_alloc() instead of pte_alloc_map(). We can't run
2752 * pte_offset_map() on pmds where a huge pmd might be created
2753 * from a different thread.
2754 *
2755 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
2756 * parallel threads are excluded by other means.
2757 *
2758 * Here we only have down_read(mmap_sem).
2759 */
2760 if (pte_alloc(vma->vm_mm, fe->pmd, fe->address))
2761 return VM_FAULT_OOM;
2762
2763 /* See the comment in pte_alloc_one_map() */
2764 if (unlikely(pmd_trans_unstable(fe->pmd)))
2765 return 0;
2766
2747 /* Use the zero-page for reads */ 2767 /* Use the zero-page for reads */
2748 if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) { 2768 if (!(fe->flags & FAULT_FLAG_WRITE) &&
2749 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), 2769 !mm_forbids_zeropage(vma->vm_mm)) {
2770 entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
2750 vma->vm_page_prot)); 2771 vma->vm_page_prot));
2751 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2772 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2752 if (!pte_none(*page_table)) 2773 &fe->ptl);
2774 if (!pte_none(*fe->pte))
2753 goto unlock; 2775 goto unlock;
2754 /* Deliver the page fault to userland, check inside PT lock */ 2776 /* Deliver the page fault to userland, check inside PT lock */
2755 if (userfaultfd_missing(vma)) { 2777 if (userfaultfd_missing(vma)) {
2756 pte_unmap_unlock(page_table, ptl); 2778 pte_unmap_unlock(fe->pte, fe->ptl);
2757 return handle_userfault(vma, address, flags, 2779 return handle_userfault(fe, VM_UFFD_MISSING);
2758 VM_UFFD_MISSING);
2759 } 2780 }
2760 goto setpte; 2781 goto setpte;
2761 } 2782 }
@@ -2763,11 +2784,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2763 /* Allocate our own private page. */ 2784 /* Allocate our own private page. */
2764 if (unlikely(anon_vma_prepare(vma))) 2785 if (unlikely(anon_vma_prepare(vma)))
2765 goto oom; 2786 goto oom;
2766 page = alloc_zeroed_user_highpage_movable(vma, address); 2787 page = alloc_zeroed_user_highpage_movable(vma, fe->address);
2767 if (!page) 2788 if (!page)
2768 goto oom; 2789 goto oom;
2769 2790
2770 if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) 2791 if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
2771 goto oom_free_page; 2792 goto oom_free_page;
2772 2793
2773 /* 2794 /*
@@ -2781,30 +2802,30 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2781 if (vma->vm_flags & VM_WRITE) 2802 if (vma->vm_flags & VM_WRITE)
2782 entry = pte_mkwrite(pte_mkdirty(entry)); 2803 entry = pte_mkwrite(pte_mkdirty(entry));
2783 2804
2784 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2805 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2785 if (!pte_none(*page_table)) 2806 &fe->ptl);
2807 if (!pte_none(*fe->pte))
2786 goto release; 2808 goto release;
2787 2809
2788 /* Deliver the page fault to userland, check inside PT lock */ 2810 /* Deliver the page fault to userland, check inside PT lock */
2789 if (userfaultfd_missing(vma)) { 2811 if (userfaultfd_missing(vma)) {
2790 pte_unmap_unlock(page_table, ptl); 2812 pte_unmap_unlock(fe->pte, fe->ptl);
2791 mem_cgroup_cancel_charge(page, memcg, false); 2813 mem_cgroup_cancel_charge(page, memcg, false);
2792 put_page(page); 2814 put_page(page);
2793 return handle_userfault(vma, address, flags, 2815 return handle_userfault(fe, VM_UFFD_MISSING);
2794 VM_UFFD_MISSING);
2795 } 2816 }
2796 2817
2797 inc_mm_counter_fast(mm, MM_ANONPAGES); 2818 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2798 page_add_new_anon_rmap(page, vma, address, false); 2819 page_add_new_anon_rmap(page, vma, fe->address, false);
2799 mem_cgroup_commit_charge(page, memcg, false, false); 2820 mem_cgroup_commit_charge(page, memcg, false, false);
2800 lru_cache_add_active_or_unevictable(page, vma); 2821 lru_cache_add_active_or_unevictable(page, vma);
2801setpte: 2822setpte:
2802 set_pte_at(mm, address, page_table, entry); 2823 set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
2803 2824
2804 /* No need to invalidate - it was non-present before */ 2825 /* No need to invalidate - it was non-present before */
2805 update_mmu_cache(vma, address, page_table); 2826 update_mmu_cache(vma, fe->address, fe->pte);
2806unlock: 2827unlock:
2807 pte_unmap_unlock(page_table, ptl); 2828 pte_unmap_unlock(fe->pte, fe->ptl);
2808 return 0; 2829 return 0;
2809release: 2830release:
2810 mem_cgroup_cancel_charge(page, memcg, false); 2831 mem_cgroup_cancel_charge(page, memcg, false);
@@ -2821,17 +2842,16 @@ oom:
2821 * released depending on flags and vma->vm_ops->fault() return value. 2842 * released depending on flags and vma->vm_ops->fault() return value.
2822 * See filemap_fault() and __lock_page_retry(). 2843 * See filemap_fault() and __lock_page_retry().
2823 */ 2844 */
2824static int __do_fault(struct vm_area_struct *vma, unsigned long address, 2845static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
2825 pgoff_t pgoff, unsigned int flags, 2846 struct page *cow_page, struct page **page, void **entry)
2826 struct page *cow_page, struct page **page,
2827 void **entry)
2828{ 2847{
2848 struct vm_area_struct *vma = fe->vma;
2829 struct vm_fault vmf; 2849 struct vm_fault vmf;
2830 int ret; 2850 int ret;
2831 2851
2832 vmf.virtual_address = (void __user *)(address & PAGE_MASK); 2852 vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK);
2833 vmf.pgoff = pgoff; 2853 vmf.pgoff = pgoff;
2834 vmf.flags = flags; 2854 vmf.flags = fe->flags;
2835 vmf.page = NULL; 2855 vmf.page = NULL;
2836 vmf.gfp_mask = __get_fault_gfp_mask(vma); 2856 vmf.gfp_mask = __get_fault_gfp_mask(vma);
2837 vmf.cow_page = cow_page; 2857 vmf.cow_page = cow_page;
@@ -2860,41 +2880,168 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
2860 return ret; 2880 return ret;
2861} 2881}
2862 2882
2883static int pte_alloc_one_map(struct fault_env *fe)
2884{
2885 struct vm_area_struct *vma = fe->vma;
2886
2887 if (!pmd_none(*fe->pmd))
2888 goto map_pte;
2889 if (fe->prealloc_pte) {
2890 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
2891 if (unlikely(!pmd_none(*fe->pmd))) {
2892 spin_unlock(fe->ptl);
2893 goto map_pte;
2894 }
2895
2896 atomic_long_inc(&vma->vm_mm->nr_ptes);
2897 pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte);
2898 spin_unlock(fe->ptl);
2899 fe->prealloc_pte = 0;
2900 } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) {
2901 return VM_FAULT_OOM;
2902 }
2903map_pte:
2904 /*
2905 * If a huge pmd materialized under us just retry later. Use
2906 * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
2907 * didn't become pmd_trans_huge under us and then back to pmd_none, as
2908 * a result of MADV_DONTNEED running immediately after a huge pmd fault
2909 * in a different thread of this mm, in turn leading to a misleading
2910 * pmd_trans_huge() retval. All we have to ensure is that it is a
2911 * regular pmd that we can walk with pte_offset_map() and we can do that
2912 * through an atomic read in C, which is what pmd_trans_unstable()
2913 * provides.
2914 */
2915 if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
2916 return VM_FAULT_NOPAGE;
2917
2918 fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
2919 &fe->ptl);
2920 return 0;
2921}
2922
2923#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
2924
2925#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
2926static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
2927 unsigned long haddr)
2928{
2929 if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
2930 (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
2931 return false;
2932 if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
2933 return false;
2934 return true;
2935}
2936
2937static int do_set_pmd(struct fault_env *fe, struct page *page)
2938{
2939 struct vm_area_struct *vma = fe->vma;
2940 bool write = fe->flags & FAULT_FLAG_WRITE;
2941 unsigned long haddr = fe->address & HPAGE_PMD_MASK;
2942 pmd_t entry;
2943 int i, ret;
2944
2945 if (!transhuge_vma_suitable(vma, haddr))
2946 return VM_FAULT_FALLBACK;
2947
2948 ret = VM_FAULT_FALLBACK;
2949 page = compound_head(page);
2950
2951 fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
2952 if (unlikely(!pmd_none(*fe->pmd)))
2953 goto out;
2954
2955 for (i = 0; i < HPAGE_PMD_NR; i++)
2956 flush_icache_page(vma, page + i);
2957
2958 entry = mk_huge_pmd(page, vma->vm_page_prot);
2959 if (write)
2960 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
2961
2962 add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR);
2963 page_add_file_rmap(page, true);
2964
2965 set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry);
2966
2967 update_mmu_cache_pmd(vma, haddr, fe->pmd);
2968
2969 /* fault is handled */
2970 ret = 0;
2971 count_vm_event(THP_FILE_MAPPED);
2972out:
2973 spin_unlock(fe->ptl);
2974 return ret;
2975}
2976#else
2977static int do_set_pmd(struct fault_env *fe, struct page *page)
2978{
2979 BUILD_BUG();
2980 return 0;
2981}
2982#endif
2983
2863/** 2984/**
2864 * do_set_pte - setup new PTE entry for given page and add reverse page mapping. 2985 * alloc_set_pte - setup new PTE entry for given page and add reverse page
2986 * mapping. If needed, the function allocates a page table or uses the pre-allocated one.
2865 * 2987 *
2866 * @vma: virtual memory area 2988 * @fe: fault environment
2867 * @address: user virtual address 2989 * @memcg: memcg to charge page (only for private mappings)
2868 * @page: page to map 2990 * @page: page to map
2869 * @pte: pointer to target page table entry
2870 * @write: true, if new entry is writable
2871 * @anon: true, if it's anonymous page
2872 * 2991 *
2873 * Caller must hold page table lock relevant for @pte. 2992 * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return.
2874 * 2993 *
2875 * Target users are page handler itself and implementations of 2994 * Target users are page handler itself and implementations of
2876 * vm_ops->map_pages. 2995 * vm_ops->map_pages.
2877 */ 2996 */
2878void do_set_pte(struct vm_area_struct *vma, unsigned long address, 2997int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
2879 struct page *page, pte_t *pte, bool write, bool anon) 2998 struct page *page)
2880{ 2999{
3000 struct vm_area_struct *vma = fe->vma;
3001 bool write = fe->flags & FAULT_FLAG_WRITE;
2881 pte_t entry; 3002 pte_t entry;
3003 int ret;
3004
3005 if (pmd_none(*fe->pmd) && PageTransCompound(page) &&
3006 IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
3007 /* THP on COW? */
3008 VM_BUG_ON_PAGE(memcg, page);
3009
3010 ret = do_set_pmd(fe, page);
3011 if (ret != VM_FAULT_FALLBACK)
3012 return ret;
3013 }
3014
3015 if (!fe->pte) {
3016 ret = pte_alloc_one_map(fe);
3017 if (ret)
3018 return ret;
3019 }
3020
3021 /* Re-check under ptl */
3022 if (unlikely(!pte_none(*fe->pte)))
3023 return VM_FAULT_NOPAGE;
2882 3024
2883 flush_icache_page(vma, page); 3025 flush_icache_page(vma, page);
2884 entry = mk_pte(page, vma->vm_page_prot); 3026 entry = mk_pte(page, vma->vm_page_prot);
2885 if (write) 3027 if (write)
2886 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 3028 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2887 if (anon) { 3029 /* copy-on-write page */
3030 if (write && !(vma->vm_flags & VM_SHARED)) {
2888 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 3031 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2889 page_add_new_anon_rmap(page, vma, address, false); 3032 page_add_new_anon_rmap(page, vma, fe->address, false);
3033 mem_cgroup_commit_charge(page, memcg, false, false);
3034 lru_cache_add_active_or_unevictable(page, vma);
2890 } else { 3035 } else {
2891 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); 3036 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
2892 page_add_file_rmap(page); 3037 page_add_file_rmap(page, false);
2893 } 3038 }
2894 set_pte_at(vma->vm_mm, address, pte, entry); 3039 set_pte_at(vma->vm_mm, fe->address, fe->pte, entry);
2895 3040
2896 /* no need to invalidate: a not-present page won't be cached */ 3041 /* no need to invalidate: a not-present page won't be cached */
2897 update_mmu_cache(vma, address, pte); 3042 update_mmu_cache(vma, fe->address, fe->pte);
3043
3044 return 0;
2898} 3045}
2899 3046
2900static unsigned long fault_around_bytes __read_mostly = 3047static unsigned long fault_around_bytes __read_mostly =
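do_set_pmd() above only maps a PMD-sized page when transhuge_vma_suitable() says the virtual address and the file offset line up on the same huge-page boundary and the whole huge range fits inside the VMA. A runnable userspace sketch of that check, with hypothetical constants for a 2M huge page:

/*
 * Illustrative userspace sketch, not kernel code: a PMD-sized page is only
 * mappable if the virtual address and the file offset are congruent modulo
 * the huge page size and the huge range fits in the VMA. Constants and names
 * are hypothetical.
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define HPAGE_NR	512		/* 2M huge page = 512 base pages */
#define HPAGE_SIZE	((unsigned long)HPAGE_NR << PAGE_SHIFT)
#define HPAGE_INDEX_MASK	(HPAGE_NR - 1)

static bool hugepage_suitable(unsigned long vm_start, unsigned long vm_end,
			      unsigned long vm_pgoff, unsigned long haddr)
{
	/* vaddr and file offset must share the same huge-page alignment */
	if (((vm_start >> PAGE_SHIFT) & HPAGE_INDEX_MASK) !=
	    (vm_pgoff & HPAGE_INDEX_MASK))
		return false;
	/* the whole huge range must lie inside the VMA */
	if (haddr < vm_start || haddr + HPAGE_SIZE > vm_end)
		return false;
	return true;
}

int main(void)
{
	unsigned long start = 0x200000, end = 0x800000, pgoff = 0;

	printf("%d\n", hugepage_suitable(start, end, pgoff, 0x400000));	/* 1 */
	printf("%d\n", hugepage_suitable(start, end, pgoff + 1, 0x400000));	/* 0 */
	return 0;
}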
@@ -2961,57 +3108,66 @@ late_initcall(fault_around_debugfs);
2961 * fault_around_pages() value (and therefore to page order). This way it's 3108 * fault_around_pages() value (and therefore to page order). This way it's
2962 * easier to guarantee that we don't cross page table boundaries. 3109 * easier to guarantee that we don't cross page table boundaries.
2963 */ 3110 */
2964static void do_fault_around(struct vm_area_struct *vma, unsigned long address, 3111static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
2965 pte_t *pte, pgoff_t pgoff, unsigned int flags)
2966{ 3112{
2967 unsigned long start_addr, nr_pages, mask; 3113 unsigned long address = fe->address, nr_pages, mask;
2968 pgoff_t max_pgoff; 3114 pgoff_t end_pgoff;
2969 struct vm_fault vmf; 3115 int off, ret = 0;
2970 int off;
2971 3116
2972 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; 3117 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
2973 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; 3118 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
2974 3119
2975 start_addr = max(address & mask, vma->vm_start); 3120 fe->address = max(address & mask, fe->vma->vm_start);
2976 off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); 3121 off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
2977 pte -= off; 3122 start_pgoff -= off;
2978 pgoff -= off;
2979 3123
2980 /* 3124 /*
2981 * max_pgoff is either end of page table or end of vma 3125 * end_pgoff is either end of page table or end of vma
2982 * or fault_around_pages() from pgoff, depending what is nearest. 3126 * or fault_around_pages() from start_pgoff, depending what is nearest.
2983 */ 3127 */
2984 max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + 3128 end_pgoff = start_pgoff -
3129 ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
2985 PTRS_PER_PTE - 1; 3130 PTRS_PER_PTE - 1;
2986 max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, 3131 end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
2987 pgoff + nr_pages - 1); 3132 start_pgoff + nr_pages - 1);
2988 3133
2989 /* Check if it makes any sense to call ->map_pages */ 3134 if (pmd_none(*fe->pmd)) {
2990 while (!pte_none(*pte)) { 3135 fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address);
2991 if (++pgoff > max_pgoff) 3136 smp_wmb(); /* See comment in __pte_alloc() */
2992 return;
2993 start_addr += PAGE_SIZE;
2994 if (start_addr >= vma->vm_end)
2995 return;
2996 pte++;
2997 } 3137 }
2998 3138
2999 vmf.virtual_address = (void __user *) start_addr; 3139 fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
3000 vmf.pte = pte; 3140
3001 vmf.pgoff = pgoff; 3141 /* preallocated pagetable is unused: free it */
3002 vmf.max_pgoff = max_pgoff; 3142 if (fe->prealloc_pte) {
3003 vmf.flags = flags; 3143 pte_free(fe->vma->vm_mm, fe->prealloc_pte);
3004 vmf.gfp_mask = __get_fault_gfp_mask(vma); 3144 fe->prealloc_pte = 0;
3005 vma->vm_ops->map_pages(vma, &vmf); 3145 }
3146 /* Huge page is mapped? Page fault is solved */
3147 if (pmd_trans_huge(*fe->pmd)) {
3148 ret = VM_FAULT_NOPAGE;
3149 goto out;
3150 }
3151
 3152	/* ->map_pages() hasn't done anything useful. Cold page cache? */
3153 if (!fe->pte)
3154 goto out;
3155
3156 /* check if the page fault is solved */
3157 fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
3158 if (!pte_none(*fe->pte))
3159 ret = VM_FAULT_NOPAGE;
3160 pte_unmap_unlock(fe->pte, fe->ptl);
3161out:
3162 fe->address = address;
3163 fe->pte = NULL;
3164 return ret;
3006} 3165}
3007 3166
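
A quick stand-alone illustration of the window arithmetic do_fault_around() now performs: align the faulting address down to the fault-around window, then clamp the page-offset range to the page table, the VMA and the configured window. All constants and sample addresses below (PAGE_SHIFT, PTRS_PER_PTE, fault_around_bytes, the VMA bounds) are assumed values for a 4K-page, x86-64-like layout, not taken from this patch.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PTRS_PER_PTE	512UL			/* illustrative x86-64 value */

static unsigned long min3ul(unsigned long a, unsigned long b, unsigned long c)
{
	unsigned long m = a < b ? a : b;
	return m < c ? m : c;
}

int main(void)
{
	unsigned long fault_around_bytes = 65536;	/* default 64KB window */
	unsigned long nr_pages = fault_around_bytes >> PAGE_SHIFT;
	unsigned long mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

	unsigned long address = 0x7f0000012345UL;	/* faulting address */
	unsigned long vm_start = 0x7f0000000000UL;	/* assumed VMA bounds */
	unsigned long vma_pages = 256, vm_pgoff = 0;
	unsigned long start_pgoff = (address - vm_start) >> PAGE_SHIFT;

	/* fe->address = max(address & mask, vma->vm_start) */
	unsigned long fa_address = (address & mask) > vm_start ?
					(address & mask) : vm_start;
	unsigned long off = ((address - fa_address) >> PAGE_SHIFT) &
				(PTRS_PER_PTE - 1);
	unsigned long end_pgoff;

	start_pgoff -= off;
	end_pgoff = start_pgoff -
		((fa_address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
		PTRS_PER_PTE - 1;
	end_pgoff = min3ul(end_pgoff, vma_pages + vm_pgoff - 1,
			   start_pgoff + nr_pages - 1);

	/* prints "map_pages window: pgoff 16..31" for these inputs */
	printf("map_pages window: pgoff %lu..%lu\n", start_pgoff, end_pgoff);
	return 0;
}
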
3008static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3167static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
3009 unsigned long address, pmd_t *pmd,
3010 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3011{ 3168{
3169 struct vm_area_struct *vma = fe->vma;
3012 struct page *fault_page; 3170 struct page *fault_page;
3013 spinlock_t *ptl;
3014 pte_t *pte;
3015 int ret = 0; 3171 int ret = 0;
3016 3172
3017 /* 3173 /*
@@ -3020,85 +3176,64 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3020 * something). 3176 * something).
3021 */ 3177 */
3022 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { 3178 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
3023 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 3179 ret = do_fault_around(fe, pgoff);
3024 do_fault_around(vma, address, pte, pgoff, flags); 3180 if (ret)
3025 if (!pte_same(*pte, orig_pte)) 3181 return ret;
3026 goto unlock_out;
3027 pte_unmap_unlock(pte, ptl);
3028 } 3182 }
3029 3183
3030 ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); 3184 ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
3031 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3185 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3032 return ret; 3186 return ret;
3033 3187
3034 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 3188 ret |= alloc_set_pte(fe, NULL, fault_page);
3035 if (unlikely(!pte_same(*pte, orig_pte))) { 3189 if (fe->pte)
3036 pte_unmap_unlock(pte, ptl); 3190 pte_unmap_unlock(fe->pte, fe->ptl);
3037 unlock_page(fault_page);
3038 put_page(fault_page);
3039 return ret;
3040 }
3041 do_set_pte(vma, address, fault_page, pte, false, false);
3042 unlock_page(fault_page); 3191 unlock_page(fault_page);
3043unlock_out: 3192 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3044 pte_unmap_unlock(pte, ptl); 3193 put_page(fault_page);
3045 return ret; 3194 return ret;
3046} 3195}
3047 3196
3048static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3197static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff)
3049 unsigned long address, pmd_t *pmd,
3050 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3051{ 3198{
3199 struct vm_area_struct *vma = fe->vma;
3052 struct page *fault_page, *new_page; 3200 struct page *fault_page, *new_page;
3053 void *fault_entry; 3201 void *fault_entry;
3054 struct mem_cgroup *memcg; 3202 struct mem_cgroup *memcg;
3055 spinlock_t *ptl;
3056 pte_t *pte;
3057 int ret; 3203 int ret;
3058 3204
3059 if (unlikely(anon_vma_prepare(vma))) 3205 if (unlikely(anon_vma_prepare(vma)))
3060 return VM_FAULT_OOM; 3206 return VM_FAULT_OOM;
3061 3207
3062 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 3208 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address);
3063 if (!new_page) 3209 if (!new_page)
3064 return VM_FAULT_OOM; 3210 return VM_FAULT_OOM;
3065 3211
3066 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) { 3212 if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL,
3213 &memcg, false)) {
3067 put_page(new_page); 3214 put_page(new_page);
3068 return VM_FAULT_OOM; 3215 return VM_FAULT_OOM;
3069 } 3216 }
3070 3217
3071 ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page, 3218 ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry);
3072 &fault_entry);
3073 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3219 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3074 goto uncharge_out; 3220 goto uncharge_out;
3075 3221
3076 if (!(ret & VM_FAULT_DAX_LOCKED)) 3222 if (!(ret & VM_FAULT_DAX_LOCKED))
3077 copy_user_highpage(new_page, fault_page, address, vma); 3223 copy_user_highpage(new_page, fault_page, fe->address, vma);
3078 __SetPageUptodate(new_page); 3224 __SetPageUptodate(new_page);
3079 3225
3080 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 3226 ret |= alloc_set_pte(fe, memcg, new_page);
3081 if (unlikely(!pte_same(*pte, orig_pte))) { 3227 if (fe->pte)
3082 pte_unmap_unlock(pte, ptl); 3228 pte_unmap_unlock(fe->pte, fe->ptl);
3083 if (!(ret & VM_FAULT_DAX_LOCKED)) {
3084 unlock_page(fault_page);
3085 put_page(fault_page);
3086 } else {
3087 dax_unlock_mapping_entry(vma->vm_file->f_mapping,
3088 pgoff);
3089 }
3090 goto uncharge_out;
3091 }
3092 do_set_pte(vma, address, new_page, pte, true, true);
3093 mem_cgroup_commit_charge(new_page, memcg, false, false);
3094 lru_cache_add_active_or_unevictable(new_page, vma);
3095 pte_unmap_unlock(pte, ptl);
3096 if (!(ret & VM_FAULT_DAX_LOCKED)) { 3229 if (!(ret & VM_FAULT_DAX_LOCKED)) {
3097 unlock_page(fault_page); 3230 unlock_page(fault_page);
3098 put_page(fault_page); 3231 put_page(fault_page);
3099 } else { 3232 } else {
3100 dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); 3233 dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
3101 } 3234 }
3235 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3236 goto uncharge_out;
3102 return ret; 3237 return ret;
3103uncharge_out: 3238uncharge_out:
3104 mem_cgroup_cancel_charge(new_page, memcg, false); 3239 mem_cgroup_cancel_charge(new_page, memcg, false);
@@ -3106,18 +3241,15 @@ uncharge_out:
3106 return ret; 3241 return ret;
3107} 3242}
3108 3243
3109static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3244static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
3110 unsigned long address, pmd_t *pmd,
3111 pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
3112{ 3245{
3246 struct vm_area_struct *vma = fe->vma;
3113 struct page *fault_page; 3247 struct page *fault_page;
3114 struct address_space *mapping; 3248 struct address_space *mapping;
3115 spinlock_t *ptl;
3116 pte_t *pte;
3117 int dirtied = 0; 3249 int dirtied = 0;
3118 int ret, tmp; 3250 int ret, tmp;
3119 3251
3120 ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); 3252 ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
3121 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) 3253 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
3122 return ret; 3254 return ret;
3123 3255
@@ -3127,7 +3259,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3127 */ 3259 */
3128 if (vma->vm_ops->page_mkwrite) { 3260 if (vma->vm_ops->page_mkwrite) {
3129 unlock_page(fault_page); 3261 unlock_page(fault_page);
3130 tmp = do_page_mkwrite(vma, fault_page, address); 3262 tmp = do_page_mkwrite(vma, fault_page, fe->address);
3131 if (unlikely(!tmp || 3263 if (unlikely(!tmp ||
3132 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { 3264 (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
3133 put_page(fault_page); 3265 put_page(fault_page);
@@ -3135,15 +3267,15 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3135 } 3267 }
3136 } 3268 }
3137 3269
3138 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 3270 ret |= alloc_set_pte(fe, NULL, fault_page);
3139 if (unlikely(!pte_same(*pte, orig_pte))) { 3271 if (fe->pte)
3140 pte_unmap_unlock(pte, ptl); 3272 pte_unmap_unlock(fe->pte, fe->ptl);
3273 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3274 VM_FAULT_RETRY))) {
3141 unlock_page(fault_page); 3275 unlock_page(fault_page);
3142 put_page(fault_page); 3276 put_page(fault_page);
3143 return ret; 3277 return ret;
3144 } 3278 }
3145 do_set_pte(vma, address, fault_page, pte, true, false);
3146 pte_unmap_unlock(pte, ptl);
3147 3279
3148 if (set_page_dirty(fault_page)) 3280 if (set_page_dirty(fault_page))
3149 dirtied = 1; 3281 dirtied = 1;
@@ -3175,23 +3307,19 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3175 * The mmap_sem may have been released depending on flags and our 3307 * The mmap_sem may have been released depending on flags and our
3176 * return value. See filemap_fault() and __lock_page_or_retry(). 3308 * return value. See filemap_fault() and __lock_page_or_retry().
3177 */ 3309 */
3178static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3310static int do_fault(struct fault_env *fe)
3179 unsigned long address, pte_t *page_table, pmd_t *pmd,
3180 unsigned int flags, pte_t orig_pte)
3181{ 3311{
3182 pgoff_t pgoff = linear_page_index(vma, address); 3312 struct vm_area_struct *vma = fe->vma;
3313 pgoff_t pgoff = linear_page_index(vma, fe->address);
3183 3314
3184 pte_unmap(page_table);
3185 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ 3315 /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
3186 if (!vma->vm_ops->fault) 3316 if (!vma->vm_ops->fault)
3187 return VM_FAULT_SIGBUS; 3317 return VM_FAULT_SIGBUS;
3188 if (!(flags & FAULT_FLAG_WRITE)) 3318 if (!(fe->flags & FAULT_FLAG_WRITE))
3189 return do_read_fault(mm, vma, address, pmd, pgoff, flags, 3319 return do_read_fault(fe, pgoff);
3190 orig_pte);
3191 if (!(vma->vm_flags & VM_SHARED)) 3320 if (!(vma->vm_flags & VM_SHARED))
3192 return do_cow_fault(mm, vma, address, pmd, pgoff, flags, 3321 return do_cow_fault(fe, pgoff);
3193 orig_pte); 3322 return do_shared_fault(fe, pgoff);
3194 return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3195} 3323}
3196 3324
3197static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 3325static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3209,11 +3337,10 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3209 return mpol_misplaced(page, vma, addr); 3337 return mpol_misplaced(page, vma, addr);
3210} 3338}
3211 3339
3212static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, 3340static int do_numa_page(struct fault_env *fe, pte_t pte)
3213 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
3214{ 3341{
3342 struct vm_area_struct *vma = fe->vma;
3215 struct page *page = NULL; 3343 struct page *page = NULL;
3216 spinlock_t *ptl;
3217 int page_nid = -1; 3344 int page_nid = -1;
3218 int last_cpupid; 3345 int last_cpupid;
3219 int target_nid; 3346 int target_nid;
@@ -3233,10 +3360,10 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3233 * page table entry is not accessible, so there would be no 3360 * page table entry is not accessible, so there would be no
3234 * concurrent hardware modifications to the PTE. 3361 * concurrent hardware modifications to the PTE.
3235 */ 3362 */
3236 ptl = pte_lockptr(mm, pmd); 3363 fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd);
3237 spin_lock(ptl); 3364 spin_lock(fe->ptl);
3238 if (unlikely(!pte_same(*ptep, pte))) { 3365 if (unlikely(!pte_same(*fe->pte, pte))) {
3239 pte_unmap_unlock(ptep, ptl); 3366 pte_unmap_unlock(fe->pte, fe->ptl);
3240 goto out; 3367 goto out;
3241 } 3368 }
3242 3369
@@ -3245,18 +3372,18 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3245 pte = pte_mkyoung(pte); 3372 pte = pte_mkyoung(pte);
3246 if (was_writable) 3373 if (was_writable)
3247 pte = pte_mkwrite(pte); 3374 pte = pte_mkwrite(pte);
3248 set_pte_at(mm, addr, ptep, pte); 3375 set_pte_at(vma->vm_mm, fe->address, fe->pte, pte);
3249 update_mmu_cache(vma, addr, ptep); 3376 update_mmu_cache(vma, fe->address, fe->pte);
3250 3377
3251 page = vm_normal_page(vma, addr, pte); 3378 page = vm_normal_page(vma, fe->address, pte);
3252 if (!page) { 3379 if (!page) {
3253 pte_unmap_unlock(ptep, ptl); 3380 pte_unmap_unlock(fe->pte, fe->ptl);
3254 return 0; 3381 return 0;
3255 } 3382 }
3256 3383
3257 /* TODO: handle PTE-mapped THP */ 3384 /* TODO: handle PTE-mapped THP */
3258 if (PageCompound(page)) { 3385 if (PageCompound(page)) {
3259 pte_unmap_unlock(ptep, ptl); 3386 pte_unmap_unlock(fe->pte, fe->ptl);
3260 return 0; 3387 return 0;
3261 } 3388 }
3262 3389
@@ -3280,8 +3407,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3280 3407
3281 last_cpupid = page_cpupid_last(page); 3408 last_cpupid = page_cpupid_last(page);
3282 page_nid = page_to_nid(page); 3409 page_nid = page_to_nid(page);
3283 target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); 3410 target_nid = numa_migrate_prep(page, vma, fe->address, page_nid,
3284 pte_unmap_unlock(ptep, ptl); 3411 &flags);
3412 pte_unmap_unlock(fe->pte, fe->ptl);
3285 if (target_nid == -1) { 3413 if (target_nid == -1) {
3286 put_page(page); 3414 put_page(page);
3287 goto out; 3415 goto out;
@@ -3301,24 +3429,29 @@ out:
3301 return 0; 3429 return 0;
3302} 3430}
3303 3431
3304static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, 3432static int create_huge_pmd(struct fault_env *fe)
3305 unsigned long address, pmd_t *pmd, unsigned int flags)
3306{ 3433{
3434 struct vm_area_struct *vma = fe->vma;
3307 if (vma_is_anonymous(vma)) 3435 if (vma_is_anonymous(vma))
3308 return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); 3436 return do_huge_pmd_anonymous_page(fe);
3309 if (vma->vm_ops->pmd_fault) 3437 if (vma->vm_ops->pmd_fault)
3310 return vma->vm_ops->pmd_fault(vma, address, pmd, flags); 3438 return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd,
3439 fe->flags);
3311 return VM_FAULT_FALLBACK; 3440 return VM_FAULT_FALLBACK;
3312} 3441}
3313 3442
3314static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, 3443static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
3315 unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
3316 unsigned int flags)
3317{ 3444{
3318 if (vma_is_anonymous(vma)) 3445 if (vma_is_anonymous(fe->vma))
3319 return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); 3446 return do_huge_pmd_wp_page(fe, orig_pmd);
3320 if (vma->vm_ops->pmd_fault) 3447 if (fe->vma->vm_ops->pmd_fault)
3321 return vma->vm_ops->pmd_fault(vma, address, pmd, flags); 3448 return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd,
3449 fe->flags);
3450
3451 /* COW handled on pte level: split pmd */
3452 VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma);
3453 split_huge_pmd(fe->vma, fe->pmd, fe->address);
3454
3322 return VM_FAULT_FALLBACK; 3455 return VM_FAULT_FALLBACK;
3323} 3456}
3324 3457
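
As context for the fault_env-based helpers being converted here, a minimal user-space sketch of the argument-bundling pattern this series introduces: per-fault state that used to travel as separate mm/vma/address/pmd/pte/flags parameters now lives in one structure handed to every helper. The struct and handler below are illustrative stand-ins, not the kernel definitions.

#include <stdio.h>

struct fault_env_sketch {
	void *vma;		/* faulting VMA */
	unsigned long address;	/* faulting user address */
	unsigned int flags;	/* FAULT_FLAG_* bits */
	void *pmd;		/* pmd entry being serviced */
	void *pte;		/* mapped pte, or NULL until one is mapped */
};

static int handle_pte_fault_sketch(struct fault_env_sketch *fe)
{
	/* every helper sees the same state without re-passing arguments */
	printf("fault at %#lx, flags %#x\n", fe->address, fe->flags);
	return 0;
}

int main(void)
{
	struct fault_env_sketch fe = {
		.address = 0x7f0000001000UL,
		.flags = 0x1,		/* e.g. a write fault */
	};

	return handle_pte_fault_sketch(&fe);
}
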
@@ -3331,59 +3464,79 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
3331 * with external mmu caches can use to update those (ie the Sparc or 3464 * with external mmu caches can use to update those (ie the Sparc or
3332 * PowerPC hashed page tables that act as extended TLBs). 3465 * PowerPC hashed page tables that act as extended TLBs).
3333 * 3466 *
3334 * We enter with non-exclusive mmap_sem (to exclude vma changes, 3467 * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
3335 * but allow concurrent faults), and pte mapped but not yet locked. 3468 * concurrent faults).
3336 * We return with pte unmapped and unlocked.
3337 * 3469 *
3338 * The mmap_sem may have been released depending on flags and our 3470 * The mmap_sem may have been released depending on flags and our return value.
3339 * return value. See filemap_fault() and __lock_page_or_retry(). 3471 * See filemap_fault() and __lock_page_or_retry().
3340 */ 3472 */
3341static int handle_pte_fault(struct mm_struct *mm, 3473static int handle_pte_fault(struct fault_env *fe)
3342 struct vm_area_struct *vma, unsigned long address,
3343 pte_t *pte, pmd_t *pmd, unsigned int flags)
3344{ 3474{
3345 pte_t entry; 3475 pte_t entry;
3346 spinlock_t *ptl;
3347 3476
3348 /* 3477 if (unlikely(pmd_none(*fe->pmd))) {
3349 * some architectures can have larger ptes than wordsize, 3478 /*
3350 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y, 3479 * Leave __pte_alloc() until later: because vm_ops->fault may
3351 * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses. 3480 * want to allocate huge page, and if we expose page table
3352 * The code below just needs a consistent view for the ifs and 3481 * for an instant, it will be difficult to retract from
3353 * we later double check anyway with the ptl lock held. So here 3482 * concurrent faults and from rmap lookups.
3354 * a barrier will do. 3483 */
3355 */ 3484 fe->pte = NULL;
3356 entry = *pte; 3485 } else {
3357 barrier(); 3486 /* See comment in pte_alloc_one_map() */
3358 if (!pte_present(entry)) { 3487 if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
3488 return 0;
3489 /*
3490 * A regular pmd is established and it can't morph into a huge
3491 * pmd from under us anymore at this point because we hold the
3492 * mmap_sem read mode and khugepaged takes it in write mode.
3493 * So now it's safe to run pte_offset_map().
3494 */
3495 fe->pte = pte_offset_map(fe->pmd, fe->address);
3496
3497 entry = *fe->pte;
3498
3499 /*
3500 * some architectures can have larger ptes than wordsize,
3501 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
3502 * CONFIG_32BIT=y, so READ_ONCE or ACCESS_ONCE cannot guarantee
3503 * atomic accesses. The code below just needs a consistent
3504 * view for the ifs and we later double check anyway with the
3505 * ptl lock held. So here a barrier will do.
3506 */
3507 barrier();
3359 if (pte_none(entry)) { 3508 if (pte_none(entry)) {
3360 if (vma_is_anonymous(vma)) 3509 pte_unmap(fe->pte);
3361 return do_anonymous_page(mm, vma, address, 3510 fe->pte = NULL;
3362 pte, pmd, flags);
3363 else
3364 return do_fault(mm, vma, address, pte, pmd,
3365 flags, entry);
3366 } 3511 }
3367 return do_swap_page(mm, vma, address,
3368 pte, pmd, flags, entry);
3369 } 3512 }
3370 3513
3514 if (!fe->pte) {
3515 if (vma_is_anonymous(fe->vma))
3516 return do_anonymous_page(fe);
3517 else
3518 return do_fault(fe);
3519 }
3520
3521 if (!pte_present(entry))
3522 return do_swap_page(fe, entry);
3523
3371 if (pte_protnone(entry)) 3524 if (pte_protnone(entry))
3372 return do_numa_page(mm, vma, address, entry, pte, pmd); 3525 return do_numa_page(fe, entry);
3373 3526
3374 ptl = pte_lockptr(mm, pmd); 3527 fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
3375 spin_lock(ptl); 3528 spin_lock(fe->ptl);
3376 if (unlikely(!pte_same(*pte, entry))) 3529 if (unlikely(!pte_same(*fe->pte, entry)))
3377 goto unlock; 3530 goto unlock;
3378 if (flags & FAULT_FLAG_WRITE) { 3531 if (fe->flags & FAULT_FLAG_WRITE) {
3379 if (!pte_write(entry)) 3532 if (!pte_write(entry))
3380 return do_wp_page(mm, vma, address, 3533 return do_wp_page(fe, entry);
3381 pte, pmd, ptl, entry);
3382 entry = pte_mkdirty(entry); 3534 entry = pte_mkdirty(entry);
3383 } 3535 }
3384 entry = pte_mkyoung(entry); 3536 entry = pte_mkyoung(entry);
3385 if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { 3537 if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry,
3386 update_mmu_cache(vma, address, pte); 3538 fe->flags & FAULT_FLAG_WRITE)) {
3539 update_mmu_cache(fe->vma, fe->address, fe->pte);
3387 } else { 3540 } else {
3388 /* 3541 /*
3389 * This is needed only for protection faults but the arch code 3542 * This is needed only for protection faults but the arch code
@@ -3391,11 +3544,11 @@ static int handle_pte_fault(struct mm_struct *mm,
3391 * This still avoids useless tlb flushes for .text page faults 3544 * This still avoids useless tlb flushes for .text page faults
3392 * with threads. 3545 * with threads.
3393 */ 3546 */
3394 if (flags & FAULT_FLAG_WRITE) 3547 if (fe->flags & FAULT_FLAG_WRITE)
3395 flush_tlb_fix_spurious_fault(vma, address); 3548 flush_tlb_fix_spurious_fault(fe->vma, fe->address);
3396 } 3549 }
3397unlock: 3550unlock:
3398 pte_unmap_unlock(pte, ptl); 3551 pte_unmap_unlock(fe->pte, fe->ptl);
3399 return 0; 3552 return 0;
3400} 3553}
3401 3554
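
A compact sketch of the dispatch order handle_pte_fault() follows after this rework, modelled with plain booleans in place of the pte/pmd predicates; the enum and classify() helper are invented for illustration only.

#include <stdbool.h>
#include <stdio.h>

enum fault_path { ANON_OR_FILE_FAULT, SWAP_FAULT, NUMA_FAULT, WP_OR_ACCESS };

static enum fault_path classify(bool pte_mapped, bool present, bool protnone)
{
	if (!pte_mapped)	/* no pte yet, or pte_none(): fresh fault */
		return ANON_OR_FILE_FAULT;
	if (!present)		/* swap or migration entry */
		return SWAP_FAULT;
	if (protnone)		/* NUMA hinting fault */
		return NUMA_FAULT;
	return WP_OR_ACCESS;	/* COW break / dirty / young updates */
}

int main(void)
{
	printf("%d\n", classify(true, false, false));	/* -> SWAP_FAULT */
	return 0;
}
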
@@ -3405,87 +3558,51 @@ unlock:
3405 * The mmap_sem may have been released depending on flags and our 3558 * The mmap_sem may have been released depending on flags and our
3406 * return value. See filemap_fault() and __lock_page_or_retry(). 3559 * return value. See filemap_fault() and __lock_page_or_retry().
3407 */ 3560 */
3408static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3561static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3409 unsigned long address, unsigned int flags) 3562 unsigned int flags)
3410{ 3563{
3564 struct fault_env fe = {
3565 .vma = vma,
3566 .address = address,
3567 .flags = flags,
3568 };
3569 struct mm_struct *mm = vma->vm_mm;
3411 pgd_t *pgd; 3570 pgd_t *pgd;
3412 pud_t *pud; 3571 pud_t *pud;
3413 pmd_t *pmd;
3414 pte_t *pte;
3415
3416 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
3417 flags & FAULT_FLAG_INSTRUCTION,
3418 flags & FAULT_FLAG_REMOTE))
3419 return VM_FAULT_SIGSEGV;
3420
3421 if (unlikely(is_vm_hugetlb_page(vma)))
3422 return hugetlb_fault(mm, vma, address, flags);
3423 3572
3424 pgd = pgd_offset(mm, address); 3573 pgd = pgd_offset(mm, address);
3425 pud = pud_alloc(mm, pgd, address); 3574 pud = pud_alloc(mm, pgd, address);
3426 if (!pud) 3575 if (!pud)
3427 return VM_FAULT_OOM; 3576 return VM_FAULT_OOM;
3428 pmd = pmd_alloc(mm, pud, address); 3577 fe.pmd = pmd_alloc(mm, pud, address);
3429 if (!pmd) 3578 if (!fe.pmd)
3430 return VM_FAULT_OOM; 3579 return VM_FAULT_OOM;
3431 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { 3580 if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) {
3432 int ret = create_huge_pmd(mm, vma, address, pmd, flags); 3581 int ret = create_huge_pmd(&fe);
3433 if (!(ret & VM_FAULT_FALLBACK)) 3582 if (!(ret & VM_FAULT_FALLBACK))
3434 return ret; 3583 return ret;
3435 } else { 3584 } else {
3436 pmd_t orig_pmd = *pmd; 3585 pmd_t orig_pmd = *fe.pmd;
3437 int ret; 3586 int ret;
3438 3587
3439 barrier(); 3588 barrier();
3440 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { 3589 if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
3441 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3442
3443 if (pmd_protnone(orig_pmd)) 3590 if (pmd_protnone(orig_pmd))
3444 return do_huge_pmd_numa_page(mm, vma, address, 3591 return do_huge_pmd_numa_page(&fe, orig_pmd);
3445 orig_pmd, pmd);
3446 3592
3447 if (dirty && !pmd_write(orig_pmd)) { 3593 if ((fe.flags & FAULT_FLAG_WRITE) &&
3448 ret = wp_huge_pmd(mm, vma, address, pmd, 3594 !pmd_write(orig_pmd)) {
3449 orig_pmd, flags); 3595 ret = wp_huge_pmd(&fe, orig_pmd);
3450 if (!(ret & VM_FAULT_FALLBACK)) 3596 if (!(ret & VM_FAULT_FALLBACK))
3451 return ret; 3597 return ret;
3452 } else { 3598 } else {
3453 huge_pmd_set_accessed(mm, vma, address, pmd, 3599 huge_pmd_set_accessed(&fe, orig_pmd);
3454 orig_pmd, dirty);
3455 return 0; 3600 return 0;
3456 } 3601 }
3457 } 3602 }
3458 } 3603 }
3459 3604
3460 /* 3605 return handle_pte_fault(&fe);
3461 * Use pte_alloc() instead of pte_alloc_map, because we can't
3462 * run pte_offset_map on the pmd, if an huge pmd could
3463 * materialize from under us from a different thread.
3464 */
3465 if (unlikely(pte_alloc(mm, pmd, address)))
3466 return VM_FAULT_OOM;
3467 /*
3468 * If a huge pmd materialized under us just retry later. Use
3469 * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
3470 * didn't become pmd_trans_huge under us and then back to pmd_none, as
3471 * a result of MADV_DONTNEED running immediately after a huge pmd fault
3472 * in a different thread of this mm, in turn leading to a misleading
3473 * pmd_trans_huge() retval. All we have to ensure is that it is a
3474 * regular pmd that we can walk with pte_offset_map() and we can do that
3475 * through an atomic read in C, which is what pmd_trans_unstable()
3476 * provides.
3477 */
3478 if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd)))
3479 return 0;
3480 /*
3481 * A regular pmd is established and it can't morph into a huge pmd
3482 * from under us anymore at this point because we hold the mmap_sem
3483 * read mode and khugepaged takes it in write mode. So now it's
3484 * safe to run pte_offset_map().
3485 */
3486 pte = pte_offset_map(pmd, address);
3487
3488 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3489} 3606}
3490 3607
3491/* 3608/*
@@ -3494,15 +3611,15 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3494 * The mmap_sem may have been released depending on flags and our 3611 * The mmap_sem may have been released depending on flags and our
3495 * return value. See filemap_fault() and __lock_page_or_retry(). 3612 * return value. See filemap_fault() and __lock_page_or_retry().
3496 */ 3613 */
3497int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, 3614int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
3498 unsigned long address, unsigned int flags) 3615 unsigned int flags)
3499{ 3616{
3500 int ret; 3617 int ret;
3501 3618
3502 __set_current_state(TASK_RUNNING); 3619 __set_current_state(TASK_RUNNING);
3503 3620
3504 count_vm_event(PGFAULT); 3621 count_vm_event(PGFAULT);
3505 mem_cgroup_count_vm_event(mm, PGFAULT); 3622 mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT);
3506 3623
3507 /* do counter updates before entering really critical section. */ 3624 /* do counter updates before entering really critical section. */
3508 check_sync_rss_stat(current); 3625 check_sync_rss_stat(current);
@@ -3514,7 +3631,15 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3514 if (flags & FAULT_FLAG_USER) 3631 if (flags & FAULT_FLAG_USER)
3515 mem_cgroup_oom_enable(); 3632 mem_cgroup_oom_enable();
3516 3633
3517 ret = __handle_mm_fault(mm, vma, address, flags); 3634 if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
3635 flags & FAULT_FLAG_INSTRUCTION,
3636 flags & FAULT_FLAG_REMOTE))
3637 return VM_FAULT_SIGSEGV;
3638
3639 if (unlikely(is_vm_hugetlb_page(vma)))
3640 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
3641 else
3642 ret = __handle_mm_fault(vma, address, flags);
3518 3643
3519 if (flags & FAULT_FLAG_USER) { 3644 if (flags & FAULT_FLAG_USER) {
3520 mem_cgroup_oom_disable(); 3645 mem_cgroup_oom_disable();
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e3cbdcaff2a5..82d0b98d27f8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -449,6 +449,25 @@ out_fail:
449 return -1; 449 return -1;
450} 450}
451 451
452static struct zone * __meminit move_pfn_range(int zone_shift,
453 unsigned long start_pfn, unsigned long end_pfn)
454{
455 struct zone *zone = page_zone(pfn_to_page(start_pfn));
456 int ret = 0;
457
458 if (zone_shift < 0)
459 ret = move_pfn_range_left(zone + zone_shift, zone,
460 start_pfn, end_pfn);
461 else if (zone_shift)
462 ret = move_pfn_range_right(zone, zone + zone_shift,
463 start_pfn, end_pfn);
464
465 if (ret)
466 return NULL;
467
468 return zone + zone_shift;
469}
470
452static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 471static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
453 unsigned long end_pfn) 472 unsigned long end_pfn)
454{ 473{
@@ -1028,6 +1047,37 @@ static void node_states_set_node(int node, struct memory_notify *arg)
1028 node_set_state(node, N_MEMORY); 1047 node_set_state(node, N_MEMORY);
1029} 1048}
1030 1049
1050int zone_can_shift(unsigned long pfn, unsigned long nr_pages,
1051 enum zone_type target)
1052{
1053 struct zone *zone = page_zone(pfn_to_page(pfn));
1054 enum zone_type idx = zone_idx(zone);
1055 int i;
1056
1057 if (idx < target) {
1058 /* pages must be at end of current zone */
1059 if (pfn + nr_pages != zone_end_pfn(zone))
1060 return 0;
1061
1062 /* no zones in use between current zone and target */
1063 for (i = idx + 1; i < target; i++)
1064 if (zone_is_initialized(zone - idx + i))
1065 return 0;
1066 }
1067
1068 if (target < idx) {
1069 /* pages must be at beginning of current zone */
1070 if (pfn != zone->zone_start_pfn)
1071 return 0;
1072
1073 /* no zones in use between current zone and target */
1074 for (i = target + 1; i < idx; i++)
1075 if (zone_is_initialized(zone - idx + i))
1076 return 0;
1077 }
1078
1079 return target - idx;
1080}
1031 1081
1032/* Must be protected by mem_hotplug_begin() */ 1082/* Must be protected by mem_hotplug_begin() */
1033int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 1083int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
@@ -1039,6 +1089,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
1039 int nid; 1089 int nid;
1040 int ret; 1090 int ret;
1041 struct memory_notify arg; 1091 struct memory_notify arg;
1092 int zone_shift = 0;
1042 1093
1043 /* 1094 /*
1044 * This doesn't need a lock to do pfn_to_page(). 1095 * This doesn't need a lock to do pfn_to_page().
@@ -1052,19 +1103,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
1052 !can_online_high_movable(zone)) 1103 !can_online_high_movable(zone))
1053 return -EINVAL; 1104 return -EINVAL;
1054 1105
1055 if (online_type == MMOP_ONLINE_KERNEL && 1106 if (online_type == MMOP_ONLINE_KERNEL)
1056 zone_idx(zone) == ZONE_MOVABLE) { 1107 zone_shift = zone_can_shift(pfn, nr_pages, ZONE_NORMAL);
1057 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) 1108 else if (online_type == MMOP_ONLINE_MOVABLE)
1058 return -EINVAL; 1109 zone_shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE);
1059 }
1060 if (online_type == MMOP_ONLINE_MOVABLE &&
1061 zone_idx(zone) == ZONE_MOVABLE - 1) {
1062 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
1063 return -EINVAL;
1064 }
1065 1110
1066 /* Previous code may changed the zone of the pfn range */ 1111 zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages);
1067 zone = page_zone(pfn_to_page(pfn)); 1112 if (!zone)
1113 return -EINVAL;
1068 1114
1069 arg.start_pfn = pfn; 1115 arg.start_pfn = pfn;
1070 arg.nr_pages = nr_pages; 1116 arg.nr_pages = nr_pages;
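
A user-space model of the zone_can_shift() rule introduced above, with an invented zone_sketch array standing in for struct zone; it reproduces only the edge-of-zone and no-initialized-zone-in-between checks.

#include <stdbool.h>
#include <stdio.h>

struct zone_sketch {
	unsigned long start_pfn;
	unsigned long end_pfn;		/* exclusive */
	bool initialized;
};

static int zone_can_shift_sketch(const struct zone_sketch *zones, int idx,
				 int target, unsigned long pfn,
				 unsigned long nr_pages)
{
	int i;

	if (idx < target) {
		if (pfn + nr_pages != zones[idx].end_pfn)
			return 0;	/* must sit at the end of its zone */
		for (i = idx + 1; i < target; i++)
			if (zones[i].initialized)
				return 0;
	} else if (target < idx) {
		if (pfn != zones[idx].start_pfn)
			return 0;	/* must sit at the beginning */
		for (i = target + 1; i < idx; i++)
			if (zones[i].initialized)
				return 0;
	}
	return target - idx;
}

int main(void)
{
	/* index 0: an initialized "normal" zone, index 1: an empty "movable" zone */
	const struct zone_sketch zones[] = {
		{ .start_pfn = 0x10000, .end_pfn = 0x20000, .initialized = true },
		{ .start_pfn = 0,       .end_pfn = 0,       .initialized = false },
	};

	/* the last 0x8000 pfns of zone 0 may shift right by one zone */
	printf("shift = %d\n",
	       zone_can_shift_sketch(zones, 0, 1, 0x20000 - 0x8000, 0x8000));
	return 0;
}

The sign of the returned shift then tells online_pages() whether move_pfn_range() should move the range into the lower or the higher neighbouring zone.
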
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 297d6854f849..53e40d3f3933 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -512,6 +512,8 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
512 } 512 }
513 } 513 }
514 514
515 if (pmd_trans_unstable(pmd))
516 return 0;
515retry: 517retry:
516 pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 518 pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
517 for (; addr != end; pte++, addr += PAGE_SIZE) { 519 for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -529,7 +531,7 @@ retry:
529 nid = page_to_nid(page); 531 nid = page_to_nid(page);
530 if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) 532 if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
531 continue; 533 continue;
532 if (PageTransCompound(page) && PageAnon(page)) { 534 if (PageTransCompound(page)) {
533 get_page(page); 535 get_page(page);
534 pte_unmap_unlock(pte, ptl); 536 pte_unmap_unlock(pte, ptl);
535 lock_page(page); 537 lock_page(page);
diff --git a/mm/migrate.c b/mm/migrate.c
index bd3fdc202e8b..2232f6923cc7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -31,6 +31,7 @@
31#include <linux/vmalloc.h> 31#include <linux/vmalloc.h>
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/backing-dev.h> 33#include <linux/backing-dev.h>
34#include <linux/compaction.h>
34#include <linux/syscalls.h> 35#include <linux/syscalls.h>
35#include <linux/hugetlb.h> 36#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h> 37#include <linux/hugetlb_cgroup.h>
@@ -73,6 +74,81 @@ int migrate_prep_local(void)
73 return 0; 74 return 0;
74} 75}
75 76
77bool isolate_movable_page(struct page *page, isolate_mode_t mode)
78{
79 struct address_space *mapping;
80
81 /*
82 * Avoid burning cycles with pages that are yet under __free_pages(),
83 * or just got freed under us.
84 *
85 * In case we 'win' a race for a movable page being freed under us and
 86	 * raise its refcount, preventing __free_pages() from doing its job,
 87	 * the put_page() at the end of this block will take care of
 88	 * releasing this page, thus avoiding a nasty leak.
89 */
90 if (unlikely(!get_page_unless_zero(page)))
91 goto out;
92
93 /*
94 * Check PageMovable before holding a PG_lock because page's owner
 95	 * assumes nobody touches the PG_lock of a newly allocated page,
 96	 * so unconditionally grabbing the lock ruins the page owner's side.
97 */
98 if (unlikely(!__PageMovable(page)))
99 goto out_putpage;
100 /*
101 * As movable pages are not isolated from LRU lists, concurrent
102 * compaction threads can race against page migration functions
103 * as well as race against the releasing a page.
104 *
105 * In order to avoid having an already isolated movable page
106 * being (wrongly) re-isolated while it is under migration,
107 * or to avoid attempting to isolate pages being released,
108 * lets be sure we have the page lock
109 * before proceeding with the movable page isolation steps.
110 */
111 if (unlikely(!trylock_page(page)))
112 goto out_putpage;
113
114 if (!PageMovable(page) || PageIsolated(page))
115 goto out_no_isolated;
116
117 mapping = page_mapping(page);
118 VM_BUG_ON_PAGE(!mapping, page);
119
120 if (!mapping->a_ops->isolate_page(page, mode))
121 goto out_no_isolated;
122
123 /* Driver shouldn't use PG_isolated bit of page->flags */
124 WARN_ON_ONCE(PageIsolated(page));
125 __SetPageIsolated(page);
126 unlock_page(page);
127
128 return true;
129
130out_no_isolated:
131 unlock_page(page);
132out_putpage:
133 put_page(page);
134out:
135 return false;
136}
137
138/* It should be called on page which is PG_movable */
139void putback_movable_page(struct page *page)
140{
141 struct address_space *mapping;
142
143 VM_BUG_ON_PAGE(!PageLocked(page), page);
144 VM_BUG_ON_PAGE(!PageMovable(page), page);
145 VM_BUG_ON_PAGE(!PageIsolated(page), page);
146
147 mapping = page_mapping(page);
148 mapping->a_ops->putback_page(page);
149 __ClearPageIsolated(page);
150}
151
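
A toy user-space model of the PG_movable/PG_isolated handshake that isolate_movable_page() and putback_movable_page() implement above; booleans stand in for page flags, and the driver's ->isolate_page()/->putback_page() callbacks are reduced to flag updates.

#include <stdbool.h>
#include <stdio.h>

struct movable_page_sketch {
	bool locked;		/* page lock held */
	bool movable;		/* owner still claims the page as movable */
	bool isolated;		/* already taken off for migration */
};

static bool isolate_sketch(struct movable_page_sketch *p)
{
	if (!p->locked || !p->movable || p->isolated)
		return false;	/* racing free or re-isolation: refuse */
	p->isolated = true;	/* driver's ->isolate_page() succeeded */
	return true;
}

static void putback_sketch(struct movable_page_sketch *p)
{
	/* mirrors putback_movable_page(): owner callback, then clear state */
	p->isolated = false;
}

int main(void)
{
	struct movable_page_sketch page = { .locked = true, .movable = true };

	printf("isolate:    %d\n", isolate_sketch(&page));	/* 1 */
	printf("re-isolate: %d\n", isolate_sketch(&page));	/* 0, already isolated */
	putback_sketch(&page);
	printf("isolated:   %d\n", page.isolated);		/* 0 again */
	return 0;
}
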
76/* 152/*
77 * Put previously isolated pages back onto the appropriate lists 153 * Put previously isolated pages back onto the appropriate lists
78 * from where they were once taken off for compaction/migration. 154 * from where they were once taken off for compaction/migration.
@@ -94,10 +170,23 @@ void putback_movable_pages(struct list_head *l)
94 list_del(&page->lru); 170 list_del(&page->lru);
95 dec_zone_page_state(page, NR_ISOLATED_ANON + 171 dec_zone_page_state(page, NR_ISOLATED_ANON +
96 page_is_file_cache(page)); 172 page_is_file_cache(page));
97 if (unlikely(isolated_balloon_page(page))) 173 /*
98 balloon_page_putback(page); 174 * We isolated non-lru movable page so here we can use
99 else 175 * __PageMovable because LRU page's mapping cannot have
176 * PAGE_MAPPING_MOVABLE.
177 */
178 if (unlikely(__PageMovable(page))) {
179 VM_BUG_ON_PAGE(!PageIsolated(page), page);
180 lock_page(page);
181 if (PageMovable(page))
182 putback_movable_page(page);
183 else
184 __ClearPageIsolated(page);
185 unlock_page(page);
186 put_page(page);
187 } else {
100 putback_lru_page(page); 188 putback_lru_page(page);
189 }
101 } 190 }
102} 191}
103 192
@@ -170,7 +259,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
170 } else if (PageAnon(new)) 259 } else if (PageAnon(new))
171 page_add_anon_rmap(new, vma, addr, false); 260 page_add_anon_rmap(new, vma, addr, false);
172 else 261 else
173 page_add_file_rmap(new); 262 page_add_file_rmap(new, false);
174 263
175 if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) 264 if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
176 mlock_vma_page(new); 265 mlock_vma_page(new);
@@ -594,7 +683,7 @@ EXPORT_SYMBOL(migrate_page_copy);
594 ***********************************************************/ 683 ***********************************************************/
595 684
596/* 685/*
597 * Common logic to directly migrate a single page suitable for 686 * Common logic to directly migrate a single LRU page suitable for
598 * pages that do not use PagePrivate/PagePrivate2. 687 * pages that do not use PagePrivate/PagePrivate2.
599 * 688 *
600 * Pages are locked upon entry and exit. 689 * Pages are locked upon entry and exit.
@@ -757,33 +846,72 @@ static int move_to_new_page(struct page *newpage, struct page *page,
757 enum migrate_mode mode) 846 enum migrate_mode mode)
758{ 847{
759 struct address_space *mapping; 848 struct address_space *mapping;
760 int rc; 849 int rc = -EAGAIN;
850 bool is_lru = !__PageMovable(page);
761 851
762 VM_BUG_ON_PAGE(!PageLocked(page), page); 852 VM_BUG_ON_PAGE(!PageLocked(page), page);
763 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 853 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
764 854
765 mapping = page_mapping(page); 855 mapping = page_mapping(page);
766 if (!mapping) 856
767 rc = migrate_page(mapping, newpage, page, mode); 857 if (likely(is_lru)) {
768 else if (mapping->a_ops->migratepage) 858 if (!mapping)
859 rc = migrate_page(mapping, newpage, page, mode);
860 else if (mapping->a_ops->migratepage)
861 /*
862 * Most pages have a mapping and most filesystems
863 * provide a migratepage callback. Anonymous pages
864 * are part of swap space which also has its own
865 * migratepage callback. This is the most common path
866 * for page migration.
867 */
868 rc = mapping->a_ops->migratepage(mapping, newpage,
869 page, mode);
870 else
871 rc = fallback_migrate_page(mapping, newpage,
872 page, mode);
873 } else {
769 /* 874 /*
770 * Most pages have a mapping and most filesystems provide a 875 * In case of non-lru page, it could be released after
771 * migratepage callback. Anonymous pages are part of swap 876 * isolation step. In that case, we shouldn't try migration.
772 * space which also has its own migratepage callback. This
773 * is the most common path for page migration.
774 */ 877 */
775 rc = mapping->a_ops->migratepage(mapping, newpage, page, mode); 878 VM_BUG_ON_PAGE(!PageIsolated(page), page);
776 else 879 if (!PageMovable(page)) {
777 rc = fallback_migrate_page(mapping, newpage, page, mode); 880 rc = MIGRATEPAGE_SUCCESS;
881 __ClearPageIsolated(page);
882 goto out;
883 }
884
885 rc = mapping->a_ops->migratepage(mapping, newpage,
886 page, mode);
887 WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
888 !PageIsolated(page));
889 }
778 890
779 /* 891 /*
780 * When successful, old pagecache page->mapping must be cleared before 892 * When successful, old pagecache page->mapping must be cleared before
781 * page is freed; but stats require that PageAnon be left as PageAnon. 893 * page is freed; but stats require that PageAnon be left as PageAnon.
782 */ 894 */
783 if (rc == MIGRATEPAGE_SUCCESS) { 895 if (rc == MIGRATEPAGE_SUCCESS) {
784 if (!PageAnon(page)) 896 if (__PageMovable(page)) {
897 VM_BUG_ON_PAGE(!PageIsolated(page), page);
898
899 /*
900 * We clear PG_movable under page_lock so any compactor
901 * cannot try to migrate this page.
902 */
903 __ClearPageIsolated(page);
904 }
905
906 /*
 907	 * Anonymous and movable page->mapping will be cleared by
 908	 * free_pages_prepare so don't reset it here; that keeps
 909	 * type checks such as PageAnon() working.
910 */
911 if (!PageMappingFlags(page))
785 page->mapping = NULL; 912 page->mapping = NULL;
786 } 913 }
914out:
787 return rc; 915 return rc;
788} 916}
789 917
@@ -793,6 +921,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
793 int rc = -EAGAIN; 921 int rc = -EAGAIN;
794 int page_was_mapped = 0; 922 int page_was_mapped = 0;
795 struct anon_vma *anon_vma = NULL; 923 struct anon_vma *anon_vma = NULL;
924 bool is_lru = !__PageMovable(page);
796 925
797 if (!trylock_page(page)) { 926 if (!trylock_page(page)) {
798 if (!force || mode == MIGRATE_ASYNC) 927 if (!force || mode == MIGRATE_ASYNC)
@@ -861,15 +990,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
861 if (unlikely(!trylock_page(newpage))) 990 if (unlikely(!trylock_page(newpage)))
862 goto out_unlock; 991 goto out_unlock;
863 992
864 if (unlikely(isolated_balloon_page(page))) { 993 if (unlikely(!is_lru)) {
865 /* 994 rc = move_to_new_page(newpage, page, mode);
866 * A ballooned page does not need any special attention from
867 * physical to virtual reverse mapping procedures.
868 * Skip any attempt to unmap PTEs or to remap swap cache,
869 * in order to avoid burning cycles at rmap level, and perform
870 * the page migration right away (proteced by page lock).
871 */
872 rc = balloon_page_migrate(newpage, page, mode);
873 goto out_unlock_both; 995 goto out_unlock_both;
874 } 996 }
875 997
@@ -915,6 +1037,19 @@ out_unlock:
915 put_anon_vma(anon_vma); 1037 put_anon_vma(anon_vma);
916 unlock_page(page); 1038 unlock_page(page);
917out: 1039out:
1040 /*
1041 * If migration is successful, decrease refcount of the newpage
1042 * which will not free the page because new page owner increased
 1043	 * refcounter. Also, if it is an LRU page, add the page back to
 1044	 * the LRU list here.
1045 */
1046 if (rc == MIGRATEPAGE_SUCCESS) {
1047 if (unlikely(__PageMovable(newpage)))
1048 put_page(newpage);
1049 else
1050 putback_lru_page(newpage);
1051 }
1052
918 return rc; 1053 return rc;
919} 1054}
920 1055
@@ -948,6 +1083,18 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
948 1083
949 if (page_count(page) == 1) { 1084 if (page_count(page) == 1) {
950 /* page was freed from under us. So we are done. */ 1085 /* page was freed from under us. So we are done. */
1086 ClearPageActive(page);
1087 ClearPageUnevictable(page);
1088 if (unlikely(__PageMovable(page))) {
1089 lock_page(page);
1090 if (!PageMovable(page))
1091 __ClearPageIsolated(page);
1092 unlock_page(page);
1093 }
1094 if (put_new_page)
1095 put_new_page(newpage, private);
1096 else
1097 put_page(newpage);
951 goto out; 1098 goto out;
952 } 1099 }
953 1100
@@ -960,10 +1107,8 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
960 } 1107 }
961 1108
962 rc = __unmap_and_move(page, newpage, force, mode); 1109 rc = __unmap_and_move(page, newpage, force, mode);
963 if (rc == MIGRATEPAGE_SUCCESS) { 1110 if (rc == MIGRATEPAGE_SUCCESS)
964 put_new_page = NULL;
965 set_page_owner_migrate_reason(newpage, reason); 1111 set_page_owner_migrate_reason(newpage, reason);
966 }
967 1112
968out: 1113out:
969 if (rc != -EAGAIN) { 1114 if (rc != -EAGAIN) {
@@ -976,33 +1121,45 @@ out:
976 list_del(&page->lru); 1121 list_del(&page->lru);
977 dec_zone_page_state(page, NR_ISOLATED_ANON + 1122 dec_zone_page_state(page, NR_ISOLATED_ANON +
978 page_is_file_cache(page)); 1123 page_is_file_cache(page));
979 /* Soft-offlined page shouldn't go through lru cache list */ 1124 }
980 if (reason == MR_MEMORY_FAILURE && rc == MIGRATEPAGE_SUCCESS) { 1125
1126 /*
 1127	 * If migration is successful, release the reference grabbed during
 1128	 * isolation. Otherwise, restore the page to the right list unless
1129 * we want to retry.
1130 */
1131 if (rc == MIGRATEPAGE_SUCCESS) {
1132 put_page(page);
1133 if (reason == MR_MEMORY_FAILURE) {
981 /* 1134 /*
982 * With this release, we free successfully migrated 1135 * Set PG_HWPoison on just freed page
983 * page and set PG_HWPoison on just freed page 1136 * intentionally. Although it's rather weird,
984 * intentionally. Although it's rather weird, it's how 1137 * it's how HWPoison flag works at the moment.
985 * HWPoison flag works at the moment.
986 */ 1138 */
987 put_page(page);
988 if (!test_set_page_hwpoison(page)) 1139 if (!test_set_page_hwpoison(page))
989 num_poisoned_pages_inc(); 1140 num_poisoned_pages_inc();
990 } else 1141 }
991 putback_lru_page(page); 1142 } else {
992 } 1143 if (rc != -EAGAIN) {
1144 if (likely(!__PageMovable(page))) {
1145 putback_lru_page(page);
1146 goto put_new;
1147 }
993 1148
994 /* 1149 lock_page(page);
995 * If migration was not successful and there's a freeing callback, use 1150 if (PageMovable(page))
996 * it. Otherwise, putback_lru_page() will drop the reference grabbed 1151 putback_movable_page(page);
997 * during isolation. 1152 else
998 */ 1153 __ClearPageIsolated(page);
999 if (put_new_page) 1154 unlock_page(page);
1000 put_new_page(newpage, private); 1155 put_page(page);
1001 else if (unlikely(__is_movable_balloon_page(newpage))) { 1156 }
1002 /* drop our reference, page already in the balloon */ 1157put_new:
1003 put_page(newpage); 1158 if (put_new_page)
1004 } else 1159 put_new_page(newpage, private);
1005 putback_lru_page(newpage); 1160 else
1161 put_page(newpage);
1162 }
1006 1163
1007 if (result) { 1164 if (result) {
1008 if (rc) 1165 if (rc)
@@ -1829,8 +1986,7 @@ fail_putback:
1829 } 1986 }
1830 1987
1831 orig_entry = *pmd; 1988 orig_entry = *pmd;
1832 entry = mk_pmd(new_page, vma->vm_page_prot); 1989 entry = mk_huge_pmd(new_page, vma->vm_page_prot);
1833 entry = pmd_mkhuge(entry);
1834 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 1990 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1835 1991
1836 /* 1992 /*
diff --git a/mm/mmap.c b/mm/mmap.c
index 234edffec1d0..86b18f334f4f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -25,6 +25,7 @@
25#include <linux/personality.h> 25#include <linux/personality.h>
26#include <linux/security.h> 26#include <linux/security.h>
27#include <linux/hugetlb.h> 27#include <linux/hugetlb.h>
28#include <linux/shmem_fs.h>
28#include <linux/profile.h> 29#include <linux/profile.h>
29#include <linux/export.h> 30#include <linux/export.h>
30#include <linux/mount.h> 31#include <linux/mount.h>
@@ -675,6 +676,8 @@ again: remove_next = 1 + (end > next->vm_end);
675 } 676 }
676 } 677 }
677 678
679 vma_adjust_trans_huge(vma, start, end, adjust_next);
680
678 if (file) { 681 if (file) {
679 mapping = file->f_mapping; 682 mapping = file->f_mapping;
680 root = &mapping->i_mmap; 683 root = &mapping->i_mmap;
@@ -695,8 +698,6 @@ again: remove_next = 1 + (end > next->vm_end);
695 } 698 }
696 } 699 }
697 700
698 vma_adjust_trans_huge(vma, start, end, adjust_next);
699
700 anon_vma = vma->anon_vma; 701 anon_vma = vma->anon_vma;
701 if (!anon_vma && adjust_next) 702 if (!anon_vma && adjust_next)
702 anon_vma = next->anon_vma; 703 anon_vma = next->anon_vma;
@@ -1897,8 +1898,19 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1897 return -ENOMEM; 1898 return -ENOMEM;
1898 1899
1899 get_area = current->mm->get_unmapped_area; 1900 get_area = current->mm->get_unmapped_area;
1900 if (file && file->f_op->get_unmapped_area) 1901 if (file) {
1901 get_area = file->f_op->get_unmapped_area; 1902 if (file->f_op->get_unmapped_area)
1903 get_area = file->f_op->get_unmapped_area;
1904 } else if (flags & MAP_SHARED) {
1905 /*
1906 * mmap_region() will call shmem_zero_setup() to create a file,
1907 * so use shmem's get_unmapped_area in case it can be huge.
1908 * do_mmap_pgoff() will clear pgoff, so match alignment.
1909 */
1910 pgoff = 0;
1911 get_area = shmem_get_unmapped_area;
1912 }
1913
1902 addr = get_area(file, addr, len, pgoff, flags); 1914 addr = get_area(file, addr, len, pgoff, flags);
1903 if (IS_ERR_VALUE(addr)) 1915 if (IS_ERR_VALUE(addr))
1904 return addr; 1916 return addr;
@@ -2591,6 +2603,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
2591 /* drop PG_Mlocked flag for over-mapped range */ 2603 /* drop PG_Mlocked flag for over-mapped range */
2592 for (tmp = vma; tmp->vm_start >= start + size; 2604 for (tmp = vma; tmp->vm_start >= start + size;
2593 tmp = tmp->vm_next) { 2605 tmp = tmp->vm_next) {
2606 /*
2607 * Split pmd and munlock page on the border
2608 * of the range.
2609 */
2610 vma_adjust_trans_huge(tmp, start, start + size, 0);
2611
2594 munlock_vma_pages_range(tmp, 2612 munlock_vma_pages_range(tmp,
2595 max(tmp->vm_start, start), 2613 max(tmp->vm_start, start),
2596 min(tmp->vm_end, start + size)); 2614 min(tmp->vm_end, start + size));
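
A plain-C sketch of the handler selection the get_unmapped_area() hunk above introduces; the enum and MAP_SHARED_SKETCH flag are made up for illustration, only the decision order mirrors the patch.

#include <stdbool.h>
#include <stdio.h>

#define MAP_SHARED_SKETCH	0x01

enum area_handler { MM_DEFAULT, FILE_HOOK, SHMEM_HOOK };

static enum area_handler pick_get_unmapped_area(bool has_file,
						bool file_has_hook,
						unsigned long flags)
{
	if (has_file)
		return file_has_hook ? FILE_HOOK : MM_DEFAULT;
	if (flags & MAP_SHARED_SKETCH)
		return SHMEM_HOOK;	/* shmem_get_unmapped_area, pgoff = 0 */
	return MM_DEFAULT;
}

int main(void)
{
	/* anonymous MAP_SHARED mapping -> routed through shmem's helper */
	printf("%d\n", pick_get_unmapped_area(false, false, MAP_SHARED_SKETCH));
	return 0;
}

Clearing pgoff in the anonymous MAP_SHARED case matches do_mmap_pgoff() clearing it later, so the huge-page alignment shmem computes stays valid.
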
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 5019a1ef2848..a4830f0325fe 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -163,7 +163,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
163 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { 163 if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
164 if (next - addr != HPAGE_PMD_SIZE) { 164 if (next - addr != HPAGE_PMD_SIZE) {
165 split_huge_pmd(vma, pmd, addr); 165 split_huge_pmd(vma, pmd, addr);
166 if (pmd_none(*pmd)) 166 if (pmd_trans_unstable(pmd))
167 continue; 167 continue;
168 } else { 168 } else {
169 int nr_ptes = change_huge_pmd(vma, pmd, addr, 169 int nr_ptes = change_huge_pmd(vma, pmd, addr,
diff --git a/mm/mremap.c b/mm/mremap.c
index 1f157adfdaf9..da22ad2a5678 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -210,9 +210,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
210 } 210 }
211 } 211 }
212 split_huge_pmd(vma, old_pmd, old_addr); 212 split_huge_pmd(vma, old_pmd, old_addr);
213 if (pmd_none(*old_pmd)) 213 if (pmd_trans_unstable(old_pmd))
214 continue; 214 continue;
215 VM_BUG_ON(pmd_trans_huge(*old_pmd));
216 } 215 }
217 if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) 216 if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
218 break; 217 break;
diff --git a/mm/nommu.c b/mm/nommu.c
index c2e58880207f..95daf81a4855 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1809,7 +1809,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1809} 1809}
1810EXPORT_SYMBOL(filemap_fault); 1810EXPORT_SYMBOL(filemap_fault);
1811 1811
1812void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) 1812void filemap_map_pages(struct fault_env *fe,
1813 pgoff_t start_pgoff, pgoff_t end_pgoff)
1813{ 1814{
1814 BUG(); 1815 BUG();
1815} 1816}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ddf74487f848..d4a929d79470 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -274,7 +274,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc,
274#endif 274#endif
275 275
276enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, 276enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
277 struct task_struct *task, unsigned long totalpages) 277 struct task_struct *task)
278{ 278{
279 if (oom_unkillable_task(task, NULL, oc->nodemask)) 279 if (oom_unkillable_task(task, NULL, oc->nodemask))
280 return OOM_SCAN_CONTINUE; 280 return OOM_SCAN_CONTINUE;
@@ -311,7 +311,7 @@ static struct task_struct *select_bad_process(struct oom_control *oc,
311 for_each_process(p) { 311 for_each_process(p) {
312 unsigned int points; 312 unsigned int points;
313 313
314 switch (oom_scan_process_thread(oc, p, totalpages)) { 314 switch (oom_scan_process_thread(oc, p)) {
315 case OOM_SCAN_SELECT: 315 case OOM_SCAN_SELECT:
316 chosen = p; 316 chosen = p;
317 chosen_points = ULONG_MAX; 317 chosen_points = ULONG_MAX;
@@ -383,8 +383,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
383 rcu_read_unlock(); 383 rcu_read_unlock();
384} 384}
385 385
386static void dump_header(struct oom_control *oc, struct task_struct *p, 386static void dump_header(struct oom_control *oc, struct task_struct *p)
387 struct mem_cgroup *memcg)
388{ 387{
389 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", 388 pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
390 current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, 389 current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
@@ -392,12 +391,12 @@ static void dump_header(struct oom_control *oc, struct task_struct *p,
392 391
393 cpuset_print_current_mems_allowed(); 392 cpuset_print_current_mems_allowed();
394 dump_stack(); 393 dump_stack();
395 if (memcg) 394 if (oc->memcg)
396 mem_cgroup_print_oom_info(memcg, p); 395 mem_cgroup_print_oom_info(oc->memcg, p);
397 else 396 else
398 show_mem(SHOW_MEM_FILTER_NODES); 397 show_mem(SHOW_MEM_FILTER_NODES);
399 if (sysctl_oom_dump_tasks) 398 if (sysctl_oom_dump_tasks)
400 dump_tasks(memcg, oc->nodemask); 399 dump_tasks(oc->memcg, oc->nodemask);
401} 400}
402 401
403/* 402/*
@@ -453,7 +452,7 @@ static bool __oom_reap_task(struct task_struct *tsk)
453 * We have to make sure to not race with the victim exit path 452 * We have to make sure to not race with the victim exit path
454 * and cause premature new oom victim selection: 453 * and cause premature new oom victim selection:
455 * __oom_reap_task exit_mm 454 * __oom_reap_task exit_mm
456 * atomic_inc_not_zero 455 * mmget_not_zero
457 * mmput 456 * mmput
458 * atomic_dec_and_test 457 * atomic_dec_and_test
459 * exit_oom_victim 458 * exit_oom_victim
@@ -475,12 +474,22 @@ static bool __oom_reap_task(struct task_struct *tsk)
475 if (!p) 474 if (!p)
476 goto unlock_oom; 475 goto unlock_oom;
477 mm = p->mm; 476 mm = p->mm;
478 atomic_inc(&mm->mm_users); 477 atomic_inc(&mm->mm_count);
479 task_unlock(p); 478 task_unlock(p);
480 479
481 if (!down_read_trylock(&mm->mmap_sem)) { 480 if (!down_read_trylock(&mm->mmap_sem)) {
482 ret = false; 481 ret = false;
483 goto unlock_oom; 482 goto mm_drop;
483 }
484
485 /*
 486	 * Increase mm_users only after we know we will reap something, so
 487	 * that mmput_async() is called only when we have reaped something
 488	 * and the delayed __mmput() doesn't matter that much.
489 */
490 if (!mmget_not_zero(mm)) {
491 up_read(&mm->mmap_sem);
492 goto mm_drop;
484 } 493 }
485 494
486 tlb_gather_mmu(&tlb, mm, 0, -1); 495 tlb_gather_mmu(&tlb, mm, 0, -1);
@@ -522,15 +531,16 @@ static bool __oom_reap_task(struct task_struct *tsk)
522 * to release its memory. 531 * to release its memory.
523 */ 532 */
524 set_bit(MMF_OOM_REAPED, &mm->flags); 533 set_bit(MMF_OOM_REAPED, &mm->flags);
525unlock_oom:
526 mutex_unlock(&oom_lock);
527 /* 534 /*
528 * Drop our reference but make sure the mmput slow path is called from a 535 * Drop our reference but make sure the mmput slow path is called from a
529 * different context because we shouldn't risk we get stuck there and 536 * different context because we shouldn't risk we get stuck there and
530 * put the oom_reaper out of the way. 537 * put the oom_reaper out of the way.
531 */ 538 */
532 if (mm) 539 mmput_async(mm);
533 mmput_async(mm); 540mm_drop:
541 mmdrop(mm);
542unlock_oom:
543 mutex_unlock(&oom_lock);
534 return ret; 544 return ret;
535} 545}
536 546
@@ -739,7 +749,7 @@ void oom_killer_enable(void)
739 */ 749 */
740void oom_kill_process(struct oom_control *oc, struct task_struct *p, 750void oom_kill_process(struct oom_control *oc, struct task_struct *p,
741 unsigned int points, unsigned long totalpages, 751 unsigned int points, unsigned long totalpages,
742 struct mem_cgroup *memcg, const char *message) 752 const char *message)
743{ 753{
744 struct task_struct *victim = p; 754 struct task_struct *victim = p;
745 struct task_struct *child; 755 struct task_struct *child;
@@ -765,7 +775,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
765 task_unlock(p); 775 task_unlock(p);
766 776
767 if (__ratelimit(&oom_rs)) 777 if (__ratelimit(&oom_rs))
768 dump_header(oc, p, memcg); 778 dump_header(oc, p);
769 779
770 pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", 780 pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
771 message, task_pid_nr(p), p->comm, points); 781 message, task_pid_nr(p), p->comm, points);
@@ -786,8 +796,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
786 /* 796 /*
787 * oom_badness() returns 0 if the thread is unkillable 797 * oom_badness() returns 0 if the thread is unkillable
788 */ 798 */
789 child_points = oom_badness(child, memcg, oc->nodemask, 799 child_points = oom_badness(child,
790 totalpages); 800 oc->memcg, oc->nodemask, totalpages);
791 if (child_points > victim_points) { 801 if (child_points > victim_points) {
792 put_task_struct(victim); 802 put_task_struct(victim);
793 victim = child; 803 victim = child;
@@ -865,8 +875,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
865/* 875/*
866 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 876 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
867 */ 877 */
868void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, 878void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint)
869 struct mem_cgroup *memcg)
870{ 879{
871 if (likely(!sysctl_panic_on_oom)) 880 if (likely(!sysctl_panic_on_oom))
872 return; 881 return;
@@ -882,7 +891,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint,
882 /* Do not panic for oom kills triggered by sysrq */ 891 /* Do not panic for oom kills triggered by sysrq */
883 if (is_sysrq_oom(oc)) 892 if (is_sysrq_oom(oc))
884 return; 893 return;
885 dump_header(oc, NULL, memcg); 894 dump_header(oc, NULL);
886 panic("Out of memory: %s panic_on_oom is enabled\n", 895 panic("Out of memory: %s panic_on_oom is enabled\n",
887 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 896 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
888} 897}
@@ -957,13 +966,13 @@ bool out_of_memory(struct oom_control *oc)
957 constraint = constrained_alloc(oc, &totalpages); 966 constraint = constrained_alloc(oc, &totalpages);
958 if (constraint != CONSTRAINT_MEMORY_POLICY) 967 if (constraint != CONSTRAINT_MEMORY_POLICY)
959 oc->nodemask = NULL; 968 oc->nodemask = NULL;
960 check_panic_on_oom(oc, constraint, NULL); 969 check_panic_on_oom(oc, constraint);
961 970
962 if (sysctl_oom_kill_allocating_task && current->mm && 971 if (sysctl_oom_kill_allocating_task && current->mm &&
963 !oom_unkillable_task(current, NULL, oc->nodemask) && 972 !oom_unkillable_task(current, NULL, oc->nodemask) &&
964 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { 973 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
965 get_task_struct(current); 974 get_task_struct(current);
966 oom_kill_process(oc, current, 0, totalpages, NULL, 975 oom_kill_process(oc, current, 0, totalpages,
967 "Out of memory (oom_kill_allocating_task)"); 976 "Out of memory (oom_kill_allocating_task)");
968 return true; 977 return true;
969 } 978 }
@@ -971,12 +980,11 @@ bool out_of_memory(struct oom_control *oc)
971 p = select_bad_process(oc, &points, totalpages); 980 p = select_bad_process(oc, &points, totalpages);
972 /* Found nothing?!?! Either we hang forever, or we panic. */ 981 /* Found nothing?!?! Either we hang forever, or we panic. */
973 if (!p && !is_sysrq_oom(oc)) { 982 if (!p && !is_sysrq_oom(oc)) {
974 dump_header(oc, NULL, NULL); 983 dump_header(oc, NULL);
975 panic("Out of memory and no killable processes...\n"); 984 panic("Out of memory and no killable processes...\n");
976 } 985 }
977 if (p && p != (void *)-1UL) { 986 if (p && p != (void *)-1UL) {
978 oom_kill_process(oc, p, points, totalpages, NULL, 987 oom_kill_process(oc, p, points, totalpages, "Out of memory");
979 "Out of memory");
980 /* 988 /*
981 * Give the killed process a good chance to exit before trying 989 * Give the killed process a good chance to exit before trying
982 * to allocate memory again. 990 * to allocate memory again.
@@ -988,14 +996,15 @@ bool out_of_memory(struct oom_control *oc)
988 996
989/* 997/*
990 * The pagefault handler calls here because it is out of memory, so kill a 998 * The pagefault handler calls here because it is out of memory, so kill a
991 * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a 999 * memory-hogging task. If oom_lock is held by somebody else, a parallel oom
992 * parallel oom killing is already in progress so do nothing. 1000 * killing is already in progress so do nothing.
993 */ 1001 */
994void pagefault_out_of_memory(void) 1002void pagefault_out_of_memory(void)
995{ 1003{
996 struct oom_control oc = { 1004 struct oom_control oc = {
997 .zonelist = NULL, 1005 .zonelist = NULL,
998 .nodemask = NULL, 1006 .nodemask = NULL,
1007 .memcg = NULL,
999 .gfp_mask = 0, 1008 .gfp_mask = 0,
1000 .order = 0, 1009 .order = 0,
1001 }; 1010 };
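
The oom_kill.c hunks above fold the memcg pointer into struct oom_control, so dump_header(), check_panic_on_oom() and oom_kill_process() read oc->memcg instead of taking it as an extra argument. A minimal sketch of a memcg OOM caller under that interface; the enclosing function name is made up and only the fields shown in this patch are assumed:

	/* Sketch only: a memcg OOM path filling in oc->memcg instead of
	 * passing the memcg to every helper.  Field names match the
	 * struct oom_control initializers in this patch. */
	static bool example_mem_cgroup_oom(struct mem_cgroup *memcg,
					   gfp_t gfp_mask, int order)
	{
		struct oom_control oc = {
			.zonelist = NULL,
			.nodemask = NULL,
			.memcg    = memcg,	/* new field carries the context */
			.gfp_mask = gfp_mask,
			.order    = order,
		};

		/* dump_header()/oom_kill_process() now read oc->memcg themselves. */
		return out_of_memory(&oc);
	}
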
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e2481949494c..d578d2a56b19 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2563,6 +2563,7 @@ int set_page_dirty(struct page *page)
2563{ 2563{
2564 struct address_space *mapping = page_mapping(page); 2564 struct address_space *mapping = page_mapping(page);
2565 2565
2566 page = compound_head(page);
2566 if (likely(mapping)) { 2567 if (likely(mapping)) {
2567 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; 2568 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
2568 /* 2569 /*
@@ -2747,6 +2748,11 @@ int test_clear_page_writeback(struct page *page)
2747 __wb_writeout_inc(wb); 2748 __wb_writeout_inc(wb);
2748 } 2749 }
2749 } 2750 }
2751
2752 if (mapping->host && !mapping_tagged(mapping,
2753 PAGECACHE_TAG_WRITEBACK))
2754 sb_clear_inode_writeback(mapping->host);
2755
2750 spin_unlock_irqrestore(&mapping->tree_lock, flags); 2756 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2751 } else { 2757 } else {
2752 ret = TestClearPageWriteback(page); 2758 ret = TestClearPageWriteback(page);
@@ -2774,11 +2780,24 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2774 spin_lock_irqsave(&mapping->tree_lock, flags); 2780 spin_lock_irqsave(&mapping->tree_lock, flags);
2775 ret = TestSetPageWriteback(page); 2781 ret = TestSetPageWriteback(page);
2776 if (!ret) { 2782 if (!ret) {
2783 bool on_wblist;
2784
2785 on_wblist = mapping_tagged(mapping,
2786 PAGECACHE_TAG_WRITEBACK);
2787
2777 radix_tree_tag_set(&mapping->page_tree, 2788 radix_tree_tag_set(&mapping->page_tree,
2778 page_index(page), 2789 page_index(page),
2779 PAGECACHE_TAG_WRITEBACK); 2790 PAGECACHE_TAG_WRITEBACK);
2780 if (bdi_cap_account_writeback(bdi)) 2791 if (bdi_cap_account_writeback(bdi))
2781 __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); 2792 __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
2793
2794 /*
2795 * We can come through here when swapping anonymous
2796 * pages, so we don't necessarily have an inode to track
2797 * for sync.
2798 */
2799 if (mapping->host && !on_wblist)
2800 sb_mark_inode_writeback(mapping->host);
2782 } 2801 }
2783 if (!PageDirty(page)) 2802 if (!PageDirty(page))
2784 radix_tree_tag_clear(&mapping->page_tree, 2803 radix_tree_tag_clear(&mapping->page_tree,
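
The page-writeback.c hunks key sb_mark_inode_writeback()/sb_clear_inode_writeback() on PAGECACHE_TAG_WRITEBACK transitions: an inode is added to its superblock's writeback list when its first page goes under writeback and removed when the last one completes. A standalone sketch of that first/last-transition logic, with a plain counter standing in for the radix-tree tag; all names here are illustrative:

	#include <stdbool.h>
	#include <stdio.h>

	/* Toy model: nr_writeback stands in for "any page tagged
	 * PAGECACHE_TAG_WRITEBACK"; the printfs stand in for
	 * sb_mark_inode_writeback()/sb_clear_inode_writeback(). */
	struct toy_mapping { int nr_writeback; };

	static void set_writeback(struct toy_mapping *m)
	{
		bool on_wblist = m->nr_writeback > 0; /* mapping_tagged() before set */

		m->nr_writeback++;
		if (!on_wblist)
			printf("mark inode for sync (0 -> 1 transition)\n");
	}

	static void clear_writeback(struct toy_mapping *m)
	{
		m->nr_writeback--;
		if (m->nr_writeback == 0)	/* !mapping_tagged() after clear */
			printf("clear inode from sync list (1 -> 0 transition)\n");
	}

	int main(void)
	{
		struct toy_mapping m = { 0 };

		set_writeback(&m);	/* marks the inode */
		set_writeback(&m);	/* no-op: already on the list */
		clear_writeback(&m);	/* no-op: one page still under writeback */
		clear_writeback(&m);	/* clears the inode */
		return 0;
	}
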
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8b3e1341b754..452513bf02ce 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -63,6 +63,7 @@
63#include <linux/sched/rt.h> 63#include <linux/sched/rt.h>
64#include <linux/page_owner.h> 64#include <linux/page_owner.h>
65#include <linux/kthread.h> 65#include <linux/kthread.h>
66#include <linux/memcontrol.h>
66 67
67#include <asm/sections.h> 68#include <asm/sections.h>
68#include <asm/tlbflush.h> 69#include <asm/tlbflush.h>
@@ -1006,6 +1007,8 @@ static __always_inline bool free_pages_prepare(struct page *page,
1006 1007
1007 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); 1008 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
1008 1009
1010 if (compound)
1011 ClearPageDoubleMap(page);
1009 for (i = 1; i < (1 << order); i++) { 1012 for (i = 1; i < (1 << order); i++) {
1010 if (compound) 1013 if (compound)
1011 bad += free_tail_pages_check(page, page + i); 1014 bad += free_tail_pages_check(page, page + i);
@@ -1016,8 +1019,12 @@ static __always_inline bool free_pages_prepare(struct page *page,
1016 (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; 1019 (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1017 } 1020 }
1018 } 1021 }
1019 if (PageAnonHead(page)) 1022 if (PageMappingFlags(page))
1020 page->mapping = NULL; 1023 page->mapping = NULL;
1024 if (memcg_kmem_enabled() && PageKmemcg(page)) {
1025 memcg_kmem_uncharge(page, order);
1026 __ClearPageKmemcg(page);
1027 }
1021 if (check_free) 1028 if (check_free)
1022 bad += free_pages_check(page); 1029 bad += free_pages_check(page);
1023 if (bad) 1030 if (bad)
@@ -1724,6 +1731,19 @@ static bool check_new_pages(struct page *page, unsigned int order)
1724 return false; 1731 return false;
1725} 1732}
1726 1733
1734inline void post_alloc_hook(struct page *page, unsigned int order,
1735 gfp_t gfp_flags)
1736{
1737 set_page_private(page, 0);
1738 set_page_refcounted(page);
1739
1740 arch_alloc_page(page, order);
1741 kernel_map_pages(page, 1 << order, 1);
1742 kernel_poison_pages(page, 1 << order, 1);
1743 kasan_alloc_pages(page, order);
1744 set_page_owner(page, order, gfp_flags);
1745}
1746
1727static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, 1747static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1728 unsigned int alloc_flags) 1748 unsigned int alloc_flags)
1729{ 1749{
@@ -1736,13 +1756,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
1736 poisoned &= page_is_poisoned(p); 1756 poisoned &= page_is_poisoned(p);
1737 } 1757 }
1738 1758
1739 set_page_private(page, 0); 1759 post_alloc_hook(page, order, gfp_flags);
1740 set_page_refcounted(page);
1741
1742 arch_alloc_page(page, order);
1743 kernel_map_pages(page, 1 << order, 1);
1744 kernel_poison_pages(page, 1 << order, 1);
1745 kasan_alloc_pages(page, order);
1746 1760
1747 if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) 1761 if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
1748 for (i = 0; i < (1 << order); i++) 1762 for (i = 0; i < (1 << order); i++)
@@ -1751,8 +1765,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
1751 if (order && (gfp_flags & __GFP_COMP)) 1765 if (order && (gfp_flags & __GFP_COMP))
1752 prep_compound_page(page, order); 1766 prep_compound_page(page, order);
1753 1767
1754 set_page_owner(page, order, gfp_flags);
1755
1756 /* 1768 /*
1757 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to 1769 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
1758 * allocate the page. The expectation is that the caller is taking 1770 * allocate the page. The expectation is that the caller is taking
@@ -2461,7 +2473,6 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
2461void split_page(struct page *page, unsigned int order) 2473void split_page(struct page *page, unsigned int order)
2462{ 2474{
2463 int i; 2475 int i;
2464 gfp_t gfp_mask;
2465 2476
2466 VM_BUG_ON_PAGE(PageCompound(page), page); 2477 VM_BUG_ON_PAGE(PageCompound(page), page);
2467 VM_BUG_ON_PAGE(!page_count(page), page); 2478 VM_BUG_ON_PAGE(!page_count(page), page);
@@ -2475,12 +2486,9 @@ void split_page(struct page *page, unsigned int order)
2475 split_page(virt_to_page(page[0].shadow), order); 2486 split_page(virt_to_page(page[0].shadow), order);
2476#endif 2487#endif
2477 2488
2478 gfp_mask = get_page_owner_gfp(page); 2489 for (i = 1; i < (1 << order); i++)
2479 set_page_owner(page, 0, gfp_mask);
2480 for (i = 1; i < (1 << order); i++) {
2481 set_page_refcounted(page + i); 2490 set_page_refcounted(page + i);
2482 set_page_owner(page + i, 0, gfp_mask); 2491 split_page_owner(page, order);
2483 }
2484} 2492}
2485EXPORT_SYMBOL_GPL(split_page); 2493EXPORT_SYMBOL_GPL(split_page);
2486 2494
@@ -2509,8 +2517,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
2509 zone->free_area[order].nr_free--; 2517 zone->free_area[order].nr_free--;
2510 rmv_page_order(page); 2518 rmv_page_order(page);
2511 2519
2512 set_page_owner(page, order, __GFP_MOVABLE);
2513
2514 /* Set the pageblock if the isolated page is at least a pageblock */ 2520 /* Set the pageblock if the isolated page is at least a pageblock */
2515 if (order >= pageblock_order - 1) { 2521 if (order >= pageblock_order - 1) {
2516 struct page *endpage = page + (1 << order) - 1; 2522 struct page *endpage = page + (1 << order) - 1;
@@ -2527,33 +2533,6 @@ int __isolate_free_page(struct page *page, unsigned int order)
2527} 2533}
2528 2534
2529/* 2535/*
2530 * Similar to split_page except the page is already free. As this is only
2531 * being used for migration, the migratetype of the block also changes.
2532 * As this is called with interrupts disabled, the caller is responsible
2533 * for calling arch_alloc_page() and kernel_map_page() after interrupts
2534 * are enabled.
2535 *
2536 * Note: this is probably too low level an operation for use in drivers.
2537 * Please consult with lkml before using this in your driver.
2538 */
2539int split_free_page(struct page *page)
2540{
2541 unsigned int order;
2542 int nr_pages;
2543
2544 order = page_order(page);
2545
2546 nr_pages = __isolate_free_page(page, order);
2547 if (!nr_pages)
2548 return 0;
2549
2550 /* Split into individual pages */
2551 set_page_refcounted(page);
2552 split_page(page, order);
2553 return nr_pages;
2554}
2555
2556/*
2557 * Update NUMA hit/miss statistics 2536 * Update NUMA hit/miss statistics
2558 * 2537 *
2559 * Must be called with interrupts disabled. 2538 * Must be called with interrupts disabled.
@@ -3105,6 +3084,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
3105 struct oom_control oc = { 3084 struct oom_control oc = {
3106 .zonelist = ac->zonelist, 3085 .zonelist = ac->zonelist,
3107 .nodemask = ac->nodemask, 3086 .nodemask = ac->nodemask,
3087 .memcg = NULL,
3108 .gfp_mask = gfp_mask, 3088 .gfp_mask = gfp_mask,
3109 .order = order, 3089 .order = order,
3110 }; 3090 };
@@ -3868,6 +3848,14 @@ no_zone:
3868 } 3848 }
3869 3849
3870out: 3850out:
3851 if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page) {
3852 if (unlikely(memcg_kmem_charge(page, gfp_mask, order))) {
3853 __free_pages(page, order);
3854 page = NULL;
3855 } else
3856 __SetPageKmemcg(page);
3857 }
3858
3871 if (kmemcheck_enabled && page) 3859 if (kmemcheck_enabled && page)
3872 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 3860 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
3873 3861
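
With __GFP_ACCOUNT handled at the allocator exit above and the uncharge folded into free_pages_prepare() via PageKmemcg, the dedicated alloc_kmem_pages()/free_kmem_pages() helpers removed further down are no longer needed. A hedged sketch of a large, memcg-accounted allocation after this change; the wrapper names are illustrative:

	/* Illustrative only: __GFP_ACCOUNT makes the allocator charge the
	 * pages and mark them PageKmemcg; __free_pages() later uncharges
	 * them in free_pages_prepare(). */
	static void *example_accounted_alloc(size_t size)
	{
		unsigned int order = get_order(size);
		struct page *page = alloc_pages(GFP_KERNEL | __GFP_ACCOUNT, order);

		return page ? page_address(page) : NULL;
	}

	static void example_accounted_free(void *ptr, size_t size)
	{
		if (ptr)
			__free_pages(virt_to_page(ptr), get_order(size));
	}
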
@@ -4023,56 +4011,6 @@ void __free_page_frag(void *addr)
4023} 4011}
4024EXPORT_SYMBOL(__free_page_frag); 4012EXPORT_SYMBOL(__free_page_frag);
4025 4013
4026/*
4027 * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
4028 * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is
4029 * equivalent to alloc_pages.
4030 *
4031 * It should be used when the caller would like to use kmalloc, but since the
4032 * allocation is large, it has to fall back to the page allocator.
4033 */
4034struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
4035{
4036 struct page *page;
4037
4038 page = alloc_pages(gfp_mask, order);
4039 if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
4040 __free_pages(page, order);
4041 page = NULL;
4042 }
4043 return page;
4044}
4045
4046struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
4047{
4048 struct page *page;
4049
4050 page = alloc_pages_node(nid, gfp_mask, order);
4051 if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
4052 __free_pages(page, order);
4053 page = NULL;
4054 }
4055 return page;
4056}
4057
4058/*
4059 * __free_kmem_pages and free_kmem_pages will free pages allocated with
4060 * alloc_kmem_pages.
4061 */
4062void __free_kmem_pages(struct page *page, unsigned int order)
4063{
4064 memcg_kmem_uncharge(page, order);
4065 __free_pages(page, order);
4066}
4067
4068void free_kmem_pages(unsigned long addr, unsigned int order)
4069{
4070 if (addr != 0) {
4071 VM_BUG_ON(!virt_addr_valid((void *)addr));
4072 __free_kmem_pages(virt_to_page((void *)addr), order);
4073 }
4074}
4075
4076static void *make_alloc_exact(unsigned long addr, unsigned int order, 4014static void *make_alloc_exact(unsigned long addr, unsigned int order,
4077 size_t size) 4015 size_t size)
4078{ 4016{
@@ -4374,6 +4312,9 @@ void show_free_areas(unsigned int filter)
4374 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" 4312 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
4375 " slab_reclaimable:%lu slab_unreclaimable:%lu\n" 4313 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
4376 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 4314 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
4315#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4316 " anon_thp: %lu shmem_thp: %lu shmem_pmdmapped: %lu\n"
4317#endif
4377 " free:%lu free_pcp:%lu free_cma:%lu\n", 4318 " free:%lu free_pcp:%lu free_cma:%lu\n",
4378 global_page_state(NR_ACTIVE_ANON), 4319 global_page_state(NR_ACTIVE_ANON),
4379 global_page_state(NR_INACTIVE_ANON), 4320 global_page_state(NR_INACTIVE_ANON),
@@ -4391,6 +4332,11 @@ void show_free_areas(unsigned int filter)
4391 global_page_state(NR_SHMEM), 4332 global_page_state(NR_SHMEM),
4392 global_page_state(NR_PAGETABLE), 4333 global_page_state(NR_PAGETABLE),
4393 global_page_state(NR_BOUNCE), 4334 global_page_state(NR_BOUNCE),
4335#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4336 global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR,
4337 global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR,
4338 global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR,
4339#endif
4394 global_page_state(NR_FREE_PAGES), 4340 global_page_state(NR_FREE_PAGES),
4395 free_pcp, 4341 free_pcp,
4396 global_page_state(NR_FREE_CMA_PAGES)); 4342 global_page_state(NR_FREE_CMA_PAGES));
@@ -4425,6 +4371,11 @@ void show_free_areas(unsigned int filter)
4425 " writeback:%lukB" 4371 " writeback:%lukB"
4426 " mapped:%lukB" 4372 " mapped:%lukB"
4427 " shmem:%lukB" 4373 " shmem:%lukB"
4374#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4375 " shmem_thp: %lukB"
4376 " shmem_pmdmapped: %lukB"
4377 " anon_thp: %lukB"
4378#endif
4428 " slab_reclaimable:%lukB" 4379 " slab_reclaimable:%lukB"
4429 " slab_unreclaimable:%lukB" 4380 " slab_unreclaimable:%lukB"
4430 " kernel_stack:%lukB" 4381 " kernel_stack:%lukB"
@@ -4457,6 +4408,12 @@ void show_free_areas(unsigned int filter)
4457 K(zone_page_state(zone, NR_WRITEBACK)), 4408 K(zone_page_state(zone, NR_WRITEBACK)),
4458 K(zone_page_state(zone, NR_FILE_MAPPED)), 4409 K(zone_page_state(zone, NR_FILE_MAPPED)),
4459 K(zone_page_state(zone, NR_SHMEM)), 4410 K(zone_page_state(zone, NR_SHMEM)),
4411#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4412 K(zone_page_state(zone, NR_SHMEM_THPS) * HPAGE_PMD_NR),
4413 K(zone_page_state(zone, NR_SHMEM_PMDMAPPED)
4414 * HPAGE_PMD_NR),
4415 K(zone_page_state(zone, NR_ANON_THPS) * HPAGE_PMD_NR),
4416#endif
4460 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), 4417 K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
4461 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), 4418 K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
4462 zone_page_state(zone, NR_KERNEL_STACK) * 4419 zone_page_state(zone, NR_KERNEL_STACK) *
@@ -6467,15 +6424,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
6467 sizeof(arch_zone_lowest_possible_pfn)); 6424 sizeof(arch_zone_lowest_possible_pfn));
6468 memset(arch_zone_highest_possible_pfn, 0, 6425 memset(arch_zone_highest_possible_pfn, 0,
6469 sizeof(arch_zone_highest_possible_pfn)); 6426 sizeof(arch_zone_highest_possible_pfn));
6470 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 6427
6471 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 6428 start_pfn = find_min_pfn_with_active_regions();
6472 for (i = 1; i < MAX_NR_ZONES; i++) { 6429
6430 for (i = 0; i < MAX_NR_ZONES; i++) {
6473 if (i == ZONE_MOVABLE) 6431 if (i == ZONE_MOVABLE)
6474 continue; 6432 continue;
6475 arch_zone_lowest_possible_pfn[i] = 6433
6476 arch_zone_highest_possible_pfn[i-1]; 6434 end_pfn = max(max_zone_pfn[i], start_pfn);
6477 arch_zone_highest_possible_pfn[i] = 6435 arch_zone_lowest_possible_pfn[i] = start_pfn;
6478 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 6436 arch_zone_highest_possible_pfn[i] = end_pfn;
6437
6438 start_pfn = end_pfn;
6479 } 6439 }
6480 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; 6440 arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
6481 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; 6441 arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
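
The rewritten free_area_init_nodes() loop derives each zone's PFN range from a running start_pfn and clamps the end with max(), so an empty or overlapped zone collapses to a zero-sized range instead of producing a descending boundary. A standalone sketch of the same arithmetic with made-up PFN limits:

	#include <stdio.h>

	#define MAX_NR_ZONES 3	/* toy setup: DMA, DMA32, NORMAL; no ZONE_MOVABLE */

	static unsigned long max_pfn_value(unsigned long a, unsigned long b)
	{
		return a > b ? a : b;
	}

	int main(void)
	{
		/* Hypothetical per-zone upper limits (in pages). */
		unsigned long max_zone_pfn[MAX_NR_ZONES] = { 4096, 1048576, 1048576 };
		unsigned long lowest[MAX_NR_ZONES], highest[MAX_NR_ZONES];
		unsigned long start_pfn = 16;	/* stand-in for find_min_pfn_with_active_regions() */
		int i;

		for (i = 0; i < MAX_NR_ZONES; i++) {
			unsigned long end_pfn = max_pfn_value(max_zone_pfn[i], start_pfn);

			lowest[i] = start_pfn;
			highest[i] = end_pfn;
			start_pfn = end_pfn;
		}

		/* The last zone ends up as an empty [1048576, 1048576] range here. */
		for (i = 0; i < MAX_NR_ZONES; i++)
			printf("zone %d: [%lu, %lu]\n", i, lowest[i], highest[i]);
		return 0;
	}
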
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 612122bf6a42..064b7fb6e0b5 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -7,6 +7,7 @@
7#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include <linux/hugetlb.h> 9#include <linux/hugetlb.h>
10#include <linux/page_owner.h>
10#include "internal.h" 11#include "internal.h"
11 12
12#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
@@ -80,7 +81,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
80{ 81{
81 struct zone *zone; 82 struct zone *zone;
82 unsigned long flags, nr_pages; 83 unsigned long flags, nr_pages;
83 struct page *isolated_page = NULL; 84 bool isolated_page = false;
84 unsigned int order; 85 unsigned int order;
85 unsigned long page_idx, buddy_idx; 86 unsigned long page_idx, buddy_idx;
86 struct page *buddy; 87 struct page *buddy;
@@ -108,9 +109,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
108 if (pfn_valid_within(page_to_pfn(buddy)) && 109 if (pfn_valid_within(page_to_pfn(buddy)) &&
109 !is_migrate_isolate_page(buddy)) { 110 !is_migrate_isolate_page(buddy)) {
110 __isolate_free_page(page, order); 111 __isolate_free_page(page, order);
111 kernel_map_pages(page, (1 << order), 1); 112 isolated_page = true;
112 set_page_refcounted(page);
113 isolated_page = page;
114 } 113 }
115 } 114 }
116 } 115 }
@@ -128,8 +127,10 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
128 zone->nr_isolate_pageblock--; 127 zone->nr_isolate_pageblock--;
129out: 128out:
130 spin_unlock_irqrestore(&zone->lock, flags); 129 spin_unlock_irqrestore(&zone->lock, flags);
131 if (isolated_page) 130 if (isolated_page) {
132 __free_pages(isolated_page, order); 131 post_alloc_hook(page, order, __GFP_MOVABLE);
132 __free_pages(page, order);
133 }
133} 134}
134 135
135static inline struct page * 136static inline struct page *
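
unset_migratetype_isolate() now funnels an isolated free page through post_alloc_hook() before __free_pages(), instead of open-coding kernel_map_pages() and set_page_refcounted(). A minimal sketch of that pairing, with the zone->lock handling around __isolate_free_page() omitted for brevity; the wrapper is illustrative:

	/* Sketch: take a free page out of the buddy allocator, then hand it
	 * back through the normal allocation-side hook and free path. */
	static void example_bounce_free_page(struct page *page)
	{
		unsigned int order = page_order(page);

		if (!__isolate_free_page(page, order))
			return;

		post_alloc_hook(page, order, __GFP_MOVABLE);
		__free_pages(page, order);
	}
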
diff --git a/mm/page_owner.c b/mm/page_owner.c
index fedeba88c9cb..ec6dc1886f71 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -7,11 +7,22 @@
7#include <linux/page_owner.h> 7#include <linux/page_owner.h>
8#include <linux/jump_label.h> 8#include <linux/jump_label.h>
9#include <linux/migrate.h> 9#include <linux/migrate.h>
10#include <linux/stackdepot.h>
11
10#include "internal.h" 12#include "internal.h"
11 13
14/*
15 * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack)
 16 * to use off-stack temporary storage
17 */
18#define PAGE_OWNER_STACK_DEPTH (16)
19
12static bool page_owner_disabled = true; 20static bool page_owner_disabled = true;
13DEFINE_STATIC_KEY_FALSE(page_owner_inited); 21DEFINE_STATIC_KEY_FALSE(page_owner_inited);
14 22
23static depot_stack_handle_t dummy_handle;
24static depot_stack_handle_t failure_handle;
25
15static void init_early_allocated_pages(void); 26static void init_early_allocated_pages(void);
16 27
17static int early_page_owner_param(char *buf) 28static int early_page_owner_param(char *buf)
@@ -34,11 +45,41 @@ static bool need_page_owner(void)
34 return true; 45 return true;
35} 46}
36 47
48static noinline void register_dummy_stack(void)
49{
50 unsigned long entries[4];
51 struct stack_trace dummy;
52
53 dummy.nr_entries = 0;
54 dummy.max_entries = ARRAY_SIZE(entries);
55 dummy.entries = &entries[0];
56 dummy.skip = 0;
57
58 save_stack_trace(&dummy);
59 dummy_handle = depot_save_stack(&dummy, GFP_KERNEL);
60}
61
62static noinline void register_failure_stack(void)
63{
64 unsigned long entries[4];
65 struct stack_trace failure;
66
67 failure.nr_entries = 0;
68 failure.max_entries = ARRAY_SIZE(entries);
69 failure.entries = &entries[0];
70 failure.skip = 0;
71
72 save_stack_trace(&failure);
73 failure_handle = depot_save_stack(&failure, GFP_KERNEL);
74}
75
37static void init_page_owner(void) 76static void init_page_owner(void)
38{ 77{
39 if (page_owner_disabled) 78 if (page_owner_disabled)
40 return; 79 return;
41 80
81 register_dummy_stack();
82 register_failure_stack();
42 static_branch_enable(&page_owner_inited); 83 static_branch_enable(&page_owner_inited);
43 init_early_allocated_pages(); 84 init_early_allocated_pages();
44} 85}
@@ -61,25 +102,66 @@ void __reset_page_owner(struct page *page, unsigned int order)
61 } 102 }
62} 103}
63 104
64void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) 105static inline bool check_recursive_alloc(struct stack_trace *trace,
106 unsigned long ip)
65{ 107{
66 struct page_ext *page_ext = lookup_page_ext(page); 108 int i, count;
109
110 if (!trace->nr_entries)
111 return false;
112
113 for (i = 0, count = 0; i < trace->nr_entries; i++) {
114 if (trace->entries[i] == ip && ++count == 2)
115 return true;
116 }
117
118 return false;
119}
67 120
121static noinline depot_stack_handle_t save_stack(gfp_t flags)
122{
123 unsigned long entries[PAGE_OWNER_STACK_DEPTH];
68 struct stack_trace trace = { 124 struct stack_trace trace = {
69 .nr_entries = 0, 125 .nr_entries = 0,
70 .max_entries = ARRAY_SIZE(page_ext->trace_entries), 126 .entries = entries,
71 .entries = &page_ext->trace_entries[0], 127 .max_entries = PAGE_OWNER_STACK_DEPTH,
72 .skip = 3, 128 .skip = 0
73 }; 129 };
130 depot_stack_handle_t handle;
131
132 save_stack_trace(&trace);
133 if (trace.nr_entries != 0 &&
134 trace.entries[trace.nr_entries-1] == ULONG_MAX)
135 trace.nr_entries--;
136
137 /*
 138	 * We need to check for recursion here because our request to stackdepot
 139	 * could itself trigger a memory allocation to save the new entry. That
 140	 * allocation would reach here and call depot_save_stack() again if we
 141	 * don't catch it, and since stackdepot would still be short of memory
 142	 * it would keep allocating and loop forever.
143 */
144 if (check_recursive_alloc(&trace, _RET_IP_))
145 return dummy_handle;
146
147 handle = depot_save_stack(&trace, flags);
148 if (!handle)
149 handle = failure_handle;
150
151 return handle;
152}
153
154noinline void __set_page_owner(struct page *page, unsigned int order,
155 gfp_t gfp_mask)
156{
157 struct page_ext *page_ext = lookup_page_ext(page);
74 158
75 if (unlikely(!page_ext)) 159 if (unlikely(!page_ext))
76 return; 160 return;
77 161
78 save_stack_trace(&trace); 162 page_ext->handle = save_stack(gfp_mask);
79
80 page_ext->order = order; 163 page_ext->order = order;
81 page_ext->gfp_mask = gfp_mask; 164 page_ext->gfp_mask = gfp_mask;
82 page_ext->nr_entries = trace.nr_entries;
83 page_ext->last_migrate_reason = -1; 165 page_ext->last_migrate_reason = -1;
84 166
85 __set_bit(PAGE_EXT_OWNER, &page_ext->flags); 167 __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
@@ -94,34 +176,31 @@ void __set_page_owner_migrate_reason(struct page *page, int reason)
94 page_ext->last_migrate_reason = reason; 176 page_ext->last_migrate_reason = reason;
95} 177}
96 178
97gfp_t __get_page_owner_gfp(struct page *page) 179void __split_page_owner(struct page *page, unsigned int order)
98{ 180{
181 int i;
99 struct page_ext *page_ext = lookup_page_ext(page); 182 struct page_ext *page_ext = lookup_page_ext(page);
183
100 if (unlikely(!page_ext)) 184 if (unlikely(!page_ext))
101 /* 185 return;
102 * The caller just returns 0 if no valid gfp
103 * So return 0 here too.
104 */
105 return 0;
106 186
107 return page_ext->gfp_mask; 187 page_ext->order = 0;
188 for (i = 1; i < (1 << order); i++)
189 __copy_page_owner(page, page + i);
108} 190}
109 191
110void __copy_page_owner(struct page *oldpage, struct page *newpage) 192void __copy_page_owner(struct page *oldpage, struct page *newpage)
111{ 193{
112 struct page_ext *old_ext = lookup_page_ext(oldpage); 194 struct page_ext *old_ext = lookup_page_ext(oldpage);
113 struct page_ext *new_ext = lookup_page_ext(newpage); 195 struct page_ext *new_ext = lookup_page_ext(newpage);
114 int i;
115 196
116 if (unlikely(!old_ext || !new_ext)) 197 if (unlikely(!old_ext || !new_ext))
117 return; 198 return;
118 199
119 new_ext->order = old_ext->order; 200 new_ext->order = old_ext->order;
120 new_ext->gfp_mask = old_ext->gfp_mask; 201 new_ext->gfp_mask = old_ext->gfp_mask;
121 new_ext->nr_entries = old_ext->nr_entries; 202 new_ext->last_migrate_reason = old_ext->last_migrate_reason;
122 203 new_ext->handle = old_ext->handle;
123 for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++)
124 new_ext->trace_entries[i] = old_ext->trace_entries[i];
125 204
126 /* 205 /*
127 * We don't clear the bit on the oldpage as it's going to be freed 206 * We don't clear the bit on the oldpage as it's going to be freed
@@ -137,14 +216,18 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
137 216
138static ssize_t 217static ssize_t
139print_page_owner(char __user *buf, size_t count, unsigned long pfn, 218print_page_owner(char __user *buf, size_t count, unsigned long pfn,
140 struct page *page, struct page_ext *page_ext) 219 struct page *page, struct page_ext *page_ext,
220 depot_stack_handle_t handle)
141{ 221{
142 int ret; 222 int ret;
143 int pageblock_mt, page_mt; 223 int pageblock_mt, page_mt;
144 char *kbuf; 224 char *kbuf;
225 unsigned long entries[PAGE_OWNER_STACK_DEPTH];
145 struct stack_trace trace = { 226 struct stack_trace trace = {
146 .nr_entries = page_ext->nr_entries, 227 .nr_entries = 0,
147 .entries = &page_ext->trace_entries[0], 228 .entries = entries,
229 .max_entries = PAGE_OWNER_STACK_DEPTH,
230 .skip = 0
148 }; 231 };
149 232
150 kbuf = kmalloc(count, GFP_KERNEL); 233 kbuf = kmalloc(count, GFP_KERNEL);
@@ -173,6 +256,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
173 if (ret >= count) 256 if (ret >= count)
174 goto err; 257 goto err;
175 258
259 depot_fetch_stack(handle, &trace);
176 ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); 260 ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
177 if (ret >= count) 261 if (ret >= count)
178 goto err; 262 goto err;
@@ -203,10 +287,14 @@ err:
203void __dump_page_owner(struct page *page) 287void __dump_page_owner(struct page *page)
204{ 288{
205 struct page_ext *page_ext = lookup_page_ext(page); 289 struct page_ext *page_ext = lookup_page_ext(page);
290 unsigned long entries[PAGE_OWNER_STACK_DEPTH];
206 struct stack_trace trace = { 291 struct stack_trace trace = {
207 .nr_entries = page_ext->nr_entries, 292 .nr_entries = 0,
208 .entries = &page_ext->trace_entries[0], 293 .entries = entries,
294 .max_entries = PAGE_OWNER_STACK_DEPTH,
295 .skip = 0
209 }; 296 };
297 depot_stack_handle_t handle;
210 gfp_t gfp_mask; 298 gfp_t gfp_mask;
211 int mt; 299 int mt;
212 300
@@ -222,6 +310,13 @@ void __dump_page_owner(struct page *page)
222 return; 310 return;
223 } 311 }
224 312
313 handle = READ_ONCE(page_ext->handle);
314 if (!handle) {
315 pr_alert("page_owner info is not active (free page?)\n");
316 return;
317 }
318
319 depot_fetch_stack(handle, &trace);
225 pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", 320 pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
226 page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask); 321 page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask);
227 print_stack_trace(&trace, 0); 322 print_stack_trace(&trace, 0);
@@ -237,6 +332,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
237 unsigned long pfn; 332 unsigned long pfn;
238 struct page *page; 333 struct page *page;
239 struct page_ext *page_ext; 334 struct page_ext *page_ext;
335 depot_stack_handle_t handle;
240 336
241 if (!static_branch_unlikely(&page_owner_inited)) 337 if (!static_branch_unlikely(&page_owner_inited))
242 return -EINVAL; 338 return -EINVAL;
@@ -285,10 +381,19 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos)
285 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) 381 if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
286 continue; 382 continue;
287 383
384 /*
 385	 * Access to page_ext->handle isn't synchronized, so be
 386	 * careful when reading it.
387 */
388 handle = READ_ONCE(page_ext->handle);
389 if (!handle)
390 continue;
391
288 /* Record the next PFN to read in the file offset */ 392 /* Record the next PFN to read in the file offset */
289 *ppos = (pfn - min_low_pfn) + 1; 393 *ppos = (pfn - min_low_pfn) + 1;
290 394
291 return print_page_owner(buf, count, pfn, page, page_ext); 395 return print_page_owner(buf, count, pfn, page,
396 page_ext, handle);
292 } 397 }
293 398
294 return 0; 399 return 0;
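
page_owner now keeps one depot_stack_handle_t per page instead of an inline array of trace entries: the stack is captured with save_stack_trace(), deduplicated by depot_save_stack(), and expanded on demand with depot_fetch_stack(). A condensed sketch of that round-trip, following the same fixed-depth local buffer pattern as the patch; the example_ function names are illustrative:

	#define EXAMPLE_STACK_DEPTH 16

	static depot_stack_handle_t example_record_stack(gfp_t flags)
	{
		unsigned long entries[EXAMPLE_STACK_DEPTH];
		struct stack_trace trace = {
			.entries	= entries,
			.max_entries	= EXAMPLE_STACK_DEPTH,
		};

		save_stack_trace(&trace);
		return depot_save_stack(&trace, flags);	/* 0 on failure */
	}

	static void example_print_stack(depot_stack_handle_t handle)
	{
		unsigned long entries[EXAMPLE_STACK_DEPTH];
		struct stack_trace trace = {
			.entries	= entries,
			.max_entries	= EXAMPLE_STACK_DEPTH,
		};

		depot_fetch_stack(handle, &trace);
		print_stack_trace(&trace, 0);
	}
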
diff --git a/mm/readahead.c b/mm/readahead.c
index 40be3ae0afe3..65ec288dc057 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -89,7 +89,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
89 page = lru_to_page(pages); 89 page = lru_to_page(pages);
90 list_del(&page->lru); 90 list_del(&page->lru);
91 if (add_to_page_cache_lru(page, mapping, page->index, 91 if (add_to_page_cache_lru(page, mapping, page->index,
92 mapping_gfp_constraint(mapping, GFP_KERNEL))) { 92 readahead_gfp_mask(mapping))) {
93 read_cache_pages_invalidate_page(mapping, page); 93 read_cache_pages_invalidate_page(mapping, page);
94 continue; 94 continue;
95 } 95 }
@@ -108,7 +108,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
108EXPORT_SYMBOL(read_cache_pages); 108EXPORT_SYMBOL(read_cache_pages);
109 109
110static int read_pages(struct address_space *mapping, struct file *filp, 110static int read_pages(struct address_space *mapping, struct file *filp,
111 struct list_head *pages, unsigned nr_pages) 111 struct list_head *pages, unsigned int nr_pages, gfp_t gfp)
112{ 112{
113 struct blk_plug plug; 113 struct blk_plug plug;
114 unsigned page_idx; 114 unsigned page_idx;
@@ -126,10 +126,8 @@ static int read_pages(struct address_space *mapping, struct file *filp,
126 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 126 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
127 struct page *page = lru_to_page(pages); 127 struct page *page = lru_to_page(pages);
128 list_del(&page->lru); 128 list_del(&page->lru);
129 if (!add_to_page_cache_lru(page, mapping, page->index, 129 if (!add_to_page_cache_lru(page, mapping, page->index, gfp))
130 mapping_gfp_constraint(mapping, GFP_KERNEL))) {
131 mapping->a_ops->readpage(filp, page); 130 mapping->a_ops->readpage(filp, page);
132 }
133 put_page(page); 131 put_page(page);
134 } 132 }
135 ret = 0; 133 ret = 0;
@@ -159,6 +157,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
159 int page_idx; 157 int page_idx;
160 int ret = 0; 158 int ret = 0;
161 loff_t isize = i_size_read(inode); 159 loff_t isize = i_size_read(inode);
160 gfp_t gfp_mask = readahead_gfp_mask(mapping);
162 161
163 if (isize == 0) 162 if (isize == 0)
164 goto out; 163 goto out;
@@ -180,7 +179,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
180 if (page && !radix_tree_exceptional_entry(page)) 179 if (page && !radix_tree_exceptional_entry(page))
181 continue; 180 continue;
182 181
183 page = page_cache_alloc_readahead(mapping); 182 page = __page_cache_alloc(gfp_mask);
184 if (!page) 183 if (!page)
185 break; 184 break;
186 page->index = page_offset; 185 page->index = page_offset;
@@ -196,7 +195,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
196 * will then handle the error. 195 * will then handle the error.
197 */ 196 */
198 if (ret) 197 if (ret)
199 read_pages(mapping, filp, &page_pool, ret); 198 read_pages(mapping, filp, &page_pool, ret, gfp_mask);
200 BUG_ON(!list_empty(&page_pool)); 199 BUG_ON(!list_empty(&page_pool));
201out: 200out:
202 return ret; 201 return ret;
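
Both the readahead page allocation and the page-cache insertion now use the same readahead_gfp_mask(mapping), so the radix-tree nodes allocated by add_to_page_cache_lru() obey the same constraints as the pages themselves (the helper presumably builds on mapping_gfp_mask() and adds best-effort flags; its definition is not part of this hunk). A sketch of the resulting allocate-then-insert pattern; the wrapper is illustrative:

	/* Sketch: allocate and insert a readahead page with one gfp mask. */
	static void example_readahead_one(struct address_space *mapping,
					  struct file *filp, pgoff_t index)
	{
		gfp_t gfp = readahead_gfp_mask(mapping);
		struct page *page = __page_cache_alloc(gfp);

		if (!page)
			return;

		if (!add_to_page_cache_lru(page, mapping, index, gfp))
			mapping->a_ops->readpage(filp, page);
		put_page(page);
	}
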
diff --git a/mm/rmap.c b/mm/rmap.c
index 701b93fea2a0..8a13d9f7b566 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1212,10 +1212,8 @@ void do_page_add_anon_rmap(struct page *page,
1212 * pte lock(a spinlock) is held, which implies preemption 1212 * pte lock(a spinlock) is held, which implies preemption
1213 * disabled. 1213 * disabled.
1214 */ 1214 */
1215 if (compound) { 1215 if (compound)
1216 __inc_zone_page_state(page, 1216 __inc_zone_page_state(page, NR_ANON_THPS);
1217 NR_ANON_TRANSPARENT_HUGEPAGES);
1218 }
1219 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); 1217 __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr);
1220 } 1218 }
1221 if (unlikely(PageKsm(page))) 1219 if (unlikely(PageKsm(page)))
@@ -1253,7 +1251,7 @@ void page_add_new_anon_rmap(struct page *page,
1253 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 1251 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
1254 /* increment count (starts at -1) */ 1252 /* increment count (starts at -1) */
1255 atomic_set(compound_mapcount_ptr(page), 0); 1253 atomic_set(compound_mapcount_ptr(page), 0);
1256 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1254 __inc_zone_page_state(page, NR_ANON_THPS);
1257 } else { 1255 } else {
1258 /* Anon THP always mapped first with PMD */ 1256 /* Anon THP always mapped first with PMD */
1259 VM_BUG_ON_PAGE(PageTransCompound(page), page); 1257 VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -1270,18 +1268,42 @@ void page_add_new_anon_rmap(struct page *page,
1270 * 1268 *
1271 * The caller needs to hold the pte lock. 1269 * The caller needs to hold the pte lock.
1272 */ 1270 */
1273void page_add_file_rmap(struct page *page) 1271void page_add_file_rmap(struct page *page, bool compound)
1274{ 1272{
1273 int i, nr = 1;
1274
1275 VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
1275 lock_page_memcg(page); 1276 lock_page_memcg(page);
1276 if (atomic_inc_and_test(&page->_mapcount)) { 1277 if (compound && PageTransHuge(page)) {
1277 __inc_zone_page_state(page, NR_FILE_MAPPED); 1278 for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
1278 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); 1279 if (atomic_inc_and_test(&page[i]._mapcount))
1280 nr++;
1281 }
1282 if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
1283 goto out;
1284 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
1285 __inc_zone_page_state(page, NR_SHMEM_PMDMAPPED);
1286 } else {
1287 if (PageTransCompound(page)) {
1288 VM_BUG_ON_PAGE(!PageLocked(page), page);
1289 SetPageDoubleMap(compound_head(page));
1290 if (PageMlocked(page))
1291 clear_page_mlock(compound_head(page));
1292 }
1293 if (!atomic_inc_and_test(&page->_mapcount))
1294 goto out;
1279 } 1295 }
1296 __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, nr);
1297 mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
1298out:
1280 unlock_page_memcg(page); 1299 unlock_page_memcg(page);
1281} 1300}
1282 1301
1283static void page_remove_file_rmap(struct page *page) 1302static void page_remove_file_rmap(struct page *page, bool compound)
1284{ 1303{
1304 int i, nr = 1;
1305
1306 VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
1285 lock_page_memcg(page); 1307 lock_page_memcg(page);
1286 1308
1287 /* Hugepages are not counted in NR_FILE_MAPPED for now. */ 1309 /* Hugepages are not counted in NR_FILE_MAPPED for now. */
@@ -1292,15 +1314,26 @@ static void page_remove_file_rmap(struct page *page)
1292 } 1314 }
1293 1315
1294 /* page still mapped by someone else? */ 1316 /* page still mapped by someone else? */
1295 if (!atomic_add_negative(-1, &page->_mapcount)) 1317 if (compound && PageTransHuge(page)) {
1296 goto out; 1318 for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
1319 if (atomic_add_negative(-1, &page[i]._mapcount))
1320 nr++;
1321 }
1322 if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1323 goto out;
1324 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
1325 __dec_zone_page_state(page, NR_SHMEM_PMDMAPPED);
1326 } else {
1327 if (!atomic_add_negative(-1, &page->_mapcount))
1328 goto out;
1329 }
1297 1330
1298 /* 1331 /*
1299 * We use the irq-unsafe __{inc|mod}_zone_page_stat because 1332 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1300 * these counters are not modified in interrupt context, and 1333 * these counters are not modified in interrupt context, and
1301 * pte lock(a spinlock) is held, which implies preemption disabled. 1334 * pte lock(a spinlock) is held, which implies preemption disabled.
1302 */ 1335 */
1303 __dec_zone_page_state(page, NR_FILE_MAPPED); 1336 __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, -nr);
1304 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); 1337 mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
1305 1338
1306 if (unlikely(PageMlocked(page))) 1339 if (unlikely(PageMlocked(page)))
@@ -1323,7 +1356,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
1323 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 1356 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1324 return; 1357 return;
1325 1358
1326 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1359 __dec_zone_page_state(page, NR_ANON_THPS);
1327 1360
1328 if (TestClearPageDoubleMap(page)) { 1361 if (TestClearPageDoubleMap(page)) {
1329 /* 1362 /*
@@ -1356,11 +1389,8 @@ static void page_remove_anon_compound_rmap(struct page *page)
1356 */ 1389 */
1357void page_remove_rmap(struct page *page, bool compound) 1390void page_remove_rmap(struct page *page, bool compound)
1358{ 1391{
1359 if (!PageAnon(page)) { 1392 if (!PageAnon(page))
1360 VM_BUG_ON_PAGE(compound && !PageHuge(page), page); 1393 return page_remove_file_rmap(page, compound);
1361 page_remove_file_rmap(page);
1362 return;
1363 }
1364 1394
1365 if (compound) 1395 if (compound)
1366 return page_remove_anon_compound_rmap(page); 1396 return page_remove_anon_compound_rmap(page);
@@ -1436,8 +1466,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1436 */ 1466 */
1437 if (!(flags & TTU_IGNORE_MLOCK)) { 1467 if (!(flags & TTU_IGNORE_MLOCK)) {
1438 if (vma->vm_flags & VM_LOCKED) { 1468 if (vma->vm_flags & VM_LOCKED) {
1439 /* Holding pte lock, we do *not* need mmap_sem here */ 1469 /* PTE-mapped THP are never mlocked */
1440 mlock_vma_page(page); 1470 if (!PageTransCompound(page)) {
1471 /*
1472 * Holding pte lock, we do *not* need
1473 * mmap_sem here
1474 */
1475 mlock_vma_page(page);
1476 }
1441 ret = SWAP_MLOCK; 1477 ret = SWAP_MLOCK;
1442 goto out_unmap; 1478 goto out_unmap;
1443 } 1479 }
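
page_add_file_rmap() and page_remove_file_rmap() gain a compound flag: a PMD-mapped file THP bumps compound_mapcount and NR_SHMEM_PMDMAPPED, while NR_FILE_MAPPED is adjusted by the number of subpages whose mapcount actually crossed zero. A sketch of the two call shapes a caller would use, assuming the pte/pmd lock is already held as the comments above require; the wrappers are illustrative:

	/* Sketch only: rmap accounting for file pages after this change. */
	static void example_map_file_page(struct page *page, bool pmd_mapped)
	{
		/* pmd_mapped: 'page' is a THP head mapped by a single PMD entry;
		 * otherwise it is an individual page mapped by a PTE. */
		page_add_file_rmap(page, pmd_mapped);
	}

	static void example_unmap_file_page(struct page *page, bool pmd_mapped)
	{
		page_remove_rmap(page, pmd_mapped);
	}
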
diff --git a/mm/shmem.c b/mm/shmem.c
index 171dee7a131f..62e42c7d544c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -32,6 +32,7 @@
32#include <linux/export.h> 32#include <linux/export.h>
33#include <linux/swap.h> 33#include <linux/swap.h>
34#include <linux/uio.h> 34#include <linux/uio.h>
35#include <linux/khugepaged.h>
35 36
36static struct vfsmount *shm_mnt; 37static struct vfsmount *shm_mnt;
37 38
@@ -97,14 +98,6 @@ struct shmem_falloc {
97 pgoff_t nr_unswapped; /* how often writepage refused to swap out */ 98 pgoff_t nr_unswapped; /* how often writepage refused to swap out */
98}; 99};
99 100
100/* Flag allocation requirements to shmem_getpage */
101enum sgp_type {
102 SGP_READ, /* don't exceed i_size, don't allocate page */
103 SGP_CACHE, /* don't exceed i_size, may allocate page */
104 SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
105 SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
106};
107
108#ifdef CONFIG_TMPFS 101#ifdef CONFIG_TMPFS
109static unsigned long shmem_default_max_blocks(void) 102static unsigned long shmem_default_max_blocks(void)
110{ 103{
@@ -124,7 +117,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
124 struct page **pagep, enum sgp_type sgp, 117 struct page **pagep, enum sgp_type sgp,
125 gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); 118 gfp_t gfp, struct mm_struct *fault_mm, int *fault_type);
126 119
127static inline int shmem_getpage(struct inode *inode, pgoff_t index, 120int shmem_getpage(struct inode *inode, pgoff_t index,
128 struct page **pagep, enum sgp_type sgp) 121 struct page **pagep, enum sgp_type sgp)
129{ 122{
130 return shmem_getpage_gfp(inode, index, pagep, sgp, 123 return shmem_getpage_gfp(inode, index, pagep, sgp,
@@ -173,10 +166,13 @@ static inline int shmem_reacct_size(unsigned long flags,
173 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, 166 * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
174 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. 167 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
175 */ 168 */
176static inline int shmem_acct_block(unsigned long flags) 169static inline int shmem_acct_block(unsigned long flags, long pages)
177{ 170{
178 return (flags & VM_NORESERVE) ? 171 if (!(flags & VM_NORESERVE))
179 security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_SIZE)) : 0; 172 return 0;
173
174 return security_vm_enough_memory_mm(current->mm,
175 pages * VM_ACCT(PAGE_SIZE));
180} 176}
181 177
182static inline void shmem_unacct_blocks(unsigned long flags, long pages) 178static inline void shmem_unacct_blocks(unsigned long flags, long pages)
@@ -192,6 +188,7 @@ static const struct inode_operations shmem_inode_operations;
192static const struct inode_operations shmem_dir_inode_operations; 188static const struct inode_operations shmem_dir_inode_operations;
193static const struct inode_operations shmem_special_inode_operations; 189static const struct inode_operations shmem_special_inode_operations;
194static const struct vm_operations_struct shmem_vm_ops; 190static const struct vm_operations_struct shmem_vm_ops;
191static struct file_system_type shmem_fs_type;
195 192
196static LIST_HEAD(shmem_swaplist); 193static LIST_HEAD(shmem_swaplist);
197static DEFINE_MUTEX(shmem_swaplist_mutex); 194static DEFINE_MUTEX(shmem_swaplist_mutex);
@@ -249,6 +246,53 @@ static void shmem_recalc_inode(struct inode *inode)
249 } 246 }
250} 247}
251 248
249bool shmem_charge(struct inode *inode, long pages)
250{
251 struct shmem_inode_info *info = SHMEM_I(inode);
252 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
253 unsigned long flags;
254
255 if (shmem_acct_block(info->flags, pages))
256 return false;
257 spin_lock_irqsave(&info->lock, flags);
258 info->alloced += pages;
259 inode->i_blocks += pages * BLOCKS_PER_PAGE;
260 shmem_recalc_inode(inode);
261 spin_unlock_irqrestore(&info->lock, flags);
262 inode->i_mapping->nrpages += pages;
263
264 if (!sbinfo->max_blocks)
265 return true;
266 if (percpu_counter_compare(&sbinfo->used_blocks,
267 sbinfo->max_blocks - pages) > 0) {
268 inode->i_mapping->nrpages -= pages;
269 spin_lock_irqsave(&info->lock, flags);
270 info->alloced -= pages;
271 shmem_recalc_inode(inode);
272 spin_unlock_irqrestore(&info->lock, flags);
273
274 return false;
275 }
276 percpu_counter_add(&sbinfo->used_blocks, pages);
277 return true;
278}
279
280void shmem_uncharge(struct inode *inode, long pages)
281{
282 struct shmem_inode_info *info = SHMEM_I(inode);
283 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
284 unsigned long flags;
285
286 spin_lock_irqsave(&info->lock, flags);
287 info->alloced -= pages;
288 inode->i_blocks -= pages * BLOCKS_PER_PAGE;
289 shmem_recalc_inode(inode);
290 spin_unlock_irqrestore(&info->lock, flags);
291
292 if (sbinfo->max_blocks)
293 percpu_counter_sub(&sbinfo->used_blocks, pages);
294}
295
252/* 296/*
253 * Replace item expected in radix tree by a new item, while holding tree lock. 297 * Replace item expected in radix tree by a new item, while holding tree lock.
254 */ 298 */
@@ -289,36 +333,256 @@ static bool shmem_confirm_swap(struct address_space *mapping,
289} 333}
290 334
291/* 335/*
336 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
337 *
338 * SHMEM_HUGE_NEVER:
339 * disables huge pages for the mount;
340 * SHMEM_HUGE_ALWAYS:
341 * enables huge pages for the mount;
342 * SHMEM_HUGE_WITHIN_SIZE:
343 * only allocate huge pages if the page will be fully within i_size,
344 * also respect fadvise()/madvise() hints;
345 * SHMEM_HUGE_ADVISE:
346 * only allocate huge pages if requested with fadvise()/madvise();
347 */
348
349#define SHMEM_HUGE_NEVER 0
350#define SHMEM_HUGE_ALWAYS 1
351#define SHMEM_HUGE_WITHIN_SIZE 2
352#define SHMEM_HUGE_ADVISE 3
353
354/*
355 * Special values.
 356 * Can only be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
357 *
358 * SHMEM_HUGE_DENY:
359 * disables huge on shm_mnt and all mounts, for emergency use;
360 * SHMEM_HUGE_FORCE:
361 * enables huge on shm_mnt and all mounts, w/o needing option, for testing;
362 *
363 */
364#define SHMEM_HUGE_DENY (-1)
365#define SHMEM_HUGE_FORCE (-2)
366
367#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
368/* ifdef here to avoid bloating shmem.o when not necessary */
369
370int shmem_huge __read_mostly;
371
372static int shmem_parse_huge(const char *str)
373{
374 if (!strcmp(str, "never"))
375 return SHMEM_HUGE_NEVER;
376 if (!strcmp(str, "always"))
377 return SHMEM_HUGE_ALWAYS;
378 if (!strcmp(str, "within_size"))
379 return SHMEM_HUGE_WITHIN_SIZE;
380 if (!strcmp(str, "advise"))
381 return SHMEM_HUGE_ADVISE;
382 if (!strcmp(str, "deny"))
383 return SHMEM_HUGE_DENY;
384 if (!strcmp(str, "force"))
385 return SHMEM_HUGE_FORCE;
386 return -EINVAL;
387}
388
389static const char *shmem_format_huge(int huge)
390{
391 switch (huge) {
392 case SHMEM_HUGE_NEVER:
393 return "never";
394 case SHMEM_HUGE_ALWAYS:
395 return "always";
396 case SHMEM_HUGE_WITHIN_SIZE:
397 return "within_size";
398 case SHMEM_HUGE_ADVISE:
399 return "advise";
400 case SHMEM_HUGE_DENY:
401 return "deny";
402 case SHMEM_HUGE_FORCE:
403 return "force";
404 default:
405 VM_BUG_ON(1);
406 return "bad_val";
407 }
408}
409
410static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
411 struct shrink_control *sc, unsigned long nr_to_split)
412{
413 LIST_HEAD(list), *pos, *next;
414 struct inode *inode;
415 struct shmem_inode_info *info;
416 struct page *page;
417 unsigned long batch = sc ? sc->nr_to_scan : 128;
418 int removed = 0, split = 0;
419
420 if (list_empty(&sbinfo->shrinklist))
421 return SHRINK_STOP;
422
423 spin_lock(&sbinfo->shrinklist_lock);
424 list_for_each_safe(pos, next, &sbinfo->shrinklist) {
425 info = list_entry(pos, struct shmem_inode_info, shrinklist);
426
427 /* pin the inode */
428 inode = igrab(&info->vfs_inode);
429
430 /* inode is about to be evicted */
431 if (!inode) {
432 list_del_init(&info->shrinklist);
433 removed++;
434 goto next;
435 }
436
437 /* Check if there's anything to gain */
438 if (round_up(inode->i_size, PAGE_SIZE) ==
439 round_up(inode->i_size, HPAGE_PMD_SIZE)) {
440 list_del_init(&info->shrinklist);
441 removed++;
442 iput(inode);
443 goto next;
444 }
445
446 list_move(&info->shrinklist, &list);
447next:
448 if (!--batch)
449 break;
450 }
451 spin_unlock(&sbinfo->shrinklist_lock);
452
453 list_for_each_safe(pos, next, &list) {
454 int ret;
455
456 info = list_entry(pos, struct shmem_inode_info, shrinklist);
457 inode = &info->vfs_inode;
458
459 if (nr_to_split && split >= nr_to_split) {
460 iput(inode);
461 continue;
462 }
463
464 page = find_lock_page(inode->i_mapping,
465 (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
466 if (!page)
467 goto drop;
468
469 if (!PageTransHuge(page)) {
470 unlock_page(page);
471 put_page(page);
472 goto drop;
473 }
474
475 ret = split_huge_page(page);
476 unlock_page(page);
477 put_page(page);
478
479 if (ret) {
480 /* split failed: leave it on the list */
481 iput(inode);
482 continue;
483 }
484
485 split++;
486drop:
487 list_del_init(&info->shrinklist);
488 removed++;
489 iput(inode);
490 }
491
492 spin_lock(&sbinfo->shrinklist_lock);
493 list_splice_tail(&list, &sbinfo->shrinklist);
494 sbinfo->shrinklist_len -= removed;
495 spin_unlock(&sbinfo->shrinklist_lock);
496
497 return split;
498}
499
500static long shmem_unused_huge_scan(struct super_block *sb,
501 struct shrink_control *sc)
502{
503 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
504
505 if (!READ_ONCE(sbinfo->shrinklist_len))
506 return SHRINK_STOP;
507
508 return shmem_unused_huge_shrink(sbinfo, sc, 0);
509}
510
511static long shmem_unused_huge_count(struct super_block *sb,
512 struct shrink_control *sc)
513{
514 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
515 return READ_ONCE(sbinfo->shrinklist_len);
516}
517#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
518
519#define shmem_huge SHMEM_HUGE_DENY
520
521static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
522 struct shrink_control *sc, unsigned long nr_to_split)
523{
524 return 0;
525}
526#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
527
528/*
292 * Like add_to_page_cache_locked, but error if expected item has gone. 529 * Like add_to_page_cache_locked, but error if expected item has gone.
293 */ 530 */
294static int shmem_add_to_page_cache(struct page *page, 531static int shmem_add_to_page_cache(struct page *page,
295 struct address_space *mapping, 532 struct address_space *mapping,
296 pgoff_t index, void *expected) 533 pgoff_t index, void *expected)
297{ 534{
298 int error; 535 int error, nr = hpage_nr_pages(page);
299 536
537 VM_BUG_ON_PAGE(PageTail(page), page);
538 VM_BUG_ON_PAGE(index != round_down(index, nr), page);
300 VM_BUG_ON_PAGE(!PageLocked(page), page); 539 VM_BUG_ON_PAGE(!PageLocked(page), page);
301 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 540 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
541 VM_BUG_ON(expected && PageTransHuge(page));
302 542
303 get_page(page); 543 page_ref_add(page, nr);
304 page->mapping = mapping; 544 page->mapping = mapping;
305 page->index = index; 545 page->index = index;
306 546
307 spin_lock_irq(&mapping->tree_lock); 547 spin_lock_irq(&mapping->tree_lock);
308 if (!expected) 548 if (PageTransHuge(page)) {
549 void __rcu **results;
550 pgoff_t idx;
551 int i;
552
553 error = 0;
554 if (radix_tree_gang_lookup_slot(&mapping->page_tree,
555 &results, &idx, index, 1) &&
556 idx < index + HPAGE_PMD_NR) {
557 error = -EEXIST;
558 }
559
560 if (!error) {
561 for (i = 0; i < HPAGE_PMD_NR; i++) {
562 error = radix_tree_insert(&mapping->page_tree,
563 index + i, page + i);
564 VM_BUG_ON(error);
565 }
566 count_vm_event(THP_FILE_ALLOC);
567 }
568 } else if (!expected) {
309 error = radix_tree_insert(&mapping->page_tree, index, page); 569 error = radix_tree_insert(&mapping->page_tree, index, page);
310 else 570 } else {
311 error = shmem_radix_tree_replace(mapping, index, expected, 571 error = shmem_radix_tree_replace(mapping, index, expected,
312 page); 572 page);
573 }
574
313 if (!error) { 575 if (!error) {
314 mapping->nrpages++; 576 mapping->nrpages += nr;
315 __inc_zone_page_state(page, NR_FILE_PAGES); 577 if (PageTransHuge(page))
316 __inc_zone_page_state(page, NR_SHMEM); 578 __inc_zone_page_state(page, NR_SHMEM_THPS);
579 __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, nr);
580 __mod_zone_page_state(page_zone(page), NR_SHMEM, nr);
317 spin_unlock_irq(&mapping->tree_lock); 581 spin_unlock_irq(&mapping->tree_lock);
318 } else { 582 } else {
319 page->mapping = NULL; 583 page->mapping = NULL;
320 spin_unlock_irq(&mapping->tree_lock); 584 spin_unlock_irq(&mapping->tree_lock);
321 put_page(page); 585 page_ref_sub(page, nr);
322 } 586 }
323 return error; 587 return error;
324} 588}
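
shmem_parse_huge() and shmem_format_huge() translate between the huge= mount option strings (plus the shmem_enabled sysfs values) and the SHMEM_HUGE_* constants. A standalone sketch of the same mapping, trimmed to the mount-visible values, e.g. for "mount -t tmpfs -o huge=within_size tmpfs /mnt":

	#include <stdio.h>
	#include <string.h>

	#define SHMEM_HUGE_NEVER	0
	#define SHMEM_HUGE_ALWAYS	1
	#define SHMEM_HUGE_WITHIN_SIZE	2
	#define SHMEM_HUGE_ADVISE	3

	/* Mirrors shmem_parse_huge() for the mount-visible values. */
	static int parse_huge(const char *str)
	{
		if (!strcmp(str, "never"))
			return SHMEM_HUGE_NEVER;
		if (!strcmp(str, "always"))
			return SHMEM_HUGE_ALWAYS;
		if (!strcmp(str, "within_size"))
			return SHMEM_HUGE_WITHIN_SIZE;
		if (!strcmp(str, "advise"))
			return SHMEM_HUGE_ADVISE;
		return -1;	/* -EINVAL in the kernel */
	}

	int main(void)
	{
		printf("huge=within_size -> %d\n", parse_huge("within_size"));
		printf("huge=bogus       -> %d\n", parse_huge("bogus"));
		return 0;
	}
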
@@ -331,6 +595,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
331 struct address_space *mapping = page->mapping; 595 struct address_space *mapping = page->mapping;
332 int error; 596 int error;
333 597
598 VM_BUG_ON_PAGE(PageCompound(page), page);
599
334 spin_lock_irq(&mapping->tree_lock); 600 spin_lock_irq(&mapping->tree_lock);
335 error = shmem_radix_tree_replace(mapping, page->index, page, radswap); 601 error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
336 page->mapping = NULL; 602 page->mapping = NULL;
@@ -510,10 +776,33 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
510 continue; 776 continue;
511 } 777 }
512 778
779 VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);
780
513 if (!trylock_page(page)) 781 if (!trylock_page(page))
514 continue; 782 continue;
783
784 if (PageTransTail(page)) {
785 /* Middle of THP: zero out the page */
786 clear_highpage(page);
787 unlock_page(page);
788 continue;
789 } else if (PageTransHuge(page)) {
790 if (index == round_down(end, HPAGE_PMD_NR)) {
791 /*
792 * Range ends in the middle of THP:
793 * zero out the page
794 */
795 clear_highpage(page);
796 unlock_page(page);
797 continue;
798 }
799 index += HPAGE_PMD_NR - 1;
800 i += HPAGE_PMD_NR - 1;
801 }
802
515 if (!unfalloc || !PageUptodate(page)) { 803 if (!unfalloc || !PageUptodate(page)) {
516 if (page->mapping == mapping) { 804 VM_BUG_ON_PAGE(PageTail(page), page);
805 if (page_mapping(page) == mapping) {
517 VM_BUG_ON_PAGE(PageWriteback(page), page); 806 VM_BUG_ON_PAGE(PageWriteback(page), page);
518 truncate_inode_page(mapping, page); 807 truncate_inode_page(mapping, page);
519 } 808 }
@@ -589,8 +878,36 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
589 } 878 }
590 879
591 lock_page(page); 880 lock_page(page);
881
882 if (PageTransTail(page)) {
883 /* Middle of THP: zero out the page */
884 clear_highpage(page);
885 unlock_page(page);
886 /*
 887                  * Partial thp truncate due to 'start' in the
 888                  * middle of THP: no need to look at these pages
 889                  * again on !pvec.nr restart.
890 */
891 if (index != round_down(end, HPAGE_PMD_NR))
892 start++;
893 continue;
894 } else if (PageTransHuge(page)) {
895 if (index == round_down(end, HPAGE_PMD_NR)) {
896 /*
897 * Range ends in the middle of THP:
898 * zero out the page
899 */
900 clear_highpage(page);
901 unlock_page(page);
902 continue;
903 }
904 index += HPAGE_PMD_NR - 1;
905 i += HPAGE_PMD_NR - 1;
906 }
907
592 if (!unfalloc || !PageUptodate(page)) { 908 if (!unfalloc || !PageUptodate(page)) {
593 if (page->mapping == mapping) { 909 VM_BUG_ON_PAGE(PageTail(page), page);
910 if (page_mapping(page) == mapping) {
594 VM_BUG_ON_PAGE(PageWriteback(page), page); 911 VM_BUG_ON_PAGE(PageWriteback(page), page);
595 truncate_inode_page(mapping, page); 912 truncate_inode_page(mapping, page);
596 } else { 913 } else {
@@ -607,10 +924,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
607 index++; 924 index++;
608 } 925 }
609 926
610 spin_lock(&info->lock); 927 spin_lock_irq(&info->lock);
611 info->swapped -= nr_swaps_freed; 928 info->swapped -= nr_swaps_freed;
612 shmem_recalc_inode(inode); 929 shmem_recalc_inode(inode);
613 spin_unlock(&info->lock); 930 spin_unlock_irq(&info->lock);
614} 931}
615 932
616void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 933void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
@@ -627,9 +944,9 @@ static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry,
627 struct shmem_inode_info *info = SHMEM_I(inode); 944 struct shmem_inode_info *info = SHMEM_I(inode);
628 945
629 if (info->alloced - info->swapped != inode->i_mapping->nrpages) { 946 if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
630 spin_lock(&info->lock); 947 spin_lock_irq(&info->lock);
631 shmem_recalc_inode(inode); 948 shmem_recalc_inode(inode);
632 spin_unlock(&info->lock); 949 spin_unlock_irq(&info->lock);
633 } 950 }
634 generic_fillattr(inode, stat); 951 generic_fillattr(inode, stat);
635 return 0; 952 return 0;
@@ -639,6 +956,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
639{ 956{
640 struct inode *inode = d_inode(dentry); 957 struct inode *inode = d_inode(dentry);
641 struct shmem_inode_info *info = SHMEM_I(inode); 958 struct shmem_inode_info *info = SHMEM_I(inode);
959 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
642 int error; 960 int error;
643 961
644 error = inode_change_ok(inode, attr); 962 error = inode_change_ok(inode, attr);
@@ -674,6 +992,20 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
674 if (oldsize > holebegin) 992 if (oldsize > holebegin)
675 unmap_mapping_range(inode->i_mapping, 993 unmap_mapping_range(inode->i_mapping,
676 holebegin, 0, 1); 994 holebegin, 0, 1);
995
996 /*
997 * Part of the huge page can be beyond i_size: subject
998 * to shrink under memory pressure.
999 */
1000 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
1001 spin_lock(&sbinfo->shrinklist_lock);
1002 if (list_empty(&info->shrinklist)) {
1003 list_add_tail(&info->shrinklist,
1004 &sbinfo->shrinklist);
1005 sbinfo->shrinklist_len++;
1006 }
1007 spin_unlock(&sbinfo->shrinklist_lock);
1008 }
677 } 1009 }
678 } 1010 }
679 1011
@@ -686,11 +1018,20 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
686static void shmem_evict_inode(struct inode *inode) 1018static void shmem_evict_inode(struct inode *inode)
687{ 1019{
688 struct shmem_inode_info *info = SHMEM_I(inode); 1020 struct shmem_inode_info *info = SHMEM_I(inode);
1021 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
689 1022
690 if (inode->i_mapping->a_ops == &shmem_aops) { 1023 if (inode->i_mapping->a_ops == &shmem_aops) {
691 shmem_unacct_size(info->flags, inode->i_size); 1024 shmem_unacct_size(info->flags, inode->i_size);
692 inode->i_size = 0; 1025 inode->i_size = 0;
693 shmem_truncate_range(inode, 0, (loff_t)-1); 1026 shmem_truncate_range(inode, 0, (loff_t)-1);
1027 if (!list_empty(&info->shrinklist)) {
1028 spin_lock(&sbinfo->shrinklist_lock);
1029 if (!list_empty(&info->shrinklist)) {
1030 list_del_init(&info->shrinklist);
1031 sbinfo->shrinklist_len--;
1032 }
1033 spin_unlock(&sbinfo->shrinklist_lock);
1034 }
694 if (!list_empty(&info->swaplist)) { 1035 if (!list_empty(&info->swaplist)) {
695 mutex_lock(&shmem_swaplist_mutex); 1036 mutex_lock(&shmem_swaplist_mutex);
696 list_del_init(&info->swaplist); 1037 list_del_init(&info->swaplist);
@@ -773,9 +1114,9 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
773 delete_from_swap_cache(*pagep); 1114 delete_from_swap_cache(*pagep);
774 set_page_dirty(*pagep); 1115 set_page_dirty(*pagep);
775 if (!error) { 1116 if (!error) {
776 spin_lock(&info->lock); 1117 spin_lock_irq(&info->lock);
777 info->swapped--; 1118 info->swapped--;
778 spin_unlock(&info->lock); 1119 spin_unlock_irq(&info->lock);
779 swap_free(swap); 1120 swap_free(swap);
780 } 1121 }
781 } 1122 }
@@ -848,6 +1189,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
848 swp_entry_t swap; 1189 swp_entry_t swap;
849 pgoff_t index; 1190 pgoff_t index;
850 1191
1192 VM_BUG_ON_PAGE(PageCompound(page), page);
851 BUG_ON(!PageLocked(page)); 1193 BUG_ON(!PageLocked(page));
852 mapping = page->mapping; 1194 mapping = page->mapping;
853 index = page->index; 1195 index = page->index;
@@ -922,10 +1264,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
922 list_add_tail(&info->swaplist, &shmem_swaplist); 1264 list_add_tail(&info->swaplist, &shmem_swaplist);
923 1265
924 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 1266 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
925 spin_lock(&info->lock); 1267 spin_lock_irq(&info->lock);
926 shmem_recalc_inode(inode); 1268 shmem_recalc_inode(inode);
927 info->swapped++; 1269 info->swapped++;
928 spin_unlock(&info->lock); 1270 spin_unlock_irq(&info->lock);
929 1271
930 swap_shmem_alloc(swap); 1272 swap_shmem_alloc(swap);
931 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); 1273 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
@@ -984,24 +1326,63 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
984#define vm_policy vm_private_data 1326#define vm_policy vm_private_data
985#endif 1327#endif
986 1328
1329static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
1330 struct shmem_inode_info *info, pgoff_t index)
1331{
1332 /* Create a pseudo vma that just contains the policy */
1333 vma->vm_start = 0;
1334 /* Bias interleave by inode number to distribute better across nodes */
1335 vma->vm_pgoff = index + info->vfs_inode.i_ino;
1336 vma->vm_ops = NULL;
1337 vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1338}
1339
1340static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
1341{
1342 /* Drop reference taken by mpol_shared_policy_lookup() */
1343 mpol_cond_put(vma->vm_policy);
1344}
1345
987static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, 1346static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
988 struct shmem_inode_info *info, pgoff_t index) 1347 struct shmem_inode_info *info, pgoff_t index)
989{ 1348{
990 struct vm_area_struct pvma; 1349 struct vm_area_struct pvma;
991 struct page *page; 1350 struct page *page;
992 1351
993 /* Create a pseudo vma that just contains the policy */ 1352 shmem_pseudo_vma_init(&pvma, info, index);
994 pvma.vm_start = 0;
995 /* Bias interleave by inode number to distribute better across nodes */
996 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
997 pvma.vm_ops = NULL;
998 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
999
1000 page = swapin_readahead(swap, gfp, &pvma, 0); 1353 page = swapin_readahead(swap, gfp, &pvma, 0);
1354 shmem_pseudo_vma_destroy(&pvma);
1001 1355
1002 /* Drop reference taken by mpol_shared_policy_lookup() */ 1356 return page;
1003 mpol_cond_put(pvma.vm_policy); 1357}
1358
1359static struct page *shmem_alloc_hugepage(gfp_t gfp,
1360 struct shmem_inode_info *info, pgoff_t index)
1361{
1362 struct vm_area_struct pvma;
1363 struct inode *inode = &info->vfs_inode;
1364 struct address_space *mapping = inode->i_mapping;
1365 pgoff_t idx, hindex = round_down(index, HPAGE_PMD_NR);
1366 void __rcu **results;
1367 struct page *page;
1004 1368
1369 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1370 return NULL;
1371
1372 rcu_read_lock();
1373 if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx,
1374 hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
1375 rcu_read_unlock();
1376 return NULL;
1377 }
1378 rcu_read_unlock();
1379
1380 shmem_pseudo_vma_init(&pvma, info, hindex);
1381 page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
1382 HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
1383 shmem_pseudo_vma_destroy(&pvma);
1384 if (page)
1385 prep_transhuge_page(page);
1005 return page; 1386 return page;
1006} 1387}
1007 1388
@@ -1011,23 +1392,51 @@ static struct page *shmem_alloc_page(gfp_t gfp,
1011 struct vm_area_struct pvma; 1392 struct vm_area_struct pvma;
1012 struct page *page; 1393 struct page *page;
1013 1394
1014 /* Create a pseudo vma that just contains the policy */ 1395 shmem_pseudo_vma_init(&pvma, info, index);
1015 pvma.vm_start = 0; 1396 page = alloc_page_vma(gfp, &pvma, 0);
1016 /* Bias interleave by inode number to distribute better across nodes */ 1397 shmem_pseudo_vma_destroy(&pvma);
1017 pvma.vm_pgoff = index + info->vfs_inode.i_ino; 1398
1018 pvma.vm_ops = NULL; 1399 return page;
1019 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 1400}
1401
1402static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
1403 struct shmem_inode_info *info, struct shmem_sb_info *sbinfo,
1404 pgoff_t index, bool huge)
1405{
1406 struct page *page;
1407 int nr;
1408 int err = -ENOSPC;
1409
1410 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1411 huge = false;
1412 nr = huge ? HPAGE_PMD_NR : 1;
1413
1414 if (shmem_acct_block(info->flags, nr))
1415 goto failed;
1416 if (sbinfo->max_blocks) {
1417 if (percpu_counter_compare(&sbinfo->used_blocks,
1418 sbinfo->max_blocks - nr) > 0)
1419 goto unacct;
1420 percpu_counter_add(&sbinfo->used_blocks, nr);
1421 }
1020 1422
1021 page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false); 1423 if (huge)
1424 page = shmem_alloc_hugepage(gfp, info, index);
1425 else
1426 page = shmem_alloc_page(gfp, info, index);
1022 if (page) { 1427 if (page) {
1023 __SetPageLocked(page); 1428 __SetPageLocked(page);
1024 __SetPageSwapBacked(page); 1429 __SetPageSwapBacked(page);
1430 return page;
1025 } 1431 }
1026 1432
1027 /* Drop reference taken by mpol_shared_policy_lookup() */ 1433 err = -ENOMEM;
1028 mpol_cond_put(pvma.vm_policy); 1434 if (sbinfo->max_blocks)
1029 1435 percpu_counter_add(&sbinfo->used_blocks, -nr);
1030 return page; 1436unacct:
1437 shmem_unacct_blocks(info->flags, nr);
1438failed:
1439 return ERR_PTR(err);
1031} 1440}
1032 1441
1033/* 1442/*
@@ -1132,12 +1541,16 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1132 struct mem_cgroup *memcg; 1541 struct mem_cgroup *memcg;
1133 struct page *page; 1542 struct page *page;
1134 swp_entry_t swap; 1543 swp_entry_t swap;
1544 enum sgp_type sgp_huge = sgp;
1545 pgoff_t hindex = index;
1135 int error; 1546 int error;
1136 int once = 0; 1547 int once = 0;
1137 int alloced = 0; 1548 int alloced = 0;
1138 1549
1139 if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) 1550 if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
1140 return -EFBIG; 1551 return -EFBIG;
1552 if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
1553 sgp = SGP_CACHE;
1141repeat: 1554repeat:
1142 swap.val = 0; 1555 swap.val = 0;
1143 page = find_lock_entry(mapping, index); 1556 page = find_lock_entry(mapping, index);
@@ -1240,10 +1653,10 @@ repeat:
1240 1653
1241 mem_cgroup_commit_charge(page, memcg, true, false); 1654 mem_cgroup_commit_charge(page, memcg, true, false);
1242 1655
1243 spin_lock(&info->lock); 1656 spin_lock_irq(&info->lock);
1244 info->swapped--; 1657 info->swapped--;
1245 shmem_recalc_inode(inode); 1658 shmem_recalc_inode(inode);
1246 spin_unlock(&info->lock); 1659 spin_unlock_irq(&info->lock);
1247 1660
1248 if (sgp == SGP_WRITE) 1661 if (sgp == SGP_WRITE)
1249 mark_page_accessed(page); 1662 mark_page_accessed(page);
@@ -1253,51 +1666,111 @@ repeat:
1253 swap_free(swap); 1666 swap_free(swap);
1254 1667
1255 } else { 1668 } else {
1256 if (shmem_acct_block(info->flags)) { 1669 /* shmem_symlink() */
1257 error = -ENOSPC; 1670 if (mapping->a_ops != &shmem_aops)
1258 goto failed; 1671 goto alloc_nohuge;
1672 if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
1673 goto alloc_nohuge;
1674 if (shmem_huge == SHMEM_HUGE_FORCE)
1675 goto alloc_huge;
1676 switch (sbinfo->huge) {
1677 loff_t i_size;
1678 pgoff_t off;
1679 case SHMEM_HUGE_NEVER:
1680 goto alloc_nohuge;
1681 case SHMEM_HUGE_WITHIN_SIZE:
1682 off = round_up(index, HPAGE_PMD_NR);
1683 i_size = round_up(i_size_read(inode), PAGE_SIZE);
1684 if (i_size >= HPAGE_PMD_SIZE &&
1685 i_size >> PAGE_SHIFT >= off)
1686 goto alloc_huge;
1687 /* fallthrough */
1688 case SHMEM_HUGE_ADVISE:
1689 if (sgp_huge == SGP_HUGE)
1690 goto alloc_huge;
1691 /* TODO: implement fadvise() hints */
1692 goto alloc_nohuge;
1259 } 1693 }
1260 if (sbinfo->max_blocks) { 1694
1261 if (percpu_counter_compare(&sbinfo->used_blocks, 1695alloc_huge:
1262 sbinfo->max_blocks) >= 0) { 1696 page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
1263 error = -ENOSPC; 1697 index, true);
1264 goto unacct; 1698 if (IS_ERR(page)) {
1699alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo,
1700 index, false);
1701 }
1702 if (IS_ERR(page)) {
1703 int retry = 5;
1704 error = PTR_ERR(page);
1705 page = NULL;
1706 if (error != -ENOSPC)
1707 goto failed;
1708 /*
1709 * Try to reclaim some space by splitting a huge page
1710 * beyond i_size on the filesystem.
1711 */
1712 while (retry--) {
1713 int ret;
1714 ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
1715 if (ret == SHRINK_STOP)
1716 break;
1717 if (ret)
1718 goto alloc_nohuge;
1265 } 1719 }
1266 percpu_counter_inc(&sbinfo->used_blocks); 1720 goto failed;
1267 } 1721 }
1268 1722
1269 page = shmem_alloc_page(gfp, info, index); 1723 if (PageTransHuge(page))
1270 if (!page) { 1724 hindex = round_down(index, HPAGE_PMD_NR);
1271 error = -ENOMEM; 1725 else
1272 goto decused; 1726 hindex = index;
1273 } 1727
1274 if (sgp == SGP_WRITE) 1728 if (sgp == SGP_WRITE)
1275 __SetPageReferenced(page); 1729 __SetPageReferenced(page);
1276 1730
1277 error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, 1731 error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
1278 false); 1732 PageTransHuge(page));
1279 if (error) 1733 if (error)
1280 goto decused; 1734 goto unacct;
1281 error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); 1735 error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
1736 compound_order(page));
1282 if (!error) { 1737 if (!error) {
1283 error = shmem_add_to_page_cache(page, mapping, index, 1738 error = shmem_add_to_page_cache(page, mapping, hindex,
1284 NULL); 1739 NULL);
1285 radix_tree_preload_end(); 1740 radix_tree_preload_end();
1286 } 1741 }
1287 if (error) { 1742 if (error) {
1288 mem_cgroup_cancel_charge(page, memcg, false); 1743 mem_cgroup_cancel_charge(page, memcg,
1289 goto decused; 1744 PageTransHuge(page));
1745 goto unacct;
1290 } 1746 }
1291 mem_cgroup_commit_charge(page, memcg, false, false); 1747 mem_cgroup_commit_charge(page, memcg, false,
1748 PageTransHuge(page));
1292 lru_cache_add_anon(page); 1749 lru_cache_add_anon(page);
1293 1750
1294 spin_lock(&info->lock); 1751 spin_lock_irq(&info->lock);
1295 info->alloced++; 1752 info->alloced += 1 << compound_order(page);
1296 inode->i_blocks += BLOCKS_PER_PAGE; 1753 inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
1297 shmem_recalc_inode(inode); 1754 shmem_recalc_inode(inode);
1298 spin_unlock(&info->lock); 1755 spin_unlock_irq(&info->lock);
1299 alloced = true; 1756 alloced = true;
1300 1757
1758 if (PageTransHuge(page) &&
1759 DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
1760 hindex + HPAGE_PMD_NR - 1) {
1761 /*
1762 * Part of the huge page is beyond i_size: subject
1763 * to shrink under memory pressure.
1764 */
1765 spin_lock(&sbinfo->shrinklist_lock);
1766 if (list_empty(&info->shrinklist)) {
1767 list_add_tail(&info->shrinklist,
1768 &sbinfo->shrinklist);
1769 sbinfo->shrinklist_len++;
1770 }
1771 spin_unlock(&sbinfo->shrinklist_lock);
1772 }
1773
1301 /* 1774 /*
1302 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. 1775 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
1303 */ 1776 */
@@ -1309,10 +1782,15 @@ clear:
1309 * but SGP_FALLOC on a page fallocated earlier must initialize 1782 * but SGP_FALLOC on a page fallocated earlier must initialize
1310 * it now, lest undo on failure cancel our earlier guarantee. 1783 * it now, lest undo on failure cancel our earlier guarantee.
1311 */ 1784 */
1312 if (sgp != SGP_WRITE) { 1785 if (sgp != SGP_WRITE && !PageUptodate(page)) {
1313 clear_highpage(page); 1786 struct page *head = compound_head(page);
1314 flush_dcache_page(page); 1787 int i;
1315 SetPageUptodate(page); 1788
1789 for (i = 0; i < (1 << compound_order(head)); i++) {
1790 clear_highpage(head + i);
1791 flush_dcache_page(head + i);
1792 }
1793 SetPageUptodate(head);
1316 } 1794 }
1317 } 1795 }
1318 1796
@@ -1322,24 +1800,30 @@ clear:
1322 if (alloced) { 1800 if (alloced) {
1323 ClearPageDirty(page); 1801 ClearPageDirty(page);
1324 delete_from_page_cache(page); 1802 delete_from_page_cache(page);
1325 spin_lock(&info->lock); 1803 spin_lock_irq(&info->lock);
1326 shmem_recalc_inode(inode); 1804 shmem_recalc_inode(inode);
1327 spin_unlock(&info->lock); 1805 spin_unlock_irq(&info->lock);
1328 } 1806 }
1329 error = -EINVAL; 1807 error = -EINVAL;
1330 goto unlock; 1808 goto unlock;
1331 } 1809 }
1332 *pagep = page; 1810 *pagep = page + index - hindex;
1333 return 0; 1811 return 0;
1334 1812
1335 /* 1813 /*
1336 * Error recovery. 1814 * Error recovery.
1337 */ 1815 */
1338decused:
1339 if (sbinfo->max_blocks)
1340 percpu_counter_add(&sbinfo->used_blocks, -1);
1341unacct: 1816unacct:
1342 shmem_unacct_blocks(info->flags, 1); 1817 if (sbinfo->max_blocks)
1818 percpu_counter_sub(&sbinfo->used_blocks,
1819 1 << compound_order(page));
1820 shmem_unacct_blocks(info->flags, 1 << compound_order(page));
1821
1822 if (PageTransHuge(page)) {
1823 unlock_page(page);
1824 put_page(page);
1825 goto alloc_nohuge;
1826 }
1343failed: 1827failed:
1344 if (swap.val && !shmem_confirm_swap(mapping, index, swap)) 1828 if (swap.val && !shmem_confirm_swap(mapping, index, swap))
1345 error = -EEXIST; 1829 error = -EEXIST;
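
One detail of the hunk above that is easy to miss: the page cache entry lives at hindex = round_down(index, HPAGE_PMD_NR), while the caller still gets back the subpage for the index it asked for via *pagep = page + index - hindex. A small standalone sketch of that arithmetic, assuming HPAGE_PMD_NR == 512; not kernel code:

/*
 * Sketch of the subpage bookkeeping in shmem_getpage_gfp(): the THP is
 * indexed at the rounded-down offset, and the requested subpage is
 * recovered by pointer arithmetic.  HPAGE_PMD_NR is assumed to be 512
 * (2MB / 4KB); struct page is a dummy stand-in.
 */
#include <stdio.h>

#define HPAGE_PMD_NR	512

struct page { int dummy; };

static struct page *subpage_for_index(struct page *huge, unsigned long index)
{
	unsigned long hindex = index & ~((unsigned long)HPAGE_PMD_NR - 1);

	/* huge[0] maps hindex, huge[1] maps hindex + 1, ... */
	return huge + (index - hindex);
}

int main(void)
{
	static struct page thp[HPAGE_PMD_NR];
	unsigned long index = 1000;	/* lands in the THP at hindex 512 */

	printf("subpage offset within THP: %ld\n",
	       (long)(subpage_for_index(thp, index) - thp));	/* 488 */
	return 0;
}
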
@@ -1350,9 +1834,9 @@ unlock:
1350 } 1834 }
1351 if (error == -ENOSPC && !once++) { 1835 if (error == -ENOSPC && !once++) {
1352 info = SHMEM_I(inode); 1836 info = SHMEM_I(inode);
1353 spin_lock(&info->lock); 1837 spin_lock_irq(&info->lock);
1354 shmem_recalc_inode(inode); 1838 shmem_recalc_inode(inode);
1355 spin_unlock(&info->lock); 1839 spin_unlock_irq(&info->lock);
1356 goto repeat; 1840 goto repeat;
1357 } 1841 }
1358 if (error == -EEXIST) /* from above or from radix_tree_insert */ 1842 if (error == -EEXIST) /* from above or from radix_tree_insert */
@@ -1364,6 +1848,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1364{ 1848{
1365 struct inode *inode = file_inode(vma->vm_file); 1849 struct inode *inode = file_inode(vma->vm_file);
1366 gfp_t gfp = mapping_gfp_mask(inode->i_mapping); 1850 gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
1851 enum sgp_type sgp;
1367 int error; 1852 int error;
1368 int ret = VM_FAULT_LOCKED; 1853 int ret = VM_FAULT_LOCKED;
1369 1854
@@ -1425,13 +1910,107 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1425 spin_unlock(&inode->i_lock); 1910 spin_unlock(&inode->i_lock);
1426 } 1911 }
1427 1912
1428 error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, 1913 sgp = SGP_CACHE;
1914 if (vma->vm_flags & VM_HUGEPAGE)
1915 sgp = SGP_HUGE;
1916 else if (vma->vm_flags & VM_NOHUGEPAGE)
1917 sgp = SGP_NOHUGE;
1918
1919 error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
1429 gfp, vma->vm_mm, &ret); 1920 gfp, vma->vm_mm, &ret);
1430 if (error) 1921 if (error)
1431 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1922 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1432 return ret; 1923 return ret;
1433} 1924}
1434 1925
1926unsigned long shmem_get_unmapped_area(struct file *file,
1927 unsigned long uaddr, unsigned long len,
1928 unsigned long pgoff, unsigned long flags)
1929{
1930 unsigned long (*get_area)(struct file *,
1931 unsigned long, unsigned long, unsigned long, unsigned long);
1932 unsigned long addr;
1933 unsigned long offset;
1934 unsigned long inflated_len;
1935 unsigned long inflated_addr;
1936 unsigned long inflated_offset;
1937
1938 if (len > TASK_SIZE)
1939 return -ENOMEM;
1940
1941 get_area = current->mm->get_unmapped_area;
1942 addr = get_area(file, uaddr, len, pgoff, flags);
1943
1944 if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
1945 return addr;
1946 if (IS_ERR_VALUE(addr))
1947 return addr;
1948 if (addr & ~PAGE_MASK)
1949 return addr;
1950 if (addr > TASK_SIZE - len)
1951 return addr;
1952
1953 if (shmem_huge == SHMEM_HUGE_DENY)
1954 return addr;
1955 if (len < HPAGE_PMD_SIZE)
1956 return addr;
1957 if (flags & MAP_FIXED)
1958 return addr;
1959 /*
1960 * Our priority is to support MAP_SHARED mapped hugely;
1961 * and support MAP_PRIVATE mapped hugely too, until it is COWed.
1962 * But if caller specified an address hint, respect that as before.
1963 */
1964 if (uaddr)
1965 return addr;
1966
1967 if (shmem_huge != SHMEM_HUGE_FORCE) {
1968 struct super_block *sb;
1969
1970 if (file) {
1971 VM_BUG_ON(file->f_op != &shmem_file_operations);
1972 sb = file_inode(file)->i_sb;
1973 } else {
1974 /*
1975 * Called directly from mm/mmap.c, or drivers/char/mem.c
1976 * for "/dev/zero", to create a shared anonymous object.
1977 */
1978 if (IS_ERR(shm_mnt))
1979 return addr;
1980 sb = shm_mnt->mnt_sb;
1981 }
1982 if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
1983 return addr;
1984 }
1985
1986 offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1);
1987 if (offset && offset + len < 2 * HPAGE_PMD_SIZE)
1988 return addr;
1989 if ((addr & (HPAGE_PMD_SIZE-1)) == offset)
1990 return addr;
1991
1992 inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE;
1993 if (inflated_len > TASK_SIZE)
1994 return addr;
1995 if (inflated_len < len)
1996 return addr;
1997
1998 inflated_addr = get_area(NULL, 0, inflated_len, 0, flags);
1999 if (IS_ERR_VALUE(inflated_addr))
2000 return addr;
2001 if (inflated_addr & ~PAGE_MASK)
2002 return addr;
2003
2004 inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1);
2005 inflated_addr += offset - inflated_offset;
2006 if (inflated_offset > offset)
2007 inflated_addr += HPAGE_PMD_SIZE;
2008
2009 if (inflated_addr > TASK_SIZE - len)
2010 return addr;
2011 return inflated_addr;
2012}
2013
1435#ifdef CONFIG_NUMA 2014#ifdef CONFIG_NUMA
1436static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) 2015static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1437{ 2016{
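
shmem_get_unmapped_area() above over-allocates by one PMD's worth of address space and then slides the result so the mapping has the same offset within a huge page as the file offset. A minimal userspace sketch of just that arithmetic, with a 2MB PMD size assumed and the allocator's return value passed in directly rather than obtained from get_area():

/*
 * Standalone sketch of the alignment arithmetic used by
 * shmem_get_unmapped_area().  Assumes a 2MB PMD size; the inflated
 * allocation address is taken as a parameter.  Not the kernel code.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define HPAGE_PMD_SIZE	(1UL << 21)	/* assumption: 2MB huge pages */

/* Pick the address inside the inflated area whose offset within the
 * huge page matches the file offset, as the hunk above does. */
static unsigned long align_for_huge(unsigned long inflated_addr,
				    unsigned long pgoff)
{
	unsigned long offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE - 1);
	unsigned long inflated_offset = inflated_addr & (HPAGE_PMD_SIZE - 1);

	inflated_addr += offset - inflated_offset;
	if (inflated_offset > offset)
		inflated_addr += HPAGE_PMD_SIZE;
	return inflated_addr;
}

int main(void)
{
	/* e.g. the allocator returned 0x7f0000123000 for pgoff 0 */
	unsigned long addr = align_for_huge(0x7f0000123000UL, 0);

	printf("aligned: %#lx (offset in PMD: %#lx)\n",
	       addr, addr & (HPAGE_PMD_SIZE - 1));
	return 0;
}
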
@@ -1456,7 +2035,7 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
1456 struct shmem_inode_info *info = SHMEM_I(inode); 2035 struct shmem_inode_info *info = SHMEM_I(inode);
1457 int retval = -ENOMEM; 2036 int retval = -ENOMEM;
1458 2037
1459 spin_lock(&info->lock); 2038 spin_lock_irq(&info->lock);
1460 if (lock && !(info->flags & VM_LOCKED)) { 2039 if (lock && !(info->flags & VM_LOCKED)) {
1461 if (!user_shm_lock(inode->i_size, user)) 2040 if (!user_shm_lock(inode->i_size, user))
1462 goto out_nomem; 2041 goto out_nomem;
@@ -1471,7 +2050,7 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
1471 retval = 0; 2050 retval = 0;
1472 2051
1473out_nomem: 2052out_nomem:
1474 spin_unlock(&info->lock); 2053 spin_unlock_irq(&info->lock);
1475 return retval; 2054 return retval;
1476} 2055}
1477 2056
@@ -1479,6 +2058,11 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1479{ 2058{
1480 file_accessed(file); 2059 file_accessed(file);
1481 vma->vm_ops = &shmem_vm_ops; 2060 vma->vm_ops = &shmem_vm_ops;
2061 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
2062 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
2063 (vma->vm_end & HPAGE_PMD_MASK)) {
2064 khugepaged_enter(vma, vma->vm_flags);
2065 }
1482 return 0; 2066 return 0;
1483} 2067}
1484 2068
@@ -1504,6 +2088,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1504 spin_lock_init(&info->lock); 2088 spin_lock_init(&info->lock);
1505 info->seals = F_SEAL_SEAL; 2089 info->seals = F_SEAL_SEAL;
1506 info->flags = flags & VM_NORESERVE; 2090 info->flags = flags & VM_NORESERVE;
2091 INIT_LIST_HEAD(&info->shrinklist);
1507 INIT_LIST_HEAD(&info->swaplist); 2092 INIT_LIST_HEAD(&info->swaplist);
1508 simple_xattrs_init(&info->xattrs); 2093 simple_xattrs_init(&info->xattrs);
1509 cache_no_acl(inode); 2094 cache_no_acl(inode);
@@ -1589,12 +2174,23 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1589 i_size_write(inode, pos + copied); 2174 i_size_write(inode, pos + copied);
1590 2175
1591 if (!PageUptodate(page)) { 2176 if (!PageUptodate(page)) {
2177 struct page *head = compound_head(page);
2178 if (PageTransCompound(page)) {
2179 int i;
2180
2181 for (i = 0; i < HPAGE_PMD_NR; i++) {
2182 if (head + i == page)
2183 continue;
2184 clear_highpage(head + i);
2185 flush_dcache_page(head + i);
2186 }
2187 }
1592 if (copied < PAGE_SIZE) { 2188 if (copied < PAGE_SIZE) {
1593 unsigned from = pos & (PAGE_SIZE - 1); 2189 unsigned from = pos & (PAGE_SIZE - 1);
1594 zero_user_segments(page, 0, from, 2190 zero_user_segments(page, 0, from,
1595 from + copied, PAGE_SIZE); 2191 from + copied, PAGE_SIZE);
1596 } 2192 }
1597 SetPageUptodate(page); 2193 SetPageUptodate(head);
1598 } 2194 }
1599 set_page_dirty(page); 2195 set_page_dirty(page);
1600 unlock_page(page); 2196 unlock_page(page);
@@ -2860,11 +3456,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2860 sbinfo->gid = make_kgid(current_user_ns(), gid); 3456 sbinfo->gid = make_kgid(current_user_ns(), gid);
2861 if (!gid_valid(sbinfo->gid)) 3457 if (!gid_valid(sbinfo->gid))
2862 goto bad_val; 3458 goto bad_val;
3459#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3460 } else if (!strcmp(this_char, "huge")) {
3461 int huge;
3462 huge = shmem_parse_huge(value);
3463 if (huge < 0)
3464 goto bad_val;
3465 if (!has_transparent_hugepage() &&
3466 huge != SHMEM_HUGE_NEVER)
3467 goto bad_val;
3468 sbinfo->huge = huge;
3469#endif
3470#ifdef CONFIG_NUMA
2863 } else if (!strcmp(this_char,"mpol")) { 3471 } else if (!strcmp(this_char,"mpol")) {
2864 mpol_put(mpol); 3472 mpol_put(mpol);
2865 mpol = NULL; 3473 mpol = NULL;
2866 if (mpol_parse_str(value, &mpol)) 3474 if (mpol_parse_str(value, &mpol))
2867 goto bad_val; 3475 goto bad_val;
3476#endif
2868 } else { 3477 } else {
2869 pr_err("tmpfs: Bad mount option %s\n", this_char); 3478 pr_err("tmpfs: Bad mount option %s\n", this_char);
2870 goto error; 3479 goto error;
@@ -2910,6 +3519,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2910 goto out; 3519 goto out;
2911 3520
2912 error = 0; 3521 error = 0;
3522 sbinfo->huge = config.huge;
2913 sbinfo->max_blocks = config.max_blocks; 3523 sbinfo->max_blocks = config.max_blocks;
2914 sbinfo->max_inodes = config.max_inodes; 3524 sbinfo->max_inodes = config.max_inodes;
2915 sbinfo->free_inodes = config.max_inodes - inodes; 3525 sbinfo->free_inodes = config.max_inodes - inodes;
@@ -2943,6 +3553,11 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
2943 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) 3553 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
2944 seq_printf(seq, ",gid=%u", 3554 seq_printf(seq, ",gid=%u",
2945 from_kgid_munged(&init_user_ns, sbinfo->gid)); 3555 from_kgid_munged(&init_user_ns, sbinfo->gid));
3556#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3557 /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
3558 if (sbinfo->huge)
3559 seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
3560#endif
2946 shmem_show_mpol(seq, sbinfo->mpol); 3561 shmem_show_mpol(seq, sbinfo->mpol);
2947 return 0; 3562 return 0;
2948} 3563}
@@ -3072,6 +3687,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
3072 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) 3687 if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
3073 goto failed; 3688 goto failed;
3074 sbinfo->free_inodes = sbinfo->max_inodes; 3689 sbinfo->free_inodes = sbinfo->max_inodes;
3690 spin_lock_init(&sbinfo->shrinklist_lock);
3691 INIT_LIST_HEAD(&sbinfo->shrinklist);
3075 3692
3076 sb->s_maxbytes = MAX_LFS_FILESIZE; 3693 sb->s_maxbytes = MAX_LFS_FILESIZE;
3077 sb->s_blocksize = PAGE_SIZE; 3694 sb->s_blocksize = PAGE_SIZE;
@@ -3161,6 +3778,7 @@ static const struct address_space_operations shmem_aops = {
3161 3778
3162static const struct file_operations shmem_file_operations = { 3779static const struct file_operations shmem_file_operations = {
3163 .mmap = shmem_mmap, 3780 .mmap = shmem_mmap,
3781 .get_unmapped_area = shmem_get_unmapped_area,
3164#ifdef CONFIG_TMPFS 3782#ifdef CONFIG_TMPFS
3165 .llseek = shmem_file_llseek, 3783 .llseek = shmem_file_llseek,
3166 .read_iter = shmem_file_read_iter, 3784 .read_iter = shmem_file_read_iter,
@@ -3233,6 +3851,10 @@ static const struct super_operations shmem_ops = {
3233 .evict_inode = shmem_evict_inode, 3851 .evict_inode = shmem_evict_inode,
3234 .drop_inode = generic_delete_inode, 3852 .drop_inode = generic_delete_inode,
3235 .put_super = shmem_put_super, 3853 .put_super = shmem_put_super,
3854#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3855 .nr_cached_objects = shmem_unused_huge_count,
3856 .free_cached_objects = shmem_unused_huge_scan,
3857#endif
3236}; 3858};
3237 3859
3238static const struct vm_operations_struct shmem_vm_ops = { 3860static const struct vm_operations_struct shmem_vm_ops = {
@@ -3282,6 +3904,13 @@ int __init shmem_init(void)
3282 pr_err("Could not kern_mount tmpfs\n"); 3904 pr_err("Could not kern_mount tmpfs\n");
3283 goto out1; 3905 goto out1;
3284 } 3906 }
3907
3908#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
3909 if (has_transparent_hugepage() && shmem_huge < SHMEM_HUGE_DENY)
3910 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
3911 else
3912 shmem_huge = 0; /* just in case it was patched */
3913#endif
3285 return 0; 3914 return 0;
3286 3915
3287out1: 3916out1:
@@ -3293,6 +3922,91 @@ out3:
3293 return error; 3922 return error;
3294} 3923}
3295 3924
3925#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
3926static ssize_t shmem_enabled_show(struct kobject *kobj,
3927 struct kobj_attribute *attr, char *buf)
3928{
3929 int values[] = {
3930 SHMEM_HUGE_ALWAYS,
3931 SHMEM_HUGE_WITHIN_SIZE,
3932 SHMEM_HUGE_ADVISE,
3933 SHMEM_HUGE_NEVER,
3934 SHMEM_HUGE_DENY,
3935 SHMEM_HUGE_FORCE,
3936 };
3937 int i, count;
3938
3939 for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) {
3940 const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s ";
3941
3942 count += sprintf(buf + count, fmt,
3943 shmem_format_huge(values[i]));
3944 }
3945 buf[count - 1] = '\n';
3946 return count;
3947}
3948
3949static ssize_t shmem_enabled_store(struct kobject *kobj,
3950 struct kobj_attribute *attr, const char *buf, size_t count)
3951{
3952 char tmp[16];
3953 int huge;
3954
3955 if (count + 1 > sizeof(tmp))
3956 return -EINVAL;
3957 memcpy(tmp, buf, count);
3958 tmp[count] = '\0';
3959 if (count && tmp[count - 1] == '\n')
3960 tmp[count - 1] = '\0';
3961
3962 huge = shmem_parse_huge(tmp);
3963 if (huge == -EINVAL)
3964 return -EINVAL;
3965 if (!has_transparent_hugepage() &&
3966 huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
3967 return -EINVAL;
3968
3969 shmem_huge = huge;
3970 if (shmem_huge < SHMEM_HUGE_DENY)
3971 SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
3972 return count;
3973}
3974
3975struct kobj_attribute shmem_enabled_attr =
3976 __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
3977
3978bool shmem_huge_enabled(struct vm_area_struct *vma)
3979{
3980 struct inode *inode = file_inode(vma->vm_file);
3981 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
3982 loff_t i_size;
3983 pgoff_t off;
3984
3985 if (shmem_huge == SHMEM_HUGE_FORCE)
3986 return true;
3987 if (shmem_huge == SHMEM_HUGE_DENY)
3988 return false;
3989 switch (sbinfo->huge) {
3990 case SHMEM_HUGE_NEVER:
3991 return false;
3992 case SHMEM_HUGE_ALWAYS:
3993 return true;
3994 case SHMEM_HUGE_WITHIN_SIZE:
3995 off = round_up(vma->vm_pgoff, HPAGE_PMD_NR);
3996 i_size = round_up(i_size_read(inode), PAGE_SIZE);
3997 if (i_size >= HPAGE_PMD_SIZE &&
3998 i_size >> PAGE_SHIFT >= off)
3999 return true;
4000 case SHMEM_HUGE_ADVISE:
4001 /* TODO: implement fadvise() hints */
4002 return (vma->vm_flags & VM_HUGEPAGE);
4003 default:
4004 VM_BUG_ON(1);
4005 return false;
4006 }
4007}
4008#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
4009
3296#else /* !CONFIG_SHMEM */ 4010#else /* !CONFIG_SHMEM */
3297 4011
3298/* 4012/*
@@ -3335,6 +4049,15 @@ void shmem_unlock_mapping(struct address_space *mapping)
3335{ 4049{
3336} 4050}
3337 4051
4052#ifdef CONFIG_MMU
4053unsigned long shmem_get_unmapped_area(struct file *file,
4054 unsigned long addr, unsigned long len,
4055 unsigned long pgoff, unsigned long flags)
4056{
4057 return current->mm->get_unmapped_area(file, addr, len, pgoff, flags);
4058}
4059#endif
4060
3338void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 4061void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
3339{ 4062{
3340 truncate_inode_pages_range(inode->i_mapping, lstart, lend); 4063 truncate_inode_pages_range(inode->i_mapping, lstart, lend);
@@ -3461,6 +4184,13 @@ int shmem_zero_setup(struct vm_area_struct *vma)
3461 fput(vma->vm_file); 4184 fput(vma->vm_file);
3462 vma->vm_file = file; 4185 vma->vm_file = file;
3463 vma->vm_ops = &shmem_vm_ops; 4186 vma->vm_ops = &shmem_vm_ops;
4187
4188 if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
4189 ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
4190 (vma->vm_end & HPAGE_PMD_MASK)) {
4191 khugepaged_enter(vma, vma->vm_flags);
4192 }
4193
3464 return 0; 4194 return 0;
3465} 4195}
3466 4196
diff --git a/mm/slab.c b/mm/slab.c
index cc8bbc1e6bc9..09771ed3e693 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1236,61 +1236,6 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
1236 } 1236 }
1237} 1237}
1238 1238
1239#ifdef CONFIG_SLAB_FREELIST_RANDOM
1240static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list,
1241 size_t count)
1242{
1243 size_t i;
1244 unsigned int rand;
1245
1246 for (i = 0; i < count; i++)
1247 list[i] = i;
1248
1249 /* Fisher-Yates shuffle */
1250 for (i = count - 1; i > 0; i--) {
1251 rand = prandom_u32_state(state);
1252 rand %= (i + 1);
1253 swap(list[i], list[rand]);
1254 }
1255}
1256
1257/* Create a random sequence per cache */
1258static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
1259{
1260 unsigned int seed, count = cachep->num;
1261 struct rnd_state state;
1262
1263 if (count < 2)
1264 return 0;
1265
1266 /* If it fails, we will just use the global lists */
1267 cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp);
1268 if (!cachep->random_seq)
1269 return -ENOMEM;
1270
1271 /* Get best entropy at this stage */
1272 get_random_bytes_arch(&seed, sizeof(seed));
1273 prandom_seed_state(&state, seed);
1274
1275 freelist_randomize(&state, cachep->random_seq, count);
1276 return 0;
1277}
1278
1279/* Destroy the per-cache random freelist sequence */
1280static void cache_random_seq_destroy(struct kmem_cache *cachep)
1281{
1282 kfree(cachep->random_seq);
1283 cachep->random_seq = NULL;
1284}
1285#else
1286static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
1287{
1288 return 0;
1289}
1290static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
1291#endif /* CONFIG_SLAB_FREELIST_RANDOM */
1292
1293
1294/* 1239/*
1295 * Initialisation. Called after the page allocator has been initialised and 1240 * Initialisation. Called after the page allocator has been initialised and
1296 * before smp_init(). 1241 * before smp_init().
@@ -2535,7 +2480,7 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
2535union freelist_init_state { 2480union freelist_init_state {
2536 struct { 2481 struct {
2537 unsigned int pos; 2482 unsigned int pos;
2538 freelist_idx_t *list; 2483 unsigned int *list;
2539 unsigned int count; 2484 unsigned int count;
2540 unsigned int rand; 2485 unsigned int rand;
2541 }; 2486 };
@@ -2554,7 +2499,7 @@ static bool freelist_state_initialize(union freelist_init_state *state,
2554 unsigned int rand; 2499 unsigned int rand;
2555 2500
2556 /* Use best entropy available to define a random shift */ 2501 /* Use best entropy available to define a random shift */
2557 get_random_bytes_arch(&rand, sizeof(rand)); 2502 rand = get_random_int();
2558 2503
2559 /* Use a random state if the pre-computed list is not available */ 2504 /* Use a random state if the pre-computed list is not available */
2560 if (!cachep->random_seq) { 2505 if (!cachep->random_seq) {
@@ -2576,13 +2521,20 @@ static freelist_idx_t next_random_slot(union freelist_init_state *state)
2576 return (state->list[state->pos++] + state->rand) % state->count; 2521 return (state->list[state->pos++] + state->rand) % state->count;
2577} 2522}
2578 2523
2524/* Swap two freelist entries */
2525static void swap_free_obj(struct page *page, unsigned int a, unsigned int b)
2526{
2527 swap(((freelist_idx_t *)page->freelist)[a],
2528 ((freelist_idx_t *)page->freelist)[b]);
2529}
2530
2579/* 2531/*
2580 * Shuffle the freelist initialization state based on pre-computed lists. 2532 * Shuffle the freelist initialization state based on pre-computed lists.
2581 * return true if the list was successfully shuffled, false otherwise. 2533 * return true if the list was successfully shuffled, false otherwise.
2582 */ 2534 */
2583static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) 2535static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
2584{ 2536{
2585 unsigned int objfreelist = 0, i, count = cachep->num; 2537 unsigned int objfreelist = 0, i, rand, count = cachep->num;
2586 union freelist_init_state state; 2538 union freelist_init_state state;
2587 bool precomputed; 2539 bool precomputed;
2588 2540
@@ -2607,7 +2559,15 @@ static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
2607 * Later use a pre-computed list for speed. 2559 * Later use a pre-computed list for speed.
2608 */ 2560 */
2609 if (!precomputed) { 2561 if (!precomputed) {
2610 freelist_randomize(&state.rnd_state, page->freelist, count); 2562 for (i = 0; i < count; i++)
2563 set_free_obj(page, i, i);
2564
2565 /* Fisher-Yates shuffle */
2566 for (i = count - 1; i > 0; i--) {
2567 rand = prandom_u32_state(&state.rnd_state);
2568 rand %= (i + 1);
2569 swap_free_obj(page, i, rand);
2570 }
2611 } else { 2571 } else {
2612 for (i = 0; i < count; i++) 2572 for (i = 0; i < count; i++)
2613 set_free_obj(page, i, next_random_slot(&state)); 2573 set_free_obj(page, i, next_random_slot(&state));
@@ -2726,8 +2686,11 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
2726 * critical path in kmem_cache_alloc(). 2686 * critical path in kmem_cache_alloc().
2727 */ 2687 */
2728 if (unlikely(flags & GFP_SLAB_BUG_MASK)) { 2688 if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
2729 pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); 2689 gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
2730 BUG(); 2690 flags &= ~GFP_SLAB_BUG_MASK;
2691 pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
2692 invalid_mask, &invalid_mask, flags, &flags);
2693 dump_stack();
2731 } 2694 }
2732 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 2695 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2733 2696
@@ -3489,8 +3452,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
3489 n->free_objects -= cachep->num; 3452 n->free_objects -= cachep->num;
3490 3453
3491 page = list_last_entry(&n->slabs_free, struct page, lru); 3454 page = list_last_entry(&n->slabs_free, struct page, lru);
3492 list_del(&page->lru); 3455 list_move(&page->lru, list);
3493 list_add(&page->lru, list);
3494 } 3456 }
3495} 3457}
3496 3458
@@ -3979,7 +3941,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
3979 int shared = 0; 3941 int shared = 0;
3980 int batchcount = 0; 3942 int batchcount = 0;
3981 3943
3982 err = cache_random_seq_create(cachep, gfp); 3944 err = cache_random_seq_create(cachep, cachep->num, gfp);
3983 if (err) 3945 if (err)
3984 goto end; 3946 goto end;
3985 3947
diff --git a/mm/slab.h b/mm/slab.h
index dedb1a920fb8..f33980ab0406 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -42,6 +42,7 @@ struct kmem_cache {
42#include <linux/kmemcheck.h> 42#include <linux/kmemcheck.h>
43#include <linux/kasan.h> 43#include <linux/kasan.h>
44#include <linux/kmemleak.h> 44#include <linux/kmemleak.h>
45#include <linux/random.h>
45 46
46/* 47/*
47 * State of the slab allocator. 48 * State of the slab allocator.
@@ -253,8 +254,7 @@ static __always_inline int memcg_charge_slab(struct page *page,
253 if (is_root_cache(s)) 254 if (is_root_cache(s))
254 return 0; 255 return 0;
255 256
256 ret = __memcg_kmem_charge_memcg(page, gfp, order, 257 ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
257 s->memcg_params.memcg);
258 if (ret) 258 if (ret)
259 return ret; 259 return ret;
260 260
@@ -268,6 +268,9 @@ static __always_inline int memcg_charge_slab(struct page *page,
268static __always_inline void memcg_uncharge_slab(struct page *page, int order, 268static __always_inline void memcg_uncharge_slab(struct page *page, int order,
269 struct kmem_cache *s) 269 struct kmem_cache *s)
270{ 270{
271 if (!memcg_kmem_enabled())
272 return;
273
271 memcg_kmem_update_page_stat(page, 274 memcg_kmem_update_page_stat(page,
272 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 275 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
273 MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, 276 MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE,
@@ -390,7 +393,11 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
390 if (should_failslab(s, flags)) 393 if (should_failslab(s, flags))
391 return NULL; 394 return NULL;
392 395
393 return memcg_kmem_get_cache(s, flags); 396 if (memcg_kmem_enabled() &&
397 ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
398 return memcg_kmem_get_cache(s);
399
400 return s;
394} 401}
395 402
396static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, 403static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
@@ -407,7 +414,9 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
407 s->flags, flags); 414 s->flags, flags);
408 kasan_slab_alloc(s, object, flags); 415 kasan_slab_alloc(s, object, flags);
409 } 416 }
410 memcg_kmem_put_cache(s); 417
418 if (memcg_kmem_enabled())
419 memcg_kmem_put_cache(s);
411} 420}
412 421
413#ifndef CONFIG_SLOB 422#ifndef CONFIG_SLOB
@@ -464,4 +473,17 @@ int memcg_slab_show(struct seq_file *m, void *p);
464 473
465void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); 474void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
466 475
476#ifdef CONFIG_SLAB_FREELIST_RANDOM
477int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
478 gfp_t gfp);
479void cache_random_seq_destroy(struct kmem_cache *cachep);
480#else
481static inline int cache_random_seq_create(struct kmem_cache *cachep,
482 unsigned int count, gfp_t gfp)
483{
484 return 0;
485}
486static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
487#endif /* CONFIG_SLAB_FREELIST_RANDOM */
488
467#endif /* MM_SLAB_H */ 489#endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 82317abb03ed..71f0b28a1bec 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1012,7 +1012,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
1012 struct page *page; 1012 struct page *page;
1013 1013
1014 flags |= __GFP_COMP; 1014 flags |= __GFP_COMP;
1015 page = alloc_kmem_pages(flags, order); 1015 page = alloc_pages(flags, order);
1016 ret = page ? page_address(page) : NULL; 1016 ret = page ? page_address(page) : NULL;
1017 kmemleak_alloc(ret, size, 1, flags); 1017 kmemleak_alloc(ret, size, 1, flags);
1018 kasan_kmalloc_large(ret, size, flags); 1018 kasan_kmalloc_large(ret, size, flags);
@@ -1030,6 +1030,53 @@ void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
1030EXPORT_SYMBOL(kmalloc_order_trace); 1030EXPORT_SYMBOL(kmalloc_order_trace);
1031#endif 1031#endif
1032 1032
1033#ifdef CONFIG_SLAB_FREELIST_RANDOM
1034/* Randomize a generic freelist */
1035static void freelist_randomize(struct rnd_state *state, unsigned int *list,
1036 size_t count)
1037{
1038 size_t i;
1039 unsigned int rand;
1040
1041 for (i = 0; i < count; i++)
1042 list[i] = i;
1043
1044 /* Fisher-Yates shuffle */
1045 for (i = count - 1; i > 0; i--) {
1046 rand = prandom_u32_state(state);
1047 rand %= (i + 1);
1048 swap(list[i], list[rand]);
1049 }
1050}
1051
1052/* Create a random sequence per cache */
1053int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count,
1054 gfp_t gfp)
1055{
1056 struct rnd_state state;
1057
1058 if (count < 2 || cachep->random_seq)
1059 return 0;
1060
1061 cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp);
1062 if (!cachep->random_seq)
1063 return -ENOMEM;
1064
1065 /* Get best entropy at this stage of boot */
1066 prandom_seed_state(&state, get_random_long());
1067
1068 freelist_randomize(&state, cachep->random_seq, count);
1069 return 0;
1070}
1071
1072/* Destroy the per-cache random freelist sequence */
1073void cache_random_seq_destroy(struct kmem_cache *cachep)
1074{
1075 kfree(cachep->random_seq);
1076 cachep->random_seq = NULL;
1077}
1078#endif /* CONFIG_SLAB_FREELIST_RANDOM */
1079
1033#ifdef CONFIG_SLABINFO 1080#ifdef CONFIG_SLABINFO
1034 1081
1035#ifdef CONFIG_SLAB 1082#ifdef CONFIG_SLAB
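
The freelist_randomize() added above is a textbook Fisher-Yates shuffle over the object indices. A standalone sketch of the same loop, with libc rand() standing in for the kernel's prandom_u32_state():

/*
 * Userspace sketch of the Fisher-Yates shuffle that
 * cache_random_seq_create() applies to the freelist index array.
 * rand() replaces prandom_u32_state(); otherwise the loop mirrors
 * freelist_randomize() above.
 */
#include <stdio.h>
#include <stdlib.h>

static void freelist_randomize(unsigned int *list, size_t count)
{
	size_t i;
	unsigned int r, tmp;

	for (i = 0; i < count; i++)
		list[i] = i;

	/* Fisher-Yates: swap each slot with a random earlier (or same) slot */
	for (i = count - 1; i > 0; i--) {
		r = (unsigned int)rand() % (i + 1);
		tmp = list[i];
		list[i] = list[r];
		list[r] = tmp;
	}
}

int main(void)
{
	unsigned int seq[8];
	size_t i;

	srand(42);		/* fixed seed just for the demo */
	freelist_randomize(seq, 8);
	for (i = 0; i < 8; i++)
		printf("%u ", seq[i]);
	printf("\n");
	return 0;
}
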
diff --git a/mm/slub.c b/mm/slub.c
index 825ff4505336..f9da8716b8b3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1405,6 +1405,109 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
1405 return page; 1405 return page;
1406} 1406}
1407 1407
1408#ifdef CONFIG_SLAB_FREELIST_RANDOM
1409/* Pre-initialize the random sequence cache */
1410static int init_cache_random_seq(struct kmem_cache *s)
1411{
1412 int err;
1413 unsigned long i, count = oo_objects(s->oo);
1414
1415 err = cache_random_seq_create(s, count, GFP_KERNEL);
1416 if (err) {
1417 pr_err("SLUB: Unable to initialize free list for %s\n",
1418 s->name);
1419 return err;
1420 }
1421
1422 /* Transform to an offset on the set of pages */
1423 if (s->random_seq) {
1424 for (i = 0; i < count; i++)
1425 s->random_seq[i] *= s->size;
1426 }
1427 return 0;
1428}
1429
1430/* Initialize each random sequence freelist per cache */
1431static void __init init_freelist_randomization(void)
1432{
1433 struct kmem_cache *s;
1434
1435 mutex_lock(&slab_mutex);
1436
1437 list_for_each_entry(s, &slab_caches, list)
1438 init_cache_random_seq(s);
1439
1440 mutex_unlock(&slab_mutex);
1441}
1442
1443/* Get the next entry on the pre-computed freelist randomized */
1444static void *next_freelist_entry(struct kmem_cache *s, struct page *page,
1445 unsigned long *pos, void *start,
1446 unsigned long page_limit,
1447 unsigned long freelist_count)
1448{
1449 unsigned int idx;
1450
1451 /*
1452 * If the target page allocation failed, the number of objects on the
1453 * page might be smaller than the usual size defined by the cache.
1454 */
1455 do {
1456 idx = s->random_seq[*pos];
1457 *pos += 1;
1458 if (*pos >= freelist_count)
1459 *pos = 0;
1460 } while (unlikely(idx >= page_limit));
1461
1462 return (char *)start + idx;
1463}
1464
1465/* Shuffle the singly linked freelist based on a random pre-computed sequence */
1466static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
1467{
1468 void *start;
1469 void *cur;
1470 void *next;
1471 unsigned long idx, pos, page_limit, freelist_count;
1472
1473 if (page->objects < 2 || !s->random_seq)
1474 return false;
1475
1476 freelist_count = oo_objects(s->oo);
1477 pos = get_random_int() % freelist_count;
1478
1479 page_limit = page->objects * s->size;
1480 start = fixup_red_left(s, page_address(page));
1481
1482 /* First entry is used as the base of the freelist */
1483 cur = next_freelist_entry(s, page, &pos, start, page_limit,
1484 freelist_count);
1485 page->freelist = cur;
1486
1487 for (idx = 1; idx < page->objects; idx++) {
1488 setup_object(s, page, cur);
1489 next = next_freelist_entry(s, page, &pos, start, page_limit,
1490 freelist_count);
1491 set_freepointer(s, cur, next);
1492 cur = next;
1493 }
1494 setup_object(s, page, cur);
1495 set_freepointer(s, cur, NULL);
1496
1497 return true;
1498}
1499#else
1500static inline int init_cache_random_seq(struct kmem_cache *s)
1501{
1502 return 0;
1503}
1504static inline void init_freelist_randomization(void) { }
1505static inline bool shuffle_freelist(struct kmem_cache *s, struct page *page)
1506{
1507 return false;
1508}
1509#endif /* CONFIG_SLAB_FREELIST_RANDOM */
1510
1408static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) 1511static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1409{ 1512{
1410 struct page *page; 1513 struct page *page;
@@ -1412,6 +1515,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1412 gfp_t alloc_gfp; 1515 gfp_t alloc_gfp;
1413 void *start, *p; 1516 void *start, *p;
1414 int idx, order; 1517 int idx, order;
1518 bool shuffle;
1415 1519
1416 flags &= gfp_allowed_mask; 1520 flags &= gfp_allowed_mask;
1417 1521
@@ -1473,15 +1577,19 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1473 1577
1474 kasan_poison_slab(page); 1578 kasan_poison_slab(page);
1475 1579
1476 for_each_object_idx(p, idx, s, start, page->objects) { 1580 shuffle = shuffle_freelist(s, page);
1477 setup_object(s, page, p); 1581
1478 if (likely(idx < page->objects)) 1582 if (!shuffle) {
1479 set_freepointer(s, p, p + s->size); 1583 for_each_object_idx(p, idx, s, start, page->objects) {
1480 else 1584 setup_object(s, page, p);
1481 set_freepointer(s, p, NULL); 1585 if (likely(idx < page->objects))
1586 set_freepointer(s, p, p + s->size);
1587 else
1588 set_freepointer(s, p, NULL);
1589 }
1590 page->freelist = fixup_red_left(s, start);
1482 } 1591 }
1483 1592
1484 page->freelist = fixup_red_left(s, start);
1485 page->inuse = page->objects; 1593 page->inuse = page->objects;
1486 page->frozen = 1; 1594 page->frozen = 1;
1487 1595
@@ -1504,8 +1612,10 @@ out:
1504static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1612static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1505{ 1613{
1506 if (unlikely(flags & GFP_SLAB_BUG_MASK)) { 1614 if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
1507 pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); 1615 gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
1508 BUG(); 1616 flags &= ~GFP_SLAB_BUG_MASK;
1617 pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
1618 invalid_mask, &invalid_mask, flags, &flags);
1509 } 1619 }
1510 1620
1511 return allocate_slab(s, 1621 return allocate_slab(s,
@@ -2867,7 +2977,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
2867 if (unlikely(!PageSlab(page))) { 2977 if (unlikely(!PageSlab(page))) {
2868 BUG_ON(!PageCompound(page)); 2978 BUG_ON(!PageCompound(page));
2869 kfree_hook(object); 2979 kfree_hook(object);
2870 __free_kmem_pages(page, compound_order(page)); 2980 __free_pages(page, compound_order(page));
2871 p[size] = NULL; /* mark object processed */ 2981 p[size] = NULL; /* mark object processed */
2872 return size; 2982 return size;
2873 } 2983 }
@@ -3207,6 +3317,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
3207 3317
3208void __kmem_cache_release(struct kmem_cache *s) 3318void __kmem_cache_release(struct kmem_cache *s)
3209{ 3319{
3320 cache_random_seq_destroy(s);
3210 free_percpu(s->cpu_slab); 3321 free_percpu(s->cpu_slab);
3211 free_kmem_cache_nodes(s); 3322 free_kmem_cache_nodes(s);
3212} 3323}
@@ -3431,6 +3542,13 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3431#ifdef CONFIG_NUMA 3542#ifdef CONFIG_NUMA
3432 s->remote_node_defrag_ratio = 1000; 3543 s->remote_node_defrag_ratio = 1000;
3433#endif 3544#endif
3545
3546 /* Initialize the pre-computed randomized freelist if slab is up */
3547 if (slab_state >= UP) {
3548 if (init_cache_random_seq(s))
3549 goto error;
3550 }
3551
3434 if (!init_kmem_cache_nodes(s)) 3552 if (!init_kmem_cache_nodes(s))
3435 goto error; 3553 goto error;
3436 3554
@@ -3575,7 +3693,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3575 void *ptr = NULL; 3693 void *ptr = NULL;
3576 3694
3577 flags |= __GFP_COMP | __GFP_NOTRACK; 3695 flags |= __GFP_COMP | __GFP_NOTRACK;
3578 page = alloc_kmem_pages_node(node, flags, get_order(size)); 3696 page = alloc_pages_node(node, flags, get_order(size));
3579 if (page) 3697 if (page)
3580 ptr = page_address(page); 3698 ptr = page_address(page);
3581 3699
@@ -3656,7 +3774,7 @@ void kfree(const void *x)
3656 if (unlikely(!PageSlab(page))) { 3774 if (unlikely(!PageSlab(page))) {
3657 BUG_ON(!PageCompound(page)); 3775 BUG_ON(!PageCompound(page));
3658 kfree_hook(x); 3776 kfree_hook(x);
3659 __free_kmem_pages(page, compound_order(page)); 3777 __free_pages(page, compound_order(page));
3660 return; 3778 return;
3661 } 3779 }
3662 slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); 3780 slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
@@ -3947,6 +4065,9 @@ void __init kmem_cache_init(void)
3947 setup_kmalloc_cache_index_table(); 4065 setup_kmalloc_cache_index_table();
3948 create_kmalloc_caches(0); 4066 create_kmalloc_caches(0);
3949 4067
4068 /* Setup random freelists for each cache */
4069 init_freelist_randomization();
4070
3950#ifdef CONFIG_SMP 4071#ifdef CONFIG_SMP
3951 register_cpu_notifier(&slab_notifier); 4072 register_cpu_notifier(&slab_notifier);
3952#endif 4073#endif
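
For SLUB, next_freelist_entry() above walks the pre-computed sequence from a random starting position, wrapping around and skipping offsets that fall beyond the objects actually present on the page. A small sketch of that walk, with plain arrays standing in for kmem_cache and page state and made-up example values:

/*
 * Sketch of the wrap-around walk done by next_freelist_entry() in the
 * SLUB hunk above: start at a random position in the sequence, advance
 * modulo its length, and skip entries beyond this page's objects.
 * Not kernel code; the sequence and sizes below are invented.
 */
#include <stdio.h>

static unsigned int next_entry(const unsigned int *random_seq,
			       unsigned long *pos,
			       unsigned long page_limit,
			       unsigned long freelist_count)
{
	unsigned int idx;

	do {
		idx = random_seq[*pos];
		*pos += 1;
		if (*pos >= freelist_count)
			*pos = 0;	/* wrap around the sequence */
	} while (idx >= page_limit);	/* skip objects not on this page */

	return idx;
}

int main(void)
{
	/* pre-computed offsets, already scaled by object size (here 64) */
	unsigned int seq[] = { 192, 0, 320, 128, 448, 64, 256, 384 };
	unsigned long pos = 3;		/* pretend get_random_int() % 8 == 3 */
	unsigned long page_limit = 4 * 64;	/* only 4 objects fit */
	int i;

	for (i = 0; i < 4; i++)
		printf("object offset %u\n",
		       next_entry(seq, &pos, page_limit, 8));
	return 0;
}
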
diff --git a/mm/swap.c b/mm/swap.c
index 90530ff8ed16..616df4ddd870 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -292,6 +292,7 @@ static bool need_activate_page_drain(int cpu)
292 292
293void activate_page(struct page *page) 293void activate_page(struct page *page)
294{ 294{
295 page = compound_head(page);
295 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 296 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
296 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); 297 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
297 298
@@ -316,6 +317,7 @@ void activate_page(struct page *page)
316{ 317{
317 struct zone *zone = page_zone(page); 318 struct zone *zone = page_zone(page);
318 319
320 page = compound_head(page);
319 spin_lock_irq(&zone->lru_lock); 321 spin_lock_irq(&zone->lru_lock);
320 __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); 322 __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
321 spin_unlock_irq(&zone->lru_lock); 323 spin_unlock_irq(&zone->lru_lock);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 031713ab40ce..78cfa292a29a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2493,7 +2493,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2493 goto bad_swap; 2493 goto bad_swap;
2494 } 2494 }
2495 /* frontswap enabled? set up bit-per-page map for frontswap */ 2495 /* frontswap enabled? set up bit-per-page map for frontswap */
2496 if (frontswap_enabled) 2496 if (IS_ENABLED(CONFIG_FRONTSWAP))
2497 frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); 2497 frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
2498 2498
2499 if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { 2499 if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
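
The swapon() hunk above trades the runtime frontswap_enabled check for a compile-time IS_ENABLED(CONFIG_FRONTSWAP) test, so the bit-per-page frontswap map is only allocated when the feature is built in. The sizing it relies on is just "round the bit count up to whole longs"; the sketch below re-derives BITS_TO_LONGS locally for illustration rather than pulling it from kernel headers.

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG	(sizeof(unsigned long) * CHAR_BIT)
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	unsigned long maxpages = 1000000;	/* ~3.8GiB worth of 4KiB swap slots */
	size_t bytes = BITS_TO_LONGS(maxpages) * sizeof(unsigned long);

	/* one bit per swap page, rounded up to whole unsigned longs */
	printf("frontswap map for %lu pages: %zu bytes\n", maxpages, bytes);
	return 0;
}
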
diff --git a/mm/truncate.c b/mm/truncate.c
index 4064f8f53daa..a01cce450a26 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -155,10 +155,14 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
155 155
156int truncate_inode_page(struct address_space *mapping, struct page *page) 156int truncate_inode_page(struct address_space *mapping, struct page *page)
157{ 157{
158 loff_t holelen;
159 VM_BUG_ON_PAGE(PageTail(page), page);
160
161 holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE;
158 if (page_mapped(page)) { 162 if (page_mapped(page)) {
159 unmap_mapping_range(mapping, 163 unmap_mapping_range(mapping,
160 (loff_t)page->index << PAGE_SHIFT, 164 (loff_t)page->index << PAGE_SHIFT,
161 PAGE_SIZE, 0); 165 holelen, 0);
162 } 166 }
163 return truncate_complete_page(mapping, page); 167 return truncate_complete_page(mapping, page);
164} 168}
@@ -279,7 +283,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
279 283
280 if (!trylock_page(page)) 284 if (!trylock_page(page))
281 continue; 285 continue;
282 WARN_ON(page->index != index); 286 WARN_ON(page_to_pgoff(page) != index);
283 if (PageWriteback(page)) { 287 if (PageWriteback(page)) {
284 unlock_page(page); 288 unlock_page(page);
285 continue; 289 continue;
@@ -367,7 +371,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
367 } 371 }
368 372
369 lock_page(page); 373 lock_page(page);
370 WARN_ON(page->index != index); 374 WARN_ON(page_to_pgoff(page) != index);
371 wait_on_page_writeback(page); 375 wait_on_page_writeback(page);
372 truncate_inode_page(mapping, page); 376 truncate_inode_page(mapping, page);
373 unlock_page(page); 377 unlock_page(page);
@@ -487,7 +491,21 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
487 491
488 if (!trylock_page(page)) 492 if (!trylock_page(page))
489 continue; 493 continue;
490 WARN_ON(page->index != index); 494
495 WARN_ON(page_to_pgoff(page) != index);
496
497 /* Middle of THP: skip */
498 if (PageTransTail(page)) {
499 unlock_page(page);
500 continue;
501 } else if (PageTransHuge(page)) {
502 index += HPAGE_PMD_NR - 1;
503 i += HPAGE_PMD_NR - 1;
504 /* 'end' is in the middle of THP */
505 if (index == round_down(end, HPAGE_PMD_NR))
506 continue;
507 }
508
491 ret = invalidate_inode_page(page); 509 ret = invalidate_inode_page(page);
492 unlock_page(page); 510 unlock_page(page);
493 /* 511 /*
@@ -594,7 +612,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
594 } 612 }
595 613
596 lock_page(page); 614 lock_page(page);
597 WARN_ON(page->index != index); 615 WARN_ON(page_to_pgoff(page) != index);
598 if (page->mapping != mapping) { 616 if (page->mapping != mapping) {
599 unlock_page(page); 617 unlock_page(page);
600 continue; 618 continue;
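
The truncate.c changes above make hole punching aware of huge pages: truncate_inode_page() computes holelen as HPAGE_PMD_SIZE for a transparent huge page instead of always PAGE_SIZE, the WARN_ONs switch to page_to_pgoff() so tail pages report a sane offset, and invalidate_mapping_pages() skips THP tails and advances the index past a huge page it has already handled. A minimal sketch of the holelen arithmetic, assuming x86-64 style 4KiB base pages and 2MiB PMD pages; the helper name and constants are local to the example.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define HPAGE_PMD_SIZE	(1UL << 21)	/* 2MiB */

/* Mirror of the new truncate_inode_page() hole computation: punch
 * either one base page or a whole PMD-sized page, starting at the
 * page's byte offset in the file. */
static void hole_to_punch(unsigned long index, int is_thp,
			  unsigned long long *start, unsigned long long *len)
{
	*start = (unsigned long long)index << PAGE_SHIFT;
	*len = is_thp ? HPAGE_PMD_SIZE : PAGE_SIZE;
}

int main(void)
{
	unsigned long long start, len;

	hole_to_punch(512, 1, &start, &len);	/* huge page mapped at file offset 2MiB */
	printf("unmap [%llu, %llu)\n", start, start + len);
	return 0;
}
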
diff --git a/mm/util.c b/mm/util.c
index 917e0e3d0f8e..8d010ef2ce1c 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -399,10 +399,12 @@ struct address_space *page_mapping(struct page *page)
399 } 399 }
400 400
401 mapping = page->mapping; 401 mapping = page->mapping;
402 if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) 402 if ((unsigned long)mapping & PAGE_MAPPING_ANON)
403 return NULL; 403 return NULL;
404 return mapping; 404
405 return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
405} 406}
407EXPORT_SYMBOL(page_mapping);
406 408
407/* Slow path of page_mapcount() for compound pages */ 409/* Slow path of page_mapcount() for compound pages */
408int __page_mapcount(struct page *page) 410int __page_mapcount(struct page *page)
@@ -410,6 +412,12 @@ int __page_mapcount(struct page *page)
410 int ret; 412 int ret;
411 413
412 ret = atomic_read(&page->_mapcount) + 1; 414 ret = atomic_read(&page->_mapcount) + 1;
415 /*
416 * For file THP page->_mapcount contains total number of mapping
417 * of the page: no need to look into compound_mapcount.
418 */
419 if (!PageAnon(page) && !PageHuge(page))
420 return ret;
413 page = compound_head(page); 421 page = compound_head(page);
414 ret += atomic_read(compound_mapcount_ptr(page)) + 1; 422 ret += atomic_read(compound_mapcount_ptr(page)) + 1;
415 if (PageDoubleMap(page)) 423 if (PageDoubleMap(page))
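
In the util.c hunk, page_mapping() stops treating every low tag bit as "no mapping": only PAGE_MAPPING_ANON means anonymous now, and the remaining tag bits are masked off so callers get the real address_space pointer back (the function is also exported). The underlying trick is storing flags in the low bits of an aligned pointer; the sketch below reproduces it with local stand-in constants and a dummy struct, so the names deliberately do not match the kernel's.

#include <stdio.h>

/* Pointers to aligned objects have their low bits clear, so those
 * bits can carry type information (anon vs movable here). */
#define MAPPING_ANON	0x1UL
#define MAPPING_MOVABLE	0x2UL
#define MAPPING_FLAGS	(MAPPING_ANON | MAPPING_MOVABLE)

struct address_space_stub {
	long dummy;
};

static struct address_space_stub *mapping_of(unsigned long tagged)
{
	if (tagged & MAPPING_ANON)
		return NULL;			/* anonymous page: no mapping */
	return (struct address_space_stub *)(tagged & ~MAPPING_FLAGS);
}

int main(void)
{
	struct address_space_stub as;
	unsigned long movable = (unsigned long)&as | MAPPING_MOVABLE;
	unsigned long anon = (unsigned long)&as | MAPPING_ANON;

	printf("movable -> %p (expect %p)\n", (void *)mapping_of(movable), (void *)&as);
	printf("anon    -> %p (expect nil)\n", (void *)mapping_of(anon));
	return 0;
}
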
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e11475cdeb7a..91f44e78c516 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1501,7 +1501,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
1501 struct page *page = area->pages[i]; 1501 struct page *page = area->pages[i];
1502 1502
1503 BUG_ON(!page); 1503 BUG_ON(!page);
1504 __free_kmem_pages(page, 0); 1504 __free_pages(page, 0);
1505 } 1505 }
1506 1506
1507 kvfree(area->pages); 1507 kvfree(area->pages);
@@ -1629,9 +1629,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1629 struct page *page; 1629 struct page *page;
1630 1630
1631 if (node == NUMA_NO_NODE) 1631 if (node == NUMA_NO_NODE)
1632 page = alloc_kmem_pages(alloc_mask, order); 1632 page = alloc_pages(alloc_mask, order);
1633 else 1633 else
1634 page = alloc_kmem_pages_node(node, alloc_mask, order); 1634 page = alloc_pages_node(node, alloc_mask, order);
1635 1635
1636 if (unlikely(!page)) { 1636 if (unlikely(!page)) {
1637 /* Successfully allocated i pages, free them in __vunmap() */ 1637 /* Successfully allocated i pages, free them in __vunmap() */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c4a2f4512fca..21d417ccff69 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1055,8 +1055,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
1055 1055
1056 /* Adding to swap updated mapping */ 1056 /* Adding to swap updated mapping */
1057 mapping = page_mapping(page); 1057 mapping = page_mapping(page);
1058 } else if (unlikely(PageTransHuge(page))) {
1059 /* Split file THP */
1060 if (split_huge_page_to_list(page, page_list))
1061 goto keep_locked;
1058 } 1062 }
1059 1063
1064 VM_BUG_ON_PAGE(PageTransHuge(page), page);
1065
1060 /* 1066 /*
1061 * The page is mapped into the page tables of one or more 1067 * The page is mapped into the page tables of one or more
1062 * processes. Try to unmap it here. 1068 * processes. Try to unmap it here.
@@ -1254,7 +1260,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
1254 1260
1255 list_for_each_entry_safe(page, next, page_list, lru) { 1261 list_for_each_entry_safe(page, next, page_list, lru) {
1256 if (page_is_file_cache(page) && !PageDirty(page) && 1262 if (page_is_file_cache(page) && !PageDirty(page) &&
1257 !isolated_balloon_page(page)) { 1263 !__PageMovable(page)) {
1258 ClearPageActive(page); 1264 ClearPageActive(page);
1259 list_move(&page->lru, &clean_pages); 1265 list_move(&page->lru, &clean_pages);
1260 } 1266 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index cb2a67bb4158..7997f52935c9 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -718,7 +718,9 @@ const char * const vmstat_text[] = {
718 "nr_dirtied", 718 "nr_dirtied",
719 "nr_written", 719 "nr_written",
720 "nr_pages_scanned", 720 "nr_pages_scanned",
721 721#if IS_ENABLED(CONFIG_ZSMALLOC)
722 "nr_zspages",
723#endif
722#ifdef CONFIG_NUMA 724#ifdef CONFIG_NUMA
723 "numa_hit", 725 "numa_hit",
724 "numa_miss", 726 "numa_miss",
@@ -731,6 +733,8 @@ const char * const vmstat_text[] = {
731 "workingset_activate", 733 "workingset_activate",
732 "workingset_nodereclaim", 734 "workingset_nodereclaim",
733 "nr_anon_transparent_hugepages", 735 "nr_anon_transparent_hugepages",
736 "nr_shmem_hugepages",
737 "nr_shmem_pmdmapped",
734 "nr_free_cma", 738 "nr_free_cma",
735 739
736 /* enum writeback_stat_item counters */ 740 /* enum writeback_stat_item counters */
@@ -815,6 +819,8 @@ const char * const vmstat_text[] = {
815 "thp_fault_fallback", 819 "thp_fault_fallback",
816 "thp_collapse_alloc", 820 "thp_collapse_alloc",
817 "thp_collapse_alloc_failed", 821 "thp_collapse_alloc_failed",
822 "thp_file_alloc",
823 "thp_file_mapped",
818 "thp_split_page", 824 "thp_split_page",
819 "thp_split_page_failed", 825 "thp_split_page_failed",
820 "thp_deferred_split_page", 826 "thp_deferred_split_page",
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b6d4f258cb53..04176de6df70 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -16,32 +16,15 @@
16 * struct page(s) to form a zspage. 16 * struct page(s) to form a zspage.
17 * 17 *
18 * Usage of struct page fields: 18 * Usage of struct page fields:
19 * page->private: points to the first component (0-order) page 19 * page->private: points to zspage
20 * page->index (union with page->freelist): offset of the first object 20 * page->freelist(index): links together all component pages of a zspage
21 * starting in this page. For the first page, this is 21 * For the huge page, this is always 0, so we use this field
22 * always 0, so we use this field (aka freelist) to point 22 * to store handle.
23 * to the first free object in zspage.
24 * page->lru: links together all component pages (except the first page)
25 * of a zspage
26 *
27 * For _first_ page only:
28 *
29 * page->private: refers to the component page after the first page
30 * If the page is first_page for huge object, it stores handle.
31 * Look at size_class->huge.
32 * page->freelist: points to the first free object in zspage.
33 * Free objects are linked together using in-place
34 * metadata.
35 * page->objects: maximum number of objects we can store in this
36 * zspage (class->zspage_order * PAGE_SIZE / class->size)
37 * page->lru: links together first pages of various zspages.
38 * Basically forming list of zspages in a fullness group.
39 * page->mapping: class index and fullness group of the zspage
40 * page->inuse: the number of objects that are used in this zspage
41 * 23 *
42 * Usage of struct page flags: 24 * Usage of struct page flags:
43 * PG_private: identifies the first component page 25 * PG_private: identifies the first component page
44 * PG_private2: identifies the last component page 26 * PG_private2: identifies the last component page
 27 * PG_owner_priv_1: identifies the huge component page

45 * 28 *
46 */ 29 */
47 30
@@ -66,6 +49,11 @@
66#include <linux/debugfs.h> 49#include <linux/debugfs.h>
67#include <linux/zsmalloc.h> 50#include <linux/zsmalloc.h>
68#include <linux/zpool.h> 51#include <linux/zpool.h>
52#include <linux/mount.h>
53#include <linux/migrate.h>
54#include <linux/pagemap.h>
55
56#define ZSPAGE_MAGIC 0x58
69 57
70/* 58/*
71 * This must be power of 2 and greater than of equal to sizeof(link_free). 59 * This must be power of 2 and greater than of equal to sizeof(link_free).
@@ -88,9 +76,7 @@
88 * Object location (<PFN>, <obj_idx>) is encoded as 76 * Object location (<PFN>, <obj_idx>) is encoded as
89 * as single (unsigned long) handle value. 77 * as single (unsigned long) handle value.
90 * 78 *
91 * Note that object index <obj_idx> is relative to system 79 * Note that object index <obj_idx> starts from 0.
92 * page <PFN> it is stored in, so for each sub-page belonging
93 * to a zspage, obj_idx starts with 0.
94 * 80 *
95 * This is made more complicated by various memory models and PAE. 81 * This is made more complicated by various memory models and PAE.
96 */ 82 */
@@ -149,33 +135,29 @@
149 * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN 135 * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
150 * (reason above) 136 * (reason above)
151 */ 137 */
152#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) 138#define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS)
153 139
154/* 140/*
155 * We do not maintain any list for completely empty or full pages 141 * We do not maintain any list for completely empty or full pages
156 */ 142 */
157enum fullness_group { 143enum fullness_group {
158 ZS_ALMOST_FULL,
159 ZS_ALMOST_EMPTY,
160 _ZS_NR_FULLNESS_GROUPS,
161
162 ZS_EMPTY, 144 ZS_EMPTY,
163 ZS_FULL 145 ZS_ALMOST_EMPTY,
146 ZS_ALMOST_FULL,
147 ZS_FULL,
148 NR_ZS_FULLNESS,
164}; 149};
165 150
166enum zs_stat_type { 151enum zs_stat_type {
152 CLASS_EMPTY,
153 CLASS_ALMOST_EMPTY,
154 CLASS_ALMOST_FULL,
155 CLASS_FULL,
167 OBJ_ALLOCATED, 156 OBJ_ALLOCATED,
168 OBJ_USED, 157 OBJ_USED,
169 CLASS_ALMOST_FULL, 158 NR_ZS_STAT_TYPE,
170 CLASS_ALMOST_EMPTY,
171}; 159};
172 160
173#ifdef CONFIG_ZSMALLOC_STAT
174#define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1)
175#else
176#define NR_ZS_STAT_TYPE (OBJ_USED + 1)
177#endif
178
179struct zs_size_stat { 161struct zs_size_stat {
180 unsigned long objs[NR_ZS_STAT_TYPE]; 162 unsigned long objs[NR_ZS_STAT_TYPE];
181}; 163};
@@ -184,6 +166,10 @@ struct zs_size_stat {
184static struct dentry *zs_stat_root; 166static struct dentry *zs_stat_root;
185#endif 167#endif
186 168
169#ifdef CONFIG_COMPACTION
170static struct vfsmount *zsmalloc_mnt;
171#endif
172
187/* 173/*
188 * number of size_classes 174 * number of size_classes
189 */ 175 */
@@ -207,35 +193,49 @@ static const int fullness_threshold_frac = 4;
207 193
208struct size_class { 194struct size_class {
209 spinlock_t lock; 195 spinlock_t lock;
210 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; 196 struct list_head fullness_list[NR_ZS_FULLNESS];
211 /* 197 /*
212 * Size of objects stored in this class. Must be multiple 198 * Size of objects stored in this class. Must be multiple
213 * of ZS_ALIGN. 199 * of ZS_ALIGN.
214 */ 200 */
215 int size; 201 int size;
216 unsigned int index; 202 int objs_per_zspage;
217
218 struct zs_size_stat stats;
219
220 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ 203 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
221 int pages_per_zspage; 204 int pages_per_zspage;
222 /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ 205
223 bool huge; 206 unsigned int index;
207 struct zs_size_stat stats;
224}; 208};
225 209
210/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
211static void SetPageHugeObject(struct page *page)
212{
213 SetPageOwnerPriv1(page);
214}
215
216static void ClearPageHugeObject(struct page *page)
217{
218 ClearPageOwnerPriv1(page);
219}
220
221static int PageHugeObject(struct page *page)
222{
223 return PageOwnerPriv1(page);
224}
225
226/* 226/*
227 * Placed within free objects to form a singly linked list. 227 * Placed within free objects to form a singly linked list.
228 * For every zspage, first_page->freelist gives head of this list. 228 * For every zspage, zspage->freeobj gives head of this list.
229 * 229 *
230 * This must be power of 2 and less than or equal to ZS_ALIGN 230 * This must be power of 2 and less than or equal to ZS_ALIGN
231 */ 231 */
232struct link_free { 232struct link_free {
233 union { 233 union {
234 /* 234 /*
235 * Position of next free chunk (encodes <PFN, obj_idx>) 235 * Free object index;
236 * It's valid for non-allocated object 236 * It's valid for non-allocated object
237 */ 237 */
238 void *next; 238 unsigned long next;
239 /* 239 /*
240 * Handle of allocated object. 240 * Handle of allocated object.
241 */ 241 */
@@ -248,6 +248,7 @@ struct zs_pool {
248 248
249 struct size_class **size_class; 249 struct size_class **size_class;
250 struct kmem_cache *handle_cachep; 250 struct kmem_cache *handle_cachep;
251 struct kmem_cache *zspage_cachep;
251 252
252 atomic_long_t pages_allocated; 253 atomic_long_t pages_allocated;
253 254
@@ -263,16 +264,36 @@ struct zs_pool {
263#ifdef CONFIG_ZSMALLOC_STAT 264#ifdef CONFIG_ZSMALLOC_STAT
264 struct dentry *stat_dentry; 265 struct dentry *stat_dentry;
265#endif 266#endif
267#ifdef CONFIG_COMPACTION
268 struct inode *inode;
269 struct work_struct free_work;
270#endif
266}; 271};
267 272
268/* 273/*
269 * A zspage's class index and fullness group 274 * A zspage's class index and fullness group
270 * are encoded in its (first)page->mapping 275 * are encoded in its (first)page->mapping
271 */ 276 */
272#define CLASS_IDX_BITS 28 277#define FULLNESS_BITS 2
273#define FULLNESS_BITS 4 278#define CLASS_BITS 8
274#define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1) 279#define ISOLATED_BITS 3
275#define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1) 280#define MAGIC_VAL_BITS 8
281
282struct zspage {
283 struct {
284 unsigned int fullness:FULLNESS_BITS;
285 unsigned int class:CLASS_BITS;
286 unsigned int isolated:ISOLATED_BITS;
287 unsigned int magic:MAGIC_VAL_BITS;
288 };
289 unsigned int inuse;
290 unsigned int freeobj;
291 struct page *first_page;
292 struct list_head list; /* fullness list */
293#ifdef CONFIG_COMPACTION
294 rwlock_t lock;
295#endif
296};
276 297
277struct mapping_area { 298struct mapping_area {
278#ifdef CONFIG_PGTABLE_MAPPING 299#ifdef CONFIG_PGTABLE_MAPPING
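
From here on, per-zspage metadata no longer hides in the first page's struct page fields: the new struct zspage packs fullness, class index, isolation count and a magic byte into one word of bit-fields (2+8+3+8 bits) next to inuse, freeobj and the fullness-list linkage. The standalone sketch below copies the field widths from the hunk; everything else (struct and variable names, the sample values) is made up for illustration.

#include <assert.h>
#include <stdio.h>

#define FULLNESS_BITS	2
#define CLASS_BITS	8
#define ISOLATED_BITS	3
#define MAGIC_VAL_BITS	8
#define ZSPAGE_MAGIC	0x58

/* 2 + 8 + 3 + 8 = 21 bits, so all four fields share one unsigned int. */
struct zspage_meta {
	unsigned int fullness:FULLNESS_BITS;
	unsigned int class:CLASS_BITS;
	unsigned int isolated:ISOLATED_BITS;
	unsigned int magic:MAGIC_VAL_BITS;
};

int main(void)
{
	struct zspage_meta meta = {
		.fullness = 3,		/* e.g. ZS_FULL */
		.class = 201,		/* any class index below 2^8 */
		.isolated = 1,
		.magic = ZSPAGE_MAGIC,
	};

	/* get_zspage()/get_zspage_mapping() BUG when the magic is wrong */
	assert(meta.magic == ZSPAGE_MAGIC);
	printf("packed into %zu bytes: class=%u fullness=%u\n",
	       sizeof(meta), meta.class, meta.fullness);
	return 0;
}
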
@@ -284,29 +305,74 @@ struct mapping_area {
284 enum zs_mapmode vm_mm; /* mapping mode */ 305 enum zs_mapmode vm_mm; /* mapping mode */
285}; 306};
286 307
287static int create_handle_cache(struct zs_pool *pool) 308#ifdef CONFIG_COMPACTION
309static int zs_register_migration(struct zs_pool *pool);
310static void zs_unregister_migration(struct zs_pool *pool);
311static void migrate_lock_init(struct zspage *zspage);
312static void migrate_read_lock(struct zspage *zspage);
313static void migrate_read_unlock(struct zspage *zspage);
314static void kick_deferred_free(struct zs_pool *pool);
315static void init_deferred_free(struct zs_pool *pool);
316static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
317#else
318static int zsmalloc_mount(void) { return 0; }
319static void zsmalloc_unmount(void) {}
320static int zs_register_migration(struct zs_pool *pool) { return 0; }
321static void zs_unregister_migration(struct zs_pool *pool) {}
322static void migrate_lock_init(struct zspage *zspage) {}
323static void migrate_read_lock(struct zspage *zspage) {}
324static void migrate_read_unlock(struct zspage *zspage) {}
325static void kick_deferred_free(struct zs_pool *pool) {}
326static void init_deferred_free(struct zs_pool *pool) {}
327static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {}
328#endif
329
330static int create_cache(struct zs_pool *pool)
288{ 331{
289 pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, 332 pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
290 0, 0, NULL); 333 0, 0, NULL);
291 return pool->handle_cachep ? 0 : 1; 334 if (!pool->handle_cachep)
335 return 1;
336
337 pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage),
338 0, 0, NULL);
339 if (!pool->zspage_cachep) {
340 kmem_cache_destroy(pool->handle_cachep);
341 pool->handle_cachep = NULL;
342 return 1;
343 }
344
345 return 0;
292} 346}
293 347
294static void destroy_handle_cache(struct zs_pool *pool) 348static void destroy_cache(struct zs_pool *pool)
295{ 349{
296 kmem_cache_destroy(pool->handle_cachep); 350 kmem_cache_destroy(pool->handle_cachep);
351 kmem_cache_destroy(pool->zspage_cachep);
297} 352}
298 353
299static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp) 354static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
300{ 355{
301 return (unsigned long)kmem_cache_alloc(pool->handle_cachep, 356 return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
302 gfp & ~__GFP_HIGHMEM); 357 gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
303} 358}
304 359
305static void free_handle(struct zs_pool *pool, unsigned long handle) 360static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
306{ 361{
307 kmem_cache_free(pool->handle_cachep, (void *)handle); 362 kmem_cache_free(pool->handle_cachep, (void *)handle);
308} 363}
309 364
365static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
366{
367 return kmem_cache_alloc(pool->zspage_cachep,
368 flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
369};
370
371static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage)
372{
373 kmem_cache_free(pool->zspage_cachep, zspage);
374}
375
310static void record_obj(unsigned long handle, unsigned long obj) 376static void record_obj(unsigned long handle, unsigned long obj)
311{ 377{
312 /* 378 /*
@@ -409,38 +475,76 @@ static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
409/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ 475/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
410static DEFINE_PER_CPU(struct mapping_area, zs_map_area); 476static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
411 477
478static bool is_zspage_isolated(struct zspage *zspage)
479{
480 return zspage->isolated;
481}
482
412static int is_first_page(struct page *page) 483static int is_first_page(struct page *page)
413{ 484{
414 return PagePrivate(page); 485 return PagePrivate(page);
415} 486}
416 487
417static int is_last_page(struct page *page) 488/* Protected by class->lock */
489static inline int get_zspage_inuse(struct zspage *zspage)
490{
491 return zspage->inuse;
492}
493
494static inline void set_zspage_inuse(struct zspage *zspage, int val)
495{
496 zspage->inuse = val;
497}
498
499static inline void mod_zspage_inuse(struct zspage *zspage, int val)
500{
501 zspage->inuse += val;
502}
503
504static inline struct page *get_first_page(struct zspage *zspage)
505{
506 struct page *first_page = zspage->first_page;
507
508 VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
509 return first_page;
510}
511
512static inline int get_first_obj_offset(struct page *page)
513{
514 return page->units;
515}
516
517static inline void set_first_obj_offset(struct page *page, int offset)
518{
519 page->units = offset;
520}
521
522static inline unsigned int get_freeobj(struct zspage *zspage)
523{
524 return zspage->freeobj;
525}
526
527static inline void set_freeobj(struct zspage *zspage, unsigned int obj)
418{ 528{
419 return PagePrivate2(page); 529 zspage->freeobj = obj;
420} 530}
421 531
422static void get_zspage_mapping(struct page *first_page, 532static void get_zspage_mapping(struct zspage *zspage,
423 unsigned int *class_idx, 533 unsigned int *class_idx,
424 enum fullness_group *fullness) 534 enum fullness_group *fullness)
425{ 535{
426 unsigned long m; 536 BUG_ON(zspage->magic != ZSPAGE_MAGIC);
427 VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
428 537
429 m = (unsigned long)first_page->mapping; 538 *fullness = zspage->fullness;
430 *fullness = m & FULLNESS_MASK; 539 *class_idx = zspage->class;
431 *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
432} 540}
433 541
434static void set_zspage_mapping(struct page *first_page, 542static void set_zspage_mapping(struct zspage *zspage,
435 unsigned int class_idx, 543 unsigned int class_idx,
436 enum fullness_group fullness) 544 enum fullness_group fullness)
437{ 545{
438 unsigned long m; 546 zspage->class = class_idx;
439 VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); 547 zspage->fullness = fullness;
440
441 m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
442 (fullness & FULLNESS_MASK);
443 first_page->mapping = (struct address_space *)m;
444} 548}
445 549
446/* 550/*
@@ -464,23 +568,19 @@ static int get_size_class_index(int size)
464static inline void zs_stat_inc(struct size_class *class, 568static inline void zs_stat_inc(struct size_class *class,
465 enum zs_stat_type type, unsigned long cnt) 569 enum zs_stat_type type, unsigned long cnt)
466{ 570{
467 if (type < NR_ZS_STAT_TYPE) 571 class->stats.objs[type] += cnt;
468 class->stats.objs[type] += cnt;
469} 572}
470 573
471static inline void zs_stat_dec(struct size_class *class, 574static inline void zs_stat_dec(struct size_class *class,
472 enum zs_stat_type type, unsigned long cnt) 575 enum zs_stat_type type, unsigned long cnt)
473{ 576{
474 if (type < NR_ZS_STAT_TYPE) 577 class->stats.objs[type] -= cnt;
475 class->stats.objs[type] -= cnt;
476} 578}
477 579
478static inline unsigned long zs_stat_get(struct size_class *class, 580static inline unsigned long zs_stat_get(struct size_class *class,
479 enum zs_stat_type type) 581 enum zs_stat_type type)
480{ 582{
481 if (type < NR_ZS_STAT_TYPE) 583 return class->stats.objs[type];
482 return class->stats.objs[type];
483 return 0;
484} 584}
485 585
486#ifdef CONFIG_ZSMALLOC_STAT 586#ifdef CONFIG_ZSMALLOC_STAT
@@ -624,6 +724,7 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
624} 724}
625#endif 725#endif
626 726
727
627/* 728/*
628 * For each size class, zspages are divided into different groups 729 * For each size class, zspages are divided into different groups
629 * depending on how "full" they are. This was done so that we could 730 * depending on how "full" they are. This was done so that we could
@@ -631,21 +732,20 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
631 * the pool (not yet implemented). This function returns fullness 732 * the pool (not yet implemented). This function returns fullness
632 * status of the given page. 733 * status of the given page.
633 */ 734 */
634static enum fullness_group get_fullness_group(struct page *first_page) 735static enum fullness_group get_fullness_group(struct size_class *class,
736 struct zspage *zspage)
635{ 737{
636 int inuse, max_objects; 738 int inuse, objs_per_zspage;
637 enum fullness_group fg; 739 enum fullness_group fg;
638 740
639 VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); 741 inuse = get_zspage_inuse(zspage);
640 742 objs_per_zspage = class->objs_per_zspage;
641 inuse = first_page->inuse;
642 max_objects = first_page->objects;
643 743
644 if (inuse == 0) 744 if (inuse == 0)
645 fg = ZS_EMPTY; 745 fg = ZS_EMPTY;
646 else if (inuse == max_objects) 746 else if (inuse == objs_per_zspage)
647 fg = ZS_FULL; 747 fg = ZS_FULL;
648 else if (inuse <= 3 * max_objects / fullness_threshold_frac) 748 else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac)
649 fg = ZS_ALMOST_EMPTY; 749 fg = ZS_ALMOST_EMPTY;
650 else 750 else
651 fg = ZS_ALMOST_FULL; 751 fg = ZS_ALMOST_FULL;
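
With objs_per_zspage cached in struct size_class, get_fullness_group() classifies a zspage purely from two integers: how many objects are in use versus how many fit, with the "almost empty" cut-off at three quarters of capacity (fullness_threshold_frac is 4). The same decision, lifted into a standalone function with the enum order copied from the hunk; fullness_of() is a made-up name.

#include <stdio.h>

enum fullness_group { ZS_EMPTY, ZS_ALMOST_EMPTY, ZS_ALMOST_FULL, ZS_FULL };

static const int fullness_threshold_frac = 4;

/* Classify by used slots: empty, full, at most 3/4 used counts as
 * "almost empty", anything above that as "almost full". */
static enum fullness_group fullness_of(int inuse, int objs_per_zspage)
{
	if (inuse == 0)
		return ZS_EMPTY;
	if (inuse == objs_per_zspage)
		return ZS_FULL;
	if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac)
		return ZS_ALMOST_EMPTY;
	return ZS_ALMOST_FULL;
}

int main(void)
{
	/* a class that fits 7 objects per zspage */
	printf("%d %d %d %d\n",
	       fullness_of(0, 7),	/* ZS_EMPTY */
	       fullness_of(3, 7),	/* ZS_ALMOST_EMPTY: 3 <= 5 */
	       fullness_of(6, 7),	/* ZS_ALMOST_FULL */
	       fullness_of(7, 7));	/* ZS_FULL */
	return 0;
}
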
@@ -660,32 +760,25 @@ static enum fullness_group get_fullness_group(struct page *first_page)
660 * identified by <class, fullness_group>. 760 * identified by <class, fullness_group>.
661 */ 761 */
662static void insert_zspage(struct size_class *class, 762static void insert_zspage(struct size_class *class,
663 enum fullness_group fullness, 763 struct zspage *zspage,
664 struct page *first_page) 764 enum fullness_group fullness)
665{ 765{
666 struct page **head; 766 struct zspage *head;
667
668 VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
669
670 if (fullness >= _ZS_NR_FULLNESS_GROUPS)
671 return;
672
673 zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
674 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
675
676 head = &class->fullness_list[fullness];
677 if (!*head) {
678 *head = first_page;
679 return;
680 }
681 767
768 zs_stat_inc(class, fullness, 1);
769 head = list_first_entry_or_null(&class->fullness_list[fullness],
770 struct zspage, list);
682 /* 771 /*
683 * We want to see more ZS_FULL pages and less almost 772 * We want to see more ZS_FULL pages and less almost empty/full.
684 * empty/full. Put pages with higher ->inuse first. 773 * Put pages with higher ->inuse first.
685 */ 774 */
686 list_add_tail(&first_page->lru, &(*head)->lru); 775 if (head) {
687 if (first_page->inuse >= (*head)->inuse) 776 if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) {
688 *head = first_page; 777 list_add(&zspage->list, &head->list);
778 return;
779 }
780 }
781 list_add(&zspage->list, &class->fullness_list[fullness]);
689} 782}
690 783
691/* 784/*
@@ -693,27 +786,14 @@ static void insert_zspage(struct size_class *class,
693 * by <class, fullness_group>. 786 * by <class, fullness_group>.
694 */ 787 */
695static void remove_zspage(struct size_class *class, 788static void remove_zspage(struct size_class *class,
696 enum fullness_group fullness, 789 struct zspage *zspage,
697 struct page *first_page) 790 enum fullness_group fullness)
698{ 791{
699 struct page **head; 792 VM_BUG_ON(list_empty(&class->fullness_list[fullness]));
700 793 VM_BUG_ON(is_zspage_isolated(zspage));
701 VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
702
703 if (fullness >= _ZS_NR_FULLNESS_GROUPS)
704 return;
705
706 head = &class->fullness_list[fullness];
707 VM_BUG_ON_PAGE(!*head, first_page);
708 if (list_empty(&(*head)->lru))
709 *head = NULL;
710 else if (*head == first_page)
711 *head = (struct page *)list_entry((*head)->lru.next,
712 struct page, lru);
713 794
714 list_del_init(&first_page->lru); 795 list_del_init(&zspage->list);
715 zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? 796 zs_stat_dec(class, fullness, 1);
716 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
717} 797}
718 798
719/* 799/*
@@ -726,19 +806,22 @@ static void remove_zspage(struct size_class *class,
726 * fullness group. 806 * fullness group.
727 */ 807 */
728static enum fullness_group fix_fullness_group(struct size_class *class, 808static enum fullness_group fix_fullness_group(struct size_class *class,
729 struct page *first_page) 809 struct zspage *zspage)
730{ 810{
731 int class_idx; 811 int class_idx;
732 enum fullness_group currfg, newfg; 812 enum fullness_group currfg, newfg;
733 813
734 get_zspage_mapping(first_page, &class_idx, &currfg); 814 get_zspage_mapping(zspage, &class_idx, &currfg);
735 newfg = get_fullness_group(first_page); 815 newfg = get_fullness_group(class, zspage);
736 if (newfg == currfg) 816 if (newfg == currfg)
737 goto out; 817 goto out;
738 818
739 remove_zspage(class, currfg, first_page); 819 if (!is_zspage_isolated(zspage)) {
740 insert_zspage(class, newfg, first_page); 820 remove_zspage(class, zspage, currfg);
741 set_zspage_mapping(first_page, class_idx, newfg); 821 insert_zspage(class, zspage, newfg);
822 }
823
824 set_zspage_mapping(zspage, class_idx, newfg);
742 825
743out: 826out:
744 return newfg; 827 return newfg;
@@ -780,64 +863,49 @@ static int get_pages_per_zspage(int class_size)
780 return max_usedpc_order; 863 return max_usedpc_order;
781} 864}
782 865
783/* 866static struct zspage *get_zspage(struct page *page)
784 * A single 'zspage' is composed of many system pages which are
785 * linked together using fields in struct page. This function finds
786 * the first/head page, given any component page of a zspage.
787 */
788static struct page *get_first_page(struct page *page)
789{ 867{
790 if (is_first_page(page)) 868 struct zspage *zspage = (struct zspage *)page->private;
791 return page; 869
792 else 870 BUG_ON(zspage->magic != ZSPAGE_MAGIC);
793 return (struct page *)page_private(page); 871 return zspage;
794} 872}
795 873
796static struct page *get_next_page(struct page *page) 874static struct page *get_next_page(struct page *page)
797{ 875{
798 struct page *next; 876 if (unlikely(PageHugeObject(page)))
877 return NULL;
799 878
800 if (is_last_page(page)) 879 return page->freelist;
801 next = NULL; 880}
802 else if (is_first_page(page))
803 next = (struct page *)page_private(page);
804 else
805 next = list_entry(page->lru.next, struct page, lru);
806 881
807 return next; 882/**
883 * obj_to_location - get (<page>, <obj_idx>) from encoded object value
884 * @page: page object resides in zspage
885 * @obj_idx: object index
886 */
887static void obj_to_location(unsigned long obj, struct page **page,
888 unsigned int *obj_idx)
889{
890 obj >>= OBJ_TAG_BITS;
891 *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
892 *obj_idx = (obj & OBJ_INDEX_MASK);
808} 893}
809 894
810/* 895/**
811 * Encode <page, obj_idx> as a single handle value. 896 * location_to_obj - get obj value encoded from (<page>, <obj_idx>)
812 * We use the least bit of handle for tagging. 897 * @page: page object resides in zspage
898 * @obj_idx: object index
813 */ 899 */
814static void *location_to_obj(struct page *page, unsigned long obj_idx) 900static unsigned long location_to_obj(struct page *page, unsigned int obj_idx)
815{ 901{
816 unsigned long obj; 902 unsigned long obj;
817 903
818 if (!page) {
819 VM_BUG_ON(obj_idx);
820 return NULL;
821 }
822
823 obj = page_to_pfn(page) << OBJ_INDEX_BITS; 904 obj = page_to_pfn(page) << OBJ_INDEX_BITS;
824 obj |= ((obj_idx) & OBJ_INDEX_MASK); 905 obj |= obj_idx & OBJ_INDEX_MASK;
825 obj <<= OBJ_TAG_BITS; 906 obj <<= OBJ_TAG_BITS;
826 907
827 return (void *)obj; 908 return obj;
828}
829
830/*
831 * Decode <page, obj_idx> pair from the given object handle. We adjust the
832 * decoded obj_idx back to its original value since it was adjusted in
833 * location_to_obj().
834 */
835static void obj_to_location(unsigned long obj, struct page **page,
836 unsigned long *obj_idx)
837{
838 obj >>= OBJ_TAG_BITS;
839 *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
840 *obj_idx = (obj & OBJ_INDEX_MASK);
841} 909}
842 910
843static unsigned long handle_to_obj(unsigned long handle) 911static unsigned long handle_to_obj(unsigned long handle)
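
The relocated obj_to_location()/location_to_obj() pair shows the whole encoding in one place: a page frame number and an object index are packed into a single unsigned long, then shifted left by OBJ_TAG_BITS so the lowest bit is left free for the allocated tag. The round trip below uses illustrative bit widths (the kernel derives OBJ_INDEX_BITS from the physical address space) and raw PFNs instead of struct page pointers.

#include <assert.h>
#include <stdio.h>

#define OBJ_TAG_BITS	1		/* low bit reserved for tagging */
#define OBJ_INDEX_BITS	10		/* illustrative width only */
#define OBJ_INDEX_MASK	((1UL << OBJ_INDEX_BITS) - 1)

static unsigned long location_to_obj(unsigned long pfn, unsigned int obj_idx)
{
	unsigned long obj;

	obj = pfn << OBJ_INDEX_BITS;
	obj |= obj_idx & OBJ_INDEX_MASK;
	obj <<= OBJ_TAG_BITS;
	return obj;
}

static void obj_to_location(unsigned long obj, unsigned long *pfn,
			    unsigned int *obj_idx)
{
	obj >>= OBJ_TAG_BITS;
	*pfn = obj >> OBJ_INDEX_BITS;
	*obj_idx = obj & OBJ_INDEX_MASK;
}

int main(void)
{
	unsigned long obj = location_to_obj(0x12345, 7);
	unsigned long pfn;
	unsigned int idx;

	obj_to_location(obj, &pfn, &idx);
	assert(pfn == 0x12345 && idx == 7);
	printf("obj=%#lx -> pfn=%#lx idx=%u\n", obj, pfn, idx);
	return 0;
}
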
@@ -845,109 +913,147 @@ static unsigned long handle_to_obj(unsigned long handle)
845 return *(unsigned long *)handle; 913 return *(unsigned long *)handle;
846} 914}
847 915
848static unsigned long obj_to_head(struct size_class *class, struct page *page, 916static unsigned long obj_to_head(struct page *page, void *obj)
849 void *obj)
850{ 917{
851 if (class->huge) { 918 if (unlikely(PageHugeObject(page))) {
852 VM_BUG_ON_PAGE(!is_first_page(page), page); 919 VM_BUG_ON_PAGE(!is_first_page(page), page);
853 return page_private(page); 920 return page->index;
854 } else 921 } else
855 return *(unsigned long *)obj; 922 return *(unsigned long *)obj;
856} 923}
857 924
858static unsigned long obj_idx_to_offset(struct page *page, 925static inline int testpin_tag(unsigned long handle)
859 unsigned long obj_idx, int class_size)
860{ 926{
861 unsigned long off = 0; 927 return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
862
863 if (!is_first_page(page))
864 off = page->index;
865
866 return off + obj_idx * class_size;
867} 928}
868 929
869static inline int trypin_tag(unsigned long handle) 930static inline int trypin_tag(unsigned long handle)
870{ 931{
871 unsigned long *ptr = (unsigned long *)handle; 932 return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
872
873 return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
874} 933}
875 934
876static void pin_tag(unsigned long handle) 935static void pin_tag(unsigned long handle)
877{ 936{
878 while (!trypin_tag(handle)); 937 bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
879} 938}
880 939
881static void unpin_tag(unsigned long handle) 940static void unpin_tag(unsigned long handle)
882{ 941{
883 unsigned long *ptr = (unsigned long *)handle; 942 bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
884
885 clear_bit_unlock(HANDLE_PIN_BIT, ptr);
886} 943}
887 944
888static void reset_page(struct page *page) 945static void reset_page(struct page *page)
889{ 946{
947 __ClearPageMovable(page);
890 clear_bit(PG_private, &page->flags); 948 clear_bit(PG_private, &page->flags);
891 clear_bit(PG_private_2, &page->flags); 949 clear_bit(PG_private_2, &page->flags);
892 set_page_private(page, 0); 950 set_page_private(page, 0);
893 page->mapping = NULL;
894 page->freelist = NULL;
895 page_mapcount_reset(page); 951 page_mapcount_reset(page);
952 ClearPageHugeObject(page);
953 page->freelist = NULL;
954}
955
956/*
957 * To prevent zspage destroy during migration, zspage freeing should
958 * hold locks of all pages in the zspage.
959 */
960void lock_zspage(struct zspage *zspage)
961{
962 struct page *page = get_first_page(zspage);
963
964 do {
965 lock_page(page);
966 } while ((page = get_next_page(page)) != NULL);
896} 967}
897 968
898static void free_zspage(struct page *first_page) 969int trylock_zspage(struct zspage *zspage)
899{ 970{
900 struct page *nextp, *tmp, *head_extra; 971 struct page *cursor, *fail;
901 972
902 VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); 973 for (cursor = get_first_page(zspage); cursor != NULL; cursor =
903 VM_BUG_ON_PAGE(first_page->inuse, first_page); 974 get_next_page(cursor)) {
975 if (!trylock_page(cursor)) {
976 fail = cursor;
977 goto unlock;
978 }
979 }
904 980
905 head_extra = (struct page *)page_private(first_page); 981 return 1;
982unlock:
983 for (cursor = get_first_page(zspage); cursor != fail; cursor =
984 get_next_page(cursor))
985 unlock_page(cursor);
906 986
907 reset_page(first_page); 987 return 0;
908 __free_page(first_page); 988}
909 989
910 /* zspage with only 1 system page */ 990static void __free_zspage(struct zs_pool *pool, struct size_class *class,
911 if (!head_extra) 991 struct zspage *zspage)
912 return; 992{
993 struct page *page, *next;
994 enum fullness_group fg;
995 unsigned int class_idx;
913 996
914 list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { 997 get_zspage_mapping(zspage, &class_idx, &fg);
915 list_del(&nextp->lru); 998
916 reset_page(nextp); 999 assert_spin_locked(&class->lock);
917 __free_page(nextp); 1000
1001 VM_BUG_ON(get_zspage_inuse(zspage));
1002 VM_BUG_ON(fg != ZS_EMPTY);
1003
1004 next = page = get_first_page(zspage);
1005 do {
1006 VM_BUG_ON_PAGE(!PageLocked(page), page);
1007 next = get_next_page(page);
1008 reset_page(page);
1009 unlock_page(page);
1010 dec_zone_page_state(page, NR_ZSPAGES);
1011 put_page(page);
1012 page = next;
1013 } while (page != NULL);
1014
1015 cache_free_zspage(pool, zspage);
1016
1017 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1018 class->size, class->pages_per_zspage));
1019 atomic_long_sub(class->pages_per_zspage,
1020 &pool->pages_allocated);
1021}
1022
1023static void free_zspage(struct zs_pool *pool, struct size_class *class,
1024 struct zspage *zspage)
1025{
1026 VM_BUG_ON(get_zspage_inuse(zspage));
1027 VM_BUG_ON(list_empty(&zspage->list));
1028
1029 if (!trylock_zspage(zspage)) {
1030 kick_deferred_free(pool);
1031 return;
918 } 1032 }
919 reset_page(head_extra); 1033
920 __free_page(head_extra); 1034 remove_zspage(class, zspage, ZS_EMPTY);
1035 __free_zspage(pool, class, zspage);
921} 1036}
922 1037
923/* Initialize a newly allocated zspage */ 1038/* Initialize a newly allocated zspage */
924static void init_zspage(struct size_class *class, struct page *first_page) 1039static void init_zspage(struct size_class *class, struct zspage *zspage)
925{ 1040{
1041 unsigned int freeobj = 1;
926 unsigned long off = 0; 1042 unsigned long off = 0;
927 struct page *page = first_page; 1043 struct page *page = get_first_page(zspage);
928
929 VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
930 1044
931 while (page) { 1045 while (page) {
932 struct page *next_page; 1046 struct page *next_page;
933 struct link_free *link; 1047 struct link_free *link;
934 unsigned int i = 1;
935 void *vaddr; 1048 void *vaddr;
936 1049
937 /* 1050 set_first_obj_offset(page, off);
938 * page->index stores offset of first object starting
939 * in the page. For the first page, this is always 0,
940 * so we use first_page->index (aka ->freelist) to store
941 * head of corresponding zspage's freelist.
942 */
943 if (page != first_page)
944 page->index = off;
945 1051
946 vaddr = kmap_atomic(page); 1052 vaddr = kmap_atomic(page);
947 link = (struct link_free *)vaddr + off / sizeof(*link); 1053 link = (struct link_free *)vaddr + off / sizeof(*link);
948 1054
949 while ((off += class->size) < PAGE_SIZE) { 1055 while ((off += class->size) < PAGE_SIZE) {
950 link->next = location_to_obj(page, i++); 1056 link->next = freeobj++ << OBJ_TAG_BITS;
951 link += class->size / sizeof(*link); 1057 link += class->size / sizeof(*link);
952 } 1058 }
953 1059
@@ -957,87 +1063,112 @@ static void init_zspage(struct size_class *class, struct page *first_page)
957 * page (if present) 1063 * page (if present)
958 */ 1064 */
959 next_page = get_next_page(page); 1065 next_page = get_next_page(page);
960 link->next = location_to_obj(next_page, 0); 1066 if (next_page) {
1067 link->next = freeobj++ << OBJ_TAG_BITS;
1068 } else {
1069 /*
1070 * Reset OBJ_TAG_BITS bit to last link to tell
1071 * whether it's allocated object or not.
1072 */
1073 link->next = -1 << OBJ_TAG_BITS;
1074 }
961 kunmap_atomic(vaddr); 1075 kunmap_atomic(vaddr);
962 page = next_page; 1076 page = next_page;
963 off %= PAGE_SIZE; 1077 off %= PAGE_SIZE;
964 } 1078 }
1079
1080 set_freeobj(zspage, 0);
965} 1081}
966 1082
967/* 1083static void create_page_chain(struct size_class *class, struct zspage *zspage,
968 * Allocate a zspage for the given size class 1084 struct page *pages[])
969 */
970static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
971{ 1085{
972 int i, error; 1086 int i;
973 struct page *first_page = NULL, *uninitialized_var(prev_page); 1087 struct page *page;
1088 struct page *prev_page = NULL;
1089 int nr_pages = class->pages_per_zspage;
974 1090
975 /* 1091 /*
976 * Allocate individual pages and link them together as: 1092 * Allocate individual pages and link them together as:
977 * 1. first page->private = first sub-page 1093 * 1. all pages are linked together using page->freelist
978 * 2. all sub-pages are linked together using page->lru 1094 * 2. each sub-page points to zspage using page->private
979 * 3. each sub-page is linked to the first page using page->private
980 * 1095 *
981 * For each size class, First/Head pages are linked together using 1096 * we set PG_private to identify the first page (i.e. no other sub-page
982 * page->lru. Also, we set PG_private to identify the first page 1097 * has this flag set) and PG_private_2 to identify the last page.
983 * (i.e. no other sub-page has this flag set) and PG_private_2 to
984 * identify the last page.
985 */ 1098 */
986 error = -ENOMEM; 1099 for (i = 0; i < nr_pages; i++) {
987 for (i = 0; i < class->pages_per_zspage; i++) { 1100 page = pages[i];
988 struct page *page; 1101 set_page_private(page, (unsigned long)zspage);
989 1102 page->freelist = NULL;
990 page = alloc_page(flags); 1103 if (i == 0) {
991 if (!page) 1104 zspage->first_page = page;
992 goto cleanup;
993
994 INIT_LIST_HEAD(&page->lru);
995 if (i == 0) { /* first page */
996 SetPagePrivate(page); 1105 SetPagePrivate(page);
997 set_page_private(page, 0); 1106 if (unlikely(class->objs_per_zspage == 1 &&
998 first_page = page; 1107 class->pages_per_zspage == 1))
999 first_page->inuse = 0; 1108 SetPageHugeObject(page);
1109 } else {
1110 prev_page->freelist = page;
1000 } 1111 }
1001 if (i == 1) 1112 if (i == nr_pages - 1)
1002 set_page_private(first_page, (unsigned long)page);
1003 if (i >= 1)
1004 set_page_private(page, (unsigned long)first_page);
1005 if (i >= 2)
1006 list_add(&page->lru, &prev_page->lru);
1007 if (i == class->pages_per_zspage - 1) /* last page */
1008 SetPagePrivate2(page); 1113 SetPagePrivate2(page);
1009 prev_page = page; 1114 prev_page = page;
1010 } 1115 }
1116}
1117
1118/*
1119 * Allocate a zspage for the given size class
1120 */
1121static struct zspage *alloc_zspage(struct zs_pool *pool,
1122 struct size_class *class,
1123 gfp_t gfp)
1124{
1125 int i;
1126 struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE];
1127 struct zspage *zspage = cache_alloc_zspage(pool, gfp);
1128
1129 if (!zspage)
1130 return NULL;
1011 1131
1012 init_zspage(class, first_page); 1132 memset(zspage, 0, sizeof(struct zspage));
1133 zspage->magic = ZSPAGE_MAGIC;
1134 migrate_lock_init(zspage);
1013 1135
1014 first_page->freelist = location_to_obj(first_page, 0); 1136 for (i = 0; i < class->pages_per_zspage; i++) {
1015 /* Maximum number of objects we can store in this zspage */ 1137 struct page *page;
1016 first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
1017 1138
1018 error = 0; /* Success */ 1139 page = alloc_page(gfp);
1140 if (!page) {
1141 while (--i >= 0) {
1142 dec_zone_page_state(pages[i], NR_ZSPAGES);
1143 __free_page(pages[i]);
1144 }
1145 cache_free_zspage(pool, zspage);
1146 return NULL;
1147 }
1019 1148
1020cleanup: 1149 inc_zone_page_state(page, NR_ZSPAGES);
1021 if (unlikely(error) && first_page) { 1150 pages[i] = page;
1022 free_zspage(first_page);
1023 first_page = NULL;
1024 } 1151 }
1025 1152
1026 return first_page; 1153 create_page_chain(class, zspage, pages);
1154 init_zspage(class, zspage);
1155
1156 return zspage;
1027} 1157}
1028 1158
1029static struct page *find_get_zspage(struct size_class *class) 1159static struct zspage *find_get_zspage(struct size_class *class)
1030{ 1160{
1031 int i; 1161 int i;
1032 struct page *page; 1162 struct zspage *zspage;
1033 1163
1034 for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { 1164 for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) {
1035 page = class->fullness_list[i]; 1165 zspage = list_first_entry_or_null(&class->fullness_list[i],
1036 if (page) 1166 struct zspage, list);
1167 if (zspage)
1037 break; 1168 break;
1038 } 1169 }
1039 1170
1040 return page; 1171 return zspage;
1041} 1172}
1042 1173
1043#ifdef CONFIG_PGTABLE_MAPPING 1174#ifdef CONFIG_PGTABLE_MAPPING
@@ -1242,11 +1373,9 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
1242 return true; 1373 return true;
1243} 1374}
1244 1375
1245static bool zspage_full(struct page *first_page) 1376static bool zspage_full(struct size_class *class, struct zspage *zspage)
1246{ 1377{
1247 VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); 1378 return get_zspage_inuse(zspage) == class->objs_per_zspage;
1248
1249 return first_page->inuse == first_page->objects;
1250} 1379}
1251 1380
1252unsigned long zs_get_total_pages(struct zs_pool *pool) 1381unsigned long zs_get_total_pages(struct zs_pool *pool)
@@ -1272,8 +1401,10 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
1272void *zs_map_object(struct zs_pool *pool, unsigned long handle, 1401void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1273 enum zs_mapmode mm) 1402 enum zs_mapmode mm)
1274{ 1403{
1404 struct zspage *zspage;
1275 struct page *page; 1405 struct page *page;
1276 unsigned long obj, obj_idx, off; 1406 unsigned long obj, off;
1407 unsigned int obj_idx;
1277 1408
1278 unsigned int class_idx; 1409 unsigned int class_idx;
1279 enum fullness_group fg; 1410 enum fullness_group fg;
@@ -1294,9 +1425,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1294 1425
1295 obj = handle_to_obj(handle); 1426 obj = handle_to_obj(handle);
1296 obj_to_location(obj, &page, &obj_idx); 1427 obj_to_location(obj, &page, &obj_idx);
1297 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1428 zspage = get_zspage(page);
1429
1430 /* migration cannot move any subpage in this zspage */
1431 migrate_read_lock(zspage);
1432
1433 get_zspage_mapping(zspage, &class_idx, &fg);
1298 class = pool->size_class[class_idx]; 1434 class = pool->size_class[class_idx];
1299 off = obj_idx_to_offset(page, obj_idx, class->size); 1435 off = (class->size * obj_idx) & ~PAGE_MASK;
1300 1436
1301 area = &get_cpu_var(zs_map_area); 1437 area = &get_cpu_var(zs_map_area);
1302 area->vm_mm = mm; 1438 area->vm_mm = mm;
@@ -1314,7 +1450,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1314 1450
1315 ret = __zs_map_object(area, pages, off, class->size); 1451 ret = __zs_map_object(area, pages, off, class->size);
1316out: 1452out:
1317 if (!class->huge) 1453 if (likely(!PageHugeObject(page)))
1318 ret += ZS_HANDLE_SIZE; 1454 ret += ZS_HANDLE_SIZE;
1319 1455
1320 return ret; 1456 return ret;
@@ -1323,8 +1459,10 @@ EXPORT_SYMBOL_GPL(zs_map_object);
1323 1459
1324void zs_unmap_object(struct zs_pool *pool, unsigned long handle) 1460void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1325{ 1461{
1462 struct zspage *zspage;
1326 struct page *page; 1463 struct page *page;
1327 unsigned long obj, obj_idx, off; 1464 unsigned long obj, off;
1465 unsigned int obj_idx;
1328 1466
1329 unsigned int class_idx; 1467 unsigned int class_idx;
1330 enum fullness_group fg; 1468 enum fullness_group fg;
@@ -1333,9 +1471,10 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1333 1471
1334 obj = handle_to_obj(handle); 1472 obj = handle_to_obj(handle);
1335 obj_to_location(obj, &page, &obj_idx); 1473 obj_to_location(obj, &page, &obj_idx);
1336 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1474 zspage = get_zspage(page);
1475 get_zspage_mapping(zspage, &class_idx, &fg);
1337 class = pool->size_class[class_idx]; 1476 class = pool->size_class[class_idx];
1338 off = obj_idx_to_offset(page, obj_idx, class->size); 1477 off = (class->size * obj_idx) & ~PAGE_MASK;
1339 1478
1340 area = this_cpu_ptr(&zs_map_area); 1479 area = this_cpu_ptr(&zs_map_area);
1341 if (off + class->size <= PAGE_SIZE) 1480 if (off + class->size <= PAGE_SIZE)
@@ -1350,38 +1489,50 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1350 __zs_unmap_object(area, pages, off, class->size); 1489 __zs_unmap_object(area, pages, off, class->size);
1351 } 1490 }
1352 put_cpu_var(zs_map_area); 1491 put_cpu_var(zs_map_area);
1492
1493 migrate_read_unlock(zspage);
1353 unpin_tag(handle); 1494 unpin_tag(handle);
1354} 1495}
1355EXPORT_SYMBOL_GPL(zs_unmap_object); 1496EXPORT_SYMBOL_GPL(zs_unmap_object);
1356 1497
1357static unsigned long obj_malloc(struct size_class *class, 1498static unsigned long obj_malloc(struct size_class *class,
1358 struct page *first_page, unsigned long handle) 1499 struct zspage *zspage, unsigned long handle)
1359{ 1500{
1501 int i, nr_page, offset;
1360 unsigned long obj; 1502 unsigned long obj;
1361 struct link_free *link; 1503 struct link_free *link;
1362 1504
1363 struct page *m_page; 1505 struct page *m_page;
1364 unsigned long m_objidx, m_offset; 1506 unsigned long m_offset;
1365 void *vaddr; 1507 void *vaddr;
1366 1508
1367 handle |= OBJ_ALLOCATED_TAG; 1509 handle |= OBJ_ALLOCATED_TAG;
1368 obj = (unsigned long)first_page->freelist; 1510 obj = get_freeobj(zspage);
1369 obj_to_location(obj, &m_page, &m_objidx); 1511
1370 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); 1512 offset = obj * class->size;
1513 nr_page = offset >> PAGE_SHIFT;
1514 m_offset = offset & ~PAGE_MASK;
1515 m_page = get_first_page(zspage);
1516
1517 for (i = 0; i < nr_page; i++)
1518 m_page = get_next_page(m_page);
1371 1519
1372 vaddr = kmap_atomic(m_page); 1520 vaddr = kmap_atomic(m_page);
1373 link = (struct link_free *)vaddr + m_offset / sizeof(*link); 1521 link = (struct link_free *)vaddr + m_offset / sizeof(*link);
1374 first_page->freelist = link->next; 1522 set_freeobj(zspage, link->next >> OBJ_TAG_BITS);
1375 if (!class->huge) 1523 if (likely(!PageHugeObject(m_page)))
1376 /* record handle in the header of allocated chunk */ 1524 /* record handle in the header of allocated chunk */
1377 link->handle = handle; 1525 link->handle = handle;
1378 else 1526 else
1379 /* record handle in first_page->private */ 1527 /* record handle to page->index */
1380 set_page_private(first_page, handle); 1528 zspage->first_page->index = handle;
1529
1381 kunmap_atomic(vaddr); 1530 kunmap_atomic(vaddr);
1382 first_page->inuse++; 1531 mod_zspage_inuse(zspage, 1);
1383 zs_stat_inc(class, OBJ_USED, 1); 1532 zs_stat_inc(class, OBJ_USED, 1);
1384 1533
1534 obj = location_to_obj(m_page, obj);
1535
1385 return obj; 1536 return obj;
1386} 1537}
1387 1538
@@ -1399,12 +1550,13 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
1399{ 1550{
1400 unsigned long handle, obj; 1551 unsigned long handle, obj;
1401 struct size_class *class; 1552 struct size_class *class;
1402 struct page *first_page; 1553 enum fullness_group newfg;
1554 struct zspage *zspage;
1403 1555
1404 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) 1556 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
1405 return 0; 1557 return 0;
1406 1558
1407 handle = alloc_handle(pool, gfp); 1559 handle = cache_alloc_handle(pool, gfp);
1408 if (!handle) 1560 if (!handle)
1409 return 0; 1561 return 0;
1410 1562
@@ -1413,29 +1565,38 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
1413 class = pool->size_class[get_size_class_index(size)]; 1565 class = pool->size_class[get_size_class_index(size)];
1414 1566
1415 spin_lock(&class->lock); 1567 spin_lock(&class->lock);
1416 first_page = find_get_zspage(class); 1568 zspage = find_get_zspage(class);
1417 1569 if (likely(zspage)) {
1418 if (!first_page) { 1570 obj = obj_malloc(class, zspage, handle);
1571 /* Now move the zspage to another fullness group, if required */
1572 fix_fullness_group(class, zspage);
1573 record_obj(handle, obj);
1419 spin_unlock(&class->lock); 1574 spin_unlock(&class->lock);
1420 first_page = alloc_zspage(class, gfp);
1421 if (unlikely(!first_page)) {
1422 free_handle(pool, handle);
1423 return 0;
1424 }
1425 1575
1426 set_zspage_mapping(first_page, class->index, ZS_EMPTY); 1576 return handle;
1427 atomic_long_add(class->pages_per_zspage, 1577 }
1428 &pool->pages_allocated);
1429 1578
1430 spin_lock(&class->lock); 1579 spin_unlock(&class->lock);
1431 zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1580
1432 class->size, class->pages_per_zspage)); 1581 zspage = alloc_zspage(pool, class, gfp);
1582 if (!zspage) {
1583 cache_free_handle(pool, handle);
1584 return 0;
1433 } 1585 }
1434 1586
1435 obj = obj_malloc(class, first_page, handle); 1587 spin_lock(&class->lock);
1436 /* Now move the zspage to another fullness group, if required */ 1588 obj = obj_malloc(class, zspage, handle);
1437 fix_fullness_group(class, first_page); 1589 newfg = get_fullness_group(class, zspage);
1590 insert_zspage(class, zspage, newfg);
1591 set_zspage_mapping(zspage, class->index, newfg);
1438 record_obj(handle, obj); 1592 record_obj(handle, obj);
1593 atomic_long_add(class->pages_per_zspage,
1594 &pool->pages_allocated);
1595 zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1596 class->size, class->pages_per_zspage));
1597
1598 /* We completely set up zspage so mark them as movable */
1599 SetZsPageMovable(pool, zspage);
1439 spin_unlock(&class->lock); 1600 spin_unlock(&class->lock);
1440 1601
1441 return handle; 1602 return handle;
@@ -1445,36 +1606,38 @@ EXPORT_SYMBOL_GPL(zs_malloc);
1445static void obj_free(struct size_class *class, unsigned long obj) 1606static void obj_free(struct size_class *class, unsigned long obj)
1446{ 1607{
1447 struct link_free *link; 1608 struct link_free *link;
1448 struct page *first_page, *f_page; 1609 struct zspage *zspage;
1449 unsigned long f_objidx, f_offset; 1610 struct page *f_page;
1611 unsigned long f_offset;
1612 unsigned int f_objidx;
1450 void *vaddr; 1613 void *vaddr;
1451 1614
1452 obj &= ~OBJ_ALLOCATED_TAG; 1615 obj &= ~OBJ_ALLOCATED_TAG;
1453 obj_to_location(obj, &f_page, &f_objidx); 1616 obj_to_location(obj, &f_page, &f_objidx);
1454 first_page = get_first_page(f_page); 1617 f_offset = (class->size * f_objidx) & ~PAGE_MASK;
1455 1618 zspage = get_zspage(f_page);
1456 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
1457 1619
1458 vaddr = kmap_atomic(f_page); 1620 vaddr = kmap_atomic(f_page);
1459 1621
1460 /* Insert this object in containing zspage's freelist */ 1622 /* Insert this object in containing zspage's freelist */
1461 link = (struct link_free *)(vaddr + f_offset); 1623 link = (struct link_free *)(vaddr + f_offset);
1462 link->next = first_page->freelist; 1624 link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
1463 if (class->huge)
1464 set_page_private(first_page, 0);
1465 kunmap_atomic(vaddr); 1625 kunmap_atomic(vaddr);
1466 first_page->freelist = (void *)obj; 1626 set_freeobj(zspage, f_objidx);
1467 first_page->inuse--; 1627 mod_zspage_inuse(zspage, -1);
1468 zs_stat_dec(class, OBJ_USED, 1); 1628 zs_stat_dec(class, OBJ_USED, 1);
1469} 1629}
1470 1630
1471void zs_free(struct zs_pool *pool, unsigned long handle) 1631void zs_free(struct zs_pool *pool, unsigned long handle)
1472{ 1632{
1473 struct page *first_page, *f_page; 1633 struct zspage *zspage;
1474 unsigned long obj, f_objidx; 1634 struct page *f_page;
1635 unsigned long obj;
1636 unsigned int f_objidx;
1475 int class_idx; 1637 int class_idx;
1476 struct size_class *class; 1638 struct size_class *class;
1477 enum fullness_group fullness; 1639 enum fullness_group fullness;
1640 bool isolated;
1478 1641
1479 if (unlikely(!handle)) 1642 if (unlikely(!handle))
1480 return; 1643 return;
@@ -1482,25 +1645,31 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
1482 pin_tag(handle); 1645 pin_tag(handle);
1483 obj = handle_to_obj(handle); 1646 obj = handle_to_obj(handle);
1484 obj_to_location(obj, &f_page, &f_objidx); 1647 obj_to_location(obj, &f_page, &f_objidx);
1485 first_page = get_first_page(f_page); 1648 zspage = get_zspage(f_page);
1486 1649
1487 get_zspage_mapping(first_page, &class_idx, &fullness); 1650 migrate_read_lock(zspage);
1651
1652 get_zspage_mapping(zspage, &class_idx, &fullness);
1488 class = pool->size_class[class_idx]; 1653 class = pool->size_class[class_idx];
1489 1654
1490 spin_lock(&class->lock); 1655 spin_lock(&class->lock);
1491 obj_free(class, obj); 1656 obj_free(class, obj);
1492 fullness = fix_fullness_group(class, first_page); 1657 fullness = fix_fullness_group(class, zspage);
1493 if (fullness == ZS_EMPTY) { 1658 if (fullness != ZS_EMPTY) {
1494 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1659 migrate_read_unlock(zspage);
1495 class->size, class->pages_per_zspage)); 1660 goto out;
1496 atomic_long_sub(class->pages_per_zspage,
1497 &pool->pages_allocated);
1498 free_zspage(first_page);
1499 } 1661 }
1662
1663 isolated = is_zspage_isolated(zspage);
1664 migrate_read_unlock(zspage);
1665 /* If zspage is isolated, zs_page_putback will free the zspage */
1666 if (likely(!isolated))
1667 free_zspage(pool, class, zspage);
1668out:
1669
1500 spin_unlock(&class->lock); 1670 spin_unlock(&class->lock);
1501 unpin_tag(handle); 1671 unpin_tag(handle);
1502 1672 cache_free_handle(pool, handle);
1503 free_handle(pool, handle);
1504} 1673}
1505EXPORT_SYMBOL_GPL(zs_free); 1674EXPORT_SYMBOL_GPL(zs_free);
1506 1675
@@ -1508,7 +1677,7 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
1508 unsigned long src) 1677 unsigned long src)
1509{ 1678{
1510 struct page *s_page, *d_page; 1679 struct page *s_page, *d_page;
1511 unsigned long s_objidx, d_objidx; 1680 unsigned int s_objidx, d_objidx;
1512 unsigned long s_off, d_off; 1681 unsigned long s_off, d_off;
1513 void *s_addr, *d_addr; 1682 void *s_addr, *d_addr;
1514 int s_size, d_size, size; 1683 int s_size, d_size, size;
@@ -1519,8 +1688,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
1519 obj_to_location(src, &s_page, &s_objidx); 1688 obj_to_location(src, &s_page, &s_objidx);
1520 obj_to_location(dst, &d_page, &d_objidx); 1689 obj_to_location(dst, &d_page, &d_objidx);
1521 1690
1522 s_off = obj_idx_to_offset(s_page, s_objidx, class->size); 1691 s_off = (class->size * s_objidx) & ~PAGE_MASK;
1523 d_off = obj_idx_to_offset(d_page, d_objidx, class->size); 1692 d_off = (class->size * d_objidx) & ~PAGE_MASK;
1524 1693
1525 if (s_off + class->size > PAGE_SIZE) 1694 if (s_off + class->size > PAGE_SIZE)
1526 s_size = PAGE_SIZE - s_off; 1695 s_size = PAGE_SIZE - s_off;
@@ -1579,12 +1748,11 @@ static unsigned long find_alloced_obj(struct size_class *class,
1579 unsigned long handle = 0; 1748 unsigned long handle = 0;
1580 void *addr = kmap_atomic(page); 1749 void *addr = kmap_atomic(page);
1581 1750
1582 if (!is_first_page(page)) 1751 offset = get_first_obj_offset(page);
1583 offset = page->index;
1584 offset += class->size * index; 1752 offset += class->size * index;
1585 1753
1586 while (offset < PAGE_SIZE) { 1754 while (offset < PAGE_SIZE) {
1587 head = obj_to_head(class, page, addr + offset); 1755 head = obj_to_head(page, addr + offset);
1588 if (head & OBJ_ALLOCATED_TAG) { 1756 if (head & OBJ_ALLOCATED_TAG) {
1589 handle = head & ~OBJ_ALLOCATED_TAG; 1757 handle = head & ~OBJ_ALLOCATED_TAG;
1590 if (trypin_tag(handle)) 1758 if (trypin_tag(handle))
@@ -1601,7 +1769,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
1601} 1769}
1602 1770
1603struct zs_compact_control { 1771struct zs_compact_control {
1604 /* Source page for migration which could be a subpage of zspage. */ 1772 /* Source spage for migration which could be a subpage of zspage */
1605 struct page *s_page; 1773 struct page *s_page;
1606 /* Destination page for migration which should be a first page 1774 /* Destination page for migration which should be a first page
1607 * of zspage. */ 1775 * of zspage. */
@@ -1632,14 +1800,14 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
1632 } 1800 }
1633 1801
1634 /* Stop if there is no more space */ 1802 /* Stop if there is no more space */
1635 if (zspage_full(d_page)) { 1803 if (zspage_full(class, get_zspage(d_page))) {
1636 unpin_tag(handle); 1804 unpin_tag(handle);
1637 ret = -ENOMEM; 1805 ret = -ENOMEM;
1638 break; 1806 break;
1639 } 1807 }
1640 1808
1641 used_obj = handle_to_obj(handle); 1809 used_obj = handle_to_obj(handle);
1642 free_obj = obj_malloc(class, d_page, handle); 1810 free_obj = obj_malloc(class, get_zspage(d_page), handle);
1643 zs_object_copy(class, free_obj, used_obj); 1811 zs_object_copy(class, free_obj, used_obj);
1644 index++; 1812 index++;
1645 /* 1813 /*
@@ -1661,68 +1829,422 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
1661 return ret; 1829 return ret;
1662} 1830}
1663 1831
1664static struct page *isolate_target_page(struct size_class *class) 1832static struct zspage *isolate_zspage(struct size_class *class, bool source)
1665{ 1833{
1666 int i; 1834 int i;
1667 struct page *page; 1835 struct zspage *zspage;
1836 enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL};
1668 1837
1669 for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { 1838 if (!source) {
1670 page = class->fullness_list[i]; 1839 fg[0] = ZS_ALMOST_FULL;
1671 if (page) { 1840 fg[1] = ZS_ALMOST_EMPTY;
1672 remove_zspage(class, i, page); 1841 }
1673 break; 1842
1843 for (i = 0; i < 2; i++) {
1844 zspage = list_first_entry_or_null(&class->fullness_list[fg[i]],
1845 struct zspage, list);
1846 if (zspage) {
1847 VM_BUG_ON(is_zspage_isolated(zspage));
1848 remove_zspage(class, zspage, fg[i]);
1849 return zspage;
1674 } 1850 }
1675 } 1851 }
1676 1852
1677 return page; 1853 return zspage;
1678} 1854}
1679 1855
1680/* 1856/*
1681 * putback_zspage - add @first_page into right class's fullness list 1857 * putback_zspage - add @zspage into right class's fullness list
1682 * @pool: target pool
1683 * @class: destination class 1858 * @class: destination class
1684 * @first_page: target page 1859 * @zspage: target page
1685 * 1860 *
1686 * Return @fist_page's fullness_group 1861 * Return @zspage's fullness_group
1687 */ 1862 */
1688static enum fullness_group putback_zspage(struct zs_pool *pool, 1863static enum fullness_group putback_zspage(struct size_class *class,
1689 struct size_class *class, 1864 struct zspage *zspage)
1690 struct page *first_page)
1691{ 1865{
1692 enum fullness_group fullness; 1866 enum fullness_group fullness;
1693 1867
1694 fullness = get_fullness_group(first_page); 1868 VM_BUG_ON(is_zspage_isolated(zspage));
1695 insert_zspage(class, fullness, first_page);
1696 set_zspage_mapping(first_page, class->index, fullness);
1697 1869
1698 if (fullness == ZS_EMPTY) { 1870 fullness = get_fullness_group(class, zspage);
1699 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1871 insert_zspage(class, zspage, fullness);
1700 class->size, class->pages_per_zspage)); 1872 set_zspage_mapping(zspage, class->index, fullness);
1701 atomic_long_sub(class->pages_per_zspage, 1873
1702 &pool->pages_allocated); 1874 return fullness;
1875}
1876
1877#ifdef CONFIG_COMPACTION
1878static struct dentry *zs_mount(struct file_system_type *fs_type,
1879 int flags, const char *dev_name, void *data)
1880{
1881 static const struct dentry_operations ops = {
1882 .d_dname = simple_dname,
1883 };
1884
1885 return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC);
1886}
1887
1888static struct file_system_type zsmalloc_fs = {
1889 .name = "zsmalloc",
1890 .mount = zs_mount,
1891 .kill_sb = kill_anon_super,
1892};
1893
1894static int zsmalloc_mount(void)
1895{
1896 int ret = 0;
1703 1897
1704 free_zspage(first_page); 1898 zsmalloc_mnt = kern_mount(&zsmalloc_fs);
1899 if (IS_ERR(zsmalloc_mnt))
1900 ret = PTR_ERR(zsmalloc_mnt);
1901
1902 return ret;
1903}
1904
1905static void zsmalloc_unmount(void)
1906{
1907 kern_unmount(zsmalloc_mnt);
1908}
1909
1910static void migrate_lock_init(struct zspage *zspage)
1911{
1912 rwlock_init(&zspage->lock);
1913}
1914
1915static void migrate_read_lock(struct zspage *zspage)
1916{
1917 read_lock(&zspage->lock);
1918}
1919
1920static void migrate_read_unlock(struct zspage *zspage)
1921{
1922 read_unlock(&zspage->lock);
1923}
1924
1925static void migrate_write_lock(struct zspage *zspage)
1926{
1927 write_lock(&zspage->lock);
1928}
1929
1930static void migrate_write_unlock(struct zspage *zspage)
1931{
1932 write_unlock(&zspage->lock);
1933}
1934
1935/* Number of isolated subpage for *page migration* in this zspage */
1936static void inc_zspage_isolation(struct zspage *zspage)
1937{
1938 zspage->isolated++;
1939}
1940
1941static void dec_zspage_isolation(struct zspage *zspage)
1942{
1943 zspage->isolated--;
1944}
1945
1946static void replace_sub_page(struct size_class *class, struct zspage *zspage,
1947 struct page *newpage, struct page *oldpage)
1948{
1949 struct page *page;
1950 struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, };
1951 int idx = 0;
1952
1953 page = get_first_page(zspage);
1954 do {
1955 if (page == oldpage)
1956 pages[idx] = newpage;
1957 else
1958 pages[idx] = page;
1959 idx++;
1960 } while ((page = get_next_page(page)) != NULL);
1961
1962 create_page_chain(class, zspage, pages);
1963 set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
1964 if (unlikely(PageHugeObject(oldpage)))
1965 newpage->index = oldpage->index;
1966 __SetPageMovable(newpage, page_mapping(oldpage));
1967}
1968
1969bool zs_page_isolate(struct page *page, isolate_mode_t mode)
1970{
1971 struct zs_pool *pool;
1972 struct size_class *class;
1973 int class_idx;
1974 enum fullness_group fullness;
1975 struct zspage *zspage;
1976 struct address_space *mapping;
1977
1978 /*
1979 * Page is locked so zspage couldn't be destroyed. For detail, look at
1980 * lock_zspage in free_zspage.
1981 */
1982 VM_BUG_ON_PAGE(!PageMovable(page), page);
1983 VM_BUG_ON_PAGE(PageIsolated(page), page);
1984
1985 zspage = get_zspage(page);
1986
1987 /*
1988 * Without class lock, fullness could be stale while class_idx is okay
1989 * because class_idx is constant unless page is freed so we should get
1990 * fullness again under class lock.
1991 */
1992 get_zspage_mapping(zspage, &class_idx, &fullness);
1993 mapping = page_mapping(page);
1994 pool = mapping->private_data;
1995 class = pool->size_class[class_idx];
1996
1997 spin_lock(&class->lock);
1998 if (get_zspage_inuse(zspage) == 0) {
1999 spin_unlock(&class->lock);
2000 return false;
1705 } 2001 }
1706 2002
1707 return fullness; 2003 /* zspage is isolated for object migration */
2004 if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
2005 spin_unlock(&class->lock);
2006 return false;
2007 }
2008
2009 /*
2010 * If this is first time isolation for the zspage, isolate zspage from
2011 * size_class to prevent further object allocation from the zspage.
2012 */
2013 if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
2014 get_zspage_mapping(zspage, &class_idx, &fullness);
2015 remove_zspage(class, zspage, fullness);
2016 }
2017
2018 inc_zspage_isolation(zspage);
2019 spin_unlock(&class->lock);
2020
2021 return true;
1708} 2022}
1709 2023
1710static struct page *isolate_source_page(struct size_class *class) 2024int zs_page_migrate(struct address_space *mapping, struct page *newpage,
2025 struct page *page, enum migrate_mode mode)
2026{
2027 struct zs_pool *pool;
2028 struct size_class *class;
2029 int class_idx;
2030 enum fullness_group fullness;
2031 struct zspage *zspage;
2032 struct page *dummy;
2033 void *s_addr, *d_addr, *addr;
2034 int offset, pos;
2035 unsigned long handle, head;
2036 unsigned long old_obj, new_obj;
2037 unsigned int obj_idx;
2038 int ret = -EAGAIN;
2039
2040 VM_BUG_ON_PAGE(!PageMovable(page), page);
2041 VM_BUG_ON_PAGE(!PageIsolated(page), page);
2042
2043 zspage = get_zspage(page);
2044
2045 /* Concurrent compactor cannot migrate any subpage in zspage */
2046 migrate_write_lock(zspage);
2047 get_zspage_mapping(zspage, &class_idx, &fullness);
2048 pool = mapping->private_data;
2049 class = pool->size_class[class_idx];
2050 offset = get_first_obj_offset(page);
2051
2052 spin_lock(&class->lock);
2053 if (!get_zspage_inuse(zspage)) {
2054 ret = -EBUSY;
2055 goto unlock_class;
2056 }
2057
2058 pos = offset;
2059 s_addr = kmap_atomic(page);
2060 while (pos < PAGE_SIZE) {
2061 head = obj_to_head(page, s_addr + pos);
2062 if (head & OBJ_ALLOCATED_TAG) {
2063 handle = head & ~OBJ_ALLOCATED_TAG;
2064 if (!trypin_tag(handle))
2065 goto unpin_objects;
2066 }
2067 pos += class->size;
2068 }
2069
2070 /*
2071 * Here, any user cannot access all objects in the zspage so let's move.
2072 */
2073 d_addr = kmap_atomic(newpage);
2074 memcpy(d_addr, s_addr, PAGE_SIZE);
2075 kunmap_atomic(d_addr);
2076
2077 for (addr = s_addr + offset; addr < s_addr + pos;
2078 addr += class->size) {
2079 head = obj_to_head(page, addr);
2080 if (head & OBJ_ALLOCATED_TAG) {
2081 handle = head & ~OBJ_ALLOCATED_TAG;
2082 if (!testpin_tag(handle))
2083 BUG();
2084
2085 old_obj = handle_to_obj(handle);
2086 obj_to_location(old_obj, &dummy, &obj_idx);
2087 new_obj = (unsigned long)location_to_obj(newpage,
2088 obj_idx);
2089 new_obj |= BIT(HANDLE_PIN_BIT);
2090 record_obj(handle, new_obj);
2091 }
2092 }
2093
2094 replace_sub_page(class, zspage, newpage, page);
2095 get_page(newpage);
2096
2097 dec_zspage_isolation(zspage);
2098
2099 /*
2100 * Page migration is done so let's putback isolated zspage to
2101 * the list if @page is final isolated subpage in the zspage.
2102 */
2103 if (!is_zspage_isolated(zspage))
2104 putback_zspage(class, zspage);
2105
2106 reset_page(page);
2107 put_page(page);
2108 page = newpage;
2109
2110 ret = MIGRATEPAGE_SUCCESS;
2111unpin_objects:
2112 for (addr = s_addr + offset; addr < s_addr + pos;
2113 addr += class->size) {
2114 head = obj_to_head(page, addr);
2115 if (head & OBJ_ALLOCATED_TAG) {
2116 handle = head & ~OBJ_ALLOCATED_TAG;
2117 if (!testpin_tag(handle))
2118 BUG();
2119 unpin_tag(handle);
2120 }
2121 }
2122 kunmap_atomic(s_addr);
2123unlock_class:
2124 spin_unlock(&class->lock);
2125 migrate_write_unlock(zspage);
2126
2127 return ret;
2128}
2129
2130void zs_page_putback(struct page *page)
2131{
2132 struct zs_pool *pool;
2133 struct size_class *class;
2134 int class_idx;
2135 enum fullness_group fg;
2136 struct address_space *mapping;
2137 struct zspage *zspage;
2138
2139 VM_BUG_ON_PAGE(!PageMovable(page), page);
2140 VM_BUG_ON_PAGE(!PageIsolated(page), page);
2141
2142 zspage = get_zspage(page);
2143 get_zspage_mapping(zspage, &class_idx, &fg);
2144 mapping = page_mapping(page);
2145 pool = mapping->private_data;
2146 class = pool->size_class[class_idx];
2147
2148 spin_lock(&class->lock);
2149 dec_zspage_isolation(zspage);
2150 if (!is_zspage_isolated(zspage)) {
2151 fg = putback_zspage(class, zspage);
2152 /*
2153 * Due to page_lock, we cannot free zspage immediately
2154 * so let's defer.
2155 */
2156 if (fg == ZS_EMPTY)
2157 schedule_work(&pool->free_work);
2158 }
2159 spin_unlock(&class->lock);
2160}
2161
2162const struct address_space_operations zsmalloc_aops = {
2163 .isolate_page = zs_page_isolate,
2164 .migratepage = zs_page_migrate,
2165 .putback_page = zs_page_putback,
2166};
2167
2168static int zs_register_migration(struct zs_pool *pool)
2169{
2170 pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb);
2171 if (IS_ERR(pool->inode)) {
2172 pool->inode = NULL;
2173 return 1;
2174 }
2175
2176 pool->inode->i_mapping->private_data = pool;
2177 pool->inode->i_mapping->a_ops = &zsmalloc_aops;
2178 return 0;
2179}
2180
2181static void zs_unregister_migration(struct zs_pool *pool)
2182{
2183 flush_work(&pool->free_work);
2184 if (pool->inode)
2185 iput(pool->inode);
2186}
2187
2188/*
2189 * Caller should hold page_lock of all pages in the zspage
2190 * In here, we cannot use zspage meta data.
2191 */
2192static void async_free_zspage(struct work_struct *work)
1711{ 2193{
1712 int i; 2194 int i;
1713 struct page *page = NULL; 2195 struct size_class *class;
2196 unsigned int class_idx;
2197 enum fullness_group fullness;
2198 struct zspage *zspage, *tmp;
2199 LIST_HEAD(free_pages);
2200 struct zs_pool *pool = container_of(work, struct zs_pool,
2201 free_work);
1714 2202
1715 for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) { 2203 for (i = 0; i < zs_size_classes; i++) {
1716 page = class->fullness_list[i]; 2204 class = pool->size_class[i];
1717 if (!page) 2205 if (class->index != i)
1718 continue; 2206 continue;
1719 2207
1720 remove_zspage(class, i, page); 2208 spin_lock(&class->lock);
1721 break; 2209 list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages);
2210 spin_unlock(&class->lock);
2211 }
2212
2213
2214 list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
2215 list_del(&zspage->list);
2216 lock_zspage(zspage);
2217
2218 get_zspage_mapping(zspage, &class_idx, &fullness);
2219 VM_BUG_ON(fullness != ZS_EMPTY);
2220 class = pool->size_class[class_idx];
2221 spin_lock(&class->lock);
2222 __free_zspage(pool, pool->size_class[class_idx], zspage);
2223 spin_unlock(&class->lock);
1722 } 2224 }
2225};
2226
2227static void kick_deferred_free(struct zs_pool *pool)
2228{
2229 schedule_work(&pool->free_work);
2230}
2231
2232static void init_deferred_free(struct zs_pool *pool)
2233{
2234 INIT_WORK(&pool->free_work, async_free_zspage);
2235}
2236
2237static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
2238{
2239 struct page *page = get_first_page(zspage);
1723 2240
1724 return page; 2241 do {
2242 WARN_ON(!trylock_page(page));
2243 __SetPageMovable(page, pool->inode->i_mapping);
2244 unlock_page(page);
2245 } while ((page = get_next_page(page)) != NULL);
1725} 2246}
2247#endif
1726 2248
1727/* 2249/*
1728 * 2250 *
@@ -1748,20 +2270,20 @@ static unsigned long zs_can_compact(struct size_class *class)
1748static void __zs_compact(struct zs_pool *pool, struct size_class *class) 2270static void __zs_compact(struct zs_pool *pool, struct size_class *class)
1749{ 2271{
1750 struct zs_compact_control cc; 2272 struct zs_compact_control cc;
1751 struct page *src_page; 2273 struct zspage *src_zspage;
1752 struct page *dst_page = NULL; 2274 struct zspage *dst_zspage = NULL;
1753 2275
1754 spin_lock(&class->lock); 2276 spin_lock(&class->lock);
1755 while ((src_page = isolate_source_page(class))) { 2277 while ((src_zspage = isolate_zspage(class, true))) {
1756 2278
1757 if (!zs_can_compact(class)) 2279 if (!zs_can_compact(class))
1758 break; 2280 break;
1759 2281
1760 cc.index = 0; 2282 cc.index = 0;
1761 cc.s_page = src_page; 2283 cc.s_page = get_first_page(src_zspage);
1762 2284
1763 while ((dst_page = isolate_target_page(class))) { 2285 while ((dst_zspage = isolate_zspage(class, false))) {
1764 cc.d_page = dst_page; 2286 cc.d_page = get_first_page(dst_zspage);
1765 /* 2287 /*
1766 * If there is no more space in dst_page, resched 2288 * If there is no more space in dst_page, resched
1767 * and see if anyone had allocated another zspage. 2289 * and see if anyone had allocated another zspage.
@@ -1769,23 +2291,25 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
1769 if (!migrate_zspage(pool, class, &cc)) 2291 if (!migrate_zspage(pool, class, &cc))
1770 break; 2292 break;
1771 2293
1772 putback_zspage(pool, class, dst_page); 2294 putback_zspage(class, dst_zspage);
1773 } 2295 }
1774 2296
1775 /* Stop if we couldn't find slot */ 2297 /* Stop if we couldn't find slot */
1776 if (dst_page == NULL) 2298 if (dst_zspage == NULL)
1777 break; 2299 break;
1778 2300
1779 putback_zspage(pool, class, dst_page); 2301 putback_zspage(class, dst_zspage);
1780 if (putback_zspage(pool, class, src_page) == ZS_EMPTY) 2302 if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
2303 free_zspage(pool, class, src_zspage);
1781 pool->stats.pages_compacted += class->pages_per_zspage; 2304 pool->stats.pages_compacted += class->pages_per_zspage;
2305 }
1782 spin_unlock(&class->lock); 2306 spin_unlock(&class->lock);
1783 cond_resched(); 2307 cond_resched();
1784 spin_lock(&class->lock); 2308 spin_lock(&class->lock);
1785 } 2309 }
1786 2310
1787 if (src_page) 2311 if (src_zspage)
1788 putback_zspage(pool, class, src_page); 2312 putback_zspage(class, src_zspage);
1789 2313
1790 spin_unlock(&class->lock); 2314 spin_unlock(&class->lock);
1791} 2315}
@@ -1892,6 +2416,7 @@ struct zs_pool *zs_create_pool(const char *name)
1892 if (!pool) 2416 if (!pool)
1893 return NULL; 2417 return NULL;
1894 2418
2419 init_deferred_free(pool);
1895 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), 2420 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
1896 GFP_KERNEL); 2421 GFP_KERNEL);
1897 if (!pool->size_class) { 2422 if (!pool->size_class) {
@@ -1903,7 +2428,7 @@ struct zs_pool *zs_create_pool(const char *name)
1903 if (!pool->name) 2428 if (!pool->name)
1904 goto err; 2429 goto err;
1905 2430
1906 if (create_handle_cache(pool)) 2431 if (create_cache(pool))
1907 goto err; 2432 goto err;
1908 2433
1909 /* 2434 /*
@@ -1914,6 +2439,7 @@ struct zs_pool *zs_create_pool(const char *name)
1914 int size; 2439 int size;
1915 int pages_per_zspage; 2440 int pages_per_zspage;
1916 struct size_class *class; 2441 struct size_class *class;
2442 int fullness = 0;
1917 2443
1918 size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; 2444 size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
1919 if (size > ZS_MAX_ALLOC_SIZE) 2445 if (size > ZS_MAX_ALLOC_SIZE)
@@ -1943,11 +2469,13 @@ struct zs_pool *zs_create_pool(const char *name)
1943 class->size = size; 2469 class->size = size;
1944 class->index = i; 2470 class->index = i;
1945 class->pages_per_zspage = pages_per_zspage; 2471 class->pages_per_zspage = pages_per_zspage;
1946 if (pages_per_zspage == 1 && 2472 class->objs_per_zspage = class->pages_per_zspage *
1947 get_maxobj_per_zspage(size, pages_per_zspage) == 1) 2473 PAGE_SIZE / class->size;
1948 class->huge = true;
1949 spin_lock_init(&class->lock); 2474 spin_lock_init(&class->lock);
1950 pool->size_class[i] = class; 2475 pool->size_class[i] = class;
2476 for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS;
2477 fullness++)
2478 INIT_LIST_HEAD(&class->fullness_list[fullness]);
1951 2479
1952 prev_class = class; 2480 prev_class = class;
1953 } 2481 }
@@ -1955,6 +2483,9 @@ struct zs_pool *zs_create_pool(const char *name)
1955 /* debug only, don't abort if it fails */ 2483 /* debug only, don't abort if it fails */
1956 zs_pool_stat_create(pool, name); 2484 zs_pool_stat_create(pool, name);
1957 2485
2486 if (zs_register_migration(pool))
2487 goto err;
2488
1958 /* 2489 /*
1959 * Not critical, we still can use the pool 2490 * Not critical, we still can use the pool
1960 * and user can trigger compaction manually. 2491 * and user can trigger compaction manually.
@@ -1974,6 +2505,7 @@ void zs_destroy_pool(struct zs_pool *pool)
1974 int i; 2505 int i;
1975 2506
1976 zs_unregister_shrinker(pool); 2507 zs_unregister_shrinker(pool);
2508 zs_unregister_migration(pool);
1977 zs_pool_stat_destroy(pool); 2509 zs_pool_stat_destroy(pool);
1978 2510
1979 for (i = 0; i < zs_size_classes; i++) { 2511 for (i = 0; i < zs_size_classes; i++) {
@@ -1986,8 +2518,8 @@ void zs_destroy_pool(struct zs_pool *pool)
1986 if (class->index != i) 2518 if (class->index != i)
1987 continue; 2519 continue;
1988 2520
1989 for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { 2521 for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) {
1990 if (class->fullness_list[fg]) { 2522 if (!list_empty(&class->fullness_list[fg])) {
1991 pr_info("Freeing non-empty class with size %db, fullness group %d\n", 2523 pr_info("Freeing non-empty class with size %db, fullness group %d\n",
1992 class->size, fg); 2524 class->size, fg);
1993 } 2525 }
@@ -1995,7 +2527,7 @@ void zs_destroy_pool(struct zs_pool *pool)
1995 kfree(class); 2527 kfree(class);
1996 } 2528 }
1997 2529
1998 destroy_handle_cache(pool); 2530 destroy_cache(pool);
1999 kfree(pool->size_class); 2531 kfree(pool->size_class);
2000 kfree(pool->name); 2532 kfree(pool->name);
2001 kfree(pool); 2533 kfree(pool);
@@ -2004,7 +2536,13 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool);
2004 2536
2005static int __init zs_init(void) 2537static int __init zs_init(void)
2006{ 2538{
2007 int ret = zs_register_cpu_notifier(); 2539 int ret;
2540
2541 ret = zsmalloc_mount();
2542 if (ret)
2543 goto out;
2544
2545 ret = zs_register_cpu_notifier();
2008 2546
2009 if (ret) 2547 if (ret)
2010 goto notifier_fail; 2548 goto notifier_fail;
@@ -2021,7 +2559,8 @@ static int __init zs_init(void)
2021 2559
2022notifier_fail: 2560notifier_fail:
2023 zs_unregister_cpu_notifier(); 2561 zs_unregister_cpu_notifier();
2024 2562 zsmalloc_unmount();
2563out:
2025 return ret; 2564 return ret;
2026} 2565}
2027 2566
@@ -2030,6 +2569,7 @@ static void __exit zs_exit(void)
2030#ifdef CONFIG_ZPOOL 2569#ifdef CONFIG_ZPOOL
2031 zpool_unregister_driver(&zs_zpool_driver); 2570 zpool_unregister_driver(&zs_zpool_driver);
2032#endif 2571#endif
2572 zsmalloc_unmount();
2033 zs_unregister_cpu_notifier(); 2573 zs_unregister_cpu_notifier();
2034 2574
2035 zs_stat_exit(); 2575 zs_stat_exit();
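
Aside on the zsmalloc hunks above: the bulk of the change wires the pool into the generic movable-page framework that this merge also touches elsewhere. Each zspage subpage is backed by an inode from a private pseudo filesystem, marked movable with __SetPageMovable(), and then handled through the isolate_page/migratepage/putback_page address_space_operations (zs_page_isolate, zs_page_migrate, zs_page_putback). The following is only a condensed sketch of that wiring, not buildable as a standalone module and not a drop-in replacement; every dummy_* name is hypothetical, standing in for the zsmalloc_* / zs_* identifiers shown in the diff.

	#include <linux/fs.h>
	#include <linux/migrate.h>
	#include <linux/mount.h>
	#include <linux/pagemap.h>

	#define DUMMY_MAGIC	0x64756d79	/* arbitrary magic for the pseudo fs */

	static struct vfsmount *dummy_mnt;

	static struct dentry *dummy_mount(struct file_system_type *fs_type,
					  int flags, const char *dev_name, void *data)
	{
		static const struct dentry_operations ops = {
			.d_dname = simple_dname,
		};

		/* Anonymous pseudo fs: exists only to supply inodes/mappings. */
		return mount_pseudo(fs_type, "dummy:", NULL, &ops, DUMMY_MAGIC);
	}

	static struct file_system_type dummy_fs = {
		.name		= "dummy",
		.mount		= dummy_mount,
		.kill_sb	= kill_anon_super,
	};

	static bool dummy_isolate(struct page *page, isolate_mode_t mode)
	{
		/* Pin the backing object and take it off allocation lists. */
		return true;
	}

	static int dummy_migrate(struct address_space *mapping, struct page *newpage,
				 struct page *page, enum migrate_mode mode)
	{
		/* Copy the payload and retarget metadata from page to newpage. */
		return MIGRATEPAGE_SUCCESS;
	}

	static void dummy_putback(struct page *page)
	{
		/* Migration failed or was abandoned: undo dummy_isolate(). */
	}

	static const struct address_space_operations dummy_aops = {
		.isolate_page	= dummy_isolate,
		.migratepage	= dummy_migrate,
		.putback_page	= dummy_putback,
	};

	/* Per-pool setup, mirroring zs_register_migration()/SetZsPageMovable(). */
	static int dummy_make_movable(struct inode **inode, struct page *page)
	{
		dummy_mnt = kern_mount(&dummy_fs);
		if (IS_ERR(dummy_mnt))
			return PTR_ERR(dummy_mnt);

		*inode = alloc_anon_inode(dummy_mnt->mnt_sb);
		if (IS_ERR(*inode))
			return PTR_ERR(*inode);
		(*inode)->i_mapping->a_ops = &dummy_aops;

		/* __SetPageMovable() requires the page lock, as in SetZsPageMovable(). */
		lock_page(page);
		__SetPageMovable(page, (*inode)->i_mapping);
		unlock_page(page);
		return 0;
	}

The real zs_page_migrate additionally pins every allocated object in the page (trypin_tag) before copying, rebuilds the page chain with replace_sub_page(), and defers freeing of isolated-but-empty zspages to async_free_zspage(), as the hunks show.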
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 735362c26c8e..f1dffe84f0d5 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -769,6 +769,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
769 lockdep_set_class(&sk->sk_receive_queue.lock, 769 lockdep_set_class(&sk->sk_receive_queue.lock,
770 &af_unix_sk_receive_queue_lock_key); 770 &af_unix_sk_receive_queue_lock_key);
771 771
772 sk->sk_allocation = GFP_KERNEL_ACCOUNT;
772 sk->sk_write_space = unix_write_space; 773 sk->sk_write_space = unix_write_space;
773 sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; 774 sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
774 sk->sk_destruct = unix_sock_destructor; 775 sk->sk_destruct = unix_sock_destructor;
diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter
index 0254f3ba0dba..19f5adfd877d 100755
--- a/scripts/bloat-o-meter
+++ b/scripts/bloat-o-meter
@@ -67,5 +67,5 @@ print("%-40s %7s %7s %+7s" % ("function", "old", "new", "delta"))
67for d, n in delta: 67for d, n in delta:
68 if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d)) 68 if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d))
69 69
70print("Total: Before=%d, After=%d, chg %f%%" % \ 70print("Total: Before=%d, After=%d, chg %+.2f%%" % \
71 (otot, ntot, (ntot - otot)*100/otot)) 71 (otot, ntot, (ntot - otot)*100.0/otot))
diff --git a/scripts/tags.sh b/scripts/tags.sh
index f72f48f638ae..ed7eef24ef89 100755
--- a/scripts/tags.sh
+++ b/scripts/tags.sh
@@ -185,6 +185,9 @@ regex_c=(
185 '/\<CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/ClearPage\1/' 185 '/\<CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/ClearPage\1/'
186 '/\<__CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/__ClearPage\1/' 186 '/\<__CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/__ClearPage\1/'
187 '/\<TESTCLEARFLAG_FALSE(\([[:alnum:]_]*\).*/TestClearPage\1/' 187 '/\<TESTCLEARFLAG_FALSE(\([[:alnum:]_]*\).*/TestClearPage\1/'
188 '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/Page\1/'
189 '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/__SetPage\1/'
190 '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/__ClearPage\1/'
188 '/^TASK_PFA_TEST([^,]*, *\([[:alnum:]_]*\))/task_\1/' 191 '/^TASK_PFA_TEST([^,]*, *\([[:alnum:]_]*\))/task_\1/'
189 '/^TASK_PFA_SET([^,]*, *\([[:alnum:]_]*\))/task_set_\1/' 192 '/^TASK_PFA_SET([^,]*, *\([[:alnum:]_]*\))/task_set_\1/'
190 '/^TASK_PFA_CLEAR([^,]*, *\([[:alnum:]_]*\))/task_clear_\1/' 193 '/^TASK_PFA_CLEAR([^,]*, *\([[:alnum:]_]*\))/task_clear_\1/'
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index 77147b42d598..f1c055f3c243 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -79,12 +79,12 @@ static void add_list(char *buf, int len)
79 } 79 }
80} 80}
81 81
82#define BUF_SIZE 1024 82#define BUF_SIZE (128 * 1024)
83 83
84int main(int argc, char **argv) 84int main(int argc, char **argv)
85{ 85{
86 FILE *fin, *fout; 86 FILE *fin, *fout;
87 char buf[BUF_SIZE]; 87 char *buf;
88 int ret, i, count; 88 int ret, i, count;
89 struct block_list *list2; 89 struct block_list *list2;
90 struct stat st; 90 struct stat st;
@@ -107,6 +107,11 @@ int main(int argc, char **argv)
107 max_size = st.st_size / 100; /* hack ... */ 107 max_size = st.st_size / 100; /* hack ... */
108 108
109 list = malloc(max_size * sizeof(*list)); 109 list = malloc(max_size * sizeof(*list));
110 buf = malloc(BUF_SIZE);
111 if (!list || !buf) {
112 printf("Out of memory\n");
113 exit(1);
114 }
110 115
111 for ( ; ; ) { 116 for ( ; ; ) {
112 ret = read_block(buf, BUF_SIZE, fin); 117 ret = read_block(buf, BUF_SIZE, fin);