author     Linus Torvalds <torvalds@linux-foundation.org>   2016-07-26 22:55:54 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2016-07-26 22:55:54 -0400
commit     0e06f5c0deeef0332a5da2ecb8f1fcf3e024d958 (patch)
tree       e0f0af4aadf10c713c5cf1b65356844b3c9b3215
parent     f7816ad0f878dacd5f0120476f9b836ccf8699ea (diff)
parent     8f19b0c058d93a678a99dd6fec03af2e769943f2 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge updates from Andrew Morton:
- a few misc bits
- ocfs2
- most(?) of MM
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (125 commits)
thp: fix comments of __pmd_trans_huge_lock()
cgroup: remove unnecessary 0 check from css_from_id()
cgroup: fix idr leak for the first cgroup root
mm: memcontrol: fix documentation for compound parameter
mm: memcontrol: remove BUG_ON in uncharge_list
mm: fix build warnings in <linux/compaction.h>
mm, thp: convert from optimistic swapin collapsing to conservative
mm, thp: fix comment inconsistency for swapin readahead functions
thp: update Documentation/{vm/transhuge,filesystems/proc}.txt
shmem: split huge pages beyond i_size under memory pressure
thp: introduce CONFIG_TRANSPARENT_HUGE_PAGECACHE
khugepaged: add support of collapse for tmpfs/shmem pages
shmem: make shmem_inode_info::lock irq-safe
khugepaged: move up_read(mmap_sem) out of khugepaged_alloc_page()
thp: extract khugepaged from mm/huge_memory.c
shmem, thp: respect MADV_{NO,}HUGEPAGE for file mappings
shmem: add huge pages support
shmem: get_unmapped_area align huge page
shmem: prepare huge= mount option and sysfs knob
mm, rmap: account shmem thp pages
...
186 files changed, 7363 insertions, 4134 deletions
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 13100fb3c26d..0535ae1f73e5 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt | |||
@@ -59,23 +59,23 @@ num_devices parameter is optional and tells zram how many devices should be | |||
59 | pre-created. Default: 1. | 59 | pre-created. Default: 1. |
60 | 60 | ||
61 | 2) Set max number of compression streams | 61 | 2) Set max number of compression streams |
62 | Regardless the value passed to this attribute, ZRAM will always | 62 | Regardless the value passed to this attribute, ZRAM will always |
63 | allocate multiple compression streams - one per online CPUs - thus | 63 | allocate multiple compression streams - one per online CPUs - thus |
64 | allowing several concurrent compression operations. The number of | 64 | allowing several concurrent compression operations. The number of |
65 | allocated compression streams goes down when some of the CPUs | 65 | allocated compression streams goes down when some of the CPUs |
66 | become offline. There is no single-compression-stream mode anymore, | 66 | become offline. There is no single-compression-stream mode anymore, |
67 | unless you are running a UP system or has only 1 CPU online. | 67 | unless you are running a UP system or has only 1 CPU online. |
68 | 68 | ||
69 | To find out how many streams are currently available: | 69 | To find out how many streams are currently available: |
70 | cat /sys/block/zram0/max_comp_streams | 70 | cat /sys/block/zram0/max_comp_streams |
71 | 71 | ||
72 | 3) Select compression algorithm | 72 | 3) Select compression algorithm |
73 | Using comp_algorithm device attribute one can see available and | 73 | Using comp_algorithm device attribute one can see available and |
74 | currently selected (shown in square brackets) compression algorithms, | 74 | currently selected (shown in square brackets) compression algorithms, |
75 | change selected compression algorithm (once the device is initialised | 75 | change selected compression algorithm (once the device is initialised |
76 | there is no way to change compression algorithm). | 76 | there is no way to change compression algorithm). |
77 | 77 | ||
78 | Examples: | 78 | Examples: |
79 | #show supported compression algorithms | 79 | #show supported compression algorithms |
80 | cat /sys/block/zram0/comp_algorithm | 80 | cat /sys/block/zram0/comp_algorithm |
81 | lzo [lz4] | 81 | lzo [lz4] |
@@ -83,17 +83,27 @@ pre-created. Default: 1. | |||
83 | #select lzo compression algorithm | 83 | #select lzo compression algorithm |
84 | echo lzo > /sys/block/zram0/comp_algorithm | 84 | echo lzo > /sys/block/zram0/comp_algorithm |
85 | 85 | ||
86 | For the time being, the `comp_algorithm' content does not necessarily | ||
87 | show every compression algorithm supported by the kernel. We keep this | ||
88 | list primarily to simplify device configuration and one can configure | ||
89 | a new device with a compression algorithm that is not listed in | ||
90 | `comp_algorithm'. The thing is that, internally, ZRAM uses Crypto API | ||
91 | and, if some of the algorithms were built as modules, it's impossible | ||
92 | to list all of them using, for instance, /proc/crypto or any other | ||
93 | method. This, however, has an advantage of permitting the usage of | ||
94 | custom crypto compression modules (implementing S/W or H/W compression). | ||
95 | |||
86 | 4) Set Disksize | 96 | 4) Set Disksize |
87 | Set disk size by writing the value to sysfs node 'disksize'. | 97 | Set disk size by writing the value to sysfs node 'disksize'. |
88 | The value can be either in bytes or you can use mem suffixes. | 98 | The value can be either in bytes or you can use mem suffixes. |
89 | Examples: | 99 | Examples: |
90 | # Initialize /dev/zram0 with 50MB disksize | 100 | # Initialize /dev/zram0 with 50MB disksize |
91 | echo $((50*1024*1024)) > /sys/block/zram0/disksize | 101 | echo $((50*1024*1024)) > /sys/block/zram0/disksize |
92 | 102 | ||
93 | # Using mem suffixes | 103 | # Using mem suffixes |
94 | echo 256K > /sys/block/zram0/disksize | 104 | echo 256K > /sys/block/zram0/disksize |
95 | echo 512M > /sys/block/zram0/disksize | 105 | echo 512M > /sys/block/zram0/disksize |
96 | echo 1G > /sys/block/zram0/disksize | 106 | echo 1G > /sys/block/zram0/disksize |
97 | 107 | ||
98 | Note: | 108 | Note: |
99 | There is little point creating a zram of greater than twice the size of memory | 109 | There is little point creating a zram of greater than twice the size of memory |
@@ -101,20 +111,20 @@ since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the | |||
101 | size of the disk when not in use so a huge zram is wasteful. | 111 | size of the disk when not in use so a huge zram is wasteful. |
102 | 112 | ||
103 | 5) Set memory limit: Optional | 113 | 5) Set memory limit: Optional |
104 | Set memory limit by writing the value to sysfs node 'mem_limit'. | 114 | Set memory limit by writing the value to sysfs node 'mem_limit'. |
105 | The value can be either in bytes or you can use mem suffixes. | 115 | The value can be either in bytes or you can use mem suffixes. |
106 | In addition, you could change the value in runtime. | 116 | In addition, you could change the value in runtime. |
107 | Examples: | 117 | Examples: |
108 | # limit /dev/zram0 with 50MB memory | 118 | # limit /dev/zram0 with 50MB memory |
109 | echo $((50*1024*1024)) > /sys/block/zram0/mem_limit | 119 | echo $((50*1024*1024)) > /sys/block/zram0/mem_limit |
110 | 120 | ||
111 | # Using mem suffixes | 121 | # Using mem suffixes |
112 | echo 256K > /sys/block/zram0/mem_limit | 122 | echo 256K > /sys/block/zram0/mem_limit |
113 | echo 512M > /sys/block/zram0/mem_limit | 123 | echo 512M > /sys/block/zram0/mem_limit |
114 | echo 1G > /sys/block/zram0/mem_limit | 124 | echo 1G > /sys/block/zram0/mem_limit |
115 | 125 | ||
116 | # To disable memory limit | 126 | # To disable memory limit |
117 | echo 0 > /sys/block/zram0/mem_limit | 127 | echo 0 > /sys/block/zram0/mem_limit |
118 | 128 | ||
119 | 6) Activate: | 129 | 6) Activate: |
120 | mkswap /dev/zram0 | 130 | mkswap /dev/zram0 |
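The zram hunks above are wording-only, but the knobs they document are easy to drive from a program. Below is a minimal userspace sketch, not part of this patch, that applies the documented settings in order; it assumes the zram module is already loaded and that /dev/zram0 and its sysfs nodes exist.

```c
#include <stdio.h>

/* write a string value to a sysfs attribute; returns 0 on success */
static int write_sysfs(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* pick a compression algorithm before initializing the device */
	write_sysfs("/sys/block/zram0/comp_algorithm", "lz4");
	/* set the disksize using a mem suffix, as the documentation allows */
	write_sysfs("/sys/block/zram0/disksize", "512M");
	/* optional: cap the memory zram may consume */
	write_sysfs("/sys/block/zram0/mem_limit", "256M");
	return 0;
}
```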
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 75eea7ce3d7c..5a7386e38e2d 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking | |||
@@ -195,7 +195,9 @@ prototypes: | |||
195 | int (*releasepage) (struct page *, int); | 195 | int (*releasepage) (struct page *, int); |
196 | void (*freepage)(struct page *); | 196 | void (*freepage)(struct page *); |
197 | int (*direct_IO)(struct kiocb *, struct iov_iter *iter); | 197 | int (*direct_IO)(struct kiocb *, struct iov_iter *iter); |
198 | bool (*isolate_page) (struct page *, isolate_mode_t); | ||
198 | int (*migratepage)(struct address_space *, struct page *, struct page *); | 199 | int (*migratepage)(struct address_space *, struct page *, struct page *); |
200 | void (*putback_page) (struct page *); | ||
199 | int (*launder_page)(struct page *); | 201 | int (*launder_page)(struct page *); |
200 | int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); | 202 | int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); |
201 | int (*error_remove_page)(struct address_space *, struct page *); | 203 | int (*error_remove_page)(struct address_space *, struct page *); |
@@ -219,7 +221,9 @@ invalidatepage: yes | |||
219 | releasepage: yes | 221 | releasepage: yes |
220 | freepage: yes | 222 | freepage: yes |
221 | direct_IO: | 223 | direct_IO: |
224 | isolate_page: yes | ||
222 | migratepage: yes (both) | 225 | migratepage: yes (both) |
226 | putback_page: yes | ||
223 | launder_page: yes | 227 | launder_page: yes |
224 | is_partially_uptodate: yes | 228 | is_partially_uptodate: yes |
225 | error_remove_page: yes | 229 | error_remove_page: yes |
@@ -544,13 +548,13 @@ subsequent truncate), and then return with VM_FAULT_LOCKED, and the page | |||
544 | locked. The VM will unlock the page. | 548 | locked. The VM will unlock the page. |
545 | 549 | ||
546 | ->map_pages() is called when VM asks to map easy accessible pages. | 550 | ->map_pages() is called when VM asks to map easy accessible pages. |
547 | Filesystem should find and map pages associated with offsets from "pgoff" | 551 | Filesystem should find and map pages associated with offsets from "start_pgoff" |
548 | till "max_pgoff". ->map_pages() is called with page table locked and must | 552 | till "end_pgoff". ->map_pages() is called with page table locked and must |
549 | not block. If it's not possible to reach a page without blocking, | 553 | not block. If it's not possible to reach a page without blocking, |
550 | filesystem should skip it. Filesystem should use do_set_pte() to setup | 554 | filesystem should skip it. Filesystem should use do_set_pte() to setup |
551 | page table entry. Pointer to entry associated with offset "pgoff" is | 555 | page table entry. Pointer to entry associated with the page is passed in |
552 | passed in "pte" field in vm_fault structure. Pointers to entries for other | 556 | "pte" field in fault_env structure. Pointers to entries for other offsets |
553 | offsets should be calculated relative to "pte". | 557 | should be calculated relative to "pte". |
554 | 558 | ||
555 | ->page_mkwrite() is called when a previously read-only pte is | 559 | ->page_mkwrite() is called when a previously read-only pte is |
556 | about to become writeable. The filesystem again must ensure that there are | 560 | about to become writeable. The filesystem again must ensure that there are |
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index ce4587d257d2..0c16a22521a8 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt | |||
@@ -49,6 +49,7 @@ These block devices may be used for inspiration: | |||
49 | - axonram: Axon DDR2 device driver | 49 | - axonram: Axon DDR2 device driver |
50 | - brd: RAM backed block device driver | 50 | - brd: RAM backed block device driver |
51 | - dcssblk: s390 dcss block device driver | 51 | - dcssblk: s390 dcss block device driver |
52 | - pmem: NVDIMM persistent memory driver | ||
52 | 53 | ||
53 | 54 | ||
54 | Implementation Tips for Filesystem Writers | 55 | Implementation Tips for Filesystem Writers |
@@ -75,8 +76,9 @@ calls to get_block() (for example by a page-fault racing with a read() | |||
75 | or a write()) work correctly. | 76 | or a write()) work correctly. |
76 | 77 | ||
77 | These filesystems may be used for inspiration: | 78 | These filesystems may be used for inspiration: |
78 | - ext2: the second extended filesystem, see Documentation/filesystems/ext2.txt | 79 | - ext2: see Documentation/filesystems/ext2.txt |
79 | - ext4: the fourth extended filesystem, see Documentation/filesystems/ext4.txt | 80 | - ext4: see Documentation/filesystems/ext4.txt |
81 | - xfs: see Documentation/filesystems/xfs.txt | ||
80 | 82 | ||
81 | 83 | ||
82 | Handling Media Errors | 84 | Handling Media Errors |
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 5b61eeae3f6e..68080ad6a75e 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -436,6 +436,7 @@ Private_Dirty: 0 kB | |||
436 | Referenced: 892 kB | 436 | Referenced: 892 kB |
437 | Anonymous: 0 kB | 437 | Anonymous: 0 kB |
438 | AnonHugePages: 0 kB | 438 | AnonHugePages: 0 kB |
439 | ShmemPmdMapped: 0 kB | ||
439 | Shared_Hugetlb: 0 kB | 440 | Shared_Hugetlb: 0 kB |
440 | Private_Hugetlb: 0 kB | 441 | Private_Hugetlb: 0 kB |
441 | Swap: 0 kB | 442 | Swap: 0 kB |
@@ -464,6 +465,8 @@ accessed. | |||
464 | a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE | 465 | a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE |
465 | and a page is modified, the file page is replaced by a private anonymous copy. | 466 | and a page is modified, the file page is replaced by a private anonymous copy. |
466 | "AnonHugePages" shows the ammount of memory backed by transparent hugepage. | 467 | "AnonHugePages" shows the ammount of memory backed by transparent hugepage. |
468 | "ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by | ||
469 | huge pages. | ||
467 | "Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by | 470 | "Shared_Hugetlb" and "Private_Hugetlb" show the ammounts of memory backed by |
468 | hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical | 471 | hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical |
469 | reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field. | 472 | reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field. |
@@ -868,6 +871,9 @@ VmallocTotal: 112216 kB | |||
868 | VmallocUsed: 428 kB | 871 | VmallocUsed: 428 kB |
869 | VmallocChunk: 111088 kB | 872 | VmallocChunk: 111088 kB |
870 | AnonHugePages: 49152 kB | 873 | AnonHugePages: 49152 kB |
874 | ShmemHugePages: 0 kB | ||
875 | ShmemPmdMapped: 0 kB | ||
876 | |||
871 | 877 | ||
872 | MemTotal: Total usable ram (i.e. physical ram minus a few reserved | 878 | MemTotal: Total usable ram (i.e. physical ram minus a few reserved |
873 | bits and the kernel binary code) | 879 | bits and the kernel binary code) |
@@ -912,6 +918,9 @@ MemAvailable: An estimate of how much memory is available for starting new | |||
912 | AnonHugePages: Non-file backed huge pages mapped into userspace page tables | 918 | AnonHugePages: Non-file backed huge pages mapped into userspace page tables |
913 | Mapped: files which have been mmaped, such as libraries | 919 | Mapped: files which have been mmaped, such as libraries |
914 | Shmem: Total memory used by shared memory (shmem) and tmpfs | 920 | Shmem: Total memory used by shared memory (shmem) and tmpfs |
921 | ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated | ||
922 | with huge pages | ||
923 | ShmemPmdMapped: Shared memory mapped into userspace with huge pages | ||
915 | Slab: in-kernel data structures cache | 924 | Slab: in-kernel data structures cache |
916 | SReclaimable: Part of Slab, that might be reclaimed, such as caches | 925 | SReclaimable: Part of Slab, that might be reclaimed, such as caches |
917 | SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure | 926 | SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure |
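The two new meminfo fields documented above can be read back trivially; the following userspace sketch (illustrative, not part of this series) prints just ShmemHugePages and ShmemPmdMapped from /proc/meminfo.

```c
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f) {
		perror("/proc/meminfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		/* keep only the two shmem THP counters documented above */
		if (!strncmp(line, "ShmemHugePages:", 15) ||
		    !strncmp(line, "ShmemPmdMapped:", 15))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}
```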
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index c61a223ef3ff..900360cbcdae 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt | |||
@@ -592,9 +592,14 @@ struct address_space_operations { | |||
592 | int (*releasepage) (struct page *, int); | 592 | int (*releasepage) (struct page *, int); |
593 | void (*freepage)(struct page *); | 593 | void (*freepage)(struct page *); |
594 | ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); | 594 | ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); |
595 | /* isolate a page for migration */ | ||
596 | bool (*isolate_page) (struct page *, isolate_mode_t); | ||
595 | /* migrate the contents of a page to the specified target */ | 597 | /* migrate the contents of a page to the specified target */ |
596 | int (*migratepage) (struct page *, struct page *); | 598 | int (*migratepage) (struct page *, struct page *); |
599 | /* put migration-failed page back to right list */ | ||
600 | void (*putback_page) (struct page *); | ||
597 | int (*launder_page) (struct page *); | 601 | int (*launder_page) (struct page *); |
602 | |||
598 | int (*is_partially_uptodate) (struct page *, unsigned long, | 603 | int (*is_partially_uptodate) (struct page *, unsigned long, |
599 | unsigned long); | 604 | unsigned long); |
600 | void (*is_dirty_writeback) (struct page *, bool *, bool *); | 605 | void (*is_dirty_writeback) (struct page *, bool *, bool *); |
@@ -747,6 +752,10 @@ struct address_space_operations { | |||
747 | and transfer data directly between the storage and the | 752 | and transfer data directly between the storage and the |
748 | application's address space. | 753 | application's address space. |
749 | 754 | ||
755 | isolate_page: Called by the VM when isolating a movable non-lru page. | ||
756 | If page is successfully isolated, VM marks the page as PG_isolated | ||
757 | via __SetPageIsolated. | ||
758 | |||
750 | migrate_page: This is used to compact the physical memory usage. | 759 | migrate_page: This is used to compact the physical memory usage. |
751 | If the VM wants to relocate a page (maybe off a memory card | 760 | If the VM wants to relocate a page (maybe off a memory card |
752 | that is signalling imminent failure) it will pass a new page | 761 | that is signalling imminent failure) it will pass a new page |
@@ -754,6 +763,8 @@ struct address_space_operations { | |||
754 | transfer any private data across and update any references | 763 | transfer any private data across and update any references |
755 | that it has to the page. | 764 | that it has to the page. |
756 | 765 | ||
766 | putback_page: Called by the VM when isolated page's migration fails. | ||
767 | |||
757 | launder_page: Called before freeing a page - it writes back the dirty page. To | 768 | launder_page: Called before freeing a page - it writes back the dirty page. To |
758 | prevent redirtying the page, it is kept locked during the whole | 769 | prevent redirtying the page, it is kept locked during the whole |
759 | operation. | 770 | operation. |
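Taken together, the new isolate_page/putback_page hooks and the existing migratepage callback form the driver-side contract for movable non-LRU pages. A hedged skeleton of how a driver might wire them up is shown below; the mydrv_* names and the empty bodies are illustrative, while the signatures follow the prototypes quoted in this patch.

```c
#include <linux/fs.h>
#include <linux/migrate.h>

static bool mydrv_isolate_page(struct page *page, isolate_mode_t mode)
{
	/* detach the page from the driver's own lists; true means success */
	return true;
}

static int mydrv_migratepage(struct address_space *mapping,
			     struct page *newpage, struct page *page,
			     enum migrate_mode mode)
{
	/* copy contents and private state from page to newpage */
	return MIGRATEPAGE_SUCCESS;	/* or -EAGAIN to have the VM retry */
}

static void mydrv_putback_page(struct page *page)
{
	/* migration failed: put the page back onto the driver's lists */
}

static const struct address_space_operations mydrv_aops = {
	.isolate_page	= mydrv_isolate_page,
	.migratepage	= mydrv_migratepage,
	.putback_page	= mydrv_putback_page,
};
```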
diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index fea5c0864170..94bd9c11c4e0 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration | |||
@@ -142,5 +142,111 @@ Steps: | |||
142 | 20. The new page is moved to the LRU and can be scanned by the swapper | 142 | 20. The new page is moved to the LRU and can be scanned by the swapper |
143 | etc again. | 143 | etc again. |
144 | 144 | ||
145 | Christoph Lameter, May 8, 2006. | 145 | C. Non-LRU page migration |
146 | ------------------------- | ||
147 | |||
148 | Although original migration aimed for reducing the latency of memory access | ||
149 | for NUMA, compaction who want to create high-order page is also main customer. | ||
150 | |||
151 | The current problem with the implementation is that it is designed to migrate only | ||
152 | *LRU* pages. However, there are potential non-LRU pages which can be migrated | ||
153 | by drivers, for example, zsmalloc and virtio-balloon pages. | ||
154 | |||
155 | For virtio-balloon pages, some parts of the migration code path have been hooked | ||
156 | up and virtio-balloon specific functions added to intercept the migration logic. | ||
157 | This is too specific to one driver, so other drivers that want to make their pages | ||
158 | movable would have to add their own specific hooks to the migration path. | ||
159 | |||
160 | To overcome the problem, the VM supports non-LRU page migration, which provides | ||
161 | generic functions for non-LRU movable pages without driver specific hooks in the | ||
162 | migration path. | ||
163 | |||
164 | If a driver wants to make its own pages movable, it should define three functions, | ||
165 | which are function pointers of struct address_space_operations. | ||
166 | |||
167 | 1. bool (*isolate_page) (struct page *page, isolate_mode_t mode); | ||
168 | |||
169 | What the VM expects of a driver's isolate_page function is to return *true* | ||
170 | if the driver isolates the page successfully. On returning true, the VM marks the page | ||
171 | as PG_isolated so that concurrent isolation on several CPUs skips the page. | ||
172 | If a driver cannot isolate the page, it should return *false*. | ||
173 | |||
174 | Once the page is successfully isolated, the VM uses the page.lru fields, so the driver | ||
175 | shouldn't expect the values in those fields to be preserved. | ||
176 | |||
177 | 2. int (*migratepage) (struct address_space *mapping, | ||
178 | struct page *newpage, struct page *oldpage, enum migrate_mode); | ||
179 | |||
180 | After isolation, the VM calls the driver's migratepage with the isolated page. | ||
181 | The job of migratepage is to move the content of the old page to the new page | ||
182 | and to set up the fields of struct page newpage. Keep in mind that you should | ||
183 | indicate to the VM that the old page is no longer movable via __ClearPageMovable() | ||
184 | under page_lock if you migrated the old page successfully and return | ||
185 | MIGRATEPAGE_SUCCESS. If the driver cannot migrate the page at the moment, it | ||
186 | can return -EAGAIN. On -EAGAIN, the VM will retry page migration shortly, | ||
187 | because it interprets -EAGAIN as a "temporary migration failure". On returning | ||
188 | any error other than -EAGAIN, the VM will give up on migrating the page | ||
189 | and will not retry this time. | ||
190 | |||
191 | The driver shouldn't touch the page.lru field, which the VM uses in these functions. | ||
192 | |||
193 | 3. void (*putback_page)(struct page *); | ||
194 | |||
195 | If migration fails on an isolated page, the VM should return the isolated page | ||
196 | to the driver, so the VM calls the driver's putback_page with the page whose migration failed. | ||
197 | In this function, the driver should put the isolated page back into its own data | ||
198 | structure. | ||
146 | 199 | ||
200 | 4. non-lru movable page flags | ||
201 | |||
202 | There are two page flags for supporting non-lru movable page. | ||
203 | |||
204 | * PG_movable | ||
205 | |||
206 | Driver should use the below function to make page movable under page_lock. | ||
207 | |||
208 | void __SetPageMovable(struct page *page, struct address_space *mapping) | ||
209 | |||
210 | It takes an address_space argument to register the migration family of functions | ||
211 | which will be called by the VM. Strictly speaking, PG_movable is not a real flag of | ||
212 | struct page. Rather, the VM reuses the lower bits of page->mapping to represent it. | ||
213 | |||
214 | #define PAGE_MAPPING_MOVABLE 0x2 | ||
215 | page->mapping = page->mapping | PAGE_MAPPING_MOVABLE; | ||
216 | |||
217 | so the driver shouldn't access page->mapping directly. Instead, it should | ||
218 | use page_mapping(), which masks off the low two bits of page->mapping under the | ||
219 | page lock, so it can get the right struct address_space. | ||
220 | |||
221 | To test for a non-LRU movable page, the VM provides the __PageMovable function. | ||
222 | However, it doesn't guarantee to identify a non-LRU movable page because the | ||
223 | page->mapping field is unified with other variables in struct page. | ||
224 | As well, if the driver releases the page after isolation by the VM, page->mapping | ||
225 | doesn't have a stable value although it has PAGE_MAPPING_MOVABLE set | ||
226 | (look at __ClearPageMovable). But __PageMovable is a cheap way to tell whether a | ||
227 | page is LRU or non-LRU movable once the page has been isolated, because | ||
228 | LRU pages can never have PAGE_MAPPING_MOVABLE in page->mapping. It is also | ||
229 | good for just peeking to test non-LRU movable pages before the more expensive | ||
230 | check with lock_page during pfn scanning to select a victim. | ||
231 | |||
232 | To guarantee a page is non-LRU movable, the VM provides the PageMovable function. | ||
233 | Unlike __PageMovable, PageMovable validates page->mapping and | ||
234 | mapping->a_ops->isolate_page under lock_page. The lock_page prevents page->mapping | ||
235 | from being destroyed suddenly. | ||
236 | |||
237 | A driver using __SetPageMovable should clear the flag via __ClearPageMovable | ||
238 | under page_lock before releasing the page. | ||
239 | |||
240 | * PG_isolated | ||
241 | |||
242 | To prevent concurrent isolation among several CPUs, the VM marks an isolated page | ||
243 | as PG_isolated under lock_page. So if a CPU encounters a PG_isolated non-LRU | ||
244 | movable page, it can skip it. The driver doesn't need to manipulate the flag | ||
245 | because the VM will set/clear it automatically. Keep in mind that if the driver | ||
246 | sees a PG_isolated page, it means the page has been isolated by the VM, so it | ||
247 | shouldn't touch the page.lru field. | ||
248 | PG_isolated is aliased with the PG_reclaim flag, so the driver shouldn't use that flag | ||
249 | for its own purposes. | ||
250 | |||
251 | Christoph Lameter, May 8, 2006. | ||
252 | Minchan Kim, Mar 28, 2016. | ||
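As a companion to the text above, here is a minimal sketch of the registration lifecycle from a driver's point of view, assuming the __SetPageMovable()/__ClearPageMovable() helpers this series exports; the mydrv_* names and the backing address_space are illustrative.

```c
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/pagemap.h>

/* mydrv_mapping is assumed to carry a_ops providing the three callbacks above */
static struct page *mydrv_alloc_movable(struct address_space *mydrv_mapping)
{
	struct page *page = alloc_page(GFP_KERNEL);

	if (!page)
		return NULL;

	/* register the page as non-LRU movable, under the page lock */
	lock_page(page);
	__SetPageMovable(page, mydrv_mapping);
	unlock_page(page);
	return page;
}

static void mydrv_free_movable(struct page *page)
{
	/* clear the movable state before the page is released */
	lock_page(page);
	__ClearPageMovable(page);
	unlock_page(page);
	__free_page(page);
}
```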
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 7c871d6beb63..2ec6adb5a4ce 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt | |||
@@ -9,8 +9,8 @@ using huge pages for the backing of virtual memory with huge pages | |||
9 | that supports the automatic promotion and demotion of page sizes and | 9 | that supports the automatic promotion and demotion of page sizes and |
10 | without the shortcomings of hugetlbfs. | 10 | without the shortcomings of hugetlbfs. |
11 | 11 | ||
12 | Currently it only works for anonymous memory mappings but in the | 12 | Currently it only works for anonymous memory mappings and tmpfs/shmem. |
13 | future it can expand over the pagecache layer starting with tmpfs. | 13 | But in the future it can expand to other filesystems. |
14 | 14 | ||
15 | The reason applications are running faster is because of two | 15 | The reason applications are running faster is because of two |
16 | factors. The first factor is almost completely irrelevant and it's not | 16 | factors. The first factor is almost completely irrelevant and it's not |
@@ -57,10 +57,6 @@ miss is going to run faster. | |||
57 | feature that applies to all dynamic high order allocations in the | 57 | feature that applies to all dynamic high order allocations in the |
58 | kernel) | 58 | kernel) |
59 | 59 | ||
60 | - this initial support only offers the feature in the anonymous memory | ||
61 | regions but it'd be ideal to move it to tmpfs and the pagecache | ||
62 | later | ||
63 | |||
64 | Transparent Hugepage Support maximizes the usefulness of free memory | 60 | Transparent Hugepage Support maximizes the usefulness of free memory |
65 | if compared to the reservation approach of hugetlbfs by allowing all | 61 | if compared to the reservation approach of hugetlbfs by allowing all |
66 | unused memory to be used as cache or other movable (or even unmovable | 62 | unused memory to be used as cache or other movable (or even unmovable |
@@ -94,21 +90,21 @@ madvise(MADV_HUGEPAGE) on their critical mmapped regions. | |||
94 | 90 | ||
95 | == sysfs == | 91 | == sysfs == |
96 | 92 | ||
97 | Transparent Hugepage Support can be entirely disabled (mostly for | 93 | Transparent Hugepage Support for anonymous memory can be entirely disabled |
98 | debugging purposes) or only enabled inside MADV_HUGEPAGE regions (to | 94 | (mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE |
99 | avoid the risk of consuming more memory resources) or enabled system | 95 | regions (to avoid the risk of consuming more memory resources) or enabled |
100 | wide. This can be achieved with one of: | 96 | system wide. This can be achieved with one of: |
101 | 97 | ||
102 | echo always >/sys/kernel/mm/transparent_hugepage/enabled | 98 | echo always >/sys/kernel/mm/transparent_hugepage/enabled |
103 | echo madvise >/sys/kernel/mm/transparent_hugepage/enabled | 99 | echo madvise >/sys/kernel/mm/transparent_hugepage/enabled |
104 | echo never >/sys/kernel/mm/transparent_hugepage/enabled | 100 | echo never >/sys/kernel/mm/transparent_hugepage/enabled |
105 | 101 | ||
106 | It's also possible to limit defrag efforts in the VM to generate | 102 | It's also possible to limit defrag efforts in the VM to generate |
107 | hugepages in case they're not immediately free to madvise regions or | 103 | anonymous hugepages in case they're not immediately free to madvise |
108 | to never try to defrag memory and simply fallback to regular pages | 104 | regions or to never try to defrag memory and simply fallback to regular |
109 | unless hugepages are immediately available. Clearly if we spend CPU | 105 | pages unless hugepages are immediately available. Clearly if we spend CPU |
110 | time to defrag memory, we would expect to gain even more by the fact | 106 | time to defrag memory, we would expect to gain even more by the fact we |
111 | we use hugepages later instead of regular pages. This isn't always | 107 | use hugepages later instead of regular pages. This isn't always |
112 | guaranteed, but it may be more likely in case the allocation is for a | 108 | guaranteed, but it may be more likely in case the allocation is for a |
113 | MADV_HUGEPAGE region. | 109 | MADV_HUGEPAGE region. |
114 | 110 | ||
@@ -133,9 +129,9 @@ that are have used madvise(MADV_HUGEPAGE). This is the default behaviour. | |||
133 | 129 | ||
134 | "never" should be self-explanatory. | 130 | "never" should be self-explanatory. |
135 | 131 | ||
136 | By default kernel tries to use huge zero page on read page fault. | 132 | By default kernel tries to use huge zero page on read page fault to |
137 | It's possible to disable huge zero page by writing 0 or enable it | 133 | anonymous mapping. It's possible to disable huge zero page by writing 0 |
138 | back by writing 1: | 134 | or enable it back by writing 1: |
139 | 135 | ||
140 | echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page | 136 | echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page |
141 | echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page | 137 | echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page |
@@ -204,21 +200,67 @@ Support by passing the parameter "transparent_hugepage=always" or | |||
204 | "transparent_hugepage=madvise" or "transparent_hugepage=never" | 200 | "transparent_hugepage=madvise" or "transparent_hugepage=never" |
205 | (without "") to the kernel command line. | 201 | (without "") to the kernel command line. |
206 | 202 | ||
203 | == Hugepages in tmpfs/shmem == | ||
204 | |||
205 | You can control hugepage allocation policy in tmpfs with mount option | ||
206 | "huge=". It can have following values: | ||
207 | |||
208 | - "always": | ||
209 | Attempt to allocate huge pages every time we need a new page; | ||
210 | |||
211 | - "never": | ||
212 | Do not allocate huge pages; | ||
213 | |||
214 | - "within_size": | ||
215 | Only allocate huge page if it will be fully within i_size. | ||
216 | Also respect fadvise()/madvise() hints; | ||
217 | |||
218 | - "advise: | ||
219 | Only allocate huge pages if requested with fadvise()/madvise(); | ||
220 | |||
221 | The default policy is "never". | ||
222 | |||
223 | "mount -o remount,huge= /mountpoint" works fine after mount: remounting | ||
224 | huge=never will not attempt to break up huge pages at all, just stop more | ||
225 | from being allocated. | ||
226 | |||
227 | There's also a sysfs knob to control hugepage allocation policy for the internal | ||
228 | shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount | ||
229 | is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or | ||
230 | MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem. | ||
231 | |||
232 | In addition to policies listed above, shmem_enabled allows two further | ||
233 | values: | ||
234 | |||
235 | - "deny": | ||
236 | For use in emergencies, to force the huge option off from | ||
237 | all mounts; | ||
238 | - "force": | ||
239 | Force the huge option on for all - very useful for testing; | ||
240 | |||
207 | == Need of application restart == | 241 | == Need of application restart == |
208 | 242 | ||
209 | The transparent_hugepage/enabled values only affect future | 243 | The transparent_hugepage/enabled values and tmpfs mount option only affect |
210 | behavior. So to make them effective you need to restart any | 244 | future behavior. So to make them effective you need to restart any |
211 | application that could have been using hugepages. This also applies to | 245 | application that could have been using hugepages. This also applies to the |
212 | the regions registered in khugepaged. | 246 | regions registered in khugepaged. |
213 | 247 | ||
214 | == Monitoring usage == | 248 | == Monitoring usage == |
215 | 249 | ||
216 | The number of transparent huge pages currently used by the system is | 250 | The number of anonymous transparent huge pages currently used by the |
217 | available by reading the AnonHugePages field in /proc/meminfo. To | 251 | system is available by reading the AnonHugePages field in /proc/meminfo. |
218 | identify what applications are using transparent huge pages, it is | 252 | To identify what applications are using anonymous transparent huge pages, |
219 | necessary to read /proc/PID/smaps and count the AnonHugePages fields | 253 | it is necessary to read /proc/PID/smaps and count the AnonHugePages fields |
220 | for each mapping. Note that reading the smaps file is expensive and | 254 | for each mapping. |
221 | reading it frequently will incur overhead. | 255 | |
256 | The number of file transparent huge pages mapped to userspace is available | ||
257 | by reading ShmemPmdMapped and ShmemHugePages fields in /proc/meminfo. | ||
258 | To identify what applications are mapping file transparent huge pages, it | ||
259 | is necessary to read /proc/PID/smaps and count the FileHugeMapped fields | ||
260 | for each mapping. | ||
261 | |||
262 | Note that reading the smaps file is expensive and reading it | ||
263 | frequently will incur overhead. | ||
222 | 264 | ||
223 | There are a number of counters in /proc/vmstat that may be used to | 265 | There are a number of counters in /proc/vmstat that may be used to |
224 | monitor how successfully the system is providing huge pages for use. | 266 | monitor how successfully the system is providing huge pages for use. |
@@ -238,6 +280,12 @@ thp_collapse_alloc_failed is incremented if khugepaged found a range | |||
238 | of pages that should be collapsed into one huge page but failed | 280 | of pages that should be collapsed into one huge page but failed |
239 | the allocation. | 281 | the allocation. |
240 | 282 | ||
283 | thp_file_alloc is incremented every time a file huge page is successfully | ||
284 | allocated. | ||
285 | |||
286 | thp_file_mapped is incremented every time a file huge page is mapped into | ||
287 | user address space. | ||
288 | |||
241 | thp_split_page is incremented every time a huge page is split into base | 289 | thp_split_page is incremented every time a huge page is split into base |
242 | pages. This can happen for a variety of reasons but a common | 290 | pages. This can happen for a variety of reasons but a common |
243 | reason is that a huge page is old and is being reclaimed. | 291 | reason is that a huge page is old and is being reclaimed. |
@@ -403,19 +451,27 @@ pages: | |||
403 | on relevant sub-page of the compound page. | 451 | on relevant sub-page of the compound page. |
404 | 452 | ||
405 | - map/unmap of the whole compound page accounted in compound_mapcount | 453 | - map/unmap of the whole compound page accounted in compound_mapcount |
406 | (stored in first tail page). | 454 | (stored in first tail page). For file huge pages, we also increment |
455 | ->_mapcount of all sub-pages in order to have race-free detection of | ||
456 | last unmap of subpages. | ||
407 | 457 | ||
408 | PageDoubleMap() indicates that ->_mapcount in all subpages is offset up by one. | 458 | PageDoubleMap() indicates that the page is *possibly* mapped with PTEs. |
409 | This additional reference is required to get race-free detection of unmap of | 459 | |
410 | subpages when we have them mapped with both PMDs and PTEs. | 460 | For anonymous pages PageDoubleMap() also indicates ->_mapcount in all |
461 | subpages is offset up by one. This additional reference is required to | ||
462 | get race-free detection of unmap of subpages when we have them mapped with | ||
463 | both PMDs and PTEs. | ||
411 | 464 | ||
412 | This is optimization required to lower overhead of per-subpage mapcount | 465 | This is optimization required to lower overhead of per-subpage mapcount |
413 | tracking. The alternative is alter ->_mapcount in all subpages on each | 466 | tracking. The alternative is alter ->_mapcount in all subpages on each |
414 | map/unmap of the whole compound page. | 467 | map/unmap of the whole compound page. |
415 | 468 | ||
416 | We set PG_double_map when a PMD of the page got split for the first time, | 469 | For anonymous pages, we set PG_double_map when a PMD of the page got split |
417 | but still have PMD mapping. The additional references go away with last | 470 | for the first time, but still have PMD mapping. The additional references |
418 | compound_mapcount. | 471 | go away with last compound_mapcount. |
472 | |||
473 | File pages get PG_double_map set on first map of the page with PTE and | ||
474 | goes away when the page gets evicted from page cache. | ||
419 | 475 | ||
420 | split_huge_page internally has to distribute the refcounts in the head | 476 | split_huge_page internally has to distribute the refcounts in the head |
421 | page to the tail pages before clearing all PG_head/tail bits from the page | 477 | page to the tail pages before clearing all PG_head/tail bits from the page |
@@ -427,7 +483,7 @@ sum of mapcount of all sub-pages plus one (split_huge_page caller must | |||
427 | have reference for head page). | 483 | have reference for head page). |
428 | 484 | ||
429 | split_huge_page uses migration entries to stabilize page->_refcount and | 485 | split_huge_page uses migration entries to stabilize page->_refcount and |
430 | page->_mapcount. | 486 | page->_mapcount of anonymous pages. File pages just got unmapped. |
431 | 487 | ||
432 | We safe against physical memory scanners too: the only legitimate way | 488 | We safe against physical memory scanners too: the only legitimate way |
433 | scanner can get reference to a page is get_page_unless_zero(). | 489 | scanner can get reference to a page is get_page_unless_zero(). |
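A userspace sketch of the new tmpfs policy combined with the existing madvise hint follows; it is illustrative only and assumes root privileges, a kernel built with CONFIG_TRANSPARENT_HUGE_PAGECACHE, and an existing /mnt/hugetmp directory.

```c
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4UL << 20;		/* 4 MiB file, room for two PMD pages */
	void *buf;
	int fd;

	/* mount a tmpfs instance with the huge page policy documented above */
	if (mount("tmpfs", "/mnt/hugetmp", "tmpfs", 0, "huge=always,size=1G"))
		return 1;

	fd = open("/mnt/hugetmp/file", O_CREAT | O_RDWR, 0600);
	if (fd < 0 || ftruncate(fd, len))
		return 1;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (buf == MAP_FAILED)
		return 1;

	/* hint that huge pages are welcome for this file mapping as well */
	madvise(buf, len, MADV_HUGEPAGE);
	return 0;
}
```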
diff --git a/Documentation/vm/unevictable-lru.txt b/Documentation/vm/unevictable-lru.txt index fa3b527086fa..0026a8d33fc0 100644 --- a/Documentation/vm/unevictable-lru.txt +++ b/Documentation/vm/unevictable-lru.txt | |||
@@ -461,6 +461,27 @@ unevictable LRU is enabled, the work of compaction is mostly handled by | |||
461 | the page migration code and the same work flow as described in MIGRATING | 461 | the page migration code and the same work flow as described in MIGRATING |
462 | MLOCKED PAGES will apply. | 462 | MLOCKED PAGES will apply. |
463 | 463 | ||
464 | MLOCKING TRANSPARENT HUGE PAGES | ||
465 | ------------------------------- | ||
466 | |||
467 | A transparent huge page is represented by a single entry on an LRU list. | ||
468 | Therefore, we can only make unevictable an entire compound page, not | ||
469 | individual subpages. | ||
470 | |||
471 | If a user tries to mlock() part of a huge page, we want the rest of the | ||
472 | page to be reclaimable. | ||
473 | |||
474 | We cannot just split the page on partial mlock() as split_huge_page() can | ||
475 | fail, and a new intermittent failure mode for the syscall is undesirable. | ||
476 | |||
477 | We handle this by keeping PTE-mapped huge pages on normal LRU lists: the | ||
478 | PMD on the border of a VM_LOCKED VMA will be split into a PTE table. | ||
479 | |||
480 | This way the huge page is accessible for vmscan. Under memory pressure the | ||
481 | page will be split, subpages which belong to VM_LOCKED VMAs will be moved | ||
482 | to unevictable LRU and the rest can be reclaimed. | ||
483 | |||
484 | See also comment in follow_trans_huge_pmd(). | ||
464 | 485 | ||
465 | mmap(MAP_LOCKED) SYSTEM CALL HANDLING | 486 | mmap(MAP_LOCKED) SYSTEM CALL HANDLING |
466 | ------------------------------------- | 487 | ------------------------------------- |
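To make the partial-mlock case above concrete, here is a small userspace sketch; the sizes assume x86-64 (4 KiB base pages, 2 MiB PMD) and the alignment trick is illustrative rather than part of this patch.

```c
#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

int main(void)
{
	size_t thp = 2UL << 20;		/* typical PMD-sized THP */
	char *raw, *buf;

	/* over-map so we can pick a PMD-aligned start for the huge page */
	raw = mmap(NULL, 2 * thp, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED)
		return 1;
	buf = (char *)(((uintptr_t)raw + thp - 1) & ~(uintptr_t)(thp - 1));

	madvise(buf, thp, MADV_HUGEPAGE);	/* ask for a THP here */
	buf[0] = 1;				/* fault it in */

	/*
	 * Lock only the first base page: per the text above, the kernel keeps
	 * the compound page on the normal LRU and splits the PMD at the
	 * VM_LOCKED boundary instead of splitting the huge page itself.
	 */
	mlock(buf, 4096);
	return 0;
}
```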
diff --git a/Makefile b/Makefile | |||
@@ -647,41 +647,28 @@ ifneq ($(CONFIG_FRAME_WARN),0) | |||
647 | KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN}) | 647 | KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN}) |
648 | endif | 648 | endif |
649 | 649 | ||
650 | # Handle stack protector mode. | 650 | # This selects the stack protector compiler flag. Testing it is delayed |
651 | # | 651 | # until after .config has been reprocessed, in the prepare-compiler-check |
652 | # Since kbuild can potentially perform two passes (first with the old | 652 | # target. |
653 | # .config values and then with updated .config values), we cannot error out | ||
654 | # if a desired compiler option is unsupported. If we were to error, kbuild | ||
655 | # could never get to the second pass and actually notice that we changed | ||
656 | # the option to something that was supported. | ||
657 | # | ||
658 | # Additionally, we don't want to fallback and/or silently change which compiler | ||
659 | # flags will be used, since that leads to producing kernels with different | ||
660 | # security feature characteristics depending on the compiler used. ("But I | ||
661 | # selected CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!") | ||
662 | # | ||
663 | # The middle ground is to warn here so that the failed option is obvious, but | ||
664 | # to let the build fail with bad compiler flags so that we can't produce a | ||
665 | # kernel when there is a CONFIG and compiler mismatch. | ||
666 | # | ||
667 | ifdef CONFIG_CC_STACKPROTECTOR_REGULAR | 653 | ifdef CONFIG_CC_STACKPROTECTOR_REGULAR |
668 | stackp-flag := -fstack-protector | 654 | stackp-flag := -fstack-protector |
669 | ifeq ($(call cc-option, $(stackp-flag)),) | 655 | stackp-name := REGULAR |
670 | $(warning Cannot use CONFIG_CC_STACKPROTECTOR_REGULAR: \ | ||
671 | -fstack-protector not supported by compiler) | ||
672 | endif | ||
673 | else | 656 | else |
674 | ifdef CONFIG_CC_STACKPROTECTOR_STRONG | 657 | ifdef CONFIG_CC_STACKPROTECTOR_STRONG |
675 | stackp-flag := -fstack-protector-strong | 658 | stackp-flag := -fstack-protector-strong |
676 | ifeq ($(call cc-option, $(stackp-flag)),) | 659 | stackp-name := STRONG |
677 | $(warning Cannot use CONFIG_CC_STACKPROTECTOR_STRONG: \ | ||
678 | -fstack-protector-strong not supported by compiler) | ||
679 | endif | ||
680 | else | 660 | else |
681 | # Force off for distro compilers that enable stack protector by default. | 661 | # Force off for distro compilers that enable stack protector by default. |
682 | stackp-flag := $(call cc-option, -fno-stack-protector) | 662 | stackp-flag := $(call cc-option, -fno-stack-protector) |
683 | endif | 663 | endif |
684 | endif | 664 | endif |
665 | # Find arch-specific stack protector compiler sanity-checking script. | ||
666 | ifdef CONFIG_CC_STACKPROTECTOR | ||
667 | stackp-path := $(srctree)/scripts/gcc-$(ARCH)_$(BITS)-has-stack-protector.sh | ||
668 | ifneq ($(wildcard $(stackp-path)),) | ||
669 | stackp-check := $(stackp-path) | ||
670 | endif | ||
671 | endif | ||
685 | KBUILD_CFLAGS += $(stackp-flag) | 672 | KBUILD_CFLAGS += $(stackp-flag) |
686 | 673 | ||
687 | ifdef CONFIG_KCOV | 674 | ifdef CONFIG_KCOV |
@@ -1017,8 +1004,10 @@ ifneq ($(KBUILD_SRC),) | |||
1017 | fi; | 1004 | fi; |
1018 | endif | 1005 | endif |
1019 | 1006 | ||
1020 | # prepare2 creates a makefile if using a separate output directory | 1007 | # prepare2 creates a makefile if using a separate output directory. |
1021 | prepare2: prepare3 outputmakefile asm-generic | 1008 | # From this point forward, .config has been reprocessed, so any rules |
1009 | # that need to depend on updated CONFIG_* values can be checked here. | ||
1010 | prepare2: prepare3 prepare-compiler-check outputmakefile asm-generic | ||
1022 | 1011 | ||
1023 | prepare1: prepare2 $(version_h) include/generated/utsrelease.h \ | 1012 | prepare1: prepare2 $(version_h) include/generated/utsrelease.h \ |
1024 | include/config/auto.conf | 1013 | include/config/auto.conf |
@@ -1049,6 +1038,32 @@ endif | |||
1049 | PHONY += prepare-objtool | 1038 | PHONY += prepare-objtool |
1050 | prepare-objtool: $(objtool_target) | 1039 | prepare-objtool: $(objtool_target) |
1051 | 1040 | ||
1041 | # Check for CONFIG flags that require compiler support. Abort the build | ||
1042 | # after .config has been processed, but before the kernel build starts. | ||
1043 | # | ||
1044 | # For security-sensitive CONFIG options, we don't want to fallback and/or | ||
1045 | # silently change which compiler flags will be used, since that leads to | ||
1046 | # producing kernels with different security feature characteristics | ||
1047 | # depending on the compiler used. (For example, "But I selected | ||
1048 | # CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!") | ||
1049 | PHONY += prepare-compiler-check | ||
1050 | prepare-compiler-check: FORCE | ||
1051 | # Make sure compiler supports requested stack protector flag. | ||
1052 | ifdef stackp-name | ||
1053 | ifeq ($(call cc-option, $(stackp-flag)),) | ||
1054 | @echo Cannot use CONFIG_CC_STACKPROTECTOR_$(stackp-name): \ | ||
1055 | $(stackp-flag) not supported by compiler >&2 && exit 1 | ||
1056 | endif | ||
1057 | endif | ||
1058 | # Make sure compiler does not have buggy stack-protector support. | ||
1059 | ifdef stackp-check | ||
1060 | ifneq ($(shell $(CONFIG_SHELL) $(stackp-check) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y) | ||
1061 | @echo Cannot use CONFIG_CC_STACKPROTECTOR_$(stackp-name): \ | ||
1062 | $(stackp-flag) available but compiler is broken >&2 && exit 1 | ||
1063 | endif | ||
1064 | endif | ||
1065 | @: | ||
1066 | |||
1052 | # Generate some files | 1067 | # Generate some files |
1053 | # --------------------------------------------------------------------------- | 1068 | # --------------------------------------------------------------------------- |
1054 | 1069 | ||
diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c index 4a905bd667e2..83e9eee57a55 100644 --- a/arch/alpha/mm/fault.c +++ b/arch/alpha/mm/fault.c | |||
@@ -147,7 +147,7 @@ retry: | |||
147 | /* If for any reason at all we couldn't handle the fault, | 147 | /* If for any reason at all we couldn't handle the fault, |
148 | make sure we exit gracefully rather than endlessly redo | 148 | make sure we exit gracefully rather than endlessly redo |
149 | the fault. */ | 149 | the fault. */ |
150 | fault = handle_mm_fault(mm, vma, address, flags); | 150 | fault = handle_mm_fault(vma, address, flags); |
151 | 151 | ||
152 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 152 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
153 | return; | 153 | return; |
diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c index af63f4a13e60..e94e5aa33985 100644 --- a/arch/arc/mm/fault.c +++ b/arch/arc/mm/fault.c | |||
@@ -137,7 +137,7 @@ good_area: | |||
137 | * make sure we exit gracefully rather than endlessly redo | 137 | * make sure we exit gracefully rather than endlessly redo |
138 | * the fault. | 138 | * the fault. |
139 | */ | 139 | */ |
140 | fault = handle_mm_fault(mm, vma, address, flags); | 140 | fault = handle_mm_fault(vma, address, flags); |
141 | 141 | ||
142 | /* If Pagefault was interrupted by SIGKILL, exit page fault "early" */ | 142 | /* If Pagefault was interrupted by SIGKILL, exit page fault "early" */ |
143 | if (unlikely(fatal_signal_pending(current))) { | 143 | if (unlikely(fatal_signal_pending(current))) { |
diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index 20febb368844..b2902a5cd780 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h | |||
@@ -57,7 +57,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) | |||
57 | extern pgd_t *pgd_alloc(struct mm_struct *mm); | 57 | extern pgd_t *pgd_alloc(struct mm_struct *mm); |
58 | extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); | 58 | extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); |
59 | 59 | ||
60 | #define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) | 60 | #define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) |
61 | 61 | ||
62 | static inline void clean_pte_table(pte_t *pte) | 62 | static inline void clean_pte_table(pte_t *pte) |
63 | { | 63 | { |
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 3cadb726ec88..1e25cd80589e 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h | |||
@@ -209,17 +209,38 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) | |||
209 | tlb_flush(tlb); | 209 | tlb_flush(tlb); |
210 | } | 210 | } |
211 | 211 | ||
212 | static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | 212 | static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) |
213 | { | 213 | { |
214 | if (tlb->nr == tlb->max) | ||
215 | return true; | ||
214 | tlb->pages[tlb->nr++] = page; | 216 | tlb->pages[tlb->nr++] = page; |
215 | VM_BUG_ON(tlb->nr > tlb->max); | 217 | return false; |
216 | return tlb->max - tlb->nr; | ||
217 | } | 218 | } |
218 | 219 | ||
219 | static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) | 220 | static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) |
220 | { | 221 | { |
221 | if (!__tlb_remove_page(tlb, page)) | 222 | if (__tlb_remove_page(tlb, page)) { |
222 | tlb_flush_mmu(tlb); | 223 | tlb_flush_mmu(tlb); |
224 | __tlb_remove_page(tlb, page); | ||
225 | } | ||
226 | } | ||
227 | |||
228 | static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, | ||
229 | struct page *page, int page_size) | ||
230 | { | ||
231 | return __tlb_remove_page(tlb, page); | ||
232 | } | ||
233 | |||
234 | static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, | ||
235 | struct page *page) | ||
236 | { | ||
237 | return __tlb_remove_page(tlb, page); | ||
238 | } | ||
239 | |||
240 | static inline void tlb_remove_page_size(struct mmu_gather *tlb, | ||
241 | struct page *page, int page_size) | ||
242 | { | ||
243 | return tlb_remove_page(tlb, page); | ||
223 | } | 244 | } |
224 | 245 | ||
225 | static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, | 246 | static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, |
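The new contract is easiest to read from a caller's perspective: __tlb_remove_page() now returns true when the batch is full instead of BUG-ing on overflow, so the caller flushes and re-adds the page. The helper below is illustrative and simply mirrors what the updated tlb_remove_page() wrapper above does.

```c
#include <asm/tlb.h>

/* example_remove_one() is illustrative, not part of this patch */
static inline void example_remove_one(struct mmu_gather *tlb, struct page *page)
{
	if (__tlb_remove_page(tlb, page)) {
		/* batch was full: flush pending pages, then queue this one */
		tlb_flush_mmu(tlb);
		__tlb_remove_page(tlb, page);
	}
}
```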
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index ad5841856007..3a2e678b8d30 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c | |||
@@ -243,7 +243,7 @@ good_area: | |||
243 | goto out; | 243 | goto out; |
244 | } | 244 | } |
245 | 245 | ||
246 | return handle_mm_fault(mm, vma, addr & PAGE_MASK, flags); | 246 | return handle_mm_fault(vma, addr & PAGE_MASK, flags); |
247 | 247 | ||
248 | check_stack: | 248 | check_stack: |
249 | /* Don't allow expansion below FIRST_USER_ADDRESS */ | 249 | /* Don't allow expansion below FIRST_USER_ADDRESS */ |
diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index b8d477321730..c1c1a5c67da1 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c | |||
@@ -23,7 +23,7 @@ | |||
23 | #define __pgd_alloc() kmalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL) | 23 | #define __pgd_alloc() kmalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL) |
24 | #define __pgd_free(pgd) kfree(pgd) | 24 | #define __pgd_free(pgd) kfree(pgd) |
25 | #else | 25 | #else |
26 | #define __pgd_alloc() (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_REPEAT, 2) | 26 | #define __pgd_alloc() (pgd_t *)__get_free_pages(GFP_KERNEL, 2) |
27 | #define __pgd_free(pgd) free_pages((unsigned long)pgd, 2) | 27 | #define __pgd_free(pgd) free_pages((unsigned long)pgd, 2) |
28 | #endif | 28 | #endif |
29 | 29 | ||
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index b1166d1e5955..031820d989a8 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c | |||
@@ -233,7 +233,7 @@ good_area: | |||
233 | goto out; | 233 | goto out; |
234 | } | 234 | } |
235 | 235 | ||
236 | return handle_mm_fault(mm, vma, addr & PAGE_MASK, mm_flags); | 236 | return handle_mm_fault(vma, addr & PAGE_MASK, mm_flags); |
237 | 237 | ||
238 | check_stack: | 238 | check_stack: |
239 | if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr)) | 239 | if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr)) |
diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c index c03533937a9f..a4b7edac8f10 100644 --- a/arch/avr32/mm/fault.c +++ b/arch/avr32/mm/fault.c | |||
@@ -134,7 +134,7 @@ good_area: | |||
134 | * sure we exit gracefully rather than endlessly redo the | 134 | * sure we exit gracefully rather than endlessly redo the |
135 | * fault. | 135 | * fault. |
136 | */ | 136 | */ |
137 | fault = handle_mm_fault(mm, vma, address, flags); | 137 | fault = handle_mm_fault(vma, address, flags); |
138 | 138 | ||
139 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 139 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
140 | return; | 140 | return; |
diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c index 3066d40a6db1..112ef26c7f2e 100644 --- a/arch/cris/mm/fault.c +++ b/arch/cris/mm/fault.c | |||
@@ -168,7 +168,7 @@ retry: | |||
168 | * the fault. | 168 | * the fault. |
169 | */ | 169 | */ |
170 | 170 | ||
171 | fault = handle_mm_fault(mm, vma, address, flags); | 171 | fault = handle_mm_fault(vma, address, flags); |
172 | 172 | ||
173 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 173 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
174 | return; | 174 | return; |
diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c index 61d99767fe16..614a46c413d2 100644 --- a/arch/frv/mm/fault.c +++ b/arch/frv/mm/fault.c | |||
@@ -164,7 +164,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear | |||
164 | * make sure we exit gracefully rather than endlessly redo | 164 | * make sure we exit gracefully rather than endlessly redo |
165 | * the fault. | 165 | * the fault. |
166 | */ | 166 | */ |
167 | fault = handle_mm_fault(mm, vma, ear0, flags); | 167 | fault = handle_mm_fault(vma, ear0, flags); |
168 | if (unlikely(fault & VM_FAULT_ERROR)) { | 168 | if (unlikely(fault & VM_FAULT_ERROR)) { |
169 | if (fault & VM_FAULT_OOM) | 169 | if (fault & VM_FAULT_OOM) |
170 | goto out_of_memory; | 170 | goto out_of_memory; |
diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c index 8704c9320032..bd7c251e2bce 100644 --- a/arch/hexagon/mm/vm_fault.c +++ b/arch/hexagon/mm/vm_fault.c | |||
@@ -101,7 +101,7 @@ good_area: | |||
101 | break; | 101 | break; |
102 | } | 102 | } |
103 | 103 | ||
104 | fault = handle_mm_fault(mm, vma, address, flags); | 104 | fault = handle_mm_fault(vma, address, flags); |
105 | 105 | ||
106 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 106 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
107 | return; | 107 | return; |
diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index 39d64e0df1de..77e541cf0e5d 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h | |||
@@ -205,17 +205,18 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) | |||
205 | * must be delayed until after the TLB has been flushed (see comments at the beginning of | 205 | * must be delayed until after the TLB has been flushed (see comments at the beginning of |
206 | * this file). | 206 | * this file). |
207 | */ | 207 | */ |
208 | static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | 208 | static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) |
209 | { | 209 | { |
210 | if (tlb->nr == tlb->max) | ||
211 | return true; | ||
212 | |||
210 | tlb->need_flush = 1; | 213 | tlb->need_flush = 1; |
211 | 214 | ||
212 | if (!tlb->nr && tlb->pages == tlb->local) | 215 | if (!tlb->nr && tlb->pages == tlb->local) |
213 | __tlb_alloc_page(tlb); | 216 | __tlb_alloc_page(tlb); |
214 | 217 | ||
215 | tlb->pages[tlb->nr++] = page; | 218 | tlb->pages[tlb->nr++] = page; |
216 | VM_BUG_ON(tlb->nr > tlb->max); | 219 | return false; |
217 | |||
218 | return tlb->max - tlb->nr; | ||
219 | } | 220 | } |
220 | 221 | ||
221 | static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) | 222 | static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) |
@@ -235,8 +236,28 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) | |||
235 | 236 | ||
236 | static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) | 237 | static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) |
237 | { | 238 | { |
238 | if (!__tlb_remove_page(tlb, page)) | 239 | if (__tlb_remove_page(tlb, page)) { |
239 | tlb_flush_mmu(tlb); | 240 | tlb_flush_mmu(tlb); |
241 | __tlb_remove_page(tlb, page); | ||
242 | } | ||
243 | } | ||
244 | |||
245 | static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, | ||
246 | struct page *page, int page_size) | ||
247 | { | ||
248 | return __tlb_remove_page(tlb, page); | ||
249 | } | ||
250 | |||
251 | static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, | ||
252 | struct page *page) | ||
253 | { | ||
254 | return __tlb_remove_page(tlb, page); | ||
255 | } | ||
256 | |||
257 | static inline void tlb_remove_page_size(struct mmu_gather *tlb, | ||
258 | struct page *page, int page_size) | ||
259 | { | ||
260 | return tlb_remove_page(tlb, page); | ||
240 | } | 261 | } |
241 | 262 | ||
242 | /* | 263 | /* |
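The ia64 hunk above changes the batching contract: __tlb_remove_page() now returns true when the gather array is already full (without queueing the page), and tlb_remove_page() responds by flushing and retrying, replacing the old "remaining slots" return value. The standalone C sketch below models that handshake with plain ints in place of struct page pointers; it is illustrative only and uses a made-up batch size.

#include <stdbool.h>
#include <stdio.h>

#define BATCH_MAX 4

struct gather {
	int pages[BATCH_MAX];
	int nr;
};

/* Mirrors the new __tlb_remove_page(): true means "flush first, then retry". */
static bool model_tlb_remove_page(struct gather *tlb, int page)
{
	if (tlb->nr == BATCH_MAX)
		return true;
	tlb->pages[tlb->nr++] = page;
	return false;
}

static void model_flush(struct gather *tlb)
{
	printf("flushing %d pages\n", tlb->nr);
	tlb->nr = 0;
}

/* Mirrors the new tlb_remove_page(): flush on a full batch, then requeue. */
static void model_remove_page(struct gather *tlb, int page)
{
	if (model_tlb_remove_page(tlb, page)) {
		model_flush(tlb);
		model_tlb_remove_page(tlb, page);
	}
}

int main(void)
{
	struct gather tlb = { .nr = 0 };

	for (int page = 0; page < 10; page++)
		model_remove_page(&tlb, page);
	model_flush(&tlb);	/* flush the final partial batch */
	return 0;
}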
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c index 70b40d1205a6..fa6ad95e992e 100644 --- a/arch/ia64/mm/fault.c +++ b/arch/ia64/mm/fault.c | |||
@@ -159,7 +159,7 @@ retry: | |||
159 | * sure we exit gracefully rather than endlessly redo the | 159 | * sure we exit gracefully rather than endlessly redo the |
160 | * fault. | 160 | * fault. |
161 | */ | 161 | */ |
162 | fault = handle_mm_fault(mm, vma, address, flags); | 162 | fault = handle_mm_fault(vma, address, flags); |
163 | 163 | ||
164 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 164 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
165 | return; | 165 | return; |
diff --git a/arch/m32r/kernel/m32r_ksyms.c b/arch/m32r/kernel/m32r_ksyms.c index b727e693c805..23f26f4adfff 100644 --- a/arch/m32r/kernel/m32r_ksyms.c +++ b/arch/m32r/kernel/m32r_ksyms.c | |||
@@ -41,6 +41,9 @@ EXPORT_SYMBOL(cpu_data); | |||
41 | EXPORT_SYMBOL(smp_flush_tlb_page); | 41 | EXPORT_SYMBOL(smp_flush_tlb_page); |
42 | #endif | 42 | #endif |
43 | 43 | ||
44 | extern int __ucmpdi2(unsigned long long a, unsigned long long b); | ||
45 | EXPORT_SYMBOL(__ucmpdi2); | ||
46 | |||
44 | /* compiler generated symbol */ | 47 | /* compiler generated symbol */ |
45 | extern void __ashldi3(void); | 48 | extern void __ashldi3(void); |
46 | extern void __ashrdi3(void); | 49 | extern void __ashrdi3(void); |
diff --git a/arch/m32r/lib/Makefile b/arch/m32r/lib/Makefile index d16b4e40d1ae..5889eb9610b5 100644 --- a/arch/m32r/lib/Makefile +++ b/arch/m32r/lib/Makefile | |||
@@ -3,5 +3,5 @@ | |||
3 | # | 3 | # |
4 | 4 | ||
5 | lib-y := checksum.o ashxdi3.o memset.o memcpy.o \ | 5 | lib-y := checksum.o ashxdi3.o memset.o memcpy.o \ |
6 | delay.o strlen.o usercopy.o csum_partial_copy.o | 6 | delay.o strlen.o usercopy.o csum_partial_copy.o \ |
7 | 7 | ucmpdi2.o | |
diff --git a/arch/m32r/lib/libgcc.h b/arch/m32r/lib/libgcc.h new file mode 100644 index 000000000000..267aa435bc35 --- /dev/null +++ b/arch/m32r/lib/libgcc.h | |||
@@ -0,0 +1,23 @@ | |||
1 | #ifndef __ASM_LIBGCC_H | ||
2 | #define __ASM_LIBGCC_H | ||
3 | |||
4 | #include <asm/byteorder.h> | ||
5 | |||
6 | #ifdef __BIG_ENDIAN | ||
7 | struct DWstruct { | ||
8 | int high, low; | ||
9 | }; | ||
10 | #elif defined(__LITTLE_ENDIAN) | ||
11 | struct DWstruct { | ||
12 | int low, high; | ||
13 | }; | ||
14 | #else | ||
15 | #error I feel sick. | ||
16 | #endif | ||
17 | |||
18 | typedef union { | ||
19 | struct DWstruct s; | ||
20 | long long ll; | ||
21 | } DWunion; | ||
22 | |||
23 | #endif /* __ASM_LIBGCC_H */ | ||
diff --git a/arch/m32r/lib/ucmpdi2.c b/arch/m32r/lib/ucmpdi2.c new file mode 100644 index 000000000000..9d3c682c89b5 --- /dev/null +++ b/arch/m32r/lib/ucmpdi2.c | |||
@@ -0,0 +1,17 @@ | |||
1 | #include "libgcc.h" | ||
2 | |||
3 | int __ucmpdi2(unsigned long long a, unsigned long long b) | ||
4 | { | ||
5 | const DWunion au = {.ll = a}; | ||
6 | const DWunion bu = {.ll = b}; | ||
7 | |||
8 | if ((unsigned int)au.s.high < (unsigned int)bu.s.high) | ||
9 | return 0; | ||
10 | else if ((unsigned int)au.s.high > (unsigned int)bu.s.high) | ||
11 | return 2; | ||
12 | if ((unsigned int)au.s.low < (unsigned int)bu.s.low) | ||
13 | return 0; | ||
14 | else if ((unsigned int)au.s.low > (unsigned int)bu.s.low) | ||
15 | return 2; | ||
16 | return 1; | ||
17 | } | ||
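For reference, __ucmpdi2 is the libgcc helper the compiler emits for unsigned 64-bit comparisons on 32-bit targets; it returns 0, 1 or 2 for a < b, a == b and a > b respectively. The hunks above add an in-kernel copy for m32r and export it. The host-side test below mirrors that logic (little-endian field order, unsigned fields instead of the casts used in the patch) purely as an illustration of the contract.

#include <assert.h>
#include <stdio.h>

struct dwstruct {
	unsigned int low, high;		/* little-endian layout assumed */
};

typedef union {
	struct dwstruct s;
	unsigned long long ll;
} dwunion;

static int model_ucmpdi2(unsigned long long a, unsigned long long b)
{
	const dwunion au = { .ll = a };
	const dwunion bu = { .ll = b };

	if (au.s.high < bu.s.high)
		return 0;
	if (au.s.high > bu.s.high)
		return 2;
	if (au.s.low < bu.s.low)
		return 0;
	if (au.s.low > bu.s.low)
		return 2;
	return 1;
}

int main(void)
{
	assert(model_ucmpdi2(1ULL, 2ULL) == 0);
	assert(model_ucmpdi2(1ULL << 40, 1ULL << 40) == 1);
	assert(model_ucmpdi2(1ULL << 40, 5ULL) == 2);
	printf("__ucmpdi2 contract: 0 = less, 1 = equal, 2 = greater\n");
	return 0;
}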
diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c index 8f9875b7933d..a3785d3644c2 100644 --- a/arch/m32r/mm/fault.c +++ b/arch/m32r/mm/fault.c | |||
@@ -196,7 +196,7 @@ good_area: | |||
196 | */ | 196 | */ |
197 | addr = (address & PAGE_MASK); | 197 | addr = (address & PAGE_MASK); |
198 | set_thread_fault_code(error_code); | 198 | set_thread_fault_code(error_code); |
199 | fault = handle_mm_fault(mm, vma, addr, flags); | 199 | fault = handle_mm_fault(vma, addr, flags); |
200 | if (unlikely(fault & VM_FAULT_ERROR)) { | 200 | if (unlikely(fault & VM_FAULT_ERROR)) { |
201 | if (fault & VM_FAULT_OOM) | 201 | if (fault & VM_FAULT_OOM) |
202 | goto out_of_memory; | 202 | goto out_of_memory; |
diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 6a94cdd0c830..bd66a0b20c6b 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c | |||
@@ -136,7 +136,7 @@ good_area: | |||
136 | * the fault. | 136 | * the fault. |
137 | */ | 137 | */ |
138 | 138 | ||
139 | fault = handle_mm_fault(mm, vma, address, flags); | 139 | fault = handle_mm_fault(vma, address, flags); |
140 | pr_debug("handle_mm_fault returns %d\n", fault); | 140 | pr_debug("handle_mm_fault returns %d\n", fault); |
141 | 141 | ||
142 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 142 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c index f57edca63609..372783a67dda 100644 --- a/arch/metag/mm/fault.c +++ b/arch/metag/mm/fault.c | |||
@@ -133,7 +133,7 @@ good_area: | |||
133 | * make sure we exit gracefully rather than endlessly redo | 133 | * make sure we exit gracefully rather than endlessly redo |
134 | * the fault. | 134 | * the fault. |
135 | */ | 135 | */ |
136 | fault = handle_mm_fault(mm, vma, address, flags); | 136 | fault = handle_mm_fault(vma, address, flags); |
137 | 137 | ||
138 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 138 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
139 | return 0; | 139 | return 0; |
diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c index 177dfc003643..abb678ccde6f 100644 --- a/arch/microblaze/mm/fault.c +++ b/arch/microblaze/mm/fault.c | |||
@@ -216,7 +216,7 @@ good_area: | |||
216 | * make sure we exit gracefully rather than endlessly redo | 216 | * make sure we exit gracefully rather than endlessly redo |
217 | * the fault. | 217 | * the fault. |
218 | */ | 218 | */ |
219 | fault = handle_mm_fault(mm, vma, address, flags); | 219 | fault = handle_mm_fault(vma, address, flags); |
220 | 220 | ||
221 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 221 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
222 | return; | 222 | return; |
diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c index 4b88fa031891..9560ad731120 100644 --- a/arch/mips/mm/fault.c +++ b/arch/mips/mm/fault.c | |||
@@ -153,7 +153,7 @@ good_area: | |||
153 | * make sure we exit gracefully rather than endlessly redo | 153 | * make sure we exit gracefully rather than endlessly redo |
154 | * the fault. | 154 | * the fault. |
155 | */ | 155 | */ |
156 | fault = handle_mm_fault(mm, vma, address, flags); | 156 | fault = handle_mm_fault(vma, address, flags); |
157 | 157 | ||
158 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 158 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
159 | return; | 159 | return; |
diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index 4a1d181ed32f..f23781d6bbb3 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c | |||
@@ -254,7 +254,7 @@ good_area: | |||
254 | * make sure we exit gracefully rather than endlessly redo | 254 | * make sure we exit gracefully rather than endlessly redo |
255 | * the fault. | 255 | * the fault. |
256 | */ | 256 | */ |
257 | fault = handle_mm_fault(mm, vma, address, flags); | 257 | fault = handle_mm_fault(vma, address, flags); |
258 | 258 | ||
259 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 259 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
260 | return; | 260 | return; |
diff --git a/arch/nios2/mm/fault.c b/arch/nios2/mm/fault.c index b51878b0c6b8..affc4eb3f89e 100644 --- a/arch/nios2/mm/fault.c +++ b/arch/nios2/mm/fault.c | |||
@@ -131,7 +131,7 @@ good_area: | |||
131 | * make sure we exit gracefully rather than endlessly redo | 131 | * make sure we exit gracefully rather than endlessly redo |
132 | * the fault. | 132 | * the fault. |
133 | */ | 133 | */ |
134 | fault = handle_mm_fault(mm, vma, address, flags); | 134 | fault = handle_mm_fault(vma, address, flags); |
135 | 135 | ||
136 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 136 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
137 | return; | 137 | return; |
diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c index 230ac20ae794..e94cd225e816 100644 --- a/arch/openrisc/mm/fault.c +++ b/arch/openrisc/mm/fault.c | |||
@@ -163,7 +163,7 @@ good_area: | |||
163 | * the fault. | 163 | * the fault. |
164 | */ | 164 | */ |
165 | 165 | ||
166 | fault = handle_mm_fault(mm, vma, address, flags); | 166 | fault = handle_mm_fault(vma, address, flags); |
167 | 167 | ||
168 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 168 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
169 | return; | 169 | return; |
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index 16dbe81c97c9..163af2c31d76 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c | |||
@@ -239,7 +239,7 @@ good_area: | |||
239 | * fault. | 239 | * fault. |
240 | */ | 240 | */ |
241 | 241 | ||
242 | fault = handle_mm_fault(mm, vma, address, flags); | 242 | fault = handle_mm_fault(vma, address, flags); |
243 | 243 | ||
244 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 244 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
245 | return; | 245 | return; |
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index ee09e99097f0..9bd87f269d6d 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h | |||
@@ -71,10 +71,8 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, | |||
71 | static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, | 71 | static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, |
72 | bool *is_thp, unsigned *shift) | 72 | bool *is_thp, unsigned *shift) |
73 | { | 73 | { |
74 | if (!arch_irqs_disabled()) { | 74 | VM_WARN(!arch_irqs_disabled(), |
75 | pr_info("%s called with irq enabled\n", __func__); | 75 | "%s called with irq enabled\n", __func__); |
76 | dump_stack(); | ||
77 | } | ||
78 | return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift); | 76 | return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift); |
79 | } | 77 | } |
80 | 78 | ||
diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index 6527882ce05e..bb0354222b11 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c | |||
@@ -75,7 +75,7 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, | |||
75 | } | 75 | } |
76 | 76 | ||
77 | ret = 0; | 77 | ret = 0; |
78 | *flt = handle_mm_fault(mm, vma, ea, is_write ? FAULT_FLAG_WRITE : 0); | 78 | *flt = handle_mm_fault(vma, ea, is_write ? FAULT_FLAG_WRITE : 0); |
79 | if (unlikely(*flt & VM_FAULT_ERROR)) { | 79 | if (unlikely(*flt & VM_FAULT_ERROR)) { |
80 | if (*flt & VM_FAULT_OOM) { | 80 | if (*flt & VM_FAULT_OOM) { |
81 | ret = -ENOMEM; | 81 | ret = -ENOMEM; |
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index a67c6d781c52..a4db22f65021 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c | |||
@@ -429,7 +429,7 @@ good_area: | |||
429 | * make sure we exit gracefully rather than endlessly redo | 429 | * make sure we exit gracefully rather than endlessly redo |
430 | * the fault. | 430 | * the fault. |
431 | */ | 431 | */ |
432 | fault = handle_mm_fault(mm, vma, address, flags); | 432 | fault = handle_mm_fault(vma, address, flags); |
433 | if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { | 433 | if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { |
434 | if (fault & VM_FAULT_SIGSEGV) | 434 | if (fault & VM_FAULT_SIGSEGV) |
435 | goto bad_area; | 435 | goto bad_area; |
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 7a92e69c50bc..15711de10403 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h | |||
@@ -87,10 +87,10 @@ static inline void tlb_finish_mmu(struct mmu_gather *tlb, | |||
87 | * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page | 87 | * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page |
88 | * has already been freed, so just do free_page_and_swap_cache. | 88 | * has already been freed, so just do free_page_and_swap_cache. |
89 | */ | 89 | */ |
90 | static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | 90 | static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) |
91 | { | 91 | { |
92 | free_page_and_swap_cache(page); | 92 | free_page_and_swap_cache(page); |
93 | return 1; /* avoid calling tlb_flush_mmu */ | 93 | return false; /* avoid calling tlb_flush_mmu */ |
94 | } | 94 | } |
95 | 95 | ||
96 | static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) | 96 | static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) |
@@ -98,6 +98,24 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) | |||
98 | free_page_and_swap_cache(page); | 98 | free_page_and_swap_cache(page); |
99 | } | 99 | } |
100 | 100 | ||
101 | static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, | ||
102 | struct page *page, int page_size) | ||
103 | { | ||
104 | return __tlb_remove_page(tlb, page); | ||
105 | } | ||
106 | |||
107 | static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, | ||
108 | struct page *page) | ||
109 | { | ||
110 | return __tlb_remove_page(tlb, page); | ||
111 | } | ||
112 | |||
113 | static inline void tlb_remove_page_size(struct mmu_gather *tlb, | ||
114 | struct page *page, int page_size) | ||
115 | { | ||
116 | return tlb_remove_page(tlb, page); | ||
117 | } | ||
118 | |||
101 | /* | 119 | /* |
102 | * pte_free_tlb frees a pte table and clears the CRSTE for the | 120 | * pte_free_tlb frees a pte table and clears the CRSTE for the |
103 | * page table from the tlb. | 121 | * page table from the tlb. |
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 6ad7eff84c82..25783dc3c813 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c | |||
@@ -456,7 +456,7 @@ retry: | |||
456 | * make sure we exit gracefully rather than endlessly redo | 456 | * make sure we exit gracefully rather than endlessly redo |
457 | * the fault. | 457 | * the fault. |
458 | */ | 458 | */ |
459 | fault = handle_mm_fault(mm, vma, address, flags); | 459 | fault = handle_mm_fault(vma, address, flags); |
460 | /* No reason to continue if interrupted by SIGKILL. */ | 460 | /* No reason to continue if interrupted by SIGKILL. */ |
461 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { | 461 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { |
462 | fault = VM_FAULT_SIGNAL; | 462 | fault = VM_FAULT_SIGNAL; |
diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c index 37a6c2e0e969..995b71e4db4b 100644 --- a/arch/score/mm/fault.c +++ b/arch/score/mm/fault.c | |||
@@ -111,7 +111,7 @@ good_area: | |||
111 | * make sure we exit gracefully rather than endlessly redo | 111 | * make sure we exit gracefully rather than endlessly redo |
112 | * the fault. | 112 | * the fault. |
113 | */ | 113 | */ |
114 | fault = handle_mm_fault(mm, vma, address, flags); | 114 | fault = handle_mm_fault(vma, address, flags); |
115 | if (unlikely(fault & VM_FAULT_ERROR)) { | 115 | if (unlikely(fault & VM_FAULT_ERROR)) { |
116 | if (fault & VM_FAULT_OOM) | 116 | if (fault & VM_FAULT_OOM) |
117 | goto out_of_memory; | 117 | goto out_of_memory; |
diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 62f80d2a9df9..025cdb1032f6 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h | |||
@@ -101,7 +101,7 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) | |||
101 | static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | 101 | static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) |
102 | { | 102 | { |
103 | free_page_and_swap_cache(page); | 103 | free_page_and_swap_cache(page); |
104 | return 1; /* avoid calling tlb_flush_mmu */ | 104 | return false; /* avoid calling tlb_flush_mmu */ |
105 | } | 105 | } |
106 | 106 | ||
107 | static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) | 107 | static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) |
@@ -109,6 +109,24 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) | |||
109 | __tlb_remove_page(tlb, page); | 109 | __tlb_remove_page(tlb, page); |
110 | } | 110 | } |
111 | 111 | ||
112 | static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, | ||
113 | struct page *page, int page_size) | ||
114 | { | ||
115 | return __tlb_remove_page(tlb, page); | ||
116 | } | ||
117 | |||
118 | static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, | ||
119 | struct page *page) | ||
120 | { | ||
121 | return __tlb_remove_page(tlb, page); | ||
122 | } | ||
123 | |||
124 | static inline void tlb_remove_page_size(struct mmu_gather *tlb, | ||
125 | struct page *page, int page_size) | ||
126 | { | ||
127 | return tlb_remove_page(tlb, page); | ||
128 | } | ||
129 | |||
112 | #define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) | 130 | #define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) |
113 | #define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) | 131 | #define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) |
114 | #define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) | 132 | #define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) |
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c index 79d8276377d1..9bf876780cef 100644 --- a/arch/sh/mm/fault.c +++ b/arch/sh/mm/fault.c | |||
@@ -487,7 +487,7 @@ good_area: | |||
487 | * make sure we exit gracefully rather than endlessly redo | 487 | * make sure we exit gracefully rather than endlessly redo |
488 | * the fault. | 488 | * the fault. |
489 | */ | 489 | */ |
490 | fault = handle_mm_fault(mm, vma, address, flags); | 490 | fault = handle_mm_fault(vma, address, flags); |
491 | 491 | ||
492 | if (unlikely(fault & (VM_FAULT_RETRY | VM_FAULT_ERROR))) | 492 | if (unlikely(fault & (VM_FAULT_RETRY | VM_FAULT_ERROR))) |
493 | if (mm_fault_error(regs, error_code, address, fault)) | 493 | if (mm_fault_error(regs, error_code, address, fault)) |
diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index b6c559cbd64d..4714061d6cd3 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c | |||
@@ -241,7 +241,7 @@ good_area: | |||
241 | * make sure we exit gracefully rather than endlessly redo | 241 | * make sure we exit gracefully rather than endlessly redo |
242 | * the fault. | 242 | * the fault. |
243 | */ | 243 | */ |
244 | fault = handle_mm_fault(mm, vma, address, flags); | 244 | fault = handle_mm_fault(vma, address, flags); |
245 | 245 | ||
246 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 246 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
247 | return; | 247 | return; |
@@ -411,7 +411,7 @@ good_area: | |||
411 | if (!(vma->vm_flags & (VM_READ | VM_EXEC))) | 411 | if (!(vma->vm_flags & (VM_READ | VM_EXEC))) |
412 | goto bad_area; | 412 | goto bad_area; |
413 | } | 413 | } |
414 | switch (handle_mm_fault(mm, vma, address, flags)) { | 414 | switch (handle_mm_fault(vma, address, flags)) { |
415 | case VM_FAULT_SIGBUS: | 415 | case VM_FAULT_SIGBUS: |
416 | case VM_FAULT_OOM: | 416 | case VM_FAULT_OOM: |
417 | goto do_sigbus; | 417 | goto do_sigbus; |
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index cb841a33da59..6c43b924a7a2 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c | |||
@@ -436,7 +436,7 @@ good_area: | |||
436 | goto bad_area; | 436 | goto bad_area; |
437 | } | 437 | } |
438 | 438 | ||
439 | fault = handle_mm_fault(mm, vma, address, flags); | 439 | fault = handle_mm_fault(vma, address, flags); |
440 | 440 | ||
441 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 441 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
442 | goto exit_exception; | 442 | goto exit_exception; |
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c index 26734214818c..beba986589e5 100644 --- a/arch/tile/mm/fault.c +++ b/arch/tile/mm/fault.c | |||
@@ -434,7 +434,7 @@ good_area: | |||
434 | * make sure we exit gracefully rather than endlessly redo | 434 | * make sure we exit gracefully rather than endlessly redo |
435 | * the fault. | 435 | * the fault. |
436 | */ | 436 | */ |
437 | fault = handle_mm_fault(mm, vma, address, flags); | 437 | fault = handle_mm_fault(vma, address, flags); |
438 | 438 | ||
439 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 439 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
440 | return 0; | 440 | return 0; |
diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 16eb63fac57d..821ff0acfe17 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h | |||
@@ -102,7 +102,7 @@ static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | |||
102 | { | 102 | { |
103 | tlb->need_flush = 1; | 103 | tlb->need_flush = 1; |
104 | free_page_and_swap_cache(page); | 104 | free_page_and_swap_cache(page); |
105 | return 1; /* avoid calling tlb_flush_mmu */ | 105 | return false; /* avoid calling tlb_flush_mmu */ |
106 | } | 106 | } |
107 | 107 | ||
108 | static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) | 108 | static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) |
@@ -110,6 +110,24 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) | |||
110 | __tlb_remove_page(tlb, page); | 110 | __tlb_remove_page(tlb, page); |
111 | } | 111 | } |
112 | 112 | ||
113 | static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, | ||
114 | struct page *page, int page_size) | ||
115 | { | ||
116 | return __tlb_remove_page(tlb, page); | ||
117 | } | ||
118 | |||
119 | static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, | ||
120 | struct page *page) | ||
121 | { | ||
122 | return __tlb_remove_page(tlb, page); | ||
123 | } | ||
124 | |||
125 | static inline void tlb_remove_page_size(struct mmu_gather *tlb, | ||
126 | struct page *page, int page_size) | ||
127 | { | ||
128 | return tlb_remove_page(tlb, page); | ||
129 | } | ||
130 | |||
113 | /** | 131 | /** |
114 | * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. | 132 | * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. |
115 | * | 133 | * |
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 98783dd0fa2e..ad8f206ab5e8 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c | |||
@@ -73,7 +73,7 @@ good_area: | |||
73 | do { | 73 | do { |
74 | int fault; | 74 | int fault; |
75 | 75 | ||
76 | fault = handle_mm_fault(mm, vma, address, flags); | 76 | fault = handle_mm_fault(vma, address, flags); |
77 | 77 | ||
78 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 78 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
79 | goto out_nosemaphore; | 79 | goto out_nosemaphore; |
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c index 2ec3d3adcefc..6c7f70bcaae3 100644 --- a/arch/unicore32/mm/fault.c +++ b/arch/unicore32/mm/fault.c | |||
@@ -194,7 +194,7 @@ good_area: | |||
194 | * If for any reason at all we couldn't handle the fault, make | 194 | * If for any reason at all we couldn't handle the fault, make |
195 | * sure we exit gracefully rather than endlessly redo the fault. | 195 | * sure we exit gracefully rather than endlessly redo the fault. |
196 | */ | 196 | */ |
197 | fault = handle_mm_fault(mm, vma, addr & PAGE_MASK, flags); | 197 | fault = handle_mm_fault(vma, addr & PAGE_MASK, flags); |
198 | return fault; | 198 | return fault; |
199 | 199 | ||
200 | check_stack: | 200 | check_stack: |
diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 6fce7f096b88..830ed391e7ef 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile | |||
@@ -126,14 +126,6 @@ else | |||
126 | KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args) | 126 | KBUILD_CFLAGS += $(call cc-option,-maccumulate-outgoing-args) |
127 | endif | 127 | endif |
128 | 128 | ||
129 | # Make sure compiler does not have buggy stack-protector support. | ||
130 | ifdef CONFIG_CC_STACKPROTECTOR | ||
131 | cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh | ||
132 | ifneq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC) $(KBUILD_CPPFLAGS) $(biarch)),y) | ||
133 | $(warning stack-protector enabled but compiler support broken) | ||
134 | endif | ||
135 | endif | ||
136 | |||
137 | ifdef CONFIG_X86_X32 | 129 | ifdef CONFIG_X86_X32 |
138 | x32_ld_ok := $(call try-run,\ | 130 | x32_ld_ok := $(call try-run,\ |
139 | /bin/echo -e '1: .quad 1b' | \ | 131 | /bin/echo -e '1: .quad 1b' | \ |
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 574c23cf761a..b6d425999f99 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h | |||
@@ -81,7 +81,11 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, | |||
81 | static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) | 81 | static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) |
82 | { | 82 | { |
83 | struct page *page; | 83 | struct page *page; |
84 | page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0); | 84 | gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; |
85 | |||
86 | if (mm == &init_mm) | ||
87 | gfp &= ~__GFP_ACCOUNT; | ||
88 | page = alloc_pages(gfp, 0); | ||
85 | if (!page) | 89 | if (!page) |
86 | return NULL; | 90 | return NULL; |
87 | if (!pgtable_pmd_page_ctor(page)) { | 91 | if (!pgtable_pmd_page_ctor(page)) { |
@@ -125,7 +129,11 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) | |||
125 | 129 | ||
126 | static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) | 130 | static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) |
127 | { | 131 | { |
128 | return (pud_t *)get_zeroed_page(GFP_KERNEL); | 132 | gfp_t gfp = GFP_KERNEL_ACCOUNT; |
133 | |||
134 | if (mm == &init_mm) | ||
135 | gfp &= ~__GFP_ACCOUNT; | ||
136 | return (pud_t *)get_zeroed_page(gfp); | ||
129 | } | 137 | } |
130 | 138 | ||
131 | static inline void pud_free(struct mm_struct *mm, pud_t *pud) | 139 | static inline void pud_free(struct mm_struct *mm, pud_t *pud) |
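The pmd/pud allocators above switch to GFP_KERNEL_ACCOUNT so user page tables are charged to the allocating task's memory cgroup, while allocations for init_mm (kernel mappings) strip __GFP_ACCOUNT and stay unaccounted. The userspace model below shows only that masking decision; the flag values are invented for the sketch and do not correspond to real gfp bits.

#include <stdio.h>

#define MODEL_GFP_KERNEL	0x1u
#define MODEL_GFP_ACCOUNT	0x2u	/* "charge to the current memory cgroup" */
#define MODEL_GFP_ZERO		0x4u

struct mm {
	int is_init_mm;
};

static unsigned int pgtable_gfp(const struct mm *mm)
{
	unsigned int gfp = MODEL_GFP_KERNEL | MODEL_GFP_ACCOUNT | MODEL_GFP_ZERO;

	if (mm->is_init_mm)
		gfp &= ~MODEL_GFP_ACCOUNT;	/* never charge kernel page tables */
	return gfp;
}

int main(void)
{
	struct mm user_mm = { 0 }, kernel_mm = { 1 };

	printf("user page table gfp:   %#x\n", pgtable_gfp(&user_mm));
	printf("kernel page table gfp: %#x\n", pgtable_gfp(&kernel_mm));
	return 0;
}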
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index d22161ab941d..dc8023060456 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c | |||
@@ -1353,7 +1353,7 @@ good_area: | |||
1353 | * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if | 1353 | * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if |
1354 | * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. | 1354 | * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. |
1355 | */ | 1355 | */ |
1356 | fault = handle_mm_fault(mm, vma, address, flags); | 1356 | fault = handle_mm_fault(vma, address, flags); |
1357 | major |= fault & VM_FAULT_MAJOR; | 1357 | major |= fault & VM_FAULT_MAJOR; |
1358 | 1358 | ||
1359 | /* | 1359 | /* |
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index aa0ff4b02a96..3feec5af4e67 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -6,7 +6,7 @@ | |||
6 | #include <asm/fixmap.h> | 6 | #include <asm/fixmap.h> |
7 | #include <asm/mtrr.h> | 7 | #include <asm/mtrr.h> |
8 | 8 | ||
9 | #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO | 9 | #define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO) |
10 | 10 | ||
11 | #ifdef CONFIG_HIGHPTE | 11 | #ifdef CONFIG_HIGHPTE |
12 | #define PGALLOC_USER_GFP __GFP_HIGHMEM | 12 | #define PGALLOC_USER_GFP __GFP_HIGHMEM |
@@ -18,7 +18,7 @@ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; | |||
18 | 18 | ||
19 | pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) | 19 | pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) |
20 | { | 20 | { |
21 | return (pte_t *)__get_free_page(PGALLOC_GFP); | 21 | return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT); |
22 | } | 22 | } |
23 | 23 | ||
24 | pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) | 24 | pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) |
@@ -207,9 +207,13 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) | |||
207 | { | 207 | { |
208 | int i; | 208 | int i; |
209 | bool failed = false; | 209 | bool failed = false; |
210 | gfp_t gfp = PGALLOC_GFP; | ||
211 | |||
212 | if (mm == &init_mm) | ||
213 | gfp &= ~__GFP_ACCOUNT; | ||
210 | 214 | ||
211 | for(i = 0; i < PREALLOCATED_PMDS; i++) { | 215 | for(i = 0; i < PREALLOCATED_PMDS; i++) { |
212 | pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP); | 216 | pmd_t *pmd = (pmd_t *)__get_free_page(gfp); |
213 | if (!pmd) | 217 | if (!pmd) |
214 | failed = true; | 218 | failed = true; |
215 | if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { | 219 | if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { |
diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 7f4a1fdb1502..2725e08ef353 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c | |||
@@ -110,7 +110,7 @@ good_area: | |||
110 | * make sure we exit gracefully rather than endlessly redo | 110 | * make sure we exit gracefully rather than endlessly redo |
111 | * the fault. | 111 | * the fault. |
112 | */ | 112 | */ |
113 | fault = handle_mm_fault(mm, vma, address, flags); | 113 | fault = handle_mm_fault(vma, address, flags); |
114 | 114 | ||
115 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) | 115 | if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) |
116 | return; | 116 | return; |
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index f46dba8b7092..dc75de9059cd 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
@@ -391,6 +391,7 @@ static ssize_t show_valid_zones(struct device *dev, | |||
391 | unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; | 391 | unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; |
392 | struct page *first_page; | 392 | struct page *first_page; |
393 | struct zone *zone; | 393 | struct zone *zone; |
394 | int zone_shift = 0; | ||
394 | 395 | ||
395 | start_pfn = section_nr_to_pfn(mem->start_section_nr); | 396 | start_pfn = section_nr_to_pfn(mem->start_section_nr); |
396 | end_pfn = start_pfn + nr_pages; | 397 | end_pfn = start_pfn + nr_pages; |
@@ -402,21 +403,26 @@ static ssize_t show_valid_zones(struct device *dev, | |||
402 | 403 | ||
403 | zone = page_zone(first_page); | 404 | zone = page_zone(first_page); |
404 | 405 | ||
405 | if (zone_idx(zone) == ZONE_MOVABLE - 1) { | 406 | /* MMOP_ONLINE_KEEP */ |
406 | /*The mem block is the last memoryblock of this zone.*/ | 407 | sprintf(buf, "%s", zone->name); |
407 | if (end_pfn == zone_end_pfn(zone)) | 408 | |
408 | return sprintf(buf, "%s %s\n", | 409 | /* MMOP_ONLINE_KERNEL */ |
409 | zone->name, (zone + 1)->name); | 410 | zone_shift = zone_can_shift(start_pfn, nr_pages, ZONE_NORMAL); |
411 | if (zone_shift) { | ||
412 | strcat(buf, " "); | ||
413 | strcat(buf, (zone + zone_shift)->name); | ||
410 | } | 414 | } |
411 | 415 | ||
412 | if (zone_idx(zone) == ZONE_MOVABLE) { | 416 | /* MMOP_ONLINE_MOVABLE */ |
413 | /*The mem block is the first memoryblock of ZONE_MOVABLE.*/ | 417 | zone_shift = zone_can_shift(start_pfn, nr_pages, ZONE_MOVABLE); |
414 | if (start_pfn == zone->zone_start_pfn) | 418 | if (zone_shift) { |
415 | return sprintf(buf, "%s %s\n", | 419 | strcat(buf, " "); |
416 | zone->name, (zone - 1)->name); | 420 | strcat(buf, (zone + zone_shift)->name); |
417 | } | 421 | } |
418 | 422 | ||
419 | return sprintf(buf, "%s\n", zone->name); | 423 | strcat(buf, "\n"); |
424 | |||
425 | return strlen(buf); | ||
420 | } | 426 | } |
421 | static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL); | 427 | static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL); |
422 | #endif | 428 | #endif |
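After the rewrite above, valid_zones reports the block's current zone first and then any zone it could shift into when onlined (the MMOP_ONLINE_KERNEL and MMOP_ONLINE_MOVABLE candidates), all on one line. A minimal reader, assuming the usual memory-hotplug sysfs layout and using block 0 as an example:

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/devices/system/memory/memory0/valid_zones";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("memory0 valid zones: %s", line);	/* e.g. "Normal Movable" */
	fclose(f);
	return 0;
}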
diff --git a/drivers/base/node.c b/drivers/base/node.c index 560751bad294..51c7db2c4ee2 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c | |||
@@ -113,6 +113,8 @@ static ssize_t node_read_meminfo(struct device *dev, | |||
113 | "Node %d SUnreclaim: %8lu kB\n" | 113 | "Node %d SUnreclaim: %8lu kB\n" |
114 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 114 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
115 | "Node %d AnonHugePages: %8lu kB\n" | 115 | "Node %d AnonHugePages: %8lu kB\n" |
116 | "Node %d ShmemHugePages: %8lu kB\n" | ||
117 | "Node %d ShmemPmdMapped: %8lu kB\n" | ||
116 | #endif | 118 | #endif |
117 | , | 119 | , |
118 | nid, K(node_page_state(nid, NR_FILE_DIRTY)), | 120 | nid, K(node_page_state(nid, NR_FILE_DIRTY)), |
@@ -131,10 +133,13 @@ static ssize_t node_read_meminfo(struct device *dev, | |||
131 | node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), | 133 | node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), |
132 | nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)), | 134 | nid, K(node_page_state(nid, NR_SLAB_RECLAIMABLE)), |
133 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 135 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
134 | nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)) | 136 | nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE)), |
135 | , nid, | 137 | nid, K(node_page_state(nid, NR_ANON_THPS) * |
136 | K(node_page_state(nid, NR_ANON_TRANSPARENT_HUGEPAGES) * | 138 | HPAGE_PMD_NR), |
137 | HPAGE_PMD_NR)); | 139 | nid, K(node_page_state(nid, NR_SHMEM_THPS) * |
140 | HPAGE_PMD_NR), | ||
141 | nid, K(node_page_state(nid, NR_SHMEM_PMDMAPPED) * | ||
142 | HPAGE_PMD_NR)); | ||
138 | #else | 143 | #else |
139 | nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); | 144 | nid, K(node_page_state(nid, NR_SLAB_UNRECLAIMABLE))); |
140 | #endif | 145 | #endif |
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 386ba3d1a6ee..b8ecba6dcd3b 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig | |||
@@ -1,8 +1,7 @@ | |||
1 | config ZRAM | 1 | config ZRAM |
2 | tristate "Compressed RAM block device support" | 2 | tristate "Compressed RAM block device support" |
3 | depends on BLOCK && SYSFS && ZSMALLOC | 3 | depends on BLOCK && SYSFS && ZSMALLOC && CRYPTO |
4 | select LZO_COMPRESS | 4 | select CRYPTO_LZO |
5 | select LZO_DECOMPRESS | ||
6 | default n | 5 | default n |
7 | help | 6 | help |
8 | Creates virtual block devices called /dev/zramX (X = 0, 1, ...). | 7 | Creates virtual block devices called /dev/zramX (X = 0, 1, ...). |
@@ -14,13 +13,3 @@ config ZRAM | |||
14 | disks and maybe many more. | 13 | disks and maybe many more. |
15 | 14 | ||
16 | See zram.txt for more information. | 15 | See zram.txt for more information. |
17 | |||
18 | config ZRAM_LZ4_COMPRESS | ||
19 | bool "Enable LZ4 algorithm support" | ||
20 | depends on ZRAM | ||
21 | select LZ4_COMPRESS | ||
22 | select LZ4_DECOMPRESS | ||
23 | default n | ||
24 | help | ||
25 | This option enables LZ4 compression algorithm support. Compression | ||
26 | algorithm can be changed using `comp_algorithm' device attribute. \ No newline at end of file | ||
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index be0763ff57a2..9e2b79e9a990 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile | |||
@@ -1,5 +1,3 @@ | |||
1 | zram-y := zcomp_lzo.o zcomp.o zram_drv.o | 1 | zram-y := zcomp.o zram_drv.o |
2 | |||
3 | zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o | ||
4 | 2 | ||
5 | obj-$(CONFIG_ZRAM) += zram.o | 3 | obj-$(CONFIG_ZRAM) += zram.o |
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index b51a816d766b..4b5cd3a7b2b6 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c | |||
@@ -14,108 +14,150 @@ | |||
14 | #include <linux/wait.h> | 14 | #include <linux/wait.h> |
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/cpu.h> | 16 | #include <linux/cpu.h> |
17 | #include <linux/crypto.h> | ||
17 | 18 | ||
18 | #include "zcomp.h" | 19 | #include "zcomp.h" |
19 | #include "zcomp_lzo.h" | ||
20 | #ifdef CONFIG_ZRAM_LZ4_COMPRESS | ||
21 | #include "zcomp_lz4.h" | ||
22 | #endif | ||
23 | 20 | ||
24 | static struct zcomp_backend *backends[] = { | 21 | static const char * const backends[] = { |
25 | &zcomp_lzo, | 22 | "lzo", |
26 | #ifdef CONFIG_ZRAM_LZ4_COMPRESS | 23 | #if IS_ENABLED(CONFIG_CRYPTO_LZ4) |
27 | &zcomp_lz4, | 24 | "lz4", |
25 | #endif | ||
26 | #if IS_ENABLED(CONFIG_CRYPTO_DEFLATE) | ||
27 | "deflate", | ||
28 | #endif | ||
29 | #if IS_ENABLED(CONFIG_CRYPTO_LZ4HC) | ||
30 | "lz4hc", | ||
31 | #endif | ||
32 | #if IS_ENABLED(CONFIG_CRYPTO_842) | ||
33 | "842", | ||
28 | #endif | 34 | #endif |
29 | NULL | 35 | NULL |
30 | }; | 36 | }; |
31 | 37 | ||
32 | static struct zcomp_backend *find_backend(const char *compress) | 38 | static void zcomp_strm_free(struct zcomp_strm *zstrm) |
33 | { | ||
34 | int i = 0; | ||
35 | while (backends[i]) { | ||
36 | if (sysfs_streq(compress, backends[i]->name)) | ||
37 | break; | ||
38 | i++; | ||
39 | } | ||
40 | return backends[i]; | ||
41 | } | ||
42 | |||
43 | static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) | ||
44 | { | 39 | { |
45 | if (zstrm->private) | 40 | if (!IS_ERR_OR_NULL(zstrm->tfm)) |
46 | comp->backend->destroy(zstrm->private); | 41 | crypto_free_comp(zstrm->tfm); |
47 | free_pages((unsigned long)zstrm->buffer, 1); | 42 | free_pages((unsigned long)zstrm->buffer, 1); |
48 | kfree(zstrm); | 43 | kfree(zstrm); |
49 | } | 44 | } |
50 | 45 | ||
51 | /* | 46 | /* |
52 | * allocate new zcomp_strm structure with ->private initialized by | 47 | * allocate new zcomp_strm structure with ->tfm initialized by |
53 | * backend, return NULL on error | 48 | * backend, return NULL on error |
54 | */ | 49 | */ |
55 | static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) | 50 | static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) |
56 | { | 51 | { |
57 | struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), flags); | 52 | struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL); |
58 | if (!zstrm) | 53 | if (!zstrm) |
59 | return NULL; | 54 | return NULL; |
60 | 55 | ||
61 | zstrm->private = comp->backend->create(flags); | 56 | zstrm->tfm = crypto_alloc_comp(comp->name, 0, 0); |
62 | /* | 57 | /* |
63 | * allocate 2 pages. 1 for compressed data, plus 1 extra for the | 58 | * allocate 2 pages. 1 for compressed data, plus 1 extra for the |
64 | * case when compressed size is larger than the original one | 59 | * case when compressed size is larger than the original one |
65 | */ | 60 | */ |
66 | zstrm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1); | 61 | zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); |
67 | if (!zstrm->private || !zstrm->buffer) { | 62 | if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) { |
68 | zcomp_strm_free(comp, zstrm); | 63 | zcomp_strm_free(zstrm); |
69 | zstrm = NULL; | 64 | zstrm = NULL; |
70 | } | 65 | } |
71 | return zstrm; | 66 | return zstrm; |
72 | } | 67 | } |
73 | 68 | ||
69 | bool zcomp_available_algorithm(const char *comp) | ||
70 | { | ||
71 | int i = 0; | ||
72 | |||
73 | while (backends[i]) { | ||
74 | if (sysfs_streq(comp, backends[i])) | ||
75 | return true; | ||
76 | i++; | ||
77 | } | ||
78 | |||
79 | /* | ||
80 | * Crypto does not ignore a trailing new line symbol, | ||
81 | * so make sure you don't supply a string containing | ||
82 | * one. | ||
83 | * This also means that we permit zcomp initialisation | ||
84 | * with any compressing algorithm known to crypto api. | ||
85 | */ | ||
86 | return crypto_has_comp(comp, 0, 0) == 1; | ||
87 | } | ||
88 | |||
74 | /* show available compressors */ | 89 | /* show available compressors */ |
75 | ssize_t zcomp_available_show(const char *comp, char *buf) | 90 | ssize_t zcomp_available_show(const char *comp, char *buf) |
76 | { | 91 | { |
92 | bool known_algorithm = false; | ||
77 | ssize_t sz = 0; | 93 | ssize_t sz = 0; |
78 | int i = 0; | 94 | int i = 0; |
79 | 95 | ||
80 | while (backends[i]) { | 96 | for (; backends[i]; i++) { |
81 | if (!strcmp(comp, backends[i]->name)) | 97 | if (!strcmp(comp, backends[i])) { |
98 | known_algorithm = true; | ||
82 | sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, | 99 | sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, |
83 | "[%s] ", backends[i]->name); | 100 | "[%s] ", backends[i]); |
84 | else | 101 | } else { |
85 | sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, | 102 | sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, |
86 | "%s ", backends[i]->name); | 103 | "%s ", backends[i]); |
87 | i++; | 104 | } |
88 | } | 105 | } |
106 | |||
107 | /* | ||
108 | * Out-of-tree module known to crypto api or a missing | ||
109 | * entry in `backends'. | ||
110 | */ | ||
111 | if (!known_algorithm && crypto_has_comp(comp, 0, 0) == 1) | ||
112 | sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, | ||
113 | "[%s] ", comp); | ||
114 | |||
89 | sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); | 115 | sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); |
90 | return sz; | 116 | return sz; |
91 | } | 117 | } |
92 | 118 | ||
93 | bool zcomp_available_algorithm(const char *comp) | 119 | struct zcomp_strm *zcomp_stream_get(struct zcomp *comp) |
94 | { | ||
95 | return find_backend(comp) != NULL; | ||
96 | } | ||
97 | |||
98 | struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) | ||
99 | { | 120 | { |
100 | return *get_cpu_ptr(comp->stream); | 121 | return *get_cpu_ptr(comp->stream); |
101 | } | 122 | } |
102 | 123 | ||
103 | void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) | 124 | void zcomp_stream_put(struct zcomp *comp) |
104 | { | 125 | { |
105 | put_cpu_ptr(comp->stream); | 126 | put_cpu_ptr(comp->stream); |
106 | } | 127 | } |
107 | 128 | ||
108 | int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, | 129 | int zcomp_compress(struct zcomp_strm *zstrm, |
109 | const unsigned char *src, size_t *dst_len) | 130 | const void *src, unsigned int *dst_len) |
110 | { | 131 | { |
111 | return comp->backend->compress(src, zstrm->buffer, dst_len, | 132 | /* |
112 | zstrm->private); | 133 | * Our dst memory (zstrm->buffer) is always `2 * PAGE_SIZE' sized |
134 | * because sometimes we can endup having a bigger compressed data | ||
135 | * due to various reasons: for example compression algorithms tend | ||
136 | * to add some padding to the compressed buffer. Speaking of padding, | ||
137 | * comp algorithm `842' pads the compressed length to multiple of 8 | ||
138 | * and returns -ENOSP when the dst memory is not big enough, which | ||
139 | * is not something that ZRAM wants to see. We can handle the | ||
140 | * `compressed_size > PAGE_SIZE' case easily in ZRAM, but when we | ||
141 | * receive -ERRNO from the compressing backend we can't help it | ||
142 | * anymore. To make `842' happy we need to tell the exact size of | ||
143 | * the dst buffer, zram_drv will take care of the fact that | ||
144 | * compressed buffer is too big. | ||
145 | */ | ||
146 | *dst_len = PAGE_SIZE * 2; | ||
147 | |||
148 | return crypto_comp_compress(zstrm->tfm, | ||
149 | src, PAGE_SIZE, | ||
150 | zstrm->buffer, dst_len); | ||
113 | } | 151 | } |
114 | 152 | ||
115 | int zcomp_decompress(struct zcomp *comp, const unsigned char *src, | 153 | int zcomp_decompress(struct zcomp_strm *zstrm, |
116 | size_t src_len, unsigned char *dst) | 154 | const void *src, unsigned int src_len, void *dst) |
117 | { | 155 | { |
118 | return comp->backend->decompress(src, src_len, dst); | 156 | unsigned int dst_len = PAGE_SIZE; |
157 | |||
158 | return crypto_comp_decompress(zstrm->tfm, | ||
159 | src, src_len, | ||
160 | dst, &dst_len); | ||
119 | } | 161 | } |
120 | 162 | ||
121 | static int __zcomp_cpu_notifier(struct zcomp *comp, | 163 | static int __zcomp_cpu_notifier(struct zcomp *comp, |
@@ -127,7 +169,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp, | |||
127 | case CPU_UP_PREPARE: | 169 | case CPU_UP_PREPARE: |
128 | if (WARN_ON(*per_cpu_ptr(comp->stream, cpu))) | 170 | if (WARN_ON(*per_cpu_ptr(comp->stream, cpu))) |
129 | break; | 171 | break; |
130 | zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); | 172 | zstrm = zcomp_strm_alloc(comp); |
131 | if (IS_ERR_OR_NULL(zstrm)) { | 173 | if (IS_ERR_OR_NULL(zstrm)) { |
132 | pr_err("Can't allocate a compression stream\n"); | 174 | pr_err("Can't allocate a compression stream\n"); |
133 | return NOTIFY_BAD; | 175 | return NOTIFY_BAD; |
@@ -138,7 +180,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp, | |||
138 | case CPU_UP_CANCELED: | 180 | case CPU_UP_CANCELED: |
139 | zstrm = *per_cpu_ptr(comp->stream, cpu); | 181 | zstrm = *per_cpu_ptr(comp->stream, cpu); |
140 | if (!IS_ERR_OR_NULL(zstrm)) | 182 | if (!IS_ERR_OR_NULL(zstrm)) |
141 | zcomp_strm_free(comp, zstrm); | 183 | zcomp_strm_free(zstrm); |
142 | *per_cpu_ptr(comp->stream, cpu) = NULL; | 184 | *per_cpu_ptr(comp->stream, cpu) = NULL; |
143 | break; | 185 | break; |
144 | default: | 186 | default: |
@@ -209,18 +251,16 @@ void zcomp_destroy(struct zcomp *comp) | |||
209 | struct zcomp *zcomp_create(const char *compress) | 251 | struct zcomp *zcomp_create(const char *compress) |
210 | { | 252 | { |
211 | struct zcomp *comp; | 253 | struct zcomp *comp; |
212 | struct zcomp_backend *backend; | ||
213 | int error; | 254 | int error; |
214 | 255 | ||
215 | backend = find_backend(compress); | 256 | if (!zcomp_available_algorithm(compress)) |
216 | if (!backend) | ||
217 | return ERR_PTR(-EINVAL); | 257 | return ERR_PTR(-EINVAL); |
218 | 258 | ||
219 | comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); | 259 | comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); |
220 | if (!comp) | 260 | if (!comp) |
221 | return ERR_PTR(-ENOMEM); | 261 | return ERR_PTR(-ENOMEM); |
222 | 262 | ||
223 | comp->backend = backend; | 263 | comp->name = compress; |
224 | error = zcomp_init(comp); | 264 | error = zcomp_init(comp); |
225 | if (error) { | 265 | if (error) { |
226 | kfree(comp); | 266 | kfree(comp); |
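zcomp now routes everything through the crypto API: each per-CPU zcomp_strm holds a crypto_comp transform, zcomp_compress() always offers a 2 * PAGE_SIZE destination, and any algorithm crypto_has_comp() knows about is accepted. The module-style sketch below exercises the same calls that appear in the patch (crypto_alloc_comp, crypto_comp_compress, crypto_comp_decompress, crypto_free_comp) on a small buffer; it assumes CRYPTO_LZO is built in and is not part of the patch itself.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/string.h>

static int __init comp_demo_init(void)
{
	struct crypto_comp *tfm;
	u8 src[256], dst[512], back[256];
	unsigned int dlen = sizeof(dst), blen = sizeof(back);
	int ret;

	memset(src, 'a', sizeof(src));		/* highly compressible input */

	tfm = crypto_alloc_comp("lzo", 0, 0);
	if (IS_ERR_OR_NULL(tfm))
		return -ENOMEM;			/* sketch: collapse all alloc errors */

	ret = crypto_comp_compress(tfm, src, sizeof(src), dst, &dlen);
	if (!ret)
		ret = crypto_comp_decompress(tfm, dst, dlen, back, &blen);

	pr_info("comp demo: ret=%d compressed=%u bytes, restored=%u bytes\n",
		ret, dlen, blen);
	crypto_free_comp(tfm);
	return ret;
}

static void __exit comp_demo_exit(void)
{
}

module_init(comp_demo_init);
module_exit(comp_demo_exit);
MODULE_LICENSE("GPL");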
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index ffd88cb747fe..478cac2ed465 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h | |||
@@ -13,33 +13,15 @@ | |||
13 | struct zcomp_strm { | 13 | struct zcomp_strm { |
14 | /* compression/decompression buffer */ | 14 | /* compression/decompression buffer */ |
15 | void *buffer; | 15 | void *buffer; |
16 | /* | 16 | struct crypto_comp *tfm; |
17 | * The private data of the compression stream, only compression | ||
18 | * stream backend can touch this (e.g. compression algorithm | ||
19 | * working memory) | ||
20 | */ | ||
21 | void *private; | ||
22 | }; | ||
23 | |||
24 | /* static compression backend */ | ||
25 | struct zcomp_backend { | ||
26 | int (*compress)(const unsigned char *src, unsigned char *dst, | ||
27 | size_t *dst_len, void *private); | ||
28 | |||
29 | int (*decompress)(const unsigned char *src, size_t src_len, | ||
30 | unsigned char *dst); | ||
31 | |||
32 | void *(*create)(gfp_t flags); | ||
33 | void (*destroy)(void *private); | ||
34 | |||
35 | const char *name; | ||
36 | }; | 17 | }; |
37 | 18 | ||
38 | /* dynamic per-device compression frontend */ | 19 | /* dynamic per-device compression frontend */ |
39 | struct zcomp { | 20 | struct zcomp { |
40 | struct zcomp_strm * __percpu *stream; | 21 | struct zcomp_strm * __percpu *stream; |
41 | struct zcomp_backend *backend; | ||
42 | struct notifier_block notifier; | 22 | struct notifier_block notifier; |
23 | |||
24 | const char *name; | ||
43 | }; | 25 | }; |
44 | 26 | ||
45 | ssize_t zcomp_available_show(const char *comp, char *buf); | 27 | ssize_t zcomp_available_show(const char *comp, char *buf); |
@@ -48,14 +30,14 @@ bool zcomp_available_algorithm(const char *comp); | |||
48 | struct zcomp *zcomp_create(const char *comp); | 30 | struct zcomp *zcomp_create(const char *comp); |
49 | void zcomp_destroy(struct zcomp *comp); | 31 | void zcomp_destroy(struct zcomp *comp); |
50 | 32 | ||
51 | struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); | 33 | struct zcomp_strm *zcomp_stream_get(struct zcomp *comp); |
52 | void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm); | 34 | void zcomp_stream_put(struct zcomp *comp); |
53 | 35 | ||
54 | int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, | 36 | int zcomp_compress(struct zcomp_strm *zstrm, |
55 | const unsigned char *src, size_t *dst_len); | 37 | const void *src, unsigned int *dst_len); |
56 | 38 | ||
57 | int zcomp_decompress(struct zcomp *comp, const unsigned char *src, | 39 | int zcomp_decompress(struct zcomp_strm *zstrm, |
58 | size_t src_len, unsigned char *dst); | 40 | const void *src, unsigned int src_len, void *dst); |
59 | 41 | ||
60 | bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); | 42 | bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); |
61 | #endif /* _ZCOMP_H_ */ | 43 | #endif /* _ZCOMP_H_ */ |
diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c deleted file mode 100644 index 0110086accba..000000000000 --- a/drivers/block/zram/zcomp_lz4.c +++ /dev/null | |||
@@ -1,56 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2014 Sergey Senozhatsky. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | */ | ||
9 | |||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/lz4.h> | ||
13 | #include <linux/vmalloc.h> | ||
14 | #include <linux/mm.h> | ||
15 | |||
16 | #include "zcomp_lz4.h" | ||
17 | |||
18 | static void *zcomp_lz4_create(gfp_t flags) | ||
19 | { | ||
20 | void *ret; | ||
21 | |||
22 | ret = kmalloc(LZ4_MEM_COMPRESS, flags); | ||
23 | if (!ret) | ||
24 | ret = __vmalloc(LZ4_MEM_COMPRESS, | ||
25 | flags | __GFP_HIGHMEM, | ||
26 | PAGE_KERNEL); | ||
27 | return ret; | ||
28 | } | ||
29 | |||
30 | static void zcomp_lz4_destroy(void *private) | ||
31 | { | ||
32 | kvfree(private); | ||
33 | } | ||
34 | |||
35 | static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, | ||
36 | size_t *dst_len, void *private) | ||
37 | { | ||
38 | /* return : Success if return 0 */ | ||
39 | return lz4_compress(src, PAGE_SIZE, dst, dst_len, private); | ||
40 | } | ||
41 | |||
42 | static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len, | ||
43 | unsigned char *dst) | ||
44 | { | ||
45 | size_t dst_len = PAGE_SIZE; | ||
46 | /* return : Success if return 0 */ | ||
47 | return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len); | ||
48 | } | ||
49 | |||
50 | struct zcomp_backend zcomp_lz4 = { | ||
51 | .compress = zcomp_lz4_compress, | ||
52 | .decompress = zcomp_lz4_decompress, | ||
53 | .create = zcomp_lz4_create, | ||
54 | .destroy = zcomp_lz4_destroy, | ||
55 | .name = "lz4", | ||
56 | }; | ||
diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h deleted file mode 100644 index 60613fb29dd8..000000000000 --- a/drivers/block/zram/zcomp_lz4.h +++ /dev/null | |||
@@ -1,17 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2014 Sergey Senozhatsky. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | */ | ||
9 | |||
10 | #ifndef _ZCOMP_LZ4_H_ | ||
11 | #define _ZCOMP_LZ4_H_ | ||
12 | |||
13 | #include "zcomp.h" | ||
14 | |||
15 | extern struct zcomp_backend zcomp_lz4; | ||
16 | |||
17 | #endif /* _ZCOMP_LZ4_H_ */ | ||
diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c deleted file mode 100644 index ed7a1f0549ec..000000000000 --- a/drivers/block/zram/zcomp_lzo.c +++ /dev/null | |||
@@ -1,56 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2014 Sergey Senozhatsky. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | */ | ||
9 | |||
10 | #include <linux/kernel.h> | ||
11 | #include <linux/slab.h> | ||
12 | #include <linux/lzo.h> | ||
13 | #include <linux/vmalloc.h> | ||
14 | #include <linux/mm.h> | ||
15 | |||
16 | #include "zcomp_lzo.h" | ||
17 | |||
18 | static void *lzo_create(gfp_t flags) | ||
19 | { | ||
20 | void *ret; | ||
21 | |||
22 | ret = kmalloc(LZO1X_MEM_COMPRESS, flags); | ||
23 | if (!ret) | ||
24 | ret = __vmalloc(LZO1X_MEM_COMPRESS, | ||
25 | flags | __GFP_HIGHMEM, | ||
26 | PAGE_KERNEL); | ||
27 | return ret; | ||
28 | } | ||
29 | |||
30 | static void lzo_destroy(void *private) | ||
31 | { | ||
32 | kvfree(private); | ||
33 | } | ||
34 | |||
35 | static int lzo_compress(const unsigned char *src, unsigned char *dst, | ||
36 | size_t *dst_len, void *private) | ||
37 | { | ||
38 | int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private); | ||
39 | return ret == LZO_E_OK ? 0 : ret; | ||
40 | } | ||
41 | |||
42 | static int lzo_decompress(const unsigned char *src, size_t src_len, | ||
43 | unsigned char *dst) | ||
44 | { | ||
45 | size_t dst_len = PAGE_SIZE; | ||
46 | int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len); | ||
47 | return ret == LZO_E_OK ? 0 : ret; | ||
48 | } | ||
49 | |||
50 | struct zcomp_backend zcomp_lzo = { | ||
51 | .compress = lzo_compress, | ||
52 | .decompress = lzo_decompress, | ||
53 | .create = lzo_create, | ||
54 | .destroy = lzo_destroy, | ||
55 | .name = "lzo", | ||
56 | }; | ||
diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h deleted file mode 100644 index 128c5807fa14..000000000000 --- a/drivers/block/zram/zcomp_lzo.h +++ /dev/null | |||
@@ -1,17 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2014 Sergey Senozhatsky. | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or | ||
5 | * modify it under the terms of the GNU General Public License | ||
6 | * as published by the Free Software Foundation; either version | ||
7 | * 2 of the License, or (at your option) any later version. | ||
8 | */ | ||
9 | |||
10 | #ifndef _ZCOMP_LZO_H_ | ||
11 | #define _ZCOMP_LZO_H_ | ||
12 | |||
13 | #include "zcomp.h" | ||
14 | |||
15 | extern struct zcomp_backend zcomp_lzo; | ||
16 | |||
17 | #endif /* _ZCOMP_LZO_H_ */ | ||
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e5e5d19f2172..7454cf188c8e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c | |||
@@ -342,9 +342,16 @@ static ssize_t comp_algorithm_store(struct device *dev, | |||
342 | struct device_attribute *attr, const char *buf, size_t len) | 342 | struct device_attribute *attr, const char *buf, size_t len) |
343 | { | 343 | { |
344 | struct zram *zram = dev_to_zram(dev); | 344 | struct zram *zram = dev_to_zram(dev); |
345 | char compressor[CRYPTO_MAX_ALG_NAME]; | ||
345 | size_t sz; | 346 | size_t sz; |
346 | 347 | ||
347 | if (!zcomp_available_algorithm(buf)) | 348 | strlcpy(compressor, buf, sizeof(compressor)); |
349 | /* ignore trailing newline */ | ||
350 | sz = strlen(compressor); | ||
351 | if (sz > 0 && compressor[sz - 1] == '\n') | ||
352 | compressor[sz - 1] = 0x00; | ||
353 | |||
354 | if (!zcomp_available_algorithm(compressor)) | ||
348 | return -EINVAL; | 355 | return -EINVAL; |
349 | 356 | ||
350 | down_write(&zram->init_lock); | 357 | down_write(&zram->init_lock); |
@@ -353,13 +360,8 @@ static ssize_t comp_algorithm_store(struct device *dev, | |||
353 | pr_info("Can't change algorithm for initialized device\n"); | 360 | pr_info("Can't change algorithm for initialized device\n"); |
354 | return -EBUSY; | 361 | return -EBUSY; |
355 | } | 362 | } |
356 | strlcpy(zram->compressor, buf, sizeof(zram->compressor)); | ||
357 | |||
358 | /* ignore trailing newline */ | ||
359 | sz = strlen(zram->compressor); | ||
360 | if (sz > 0 && zram->compressor[sz - 1] == '\n') | ||
361 | zram->compressor[sz - 1] = 0x00; | ||
362 | 363 | ||
364 | strlcpy(zram->compressor, compressor, sizeof(compressor)); | ||
363 | up_write(&zram->init_lock); | 365 | up_write(&zram->init_lock); |
364 | return len; | 366 | return len; |
365 | } | 367 | } |
@@ -563,7 +565,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) | |||
563 | unsigned char *cmem; | 565 | unsigned char *cmem; |
564 | struct zram_meta *meta = zram->meta; | 566 | struct zram_meta *meta = zram->meta; |
565 | unsigned long handle; | 567 | unsigned long handle; |
566 | size_t size; | 568 | unsigned int size; |
567 | 569 | ||
568 | bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); | 570 | bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); |
569 | handle = meta->table[index].handle; | 571 | handle = meta->table[index].handle; |
@@ -576,10 +578,14 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) | |||
576 | } | 578 | } |
577 | 579 | ||
578 | cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); | 580 | cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); |
579 | if (size == PAGE_SIZE) | 581 | if (size == PAGE_SIZE) { |
580 | copy_page(mem, cmem); | 582 | copy_page(mem, cmem); |
581 | else | 583 | } else { |
582 | ret = zcomp_decompress(zram->comp, cmem, size, mem); | 584 | struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp); |
585 | |||
586 | ret = zcomp_decompress(zstrm, cmem, size, mem); | ||
587 | zcomp_stream_put(zram->comp); | ||
588 | } | ||
583 | zs_unmap_object(meta->mem_pool, handle); | 589 | zs_unmap_object(meta->mem_pool, handle); |
584 | bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); | 590 | bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); |
585 | 591 | ||
@@ -646,7 +652,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, | |||
646 | int offset) | 652 | int offset) |
647 | { | 653 | { |
648 | int ret = 0; | 654 | int ret = 0; |
649 | size_t clen; | 655 | unsigned int clen; |
650 | unsigned long handle = 0; | 656 | unsigned long handle = 0; |
651 | struct page *page; | 657 | struct page *page; |
652 | unsigned char *user_mem, *cmem, *src, *uncmem = NULL; | 658 | unsigned char *user_mem, *cmem, *src, *uncmem = NULL; |
@@ -695,8 +701,8 @@ compress_again: | |||
695 | goto out; | 701 | goto out; |
696 | } | 702 | } |
697 | 703 | ||
698 | zstrm = zcomp_strm_find(zram->comp); | 704 | zstrm = zcomp_stream_get(zram->comp); |
699 | ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); | 705 | ret = zcomp_compress(zstrm, uncmem, &clen); |
700 | if (!is_partial_io(bvec)) { | 706 | if (!is_partial_io(bvec)) { |
701 | kunmap_atomic(user_mem); | 707 | kunmap_atomic(user_mem); |
702 | user_mem = NULL; | 708 | user_mem = NULL; |
@@ -732,19 +738,21 @@ compress_again: | |||
732 | handle = zs_malloc(meta->mem_pool, clen, | 738 | handle = zs_malloc(meta->mem_pool, clen, |
733 | __GFP_KSWAPD_RECLAIM | | 739 | __GFP_KSWAPD_RECLAIM | |
734 | __GFP_NOWARN | | 740 | __GFP_NOWARN | |
735 | __GFP_HIGHMEM); | 741 | __GFP_HIGHMEM | |
742 | __GFP_MOVABLE); | ||
736 | if (!handle) { | 743 | if (!handle) { |
737 | zcomp_strm_release(zram->comp, zstrm); | 744 | zcomp_stream_put(zram->comp); |
738 | zstrm = NULL; | 745 | zstrm = NULL; |
739 | 746 | ||
740 | atomic64_inc(&zram->stats.writestall); | 747 | atomic64_inc(&zram->stats.writestall); |
741 | 748 | ||
742 | handle = zs_malloc(meta->mem_pool, clen, | 749 | handle = zs_malloc(meta->mem_pool, clen, |
743 | GFP_NOIO | __GFP_HIGHMEM); | 750 | GFP_NOIO | __GFP_HIGHMEM | |
751 | __GFP_MOVABLE); | ||
744 | if (handle) | 752 | if (handle) |
745 | goto compress_again; | 753 | goto compress_again; |
746 | 754 | ||
747 | pr_err("Error allocating memory for compressed page: %u, size=%zu\n", | 755 | pr_err("Error allocating memory for compressed page: %u, size=%u\n", |
748 | index, clen); | 756 | index, clen); |
749 | ret = -ENOMEM; | 757 | ret = -ENOMEM; |
750 | goto out; | 758 | goto out; |
@@ -769,7 +777,7 @@ compress_again: | |||
769 | memcpy(cmem, src, clen); | 777 | memcpy(cmem, src, clen); |
770 | } | 778 | } |
771 | 779 | ||
772 | zcomp_strm_release(zram->comp, zstrm); | 780 | zcomp_stream_put(zram->comp); |
773 | zstrm = NULL; | 781 | zstrm = NULL; |
774 | zs_unmap_object(meta->mem_pool, handle); | 782 | zs_unmap_object(meta->mem_pool, handle); |
775 | 783 | ||
@@ -789,7 +797,7 @@ compress_again: | |||
789 | atomic64_inc(&zram->stats.pages_stored); | 797 | atomic64_inc(&zram->stats.pages_stored); |
790 | out: | 798 | out: |
791 | if (zstrm) | 799 | if (zstrm) |
792 | zcomp_strm_release(zram->comp, zstrm); | 800 | zcomp_stream_put(zram->comp); |
793 | if (is_partial_io(bvec)) | 801 | if (is_partial_io(bvec)) |
794 | kfree(uncmem); | 802 | kfree(uncmem); |
795 | return ret; | 803 | return ret; |
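
The comp_algorithm_store() hunk above trims the trailing newline into a local buffer so the requested name is validated before any device state is touched, and the per-request stream handling moves to zcomp_stream_get()/zcomp_stream_put(). A minimal standalone C sketch of just the newline-trimming step (the buffer size and helper name here are illustrative, not the kernel's):

#include <stdio.h>
#include <string.h>

#define ALG_NAME_MAX 128   /* stand-in for CRYPTO_MAX_ALG_NAME */

/* Copy a sysfs-style input into a bounded buffer and drop one trailing '\n'. */
static void trim_alg_name(char *dst, size_t dst_sz, const char *src)
{
        size_t len;

        snprintf(dst, dst_sz, "%s", src);
        len = strlen(dst);
        if (len > 0 && dst[len - 1] == '\n')
                dst[len - 1] = '\0';
}

int main(void)
{
        char name[ALG_NAME_MAX];

        trim_alg_name(name, sizeof(name), "lz4\n");
        printf("'%s'\n", name);   /* prints 'lz4' */
        return 0;
}
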
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 3f5bf66a27e4..74fcf10da374 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h | |||
@@ -15,8 +15,9 @@ | |||
15 | #ifndef _ZRAM_DRV_H_ | 15 | #ifndef _ZRAM_DRV_H_ |
16 | #define _ZRAM_DRV_H_ | 16 | #define _ZRAM_DRV_H_ |
17 | 17 | ||
18 | #include <linux/spinlock.h> | 18 | #include <linux/rwsem.h> |
19 | #include <linux/zsmalloc.h> | 19 | #include <linux/zsmalloc.h> |
20 | #include <linux/crypto.h> | ||
20 | 21 | ||
21 | #include "zcomp.h" | 22 | #include "zcomp.h" |
22 | 23 | ||
@@ -113,7 +114,7 @@ struct zram { | |||
113 | * we can store in a disk. | 114 | * we can store in a disk. |
114 | */ | 115 | */ |
115 | u64 disksize; /* bytes */ | 116 | u64 disksize; /* bytes */ |
116 | char compressor[10]; | 117 | char compressor[CRYPTO_MAX_ALG_NAME]; |
117 | /* | 118 | /* |
118 | * zram is claimed so open request will be failed | 119 | * zram is claimed so open request will be failed |
119 | */ | 120 | */ |
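
The zram_drv.h hunk grows the compressor field from char[10] to CRYPTO_MAX_ALG_NAME so any algorithm name the crypto API accepts fits without truncation. A small sketch of the difference (the CRYPTO_MAX_ALG_NAME value and the long algorithm name are assumptions for illustration only):

#include <stdio.h>

#define CRYPTO_MAX_ALG_NAME 64   /* assumed value, for illustration */

struct zram_like {
        char compressor_old[10];
        char compressor_new[CRYPTO_MAX_ALG_NAME];
};

int main(void)
{
        struct zram_like z;
        const char *name = "lz4hc-level9";   /* hypothetical long name */

        /* The old 10-byte field silently truncates; the new one does not. */
        snprintf(z.compressor_old, sizeof(z.compressor_old), "%s", name);
        snprintf(z.compressor_new, sizeof(z.compressor_new), "%s", name);
        printf("old: '%s'\nnew: '%s'\n", z.compressor_old, z.compressor_new);
        return 0;
}
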
diff --git a/drivers/char/mem.c b/drivers/char/mem.c index d633974e7f8b..a33163dbb913 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c | |||
@@ -22,6 +22,7 @@ | |||
22 | #include <linux/device.h> | 22 | #include <linux/device.h> |
23 | #include <linux/highmem.h> | 23 | #include <linux/highmem.h> |
24 | #include <linux/backing-dev.h> | 24 | #include <linux/backing-dev.h> |
25 | #include <linux/shmem_fs.h> | ||
25 | #include <linux/splice.h> | 26 | #include <linux/splice.h> |
26 | #include <linux/pfn.h> | 27 | #include <linux/pfn.h> |
27 | #include <linux/export.h> | 28 | #include <linux/export.h> |
@@ -657,6 +658,28 @@ static int mmap_zero(struct file *file, struct vm_area_struct *vma) | |||
657 | return 0; | 658 | return 0; |
658 | } | 659 | } |
659 | 660 | ||
661 | static unsigned long get_unmapped_area_zero(struct file *file, | ||
662 | unsigned long addr, unsigned long len, | ||
663 | unsigned long pgoff, unsigned long flags) | ||
664 | { | ||
665 | #ifdef CONFIG_MMU | ||
666 | if (flags & MAP_SHARED) { | ||
667 | /* | ||
668 | * mmap_zero() will call shmem_zero_setup() to create a file, | ||
669 | * so use shmem's get_unmapped_area in case it can be huge; | ||
670 | * and pass NULL for file as in mmap.c's get_unmapped_area(), | ||
671 | * so as not to confuse shmem with our handle on "/dev/zero". | ||
672 | */ | ||
673 | return shmem_get_unmapped_area(NULL, addr, len, pgoff, flags); | ||
674 | } | ||
675 | |||
676 | /* Otherwise flags & MAP_PRIVATE: with no shmem object beneath it */ | ||
677 | return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); | ||
678 | #else | ||
679 | return -ENOSYS; | ||
680 | #endif | ||
681 | } | ||
682 | |||
660 | static ssize_t write_full(struct file *file, const char __user *buf, | 683 | static ssize_t write_full(struct file *file, const char __user *buf, |
661 | size_t count, loff_t *ppos) | 684 | size_t count, loff_t *ppos) |
662 | { | 685 | { |
@@ -764,6 +787,7 @@ static const struct file_operations zero_fops = { | |||
764 | .read_iter = read_iter_zero, | 787 | .read_iter = read_iter_zero, |
765 | .write_iter = write_iter_zero, | 788 | .write_iter = write_iter_zero, |
766 | .mmap = mmap_zero, | 789 | .mmap = mmap_zero, |
790 | .get_unmapped_area = get_unmapped_area_zero, | ||
767 | #ifndef CONFIG_MMU | 791 | #ifndef CONFIG_MMU |
768 | .mmap_capabilities = zero_mmap_capabilities, | 792 | .mmap_capabilities = zero_mmap_capabilities, |
769 | #endif | 793 | #endif |
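
The /dev/zero change adds a get_unmapped_area hook: MAP_SHARED mappings become shmem objects, so they defer to shmem's placement (which can align for huge pages), while MAP_PRIVATE falls back to the normal search. A standalone sketch of that dispatch shape; the helpers are stubs, not kernel APIs:

#include <stdio.h>

#define MAP_SHARED_FLAG 0x01UL   /* stand-in for MAP_SHARED */

/* Stub: pretend shmem aligns the hint to a 2 MiB boundary. */
static unsigned long shmem_area_stub(unsigned long addr)
{
        const unsigned long huge = 2UL << 20;
        return (addr + huge - 1) & ~(huge - 1);
}

/* Stub: default search just returns the hint unchanged. */
static unsigned long default_area_stub(unsigned long addr)
{
        return addr;
}

static unsigned long zero_area(unsigned long addr, unsigned long flags)
{
        if (flags & MAP_SHARED_FLAG)
                return shmem_area_stub(addr);     /* shared => shmem-backed  */
        return default_area_stub(addr);           /* private => no shmem     */
}

int main(void)
{
        printf("%#lx\n", zero_area(0x1234000UL, MAP_SHARED_FLAG));
        printf("%#lx\n", zero_area(0x1234000UL, 0));
        return 0;
}
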
diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 56999d2fac07..fbdaf81ae925 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c | |||
@@ -538,8 +538,7 @@ static void do_fault(struct work_struct *work) | |||
538 | if (access_error(vma, fault)) | 538 | if (access_error(vma, fault)) |
539 | goto out; | 539 | goto out; |
540 | 540 | ||
541 | ret = handle_mm_fault(mm, vma, address, flags); | 541 | ret = handle_mm_fault(vma, address, flags); |
542 | |||
543 | out: | 542 | out: |
544 | up_read(&mm->mmap_sem); | 543 | up_read(&mm->mmap_sem); |
545 | 544 | ||
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index d9939fa9b588..8ebb3530afa7 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c | |||
@@ -583,7 +583,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) | |||
583 | if (access_error(vma, req)) | 583 | if (access_error(vma, req)) |
584 | goto invalid; | 584 | goto invalid; |
585 | 585 | ||
586 | ret = handle_mm_fault(svm->mm, vma, address, | 586 | ret = handle_mm_fault(vma, address, |
587 | req->wr_req ? FAULT_FLAG_WRITE : 0); | 587 | req->wr_req ? FAULT_FLAG_WRITE : 0); |
588 | if (ret & VM_FAULT_ERROR) | 588 | if (ret & VM_FAULT_ERROR) |
589 | goto invalid; | 589 | goto invalid; |
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index e5139402e7f8..52bbd27e93ae 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c | |||
@@ -363,6 +363,7 @@ static void moom_callback(struct work_struct *ignored) | |||
363 | struct oom_control oc = { | 363 | struct oom_control oc = { |
364 | .zonelist = node_zonelist(first_memory_node, gfp_mask), | 364 | .zonelist = node_zonelist(first_memory_node, gfp_mask), |
365 | .nodemask = NULL, | 365 | .nodemask = NULL, |
366 | .memcg = NULL, | ||
366 | .gfp_mask = gfp_mask, | 367 | .gfp_mask = gfp_mask, |
367 | .order = -1, | 368 | .order = -1, |
368 | }; | 369 | }; |
diff --git a/drivers/video/fbdev/core/fbmon.c b/drivers/video/fbdev/core/fbmon.c index 47c3191ec313..62c0cf79674f 100644 --- a/drivers/video/fbdev/core/fbmon.c +++ b/drivers/video/fbdev/core/fbmon.c | |||
@@ -1496,7 +1496,6 @@ int fb_parse_edid(unsigned char *edid, struct fb_var_screeninfo *var) | |||
1496 | } | 1496 | } |
1497 | void fb_edid_to_monspecs(unsigned char *edid, struct fb_monspecs *specs) | 1497 | void fb_edid_to_monspecs(unsigned char *edid, struct fb_monspecs *specs) |
1498 | { | 1498 | { |
1499 | specs = NULL; | ||
1500 | } | 1499 | } |
1501 | void fb_edid_add_monspecs(unsigned char *edid, struct fb_monspecs *specs) | 1500 | void fb_edid_add_monspecs(unsigned char *edid, struct fb_monspecs *specs) |
1502 | { | 1501 | { |
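
The fbmon.c hunk deletes a no-op: assigning NULL to the specs parameter inside the stub only changed the function's local copy of the pointer, never the caller's. A minimal demonstration of why such an assignment has no effect:

#include <stdio.h>

struct monspecs { int modedb_len; };

/* Assigning to the parameter only changes the local copy of the pointer. */
static void clear_specs_wrong(struct monspecs *specs)
{
        specs = NULL;   /* no effect outside this function */
}

int main(void)
{
        struct monspecs s = { .modedb_len = 3 };
        struct monspecs *p = &s;

        clear_specs_wrong(p);
        printf("p is %s\n", p ? "still valid" : "NULL");   /* still valid */
        return 0;
}
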
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 476c0e3a7150..888d5f8322ce 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/oom.h> | 30 | #include <linux/oom.h> |
31 | #include <linux/wait.h> | 31 | #include <linux/wait.h> |
32 | #include <linux/mm.h> | 32 | #include <linux/mm.h> |
33 | #include <linux/mount.h> | ||
33 | 34 | ||
34 | /* | 35 | /* |
35 | * Balloon device works in 4K page units. So each page is pointed to by | 36 | * Balloon device works in 4K page units. So each page is pointed to by |
@@ -45,6 +46,10 @@ static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES; | |||
45 | module_param(oom_pages, int, S_IRUSR | S_IWUSR); | 46 | module_param(oom_pages, int, S_IRUSR | S_IWUSR); |
46 | MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); | 47 | MODULE_PARM_DESC(oom_pages, "pages to free on OOM"); |
47 | 48 | ||
49 | #ifdef CONFIG_BALLOON_COMPACTION | ||
50 | static struct vfsmount *balloon_mnt; | ||
51 | #endif | ||
52 | |||
48 | struct virtio_balloon { | 53 | struct virtio_balloon { |
49 | struct virtio_device *vdev; | 54 | struct virtio_device *vdev; |
50 | struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; | 55 | struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; |
@@ -490,6 +495,24 @@ static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, | |||
490 | 495 | ||
491 | return MIGRATEPAGE_SUCCESS; | 496 | return MIGRATEPAGE_SUCCESS; |
492 | } | 497 | } |
498 | |||
499 | static struct dentry *balloon_mount(struct file_system_type *fs_type, | ||
500 | int flags, const char *dev_name, void *data) | ||
501 | { | ||
502 | static const struct dentry_operations ops = { | ||
503 | .d_dname = simple_dname, | ||
504 | }; | ||
505 | |||
506 | return mount_pseudo(fs_type, "balloon-kvm:", NULL, &ops, | ||
507 | BALLOON_KVM_MAGIC); | ||
508 | } | ||
509 | |||
510 | static struct file_system_type balloon_fs = { | ||
511 | .name = "balloon-kvm", | ||
512 | .mount = balloon_mount, | ||
513 | .kill_sb = kill_anon_super, | ||
514 | }; | ||
515 | |||
493 | #endif /* CONFIG_BALLOON_COMPACTION */ | 516 | #endif /* CONFIG_BALLOON_COMPACTION */ |
494 | 517 | ||
495 | static int virtballoon_probe(struct virtio_device *vdev) | 518 | static int virtballoon_probe(struct virtio_device *vdev) |
@@ -519,9 +542,6 @@ static int virtballoon_probe(struct virtio_device *vdev) | |||
519 | vb->vdev = vdev; | 542 | vb->vdev = vdev; |
520 | 543 | ||
521 | balloon_devinfo_init(&vb->vb_dev_info); | 544 | balloon_devinfo_init(&vb->vb_dev_info); |
522 | #ifdef CONFIG_BALLOON_COMPACTION | ||
523 | vb->vb_dev_info.migratepage = virtballoon_migratepage; | ||
524 | #endif | ||
525 | 545 | ||
526 | err = init_vqs(vb); | 546 | err = init_vqs(vb); |
527 | if (err) | 547 | if (err) |
@@ -531,13 +551,33 @@ static int virtballoon_probe(struct virtio_device *vdev) | |||
531 | vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY; | 551 | vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY; |
532 | err = register_oom_notifier(&vb->nb); | 552 | err = register_oom_notifier(&vb->nb); |
533 | if (err < 0) | 553 | if (err < 0) |
534 | goto out_oom_notify; | 554 | goto out_del_vqs; |
555 | |||
556 | #ifdef CONFIG_BALLOON_COMPACTION | ||
557 | balloon_mnt = kern_mount(&balloon_fs); | ||
558 | if (IS_ERR(balloon_mnt)) { | ||
559 | err = PTR_ERR(balloon_mnt); | ||
560 | unregister_oom_notifier(&vb->nb); | ||
561 | goto out_del_vqs; | ||
562 | } | ||
563 | |||
564 | vb->vb_dev_info.migratepage = virtballoon_migratepage; | ||
565 | vb->vb_dev_info.inode = alloc_anon_inode(balloon_mnt->mnt_sb); | ||
566 | if (IS_ERR(vb->vb_dev_info.inode)) { | ||
567 | err = PTR_ERR(vb->vb_dev_info.inode); | ||
568 | kern_unmount(balloon_mnt); | ||
569 | unregister_oom_notifier(&vb->nb); | ||
570 | vb->vb_dev_info.inode = NULL; | ||
571 | goto out_del_vqs; | ||
572 | } | ||
573 | vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops; | ||
574 | #endif | ||
535 | 575 | ||
536 | virtio_device_ready(vdev); | 576 | virtio_device_ready(vdev); |
537 | 577 | ||
538 | return 0; | 578 | return 0; |
539 | 579 | ||
540 | out_oom_notify: | 580 | out_del_vqs: |
541 | vdev->config->del_vqs(vdev); | 581 | vdev->config->del_vqs(vdev); |
542 | out_free_vb: | 582 | out_free_vb: |
543 | kfree(vb); | 583 | kfree(vb); |
@@ -571,6 +611,8 @@ static void virtballoon_remove(struct virtio_device *vdev) | |||
571 | cancel_work_sync(&vb->update_balloon_stats_work); | 611 | cancel_work_sync(&vb->update_balloon_stats_work); |
572 | 612 | ||
573 | remove_common(vb); | 613 | remove_common(vb); |
614 | if (vb->vb_dev_info.inode) | ||
615 | iput(vb->vb_dev_info.inode); | ||
574 | kfree(vb); | 616 | kfree(vb); |
575 | } | 617 | } |
576 | 618 | ||
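
The virtio_balloon probe path now mounts a private pseudo-filesystem and attaches an anonymous inode to the balloon so compaction can migrate its pages, and the error labels collapse into out_del_vqs with explicit partial unwinding before each goto. A standalone sketch of that goto-based unwind shape (all names here are illustrative stand-ins):

#include <stdio.h>

static int  setup_a(void) { return 0; }           /* e.g. init_vqs()            */
static int  setup_b(void) { return 0; }           /* e.g. register_oom_notifier */
static int  setup_c(void) { return -1; }          /* e.g. kern_mount() failing  */
static void undo_b(void)  { puts("undo b"); }
static void undo_a(void)  { puts("undo a"); }

static int probe(void)
{
        int err;

        err = setup_a();
        if (err)
                goto out;
        err = setup_b();
        if (err)
                goto out_undo_a;
        err = setup_c();
        if (err) {
                undo_b();   /* partial unwind done inline before the shared label */
                goto out_undo_a;
        }
        return 0;

out_undo_a:
        undo_a();
out:
        return err;
}

int main(void)
{
        printf("probe: %d\n", probe());
        return 0;
}
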
diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 53a085fca00c..66620713242a 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c | |||
@@ -195,7 +195,7 @@ static void selfballoon_process(struct work_struct *work) | |||
195 | MB2PAGES(selfballoon_reserved_mb); | 195 | MB2PAGES(selfballoon_reserved_mb); |
196 | #ifdef CONFIG_FRONTSWAP | 196 | #ifdef CONFIG_FRONTSWAP |
197 | /* allow space for frontswap pages to be repatriated */ | 197 | /* allow space for frontswap pages to be repatriated */ |
198 | if (frontswap_selfshrinking && frontswap_enabled) | 198 | if (frontswap_selfshrinking) |
199 | goal_pages += frontswap_curr_pages(); | 199 | goal_pages += frontswap_curr_pages(); |
200 | #endif | 200 | #endif |
201 | if (cur_pages > goal_pages) | 201 | if (cur_pages > goal_pages) |
@@ -230,7 +230,7 @@ static void selfballoon_process(struct work_struct *work) | |||
230 | reset_timer = true; | 230 | reset_timer = true; |
231 | } | 231 | } |
232 | #ifdef CONFIG_FRONTSWAP | 232 | #ifdef CONFIG_FRONTSWAP |
233 | if (frontswap_selfshrinking && frontswap_enabled) { | 233 | if (frontswap_selfshrinking) { |
234 | frontswap_selfshrink(); | 234 | frontswap_selfshrink(); |
235 | reset_timer = true; | 235 | reset_timer = true; |
236 | } | 236 | } |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 27c214941004..cee4cb99b8ce 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -4178,7 +4178,8 @@ int extent_readpages(struct extent_io_tree *tree, | |||
4178 | prefetchw(&page->flags); | 4178 | prefetchw(&page->flags); |
4179 | list_del(&page->lru); | 4179 | list_del(&page->lru); |
4180 | if (add_to_page_cache_lru(page, mapping, | 4180 | if (add_to_page_cache_lru(page, mapping, |
4181 | page->index, GFP_NOFS)) { | 4181 | page->index, |
4182 | readahead_gfp_mask(mapping))) { | ||
4182 | put_page(page); | 4183 | put_page(page); |
4183 | continue; | 4184 | continue; |
4184 | } | 4185 | } |
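
This btrfs hunk, and the cifs, ext4, f2fs, mpage and orangefs hunks further down, switch readahead page allocation to readahead_gfp_mask(mapping). That helper is roughly the mapping's own gfp mask with flags such as __GFP_NORETRY and __GFP_NOWARN added, so opportunistic readahead fails fast and quietly under memory pressure; the exact flag set is the kernel's, not shown in this hunk. A hedged sketch of the flag composition with made-up bit values:

#include <stdio.h>

typedef unsigned int gfp_t;

#define GFP_KERNEL      0x01u   /* illustrative values only */
#define __GFP_NORETRY   0x02u
#define __GFP_NOWARN    0x04u

struct address_space_stub { gfp_t gfp_mask; };

/* Readahead is opportunistic: back off early and stay silent on failure. */
static gfp_t readahead_gfp_stub(const struct address_space_stub *mapping)
{
        return mapping->gfp_mask | __GFP_NORETRY | __GFP_NOWARN;
}

int main(void)
{
        struct address_space_stub m = { .gfp_mask = GFP_KERNEL };

        printf("%#x\n", readahead_gfp_stub(&m));
        return 0;
}
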
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index d4890b6dc22d..579e41b350a2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
@@ -3366,7 +3366,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, | |||
3366 | struct page *page, *tpage; | 3366 | struct page *page, *tpage; |
3367 | unsigned int expected_index; | 3367 | unsigned int expected_index; |
3368 | int rc; | 3368 | int rc; |
3369 | gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); | 3369 | gfp_t gfp = readahead_gfp_mask(mapping); |
3370 | 3370 | ||
3371 | INIT_LIST_HEAD(tmplist); | 3371 | INIT_LIST_HEAD(tmplist); |
3372 | 3372 | ||
diff --git a/fs/dax.c b/fs/dax.c --- a/fs/dax.c +++ b/fs/dax.c | |||
@@ -819,16 +819,16 @@ static int dax_insert_mapping(struct address_space *mapping, | |||
819 | } | 819 | } |
820 | 820 | ||
821 | /** | 821 | /** |
822 | * __dax_fault - handle a page fault on a DAX file | 822 | * dax_fault - handle a page fault on a DAX file |
823 | * @vma: The virtual memory area where the fault occurred | 823 | * @vma: The virtual memory area where the fault occurred |
824 | * @vmf: The description of the fault | 824 | * @vmf: The description of the fault |
825 | * @get_block: The filesystem method used to translate file offsets to blocks | 825 | * @get_block: The filesystem method used to translate file offsets to blocks |
826 | * | 826 | * |
827 | * When a page fault occurs, filesystems may call this helper in their | 827 | * When a page fault occurs, filesystems may call this helper in their |
828 | * fault handler for DAX files. __dax_fault() assumes the caller has done all | 828 | * fault handler for DAX files. dax_fault() assumes the caller has done all |
829 | * the necessary locking for the page fault to proceed successfully. | 829 | * the necessary locking for the page fault to proceed successfully. |
830 | */ | 830 | */ |
831 | int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | 831 | int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, |
832 | get_block_t get_block) | 832 | get_block_t get_block) |
833 | { | 833 | { |
834 | struct file *file = vma->vm_file; | 834 | struct file *file = vma->vm_file; |
@@ -913,33 +913,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
913 | return VM_FAULT_SIGBUS | major; | 913 | return VM_FAULT_SIGBUS | major; |
914 | return VM_FAULT_NOPAGE | major; | 914 | return VM_FAULT_NOPAGE | major; |
915 | } | 915 | } |
916 | EXPORT_SYMBOL(__dax_fault); | ||
917 | |||
918 | /** | ||
919 | * dax_fault - handle a page fault on a DAX file | ||
920 | * @vma: The virtual memory area where the fault occurred | ||
921 | * @vmf: The description of the fault | ||
922 | * @get_block: The filesystem method used to translate file offsets to blocks | ||
923 | * | ||
924 | * When a page fault occurs, filesystems may call this helper in their | ||
925 | * fault handler for DAX files. | ||
926 | */ | ||
927 | int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | ||
928 | get_block_t get_block) | ||
929 | { | ||
930 | int result; | ||
931 | struct super_block *sb = file_inode(vma->vm_file)->i_sb; | ||
932 | |||
933 | if (vmf->flags & FAULT_FLAG_WRITE) { | ||
934 | sb_start_pagefault(sb); | ||
935 | file_update_time(vma->vm_file); | ||
936 | } | ||
937 | result = __dax_fault(vma, vmf, get_block); | ||
938 | if (vmf->flags & FAULT_FLAG_WRITE) | ||
939 | sb_end_pagefault(sb); | ||
940 | |||
941 | return result; | ||
942 | } | ||
943 | EXPORT_SYMBOL_GPL(dax_fault); | 916 | EXPORT_SYMBOL_GPL(dax_fault); |
944 | 917 | ||
945 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) | 918 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) |
@@ -967,7 +940,16 @@ static void __dax_dbg(struct buffer_head *bh, unsigned long address, | |||
967 | 940 | ||
968 | #define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") | 941 | #define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") |
969 | 942 | ||
970 | int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, | 943 | /** |
944 | * dax_pmd_fault - handle a PMD fault on a DAX file | ||
945 | * @vma: The virtual memory area where the fault occurred | ||
946 | * @vmf: The description of the fault | ||
947 | * @get_block: The filesystem method used to translate file offsets to blocks | ||
948 | * | ||
949 | * When a page fault occurs, filesystems may call this helper in their | ||
950 | * pmd_fault handler for DAX files. | ||
951 | */ | ||
952 | int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, | ||
971 | pmd_t *pmd, unsigned int flags, get_block_t get_block) | 953 | pmd_t *pmd, unsigned int flags, get_block_t get_block) |
972 | { | 954 | { |
973 | struct file *file = vma->vm_file; | 955 | struct file *file = vma->vm_file; |
@@ -1119,7 +1101,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, | |||
1119 | * | 1101 | * |
1120 | * The PMD path doesn't have an equivalent to | 1102 | * The PMD path doesn't have an equivalent to |
1121 | * dax_pfn_mkwrite(), though, so for a read followed by a | 1103 | * dax_pfn_mkwrite(), though, so for a read followed by a |
1122 | * write we traverse all the way through __dax_pmd_fault() | 1104 | * write we traverse all the way through dax_pmd_fault() |
1123 | * twice. This means we can just skip inserting a radix tree | 1105 | * twice. This means we can just skip inserting a radix tree |
1124 | * entry completely on the initial read and just wait until | 1106 | * entry completely on the initial read and just wait until |
1125 | * the write to insert a dirty entry. | 1107 | * the write to insert a dirty entry. |
@@ -1148,33 +1130,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, | |||
1148 | result = VM_FAULT_FALLBACK; | 1130 | result = VM_FAULT_FALLBACK; |
1149 | goto out; | 1131 | goto out; |
1150 | } | 1132 | } |
1151 | EXPORT_SYMBOL_GPL(__dax_pmd_fault); | ||
1152 | |||
1153 | /** | ||
1154 | * dax_pmd_fault - handle a PMD fault on a DAX file | ||
1155 | * @vma: The virtual memory area where the fault occurred | ||
1156 | * @vmf: The description of the fault | ||
1157 | * @get_block: The filesystem method used to translate file offsets to blocks | ||
1158 | * | ||
1159 | * When a page fault occurs, filesystems may call this helper in their | ||
1160 | * pmd_fault handler for DAX files. | ||
1161 | */ | ||
1162 | int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, | ||
1163 | pmd_t *pmd, unsigned int flags, get_block_t get_block) | ||
1164 | { | ||
1165 | int result; | ||
1166 | struct super_block *sb = file_inode(vma->vm_file)->i_sb; | ||
1167 | |||
1168 | if (flags & FAULT_FLAG_WRITE) { | ||
1169 | sb_start_pagefault(sb); | ||
1170 | file_update_time(vma->vm_file); | ||
1171 | } | ||
1172 | result = __dax_pmd_fault(vma, address, pmd, flags, get_block); | ||
1173 | if (flags & FAULT_FLAG_WRITE) | ||
1174 | sb_end_pagefault(sb); | ||
1175 | |||
1176 | return result; | ||
1177 | } | ||
1178 | EXPORT_SYMBOL_GPL(dax_pmd_fault); | 1133 | EXPORT_SYMBOL_GPL(dax_pmd_fault); |
1179 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 1134 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
1180 | 1135 | ||
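
With the wrappers gone, dax_fault() and dax_pmd_fault() keep only the core fault path; the pagefault bracketing that the removed wrappers did (sb_start_pagefault, file_update_time, then sb_end_pagefault around the call) becomes the filesystem caller's responsibility, as the ext2 and ext4 hunks below show. A minimal sketch of that "caller brackets the core helper" shape with stand-in functions:

#include <stdio.h>

static void start_pagefault(void) { puts("start accounting"); }
static void end_pagefault(void)   { puts("end accounting");   }
static int  core_fault(void)      { return 0; }   /* stand-in for dax_fault() */

/* Filesystem fault handler: does its own accounting, then calls the core. */
static int fs_dax_fault(int write)
{
        int ret;

        if (write)
                start_pagefault();
        ret = core_fault();
        if (write)
                end_pagefault();
        return ret;
}

int main(void)
{
        printf("fault: %d\n", fs_dax_fault(1));
        return 0;
}
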
diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 868c02317b05..5efeefe17abb 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c | |||
@@ -51,7 +51,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
51 | } | 51 | } |
52 | down_read(&ei->dax_sem); | 52 | down_read(&ei->dax_sem); |
53 | 53 | ||
54 | ret = __dax_fault(vma, vmf, ext2_get_block); | 54 | ret = dax_fault(vma, vmf, ext2_get_block); |
55 | 55 | ||
56 | up_read(&ei->dax_sem); | 56 | up_read(&ei->dax_sem); |
57 | if (vmf->flags & FAULT_FLAG_WRITE) | 57 | if (vmf->flags & FAULT_FLAG_WRITE) |
@@ -72,7 +72,7 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, | |||
72 | } | 72 | } |
73 | down_read(&ei->dax_sem); | 73 | down_read(&ei->dax_sem); |
74 | 74 | ||
75 | ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); | 75 | ret = dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); |
76 | 76 | ||
77 | up_read(&ei->dax_sem); | 77 | up_read(&ei->dax_sem); |
78 | if (flags & FAULT_FLAG_WRITE) | 78 | if (flags & FAULT_FLAG_WRITE) |
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4f615cdd22ca..261ac3734c58 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
@@ -202,7 +202,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
202 | if (IS_ERR(handle)) | 202 | if (IS_ERR(handle)) |
203 | result = VM_FAULT_SIGBUS; | 203 | result = VM_FAULT_SIGBUS; |
204 | else | 204 | else |
205 | result = __dax_fault(vma, vmf, ext4_dax_get_block); | 205 | result = dax_fault(vma, vmf, ext4_dax_get_block); |
206 | 206 | ||
207 | if (write) { | 207 | if (write) { |
208 | if (!IS_ERR(handle)) | 208 | if (!IS_ERR(handle)) |
@@ -237,7 +237,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, | |||
237 | if (IS_ERR(handle)) | 237 | if (IS_ERR(handle)) |
238 | result = VM_FAULT_SIGBUS; | 238 | result = VM_FAULT_SIGBUS; |
239 | else | 239 | else |
240 | result = __dax_pmd_fault(vma, addr, pmd, flags, | 240 | result = dax_pmd_fault(vma, addr, pmd, flags, |
241 | ext4_dax_get_block); | 241 | ext4_dax_get_block); |
242 | 242 | ||
243 | if (write) { | 243 | if (write) { |
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index bfc7f4d30643..a81b829d56de 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c | |||
@@ -130,7 +130,7 @@ int ext4_mpage_readpages(struct address_space *mapping, | |||
130 | page = list_entry(pages->prev, struct page, lru); | 130 | page = list_entry(pages->prev, struct page, lru); |
131 | list_del(&page->lru); | 131 | list_del(&page->lru); |
132 | if (add_to_page_cache_lru(page, mapping, page->index, | 132 | if (add_to_page_cache_lru(page, mapping, page->index, |
133 | mapping_gfp_constraint(mapping, GFP_KERNEL))) | 133 | readahead_gfp_mask(mapping))) |
134 | goto next_page; | 134 | goto next_page; |
135 | } | 135 | } |
136 | 136 | ||
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8769e8349dff..ded224518978 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c | |||
@@ -1002,7 +1002,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping, | |||
1002 | page = list_entry(pages->prev, struct page, lru); | 1002 | page = list_entry(pages->prev, struct page, lru); |
1003 | list_del(&page->lru); | 1003 | list_del(&page->lru); |
1004 | if (add_to_page_cache_lru(page, mapping, | 1004 | if (add_to_page_cache_lru(page, mapping, |
1005 | page->index, GFP_KERNEL)) | 1005 | page->index, |
1006 | readahead_gfp_mask(mapping))) | ||
1006 | goto next_page; | 1007 | goto next_page; |
1007 | } | 1008 | } |
1008 | 1009 | ||
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index fe7e83a45eff..6f9c9f6f5157 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -981,6 +981,42 @@ void inode_io_list_del(struct inode *inode) | |||
981 | } | 981 | } |
982 | 982 | ||
983 | /* | 983 | /* |
984 | * mark an inode as under writeback on the sb | ||
985 | */ | ||
986 | void sb_mark_inode_writeback(struct inode *inode) | ||
987 | { | ||
988 | struct super_block *sb = inode->i_sb; | ||
989 | unsigned long flags; | ||
990 | |||
991 | if (list_empty(&inode->i_wb_list)) { | ||
992 | spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); | ||
993 | if (list_empty(&inode->i_wb_list)) { | ||
994 | list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb); | ||
995 | trace_sb_mark_inode_writeback(inode); | ||
996 | } | ||
997 | spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); | ||
998 | } | ||
999 | } | ||
1000 | |||
1001 | /* | ||
1002 | * clear an inode as under writeback on the sb | ||
1003 | */ | ||
1004 | void sb_clear_inode_writeback(struct inode *inode) | ||
1005 | { | ||
1006 | struct super_block *sb = inode->i_sb; | ||
1007 | unsigned long flags; | ||
1008 | |||
1009 | if (!list_empty(&inode->i_wb_list)) { | ||
1010 | spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); | ||
1011 | if (!list_empty(&inode->i_wb_list)) { | ||
1012 | list_del_init(&inode->i_wb_list); | ||
1013 | trace_sb_clear_inode_writeback(inode); | ||
1014 | } | ||
1015 | spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); | ||
1016 | } | ||
1017 | } | ||
1018 | |||
1019 | /* | ||
984 | * Redirty an inode: set its when-it-was dirtied timestamp and move it to the | 1020 | * Redirty an inode: set its when-it-was dirtied timestamp and move it to the |
985 | * furthest end of its superblock's dirty-inode list. | 1021 | * furthest end of its superblock's dirty-inode list. |
986 | * | 1022 | * |
@@ -2154,7 +2190,7 @@ EXPORT_SYMBOL(__mark_inode_dirty); | |||
2154 | */ | 2190 | */ |
2155 | static void wait_sb_inodes(struct super_block *sb) | 2191 | static void wait_sb_inodes(struct super_block *sb) |
2156 | { | 2192 | { |
2157 | struct inode *inode, *old_inode = NULL; | 2193 | LIST_HEAD(sync_list); |
2158 | 2194 | ||
2159 | /* | 2195 | /* |
2160 | * We need to be protected against the filesystem going from | 2196 | * We need to be protected against the filesystem going from |
@@ -2163,38 +2199,60 @@ static void wait_sb_inodes(struct super_block *sb) | |||
2163 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | 2199 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); |
2164 | 2200 | ||
2165 | mutex_lock(&sb->s_sync_lock); | 2201 | mutex_lock(&sb->s_sync_lock); |
2166 | spin_lock(&sb->s_inode_list_lock); | ||
2167 | 2202 | ||
2168 | /* | 2203 | /* |
2169 | * Data integrity sync. Must wait for all pages under writeback, | 2204 | * Splice the writeback list onto a temporary list to avoid waiting on |
2170 | * because there may have been pages dirtied before our sync | 2205 | * inodes that have started writeback after this point. |
2171 | * call, but which had writeout started before we write it out. | 2206 | * |
2172 | * In which case, the inode may not be on the dirty list, but | 2207 | * Use rcu_read_lock() to keep the inodes around until we have a |
2173 | * we still have to wait for that writeout. | 2208 | * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as |
2209 | * the local list because inodes can be dropped from either by writeback | ||
2210 | * completion. | ||
2211 | */ | ||
2212 | rcu_read_lock(); | ||
2213 | spin_lock_irq(&sb->s_inode_wblist_lock); | ||
2214 | list_splice_init(&sb->s_inodes_wb, &sync_list); | ||
2215 | |||
2216 | /* | ||
2217 | * Data integrity sync. Must wait for all pages under writeback, because | ||
2218 | * there may have been pages dirtied before our sync call, but which had | ||
2219 | * writeout started before we write it out. In which case, the inode | ||
2220 | * may not be on the dirty list, but we still have to wait for that | ||
2221 | * writeout. | ||
2174 | */ | 2222 | */ |
2175 | list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { | 2223 | while (!list_empty(&sync_list)) { |
2224 | struct inode *inode = list_first_entry(&sync_list, struct inode, | ||
2225 | i_wb_list); | ||
2176 | struct address_space *mapping = inode->i_mapping; | 2226 | struct address_space *mapping = inode->i_mapping; |
2177 | 2227 | ||
2228 | /* | ||
2229 | * Move each inode back to the wb list before we drop the lock | ||
2230 | * to preserve consistency between i_wb_list and the mapping | ||
2231 | * writeback tag. Writeback completion is responsible to remove | ||
2232 | * the inode from either list once the writeback tag is cleared. | ||
2233 | */ | ||
2234 | list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb); | ||
2235 | |||
2236 | /* | ||
2237 | * The mapping can appear untagged while still on-list since we | ||
2238 | * do not have the mapping lock. Skip it here, wb completion | ||
2239 | * will remove it. | ||
2240 | */ | ||
2241 | if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) | ||
2242 | continue; | ||
2243 | |||
2244 | spin_unlock_irq(&sb->s_inode_wblist_lock); | ||
2245 | |||
2178 | spin_lock(&inode->i_lock); | 2246 | spin_lock(&inode->i_lock); |
2179 | if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || | 2247 | if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { |
2180 | (mapping->nrpages == 0)) { | ||
2181 | spin_unlock(&inode->i_lock); | 2248 | spin_unlock(&inode->i_lock); |
2249 | |||
2250 | spin_lock_irq(&sb->s_inode_wblist_lock); | ||
2182 | continue; | 2251 | continue; |
2183 | } | 2252 | } |
2184 | __iget(inode); | 2253 | __iget(inode); |
2185 | spin_unlock(&inode->i_lock); | 2254 | spin_unlock(&inode->i_lock); |
2186 | spin_unlock(&sb->s_inode_list_lock); | 2255 | rcu_read_unlock(); |
2187 | |||
2188 | /* | ||
2189 | * We hold a reference to 'inode' so it couldn't have been | ||
2190 | * removed from s_inodes list while we dropped the | ||
2191 | * s_inode_list_lock. We cannot iput the inode now as we can | ||
2192 | * be holding the last reference and we cannot iput it under | ||
2193 | * s_inode_list_lock. So we keep the reference and iput it | ||
2194 | * later. | ||
2195 | */ | ||
2196 | iput(old_inode); | ||
2197 | old_inode = inode; | ||
2198 | 2256 | ||
2199 | /* | 2257 | /* |
2200 | * We keep the error status of individual mapping so that | 2258 | * We keep the error status of individual mapping so that |
@@ -2205,10 +2263,13 @@ static void wait_sb_inodes(struct super_block *sb) | |||
2205 | 2263 | ||
2206 | cond_resched(); | 2264 | cond_resched(); |
2207 | 2265 | ||
2208 | spin_lock(&sb->s_inode_list_lock); | 2266 | iput(inode); |
2267 | |||
2268 | rcu_read_lock(); | ||
2269 | spin_lock_irq(&sb->s_inode_wblist_lock); | ||
2209 | } | 2270 | } |
2210 | spin_unlock(&sb->s_inode_list_lock); | 2271 | spin_unlock_irq(&sb->s_inode_wblist_lock); |
2211 | iput(old_inode); | 2272 | rcu_read_unlock(); |
2212 | mutex_unlock(&sb->s_sync_lock); | 2273 | mutex_unlock(&sb->s_sync_lock); |
2213 | } | 2274 | } |
2214 | 2275 | ||
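
sb_mark_inode_writeback()/sb_clear_inode_writeback() above use the check-lock-recheck pattern so the common case (already on, or already off, the list) never takes the spinlock, and wait_sb_inodes() now only walks inodes with writeback actually in flight by splicing s_inodes_wb onto a private list. A simplified pthread sketch of the check-lock-recheck idea, using a boolean where the kernel uses list_empty() on i_wb_list (build with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool on_list;   /* stand-in for !list_empty(&inode->i_wb_list) */

static void mark_writeback(void)
{
        if (!on_list) {                     /* unlocked fast path */
                pthread_mutex_lock(&lock);
                if (!on_list)               /* recheck under the lock */
                        on_list = true;
                pthread_mutex_unlock(&lock);
        }
}

static void clear_writeback(void)
{
        if (on_list) {
                pthread_mutex_lock(&lock);
                if (on_list)
                        on_list = false;
                pthread_mutex_unlock(&lock);
        }
}

int main(void)
{
        mark_writeback();
        printf("on_list=%d\n", on_list);
        clear_writeback();
        printf("on_list=%d\n", on_list);
        return 0;
}
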
diff --git a/fs/inode.c b/fs/inode.c index 4ccbc21b30ce..e171f7b5f9e4 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -365,6 +365,7 @@ void inode_init_once(struct inode *inode) | |||
365 | INIT_HLIST_NODE(&inode->i_hash); | 365 | INIT_HLIST_NODE(&inode->i_hash); |
366 | INIT_LIST_HEAD(&inode->i_devices); | 366 | INIT_LIST_HEAD(&inode->i_devices); |
367 | INIT_LIST_HEAD(&inode->i_io_list); | 367 | INIT_LIST_HEAD(&inode->i_io_list); |
368 | INIT_LIST_HEAD(&inode->i_wb_list); | ||
368 | INIT_LIST_HEAD(&inode->i_lru); | 369 | INIT_LIST_HEAD(&inode->i_lru); |
369 | address_space_init_once(&inode->i_data); | 370 | address_space_init_once(&inode->i_data); |
370 | i_size_ordered_init(inode); | 371 | i_size_ordered_init(inode); |
@@ -507,6 +508,7 @@ void clear_inode(struct inode *inode) | |||
507 | BUG_ON(!list_empty(&inode->i_data.private_list)); | 508 | BUG_ON(!list_empty(&inode->i_data.private_list)); |
508 | BUG_ON(!(inode->i_state & I_FREEING)); | 509 | BUG_ON(!(inode->i_state & I_FREEING)); |
509 | BUG_ON(inode->i_state & I_CLEAR); | 510 | BUG_ON(inode->i_state & I_CLEAR); |
511 | BUG_ON(!list_empty(&inode->i_wb_list)); | ||
510 | /* don't need i_lock here, no concurrent mods to i_state */ | 512 | /* don't need i_lock here, no concurrent mods to i_state */ |
511 | inode->i_state = I_FREEING | I_CLEAR; | 513 | inode->i_state = I_FREEING | I_CLEAR; |
512 | } | 514 | } |
diff --git a/fs/mpage.c b/fs/mpage.c index 37b28280ad04..2ca1f39c8cba 100644 --- a/fs/mpage.c +++ b/fs/mpage.c | |||
@@ -72,6 +72,8 @@ mpage_alloc(struct block_device *bdev, | |||
72 | { | 72 | { |
73 | struct bio *bio; | 73 | struct bio *bio; |
74 | 74 | ||
75 | /* Restrict the given (page cache) mask for slab allocations */ | ||
76 | gfp_flags &= GFP_KERNEL; | ||
75 | bio = bio_alloc(gfp_flags, nr_vecs); | 77 | bio = bio_alloc(gfp_flags, nr_vecs); |
76 | 78 | ||
77 | if (bio == NULL && (current->flags & PF_MEMALLOC)) { | 79 | if (bio == NULL && (current->flags & PF_MEMALLOC)) { |
@@ -363,7 +365,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, | |||
363 | sector_t last_block_in_bio = 0; | 365 | sector_t last_block_in_bio = 0; |
364 | struct buffer_head map_bh; | 366 | struct buffer_head map_bh; |
365 | unsigned long first_logical_block = 0; | 367 | unsigned long first_logical_block = 0; |
366 | gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); | 368 | gfp_t gfp = readahead_gfp_mask(mapping); |
367 | 369 | ||
368 | map_bh.b_state = 0; | 370 | map_bh.b_state = 0; |
369 | map_bh.b_size = 0; | 371 | map_bh.b_size = 0; |
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 4238eb28889f..1d67fcbf7160 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
@@ -1618,16 +1618,12 @@ static void o2net_start_connect(struct work_struct *work) | |||
1618 | 1618 | ||
1619 | /* watch for racing with tearing a node down */ | 1619 | /* watch for racing with tearing a node down */ |
1620 | node = o2nm_get_node_by_num(o2net_num_from_nn(nn)); | 1620 | node = o2nm_get_node_by_num(o2net_num_from_nn(nn)); |
1621 | if (node == NULL) { | 1621 | if (node == NULL) |
1622 | ret = 0; | ||
1623 | goto out; | 1622 | goto out; |
1624 | } | ||
1625 | 1623 | ||
1626 | mynode = o2nm_get_node_by_num(o2nm_this_node()); | 1624 | mynode = o2nm_get_node_by_num(o2nm_this_node()); |
1627 | if (mynode == NULL) { | 1625 | if (mynode == NULL) |
1628 | ret = 0; | ||
1629 | goto out; | 1626 | goto out; |
1630 | } | ||
1631 | 1627 | ||
1632 | spin_lock(&nn->nn_lock); | 1628 | spin_lock(&nn->nn_lock); |
1633 | /* | 1629 | /* |
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 825136070d2c..e7b760deefae 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c | |||
@@ -347,26 +347,6 @@ static struct dentry *dlm_debugfs_root; | |||
347 | #define DLM_DEBUGFS_PURGE_LIST "purge_list" | 347 | #define DLM_DEBUGFS_PURGE_LIST "purge_list" |
348 | 348 | ||
349 | /* begin - utils funcs */ | 349 | /* begin - utils funcs */ |
350 | static void dlm_debug_free(struct kref *kref) | ||
351 | { | ||
352 | struct dlm_debug_ctxt *dc; | ||
353 | |||
354 | dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt); | ||
355 | |||
356 | kfree(dc); | ||
357 | } | ||
358 | |||
359 | static void dlm_debug_put(struct dlm_debug_ctxt *dc) | ||
360 | { | ||
361 | if (dc) | ||
362 | kref_put(&dc->debug_refcnt, dlm_debug_free); | ||
363 | } | ||
364 | |||
365 | static void dlm_debug_get(struct dlm_debug_ctxt *dc) | ||
366 | { | ||
367 | kref_get(&dc->debug_refcnt); | ||
368 | } | ||
369 | |||
370 | static int debug_release(struct inode *inode, struct file *file) | 350 | static int debug_release(struct inode *inode, struct file *file) |
371 | { | 351 | { |
372 | free_page((unsigned long)file->private_data); | 352 | free_page((unsigned long)file->private_data); |
@@ -932,11 +912,9 @@ int dlm_debug_init(struct dlm_ctxt *dlm) | |||
932 | goto bail; | 912 | goto bail; |
933 | } | 913 | } |
934 | 914 | ||
935 | dlm_debug_get(dc); | ||
936 | return 0; | 915 | return 0; |
937 | 916 | ||
938 | bail: | 917 | bail: |
939 | dlm_debug_shutdown(dlm); | ||
940 | return -ENOMEM; | 918 | return -ENOMEM; |
941 | } | 919 | } |
942 | 920 | ||
@@ -949,7 +927,8 @@ void dlm_debug_shutdown(struct dlm_ctxt *dlm) | |||
949 | debugfs_remove(dc->debug_mle_dentry); | 927 | debugfs_remove(dc->debug_mle_dentry); |
950 | debugfs_remove(dc->debug_lockres_dentry); | 928 | debugfs_remove(dc->debug_lockres_dentry); |
951 | debugfs_remove(dc->debug_state_dentry); | 929 | debugfs_remove(dc->debug_state_dentry); |
952 | dlm_debug_put(dc); | 930 | kfree(dc); |
931 | dc = NULL; | ||
953 | } | 932 | } |
954 | } | 933 | } |
955 | 934 | ||
@@ -969,7 +948,6 @@ int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) | |||
969 | mlog_errno(-ENOMEM); | 948 | mlog_errno(-ENOMEM); |
970 | goto bail; | 949 | goto bail; |
971 | } | 950 | } |
972 | kref_init(&dlm->dlm_debug_ctxt->debug_refcnt); | ||
973 | 951 | ||
974 | return 0; | 952 | return 0; |
975 | bail: | 953 | bail: |
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index 1f27c4812d1a..5ced5482e7d3 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h | |||
@@ -30,7 +30,6 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle); | |||
30 | #ifdef CONFIG_DEBUG_FS | 30 | #ifdef CONFIG_DEBUG_FS |
31 | 31 | ||
32 | struct dlm_debug_ctxt { | 32 | struct dlm_debug_ctxt { |
33 | struct kref debug_refcnt; | ||
34 | struct dentry *debug_state_dentry; | 33 | struct dentry *debug_state_dentry; |
35 | struct dentry *debug_lockres_dentry; | 34 | struct dentry *debug_lockres_dentry; |
36 | struct dentry *debug_mle_dentry; | 35 | struct dentry *debug_mle_dentry; |
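
The dlm debug context only ever had one owner, so the kref plus get/put/free helpers collapse into a plain kfree at shutdown and the struct field disappears in the dlmdebug.h hunk above. A tiny sketch of the resulting single-owner lifetime (types are illustrative):

#include <stdio.h>
#include <stdlib.h>

struct debug_ctxt { int dummy; };

/* Single owner: allocate at init, free at shutdown, no refcount needed. */
static struct debug_ctxt *debug_init(void)
{
        return calloc(1, sizeof(struct debug_ctxt));
}

static void debug_shutdown(struct debug_ctxt **dc)
{
        free(*dc);
        *dc = NULL;   /* clear the owner's pointer so it cannot dangle */
}

int main(void)
{
        struct debug_ctxt *dc = debug_init();

        printf("ctxt %s\n", dc ? "allocated" : "missing");
        debug_shutdown(&dc);
        printf("ctxt %s\n", dc ? "leaked" : "freed");
        return 0;
}
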
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1eaa9100c889..83d576f6a287 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -1635,7 +1635,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode) | |||
1635 | int ret; | 1635 | int ret; |
1636 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1636 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1637 | 1637 | ||
1638 | BUG_ON(!inode); | ||
1639 | BUG_ON(!ocfs2_inode_is_new(inode)); | 1638 | BUG_ON(!ocfs2_inode_is_new(inode)); |
1640 | 1639 | ||
1641 | mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); | 1640 | mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); |
@@ -1665,10 +1664,8 @@ int ocfs2_create_new_inode_locks(struct inode *inode) | |||
1665 | } | 1664 | } |
1666 | 1665 | ||
1667 | ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); | 1666 | ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0); |
1668 | if (ret) { | 1667 | if (ret) |
1669 | mlog_errno(ret); | 1668 | mlog_errno(ret); |
1670 | goto bail; | ||
1671 | } | ||
1672 | 1669 | ||
1673 | bail: | 1670 | bail: |
1674 | return ret; | 1671 | return ret; |
@@ -1680,8 +1677,6 @@ int ocfs2_rw_lock(struct inode *inode, int write) | |||
1680 | struct ocfs2_lock_res *lockres; | 1677 | struct ocfs2_lock_res *lockres; |
1681 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1678 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1682 | 1679 | ||
1683 | BUG_ON(!inode); | ||
1684 | |||
1685 | mlog(0, "inode %llu take %s RW lock\n", | 1680 | mlog(0, "inode %llu take %s RW lock\n", |
1686 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 1681 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
1687 | write ? "EXMODE" : "PRMODE"); | 1682 | write ? "EXMODE" : "PRMODE"); |
@@ -1724,8 +1719,6 @@ int ocfs2_open_lock(struct inode *inode) | |||
1724 | struct ocfs2_lock_res *lockres; | 1719 | struct ocfs2_lock_res *lockres; |
1725 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1720 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1726 | 1721 | ||
1727 | BUG_ON(!inode); | ||
1728 | |||
1729 | mlog(0, "inode %llu take PRMODE open lock\n", | 1722 | mlog(0, "inode %llu take PRMODE open lock\n", |
1730 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 1723 | (unsigned long long)OCFS2_I(inode)->ip_blkno); |
1731 | 1724 | ||
@@ -1749,8 +1742,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write) | |||
1749 | struct ocfs2_lock_res *lockres; | 1742 | struct ocfs2_lock_res *lockres; |
1750 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1743 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1751 | 1744 | ||
1752 | BUG_ON(!inode); | ||
1753 | |||
1754 | mlog(0, "inode %llu try to take %s open lock\n", | 1745 | mlog(0, "inode %llu try to take %s open lock\n", |
1755 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 1746 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
1756 | write ? "EXMODE" : "PRMODE"); | 1747 | write ? "EXMODE" : "PRMODE"); |
@@ -2328,8 +2319,6 @@ int ocfs2_inode_lock_full_nested(struct inode *inode, | |||
2328 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 2319 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
2329 | struct buffer_head *local_bh = NULL; | 2320 | struct buffer_head *local_bh = NULL; |
2330 | 2321 | ||
2331 | BUG_ON(!inode); | ||
2332 | |||
2333 | mlog(0, "inode %llu, take %s META lock\n", | 2322 | mlog(0, "inode %llu, take %s META lock\n", |
2334 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 2323 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
2335 | ex ? "EXMODE" : "PRMODE"); | 2324 | ex ? "EXMODE" : "PRMODE"); |
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index d8f3fc8d2551..50cc55047443 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
@@ -145,22 +145,15 @@ int ocfs2_drop_inode(struct inode *inode); | |||
145 | struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); | 145 | struct inode *ocfs2_ilookup(struct super_block *sb, u64 feoff); |
146 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, | 146 | struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, |
147 | int sysfile_type); | 147 | int sysfile_type); |
148 | int ocfs2_inode_init_private(struct inode *inode); | ||
149 | int ocfs2_inode_revalidate(struct dentry *dentry); | 148 | int ocfs2_inode_revalidate(struct dentry *dentry); |
150 | void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, | 149 | void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, |
151 | int create_ino); | 150 | int create_ino); |
152 | void ocfs2_read_inode(struct inode *inode); | ||
153 | void ocfs2_read_inode2(struct inode *inode, void *opaque); | ||
154 | ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, | ||
155 | size_t size, loff_t *offp); | ||
156 | void ocfs2_sync_blockdev(struct super_block *sb); | 151 | void ocfs2_sync_blockdev(struct super_block *sb); |
157 | void ocfs2_refresh_inode(struct inode *inode, | 152 | void ocfs2_refresh_inode(struct inode *inode, |
158 | struct ocfs2_dinode *fe); | 153 | struct ocfs2_dinode *fe); |
159 | int ocfs2_mark_inode_dirty(handle_t *handle, | 154 | int ocfs2_mark_inode_dirty(handle_t *handle, |
160 | struct inode *inode, | 155 | struct inode *inode, |
161 | struct buffer_head *bh); | 156 | struct buffer_head *bh); |
162 | struct buffer_head *ocfs2_bread(struct inode *inode, | ||
163 | int block, int *err, int reada); | ||
164 | 157 | ||
165 | void ocfs2_set_inode_flags(struct inode *inode); | 158 | void ocfs2_set_inode_flags(struct inode *inode); |
166 | void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); | 159 | void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); |
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index e607419cdfa4..a244f14c6b87 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -1159,10 +1159,8 @@ static int ocfs2_force_read_journal(struct inode *inode) | |||
1159 | int status = 0; | 1159 | int status = 0; |
1160 | int i; | 1160 | int i; |
1161 | u64 v_blkno, p_blkno, p_blocks, num_blocks; | 1161 | u64 v_blkno, p_blkno, p_blocks, num_blocks; |
1162 | #define CONCURRENT_JOURNAL_FILL 32ULL | 1162 | struct buffer_head *bh = NULL; |
1163 | struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL]; | 1163 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1164 | |||
1165 | memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); | ||
1166 | 1164 | ||
1167 | num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); | 1165 | num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); |
1168 | v_blkno = 0; | 1166 | v_blkno = 0; |
@@ -1174,29 +1172,32 @@ static int ocfs2_force_read_journal(struct inode *inode) | |||
1174 | goto bail; | 1172 | goto bail; |
1175 | } | 1173 | } |
1176 | 1174 | ||
1177 | if (p_blocks > CONCURRENT_JOURNAL_FILL) | 1175 | for (i = 0; i < p_blocks; i++, p_blkno++) { |
1178 | p_blocks = CONCURRENT_JOURNAL_FILL; | 1176 | bh = __find_get_block(osb->sb->s_bdev, p_blkno, |
1179 | 1177 | osb->sb->s_blocksize); | |
1180 | /* We are reading journal data which should not | 1178 | /* block not cached. */ |
1181 | * be put in the uptodate cache */ | 1179 | if (!bh) |
1182 | status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb), | 1180 | continue; |
1183 | p_blkno, p_blocks, bhs); | 1181 | |
1184 | if (status < 0) { | 1182 | brelse(bh); |
1185 | mlog_errno(status); | 1183 | bh = NULL; |
1186 | goto bail; | 1184 | /* We are reading journal data which should not |
1187 | } | 1185 | * be put in the uptodate cache. |
1186 | */ | ||
1187 | status = ocfs2_read_blocks_sync(osb, p_blkno, 1, &bh); | ||
1188 | if (status < 0) { | ||
1189 | mlog_errno(status); | ||
1190 | goto bail; | ||
1191 | } | ||
1188 | 1192 | ||
1189 | for(i = 0; i < p_blocks; i++) { | 1193 | brelse(bh); |
1190 | brelse(bhs[i]); | 1194 | bh = NULL; |
1191 | bhs[i] = NULL; | ||
1192 | } | 1195 | } |
1193 | 1196 | ||
1194 | v_blkno += p_blocks; | 1197 | v_blkno += p_blocks; |
1195 | } | 1198 | } |
1196 | 1199 | ||
1197 | bail: | 1200 | bail: |
1198 | for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) | ||
1199 | brelse(bhs[i]); | ||
1200 | return status; | 1201 | return status; |
1201 | } | 1202 | } |
1202 | 1203 | ||
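
ocfs2_force_read_journal() no longer fills a 32-entry buffer_head array on the stack: it walks the physical blocks one at a time and, via __find_get_block(), synchronously re-reads only the blocks already present in the block cache, so any stale cached copy is refreshed and uncached blocks are simply skipped. A standalone sketch of that loop with stub helpers:

#include <stdbool.h>
#include <stdio.h>

/* Stub cache lookup: pretend even-numbered blocks are currently cached. */
static bool block_cached(unsigned long blkno)
{
        return (blkno % 2) == 0;
}

/* Stub synchronous read that bypasses any "already up to date" shortcut. */
static int read_block_sync(unsigned long blkno)
{
        printf("re-reading block %lu\n", blkno);
        return 0;
}

static int refresh_cached_range(unsigned long start, unsigned long count)
{
        unsigned long blkno;
        int status;

        for (blkno = start; blkno < start + count; blkno++) {
                if (!block_cached(blkno))   /* not cached: nothing stale here */
                        continue;
                status = read_block_sync(blkno);
                if (status < 0)
                        return status;
        }
        return 0;
}

int main(void)
{
        return refresh_cached_range(100, 6);
}
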
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 13219ed73e1d..52c07346bea3 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c | |||
@@ -735,8 +735,6 @@ static void __exit ocfs2_stack_glue_exit(void) | |||
735 | { | 735 | { |
736 | memset(&locking_max_version, 0, | 736 | memset(&locking_max_version, 0, |
737 | sizeof(struct ocfs2_protocol_version)); | 737 | sizeof(struct ocfs2_protocol_version)); |
738 | locking_max_version.pv_major = 0; | ||
739 | locking_max_version.pv_minor = 0; | ||
740 | ocfs2_sysfs_exit(); | 738 | ocfs2_sysfs_exit(); |
741 | if (ocfs2_table_header) | 739 | if (ocfs2_table_header) |
742 | unregister_sysctl_table(ocfs2_table_header); | 740 | unregister_sysctl_table(ocfs2_table_header); |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 3971146228d3..603b28d6f008 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -2072,7 +2072,6 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
2072 | osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); | 2072 | osb->osb_dx_seed[3] = le32_to_cpu(di->id2.i_super.s_uuid_hash); |
2073 | 2073 | ||
2074 | osb->sb = sb; | 2074 | osb->sb = sb; |
2075 | /* Save off for ocfs2_rw_direct */ | ||
2076 | osb->s_sectsize_bits = blksize_bits(sector_size); | 2075 | osb->s_sectsize_bits = blksize_bits(sector_size); |
2077 | BUG_ON(!osb->s_sectsize_bits); | 2076 | BUG_ON(!osb->s_sectsize_bits); |
2078 | 2077 | ||
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index a44caabb0fc2..8f2fa94cc4f6 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c | |||
@@ -80,7 +80,7 @@ static int orangefs_readpages(struct file *file, | |||
80 | if (!add_to_page_cache(page, | 80 | if (!add_to_page_cache(page, |
81 | mapping, | 81 | mapping, |
82 | page->index, | 82 | page->index, |
83 | GFP_KERNEL)) { | 83 | readahead_gfp_mask(mapping))) { |
84 | ret = read_one_page(page); | 84 | ret = read_one_page(page); |
85 | gossip_debug(GOSSIP_INODE_DEBUG, | 85 | gossip_debug(GOSSIP_INODE_DEBUG, |
86 | "failure adding page to cache, read_one_page returned: %d\n", | 86 | "failure adding page to cache, read_one_page returned: %d\n", |
diff --git a/fs/pipe.c b/fs/pipe.c --- a/fs/pipe.c +++ b/fs/pipe.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/audit.h> | 21 | #include <linux/audit.h> |
22 | #include <linux/syscalls.h> | 22 | #include <linux/syscalls.h> |
23 | #include <linux/fcntl.h> | 23 | #include <linux/fcntl.h> |
24 | #include <linux/memcontrol.h> | ||
24 | 25 | ||
25 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
26 | #include <asm/ioctls.h> | 27 | #include <asm/ioctls.h> |
@@ -137,6 +138,22 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, | |||
137 | put_page(page); | 138 | put_page(page); |
138 | } | 139 | } |
139 | 140 | ||
141 | static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, | ||
142 | struct pipe_buffer *buf) | ||
143 | { | ||
144 | struct page *page = buf->page; | ||
145 | |||
146 | if (page_count(page) == 1) { | ||
147 | if (memcg_kmem_enabled()) { | ||
148 | memcg_kmem_uncharge(page, 0); | ||
149 | __ClearPageKmemcg(page); | ||
150 | } | ||
151 | __SetPageLocked(page); | ||
152 | return 0; | ||
153 | } | ||
154 | return 1; | ||
155 | } | ||
156 | |||
140 | /** | 157 | /** |
141 | * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer | 158 | * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer |
142 | * @pipe: the pipe that the buffer belongs to | 159 | * @pipe: the pipe that the buffer belongs to |
@@ -219,7 +236,7 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { | |||
219 | .can_merge = 1, | 236 | .can_merge = 1, |
220 | .confirm = generic_pipe_buf_confirm, | 237 | .confirm = generic_pipe_buf_confirm, |
221 | .release = anon_pipe_buf_release, | 238 | .release = anon_pipe_buf_release, |
222 | .steal = generic_pipe_buf_steal, | 239 | .steal = anon_pipe_buf_steal, |
223 | .get = generic_pipe_buf_get, | 240 | .get = generic_pipe_buf_get, |
224 | }; | 241 | }; |
225 | 242 | ||
@@ -227,7 +244,7 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = { | |||
227 | .can_merge = 0, | 244 | .can_merge = 0, |
228 | .confirm = generic_pipe_buf_confirm, | 245 | .confirm = generic_pipe_buf_confirm, |
229 | .release = anon_pipe_buf_release, | 246 | .release = anon_pipe_buf_release, |
230 | .steal = generic_pipe_buf_steal, | 247 | .steal = anon_pipe_buf_steal, |
231 | .get = generic_pipe_buf_get, | 248 | .get = generic_pipe_buf_get, |
232 | }; | 249 | }; |
233 | 250 | ||
@@ -405,7 +422,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) | |||
405 | int copied; | 422 | int copied; |
406 | 423 | ||
407 | if (!page) { | 424 | if (!page) { |
408 | page = alloc_page(GFP_HIGHUSER); | 425 | page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); |
409 | if (unlikely(!page)) { | 426 | if (unlikely(!page)) { |
410 | ret = ret ? : -ENOMEM; | 427 | ret = ret ? : -ENOMEM; |
411 | break; | 428 | break; |
@@ -611,7 +628,7 @@ struct pipe_inode_info *alloc_pipe_info(void) | |||
611 | { | 628 | { |
612 | struct pipe_inode_info *pipe; | 629 | struct pipe_inode_info *pipe; |
613 | 630 | ||
614 | pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); | 631 | pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); |
615 | if (pipe) { | 632 | if (pipe) { |
616 | unsigned long pipe_bufs = PIPE_DEF_BUFFERS; | 633 | unsigned long pipe_bufs = PIPE_DEF_BUFFERS; |
617 | struct user_struct *user = get_current_user(); | 634 | struct user_struct *user = get_current_user(); |
@@ -619,7 +636,9 @@ struct pipe_inode_info *alloc_pipe_info(void) | |||
619 | if (!too_many_pipe_buffers_hard(user)) { | 636 | if (!too_many_pipe_buffers_hard(user)) { |
620 | if (too_many_pipe_buffers_soft(user)) | 637 | if (too_many_pipe_buffers_soft(user)) |
621 | pipe_bufs = 1; | 638 | pipe_bufs = 1; |
622 | pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL); | 639 | pipe->bufs = kcalloc(pipe_bufs, |
640 | sizeof(struct pipe_buffer), | ||
641 | GFP_KERNEL_ACCOUNT); | ||
623 | } | 642 | } |
624 | 643 | ||
625 | if (pipe->bufs) { | 644 | if (pipe->bufs) { |
@@ -1010,7 +1029,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) | |||
1010 | if (nr_pages < pipe->nrbufs) | 1029 | if (nr_pages < pipe->nrbufs) |
1011 | return -EBUSY; | 1030 | return -EBUSY; |
1012 | 1031 | ||
1013 | bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN); | 1032 | bufs = kcalloc(nr_pages, sizeof(*bufs), |
1033 | GFP_KERNEL_ACCOUNT | __GFP_NOWARN); | ||
1014 | if (unlikely(!bufs)) | 1034 | if (unlikely(!bufs)) |
1015 | return -ENOMEM; | 1035 | return -ENOMEM; |
1016 | 1036 | ||
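The fs/pipe.c hunks above charge pipe memory to the owner's memory cgroup: data pages get __GFP_ACCOUNT, metadata uses GFP_KERNEL_ACCOUNT (which is simply GFP_KERNEL | __GFP_ACCOUNT), and the new anon_pipe_buf_steal() uncharges a page before it is reused as anonymous memory. A minimal sketch of the allocation side of that pattern, not tied to pipes:

#include <linux/gfp.h>
#include <linux/slab.h>

/* __GFP_ACCOUNT charges the allocation to the caller's memcg; the
 * charge is dropped automatically when the page or object is freed. */
static struct page *grab_accounted_page(void)
{
	return alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
}

static void *grab_accounted_array(size_t nmemb, size_t size)
{
	return kcalloc(nmemb, size, GFP_KERNEL_ACCOUNT);
}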
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 83720460c5bc..cf301a9ef512 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c | |||
@@ -105,6 +105,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
105 | #endif | 105 | #endif |
106 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 106 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
107 | "AnonHugePages: %8lu kB\n" | 107 | "AnonHugePages: %8lu kB\n" |
108 | "ShmemHugePages: %8lu kB\n" | ||
109 | "ShmemPmdMapped: %8lu kB\n" | ||
108 | #endif | 110 | #endif |
109 | #ifdef CONFIG_CMA | 111 | #ifdef CONFIG_CMA |
110 | "CmaTotal: %8lu kB\n" | 112 | "CmaTotal: %8lu kB\n" |
@@ -162,8 +164,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
162 | , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) | 164 | , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) |
163 | #endif | 165 | #endif |
164 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 166 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
165 | , K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * | 167 | , K(global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR) |
166 | HPAGE_PMD_NR) | 168 | , K(global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR) |
169 | , K(global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR) | ||
167 | #endif | 170 | #endif |
168 | #ifdef CONFIG_CMA | 171 | #ifdef CONFIG_CMA |
169 | , K(totalcma_pages) | 172 | , K(totalcma_pages) |
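The two new /proc/meminfo fields report shmem memory that is backed by huge pages and shmem memory currently mapped with PMDs, both in kB. A small user-space check (plain C, nothing kernel-specific):

#include <stdio.h>
#include <string.h>

/* Print the two counters added above; values are in kB as exported
 * by /proc/meminfo. */
int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "ShmemHugePages:", 15) ||
		    !strncmp(line, "ShmemPmdMapped:", 15))
			fputs(line, stdout);
	fclose(f);
	return 0;
}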
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4648c7f63ae2..187d84ef9de9 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -448,6 +448,7 @@ struct mem_size_stats { | |||
448 | unsigned long referenced; | 448 | unsigned long referenced; |
449 | unsigned long anonymous; | 449 | unsigned long anonymous; |
450 | unsigned long anonymous_thp; | 450 | unsigned long anonymous_thp; |
451 | unsigned long shmem_thp; | ||
451 | unsigned long swap; | 452 | unsigned long swap; |
452 | unsigned long shared_hugetlb; | 453 | unsigned long shared_hugetlb; |
453 | unsigned long private_hugetlb; | 454 | unsigned long private_hugetlb; |
@@ -576,7 +577,12 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, | |||
576 | page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); | 577 | page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); |
577 | if (IS_ERR_OR_NULL(page)) | 578 | if (IS_ERR_OR_NULL(page)) |
578 | return; | 579 | return; |
579 | mss->anonymous_thp += HPAGE_PMD_SIZE; | 580 | if (PageAnon(page)) |
581 | mss->anonymous_thp += HPAGE_PMD_SIZE; | ||
582 | else if (PageSwapBacked(page)) | ||
583 | mss->shmem_thp += HPAGE_PMD_SIZE; | ||
584 | else | ||
585 | VM_BUG_ON_PAGE(1, page); | ||
580 | smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); | 586 | smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); |
581 | } | 587 | } |
582 | #else | 588 | #else |
@@ -770,6 +776,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) | |||
770 | "Referenced: %8lu kB\n" | 776 | "Referenced: %8lu kB\n" |
771 | "Anonymous: %8lu kB\n" | 777 | "Anonymous: %8lu kB\n" |
772 | "AnonHugePages: %8lu kB\n" | 778 | "AnonHugePages: %8lu kB\n" |
779 | "ShmemPmdMapped: %8lu kB\n" | ||
773 | "Shared_Hugetlb: %8lu kB\n" | 780 | "Shared_Hugetlb: %8lu kB\n" |
774 | "Private_Hugetlb: %7lu kB\n" | 781 | "Private_Hugetlb: %7lu kB\n" |
775 | "Swap: %8lu kB\n" | 782 | "Swap: %8lu kB\n" |
@@ -787,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) | |||
787 | mss.referenced >> 10, | 794 | mss.referenced >> 10, |
788 | mss.anonymous >> 10, | 795 | mss.anonymous >> 10, |
789 | mss.anonymous_thp >> 10, | 796 | mss.anonymous_thp >> 10, |
797 | mss.shmem_thp >> 10, | ||
790 | mss.shared_hugetlb >> 10, | 798 | mss.shared_hugetlb >> 10, |
791 | mss.private_hugetlb >> 10, | 799 | mss.private_hugetlb >> 10, |
792 | mss.swap >> 10, | 800 | mss.swap >> 10, |
diff --git a/fs/super.c b/fs/super.c index d78b9847e6cb..5806ffd45563 100644 --- a/fs/super.c +++ b/fs/super.c | |||
@@ -206,6 +206,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) | |||
206 | mutex_init(&s->s_sync_lock); | 206 | mutex_init(&s->s_sync_lock); |
207 | INIT_LIST_HEAD(&s->s_inodes); | 207 | INIT_LIST_HEAD(&s->s_inodes); |
208 | spin_lock_init(&s->s_inode_list_lock); | 208 | spin_lock_init(&s->s_inode_list_lock); |
209 | INIT_LIST_HEAD(&s->s_inodes_wb); | ||
210 | spin_lock_init(&s->s_inode_wblist_lock); | ||
209 | 211 | ||
210 | if (list_lru_init_memcg(&s->s_dentry_lru)) | 212 | if (list_lru_init_memcg(&s->s_dentry_lru)) |
211 | goto fail; | 213 | goto fail; |
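The two new super_block fields back a per-sb list of inodes that currently have pages under writeback, so sync(2) no longer has to walk every inode of the filesystem. The list is maintained by helpers in fs/fs-writeback.c (elsewhere in this merge); the sketch below shows the "mark" side, with the name and locking reproduced from memory and therefore only illustrative:

#include <linux/fs.h>

static void sketch_mark_inode_writeback(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	unsigned long flags;

	/* irq-safe: writeback completion can run from interrupt context */
	if (list_empty(&inode->i_wb_list)) {
		spin_lock_irqsave(&sb->s_inode_wblist_lock, flags);
		if (list_empty(&inode->i_wb_list))
			list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
		spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags);
	}
}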
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 2d97952e341a..85959d8324df 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c | |||
@@ -257,10 +257,9 @@ out: | |||
257 | * fatal_signal_pending()s, and the mmap_sem must be released before | 257 | * fatal_signal_pending()s, and the mmap_sem must be released before |
258 | * returning it. | 258 | * returning it. |
259 | */ | 259 | */ |
260 | int handle_userfault(struct vm_area_struct *vma, unsigned long address, | 260 | int handle_userfault(struct fault_env *fe, unsigned long reason) |
261 | unsigned int flags, unsigned long reason) | ||
262 | { | 261 | { |
263 | struct mm_struct *mm = vma->vm_mm; | 262 | struct mm_struct *mm = fe->vma->vm_mm; |
264 | struct userfaultfd_ctx *ctx; | 263 | struct userfaultfd_ctx *ctx; |
265 | struct userfaultfd_wait_queue uwq; | 264 | struct userfaultfd_wait_queue uwq; |
266 | int ret; | 265 | int ret; |
@@ -269,7 +268,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, | |||
269 | BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); | 268 | BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); |
270 | 269 | ||
271 | ret = VM_FAULT_SIGBUS; | 270 | ret = VM_FAULT_SIGBUS; |
272 | ctx = vma->vm_userfaultfd_ctx.ctx; | 271 | ctx = fe->vma->vm_userfaultfd_ctx.ctx; |
273 | if (!ctx) | 272 | if (!ctx) |
274 | goto out; | 273 | goto out; |
275 | 274 | ||
@@ -302,17 +301,17 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, | |||
302 | * without first stopping userland access to the memory. For | 301 | * without first stopping userland access to the memory. For |
303 | * VM_UFFD_MISSING userfaults this is enough for now. | 302 | * VM_UFFD_MISSING userfaults this is enough for now. |
304 | */ | 303 | */ |
305 | if (unlikely(!(flags & FAULT_FLAG_ALLOW_RETRY))) { | 304 | if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { |
306 | /* | 305 | /* |
307 | * Validate the invariant that nowait must allow retry | 306 | * Validate the invariant that nowait must allow retry |
308 | * to be sure not to return SIGBUS erroneously on | 307 | * to be sure not to return SIGBUS erroneously on |
309 | * nowait invocations. | 308 | * nowait invocations. |
310 | */ | 309 | */ |
311 | BUG_ON(flags & FAULT_FLAG_RETRY_NOWAIT); | 310 | BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); |
312 | #ifdef CONFIG_DEBUG_VM | 311 | #ifdef CONFIG_DEBUG_VM |
313 | if (printk_ratelimit()) { | 312 | if (printk_ratelimit()) { |
314 | printk(KERN_WARNING | 313 | printk(KERN_WARNING |
315 | "FAULT_FLAG_ALLOW_RETRY missing %x\n", flags); | 314 | "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); |
316 | dump_stack(); | 315 | dump_stack(); |
317 | } | 316 | } |
318 | #endif | 317 | #endif |
@@ -324,7 +323,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, | |||
324 | * and wait. | 323 | * and wait. |
325 | */ | 324 | */ |
326 | ret = VM_FAULT_RETRY; | 325 | ret = VM_FAULT_RETRY; |
327 | if (flags & FAULT_FLAG_RETRY_NOWAIT) | 326 | if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) |
328 | goto out; | 327 | goto out; |
329 | 328 | ||
330 | /* take the reference before dropping the mmap_sem */ | 329 | /* take the reference before dropping the mmap_sem */ |
@@ -332,10 +331,11 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, | |||
332 | 331 | ||
333 | init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); | 332 | init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); |
334 | uwq.wq.private = current; | 333 | uwq.wq.private = current; |
335 | uwq.msg = userfault_msg(address, flags, reason); | 334 | uwq.msg = userfault_msg(fe->address, fe->flags, reason); |
336 | uwq.ctx = ctx; | 335 | uwq.ctx = ctx; |
337 | 336 | ||
338 | return_to_userland = (flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == | 337 | return_to_userland = |
338 | (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == | ||
339 | (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); | 339 | (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); |
340 | 340 | ||
341 | spin_lock(&ctx->fault_pending_wqh.lock); | 341 | spin_lock(&ctx->fault_pending_wqh.lock); |
@@ -353,7 +353,7 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, | |||
353 | TASK_KILLABLE); | 353 | TASK_KILLABLE); |
354 | spin_unlock(&ctx->fault_pending_wqh.lock); | 354 | spin_unlock(&ctx->fault_pending_wqh.lock); |
355 | 355 | ||
356 | must_wait = userfaultfd_must_wait(ctx, address, flags, reason); | 356 | must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); |
357 | up_read(&mm->mmap_sem); | 357 | up_read(&mm->mmap_sem); |
358 | 358 | ||
359 | if (likely(must_wait && !ACCESS_ONCE(ctx->released) && | 359 | if (likely(must_wait && !ACCESS_ONCE(ctx->released) && |
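handle_userfault() now takes the consolidated struct fault_env (introduced in the include/linux/mm.h hunk further down) plus the reason, instead of separate vma/address/flags arguments. A hedged sketch of what a call site in the fault path looks like after the change:

#include <linux/mm.h>
#include <linux/userfaultfd_k.h>

/* Sketch only: the MISSING-uffd case of an anonymous fault simply
 * forwards the fault context it already carries. */
static int sketch_anon_fault(struct fault_env *fe)
{
	if (userfaultfd_missing(fe->vma))
		return handle_userfault(fe, VM_UFFD_MISSING);

	/* ... normal anonymous-fault handling would continue here ... */
	return 0;
}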
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 47fc63295422..1b3dc9dd8861 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c | |||
@@ -1551,7 +1551,7 @@ xfs_filemap_page_mkwrite( | |||
1551 | xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); | 1551 | xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); |
1552 | 1552 | ||
1553 | if (IS_DAX(inode)) { | 1553 | if (IS_DAX(inode)) { |
1554 | ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); | 1554 | ret = dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); |
1555 | } else { | 1555 | } else { |
1556 | ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); | 1556 | ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); |
1557 | ret = block_page_mkwrite_return(ret); | 1557 | ret = block_page_mkwrite_return(ret); |
@@ -1585,7 +1585,7 @@ xfs_filemap_fault( | |||
1585 | * changes to xfs_get_blocks_direct() to map unwritten extent | 1585 | * changes to xfs_get_blocks_direct() to map unwritten extent |
1586 | * ioend for conversion on read-only mappings. | 1586 | * ioend for conversion on read-only mappings. |
1587 | */ | 1587 | */ |
1588 | ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault); | 1588 | ret = dax_fault(vma, vmf, xfs_get_blocks_dax_fault); |
1589 | } else | 1589 | } else |
1590 | ret = filemap_fault(vma, vmf); | 1590 | ret = filemap_fault(vma, vmf); |
1591 | xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); | 1591 | xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); |
@@ -1622,7 +1622,7 @@ xfs_filemap_pmd_fault( | |||
1622 | } | 1622 | } |
1623 | 1623 | ||
1624 | xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); | 1624 | xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); |
1625 | ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); | 1625 | ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); |
1626 | xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); | 1626 | xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); |
1627 | 1627 | ||
1628 | if (flags & FAULT_FLAG_WRITE) | 1628 | if (flags & FAULT_FLAG_WRITE) |
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 9dbb739cafa0..c6d667187608 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h | |||
@@ -107,6 +107,12 @@ struct mmu_gather { | |||
107 | struct mmu_gather_batch local; | 107 | struct mmu_gather_batch local; |
108 | struct page *__pages[MMU_GATHER_BUNDLE]; | 108 | struct page *__pages[MMU_GATHER_BUNDLE]; |
109 | unsigned int batch_count; | 109 | unsigned int batch_count; |
110 | /* | ||
111 | * __tlb_adjust_range will track the new addr here, | ||
112 | * so that we can adjust the range after the flush | ||
113 | */ | ||
114 | unsigned long addr; | ||
115 | int page_size; | ||
110 | }; | 116 | }; |
111 | 117 | ||
112 | #define HAVE_GENERIC_MMU_GATHER | 118 | #define HAVE_GENERIC_MMU_GATHER |
@@ -115,23 +121,20 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long | |||
115 | void tlb_flush_mmu(struct mmu_gather *tlb); | 121 | void tlb_flush_mmu(struct mmu_gather *tlb); |
116 | void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, | 122 | void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, |
117 | unsigned long end); | 123 | unsigned long end); |
118 | int __tlb_remove_page(struct mmu_gather *tlb, struct page *page); | 124 | extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, |
119 | 125 | int page_size); | |
120 | /* tlb_remove_page | ||
121 | * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when | ||
122 | * required. | ||
123 | */ | ||
124 | static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) | ||
125 | { | ||
126 | if (!__tlb_remove_page(tlb, page)) | ||
127 | tlb_flush_mmu(tlb); | ||
128 | } | ||
129 | 126 | ||
130 | static inline void __tlb_adjust_range(struct mmu_gather *tlb, | 127 | static inline void __tlb_adjust_range(struct mmu_gather *tlb, |
131 | unsigned long address) | 128 | unsigned long address) |
132 | { | 129 | { |
133 | tlb->start = min(tlb->start, address); | 130 | tlb->start = min(tlb->start, address); |
134 | tlb->end = max(tlb->end, address + PAGE_SIZE); | 131 | tlb->end = max(tlb->end, address + PAGE_SIZE); |
132 | /* | ||
133 | * Track the last address with which we adjusted the range. This | ||
134 | * will be used later to adjust again after a mmu_flush due to | ||
135 | * failed __tlb_remove_page | ||
136 | */ | ||
137 | tlb->addr = address; | ||
135 | } | 138 | } |
136 | 139 | ||
137 | static inline void __tlb_reset_range(struct mmu_gather *tlb) | 140 | static inline void __tlb_reset_range(struct mmu_gather *tlb) |
@@ -144,6 +147,40 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb) | |||
144 | } | 147 | } |
145 | } | 148 | } |
146 | 149 | ||
150 | static inline void tlb_remove_page_size(struct mmu_gather *tlb, | ||
151 | struct page *page, int page_size) | ||
152 | { | ||
153 | if (__tlb_remove_page_size(tlb, page, page_size)) { | ||
154 | tlb_flush_mmu(tlb); | ||
155 | tlb->page_size = page_size; | ||
156 | __tlb_adjust_range(tlb, tlb->addr); | ||
157 | __tlb_remove_page_size(tlb, page, page_size); | ||
158 | } | ||
159 | } | ||
160 | |||
161 | static bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | ||
162 | { | ||
163 | return __tlb_remove_page_size(tlb, page, PAGE_SIZE); | ||
164 | } | ||
165 | |||
166 | /* tlb_remove_page | ||
167 | * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when | ||
168 | * required. | ||
169 | */ | ||
170 | static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) | ||
171 | { | ||
172 | return tlb_remove_page_size(tlb, page, PAGE_SIZE); | ||
173 | } | ||
174 | |||
175 | static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page) | ||
176 | { | ||
177 | /* active->nr should be zero when we call this */ | ||
178 | VM_BUG_ON_PAGE(tlb->active->nr, page); | ||
179 | tlb->page_size = PAGE_SIZE; | ||
180 | __tlb_adjust_range(tlb, tlb->addr); | ||
181 | return __tlb_remove_page(tlb, page); | ||
182 | } | ||
183 | |||
147 | /* | 184 | /* |
148 | * In the case of tlb vma handling, we can optimise these away in the | 185 | * In the case of tlb vma handling, we can optimise these away in the |
149 | * case where we're doing a full MM flush. When we're doing a munmap, | 186 | * case where we're doing a full MM flush. When we're doing a munmap, |
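The mmu_gather changes above make page removal size-aware: tlb_remove_page() becomes a PAGE_SIZE wrapper around tlb_remove_page_size(), which forces a flush and restarts the batch when the page size changes mid-gather. A hedged usage sketch for a huge-page zap path (locking and the surrounding pmd handling omitted):

#include <linux/huge_mm.h>
#include <asm/tlb.h>

static void sketch_gather_huge_page(struct mmu_gather *tlb, struct page *page)
{
	/* pass the real size so the deferred TLB flush covers the
	 * whole PMD-sized range, not just one base page */
	tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
}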
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 9b0a15d06a4f..79542b2698ec 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h | |||
@@ -48,6 +48,7 @@ | |||
48 | #include <linux/migrate.h> | 48 | #include <linux/migrate.h> |
49 | #include <linux/gfp.h> | 49 | #include <linux/gfp.h> |
50 | #include <linux/err.h> | 50 | #include <linux/err.h> |
51 | #include <linux/fs.h> | ||
51 | 52 | ||
52 | /* | 53 | /* |
53 | * Balloon device information descriptor. | 54 | * Balloon device information descriptor. |
@@ -62,6 +63,7 @@ struct balloon_dev_info { | |||
62 | struct list_head pages; /* Pages enqueued & handled to Host */ | 63 | struct list_head pages; /* Pages enqueued & handled to Host */ |
63 | int (*migratepage)(struct balloon_dev_info *, struct page *newpage, | 64 | int (*migratepage)(struct balloon_dev_info *, struct page *newpage, |
64 | struct page *page, enum migrate_mode mode); | 65 | struct page *page, enum migrate_mode mode); |
66 | struct inode *inode; | ||
65 | }; | 67 | }; |
66 | 68 | ||
67 | extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info); | 69 | extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info); |
@@ -73,45 +75,19 @@ static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) | |||
73 | spin_lock_init(&balloon->pages_lock); | 75 | spin_lock_init(&balloon->pages_lock); |
74 | INIT_LIST_HEAD(&balloon->pages); | 76 | INIT_LIST_HEAD(&balloon->pages); |
75 | balloon->migratepage = NULL; | 77 | balloon->migratepage = NULL; |
78 | balloon->inode = NULL; | ||
76 | } | 79 | } |
77 | 80 | ||
78 | #ifdef CONFIG_BALLOON_COMPACTION | 81 | #ifdef CONFIG_BALLOON_COMPACTION |
79 | extern bool balloon_page_isolate(struct page *page); | 82 | extern const struct address_space_operations balloon_aops; |
83 | extern bool balloon_page_isolate(struct page *page, | ||
84 | isolate_mode_t mode); | ||
80 | extern void balloon_page_putback(struct page *page); | 85 | extern void balloon_page_putback(struct page *page); |
81 | extern int balloon_page_migrate(struct page *newpage, | 86 | extern int balloon_page_migrate(struct address_space *mapping, |
87 | struct page *newpage, | ||
82 | struct page *page, enum migrate_mode mode); | 88 | struct page *page, enum migrate_mode mode); |
83 | 89 | ||
84 | /* | 90 | /* |
85 | * __is_movable_balloon_page - helper to perform @page PageBalloon tests | ||
86 | */ | ||
87 | static inline bool __is_movable_balloon_page(struct page *page) | ||
88 | { | ||
89 | return PageBalloon(page); | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * balloon_page_movable - test PageBalloon to identify balloon pages | ||
94 | * and PagePrivate to check that the page is not | ||
95 | * isolated and can be moved by compaction/migration. | ||
96 | * | ||
97 | * As we might return false positives in the case of a balloon page being just | ||
98 | * released under us, this need to be re-tested later, under the page lock. | ||
99 | */ | ||
100 | static inline bool balloon_page_movable(struct page *page) | ||
101 | { | ||
102 | return PageBalloon(page) && PagePrivate(page); | ||
103 | } | ||
104 | |||
105 | /* | ||
106 | * isolated_balloon_page - identify an isolated balloon page on private | ||
107 | * compaction/migration page lists. | ||
108 | */ | ||
109 | static inline bool isolated_balloon_page(struct page *page) | ||
110 | { | ||
111 | return PageBalloon(page); | ||
112 | } | ||
113 | |||
114 | /* | ||
115 | * balloon_page_insert - insert a page into the balloon's page list and make | 91 | * balloon_page_insert - insert a page into the balloon's page list and make |
116 | * the page->private assignment accordingly. | 92 | * the page->private assignment accordingly. |
117 | * @balloon : pointer to balloon device | 93 | * @balloon : pointer to balloon device |
@@ -124,7 +100,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, | |||
124 | struct page *page) | 100 | struct page *page) |
125 | { | 101 | { |
126 | __SetPageBalloon(page); | 102 | __SetPageBalloon(page); |
127 | SetPagePrivate(page); | 103 | __SetPageMovable(page, balloon->inode->i_mapping); |
128 | set_page_private(page, (unsigned long)balloon); | 104 | set_page_private(page, (unsigned long)balloon); |
129 | list_add(&page->lru, &balloon->pages); | 105 | list_add(&page->lru, &balloon->pages); |
130 | } | 106 | } |
@@ -140,11 +116,14 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, | |||
140 | static inline void balloon_page_delete(struct page *page) | 116 | static inline void balloon_page_delete(struct page *page) |
141 | { | 117 | { |
142 | __ClearPageBalloon(page); | 118 | __ClearPageBalloon(page); |
119 | __ClearPageMovable(page); | ||
143 | set_page_private(page, 0); | 120 | set_page_private(page, 0); |
144 | if (PagePrivate(page)) { | 121 | /* |
145 | ClearPagePrivate(page); | 122 | * Don't touch the page->lru field once @page has been isolated |
123 | * because the VM is using the field. | ||
124 | */ | ||
125 | if (!PageIsolated(page)) | ||
146 | list_del(&page->lru); | 126 | list_del(&page->lru); |
147 | } | ||
148 | } | 127 | } |
149 | 128 | ||
150 | /* | 129 | /* |
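Balloon pages are now migrated through the generic movable-page machinery rather than the old PagePrivate convention: the driver supplies an inode whose mapping carries balloon_aops, and balloon_page_insert() tags every page movable against that mapping. A hedged sketch of the driver-side wiring (how the inode itself is obtained, e.g. from a driver-private pseudo filesystem, varies by driver and is omitted here):

#include <linux/balloon_compaction.h>

static void sketch_balloon_setup(struct balloon_dev_info *b_dev_info,
				 struct inode *inode)
{
	balloon_devinfo_init(b_dev_info);
#ifdef CONFIG_BALLOON_COMPACTION
	b_dev_info->inode = inode;
	inode->i_mapping->a_ops = &balloon_aops;
#endif
}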
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index a58c852a268f..1a02dab16646 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
@@ -212,6 +212,7 @@ static inline void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_i | |||
212 | #endif /* CONFIG_COMPACTION */ | 212 | #endif /* CONFIG_COMPACTION */ |
213 | 213 | ||
214 | #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) | 214 | #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) |
215 | struct node; | ||
215 | extern int compaction_register_node(struct node *node); | 216 | extern int compaction_register_node(struct node *node); |
216 | extern void compaction_unregister_node(struct node *node); | 217 | extern void compaction_unregister_node(struct node *node); |
217 | 218 | ||
diff --git a/include/linux/dax.h b/include/linux/dax.h index 43d5f0b799c7..9c6dc7704043 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h | |||
@@ -14,7 +14,6 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, | |||
14 | int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); | 14 | int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); |
15 | int dax_truncate_page(struct inode *, loff_t from, get_block_t); | 15 | int dax_truncate_page(struct inode *, loff_t from, get_block_t); |
16 | int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); | 16 | int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); |
17 | int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); | ||
18 | int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); | 17 | int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); |
19 | void dax_wake_mapping_entry_waiter(struct address_space *mapping, | 18 | void dax_wake_mapping_entry_waiter(struct address_space *mapping, |
20 | pgoff_t index, bool wake_all); | 19 | pgoff_t index, bool wake_all); |
@@ -46,19 +45,15 @@ static inline int __dax_zero_page_range(struct block_device *bdev, | |||
46 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) | 45 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) |
47 | int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, | 46 | int dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, |
48 | unsigned int flags, get_block_t); | 47 | unsigned int flags, get_block_t); |
49 | int __dax_pmd_fault(struct vm_area_struct *, unsigned long addr, pmd_t *, | ||
50 | unsigned int flags, get_block_t); | ||
51 | #else | 48 | #else |
52 | static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, | 49 | static inline int dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, |
53 | pmd_t *pmd, unsigned int flags, get_block_t gb) | 50 | pmd_t *pmd, unsigned int flags, get_block_t gb) |
54 | { | 51 | { |
55 | return VM_FAULT_FALLBACK; | 52 | return VM_FAULT_FALLBACK; |
56 | } | 53 | } |
57 | #define __dax_pmd_fault dax_pmd_fault | ||
58 | #endif | 54 | #endif |
59 | int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); | 55 | int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *); |
60 | #define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) | 56 | #define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb) |
61 | #define __dax_mkwrite(vma, vmf, gb) __dax_fault(vma, vmf, gb) | ||
62 | 57 | ||
63 | static inline bool vma_is_dax(struct vm_area_struct *vma) | 58 | static inline bool vma_is_dax(struct vm_area_struct *vma) |
64 | { | 59 | { |
diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index 46056cb161fc..d82bf1994485 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h | |||
@@ -38,7 +38,7 @@ struct debug_obj { | |||
38 | * @name: name of the object typee | 38 | * @name: name of the object typee |
39 | * @debug_hint: function returning address, which have associated | 39 | * @debug_hint: function returning address, which have associated |
40 | * kernel symbol, to allow identify the object | 40 | * kernel symbol, to allow identify the object |
41 | * @is_static_object return true if the obj is static, otherwise return false | 41 | * @is_static_object: return true if the obj is static, otherwise return false |
42 | * @fixup_init: fixup function, which is called when the init check | 42 | * @fixup_init: fixup function, which is called when the init check |
43 | * fails. All fixup functions must return true if fixup | 43 | * fails. All fixup functions must return true if fixup |
44 | * was successful, otherwise return false | 44 | * was successful, otherwise return false |
diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index e65ef959546c..c46d2aa16d81 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/swap.h> | 4 | #include <linux/swap.h> |
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/bitops.h> | 6 | #include <linux/bitops.h> |
7 | #include <linux/jump_label.h> | ||
7 | 8 | ||
8 | struct frontswap_ops { | 9 | struct frontswap_ops { |
9 | void (*init)(unsigned); /* this swap type was just swapon'ed */ | 10 | void (*init)(unsigned); /* this swap type was just swapon'ed */ |
@@ -14,7 +15,6 @@ struct frontswap_ops { | |||
14 | struct frontswap_ops *next; /* private pointer to next ops */ | 15 | struct frontswap_ops *next; /* private pointer to next ops */ |
15 | }; | 16 | }; |
16 | 17 | ||
17 | extern bool frontswap_enabled; | ||
18 | extern void frontswap_register_ops(struct frontswap_ops *ops); | 18 | extern void frontswap_register_ops(struct frontswap_ops *ops); |
19 | extern void frontswap_shrink(unsigned long); | 19 | extern void frontswap_shrink(unsigned long); |
20 | extern unsigned long frontswap_curr_pages(void); | 20 | extern unsigned long frontswap_curr_pages(void); |
@@ -30,7 +30,12 @@ extern void __frontswap_invalidate_page(unsigned, pgoff_t); | |||
30 | extern void __frontswap_invalidate_area(unsigned); | 30 | extern void __frontswap_invalidate_area(unsigned); |
31 | 31 | ||
32 | #ifdef CONFIG_FRONTSWAP | 32 | #ifdef CONFIG_FRONTSWAP |
33 | #define frontswap_enabled (1) | 33 | extern struct static_key_false frontswap_enabled_key; |
34 | |||
35 | static inline bool frontswap_enabled(void) | ||
36 | { | ||
37 | return static_branch_unlikely(&frontswap_enabled_key); | ||
38 | } | ||
34 | 39 | ||
35 | static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) | 40 | static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) |
36 | { | 41 | { |
@@ -50,7 +55,10 @@ static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) | |||
50 | #else | 55 | #else |
51 | /* all inline routines become no-ops and all externs are ignored */ | 56 | /* all inline routines become no-ops and all externs are ignored */ |
52 | 57 | ||
53 | #define frontswap_enabled (0) | 58 | static inline bool frontswap_enabled(void) |
59 | { | ||
60 | return false; | ||
61 | } | ||
54 | 62 | ||
55 | static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) | 63 | static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) |
56 | { | 64 | { |
@@ -70,37 +78,35 @@ static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) | |||
70 | 78 | ||
71 | static inline int frontswap_store(struct page *page) | 79 | static inline int frontswap_store(struct page *page) |
72 | { | 80 | { |
73 | int ret = -1; | 81 | if (frontswap_enabled()) |
82 | return __frontswap_store(page); | ||
74 | 83 | ||
75 | if (frontswap_enabled) | 84 | return -1; |
76 | ret = __frontswap_store(page); | ||
77 | return ret; | ||
78 | } | 85 | } |
79 | 86 | ||
80 | static inline int frontswap_load(struct page *page) | 87 | static inline int frontswap_load(struct page *page) |
81 | { | 88 | { |
82 | int ret = -1; | 89 | if (frontswap_enabled()) |
90 | return __frontswap_load(page); | ||
83 | 91 | ||
84 | if (frontswap_enabled) | 92 | return -1; |
85 | ret = __frontswap_load(page); | ||
86 | return ret; | ||
87 | } | 93 | } |
88 | 94 | ||
89 | static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset) | 95 | static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset) |
90 | { | 96 | { |
91 | if (frontswap_enabled) | 97 | if (frontswap_enabled()) |
92 | __frontswap_invalidate_page(type, offset); | 98 | __frontswap_invalidate_page(type, offset); |
93 | } | 99 | } |
94 | 100 | ||
95 | static inline void frontswap_invalidate_area(unsigned type) | 101 | static inline void frontswap_invalidate_area(unsigned type) |
96 | { | 102 | { |
97 | if (frontswap_enabled) | 103 | if (frontswap_enabled()) |
98 | __frontswap_invalidate_area(type); | 104 | __frontswap_invalidate_area(type); |
99 | } | 105 | } |
100 | 106 | ||
101 | static inline void frontswap_init(unsigned type, unsigned long *map) | 107 | static inline void frontswap_init(unsigned type, unsigned long *map) |
102 | { | 108 | { |
103 | if (frontswap_enabled) | 109 | if (frontswap_enabled()) |
104 | __frontswap_init(type, map); | 110 | __frontswap_init(type, map); |
105 | } | 111 | } |
106 | 112 | ||
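frontswap_enabled turns from a plain global into a static key, so the hooks in the swap path cost a patched no-op branch instead of a load and test while no backend is registered. A sketch of the mm/frontswap.c side, reconstructed from the header above rather than copied from that file:

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);

static void sketch_register_backend(void)
{
	/* flipping the key patches all frontswap_enabled() call sites */
	static_branch_inc(&frontswap_enabled_key);
}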
diff --git a/include/linux/fs.h b/include/linux/fs.h index dc488662ce0b..f2a69f20926f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -398,6 +398,8 @@ struct address_space_operations { | |||
398 | */ | 398 | */ |
399 | int (*migratepage) (struct address_space *, | 399 | int (*migratepage) (struct address_space *, |
400 | struct page *, struct page *, enum migrate_mode); | 400 | struct page *, struct page *, enum migrate_mode); |
401 | bool (*isolate_page)(struct page *, isolate_mode_t); | ||
402 | void (*putback_page)(struct page *); | ||
401 | int (*launder_page) (struct page *); | 403 | int (*launder_page) (struct page *); |
402 | int (*is_partially_uptodate) (struct page *, unsigned long, | 404 | int (*is_partially_uptodate) (struct page *, unsigned long, |
403 | unsigned long); | 405 | unsigned long); |
@@ -661,6 +663,7 @@ struct inode { | |||
661 | #endif | 663 | #endif |
662 | struct list_head i_lru; /* inode LRU list */ | 664 | struct list_head i_lru; /* inode LRU list */ |
663 | struct list_head i_sb_list; | 665 | struct list_head i_sb_list; |
666 | struct list_head i_wb_list; /* backing dev writeback list */ | ||
664 | union { | 667 | union { |
665 | struct hlist_head i_dentry; | 668 | struct hlist_head i_dentry; |
666 | struct rcu_head i_rcu; | 669 | struct rcu_head i_rcu; |
@@ -1444,6 +1447,9 @@ struct super_block { | |||
1444 | /* s_inode_list_lock protects s_inodes */ | 1447 | /* s_inode_list_lock protects s_inodes */ |
1445 | spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; | 1448 | spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; |
1446 | struct list_head s_inodes; /* all inodes */ | 1449 | struct list_head s_inodes; /* all inodes */ |
1450 | |||
1451 | spinlock_t s_inode_wblist_lock; | ||
1452 | struct list_head s_inodes_wb; /* writeback inodes */ | ||
1447 | }; | 1453 | }; |
1448 | 1454 | ||
1449 | extern struct timespec current_fs_time(struct super_block *sb); | 1455 | extern struct timespec current_fs_time(struct super_block *sb); |
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 570383a41853..c29e9d347bc6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
@@ -78,8 +78,7 @@ struct vm_area_struct; | |||
78 | * __GFP_THISNODE forces the allocation to be satisified from the requested | 78 | * __GFP_THISNODE forces the allocation to be satisified from the requested |
79 | * node with no fallbacks or placement policy enforcements. | 79 | * node with no fallbacks or placement policy enforcements. |
80 | * | 80 | * |
81 | * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg (only relevant | 81 | * __GFP_ACCOUNT causes the allocation to be accounted to kmemcg. |
82 | * to kmem allocations). | ||
83 | */ | 82 | */ |
84 | #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) | 83 | #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) |
85 | #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) | 84 | #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) |
@@ -486,10 +485,6 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, | |||
486 | #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ | 485 | #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ |
487 | alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) | 486 | alloc_pages_vma(gfp_mask, 0, vma, addr, node, false) |
488 | 487 | ||
489 | extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); | ||
490 | extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, | ||
491 | unsigned int order); | ||
492 | |||
493 | extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); | 488 | extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); |
494 | extern unsigned long get_zeroed_page(gfp_t gfp_mask); | 489 | extern unsigned long get_zeroed_page(gfp_t gfp_mask); |
495 | 490 | ||
@@ -513,9 +508,6 @@ extern void *__alloc_page_frag(struct page_frag_cache *nc, | |||
513 | unsigned int fragsz, gfp_t gfp_mask); | 508 | unsigned int fragsz, gfp_t gfp_mask); |
514 | extern void __free_page_frag(void *addr); | 509 | extern void __free_page_frag(void *addr); |
515 | 510 | ||
516 | extern void __free_kmem_pages(struct page *page, unsigned int order); | ||
517 | extern void free_kmem_pages(unsigned long addr, unsigned int order); | ||
518 | |||
519 | #define __free_page(page) __free_pages((page), 0) | 511 | #define __free_page(page) __free_pages((page), 0) |
520 | #define free_page(addr) free_pages((addr), 0) | 512 | #define free_page(addr) free_pages((addr), 0) |
521 | 513 | ||
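alloc_kmem_pages()/free_kmem_pages() disappear because __GFP_ACCOUNT is no longer restricted to slab: callers that want their raw page allocations charged to kmemcg now just pass the flag to the normal allocator, as the fs/pipe.c hunk earlier in this diff already does. A minimal sketch of the conversion:

#include <linux/gfp.h>

/* was (roughly): page = alloc_kmem_pages(GFP_KERNEL, order);
 *                __free_kmem_pages(page, order); */
static struct page *sketch_accounted_alloc(unsigned int order)
{
	return alloc_pages(GFP_KERNEL | __GFP_ACCOUNT, order);
}

static void sketch_accounted_free(struct page *page, unsigned int order)
{
	__free_pages(page, order);	/* uncharge happens in the free path */
}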
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index f0a7a0320300..92ce91c03cd0 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -1,20 +1,12 @@ | |||
1 | #ifndef _LINUX_HUGE_MM_H | 1 | #ifndef _LINUX_HUGE_MM_H |
2 | #define _LINUX_HUGE_MM_H | 2 | #define _LINUX_HUGE_MM_H |
3 | 3 | ||
4 | extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, | 4 | extern int do_huge_pmd_anonymous_page(struct fault_env *fe); |
5 | struct vm_area_struct *vma, | ||
6 | unsigned long address, pmd_t *pmd, | ||
7 | unsigned int flags); | ||
8 | extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 5 | extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
9 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, | 6 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, |
10 | struct vm_area_struct *vma); | 7 | struct vm_area_struct *vma); |
11 | extern void huge_pmd_set_accessed(struct mm_struct *mm, | 8 | extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd); |
12 | struct vm_area_struct *vma, | 9 | extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd); |
13 | unsigned long address, pmd_t *pmd, | ||
14 | pmd_t orig_pmd, int dirty); | ||
15 | extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
16 | unsigned long address, pmd_t *pmd, | ||
17 | pmd_t orig_pmd); | ||
18 | extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | 10 | extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, |
19 | unsigned long addr, | 11 | unsigned long addr, |
20 | pmd_t *pmd, | 12 | pmd_t *pmd, |
@@ -49,6 +41,18 @@ enum transparent_hugepage_flag { | |||
49 | #endif | 41 | #endif |
50 | }; | 42 | }; |
51 | 43 | ||
44 | struct kobject; | ||
45 | struct kobj_attribute; | ||
46 | |||
47 | extern ssize_t single_hugepage_flag_store(struct kobject *kobj, | ||
48 | struct kobj_attribute *attr, | ||
49 | const char *buf, size_t count, | ||
50 | enum transparent_hugepage_flag flag); | ||
51 | extern ssize_t single_hugepage_flag_show(struct kobject *kobj, | ||
52 | struct kobj_attribute *attr, char *buf, | ||
53 | enum transparent_hugepage_flag flag); | ||
54 | extern struct kobj_attribute shmem_enabled_attr; | ||
55 | |||
52 | #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) | 56 | #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) |
53 | #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) | 57 | #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) |
54 | 58 | ||
@@ -134,8 +138,7 @@ static inline int hpage_nr_pages(struct page *page) | |||
134 | return 1; | 138 | return 1; |
135 | } | 139 | } |
136 | 140 | ||
137 | extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 141 | extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd); |
138 | unsigned long addr, pmd_t pmd, pmd_t *pmdp); | ||
139 | 142 | ||
140 | extern struct page *huge_zero_page; | 143 | extern struct page *huge_zero_page; |
141 | 144 | ||
@@ -152,6 +155,8 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) | |||
152 | struct page *get_huge_zero_page(void); | 155 | struct page *get_huge_zero_page(void); |
153 | void put_huge_zero_page(void); | 156 | void put_huge_zero_page(void); |
154 | 157 | ||
158 | #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) | ||
159 | |||
155 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | 160 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ |
156 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) | 161 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) |
157 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) | 162 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) |
@@ -161,6 +166,8 @@ void put_huge_zero_page(void); | |||
161 | 166 | ||
162 | #define transparent_hugepage_enabled(__vma) 0 | 167 | #define transparent_hugepage_enabled(__vma) 0 |
163 | 168 | ||
169 | static inline void prep_transhuge_page(struct page *page) {} | ||
170 | |||
164 | #define transparent_hugepage_flags 0UL | 171 | #define transparent_hugepage_flags 0UL |
165 | static inline int | 172 | static inline int |
166 | split_huge_page_to_list(struct page *page, struct list_head *list) | 173 | split_huge_page_to_list(struct page *page, struct list_head *list) |
@@ -196,8 +203,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, | |||
196 | return NULL; | 203 | return NULL; |
197 | } | 204 | } |
198 | 205 | ||
199 | static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 206 | static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd) |
200 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | ||
201 | { | 207 | { |
202 | return 0; | 208 | return 0; |
203 | } | 209 | } |
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index eeb307985715..1e032a1ddb3e 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h | |||
@@ -4,6 +4,11 @@ | |||
4 | #include <linux/sched.h> /* MMF_VM_HUGEPAGE */ | 4 | #include <linux/sched.h> /* MMF_VM_HUGEPAGE */ |
5 | 5 | ||
6 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 6 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
7 | extern struct attribute_group khugepaged_attr_group; | ||
8 | |||
9 | extern int khugepaged_init(void); | ||
10 | extern void khugepaged_destroy(void); | ||
11 | extern int start_stop_khugepaged(void); | ||
7 | extern int __khugepaged_enter(struct mm_struct *mm); | 12 | extern int __khugepaged_enter(struct mm_struct *mm); |
8 | extern void __khugepaged_exit(struct mm_struct *mm); | 13 | extern void __khugepaged_exit(struct mm_struct *mm); |
9 | extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, | 14 | extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, |
diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 7ae216a39c9e..481c8c4627ca 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h | |||
@@ -43,8 +43,7 @@ static inline struct stable_node *page_stable_node(struct page *page) | |||
43 | static inline void set_page_stable_node(struct page *page, | 43 | static inline void set_page_stable_node(struct page *page, |
44 | struct stable_node *stable_node) | 44 | struct stable_node *stable_node) |
45 | { | 45 | { |
46 | page->mapping = (void *)stable_node + | 46 | page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); |
47 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | ||
48 | } | 47 | } |
49 | 48 | ||
50 | /* | 49 | /* |
diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 3106ac1c895e..6c14b6179727 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h | |||
@@ -73,8 +73,8 @@ extern bool movable_node_enabled; | |||
73 | if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) | 73 | if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) |
74 | 74 | ||
75 | phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, | 75 | phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align, |
76 | phys_addr_t start, phys_addr_t end, | 76 | phys_addr_t start, phys_addr_t end, |
77 | int nid, ulong flags); | 77 | int nid, ulong flags); |
78 | phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, | 78 | phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, |
79 | phys_addr_t size, phys_addr_t align); | 79 | phys_addr_t size, phys_addr_t align); |
80 | phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); | 80 | phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); |
@@ -110,7 +110,7 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags, | |||
110 | phys_addr_t *out_end, int *out_nid); | 110 | phys_addr_t *out_end, int *out_nid); |
111 | 111 | ||
112 | void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, | 112 | void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, |
113 | phys_addr_t *out_end); | 113 | phys_addr_t *out_end); |
114 | 114 | ||
115 | /** | 115 | /** |
116 | * for_each_mem_range - iterate through memblock areas from type_a and not | 116 | * for_each_mem_range - iterate through memblock areas from type_a and not |
@@ -148,7 +148,7 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, | |||
148 | p_start, p_end, p_nid) \ | 148 | p_start, p_end, p_nid) \ |
149 | for (i = (u64)ULLONG_MAX, \ | 149 | for (i = (u64)ULLONG_MAX, \ |
150 | __next_mem_range_rev(&i, nid, flags, type_a, type_b,\ | 150 | __next_mem_range_rev(&i, nid, flags, type_a, type_b,\ |
151 | p_start, p_end, p_nid); \ | 151 | p_start, p_end, p_nid); \ |
152 | i != (u64)ULLONG_MAX; \ | 152 | i != (u64)ULLONG_MAX; \ |
153 | __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ | 153 | __next_mem_range_rev(&i, nid, flags, type_a, type_b, \ |
154 | p_start, p_end, p_nid)) | 154 | p_start, p_end, p_nid)) |
@@ -163,8 +163,7 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start, | |||
163 | * is initialized. | 163 | * is initialized. |
164 | */ | 164 | */ |
165 | #define for_each_reserved_mem_region(i, p_start, p_end) \ | 165 | #define for_each_reserved_mem_region(i, p_start, p_end) \ |
166 | for (i = 0UL, \ | 166 | for (i = 0UL, __next_reserved_mem_region(&i, p_start, p_end); \ |
167 | __next_reserved_mem_region(&i, p_start, p_end); \ | ||
168 | i != (u64)ULLONG_MAX; \ | 167 | i != (u64)ULLONG_MAX; \ |
169 | __next_reserved_mem_region(&i, p_start, p_end)) | 168 | __next_reserved_mem_region(&i, p_start, p_end)) |
170 | 169 | ||
@@ -403,15 +402,14 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo | |||
403 | } | 402 | } |
404 | 403 | ||
405 | #define for_each_memblock(memblock_type, region) \ | 404 | #define for_each_memblock(memblock_type, region) \ |
406 | for (region = memblock.memblock_type.regions; \ | 405 | for (region = memblock.memblock_type.regions; \ |
407 | region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ | 406 | region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ |
408 | region++) | 407 | region++) |
409 | 408 | ||
410 | #define for_each_memblock_type(memblock_type, rgn) \ | 409 | #define for_each_memblock_type(memblock_type, rgn) \ |
411 | idx = 0; \ | 410 | for (idx = 0, rgn = &memblock_type->regions[0]; \ |
412 | rgn = &memblock_type->regions[idx]; \ | 411 | idx < memblock_type->cnt; \ |
413 | for (idx = 0; idx < memblock_type->cnt; \ | 412 | idx++, rgn = &memblock_type->regions[idx]) |
414 | idx++,rgn = &memblock_type->regions[idx]) | ||
415 | 413 | ||
416 | #ifdef CONFIG_MEMTEST | 414 | #ifdef CONFIG_MEMTEST |
417 | extern void early_memtest(phys_addr_t start, phys_addr_t end); | 415 | extern void early_memtest(phys_addr_t start, phys_addr_t end); |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 56e6069d2452..71aff733a497 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -749,6 +749,13 @@ static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) | |||
749 | } | 749 | } |
750 | #endif | 750 | #endif |
751 | 751 | ||
752 | struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep); | ||
753 | void memcg_kmem_put_cache(struct kmem_cache *cachep); | ||
754 | int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, | ||
755 | struct mem_cgroup *memcg); | ||
756 | int memcg_kmem_charge(struct page *page, gfp_t gfp, int order); | ||
757 | void memcg_kmem_uncharge(struct page *page, int order); | ||
758 | |||
752 | #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) | 759 | #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) |
753 | extern struct static_key_false memcg_kmem_enabled_key; | 760 | extern struct static_key_false memcg_kmem_enabled_key; |
754 | 761 | ||
@@ -770,22 +777,6 @@ static inline bool memcg_kmem_enabled(void) | |||
770 | } | 777 | } |
771 | 778 | ||
772 | /* | 779 | /* |
773 | * In general, we'll do everything in our power to not incur in any overhead | ||
774 | * for non-memcg users for the kmem functions. Not even a function call, if we | ||
775 | * can avoid it. | ||
776 | * | ||
777 | * Therefore, we'll inline all those functions so that in the best case, we'll | ||
778 | * see that kmemcg is off for everybody and proceed quickly. If it is on, | ||
779 | * we'll still do most of the flag checking inline. We check a lot of | ||
780 | * conditions, but because they are pretty simple, they are expected to be | ||
781 | * fast. | ||
782 | */ | ||
783 | int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, | ||
784 | struct mem_cgroup *memcg); | ||
785 | int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order); | ||
786 | void __memcg_kmem_uncharge(struct page *page, int order); | ||
787 | |||
788 | /* | ||
789 | * helper for accessing a memcg's index. It will be used as an index in the | 780 | * helper for accessing a memcg's index. It will be used as an index in the |
790 | * child cache array in kmem_cache, and also to derive its name. This function | 781 | * child cache array in kmem_cache, and also to derive its name. This function |
791 | * will return -1 when this is not a kmem-limited memcg. | 782 | * will return -1 when this is not a kmem-limited memcg. |
@@ -795,67 +786,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) | |||
795 | return memcg ? memcg->kmemcg_id : -1; | 786 | return memcg ? memcg->kmemcg_id : -1; |
796 | } | 787 | } |
797 | 788 | ||
798 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); | ||
799 | void __memcg_kmem_put_cache(struct kmem_cache *cachep); | ||
800 | |||
801 | static inline bool __memcg_kmem_bypass(void) | ||
802 | { | ||
803 | if (!memcg_kmem_enabled()) | ||
804 | return true; | ||
805 | if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) | ||
806 | return true; | ||
807 | return false; | ||
808 | } | ||
809 | |||
810 | /** | ||
811 | * memcg_kmem_charge: charge a kmem page | ||
812 | * @page: page to charge | ||
813 | * @gfp: reclaim mode | ||
814 | * @order: allocation order | ||
815 | * | ||
816 | * Returns 0 on success, an error code on failure. | ||
817 | */ | ||
818 | static __always_inline int memcg_kmem_charge(struct page *page, | ||
819 | gfp_t gfp, int order) | ||
820 | { | ||
821 | if (__memcg_kmem_bypass()) | ||
822 | return 0; | ||
823 | if (!(gfp & __GFP_ACCOUNT)) | ||
824 | return 0; | ||
825 | return __memcg_kmem_charge(page, gfp, order); | ||
826 | } | ||
827 | |||
828 | /** | ||
829 | * memcg_kmem_uncharge: uncharge a kmem page | ||
830 | * @page: page to uncharge | ||
831 | * @order: allocation order | ||
832 | */ | ||
833 | static __always_inline void memcg_kmem_uncharge(struct page *page, int order) | ||
834 | { | ||
835 | if (memcg_kmem_enabled()) | ||
836 | __memcg_kmem_uncharge(page, order); | ||
837 | } | ||
838 | |||
839 | /** | ||
840 | * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation | ||
841 | * @cachep: the original global kmem cache | ||
842 | * | ||
843 | * All memory allocated from a per-memcg cache is charged to the owner memcg. | ||
844 | */ | ||
845 | static __always_inline struct kmem_cache * | ||
846 | memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) | ||
847 | { | ||
848 | if (__memcg_kmem_bypass()) | ||
849 | return cachep; | ||
850 | return __memcg_kmem_get_cache(cachep, gfp); | ||
851 | } | ||
852 | |||
853 | static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep) | ||
854 | { | ||
855 | if (memcg_kmem_enabled()) | ||
856 | __memcg_kmem_put_cache(cachep); | ||
857 | } | ||
858 | |||
859 | /** | 789 | /** |
860 | * memcg_kmem_update_page_stat - update kmem page state statistics | 790 | * memcg_kmem_update_page_stat - update kmem page state statistics |
861 | * @page: the page | 791 | * @page: the page |
@@ -878,15 +808,6 @@ static inline bool memcg_kmem_enabled(void) | |||
878 | return false; | 808 | return false; |
879 | } | 809 | } |
880 | 810 | ||
881 | static inline int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) | ||
882 | { | ||
883 | return 0; | ||
884 | } | ||
885 | |||
886 | static inline void memcg_kmem_uncharge(struct page *page, int order) | ||
887 | { | ||
888 | } | ||
889 | |||
890 | static inline int memcg_cache_id(struct mem_cgroup *memcg) | 811 | static inline int memcg_cache_id(struct mem_cgroup *memcg) |
891 | { | 812 | { |
892 | return -1; | 813 | return -1; |
@@ -900,16 +821,6 @@ static inline void memcg_put_cache_ids(void) | |||
900 | { | 821 | { |
901 | } | 822 | } |
902 | 823 | ||
903 | static inline struct kmem_cache * | ||
904 | memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) | ||
905 | { | ||
906 | return cachep; | ||
907 | } | ||
908 | |||
909 | static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) | ||
910 | { | ||
911 | } | ||
912 | |||
913 | static inline void memcg_kmem_update_page_stat(struct page *page, | 824 | static inline void memcg_kmem_update_page_stat(struct page *page, |
914 | enum mem_cgroup_stat_index idx, int val) | 825 | enum mem_cgroup_stat_index idx, int val) |
915 | { | 826 | { |
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 5145620ba48a..01033fadea47 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h | |||
@@ -284,5 +284,7 @@ extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, | |||
284 | unsigned long map_offset); | 284 | unsigned long map_offset); |
285 | extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, | 285 | extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, |
286 | unsigned long pnum); | 286 | unsigned long pnum); |
287 | extern int zone_can_shift(unsigned long pfn, unsigned long nr_pages, | ||
288 | enum zone_type target); | ||
287 | 289 | ||
288 | #endif /* __LINUX_MEMORY_HOTPLUG_H */ | 290 | #endif /* __LINUX_MEMORY_HOTPLUG_H */ |
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 9b50325e4ddf..ae8d475a9385 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
@@ -37,6 +37,8 @@ extern int migrate_page(struct address_space *, | |||
37 | struct page *, struct page *, enum migrate_mode); | 37 | struct page *, struct page *, enum migrate_mode); |
38 | extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, | 38 | extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, |
39 | unsigned long private, enum migrate_mode mode, int reason); | 39 | unsigned long private, enum migrate_mode mode, int reason); |
40 | extern bool isolate_movable_page(struct page *page, isolate_mode_t mode); | ||
41 | extern void putback_movable_page(struct page *page); | ||
40 | 42 | ||
41 | extern int migrate_prep(void); | 43 | extern int migrate_prep(void); |
42 | extern int migrate_prep_local(void); | 44 | extern int migrate_prep_local(void); |
@@ -69,6 +71,21 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
69 | 71 | ||
70 | #endif /* CONFIG_MIGRATION */ | 72 | #endif /* CONFIG_MIGRATION */ |
71 | 73 | ||
74 | #ifdef CONFIG_COMPACTION | ||
75 | extern int PageMovable(struct page *page); | ||
76 | extern void __SetPageMovable(struct page *page, struct address_space *mapping); | ||
77 | extern void __ClearPageMovable(struct page *page); | ||
78 | #else | ||
79 | static inline int PageMovable(struct page *page) { return 0; }; | ||
80 | static inline void __SetPageMovable(struct page *page, | ||
81 | struct address_space *mapping) | ||
82 | { | ||
83 | } | ||
84 | static inline void __ClearPageMovable(struct page *page) | ||
85 | { | ||
86 | } | ||
87 | #endif | ||
88 | |||
72 | #ifdef CONFIG_NUMA_BALANCING | 89 | #ifdef CONFIG_NUMA_BALANCING |
73 | extern bool pmd_trans_migrating(pmd_t pmd); | 90 | extern bool pmd_trans_migrating(pmd_t pmd); |
74 | extern int migrate_misplaced_page(struct page *page, | 91 | extern int migrate_misplaced_page(struct page *page, |
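PageMovable()/__SetPageMovable() let compaction migrate non-LRU pages owned by drivers (balloon, zsmalloc): the driver marks each page movable against a mapping whose address_space_operations provide the new isolate_page/putback_page hooks next to migratepage (see the include/linux/fs.h hunk above). A hedged skeleton of such a set of operations; the bodies are placeholders for driver-specific pinning, copying and unpinning:

#include <linux/fs.h>
#include <linux/migrate.h>

static bool sketch_isolate_page(struct page *page, isolate_mode_t mode)
{
	/* pin driver metadata; returning true allows migration */
	return true;
}

static int sketch_migratepage(struct address_space *mapping,
			      struct page *newpage, struct page *page,
			      enum migrate_mode mode)
{
	/* copy contents and retarget driver references to newpage */
	return MIGRATEPAGE_SUCCESS;
}

static void sketch_putback_page(struct page *page)
{
	/* undo isolate_page() when migration is aborted */
}

static const struct address_space_operations sketch_movable_aops = {
	.isolate_page	= sketch_isolate_page,
	.migratepage	= sketch_migratepage,
	.putback_page	= sketch_putback_page,
};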
diff --git a/include/linux/mm.h b/include/linux/mm.h index ece042dfe23c..192c1bbe5fcd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -309,10 +309,34 @@ struct vm_fault { | |||
309 | * VM_FAULT_DAX_LOCKED and fill in | 309 | * VM_FAULT_DAX_LOCKED and fill in |
310 | * entry here. | 310 | * entry here. |
311 | */ | 311 | */ |
312 | /* for ->map_pages() only */ | 312 | }; |
313 | pgoff_t max_pgoff; /* map pages for offset from pgoff till | 313 | |
314 | * max_pgoff inclusive */ | 314 | /* |
315 | pte_t *pte; /* pte entry associated with ->pgoff */ | 315 | * Page fault context: passes through page fault handler instead of endless list |
316 | * of function arguments. | ||
317 | */ | ||
318 | struct fault_env { | ||
319 | struct vm_area_struct *vma; /* Target VMA */ | ||
320 | unsigned long address; /* Faulting virtual address */ | ||
321 | unsigned int flags; /* FAULT_FLAG_xxx flags */ | ||
322 | pmd_t *pmd; /* Pointer to pmd entry matching | ||
323 | * the 'address' | ||
324 | */ | ||
325 | pte_t *pte; /* Pointer to pte entry matching | ||
326 | * the 'address'. NULL if the page | ||
327 | * table hasn't been allocated. | ||
328 | */ | ||
329 | spinlock_t *ptl; /* Page table lock. | ||
330 | * Protects pte page table if 'pte' | ||
331 | * is not NULL, otherwise pmd. | ||
332 | */ | ||
333 | pgtable_t prealloc_pte; /* Pre-allocated pte page table. | ||
334 | * vm_ops->map_pages() calls | ||
335 | * alloc_set_pte() from atomic context. | ||
336 | * do_fault_around() pre-allocates | ||
337 | * page table to avoid allocation from | ||
338 | * atomic context. | ||
339 | */ | ||
316 | }; | 340 | }; |
317 | 341 | ||
318 | /* | 342 | /* |
@@ -327,7 +351,8 @@ struct vm_operations_struct { | |||
327 | int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); | 351 | int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); |
328 | int (*pmd_fault)(struct vm_area_struct *, unsigned long address, | 352 | int (*pmd_fault)(struct vm_area_struct *, unsigned long address, |
329 | pmd_t *, unsigned int flags); | 353 | pmd_t *, unsigned int flags); |
330 | void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf); | 354 | void (*map_pages)(struct fault_env *fe, |
355 | pgoff_t start_pgoff, pgoff_t end_pgoff); | ||
331 | 356 | ||
332 | /* notification that a previously read-only page is about to become | 357 | /* notification that a previously read-only page is about to become |
333 | * writable, if an error is returned it will cause a SIGBUS */ | 358 | * writable, if an error is returned it will cause a SIGBUS */ |
@@ -537,7 +562,6 @@ void __put_page(struct page *page); | |||
537 | void put_pages_list(struct list_head *pages); | 562 | void put_pages_list(struct list_head *pages); |
538 | 563 | ||
539 | void split_page(struct page *page, unsigned int order); | 564 | void split_page(struct page *page, unsigned int order); |
540 | int split_free_page(struct page *page); | ||
541 | 565 | ||
542 | /* | 566 | /* |
543 | * Compound pages have a destructor function. Provide a | 567 | * Compound pages have a destructor function. Provide a |
@@ -601,8 +625,8 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) | |||
601 | return pte; | 625 | return pte; |
602 | } | 626 | } |
603 | 627 | ||
604 | void do_set_pte(struct vm_area_struct *vma, unsigned long address, | 628 | int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, |
605 | struct page *page, pte_t *pte, bool write, bool anon); | 629 | struct page *page); |
606 | #endif | 630 | #endif |
607 | 631 | ||
608 | /* | 632 | /* |
@@ -1035,6 +1059,7 @@ static inline pgoff_t page_file_index(struct page *page) | |||
1035 | } | 1059 | } |
1036 | 1060 | ||
1037 | bool page_mapped(struct page *page); | 1061 | bool page_mapped(struct page *page); |
1062 | struct address_space *page_mapping(struct page *page); | ||
1038 | 1063 | ||
1039 | /* | 1064 | /* |
1040 | * Return true only if the page has been allocated with | 1065 | * Return true only if the page has been allocated with |
@@ -1215,15 +1240,14 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page); | |||
1215 | int invalidate_inode_page(struct page *page); | 1240 | int invalidate_inode_page(struct page *page); |
1216 | 1241 | ||
1217 | #ifdef CONFIG_MMU | 1242 | #ifdef CONFIG_MMU |
1218 | extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 1243 | extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, |
1219 | unsigned long address, unsigned int flags); | 1244 | unsigned int flags); |
1220 | extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | 1245 | extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, |
1221 | unsigned long address, unsigned int fault_flags, | 1246 | unsigned long address, unsigned int fault_flags, |
1222 | bool *unlocked); | 1247 | bool *unlocked); |
1223 | #else | 1248 | #else |
1224 | static inline int handle_mm_fault(struct mm_struct *mm, | 1249 | static inline int handle_mm_fault(struct vm_area_struct *vma, |
1225 | struct vm_area_struct *vma, unsigned long address, | 1250 | unsigned long address, unsigned int flags) |
1226 | unsigned int flags) | ||
1227 | { | 1251 | { |
1228 | /* should never happen if there's no MMU */ | 1252 | /* should never happen if there's no MMU */ |
1229 | BUG(); | 1253 | BUG(); |
@@ -2063,7 +2087,8 @@ extern void truncate_inode_pages_final(struct address_space *); | |||
2063 | 2087 | ||
2064 | /* generic vm_area_ops exported for stackable file systems */ | 2088 | /* generic vm_area_ops exported for stackable file systems */ |
2065 | extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); | 2089 | extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); |
2066 | extern void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf); | 2090 | extern void filemap_map_pages(struct fault_env *fe, |
2091 | pgoff_t start_pgoff, pgoff_t end_pgoff); | ||
2067 | extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); | 2092 | extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); |
2068 | 2093 | ||
2069 | /* mm/page-writeback.c */ | 2094 | /* mm/page-writeback.c */ |
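The struct fault_env introduced above bundles the VMA, faulting address, fault flags, page-table pointers and the pre-allocated pte table into one object, and the changed prototypes (map_pages(), alloc_set_pte(), handle_mm_fault(), filemap_map_pages()) now take that object instead of ever-growing argument lists. A small userspace sketch of the same context-struct refactoring, with made-up names, to show why the pattern scales better than adding parameters:

#include <stdio.h>

/* Before: every new piece of state means touching every prototype. */
static int handle_old(unsigned long addr, unsigned int flags,
                      void *pmd, void *pte)
{
    (void)pmd; (void)pte;
    printf("old: addr=%#lx flags=%#x\n", addr, flags);
    return 0;
}

/* After: one context object passed down the call chain. */
struct fault_ctx {
    unsigned long address;  /* faulting virtual address */
    unsigned int flags;     /* FAULT_FLAG_xxx-style flags */
    void *pmd;              /* page-table pointers resolved on the way down */
    void *pte;
};

static int handle_new(struct fault_ctx *ctx)
{
    printf("new: addr=%#lx flags=%#x\n", ctx->address, ctx->flags);
    return 0;
}

int main(void)
{
    struct fault_ctx ctx = { .address = 0x1000, .flags = 0x2 };

    handle_old(0x1000, 0x2, NULL, NULL);
    handle_new(&ctx);   /* adding a field later changes no prototypes */
    return 0;
}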
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 917f2b6a0cde..79472b22d23f 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -60,51 +60,52 @@ struct page { | |||
60 | }; | 60 | }; |
61 | 61 | ||
62 | /* Second double word */ | 62 | /* Second double word */ |
63 | struct { | 63 | union { |
64 | union { | 64 | pgoff_t index; /* Our offset within mapping. */ |
65 | pgoff_t index; /* Our offset within mapping. */ | 65 | void *freelist; /* sl[aou]b first free object */ |
66 | void *freelist; /* sl[aou]b first free object */ | 66 | /* page_deferred_list().prev -- second tail page */ |
67 | /* page_deferred_list().prev -- second tail page */ | 67 | }; |
68 | }; | ||
69 | 68 | ||
70 | union { | 69 | union { |
71 | #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ | 70 | #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ |
72 | defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) | 71 | defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) |
73 | /* Used for cmpxchg_double in slub */ | 72 | /* Used for cmpxchg_double in slub */ |
74 | unsigned long counters; | 73 | unsigned long counters; |
75 | #else | 74 | #else |
76 | /* | 75 | /* |
77 | * Keep _refcount separate from slub cmpxchg_double | 76 | * Keep _refcount separate from slub cmpxchg_double data. |
78 | * data. As the rest of the double word is protected by | 77 | * As the rest of the double word is protected by slab_lock |
79 | * slab_lock but _refcount is not. | 78 | * but _refcount is not. |
80 | */ | 79 | */ |
81 | unsigned counters; | 80 | unsigned counters; |
82 | #endif | 81 | #endif |
82 | struct { | ||
83 | 83 | ||
84 | struct { | 84 | union { |
85 | |||
86 | union { | ||
87 | /* | ||
88 | * Count of ptes mapped in mms, to show | ||
89 | * when page is mapped & limit reverse | ||
90 | * map searches. | ||
91 | */ | ||
92 | atomic_t _mapcount; | ||
93 | |||
94 | struct { /* SLUB */ | ||
95 | unsigned inuse:16; | ||
96 | unsigned objects:15; | ||
97 | unsigned frozen:1; | ||
98 | }; | ||
99 | int units; /* SLOB */ | ||
100 | }; | ||
101 | /* | 85 | /* |
102 | * Usage count, *USE WRAPPER FUNCTION* | 86 | * Count of ptes mapped in mms, to show when |
103 | * when manual accounting. See page_ref.h | 87 | * page is mapped & limit reverse map searches. |
88 | * | ||
89 | * Extra information about page type may be | ||
90 | * stored here for pages that are never mapped, | ||
91 | * in which case the value MUST BE <= -2. | ||
92 | * See page-flags.h for more details. | ||
104 | */ | 93 | */ |
105 | atomic_t _refcount; | 94 | atomic_t _mapcount; |
95 | |||
96 | unsigned int active; /* SLAB */ | ||
97 | struct { /* SLUB */ | ||
98 | unsigned inuse:16; | ||
99 | unsigned objects:15; | ||
100 | unsigned frozen:1; | ||
101 | }; | ||
102 | int units; /* SLOB */ | ||
106 | }; | 103 | }; |
107 | unsigned int active; /* SLAB */ | 104 | /* |
105 | * Usage count, *USE WRAPPER FUNCTION* when manual | ||
106 | * accounting. See page_ref.h | ||
107 | */ | ||
108 | atomic_t _refcount; | ||
108 | }; | 109 | }; |
109 | }; | 110 | }; |
110 | 111 | ||
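The reshuffled second double word of struct page overlays its mutually exclusive users (page-cache index, slab freelist, _mapcount or a page-type sentinel, SLUB's packed counters, _refcount) in nested anonymous unions so the structure does not grow. A reduced standalone sketch of that overlay technique; the field names are invented and this is not the real struct page layout:

#include <stdio.h>

struct mini_page {
    union {
        unsigned long index;        /* page cache offset */
        void *freelist;             /* slab first free object */
    };
    union {
        unsigned long counters;     /* updated as one word */
        struct {
            union {
                int mapcount;       /* or a type sentinel <= -2 */
                struct {            /* slab-style packed counters */
                    unsigned inuse:16;
                    unsigned objects:15;
                    unsigned frozen:1;
                };
            };
            int refcount;
        };
    };
};

/* The point of the unions: the overlay costs one word plus two 32-bit counters. */
_Static_assert(sizeof(struct mini_page) == sizeof(void *) + 2 * sizeof(int),
               "overlay must not grow the structure");

int main(void)
{
    struct mini_page p = { .index = 42 };

    p.mapcount = -1;
    p.refcount = 1;
    printf("index=%lu mapcount=%d refcount=%d\n",
           p.index, p.mapcount, p.refcount);
    return 0;
}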
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index de7be78c6f0e..451a811f48f2 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h | |||
@@ -39,6 +39,7 @@ void dump_mm(const struct mm_struct *mm); | |||
39 | #define VM_WARN_ON(cond) WARN_ON(cond) | 39 | #define VM_WARN_ON(cond) WARN_ON(cond) |
40 | #define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) | 40 | #define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) |
41 | #define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) | 41 | #define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) |
42 | #define VM_WARN(cond, format...) WARN(cond, format) | ||
42 | #else | 43 | #else |
43 | #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) | 44 | #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) |
44 | #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) | 45 | #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) |
@@ -47,6 +48,7 @@ void dump_mm(const struct mm_struct *mm); | |||
47 | #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) | 48 | #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) |
48 | #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) | 49 | #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) |
49 | #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) | 50 | #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) |
51 | #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) | ||
50 | #endif | 52 | #endif |
51 | 53 | ||
52 | #ifdef CONFIG_DEBUG_VIRTUAL | 54 | #ifdef CONFIG_DEBUG_VIRTUAL |
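The added VM_WARN() keeps the established shape of the VM_WARN_ON() family: a real WARN() when CONFIG_DEBUG_VM is enabled, and a stub that still lets the compiler see the condition (so it keeps type-checking) but emits no code otherwise. A rough userspace approximation of that pattern; the kernel's BUILD_BUG_ON_INVALID() is more involved than the sizeof trick used here:

#include <stdio.h>

#define DEBUG_CHECKS 1  /* flip to 0 to compile the checks away */

#if DEBUG_CHECKS
#define MY_WARN(cond, ...)                              \
    do {                                                \
        if (cond) {                                     \
            fprintf(stderr, "warning: ");               \
            fprintf(stderr, __VA_ARGS__);               \
            fprintf(stderr, "\n");                      \
        }                                               \
    } while (0)
#else
/*
 * Evaluate nothing at run time, but keep the expression visible to the
 * compiler so typos in 'cond' still fail the build.
 */
#define MY_WARN(cond, ...)  ((void)sizeof(!(cond)))
#endif

int main(void)
{
    int nr_pages = -1;

    MY_WARN(nr_pages < 0, "nr_pages went negative: %d", nr_pages);
    return 0;
}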
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 02069c23486d..19425e988bdc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -140,6 +140,9 @@ enum zone_stat_item { | |||
140 | NR_DIRTIED, /* page dirtyings since bootup */ | 140 | NR_DIRTIED, /* page dirtyings since bootup */ |
141 | NR_WRITTEN, /* page writings since bootup */ | 141 | NR_WRITTEN, /* page writings since bootup */ |
142 | NR_PAGES_SCANNED, /* pages scanned since last reclaim */ | 142 | NR_PAGES_SCANNED, /* pages scanned since last reclaim */ |
143 | #if IS_ENABLED(CONFIG_ZSMALLOC) | ||
144 | NR_ZSPAGES, /* allocated in zsmalloc */ | ||
145 | #endif | ||
143 | #ifdef CONFIG_NUMA | 146 | #ifdef CONFIG_NUMA |
144 | NUMA_HIT, /* allocated in intended node */ | 147 | NUMA_HIT, /* allocated in intended node */ |
145 | NUMA_MISS, /* allocated in non intended node */ | 148 | NUMA_MISS, /* allocated in non intended node */ |
@@ -151,7 +154,9 @@ enum zone_stat_item { | |||
151 | WORKINGSET_REFAULT, | 154 | WORKINGSET_REFAULT, |
152 | WORKINGSET_ACTIVATE, | 155 | WORKINGSET_ACTIVATE, |
153 | WORKINGSET_NODERECLAIM, | 156 | WORKINGSET_NODERECLAIM, |
154 | NR_ANON_TRANSPARENT_HUGEPAGES, | 157 | NR_ANON_THPS, |
158 | NR_SHMEM_THPS, | ||
159 | NR_SHMEM_PMDMAPPED, | ||
155 | NR_FREE_CMA_PAGES, | 160 | NR_FREE_CMA_PAGES, |
156 | NR_VM_ZONE_STAT_ITEMS }; | 161 | NR_VM_ZONE_STAT_ITEMS }; |
157 | 162 | ||
@@ -524,7 +529,6 @@ struct zone { | |||
524 | 529 | ||
525 | enum zone_flags { | 530 | enum zone_flags { |
526 | ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ | 531 | ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ |
527 | ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */ | ||
528 | ZONE_CONGESTED, /* zone has many dirty pages backed by | 532 | ZONE_CONGESTED, /* zone has many dirty pages backed by |
529 | * a congested BDI | 533 | * a congested BDI |
530 | */ | 534 | */ |
diff --git a/include/linux/oom.h b/include/linux/oom.h index 83469522690a..606137b3b778 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
@@ -23,6 +23,9 @@ struct oom_control { | |||
23 | /* Used to determine mempolicy */ | 23 | /* Used to determine mempolicy */ |
24 | nodemask_t *nodemask; | 24 | nodemask_t *nodemask; |
25 | 25 | ||
26 | /* Memory cgroup in which oom is invoked, or NULL for global oom */ | ||
27 | struct mem_cgroup *memcg; | ||
28 | |||
26 | /* Used to determine cpuset and node locality requirement */ | 29 | /* Used to determine cpuset and node locality requirement */ |
27 | const gfp_t gfp_mask; | 30 | const gfp_t gfp_mask; |
28 | 31 | ||
@@ -83,14 +86,13 @@ extern unsigned long oom_badness(struct task_struct *p, | |||
83 | 86 | ||
84 | extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, | 87 | extern void oom_kill_process(struct oom_control *oc, struct task_struct *p, |
85 | unsigned int points, unsigned long totalpages, | 88 | unsigned int points, unsigned long totalpages, |
86 | struct mem_cgroup *memcg, const char *message); | 89 | const char *message); |
87 | 90 | ||
88 | extern void check_panic_on_oom(struct oom_control *oc, | 91 | extern void check_panic_on_oom(struct oom_control *oc, |
89 | enum oom_constraint constraint, | 92 | enum oom_constraint constraint); |
90 | struct mem_cgroup *memcg); | ||
91 | 93 | ||
92 | extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, | 94 | extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, |
93 | struct task_struct *task, unsigned long totalpages); | 95 | struct task_struct *task); |
94 | 96 | ||
95 | extern bool out_of_memory(struct oom_control *oc); | 97 | extern bool out_of_memory(struct oom_control *oc); |
96 | 98 | ||
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e5a32445f930..74e4dda91238 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h | |||
@@ -129,6 +129,9 @@ enum pageflags { | |||
129 | 129 | ||
130 | /* Compound pages. Stored in first tail page's flags */ | 130 | /* Compound pages. Stored in first tail page's flags */ |
131 | PG_double_map = PG_private_2, | 131 | PG_double_map = PG_private_2, |
132 | |||
133 | /* non-lru isolated movable page */ | ||
134 | PG_isolated = PG_reclaim, | ||
132 | }; | 135 | }; |
133 | 136 | ||
134 | #ifndef __GENERATING_BOUNDS_H | 137 | #ifndef __GENERATING_BOUNDS_H |
@@ -292,11 +295,11 @@ PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) | |||
292 | */ | 295 | */ |
293 | TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) | 296 | TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) |
294 | TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) | 297 | TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) |
295 | PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND) | 298 | PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) |
296 | 299 | ||
297 | /* PG_readahead is only used for reads; PG_reclaim is only for writes */ | 300 | /* PG_readahead is only used for reads; PG_reclaim is only for writes */ |
298 | PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND) | 301 | PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) |
299 | TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND) | 302 | TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) |
300 | PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) | 303 | PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) |
301 | TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) | 304 | TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) |
302 | 305 | ||
@@ -357,29 +360,37 @@ PAGEFLAG(Idle, idle, PF_ANY) | |||
357 | * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. | 360 | * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. |
358 | * | 361 | * |
359 | * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, | 362 | * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, |
360 | * the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit; | 363 | * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON |
361 | * and then page->mapping points, not to an anon_vma, but to a private | 364 | * bit; and then page->mapping points, not to an anon_vma, but to a private |
362 | * structure which KSM associates with that merged page. See ksm.h. | 365 | * structure which KSM associates with that merged page. See ksm.h. |
363 | * | 366 | * |
364 | * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used. | 367 | * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable |
368 | * page and then page->mapping points to a struct address_space. | ||
365 | * | 369 | * |
366 | * Please note that, confusingly, "page_mapping" refers to the inode | 370 | * Please note that, confusingly, "page_mapping" refers to the inode |
367 | * address_space which maps the page from disk; whereas "page_mapped" | 371 | * address_space which maps the page from disk; whereas "page_mapped" |
368 | * refers to user virtual address space into which the page is mapped. | 372 | * refers to user virtual address space into which the page is mapped. |
369 | */ | 373 | */ |
370 | #define PAGE_MAPPING_ANON 1 | 374 | #define PAGE_MAPPING_ANON 0x1 |
371 | #define PAGE_MAPPING_KSM 2 | 375 | #define PAGE_MAPPING_MOVABLE 0x2 |
372 | #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) | 376 | #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) |
377 | #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) | ||
373 | 378 | ||
374 | static __always_inline int PageAnonHead(struct page *page) | 379 | static __always_inline int PageMappingFlags(struct page *page) |
375 | { | 380 | { |
376 | return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; | 381 | return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0; |
377 | } | 382 | } |
378 | 383 | ||
379 | static __always_inline int PageAnon(struct page *page) | 384 | static __always_inline int PageAnon(struct page *page) |
380 | { | 385 | { |
381 | page = compound_head(page); | 386 | page = compound_head(page); |
382 | return PageAnonHead(page); | 387 | return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; |
388 | } | ||
389 | |||
390 | static __always_inline int __PageMovable(struct page *page) | ||
391 | { | ||
392 | return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == | ||
393 | PAGE_MAPPING_MOVABLE; | ||
383 | } | 394 | } |
384 | 395 | ||
385 | #ifdef CONFIG_KSM | 396 | #ifdef CONFIG_KSM |
@@ -393,7 +404,7 @@ static __always_inline int PageKsm(struct page *page) | |||
393 | { | 404 | { |
394 | page = compound_head(page); | 405 | page = compound_head(page); |
395 | return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == | 406 | return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == |
396 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | 407 | PAGE_MAPPING_KSM; |
397 | } | 408 | } |
398 | #else | 409 | #else |
399 | TESTPAGEFLAG_FALSE(Ksm) | 410 | TESTPAGEFLAG_FALSE(Ksm) |
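The PAGE_MAPPING_ANON/PAGE_MAPPING_MOVABLE/PAGE_MAPPING_KSM rework stores page-type bits in the low two bits of page->mapping, which are free because the pointed-to structures are at least word aligned; KSM is now simply the combination of the other two bits. A standalone sketch of that tagged-pointer technique with hypothetical names (the flag values mirror the header above, everything else is illustrative):

#include <stdio.h>
#include <stdint.h>

#define MAPPING_ANON    0x1UL
#define MAPPING_MOVABLE 0x2UL
#define MAPPING_KSM     (MAPPING_ANON | MAPPING_MOVABLE)
#define MAPPING_FLAGS   (MAPPING_ANON | MAPPING_MOVABLE)

struct mapping {                /* stand-in for struct address_space etc. */
    long dummy;
} __attribute__((aligned(4)));  /* low two pointer bits are always zero */

static void *tag(struct mapping *m, unsigned long flags)
{
    return (void *)((uintptr_t)m | flags);
}

static int is_anon(void *tagged)
{
    return ((uintptr_t)tagged & MAPPING_ANON) != 0;
}

static int is_movable(void *tagged) /* non-LRU movable: MOVABLE without ANON */
{
    return ((uintptr_t)tagged & MAPPING_FLAGS) == MAPPING_MOVABLE;
}

static struct mapping *untag(void *tagged)
{
    return (struct mapping *)((uintptr_t)tagged & ~MAPPING_FLAGS);
}

int main(void)
{
    static struct mapping m;
    void *anon = tag(&m, MAPPING_ANON);
    void *movable = tag(&m, MAPPING_MOVABLE);

    printf("anon: anon=%d movable=%d\n", is_anon(anon), is_movable(anon));
    printf("movable: anon=%d movable=%d untag_ok=%d\n",
           is_anon(movable), is_movable(movable), untag(movable) == &m);
    return 0;
}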
@@ -570,6 +581,17 @@ static inline int PageDoubleMap(struct page *page) | |||
570 | return PageHead(page) && test_bit(PG_double_map, &page[1].flags); | 581 | return PageHead(page) && test_bit(PG_double_map, &page[1].flags); |
571 | } | 582 | } |
572 | 583 | ||
584 | static inline void SetPageDoubleMap(struct page *page) | ||
585 | { | ||
586 | VM_BUG_ON_PAGE(!PageHead(page), page); | ||
587 | set_bit(PG_double_map, &page[1].flags); | ||
588 | } | ||
589 | |||
590 | static inline void ClearPageDoubleMap(struct page *page) | ||
591 | { | ||
592 | VM_BUG_ON_PAGE(!PageHead(page), page); | ||
593 | clear_bit(PG_double_map, &page[1].flags); | ||
594 | } | ||
573 | static inline int TestSetPageDoubleMap(struct page *page) | 595 | static inline int TestSetPageDoubleMap(struct page *page) |
574 | { | 596 | { |
575 | VM_BUG_ON_PAGE(!PageHead(page), page); | 597 | VM_BUG_ON_PAGE(!PageHead(page), page); |
@@ -587,59 +609,59 @@ TESTPAGEFLAG_FALSE(TransHuge) | |||
587 | TESTPAGEFLAG_FALSE(TransCompound) | 609 | TESTPAGEFLAG_FALSE(TransCompound) |
588 | TESTPAGEFLAG_FALSE(TransCompoundMap) | 610 | TESTPAGEFLAG_FALSE(TransCompoundMap) |
589 | TESTPAGEFLAG_FALSE(TransTail) | 611 | TESTPAGEFLAG_FALSE(TransTail) |
590 | TESTPAGEFLAG_FALSE(DoubleMap) | 612 | PAGEFLAG_FALSE(DoubleMap) |
591 | TESTSETFLAG_FALSE(DoubleMap) | 613 | TESTSETFLAG_FALSE(DoubleMap) |
592 | TESTCLEARFLAG_FALSE(DoubleMap) | 614 | TESTCLEARFLAG_FALSE(DoubleMap) |
593 | #endif | 615 | #endif |
594 | 616 | ||
595 | /* | 617 | /* |
618 | * For pages that are never mapped to userspace, page->mapcount may be | ||
619 | * used for storing extra information about page type. Any value used | ||
620 | * for this purpose must be <= -2, but it's better to start not too close | ||
621 | * to -2 so that an underflow of the page_mapcount() won't be mistaken | ||
622 | * for a special page. | ||
623 | */ | ||
624 | #define PAGE_MAPCOUNT_OPS(uname, lname) \ | ||
625 | static __always_inline int Page##uname(struct page *page) \ | ||
626 | { \ | ||
627 | return atomic_read(&page->_mapcount) == \ | ||
628 | PAGE_##lname##_MAPCOUNT_VALUE; \ | ||
629 | } \ | ||
630 | static __always_inline void __SetPage##uname(struct page *page) \ | ||
631 | { \ | ||
632 | VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); \ | ||
633 | atomic_set(&page->_mapcount, PAGE_##lname##_MAPCOUNT_VALUE); \ | ||
634 | } \ | ||
635 | static __always_inline void __ClearPage##uname(struct page *page) \ | ||
636 | { \ | ||
637 | VM_BUG_ON_PAGE(!Page##uname(page), page); \ | ||
638 | atomic_set(&page->_mapcount, -1); \ | ||
639 | } | ||
640 | |||
641 | /* | ||
596 | * PageBuddy() indicates that the page is free and in the buddy system | 642 | * PageBuddy() indicates that the page is free and in the buddy system |
597 | * (see mm/page_alloc.c). | 643 | * (see mm/page_alloc.c). |
598 | * | ||
599 | * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to | ||
600 | * -2 so that an underflow of the page_mapcount() won't be mistaken | ||
601 | * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very | ||
602 | * efficiently by most CPU architectures. | ||
603 | */ | 644 | */ |
604 | #define PAGE_BUDDY_MAPCOUNT_VALUE (-128) | 645 | #define PAGE_BUDDY_MAPCOUNT_VALUE (-128) |
605 | 646 | PAGE_MAPCOUNT_OPS(Buddy, BUDDY) | |
606 | static inline int PageBuddy(struct page *page) | ||
607 | { | ||
608 | return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE; | ||
609 | } | ||
610 | 647 | ||
611 | static inline void __SetPageBuddy(struct page *page) | 648 | /* |
612 | { | 649 | * PageBalloon() is set on pages that are on the balloon page list |
613 | VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); | 650 | * (see mm/balloon_compaction.c). |
614 | atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE); | 651 | */ |
615 | } | 652 | #define PAGE_BALLOON_MAPCOUNT_VALUE (-256) |
653 | PAGE_MAPCOUNT_OPS(Balloon, BALLOON) | ||
616 | 654 | ||
617 | static inline void __ClearPageBuddy(struct page *page) | 655 | /* |
618 | { | 656 | * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on |
619 | VM_BUG_ON_PAGE(!PageBuddy(page), page); | 657 | * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. |
620 | atomic_set(&page->_mapcount, -1); | 658 | */ |
621 | } | 659 | #define PAGE_KMEMCG_MAPCOUNT_VALUE (-512) |
660 | PAGE_MAPCOUNT_OPS(Kmemcg, KMEMCG) | ||
622 | 661 | ||
623 | extern bool is_free_buddy_page(struct page *page); | 662 | extern bool is_free_buddy_page(struct page *page); |
624 | 663 | ||
625 | #define PAGE_BALLOON_MAPCOUNT_VALUE (-256) | 664 | __PAGEFLAG(Isolated, isolated, PF_ANY); |
626 | |||
627 | static inline int PageBalloon(struct page *page) | ||
628 | { | ||
629 | return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE; | ||
630 | } | ||
631 | |||
632 | static inline void __SetPageBalloon(struct page *page) | ||
633 | { | ||
634 | VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); | ||
635 | atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE); | ||
636 | } | ||
637 | |||
638 | static inline void __ClearPageBalloon(struct page *page) | ||
639 | { | ||
640 | VM_BUG_ON_PAGE(!PageBalloon(page), page); | ||
641 | atomic_set(&page->_mapcount, -1); | ||
642 | } | ||
643 | 665 | ||
644 | /* | 666 | /* |
645 | * If network-based swap is enabled, sl*b must keep track of whether pages | 667 | * If network-based swap is enabled, sl*b must keep track of whether pages |
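PAGE_MAPCOUNT_OPS() replaces the hand-written Buddy and Balloon accessor triples, and adds Kmemcg, by generating Page*()/__SetPage*()/__ClearPage*() helpers for a given sentinel _mapcount value. A reduced userspace sketch of that code-generating-macro technique, using a plain int instead of atomic_t and invented type names:

#include <stdio.h>
#include <assert.h>

struct fake_page {
    int mapcount;       /* -1 means "ordinary page" here */
};

/* Stamp out is_/set_/clear_ helpers for one sentinel value. */
#define DEFINE_TYPE_OPS(name, value)                                \
static inline int is_##name(struct fake_page *p)                    \
{                                                                   \
    return p->mapcount == (value);                                  \
}                                                                   \
static inline void set_##name(struct fake_page *p)                  \
{                                                                   \
    assert(p->mapcount == -1);  /* must not already be typed */     \
    p->mapcount = (value);                                          \
}                                                                   \
static inline void clear_##name(struct fake_page *p)                \
{                                                                   \
    assert(is_##name(p));                                           \
    p->mapcount = -1;                                               \
}

/*
 * Sentinels kept well below -2 so a mapcount underflow is not mistaken
 * for a page type; the values mirror the header above.
 */
DEFINE_TYPE_OPS(buddy, -128)
DEFINE_TYPE_OPS(balloon, -256)

int main(void)
{
    struct fake_page p = { .mapcount = -1 };

    set_buddy(&p);
    printf("buddy=%d balloon=%d\n", is_buddy(&p), is_balloon(&p));
    clear_buddy(&p);
    printf("buddy=%d\n", is_buddy(&p));
    return 0;
}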
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index e1fe7cf5bddf..03f2a3e7d76d 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h | |||
@@ -3,6 +3,7 @@ | |||
3 | 3 | ||
4 | #include <linux/types.h> | 4 | #include <linux/types.h> |
5 | #include <linux/stacktrace.h> | 5 | #include <linux/stacktrace.h> |
6 | #include <linux/stackdepot.h> | ||
6 | 7 | ||
7 | struct pglist_data; | 8 | struct pglist_data; |
8 | struct page_ext_operations { | 9 | struct page_ext_operations { |
@@ -44,9 +45,8 @@ struct page_ext { | |||
44 | #ifdef CONFIG_PAGE_OWNER | 45 | #ifdef CONFIG_PAGE_OWNER |
45 | unsigned int order; | 46 | unsigned int order; |
46 | gfp_t gfp_mask; | 47 | gfp_t gfp_mask; |
47 | unsigned int nr_entries; | ||
48 | int last_migrate_reason; | 48 | int last_migrate_reason; |
49 | unsigned long trace_entries[8]; | 49 | depot_stack_handle_t handle; |
50 | #endif | 50 | #endif |
51 | }; | 51 | }; |
52 | 52 | ||
diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h index 46f1b939948c..30583ab0ffb1 100644 --- a/include/linux/page_owner.h +++ b/include/linux/page_owner.h | |||
@@ -10,7 +10,7 @@ extern struct page_ext_operations page_owner_ops; | |||
10 | extern void __reset_page_owner(struct page *page, unsigned int order); | 10 | extern void __reset_page_owner(struct page *page, unsigned int order); |
11 | extern void __set_page_owner(struct page *page, | 11 | extern void __set_page_owner(struct page *page, |
12 | unsigned int order, gfp_t gfp_mask); | 12 | unsigned int order, gfp_t gfp_mask); |
13 | extern gfp_t __get_page_owner_gfp(struct page *page); | 13 | extern void __split_page_owner(struct page *page, unsigned int order); |
14 | extern void __copy_page_owner(struct page *oldpage, struct page *newpage); | 14 | extern void __copy_page_owner(struct page *oldpage, struct page *newpage); |
15 | extern void __set_page_owner_migrate_reason(struct page *page, int reason); | 15 | extern void __set_page_owner_migrate_reason(struct page *page, int reason); |
16 | extern void __dump_page_owner(struct page *page); | 16 | extern void __dump_page_owner(struct page *page); |
@@ -28,12 +28,10 @@ static inline void set_page_owner(struct page *page, | |||
28 | __set_page_owner(page, order, gfp_mask); | 28 | __set_page_owner(page, order, gfp_mask); |
29 | } | 29 | } |
30 | 30 | ||
31 | static inline gfp_t get_page_owner_gfp(struct page *page) | 31 | static inline void split_page_owner(struct page *page, unsigned int order) |
32 | { | 32 | { |
33 | if (static_branch_unlikely(&page_owner_inited)) | 33 | if (static_branch_unlikely(&page_owner_inited)) |
34 | return __get_page_owner_gfp(page); | 34 | __split_page_owner(page, order); |
35 | else | ||
36 | return 0; | ||
37 | } | 35 | } |
38 | static inline void copy_page_owner(struct page *oldpage, struct page *newpage) | 36 | static inline void copy_page_owner(struct page *oldpage, struct page *newpage) |
39 | { | 37 | { |
@@ -58,9 +56,9 @@ static inline void set_page_owner(struct page *page, | |||
58 | unsigned int order, gfp_t gfp_mask) | 56 | unsigned int order, gfp_t gfp_mask) |
59 | { | 57 | { |
60 | } | 58 | } |
61 | static inline gfp_t get_page_owner_gfp(struct page *page) | 59 | static inline void split_page_owner(struct page *page, |
60 | unsigned int order) | ||
62 | { | 61 | { |
63 | return 0; | ||
64 | } | 62 | } |
65 | static inline void copy_page_owner(struct page *oldpage, struct page *newpage) | 63 | static inline void copy_page_owner(struct page *oldpage, struct page *newpage) |
66 | { | 64 | { |
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 97354102794d..81363b834900 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h | |||
@@ -209,10 +209,10 @@ static inline struct page *page_cache_alloc_cold(struct address_space *x) | |||
209 | return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); | 209 | return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); |
210 | } | 210 | } |
211 | 211 | ||
212 | static inline struct page *page_cache_alloc_readahead(struct address_space *x) | 212 | static inline gfp_t readahead_gfp_mask(struct address_space *x) |
213 | { | 213 | { |
214 | return __page_cache_alloc(mapping_gfp_mask(x) | | 214 | return mapping_gfp_mask(x) | |
215 | __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN); | 215 | __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN; |
216 | } | 216 | } |
217 | 217 | ||
218 | typedef int filler_t(void *, struct page *); | 218 | typedef int filler_t(void *, struct page *); |
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index eca6f626c16e..cbfee507c839 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h | |||
@@ -291,6 +291,7 @@ unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, | |||
291 | unsigned long first_index, unsigned int max_items); | 291 | unsigned long first_index, unsigned int max_items); |
292 | int radix_tree_preload(gfp_t gfp_mask); | 292 | int radix_tree_preload(gfp_t gfp_mask); |
293 | int radix_tree_maybe_preload(gfp_t gfp_mask); | 293 | int radix_tree_maybe_preload(gfp_t gfp_mask); |
294 | int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order); | ||
294 | void radix_tree_init(void); | 295 | void radix_tree_init(void); |
295 | void *radix_tree_tag_set(struct radix_tree_root *root, | 296 | void *radix_tree_tag_set(struct radix_tree_root *root, |
296 | unsigned long index, unsigned int tag); | 297 | unsigned long index, unsigned int tag); |
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 2b0fad83683f..b46bb5620a76 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h | |||
@@ -165,7 +165,7 @@ void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, | |||
165 | unsigned long, int); | 165 | unsigned long, int); |
166 | void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, | 166 | void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, |
167 | unsigned long, bool); | 167 | unsigned long, bool); |
168 | void page_add_file_rmap(struct page *); | 168 | void page_add_file_rmap(struct page *, bool); |
169 | void page_remove_rmap(struct page *, bool); | 169 | void page_remove_rmap(struct page *, bool); |
170 | 170 | ||
171 | void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, | 171 | void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, |
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 4d4780c00d34..ff078e7043b6 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h | |||
@@ -16,8 +16,9 @@ struct shmem_inode_info { | |||
16 | unsigned long flags; | 16 | unsigned long flags; |
17 | unsigned long alloced; /* data pages alloced to file */ | 17 | unsigned long alloced; /* data pages alloced to file */ |
18 | unsigned long swapped; /* subtotal assigned to swap */ | 18 | unsigned long swapped; /* subtotal assigned to swap */ |
19 | struct shared_policy policy; /* NUMA memory alloc policy */ | 19 | struct list_head shrinklist; /* shrinkable hpage inodes */ |
20 | struct list_head swaplist; /* chain of maybes on swap */ | 20 | struct list_head swaplist; /* chain of maybes on swap */ |
21 | struct shared_policy policy; /* NUMA memory alloc policy */ | ||
21 | struct simple_xattrs xattrs; /* list of xattrs */ | 22 | struct simple_xattrs xattrs; /* list of xattrs */ |
22 | struct inode vfs_inode; | 23 | struct inode vfs_inode; |
23 | }; | 24 | }; |
@@ -28,10 +29,14 @@ struct shmem_sb_info { | |||
28 | unsigned long max_inodes; /* How many inodes are allowed */ | 29 | unsigned long max_inodes; /* How many inodes are allowed */ |
29 | unsigned long free_inodes; /* How many are left for allocation */ | 30 | unsigned long free_inodes; /* How many are left for allocation */ |
30 | spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ | 31 | spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ |
32 | umode_t mode; /* Mount mode for root directory */ | ||
33 | unsigned char huge; /* Whether to try for hugepages */ | ||
31 | kuid_t uid; /* Mount uid for root directory */ | 34 | kuid_t uid; /* Mount uid for root directory */ |
32 | kgid_t gid; /* Mount gid for root directory */ | 35 | kgid_t gid; /* Mount gid for root directory */ |
33 | umode_t mode; /* Mount mode for root directory */ | ||
34 | struct mempolicy *mpol; /* default memory policy for mappings */ | 36 | struct mempolicy *mpol; /* default memory policy for mappings */ |
37 | spinlock_t shrinklist_lock; /* Protects shrinklist */ | ||
38 | struct list_head shrinklist; /* List of shrinkable inodes */ | ||
39 | unsigned long shrinklist_len; /* Length of shrinklist */ | ||
35 | }; | 40 | }; |
36 | 41 | ||
37 | static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) | 42 | static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) |
@@ -49,6 +54,8 @@ extern struct file *shmem_file_setup(const char *name, | |||
49 | extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, | 54 | extern struct file *shmem_kernel_file_setup(const char *name, loff_t size, |
50 | unsigned long flags); | 55 | unsigned long flags); |
51 | extern int shmem_zero_setup(struct vm_area_struct *); | 56 | extern int shmem_zero_setup(struct vm_area_struct *); |
57 | extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, | ||
58 | unsigned long len, unsigned long pgoff, unsigned long flags); | ||
52 | extern int shmem_lock(struct file *file, int lock, struct user_struct *user); | 59 | extern int shmem_lock(struct file *file, int lock, struct user_struct *user); |
53 | extern bool shmem_mapping(struct address_space *mapping); | 60 | extern bool shmem_mapping(struct address_space *mapping); |
54 | extern void shmem_unlock_mapping(struct address_space *mapping); | 61 | extern void shmem_unlock_mapping(struct address_space *mapping); |
@@ -61,6 +68,19 @@ extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); | |||
61 | extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, | 68 | extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, |
62 | pgoff_t start, pgoff_t end); | 69 | pgoff_t start, pgoff_t end); |
63 | 70 | ||
71 | /* Flag allocation requirements to shmem_getpage */ | ||
72 | enum sgp_type { | ||
73 | SGP_READ, /* don't exceed i_size, don't allocate page */ | ||
74 | SGP_CACHE, /* don't exceed i_size, may allocate page */ | ||
75 | SGP_NOHUGE, /* like SGP_CACHE, but no huge pages */ | ||
76 | SGP_HUGE, /* like SGP_CACHE, huge pages preferred */ | ||
77 | SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ | ||
78 | SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ | ||
79 | }; | ||
80 | |||
81 | extern int shmem_getpage(struct inode *inode, pgoff_t index, | ||
82 | struct page **pagep, enum sgp_type sgp); | ||
83 | |||
64 | static inline struct page *shmem_read_mapping_page( | 84 | static inline struct page *shmem_read_mapping_page( |
65 | struct address_space *mapping, pgoff_t index) | 85 | struct address_space *mapping, pgoff_t index) |
66 | { | 86 | { |
@@ -68,6 +88,18 @@ static inline struct page *shmem_read_mapping_page( | |||
68 | mapping_gfp_mask(mapping)); | 88 | mapping_gfp_mask(mapping)); |
69 | } | 89 | } |
70 | 90 | ||
91 | static inline bool shmem_file(struct file *file) | ||
92 | { | ||
93 | if (!IS_ENABLED(CONFIG_SHMEM)) | ||
94 | return false; | ||
95 | if (!file || !file->f_mapping) | ||
96 | return false; | ||
97 | return shmem_mapping(file->f_mapping); | ||
98 | } | ||
99 | |||
100 | extern bool shmem_charge(struct inode *inode, long pages); | ||
101 | extern void shmem_uncharge(struct inode *inode, long pages); | ||
102 | |||
71 | #ifdef CONFIG_TMPFS | 103 | #ifdef CONFIG_TMPFS |
72 | 104 | ||
73 | extern int shmem_add_seals(struct file *file, unsigned int seals); | 105 | extern int shmem_add_seals(struct file *file, unsigned int seals); |
@@ -83,4 +115,13 @@ static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a) | |||
83 | 115 | ||
84 | #endif | 116 | #endif |
85 | 117 | ||
118 | #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE | ||
119 | extern bool shmem_huge_enabled(struct vm_area_struct *vma); | ||
120 | #else | ||
121 | static inline bool shmem_huge_enabled(struct vm_area_struct *vma) | ||
122 | { | ||
123 | return false; | ||
124 | } | ||
125 | #endif | ||
126 | |||
86 | #endif | 127 | #endif |
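shmem_file() above (like the NR_ZSPAGES counter earlier) leans on IS_ENABLED(CONFIG_...), which lets a Kconfig option be tested in ordinary C expressions as well as in #if. A simplified re-creation of the underlying preprocessor trick; only the built-in (=y) case is handled here, whereas the kernel's real macro also covers =m:

#include <stdio.h>

#define CONFIG_FOO 1        /* pretend this came from Kconfig (=y) */
/* CONFIG_BAR intentionally left undefined (=n) */

/*
 * If 'option' is defined to 1, the placeholder macro inserts a leading
 * "0," so the second argument becomes 1; otherwise it stays 0.
 */
#define __ARG_PLACEHOLDER_1             0,
#define __take_second_arg(ignored, val, ...)    val
#define ____is_defined(arg1_or_junk)    __take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val)              ____is_defined(__ARG_PLACEHOLDER_##val)
#define MY_IS_ENABLED(option)           ___is_defined(option)

int main(void)
{
    if (MY_IS_ENABLED(CONFIG_FOO))
        printf("CONFIG_FOO is enabled\n");
    if (!MY_IS_ENABLED(CONFIG_BAR))
        printf("CONFIG_BAR is disabled\n");

    /* Works in preprocessor context too: */
#if MY_IS_ENABLED(CONFIG_FOO)
    printf("also visible to #if\n");
#endif
    return 0;
}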
diff --git a/include/linux/slab.h b/include/linux/slab.h index aeb3e6d00a66..1a4ea551aae5 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
@@ -565,6 +565,8 @@ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) | |||
565 | { | 565 | { |
566 | if (size != 0 && n > SIZE_MAX / size) | 566 | if (size != 0 && n > SIZE_MAX / size) |
567 | return NULL; | 567 | return NULL; |
568 | if (__builtin_constant_p(n) && __builtin_constant_p(size)) | ||
569 | return kmalloc(n * size, flags); | ||
568 | return __kmalloc(n * size, flags); | 570 | return __kmalloc(n * size, flags); |
569 | } | 571 | } |
570 | 572 | ||
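The kmalloc_array() change keeps the existing overflow guard (n > SIZE_MAX / size) and merely routes compile-time-constant sizes through kmalloc() so they can be resolved to a fixed slab at build time. The same overflow-check idiom in plain userspace C, as a hypothetical calloc-like wrapper around malloc():

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/*
 * Allocate an array of n elements of 'size' bytes, refusing requests
 * whose byte count would overflow size_t (the same check kmalloc_array
 * performs before multiplying).
 */
static void *xalloc_array(size_t n, size_t size)
{
    if (size != 0 && n > SIZE_MAX / size)
        return NULL;        /* n * size would wrap around */
    return malloc(n * size);
}

int main(void)
{
    int *ok = xalloc_array(1024, sizeof(int));
    void *too_big = xalloc_array(SIZE_MAX / 2, 16);

    printf("ok=%s too_big=%s\n",
           ok ? "allocated" : "NULL",
           too_big ? "allocated" : "NULL");

    free(ok);
    free(too_big);
    return 0;
}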
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 8694f7a5d92b..339ba027ade9 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h | |||
@@ -81,7 +81,7 @@ struct kmem_cache { | |||
81 | #endif | 81 | #endif |
82 | 82 | ||
83 | #ifdef CONFIG_SLAB_FREELIST_RANDOM | 83 | #ifdef CONFIG_SLAB_FREELIST_RANDOM |
84 | void *random_seq; | 84 | unsigned int *random_seq; |
85 | #endif | 85 | #endif |
86 | 86 | ||
87 | struct kmem_cache_node *node[MAX_NUMNODES]; | 87 | struct kmem_cache_node *node[MAX_NUMNODES]; |
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index d1faa019c02a..5624c1f3eb0a 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h | |||
@@ -99,6 +99,11 @@ struct kmem_cache { | |||
99 | */ | 99 | */ |
100 | int remote_node_defrag_ratio; | 100 | int remote_node_defrag_ratio; |
101 | #endif | 101 | #endif |
102 | |||
103 | #ifdef CONFIG_SLAB_FREELIST_RANDOM | ||
104 | unsigned int *random_seq; | ||
105 | #endif | ||
106 | |||
102 | struct kmem_cache_node *node[MAX_NUMNODES]; | 107 | struct kmem_cache_node *node[MAX_NUMNODES]; |
103 | }; | 108 | }; |
104 | 109 | ||
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 587480ad41b7..dd66a952e8cd 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h | |||
@@ -27,8 +27,7 @@ | |||
27 | #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) | 27 | #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) |
28 | #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) | 28 | #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) |
29 | 29 | ||
30 | extern int handle_userfault(struct vm_area_struct *vma, unsigned long address, | 30 | extern int handle_userfault(struct fault_env *fe, unsigned long reason); |
31 | unsigned int flags, unsigned long reason); | ||
32 | 31 | ||
33 | extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, | 32 | extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, |
34 | unsigned long src_start, unsigned long len); | 33 | unsigned long src_start, unsigned long len); |
@@ -56,10 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) | |||
56 | #else /* CONFIG_USERFAULTFD */ | 55 | #else /* CONFIG_USERFAULTFD */ |
57 | 56 | ||
58 | /* mm helpers */ | 57 | /* mm helpers */ |
59 | static inline int handle_userfault(struct vm_area_struct *vma, | 58 | static inline int handle_userfault(struct fault_env *fe, unsigned long reason) |
60 | unsigned long address, | ||
61 | unsigned int flags, | ||
62 | unsigned long reason) | ||
63 | { | 59 | { |
64 | return VM_FAULT_SIGBUS; | 60 | return VM_FAULT_SIGBUS; |
65 | } | 61 | } |
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index ec084321fe09..42604173f122 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
@@ -70,6 +70,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
70 | THP_FAULT_FALLBACK, | 70 | THP_FAULT_FALLBACK, |
71 | THP_COLLAPSE_ALLOC, | 71 | THP_COLLAPSE_ALLOC, |
72 | THP_COLLAPSE_ALLOC_FAILED, | 72 | THP_COLLAPSE_ALLOC_FAILED, |
73 | THP_FILE_ALLOC, | ||
74 | THP_FILE_MAPPED, | ||
73 | THP_SPLIT_PAGE, | 75 | THP_SPLIT_PAGE, |
74 | THP_SPLIT_PAGE_FAILED, | 76 | THP_SPLIT_PAGE_FAILED, |
75 | THP_DEFERRED_SPLIT_PAGE, | 77 | THP_DEFERRED_SPLIT_PAGE, |
@@ -100,4 +102,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
100 | NR_VM_EVENT_ITEMS | 102 | NR_VM_EVENT_ITEMS |
101 | }; | 103 | }; |
102 | 104 | ||
105 | #ifndef CONFIG_TRANSPARENT_HUGEPAGE | ||
106 | #define THP_FILE_ALLOC ({ BUILD_BUG(); 0; }) | ||
107 | #define THP_FILE_MAPPED ({ BUILD_BUG(); 0; }) | ||
108 | #endif | ||
109 | |||
103 | #endif /* VM_EVENT_ITEM_H_INCLUDED */ | 110 | #endif /* VM_EVENT_ITEM_H_INCLUDED */ |
diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d0b5ca5d4e08..717e6149e753 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h | |||
@@ -384,4 +384,7 @@ void tag_pages_for_writeback(struct address_space *mapping, | |||
384 | 384 | ||
385 | void account_page_redirty(struct page *page); | 385 | void account_page_redirty(struct page *page); |
386 | 386 | ||
387 | void sb_mark_inode_writeback(struct inode *inode); | ||
388 | void sb_clear_inode_writeback(struct inode *inode); | ||
389 | |||
387 | #endif /* WRITEBACK_H */ | 390 | #endif /* WRITEBACK_H */ |
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 551ba4acde4d..04f58acda8e8 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h | |||
@@ -13,7 +13,7 @@ | |||
13 | EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ | 13 | EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ |
14 | EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ | 14 | EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ |
15 | EM( SCAN_PAGE_RO, "no_writable_page") \ | 15 | EM( SCAN_PAGE_RO, "no_writable_page") \ |
16 | EM( SCAN_NO_REFERENCED_PAGE, "no_referenced_page") \ | 16 | EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \ |
17 | EM( SCAN_PAGE_NULL, "page_null") \ | 17 | EM( SCAN_PAGE_NULL, "page_null") \ |
18 | EM( SCAN_SCAN_ABORT, "scan_aborted") \ | 18 | EM( SCAN_SCAN_ABORT, "scan_aborted") \ |
19 | EM( SCAN_PAGE_COUNT, "not_suitable_page_count") \ | 19 | EM( SCAN_PAGE_COUNT, "not_suitable_page_count") \ |
@@ -28,7 +28,9 @@ | |||
28 | EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \ | 28 | EM( SCAN_SWAP_CACHE_PAGE, "page_swap_cache") \ |
29 | EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\ | 29 | EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\ |
30 | EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ | 30 | EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ |
31 | EMe( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") | 31 | EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ |
32 | EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ | ||
33 | EMe(SCAN_TRUNCATED, "truncated") \ | ||
32 | 34 | ||
33 | #undef EM | 35 | #undef EM |
34 | #undef EMe | 36 | #undef EMe |
@@ -45,17 +47,18 @@ SCAN_STATUS | |||
45 | TRACE_EVENT(mm_khugepaged_scan_pmd, | 47 | TRACE_EVENT(mm_khugepaged_scan_pmd, |
46 | 48 | ||
47 | TP_PROTO(struct mm_struct *mm, struct page *page, bool writable, | 49 | TP_PROTO(struct mm_struct *mm, struct page *page, bool writable, |
48 | bool referenced, int none_or_zero, int status), | 50 | int referenced, int none_or_zero, int status, int unmapped), |
49 | 51 | ||
50 | TP_ARGS(mm, page, writable, referenced, none_or_zero, status), | 52 | TP_ARGS(mm, page, writable, referenced, none_or_zero, status, unmapped), |
51 | 53 | ||
52 | TP_STRUCT__entry( | 54 | TP_STRUCT__entry( |
53 | __field(struct mm_struct *, mm) | 55 | __field(struct mm_struct *, mm) |
54 | __field(unsigned long, pfn) | 56 | __field(unsigned long, pfn) |
55 | __field(bool, writable) | 57 | __field(bool, writable) |
56 | __field(bool, referenced) | 58 | __field(int, referenced) |
57 | __field(int, none_or_zero) | 59 | __field(int, none_or_zero) |
58 | __field(int, status) | 60 | __field(int, status) |
61 | __field(int, unmapped) | ||
59 | ), | 62 | ), |
60 | 63 | ||
61 | TP_fast_assign( | 64 | TP_fast_assign( |
@@ -65,15 +68,17 @@ TRACE_EVENT(mm_khugepaged_scan_pmd, | |||
65 | __entry->referenced = referenced; | 68 | __entry->referenced = referenced; |
66 | __entry->none_or_zero = none_or_zero; | 69 | __entry->none_or_zero = none_or_zero; |
67 | __entry->status = status; | 70 | __entry->status = status; |
71 | __entry->unmapped = unmapped; | ||
68 | ), | 72 | ), |
69 | 73 | ||
70 | TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s", | 74 | TP_printk("mm=%p, scan_pfn=0x%lx, writable=%d, referenced=%d, none_or_zero=%d, status=%s, unmapped=%d", |
71 | __entry->mm, | 75 | __entry->mm, |
72 | __entry->pfn, | 76 | __entry->pfn, |
73 | __entry->writable, | 77 | __entry->writable, |
74 | __entry->referenced, | 78 | __entry->referenced, |
75 | __entry->none_or_zero, | 79 | __entry->none_or_zero, |
76 | __print_symbolic(__entry->status, SCAN_STATUS)) | 80 | __print_symbolic(__entry->status, SCAN_STATUS), |
81 | __entry->unmapped) | ||
77 | ); | 82 | ); |
78 | 83 | ||
79 | TRACE_EVENT(mm_collapse_huge_page, | 84 | TRACE_EVENT(mm_collapse_huge_page, |
@@ -103,14 +108,14 @@ TRACE_EVENT(mm_collapse_huge_page, | |||
103 | TRACE_EVENT(mm_collapse_huge_page_isolate, | 108 | TRACE_EVENT(mm_collapse_huge_page_isolate, |
104 | 109 | ||
105 | TP_PROTO(struct page *page, int none_or_zero, | 110 | TP_PROTO(struct page *page, int none_or_zero, |
106 | bool referenced, bool writable, int status), | 111 | int referenced, bool writable, int status), |
107 | 112 | ||
108 | TP_ARGS(page, none_or_zero, referenced, writable, status), | 113 | TP_ARGS(page, none_or_zero, referenced, writable, status), |
109 | 114 | ||
110 | TP_STRUCT__entry( | 115 | TP_STRUCT__entry( |
111 | __field(unsigned long, pfn) | 116 | __field(unsigned long, pfn) |
112 | __field(int, none_or_zero) | 117 | __field(int, none_or_zero) |
113 | __field(bool, referenced) | 118 | __field(int, referenced) |
114 | __field(bool, writable) | 119 | __field(bool, writable) |
115 | __field(int, status) | 120 | __field(int, status) |
116 | ), | 121 | ), |
@@ -131,5 +136,32 @@ TRACE_EVENT(mm_collapse_huge_page_isolate, | |||
131 | __print_symbolic(__entry->status, SCAN_STATUS)) | 136 | __print_symbolic(__entry->status, SCAN_STATUS)) |
132 | ); | 137 | ); |
133 | 138 | ||
139 | TRACE_EVENT(mm_collapse_huge_page_swapin, | ||
140 | |||
141 | TP_PROTO(struct mm_struct *mm, int swapped_in, int referenced, int ret), | ||
142 | |||
143 | TP_ARGS(mm, swapped_in, referenced, ret), | ||
144 | |||
145 | TP_STRUCT__entry( | ||
146 | __field(struct mm_struct *, mm) | ||
147 | __field(int, swapped_in) | ||
148 | __field(int, referenced) | ||
149 | __field(int, ret) | ||
150 | ), | ||
151 | |||
152 | TP_fast_assign( | ||
153 | __entry->mm = mm; | ||
154 | __entry->swapped_in = swapped_in; | ||
155 | __entry->referenced = referenced; | ||
156 | __entry->ret = ret; | ||
157 | ), | ||
158 | |||
159 | TP_printk("mm=%p, swapped_in=%d, referenced=%d, ret=%d", | ||
160 | __entry->mm, | ||
161 | __entry->swapped_in, | ||
162 | __entry->referenced, | ||
163 | __entry->ret) | ||
164 | ); | ||
165 | |||
134 | #endif /* __HUGE_MEMORY_H */ | 166 | #endif /* __HUGE_MEMORY_H */ |
135 | #include <trace/define_trace.h> | 167 | #include <trace/define_trace.h> |
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 73614ce1d204..531f5811ff6b 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h | |||
@@ -696,7 +696,7 @@ DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, | |||
696 | TP_ARGS(inode, wbc, nr_to_write) | 696 | TP_ARGS(inode, wbc, nr_to_write) |
697 | ); | 697 | ); |
698 | 698 | ||
699 | DECLARE_EVENT_CLASS(writeback_lazytime_template, | 699 | DECLARE_EVENT_CLASS(writeback_inode_template, |
700 | TP_PROTO(struct inode *inode), | 700 | TP_PROTO(struct inode *inode), |
701 | 701 | ||
702 | TP_ARGS(inode), | 702 | TP_ARGS(inode), |
@@ -723,25 +723,39 @@ DECLARE_EVENT_CLASS(writeback_lazytime_template, | |||
723 | show_inode_state(__entry->state), __entry->mode) | 723 | show_inode_state(__entry->state), __entry->mode) |
724 | ); | 724 | ); |
725 | 725 | ||
726 | DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime, | 726 | DEFINE_EVENT(writeback_inode_template, writeback_lazytime, |
727 | TP_PROTO(struct inode *inode), | 727 | TP_PROTO(struct inode *inode), |
728 | 728 | ||
729 | TP_ARGS(inode) | 729 | TP_ARGS(inode) |
730 | ); | 730 | ); |
731 | 731 | ||
732 | DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime_iput, | 732 | DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput, |
733 | TP_PROTO(struct inode *inode), | 733 | TP_PROTO(struct inode *inode), |
734 | 734 | ||
735 | TP_ARGS(inode) | 735 | TP_ARGS(inode) |
736 | ); | 736 | ); |
737 | 737 | ||
738 | DEFINE_EVENT(writeback_lazytime_template, writeback_dirty_inode_enqueue, | 738 | DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue, |
739 | 739 | ||
740 | TP_PROTO(struct inode *inode), | 740 | TP_PROTO(struct inode *inode), |
741 | 741 | ||
742 | TP_ARGS(inode) | 742 | TP_ARGS(inode) |
743 | ); | 743 | ); |
744 | 744 | ||
745 | /* | ||
746 | * Inode writeback list tracking. | ||
747 | */ | ||
748 | |||
749 | DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback, | ||
750 | TP_PROTO(struct inode *inode), | ||
751 | TP_ARGS(inode) | ||
752 | ); | ||
753 | |||
754 | DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback, | ||
755 | TP_PROTO(struct inode *inode), | ||
756 | TP_ARGS(inode) | ||
757 | ); | ||
758 | |||
745 | #endif /* _TRACE_WRITEBACK_H */ | 759 | #endif /* _TRACE_WRITEBACK_H */ |
746 | 760 | ||
747 | /* This part must be outside protection */ | 761 | /* This part must be outside protection */ |
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 546b38886e11..e398beac67b8 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h | |||
@@ -80,5 +80,7 @@ | |||
80 | #define BPF_FS_MAGIC 0xcafe4a11 | 80 | #define BPF_FS_MAGIC 0xcafe4a11 |
81 | /* Since UDF 2.01 is ISO 13346 based... */ | 81 | /* Since UDF 2.01 is ISO 13346 based... */ |
82 | #define UDF_SUPER_MAGIC 0x15013346 | 82 | #define UDF_SUPER_MAGIC 0x15013346 |
83 | #define BALLOON_KVM_MAGIC 0x13661366 | ||
84 | #define ZSMALLOC_MAGIC 0x58295829 | ||
83 | 85 | ||
84 | #endif /* __LINUX_MAGIC_H__ */ | 86 | #endif /* __LINUX_MAGIC_H__ */ |
diff --git a/init/Kconfig b/init/Kconfig index 557bdf10cd44..504057925ee9 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -1786,10 +1786,10 @@ endchoice | |||
1786 | 1786 | ||
1787 | config SLAB_FREELIST_RANDOM | 1787 | config SLAB_FREELIST_RANDOM |
1788 | default n | 1788 | default n |
1789 | depends on SLAB | 1789 | depends on SLAB || SLUB |
1790 | bool "SLAB freelist randomization" | 1790 | bool "SLAB freelist randomization" |
1791 | help | 1791 | help |
1792 | Randomizes the freelist order used on creating new SLABs. This | 1792 | Randomizes the freelist order used on creating new pages. This |
1793 | security feature reduces the predictability of the kernel slab | 1793 | security feature reduces the predictability of the kernel slab |
1794 | allocator against heap overflows. | 1794 | allocator against heap overflows. |
1795 | 1795 | ||
diff --git a/ipc/shm.c b/ipc/shm.c --- a/ipc/shm.c +++ b/ipc/shm.c | |||
@@ -476,13 +476,15 @@ static const struct file_operations shm_file_operations = { | |||
476 | .mmap = shm_mmap, | 476 | .mmap = shm_mmap, |
477 | .fsync = shm_fsync, | 477 | .fsync = shm_fsync, |
478 | .release = shm_release, | 478 | .release = shm_release, |
479 | #ifndef CONFIG_MMU | ||
480 | .get_unmapped_area = shm_get_unmapped_area, | 479 | .get_unmapped_area = shm_get_unmapped_area, |
481 | #endif | ||
482 | .llseek = noop_llseek, | 480 | .llseek = noop_llseek, |
483 | .fallocate = shm_fallocate, | 481 | .fallocate = shm_fallocate, |
484 | }; | 482 | }; |
485 | 483 | ||
484 | /* | ||
485 | * shm_file_operations_huge is now identical to shm_file_operations, | ||
486 | * but we keep it distinct for the sake of is_file_shm_hugepages(). | ||
487 | */ | ||
486 | static const struct file_operations shm_file_operations_huge = { | 488 | static const struct file_operations shm_file_operations_huge = { |
487 | .mmap = shm_mmap, | 489 | .mmap = shm_mmap, |
488 | .fsync = shm_fsync, | 490 | .fsync = shm_fsync, |
@@ -764,10 +766,10 @@ static void shm_add_rss_swap(struct shmid_kernel *shp, | |||
764 | } else { | 766 | } else { |
765 | #ifdef CONFIG_SHMEM | 767 | #ifdef CONFIG_SHMEM |
766 | struct shmem_inode_info *info = SHMEM_I(inode); | 768 | struct shmem_inode_info *info = SHMEM_I(inode); |
767 | spin_lock(&info->lock); | 769 | spin_lock_irq(&info->lock); |
768 | *rss_add += inode->i_mapping->nrpages; | 770 | *rss_add += inode->i_mapping->nrpages; |
769 | *swp_add += info->swapped; | 771 | *swp_add += info->swapped; |
770 | spin_unlock(&info->lock); | 772 | spin_unlock_irq(&info->lock); |
771 | #else | 773 | #else |
772 | *rss_add += inode->i_mapping->nrpages; | 774 | *rss_add += inode->i_mapping->nrpages; |
773 | #endif | 775 | #endif |
diff --git a/kernel/fork.c b/kernel/fork.c index 4a7ec0c6c88c..de21f25e0d2c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -162,8 +162,8 @@ void __weak arch_release_thread_stack(unsigned long *stack) | |||
162 | static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, | 162 | static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, |
163 | int node) | 163 | int node) |
164 | { | 164 | { |
165 | struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, | 165 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, |
166 | THREAD_SIZE_ORDER); | 166 | THREAD_SIZE_ORDER); |
167 | 167 | ||
168 | if (page) | 168 | if (page) |
169 | memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, | 169 | memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, |
@@ -178,7 +178,7 @@ static inline void free_thread_stack(unsigned long *stack) | |||
178 | 178 | ||
179 | memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, | 179 | memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, |
180 | -(1 << THREAD_SIZE_ORDER)); | 180 | -(1 << THREAD_SIZE_ORDER)); |
181 | __free_kmem_pages(page, THREAD_SIZE_ORDER); | 181 | __free_pages(page, THREAD_SIZE_ORDER); |
182 | } | 182 | } |
183 | # else | 183 | # else |
184 | static struct kmem_cache *thread_stack_cache; | 184 | static struct kmem_cache *thread_stack_cache; |
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 805b7048a1bd..f07842e2d69f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
@@ -244,6 +244,7 @@ config PAGE_OWNER | |||
244 | depends on DEBUG_KERNEL && STACKTRACE_SUPPORT | 244 | depends on DEBUG_KERNEL && STACKTRACE_SUPPORT |
245 | select DEBUG_FS | 245 | select DEBUG_FS |
246 | select STACKTRACE | 246 | select STACKTRACE |
247 | select STACKDEPOT | ||
247 | select PAGE_EXTENSION | 248 | select PAGE_EXTENSION |
248 | help | 249 | help |
249 | This keeps track of what call chain is the owner of a page, may | 250 | This keeps track of what call chain is the owner of a page, may |
diff --git a/lib/dma-debug.c b/lib/dma-debug.c index 51a76af25c66..fcfa1939ac41 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c | |||
@@ -253,6 +253,7 @@ static int hash_fn(struct dma_debug_entry *entry) | |||
253 | */ | 253 | */ |
254 | static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, | 254 | static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, |
255 | unsigned long *flags) | 255 | unsigned long *flags) |
256 | __acquires(&dma_entry_hash[idx].lock) | ||
256 | { | 257 | { |
257 | int idx = hash_fn(entry); | 258 | int idx = hash_fn(entry); |
258 | unsigned long __flags; | 259 | unsigned long __flags; |
@@ -267,6 +268,7 @@ static struct hash_bucket *get_hash_bucket(struct dma_debug_entry *entry, | |||
267 | */ | 268 | */ |
268 | static void put_hash_bucket(struct hash_bucket *bucket, | 269 | static void put_hash_bucket(struct hash_bucket *bucket, |
269 | unsigned long *flags) | 270 | unsigned long *flags) |
271 | __releases(&bucket->lock) | ||
270 | { | 272 | { |
271 | unsigned long __flags = *flags; | 273 | unsigned long __flags = *flags; |
272 | 274 | ||
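Editor's note: the two annotations added to dma-debug above are sparse lock-context hints; they record that get_hash_bucket() returns with the bucket lock held and that put_hash_bucket() releases it, so sparse can flag unbalanced paths. A hedged sketch of the same idiom on a made-up pair of helpers (names are illustrative):

	static void demo_lock(spinlock_t *lock) __acquires(lock)
	{
		spin_lock(lock);	/* leaves this function with the lock held */
	}

	static void demo_unlock(spinlock_t *lock) __releases(lock)
	{
		spin_unlock(lock);	/* balances the context acquired above */
	}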
diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8b7d8459bb9d..61b8fb529cef 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c | |||
@@ -38,6 +38,9 @@ | |||
38 | #include <linux/preempt.h> /* in_interrupt() */ | 38 | #include <linux/preempt.h> /* in_interrupt() */ |
39 | 39 | ||
40 | 40 | ||
41 | /* Number of nodes in a fully populated tree of the given height */ | ||
42 | static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly; | ||
43 | |||
41 | /* | 44 | /* |
42 | * Radix tree node cache. | 45 | * Radix tree node cache. |
43 | */ | 46 | */ |
@@ -342,7 +345,7 @@ radix_tree_node_free(struct radix_tree_node *node) | |||
342 | * To make use of this facility, the radix tree must be initialised without | 345 | * To make use of this facility, the radix tree must be initialised without |
343 | * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). | 346 | * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). |
344 | */ | 347 | */ |
345 | static int __radix_tree_preload(gfp_t gfp_mask) | 348 | static int __radix_tree_preload(gfp_t gfp_mask, int nr) |
346 | { | 349 | { |
347 | struct radix_tree_preload *rtp; | 350 | struct radix_tree_preload *rtp; |
348 | struct radix_tree_node *node; | 351 | struct radix_tree_node *node; |
@@ -350,14 +353,14 @@ static int __radix_tree_preload(gfp_t gfp_mask) | |||
350 | 353 | ||
351 | preempt_disable(); | 354 | preempt_disable(); |
352 | rtp = this_cpu_ptr(&radix_tree_preloads); | 355 | rtp = this_cpu_ptr(&radix_tree_preloads); |
353 | while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { | 356 | while (rtp->nr < nr) { |
354 | preempt_enable(); | 357 | preempt_enable(); |
355 | node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); | 358 | node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); |
356 | if (node == NULL) | 359 | if (node == NULL) |
357 | goto out; | 360 | goto out; |
358 | preempt_disable(); | 361 | preempt_disable(); |
359 | rtp = this_cpu_ptr(&radix_tree_preloads); | 362 | rtp = this_cpu_ptr(&radix_tree_preloads); |
360 | if (rtp->nr < RADIX_TREE_PRELOAD_SIZE) { | 363 | if (rtp->nr < nr) { |
361 | node->private_data = rtp->nodes; | 364 | node->private_data = rtp->nodes; |
362 | rtp->nodes = node; | 365 | rtp->nodes = node; |
363 | rtp->nr++; | 366 | rtp->nr++; |
@@ -383,7 +386,7 @@ int radix_tree_preload(gfp_t gfp_mask) | |||
383 | { | 386 | { |
384 | /* Warn on non-sensical use... */ | 387 | /* Warn on non-sensical use... */ |
385 | WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); | 388 | WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); |
386 | return __radix_tree_preload(gfp_mask); | 389 | return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); |
387 | } | 390 | } |
388 | EXPORT_SYMBOL(radix_tree_preload); | 391 | EXPORT_SYMBOL(radix_tree_preload); |
389 | 392 | ||
@@ -395,7 +398,7 @@ EXPORT_SYMBOL(radix_tree_preload); | |||
395 | int radix_tree_maybe_preload(gfp_t gfp_mask) | 398 | int radix_tree_maybe_preload(gfp_t gfp_mask) |
396 | { | 399 | { |
397 | if (gfpflags_allow_blocking(gfp_mask)) | 400 | if (gfpflags_allow_blocking(gfp_mask)) |
398 | return __radix_tree_preload(gfp_mask); | 401 | return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE); |
399 | /* Preloading doesn't help anything with this gfp mask, skip it */ | 402 | /* Preloading doesn't help anything with this gfp mask, skip it */ |
400 | preempt_disable(); | 403 | preempt_disable(); |
401 | return 0; | 404 | return 0; |
@@ -403,6 +406,51 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) | |||
403 | EXPORT_SYMBOL(radix_tree_maybe_preload); | 406 | EXPORT_SYMBOL(radix_tree_maybe_preload); |
404 | 407 | ||
405 | /* | 408 | /* |
409 | * The same as radix_tree_maybe_preload(), but preloads the number of nodes | ||
410 | * required to insert (1 << order) contiguous, naturally-aligned elements. | ||
411 | */ | ||
412 | int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order) | ||
413 | { | ||
414 | unsigned long nr_subtrees; | ||
415 | int nr_nodes, subtree_height; | ||
416 | |||
417 | /* Preloading doesn't help anything with this gfp mask, skip it */ | ||
418 | if (!gfpflags_allow_blocking(gfp_mask)) { | ||
419 | preempt_disable(); | ||
420 | return 0; | ||
421 | } | ||
422 | |||
423 | /* | ||
424 | * Calculate number and height of fully populated subtrees it takes to | ||
425 | * store (1 << order) elements. | ||
426 | */ | ||
427 | nr_subtrees = 1 << order; | ||
428 | for (subtree_height = 0; nr_subtrees > RADIX_TREE_MAP_SIZE; | ||
429 | subtree_height++) | ||
430 | nr_subtrees >>= RADIX_TREE_MAP_SHIFT; | ||
431 | |||
432 | /* | ||
433 | * The worst case is zero height tree with a single item at index 0 and | ||
434 | * then inserting items starting at ULONG_MAX - (1 << order). | ||
435 | * | ||
436 | * This requires RADIX_TREE_MAX_PATH nodes to build branch from root to | ||
437 | * 0-index item. | ||
438 | */ | ||
439 | nr_nodes = RADIX_TREE_MAX_PATH; | ||
440 | |||
441 | /* Plus branch to fully populated subtrees. */ | ||
442 | nr_nodes += RADIX_TREE_MAX_PATH - subtree_height; | ||
443 | |||
444 | /* Root node is shared. */ | ||
445 | nr_nodes--; | ||
446 | |||
447 | /* Plus nodes required to build subtrees. */ | ||
448 | nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height]; | ||
449 | |||
450 | return __radix_tree_preload(gfp_mask, nr_nodes); | ||
451 | } | ||
452 | |||
453 | /* | ||
406 | * The maximum index which can be stored in a radix tree | 454 | * The maximum index which can be stored in a radix tree |
407 | */ | 455 | */ |
408 | static inline unsigned long shift_maxindex(unsigned int shift) | 456 | static inline unsigned long shift_maxindex(unsigned int shift) |
@@ -1571,6 +1619,31 @@ radix_tree_node_ctor(void *arg) | |||
1571 | INIT_LIST_HEAD(&node->private_list); | 1619 | INIT_LIST_HEAD(&node->private_list); |
1572 | } | 1620 | } |
1573 | 1621 | ||
1622 | static __init unsigned long __maxindex(unsigned int height) | ||
1623 | { | ||
1624 | unsigned int width = height * RADIX_TREE_MAP_SHIFT; | ||
1625 | int shift = RADIX_TREE_INDEX_BITS - width; | ||
1626 | |||
1627 | if (shift < 0) | ||
1628 | return ~0UL; | ||
1629 | if (shift >= BITS_PER_LONG) | ||
1630 | return 0UL; | ||
1631 | return ~0UL >> shift; | ||
1632 | } | ||
1633 | |||
1634 | static __init void radix_tree_init_maxnodes(void) | ||
1635 | { | ||
1636 | unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1]; | ||
1637 | unsigned int i, j; | ||
1638 | |||
1639 | for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++) | ||
1640 | height_to_maxindex[i] = __maxindex(i); | ||
1641 | for (i = 0; i < ARRAY_SIZE(height_to_maxnodes); i++) { | ||
1642 | for (j = i; j > 0; j--) | ||
1643 | height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1; | ||
1644 | } | ||
1645 | } | ||
1646 | |||
1574 | static int radix_tree_callback(struct notifier_block *nfb, | 1647 | static int radix_tree_callback(struct notifier_block *nfb, |
1575 | unsigned long action, void *hcpu) | 1648 | unsigned long action, void *hcpu) |
1576 | { | 1649 | { |
@@ -1597,5 +1670,6 @@ void __init radix_tree_init(void) | |||
1597 | sizeof(struct radix_tree_node), 0, | 1670 | sizeof(struct radix_tree_node), 0, |
1598 | SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, | 1671 | SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, |
1599 | radix_tree_node_ctor); | 1672 | radix_tree_node_ctor); |
1673 | radix_tree_init_maxnodes(); | ||
1600 | hotcpu_notifier(radix_tree_callback, 0); | 1674 | hotcpu_notifier(radix_tree_callback, 0); |
1601 | } | 1675 | } |
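Editor's note, a worked example of the node budget computed by radix_tree_maybe_preload_order() above (assuming the usual 64-bit constants RADIX_TREE_MAP_SHIFT = 6, RADIX_TREE_MAP_SIZE = 64, RADIX_TREE_MAX_PATH = 11): for order = 9, i.e. 512 contiguous slots, the loop ends with subtree_height = 1 and nr_subtrees = 8, height_to_maxnodes[1] = 1, so nr_nodes = 11 + (11 - 1) - 1 + 8 * 1 = 28. A small stand-alone sketch of the same arithmetic:

	#include <stdio.h>

	/* Assumed 64-bit radix-tree constants; illustration only. */
	#define MAP_SHIFT	6
	#define MAP_SIZE	(1UL << MAP_SHIFT)
	#define MAX_PATH	11			/* ceil(64 / MAP_SHIFT) */

	/* height_to_maxnodes[] for heights 0..2, as radix_tree_init_maxnodes() builds it */
	static const unsigned long maxnodes[] = { 0, 1, 1 + MAP_SIZE };

	int main(void)
	{
		int order = 9;				/* 512 contiguous slots */
		unsigned long nr_subtrees = 1UL << order;
		int subtree_height = 0, nr_nodes;

		while (nr_subtrees > MAP_SIZE) {	/* same loop as the kernel code */
			nr_subtrees >>= MAP_SHIFT;
			subtree_height++;
		}

		nr_nodes = MAX_PATH;			/* branch down to index 0 */
		nr_nodes += MAX_PATH - subtree_height;	/* branch to the subtrees */
		nr_nodes--;				/* root node is shared */
		nr_nodes += nr_subtrees * maxnodes[subtree_height];

		printf("order %d -> preload %d nodes\n", order, nr_nodes);	/* prints 28 */
		return 0;
	}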
diff --git a/mm/Kconfig b/mm/Kconfig index 3e2daef3c946..3c81803b00a3 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -440,6 +440,14 @@ choice | |||
440 | endchoice | 440 | endchoice |
441 | 441 | ||
442 | # | 442 | # |
443 | # We don't deposit page tables on file THP mappings, | ||
444 | # but Power makes use of them to address an MMU quirk. | ||
445 | # | ||
446 | config TRANSPARENT_HUGE_PAGECACHE | ||
447 | def_bool y | ||
448 | depends on TRANSPARENT_HUGEPAGE && !PPC | ||
449 | |||
450 | # | ||
443 | # UP and nommu archs use km based percpu allocator | 451 | # UP and nommu archs use km based percpu allocator |
444 | # | 452 | # |
445 | config NEED_PER_CPU_KM | 453 | config NEED_PER_CPU_KM |
diff --git a/mm/Makefile b/mm/Makefile index 78c6f7dedb83..fc059666c760 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -74,7 +74,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | |||
74 | obj-$(CONFIG_MEMTEST) += memtest.o | 74 | obj-$(CONFIG_MEMTEST) += memtest.o |
75 | obj-$(CONFIG_MIGRATION) += migrate.o | 75 | obj-$(CONFIG_MIGRATION) += migrate.o |
76 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 76 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
77 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | 77 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o |
78 | obj-$(CONFIG_PAGE_COUNTER) += page_counter.o | 78 | obj-$(CONFIG_PAGE_COUNTER) += page_counter.o |
79 | obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o | 79 | obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o |
80 | obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o | 80 | obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o |
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 57b3e9bd6bc5..da91df50ba31 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c | |||
@@ -70,7 +70,7 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) | |||
70 | */ | 70 | */ |
71 | if (trylock_page(page)) { | 71 | if (trylock_page(page)) { |
72 | #ifdef CONFIG_BALLOON_COMPACTION | 72 | #ifdef CONFIG_BALLOON_COMPACTION |
73 | if (!PagePrivate(page)) { | 73 | if (PageIsolated(page)) { |
74 | /* raced with isolation */ | 74 | /* raced with isolation */ |
75 | unlock_page(page); | 75 | unlock_page(page); |
76 | continue; | 76 | continue; |
@@ -106,110 +106,50 @@ EXPORT_SYMBOL_GPL(balloon_page_dequeue); | |||
106 | 106 | ||
107 | #ifdef CONFIG_BALLOON_COMPACTION | 107 | #ifdef CONFIG_BALLOON_COMPACTION |
108 | 108 | ||
109 | static inline void __isolate_balloon_page(struct page *page) | 109 | bool balloon_page_isolate(struct page *page, isolate_mode_t mode) |
110 | |||
110 | { | 111 | { |
111 | struct balloon_dev_info *b_dev_info = balloon_page_device(page); | 112 | struct balloon_dev_info *b_dev_info = balloon_page_device(page); |
112 | unsigned long flags; | 113 | unsigned long flags; |
113 | 114 | ||
114 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | 115 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); |
115 | ClearPagePrivate(page); | ||
116 | list_del(&page->lru); | 116 | list_del(&page->lru); |
117 | b_dev_info->isolated_pages++; | 117 | b_dev_info->isolated_pages++; |
118 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | 118 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); |
119 | |||
120 | return true; | ||
119 | } | 121 | } |
120 | 122 | ||
121 | static inline void __putback_balloon_page(struct page *page) | 123 | void balloon_page_putback(struct page *page) |
122 | { | 124 | { |
123 | struct balloon_dev_info *b_dev_info = balloon_page_device(page); | 125 | struct balloon_dev_info *b_dev_info = balloon_page_device(page); |
124 | unsigned long flags; | 126 | unsigned long flags; |
125 | 127 | ||
126 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | 128 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); |
127 | SetPagePrivate(page); | ||
128 | list_add(&page->lru, &b_dev_info->pages); | 129 | list_add(&page->lru, &b_dev_info->pages); |
129 | b_dev_info->isolated_pages--; | 130 | b_dev_info->isolated_pages--; |
130 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | 131 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); |
131 | } | 132 | } |
132 | 133 | ||
133 | /* __isolate_lru_page() counterpart for a ballooned page */ | ||
134 | bool balloon_page_isolate(struct page *page) | ||
135 | { | ||
136 | /* | ||
137 | * Avoid burning cycles with pages that are yet under __free_pages(), | ||
138 | * or just got freed under us. | ||
139 | * | ||
140 | * In case we 'win' a race for a balloon page being freed under us and | ||
141 | * raise its refcount preventing __free_pages() from doing its job | ||
142 | * the put_page() at the end of this block will take care of | ||
143 | * release this page, thus avoiding a nasty leakage. | ||
144 | */ | ||
145 | if (likely(get_page_unless_zero(page))) { | ||
146 | /* | ||
147 | * As balloon pages are not isolated from LRU lists, concurrent | ||
148 | * compaction threads can race against page migration functions | ||
149 | * as well as race against the balloon driver releasing a page. | ||
150 | * | ||
151 | * In order to avoid having an already isolated balloon page | ||
152 | * being (wrongly) re-isolated while it is under migration, | ||
153 | * or to avoid attempting to isolate pages being released by | ||
154 | * the balloon driver, lets be sure we have the page lock | ||
155 | * before proceeding with the balloon page isolation steps. | ||
156 | */ | ||
157 | if (likely(trylock_page(page))) { | ||
158 | /* | ||
159 | * A ballooned page, by default, has PagePrivate set. | ||
160 | * Prevent concurrent compaction threads from isolating | ||
161 | * an already isolated balloon page by clearing it. | ||
162 | */ | ||
163 | if (balloon_page_movable(page)) { | ||
164 | __isolate_balloon_page(page); | ||
165 | unlock_page(page); | ||
166 | return true; | ||
167 | } | ||
168 | unlock_page(page); | ||
169 | } | ||
170 | put_page(page); | ||
171 | } | ||
172 | return false; | ||
173 | } | ||
174 | |||
175 | /* putback_lru_page() counterpart for a ballooned page */ | ||
176 | void balloon_page_putback(struct page *page) | ||
177 | { | ||
178 | /* | ||
179 | * 'lock_page()' stabilizes the page and prevents races against | ||
180 | * concurrent isolation threads attempting to re-isolate it. | ||
181 | */ | ||
182 | lock_page(page); | ||
183 | |||
184 | if (__is_movable_balloon_page(page)) { | ||
185 | __putback_balloon_page(page); | ||
186 | /* drop the extra ref count taken for page isolation */ | ||
187 | put_page(page); | ||
188 | } else { | ||
189 | WARN_ON(1); | ||
190 | dump_page(page, "not movable balloon page"); | ||
191 | } | ||
192 | unlock_page(page); | ||
193 | } | ||
194 | 134 | ||
195 | /* move_to_new_page() counterpart for a ballooned page */ | 135 | /* move_to_new_page() counterpart for a ballooned page */ |
196 | int balloon_page_migrate(struct page *newpage, | 136 | int balloon_page_migrate(struct address_space *mapping, |
197 | struct page *page, enum migrate_mode mode) | 137 | struct page *newpage, struct page *page, |
138 | enum migrate_mode mode) | ||
198 | { | 139 | { |
199 | struct balloon_dev_info *balloon = balloon_page_device(page); | 140 | struct balloon_dev_info *balloon = balloon_page_device(page); |
200 | int rc = -EAGAIN; | ||
201 | 141 | ||
202 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 142 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
203 | VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); | 143 | VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); |
204 | 144 | ||
205 | if (WARN_ON(!__is_movable_balloon_page(page))) { | 145 | return balloon->migratepage(balloon, newpage, page, mode); |
206 | dump_page(page, "not movable balloon page"); | 146 | } |
207 | return rc; | ||
208 | } | ||
209 | 147 | ||
210 | if (balloon && balloon->migratepage) | 148 | const struct address_space_operations balloon_aops = { |
211 | rc = balloon->migratepage(balloon, newpage, page, mode); | 149 | .migratepage = balloon_page_migrate, |
150 | .isolate_page = balloon_page_isolate, | ||
151 | .putback_page = balloon_page_putback, | ||
152 | }; | ||
153 | EXPORT_SYMBOL_GPL(balloon_aops); | ||
212 | 154 | ||
213 | return rc; | ||
214 | } | ||
215 | #endif /* CONFIG_BALLOON_COMPACTION */ | 155 | #endif /* CONFIG_BALLOON_COMPACTION */ |
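Editor's note: with the open-coded isolate/putback helpers gone, the balloon driver now participates in compaction purely through address_space_operations, as balloon_aops above shows. A hedged sketch of what a hypothetical driver of movable, non-LRU pages would wire up under this interface (all demo_* names are illustrative, not an existing driver):

	static bool demo_isolate(struct page *page, isolate_mode_t mode)
	{
		/* take the page off the driver's own list; it is now isolated */
		return true;
	}

	static void demo_putback(struct page *page)
	{
		/* migration failed or was aborted: put the page back on the list */
	}

	static int demo_migratepage(struct address_space *mapping,
				    struct page *newpage, struct page *page,
				    enum migrate_mode mode)
	{
		/* transfer the driver's state from @page to @newpage */
		return MIGRATEPAGE_SUCCESS;
	}

	static const struct address_space_operations demo_aops = {
		.migratepage	= demo_migratepage,
		.isolate_page	= demo_isolate,
		.putback_page	= demo_putback,
	};

	/*
	 * Pages are handed over with the page lock held:
	 *	lock_page(page);
	 *	__SetPageMovable(page, mapping);	// mapping->a_ops == &demo_aops
	 *	unlock_page(page);
	 */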
diff --git a/mm/compaction.c b/mm/compaction.c index 7bc04778f84d..64df5fe052db 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -15,11 +15,11 @@ | |||
15 | #include <linux/backing-dev.h> | 15 | #include <linux/backing-dev.h> |
16 | #include <linux/sysctl.h> | 16 | #include <linux/sysctl.h> |
17 | #include <linux/sysfs.h> | 17 | #include <linux/sysfs.h> |
18 | #include <linux/balloon_compaction.h> | ||
19 | #include <linux/page-isolation.h> | 18 | #include <linux/page-isolation.h> |
20 | #include <linux/kasan.h> | 19 | #include <linux/kasan.h> |
21 | #include <linux/kthread.h> | 20 | #include <linux/kthread.h> |
22 | #include <linux/freezer.h> | 21 | #include <linux/freezer.h> |
22 | #include <linux/page_owner.h> | ||
23 | #include "internal.h" | 23 | #include "internal.h" |
24 | 24 | ||
25 | #ifdef CONFIG_COMPACTION | 25 | #ifdef CONFIG_COMPACTION |
@@ -65,13 +65,27 @@ static unsigned long release_freepages(struct list_head *freelist) | |||
65 | 65 | ||
66 | static void map_pages(struct list_head *list) | 66 | static void map_pages(struct list_head *list) |
67 | { | 67 | { |
68 | struct page *page; | 68 | unsigned int i, order, nr_pages; |
69 | struct page *page, *next; | ||
70 | LIST_HEAD(tmp_list); | ||
71 | |||
72 | list_for_each_entry_safe(page, next, list, lru) { | ||
73 | list_del(&page->lru); | ||
69 | 74 | ||
70 | list_for_each_entry(page, list, lru) { | 75 | order = page_private(page); |
71 | arch_alloc_page(page, 0); | 76 | nr_pages = 1 << order; |
72 | kernel_map_pages(page, 1, 1); | 77 | |
73 | kasan_alloc_pages(page, 0); | 78 | post_alloc_hook(page, order, __GFP_MOVABLE); |
79 | if (order) | ||
80 | split_page(page, order); | ||
81 | |||
82 | for (i = 0; i < nr_pages; i++) { | ||
83 | list_add(&page->lru, &tmp_list); | ||
84 | page++; | ||
85 | } | ||
74 | } | 86 | } |
87 | |||
88 | list_splice(&tmp_list, list); | ||
75 | } | 89 | } |
76 | 90 | ||
77 | static inline bool migrate_async_suitable(int migratetype) | 91 | static inline bool migrate_async_suitable(int migratetype) |
@@ -81,6 +95,44 @@ static inline bool migrate_async_suitable(int migratetype) | |||
81 | 95 | ||
82 | #ifdef CONFIG_COMPACTION | 96 | #ifdef CONFIG_COMPACTION |
83 | 97 | ||
98 | int PageMovable(struct page *page) | ||
99 | { | ||
100 | struct address_space *mapping; | ||
101 | |||
102 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
103 | if (!__PageMovable(page)) | ||
104 | return 0; | ||
105 | |||
106 | mapping = page_mapping(page); | ||
107 | if (mapping && mapping->a_ops && mapping->a_ops->isolate_page) | ||
108 | return 1; | ||
109 | |||
110 | return 0; | ||
111 | } | ||
112 | EXPORT_SYMBOL(PageMovable); | ||
113 | |||
114 | void __SetPageMovable(struct page *page, struct address_space *mapping) | ||
115 | { | ||
116 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
117 | VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page); | ||
118 | page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE); | ||
119 | } | ||
120 | EXPORT_SYMBOL(__SetPageMovable); | ||
121 | |||
122 | void __ClearPageMovable(struct page *page) | ||
123 | { | ||
124 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
125 | VM_BUG_ON_PAGE(!PageMovable(page), page); | ||
126 | /* | ||
127 | * Clear the registered address_space value but keep the PAGE_MAPPING_MOVABLE | ||
128 | * flag, so the VM can tell that the driver released the page after isolation | ||
129 | * and migration won't try to put it back. | ||
130 | */ | ||
131 | page->mapping = (void *)((unsigned long)page->mapping & | ||
132 | PAGE_MAPPING_MOVABLE); | ||
133 | } | ||
134 | EXPORT_SYMBOL(__ClearPageMovable); | ||
135 | |||
84 | /* Do not skip compaction more than 64 times */ | 136 | /* Do not skip compaction more than 64 times */ |
85 | #define COMPACT_MAX_DEFER_SHIFT 6 | 137 | #define COMPACT_MAX_DEFER_SHIFT 6 |
86 | 138 | ||
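Editor's note: __SetPageMovable() and __ClearPageMovable() above use the usual low-bit tagging trick on page->mapping, which is pointer-aligned, so the bottom bits are free to carry state. A minimal sketch of the encoding, assuming PAGE_MAPPING_MOVABLE is such a low-bit flag as in this series:

	#define DEMO_MOVABLE	0x2UL	/* stand-in for PAGE_MAPPING_MOVABLE */

	static void demo_set_movable(struct page *page, struct address_space *mapping)
	{
		page->mapping = (void *)((unsigned long)mapping | DEMO_MOVABLE);
	}

	static struct address_space *demo_movable_mapping(struct page *page)
	{
		unsigned long m = (unsigned long)page->mapping;

		return (m & DEMO_MOVABLE) ? (void *)(m & ~DEMO_MOVABLE) : NULL;
	}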
@@ -368,12 +420,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
368 | unsigned long flags = 0; | 420 | unsigned long flags = 0; |
369 | bool locked = false; | 421 | bool locked = false; |
370 | unsigned long blockpfn = *start_pfn; | 422 | unsigned long blockpfn = *start_pfn; |
423 | unsigned int order; | ||
371 | 424 | ||
372 | cursor = pfn_to_page(blockpfn); | 425 | cursor = pfn_to_page(blockpfn); |
373 | 426 | ||
374 | /* Isolate free pages. */ | 427 | /* Isolate free pages. */ |
375 | for (; blockpfn < end_pfn; blockpfn++, cursor++) { | 428 | for (; blockpfn < end_pfn; blockpfn++, cursor++) { |
376 | int isolated, i; | 429 | int isolated; |
377 | struct page *page = cursor; | 430 | struct page *page = cursor; |
378 | 431 | ||
379 | /* | 432 | /* |
@@ -439,17 +492,17 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
439 | goto isolate_fail; | 492 | goto isolate_fail; |
440 | } | 493 | } |
441 | 494 | ||
442 | /* Found a free page, break it into order-0 pages */ | 495 | /* Found a free page, will break it into order-0 pages */ |
443 | isolated = split_free_page(page); | 496 | order = page_order(page); |
497 | isolated = __isolate_free_page(page, order); | ||
444 | if (!isolated) | 498 | if (!isolated) |
445 | break; | 499 | break; |
500 | set_page_private(page, order); | ||
446 | 501 | ||
447 | total_isolated += isolated; | 502 | total_isolated += isolated; |
448 | cc->nr_freepages += isolated; | 503 | cc->nr_freepages += isolated; |
449 | for (i = 0; i < isolated; i++) { | 504 | list_add_tail(&page->lru, freelist); |
450 | list_add(&page->lru, freelist); | 505 | |
451 | page++; | ||
452 | } | ||
453 | if (!strict && cc->nr_migratepages <= cc->nr_freepages) { | 506 | if (!strict && cc->nr_migratepages <= cc->nr_freepages) { |
454 | blockpfn += isolated; | 507 | blockpfn += isolated; |
455 | break; | 508 | break; |
@@ -568,7 +621,7 @@ isolate_freepages_range(struct compact_control *cc, | |||
568 | */ | 621 | */ |
569 | } | 622 | } |
570 | 623 | ||
571 | /* split_free_page does not map the pages */ | 624 | /* __isolate_free_page() does not map the pages */ |
572 | map_pages(&freelist); | 625 | map_pages(&freelist); |
573 | 626 | ||
574 | if (pfn < end_pfn) { | 627 | if (pfn < end_pfn) { |
@@ -670,7 +723,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
670 | 723 | ||
671 | /* Time to isolate some pages for migration */ | 724 | /* Time to isolate some pages for migration */ |
672 | for (; low_pfn < end_pfn; low_pfn++) { | 725 | for (; low_pfn < end_pfn; low_pfn++) { |
673 | bool is_lru; | ||
674 | 726 | ||
675 | if (skip_on_failure && low_pfn >= next_skip_pfn) { | 727 | if (skip_on_failure && low_pfn >= next_skip_pfn) { |
676 | /* | 728 | /* |
@@ -733,21 +785,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
733 | } | 785 | } |
734 | 786 | ||
735 | /* | 787 | /* |
736 | * Check may be lockless but that's ok as we recheck later. | ||
737 | * It's possible to migrate LRU pages and balloon pages | ||
738 | * Skip any other type of page | ||
739 | */ | ||
740 | is_lru = PageLRU(page); | ||
741 | if (!is_lru) { | ||
742 | if (unlikely(balloon_page_movable(page))) { | ||
743 | if (balloon_page_isolate(page)) { | ||
744 | /* Successfully isolated */ | ||
745 | goto isolate_success; | ||
746 | } | ||
747 | } | ||
748 | } | ||
749 | |||
750 | /* | ||
751 | * Regardless of being on LRU, compound pages such as THP and | 788 | * Regardless of being on LRU, compound pages such as THP and |
752 | * hugetlbfs are not to be compacted. We can potentially save | 789 | * hugetlbfs are not to be compacted. We can potentially save |
753 | * a lot of iterations if we skip them at once. The check is | 790 | * a lot of iterations if we skip them at once. The check is |
@@ -763,8 +800,30 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
763 | goto isolate_fail; | 800 | goto isolate_fail; |
764 | } | 801 | } |
765 | 802 | ||
766 | if (!is_lru) | 803 | /* |
804 | * Check may be lockless but that's ok as we recheck later. | ||
805 | * It's possible to migrate LRU and non-LRU movable pages. | ||
806 | * Skip any other type of page. | ||
807 | */ | ||
808 | if (!PageLRU(page)) { | ||
809 | /* | ||
810 | * __PageMovable can return a false positive, so we need | ||
811 | * to verify it under the page lock. | ||
812 | */ | ||
813 | if (unlikely(__PageMovable(page)) && | ||
814 | !PageIsolated(page)) { | ||
815 | if (locked) { | ||
816 | spin_unlock_irqrestore(&zone->lru_lock, | ||
817 | flags); | ||
818 | locked = false; | ||
819 | } | ||
820 | |||
821 | if (isolate_movable_page(page, isolate_mode)) | ||
822 | goto isolate_success; | ||
823 | } | ||
824 | |||
767 | goto isolate_fail; | 825 | goto isolate_fail; |
826 | } | ||
768 | 827 | ||
769 | /* | 828 | /* |
770 | * Migration will fail if an anonymous page is pinned in memory, | 829 | * Migration will fail if an anonymous page is pinned in memory, |
@@ -1059,7 +1118,7 @@ static void isolate_freepages(struct compact_control *cc) | |||
1059 | } | 1118 | } |
1060 | } | 1119 | } |
1061 | 1120 | ||
1062 | /* split_free_page does not map the pages */ | 1121 | /* __isolate_free_page() does not map the pages */ |
1063 | map_pages(freelist); | 1122 | map_pages(freelist); |
1064 | 1123 | ||
1065 | /* | 1124 | /* |
diff --git a/mm/filemap.c b/mm/filemap.c index 20f3b1f33f0e..e90c1543ec2d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -114,14 +114,14 @@ static void page_cache_tree_delete(struct address_space *mapping, | |||
114 | struct page *page, void *shadow) | 114 | struct page *page, void *shadow) |
115 | { | 115 | { |
116 | struct radix_tree_node *node; | 116 | struct radix_tree_node *node; |
117 | int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page); | ||
117 | 118 | ||
118 | VM_BUG_ON(!PageLocked(page)); | 119 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
119 | 120 | VM_BUG_ON_PAGE(PageTail(page), page); | |
120 | node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index, | 121 | VM_BUG_ON_PAGE(nr != 1 && shadow, page); |
121 | shadow); | ||
122 | 122 | ||
123 | if (shadow) { | 123 | if (shadow) { |
124 | mapping->nrexceptional++; | 124 | mapping->nrexceptional += nr; |
125 | /* | 125 | /* |
126 | * Make sure the nrexceptional update is committed before | 126 | * Make sure the nrexceptional update is committed before |
127 | * the nrpages update so that final truncate racing | 127 | * the nrpages update so that final truncate racing |
@@ -130,31 +130,38 @@ static void page_cache_tree_delete(struct address_space *mapping, | |||
130 | */ | 130 | */ |
131 | smp_wmb(); | 131 | smp_wmb(); |
132 | } | 132 | } |
133 | mapping->nrpages--; | 133 | mapping->nrpages -= nr; |
134 | 134 | ||
135 | if (!node) | 135 | for (i = 0; i < nr; i++) { |
136 | return; | 136 | node = radix_tree_replace_clear_tags(&mapping->page_tree, |
137 | 137 | page->index + i, shadow); | |
138 | workingset_node_pages_dec(node); | 138 | if (!node) { |
139 | if (shadow) | 139 | VM_BUG_ON_PAGE(nr != 1, page); |
140 | workingset_node_shadows_inc(node); | ||
141 | else | ||
142 | if (__radix_tree_delete_node(&mapping->page_tree, node)) | ||
143 | return; | 140 | return; |
141 | } | ||
144 | 142 | ||
145 | /* | 143 | workingset_node_pages_dec(node); |
146 | * Track node that only contains shadow entries. DAX mappings contain | 144 | if (shadow) |
147 | * no shadow entries and may contain other exceptional entries so skip | 145 | workingset_node_shadows_inc(node); |
148 | * those. | 146 | else |
149 | * | 147 | if (__radix_tree_delete_node(&mapping->page_tree, node)) |
150 | * Avoid acquiring the list_lru lock if already tracked. The | 148 | continue; |
151 | * list_empty() test is safe as node->private_list is | 149 | |
152 | * protected by mapping->tree_lock. | 150 | /* |
153 | */ | 151 | * Track node that only contains shadow entries. DAX mappings |
154 | if (!dax_mapping(mapping) && !workingset_node_pages(node) && | 152 | * contain no shadow entries and may contain other exceptional |
155 | list_empty(&node->private_list)) { | 153 | * entries so skip those. |
156 | node->private_data = mapping; | 154 | * |
157 | list_lru_add(&workingset_shadow_nodes, &node->private_list); | 155 | * Avoid acquiring the list_lru lock if already tracked. |
156 | * The list_empty() test is safe as node->private_list is | ||
157 | * protected by mapping->tree_lock. | ||
158 | */ | ||
159 | if (!dax_mapping(mapping) && !workingset_node_pages(node) && | ||
160 | list_empty(&node->private_list)) { | ||
161 | node->private_data = mapping; | ||
162 | list_lru_add(&workingset_shadow_nodes, | ||
163 | &node->private_list); | ||
164 | } | ||
158 | } | 165 | } |
159 | } | 166 | } |
160 | 167 | ||
@@ -166,6 +173,7 @@ static void page_cache_tree_delete(struct address_space *mapping, | |||
166 | void __delete_from_page_cache(struct page *page, void *shadow) | 173 | void __delete_from_page_cache(struct page *page, void *shadow) |
167 | { | 174 | { |
168 | struct address_space *mapping = page->mapping; | 175 | struct address_space *mapping = page->mapping; |
176 | int nr = hpage_nr_pages(page); | ||
169 | 177 | ||
170 | trace_mm_filemap_delete_from_page_cache(page); | 178 | trace_mm_filemap_delete_from_page_cache(page); |
171 | /* | 179 | /* |
@@ -178,6 +186,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) | |||
178 | else | 186 | else |
179 | cleancache_invalidate_page(mapping, page); | 187 | cleancache_invalidate_page(mapping, page); |
180 | 188 | ||
189 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
181 | VM_BUG_ON_PAGE(page_mapped(page), page); | 190 | VM_BUG_ON_PAGE(page_mapped(page), page); |
182 | if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { | 191 | if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(page_mapped(page))) { |
183 | int mapcount; | 192 | int mapcount; |
@@ -209,9 +218,14 @@ void __delete_from_page_cache(struct page *page, void *shadow) | |||
209 | 218 | ||
210 | /* hugetlb pages do not participate in page cache accounting. */ | 219 | /* hugetlb pages do not participate in page cache accounting. */ |
211 | if (!PageHuge(page)) | 220 | if (!PageHuge(page)) |
212 | __dec_zone_page_state(page, NR_FILE_PAGES); | 221 | __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, -nr); |
213 | if (PageSwapBacked(page)) | 222 | if (PageSwapBacked(page)) { |
214 | __dec_zone_page_state(page, NR_SHMEM); | 223 | __mod_zone_page_state(page_zone(page), NR_SHMEM, -nr); |
224 | if (PageTransHuge(page)) | ||
225 | __dec_zone_page_state(page, NR_SHMEM_THPS); | ||
226 | } else { | ||
227 | VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page); | ||
228 | } | ||
215 | 229 | ||
216 | /* | 230 | /* |
217 | * At this point page must be either written or cleaned by truncate. | 231 | * At this point page must be either written or cleaned by truncate. |
@@ -235,9 +249,8 @@ void __delete_from_page_cache(struct page *page, void *shadow) | |||
235 | */ | 249 | */ |
236 | void delete_from_page_cache(struct page *page) | 250 | void delete_from_page_cache(struct page *page) |
237 | { | 251 | { |
238 | struct address_space *mapping = page->mapping; | 252 | struct address_space *mapping = page_mapping(page); |
239 | unsigned long flags; | 253 | unsigned long flags; |
240 | |||
241 | void (*freepage)(struct page *); | 254 | void (*freepage)(struct page *); |
242 | 255 | ||
243 | BUG_ON(!PageLocked(page)); | 256 | BUG_ON(!PageLocked(page)); |
@@ -250,7 +263,13 @@ void delete_from_page_cache(struct page *page) | |||
250 | 263 | ||
251 | if (freepage) | 264 | if (freepage) |
252 | freepage(page); | 265 | freepage(page); |
253 | put_page(page); | 266 | |
267 | if (PageTransHuge(page) && !PageHuge(page)) { | ||
268 | page_ref_sub(page, HPAGE_PMD_NR); | ||
269 | VM_BUG_ON_PAGE(page_count(page) <= 0, page); | ||
270 | } else { | ||
271 | put_page(page); | ||
272 | } | ||
254 | } | 273 | } |
255 | EXPORT_SYMBOL(delete_from_page_cache); | 274 | EXPORT_SYMBOL(delete_from_page_cache); |
256 | 275 | ||
@@ -1053,7 +1072,7 @@ EXPORT_SYMBOL(page_cache_prev_hole); | |||
1053 | struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) | 1072 | struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) |
1054 | { | 1073 | { |
1055 | void **pagep; | 1074 | void **pagep; |
1056 | struct page *page; | 1075 | struct page *head, *page; |
1057 | 1076 | ||
1058 | rcu_read_lock(); | 1077 | rcu_read_lock(); |
1059 | repeat: | 1078 | repeat: |
@@ -1073,16 +1092,24 @@ repeat: | |||
1073 | */ | 1092 | */ |
1074 | goto out; | 1093 | goto out; |
1075 | } | 1094 | } |
1076 | if (!page_cache_get_speculative(page)) | 1095 | |
1096 | head = compound_head(page); | ||
1097 | if (!page_cache_get_speculative(head)) | ||
1077 | goto repeat; | 1098 | goto repeat; |
1078 | 1099 | ||
1100 | /* The page was split under us? */ | ||
1101 | if (compound_head(page) != head) { | ||
1102 | put_page(head); | ||
1103 | goto repeat; | ||
1104 | } | ||
1105 | |||
1079 | /* | 1106 | /* |
1080 | * Has the page moved? | 1107 | * Has the page moved? |
1081 | * This is part of the lockless pagecache protocol. See | 1108 | * This is part of the lockless pagecache protocol. See |
1082 | * include/linux/pagemap.h for details. | 1109 | * include/linux/pagemap.h for details. |
1083 | */ | 1110 | */ |
1084 | if (unlikely(page != *pagep)) { | 1111 | if (unlikely(page != *pagep)) { |
1085 | put_page(page); | 1112 | put_page(head); |
1086 | goto repeat; | 1113 | goto repeat; |
1087 | } | 1114 | } |
1088 | } | 1115 | } |
@@ -1118,12 +1145,12 @@ repeat: | |||
1118 | if (page && !radix_tree_exception(page)) { | 1145 | if (page && !radix_tree_exception(page)) { |
1119 | lock_page(page); | 1146 | lock_page(page); |
1120 | /* Has the page been truncated? */ | 1147 | /* Has the page been truncated? */ |
1121 | if (unlikely(page->mapping != mapping)) { | 1148 | if (unlikely(page_mapping(page) != mapping)) { |
1122 | unlock_page(page); | 1149 | unlock_page(page); |
1123 | put_page(page); | 1150 | put_page(page); |
1124 | goto repeat; | 1151 | goto repeat; |
1125 | } | 1152 | } |
1126 | VM_BUG_ON_PAGE(page->index != offset, page); | 1153 | VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page); |
1127 | } | 1154 | } |
1128 | return page; | 1155 | return page; |
1129 | } | 1156 | } |
@@ -1255,7 +1282,7 @@ unsigned find_get_entries(struct address_space *mapping, | |||
1255 | 1282 | ||
1256 | rcu_read_lock(); | 1283 | rcu_read_lock(); |
1257 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { | 1284 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { |
1258 | struct page *page; | 1285 | struct page *head, *page; |
1259 | repeat: | 1286 | repeat: |
1260 | page = radix_tree_deref_slot(slot); | 1287 | page = radix_tree_deref_slot(slot); |
1261 | if (unlikely(!page)) | 1288 | if (unlikely(!page)) |
@@ -1272,12 +1299,20 @@ repeat: | |||
1272 | */ | 1299 | */ |
1273 | goto export; | 1300 | goto export; |
1274 | } | 1301 | } |
1275 | if (!page_cache_get_speculative(page)) | 1302 | |
1303 | head = compound_head(page); | ||
1304 | if (!page_cache_get_speculative(head)) | ||
1276 | goto repeat; | 1305 | goto repeat; |
1277 | 1306 | ||
1307 | /* The page was split under us? */ | ||
1308 | if (compound_head(page) != head) { | ||
1309 | put_page(head); | ||
1310 | goto repeat; | ||
1311 | } | ||
1312 | |||
1278 | /* Has the page moved? */ | 1313 | /* Has the page moved? */ |
1279 | if (unlikely(page != *slot)) { | 1314 | if (unlikely(page != *slot)) { |
1280 | put_page(page); | 1315 | put_page(head); |
1281 | goto repeat; | 1316 | goto repeat; |
1282 | } | 1317 | } |
1283 | export: | 1318 | export: |
@@ -1318,7 +1353,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | |||
1318 | 1353 | ||
1319 | rcu_read_lock(); | 1354 | rcu_read_lock(); |
1320 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { | 1355 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { |
1321 | struct page *page; | 1356 | struct page *head, *page; |
1322 | repeat: | 1357 | repeat: |
1323 | page = radix_tree_deref_slot(slot); | 1358 | page = radix_tree_deref_slot(slot); |
1324 | if (unlikely(!page)) | 1359 | if (unlikely(!page)) |
@@ -1337,12 +1372,19 @@ repeat: | |||
1337 | continue; | 1372 | continue; |
1338 | } | 1373 | } |
1339 | 1374 | ||
1340 | if (!page_cache_get_speculative(page)) | 1375 | head = compound_head(page); |
1376 | if (!page_cache_get_speculative(head)) | ||
1341 | goto repeat; | 1377 | goto repeat; |
1342 | 1378 | ||
1379 | /* The page was split under us? */ | ||
1380 | if (compound_head(page) != head) { | ||
1381 | put_page(head); | ||
1382 | goto repeat; | ||
1383 | } | ||
1384 | |||
1343 | /* Has the page moved? */ | 1385 | /* Has the page moved? */ |
1344 | if (unlikely(page != *slot)) { | 1386 | if (unlikely(page != *slot)) { |
1345 | put_page(page); | 1387 | put_page(head); |
1346 | goto repeat; | 1388 | goto repeat; |
1347 | } | 1389 | } |
1348 | 1390 | ||
@@ -1379,7 +1421,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
1379 | 1421 | ||
1380 | rcu_read_lock(); | 1422 | rcu_read_lock(); |
1381 | radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { | 1423 | radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { |
1382 | struct page *page; | 1424 | struct page *head, *page; |
1383 | repeat: | 1425 | repeat: |
1384 | page = radix_tree_deref_slot(slot); | 1426 | page = radix_tree_deref_slot(slot); |
1385 | /* The hole, there no reason to continue */ | 1427 | /* The hole, there no reason to continue */ |
@@ -1399,12 +1441,19 @@ repeat: | |||
1399 | break; | 1441 | break; |
1400 | } | 1442 | } |
1401 | 1443 | ||
1402 | if (!page_cache_get_speculative(page)) | 1444 | head = compound_head(page); |
1445 | if (!page_cache_get_speculative(head)) | ||
1403 | goto repeat; | 1446 | goto repeat; |
1404 | 1447 | ||
1448 | /* The page was split under us? */ | ||
1449 | if (compound_head(page) != head) { | ||
1450 | put_page(head); | ||
1451 | goto repeat; | ||
1452 | } | ||
1453 | |||
1405 | /* Has the page moved? */ | 1454 | /* Has the page moved? */ |
1406 | if (unlikely(page != *slot)) { | 1455 | if (unlikely(page != *slot)) { |
1407 | put_page(page); | 1456 | put_page(head); |
1408 | goto repeat; | 1457 | goto repeat; |
1409 | } | 1458 | } |
1410 | 1459 | ||
@@ -1413,7 +1462,7 @@ repeat: | |||
1413 | * otherwise we can get both false positives and false | 1462 | * otherwise we can get both false positives and false |
1414 | * negatives, which is just confusing to the caller. | 1463 | * negatives, which is just confusing to the caller. |
1415 | */ | 1464 | */ |
1416 | if (page->mapping == NULL || page->index != iter.index) { | 1465 | if (page->mapping == NULL || page_to_pgoff(page) != iter.index) { |
1417 | put_page(page); | 1466 | put_page(page); |
1418 | break; | 1467 | break; |
1419 | } | 1468 | } |
@@ -1451,7 +1500,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | |||
1451 | rcu_read_lock(); | 1500 | rcu_read_lock(); |
1452 | radix_tree_for_each_tagged(slot, &mapping->page_tree, | 1501 | radix_tree_for_each_tagged(slot, &mapping->page_tree, |
1453 | &iter, *index, tag) { | 1502 | &iter, *index, tag) { |
1454 | struct page *page; | 1503 | struct page *head, *page; |
1455 | repeat: | 1504 | repeat: |
1456 | page = radix_tree_deref_slot(slot); | 1505 | page = radix_tree_deref_slot(slot); |
1457 | if (unlikely(!page)) | 1506 | if (unlikely(!page)) |
@@ -1476,12 +1525,19 @@ repeat: | |||
1476 | continue; | 1525 | continue; |
1477 | } | 1526 | } |
1478 | 1527 | ||
1479 | if (!page_cache_get_speculative(page)) | 1528 | head = compound_head(page); |
1529 | if (!page_cache_get_speculative(head)) | ||
1480 | goto repeat; | 1530 | goto repeat; |
1481 | 1531 | ||
1532 | /* The page was split under us? */ | ||
1533 | if (compound_head(page) != head) { | ||
1534 | put_page(head); | ||
1535 | goto repeat; | ||
1536 | } | ||
1537 | |||
1482 | /* Has the page moved? */ | 1538 | /* Has the page moved? */ |
1483 | if (unlikely(page != *slot)) { | 1539 | if (unlikely(page != *slot)) { |
1484 | put_page(page); | 1540 | put_page(head); |
1485 | goto repeat; | 1541 | goto repeat; |
1486 | } | 1542 | } |
1487 | 1543 | ||
@@ -1525,7 +1581,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, | |||
1525 | rcu_read_lock(); | 1581 | rcu_read_lock(); |
1526 | radix_tree_for_each_tagged(slot, &mapping->page_tree, | 1582 | radix_tree_for_each_tagged(slot, &mapping->page_tree, |
1527 | &iter, start, tag) { | 1583 | &iter, start, tag) { |
1528 | struct page *page; | 1584 | struct page *head, *page; |
1529 | repeat: | 1585 | repeat: |
1530 | page = radix_tree_deref_slot(slot); | 1586 | page = radix_tree_deref_slot(slot); |
1531 | if (unlikely(!page)) | 1587 | if (unlikely(!page)) |
@@ -1543,12 +1599,20 @@ repeat: | |||
1543 | */ | 1599 | */ |
1544 | goto export; | 1600 | goto export; |
1545 | } | 1601 | } |
1546 | if (!page_cache_get_speculative(page)) | 1602 | |
1603 | head = compound_head(page); | ||
1604 | if (!page_cache_get_speculative(head)) | ||
1547 | goto repeat; | 1605 | goto repeat; |
1548 | 1606 | ||
1607 | /* The page was split under us? */ | ||
1608 | if (compound_head(page) != head) { | ||
1609 | put_page(head); | ||
1610 | goto repeat; | ||
1611 | } | ||
1612 | |||
1549 | /* Has the page moved? */ | 1613 | /* Has the page moved? */ |
1550 | if (unlikely(page != *slot)) { | 1614 | if (unlikely(page != *slot)) { |
1551 | put_page(page); | 1615 | put_page(head); |
1552 | goto repeat; | 1616 | goto repeat; |
1553 | } | 1617 | } |
1554 | export: | 1618 | export: |
@@ -2128,21 +2192,21 @@ page_not_uptodate: | |||
2128 | } | 2192 | } |
2129 | EXPORT_SYMBOL(filemap_fault); | 2193 | EXPORT_SYMBOL(filemap_fault); |
2130 | 2194 | ||
2131 | void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) | 2195 | void filemap_map_pages(struct fault_env *fe, |
2196 | pgoff_t start_pgoff, pgoff_t end_pgoff) | ||
2132 | { | 2197 | { |
2133 | struct radix_tree_iter iter; | 2198 | struct radix_tree_iter iter; |
2134 | void **slot; | 2199 | void **slot; |
2135 | struct file *file = vma->vm_file; | 2200 | struct file *file = fe->vma->vm_file; |
2136 | struct address_space *mapping = file->f_mapping; | 2201 | struct address_space *mapping = file->f_mapping; |
2202 | pgoff_t last_pgoff = start_pgoff; | ||
2137 | loff_t size; | 2203 | loff_t size; |
2138 | struct page *page; | 2204 | struct page *head, *page; |
2139 | unsigned long address = (unsigned long) vmf->virtual_address; | ||
2140 | unsigned long addr; | ||
2141 | pte_t *pte; | ||
2142 | 2205 | ||
2143 | rcu_read_lock(); | 2206 | rcu_read_lock(); |
2144 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, vmf->pgoff) { | 2207 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, |
2145 | if (iter.index > vmf->max_pgoff) | 2208 | start_pgoff) { |
2209 | if (iter.index > end_pgoff) | ||
2146 | break; | 2210 | break; |
2147 | repeat: | 2211 | repeat: |
2148 | page = radix_tree_deref_slot(slot); | 2212 | page = radix_tree_deref_slot(slot); |
@@ -2156,12 +2220,19 @@ repeat: | |||
2156 | goto next; | 2220 | goto next; |
2157 | } | 2221 | } |
2158 | 2222 | ||
2159 | if (!page_cache_get_speculative(page)) | 2223 | head = compound_head(page); |
2224 | if (!page_cache_get_speculative(head)) | ||
2160 | goto repeat; | 2225 | goto repeat; |
2161 | 2226 | ||
2227 | /* The page was split under us? */ | ||
2228 | if (compound_head(page) != head) { | ||
2229 | put_page(head); | ||
2230 | goto repeat; | ||
2231 | } | ||
2232 | |||
2162 | /* Has the page moved? */ | 2233 | /* Has the page moved? */ |
2163 | if (unlikely(page != *slot)) { | 2234 | if (unlikely(page != *slot)) { |
2164 | put_page(page); | 2235 | put_page(head); |
2165 | goto repeat; | 2236 | goto repeat; |
2166 | } | 2237 | } |
2167 | 2238 | ||
@@ -2179,14 +2250,15 @@ repeat: | |||
2179 | if (page->index >= size >> PAGE_SHIFT) | 2250 | if (page->index >= size >> PAGE_SHIFT) |
2180 | goto unlock; | 2251 | goto unlock; |
2181 | 2252 | ||
2182 | pte = vmf->pte + page->index - vmf->pgoff; | ||
2183 | if (!pte_none(*pte)) | ||
2184 | goto unlock; | ||
2185 | |||
2186 | if (file->f_ra.mmap_miss > 0) | 2253 | if (file->f_ra.mmap_miss > 0) |
2187 | file->f_ra.mmap_miss--; | 2254 | file->f_ra.mmap_miss--; |
2188 | addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; | 2255 | |
2189 | do_set_pte(vma, addr, page, pte, false, false); | 2256 | fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; |
2257 | if (fe->pte) | ||
2258 | fe->pte += iter.index - last_pgoff; | ||
2259 | last_pgoff = iter.index; | ||
2260 | if (alloc_set_pte(fe, NULL, page)) | ||
2261 | goto unlock; | ||
2190 | unlock_page(page); | 2262 | unlock_page(page); |
2191 | goto next; | 2263 | goto next; |
2192 | unlock: | 2264 | unlock: |
@@ -2194,7 +2266,10 @@ unlock: | |||
2194 | skip: | 2266 | skip: |
2195 | put_page(page); | 2267 | put_page(page); |
2196 | next: | 2268 | next: |
2197 | if (iter.index == vmf->max_pgoff) | 2269 | /* Huge page is mapped? No need to proceed. */ |
2270 | if (pmd_trans_huge(*fe->pmd)) | ||
2271 | break; | ||
2272 | if (iter.index == end_pgoff) | ||
2198 | break; | 2273 | break; |
2199 | } | 2274 | } |
2200 | rcu_read_unlock(); | 2275 | rcu_read_unlock(); |
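Editor's note: every lookup path changed above now follows the same three-step lockless dance — take a speculative reference on the compound head, re-check that the page was not split while the reference was taken, then re-check the slot itself. A condensed sketch of the pattern, assuming the caller already holds rcu_read_lock() (an illustration, not a helper added by this patch):

	static struct page *get_slot_page(void **slot)
	{
		struct page *head, *page;

	repeat:
		page = radix_tree_deref_slot(slot);
		if (!page || radix_tree_exception(page))
			return NULL;			/* hole or exceptional entry */

		head = compound_head(page);
		if (!page_cache_get_speculative(head))	/* pin the whole compound page */
			goto repeat;

		if (compound_head(page) != head) {	/* the page was split under us? */
			put_page(head);
			goto repeat;
		}

		if (unlikely(page != *slot)) {		/* moved or truncated? */
			put_page(head);
			goto repeat;
		}
		return page;				/* tail ref is held via the head */
	}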
diff --git a/mm/frontswap.c b/mm/frontswap.c index 27a9924caf61..fec8b5044040 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c | |||
@@ -20,6 +20,8 @@ | |||
20 | #include <linux/frontswap.h> | 20 | #include <linux/frontswap.h> |
21 | #include <linux/swapfile.h> | 21 | #include <linux/swapfile.h> |
22 | 22 | ||
23 | DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key); | ||
24 | |||
23 | /* | 25 | /* |
24 | * frontswap_ops are added by frontswap_register_ops, and provide the | 26 | * frontswap_ops are added by frontswap_register_ops, and provide the |
25 | * frontswap "backend" implementation functions. Multiple implementations | 27 | * frontswap "backend" implementation functions. Multiple implementations |
@@ -139,6 +141,8 @@ void frontswap_register_ops(struct frontswap_ops *ops) | |||
139 | ops->next = frontswap_ops; | 141 | ops->next = frontswap_ops; |
140 | } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next); | 142 | } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next); |
141 | 143 | ||
144 | static_branch_inc(&frontswap_enabled_key); | ||
145 | |||
142 | spin_lock(&swap_lock); | 146 | spin_lock(&swap_lock); |
143 | plist_for_each_entry(si, &swap_active_head, list) { | 147 | plist_for_each_entry(si, &swap_active_head, list) { |
144 | if (si->frontswap_map) | 148 | if (si->frontswap_map) |
@@ -189,7 +193,7 @@ void __frontswap_init(unsigned type, unsigned long *map) | |||
189 | struct swap_info_struct *sis = swap_info[type]; | 193 | struct swap_info_struct *sis = swap_info[type]; |
190 | struct frontswap_ops *ops; | 194 | struct frontswap_ops *ops; |
191 | 195 | ||
192 | BUG_ON(sis == NULL); | 196 | VM_BUG_ON(sis == NULL); |
193 | 197 | ||
194 | /* | 198 | /* |
195 | * p->frontswap is a bitmap that we MUST have to figure out which page | 199 | * p->frontswap is a bitmap that we MUST have to figure out which page |
@@ -248,15 +252,9 @@ int __frontswap_store(struct page *page) | |||
248 | pgoff_t offset = swp_offset(entry); | 252 | pgoff_t offset = swp_offset(entry); |
249 | struct frontswap_ops *ops; | 253 | struct frontswap_ops *ops; |
250 | 254 | ||
251 | /* | 255 | VM_BUG_ON(!frontswap_ops); |
252 | * Return if no backend registed. | 256 | VM_BUG_ON(!PageLocked(page)); |
253 | * Don't need to inc frontswap_failed_stores here. | 257 | VM_BUG_ON(sis == NULL); |
254 | */ | ||
255 | if (!frontswap_ops) | ||
256 | return -1; | ||
257 | |||
258 | BUG_ON(!PageLocked(page)); | ||
259 | BUG_ON(sis == NULL); | ||
260 | 258 | ||
261 | /* | 259 | /* |
262 | * If a dup, we must remove the old page first; we can't leave the | 260 | * If a dup, we must remove the old page first; we can't leave the |
@@ -303,11 +301,10 @@ int __frontswap_load(struct page *page) | |||
303 | pgoff_t offset = swp_offset(entry); | 301 | pgoff_t offset = swp_offset(entry); |
304 | struct frontswap_ops *ops; | 302 | struct frontswap_ops *ops; |
305 | 303 | ||
306 | if (!frontswap_ops) | 304 | VM_BUG_ON(!frontswap_ops); |
307 | return -1; | 305 | VM_BUG_ON(!PageLocked(page)); |
306 | VM_BUG_ON(sis == NULL); | ||
308 | 307 | ||
309 | BUG_ON(!PageLocked(page)); | ||
310 | BUG_ON(sis == NULL); | ||
311 | if (!__frontswap_test(sis, offset)) | 308 | if (!__frontswap_test(sis, offset)) |
312 | return -1; | 309 | return -1; |
313 | 310 | ||
@@ -337,10 +334,9 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset) | |||
337 | struct swap_info_struct *sis = swap_info[type]; | 334 | struct swap_info_struct *sis = swap_info[type]; |
338 | struct frontswap_ops *ops; | 335 | struct frontswap_ops *ops; |
339 | 336 | ||
340 | if (!frontswap_ops) | 337 | VM_BUG_ON(!frontswap_ops); |
341 | return; | 338 | VM_BUG_ON(sis == NULL); |
342 | 339 | ||
343 | BUG_ON(sis == NULL); | ||
344 | if (!__frontswap_test(sis, offset)) | 340 | if (!__frontswap_test(sis, offset)) |
345 | return; | 341 | return; |
346 | 342 | ||
@@ -360,10 +356,9 @@ void __frontswap_invalidate_area(unsigned type) | |||
360 | struct swap_info_struct *sis = swap_info[type]; | 356 | struct swap_info_struct *sis = swap_info[type]; |
361 | struct frontswap_ops *ops; | 357 | struct frontswap_ops *ops; |
362 | 358 | ||
363 | if (!frontswap_ops) | 359 | VM_BUG_ON(!frontswap_ops); |
364 | return; | 360 | VM_BUG_ON(sis == NULL); |
365 | 361 | ||
366 | BUG_ON(sis == NULL); | ||
367 | if (sis->frontswap_map == NULL) | 362 | if (sis->frontswap_map == NULL) |
368 | return; | 363 | return; |
369 | 364 | ||
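Editor's note: the frontswap BUG_ON()s above can be demoted to VM_BUG_ON()s because callers are now expected to be gated by a static key that only flips once a backend registers (the static_branch_inc() added above); the frontswap header in this series presumably tests it along these lines. A hedged sketch of the pattern:

	DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);

	static inline bool frontswap_enabled(void)
	{
		return static_branch_unlikely(&frontswap_enabled_key);
	}

	/* registration flips the key, patching the fast-path branch to "true" */
	void demo_register_backend(void)
	{
		static_branch_inc(&frontswap_enabled_key);
	}

	/* callers bail out on the (patched-out) fast path before the slow path */
	static int demo_store(struct page *page)
	{
		if (!frontswap_enabled())
			return -1;
		return __frontswap_store(page);	/* may now VM_BUG_ON(!frontswap_ops) */
	}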
@@ -279,6 +279,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
279 | spin_unlock(ptl); | 279 | spin_unlock(ptl); |
280 | ret = 0; | 280 | ret = 0; |
281 | split_huge_pmd(vma, pmd, address); | 281 | split_huge_pmd(vma, pmd, address); |
282 | if (pmd_trans_unstable(pmd)) | ||
283 | ret = -EBUSY; | ||
282 | } else { | 284 | } else { |
283 | get_page(page); | 285 | get_page(page); |
284 | spin_unlock(ptl); | 286 | spin_unlock(ptl); |
@@ -286,6 +288,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
286 | ret = split_huge_page(page); | 288 | ret = split_huge_page(page); |
287 | unlock_page(page); | 289 | unlock_page(page); |
288 | put_page(page); | 290 | put_page(page); |
291 | if (pmd_none(*pmd)) | ||
292 | return no_page_table(vma, flags); | ||
289 | } | 293 | } |
290 | 294 | ||
291 | return ret ? ERR_PTR(ret) : | 295 | return ret ? ERR_PTR(ret) : |
@@ -350,7 +354,6 @@ unmap: | |||
350 | static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, | 354 | static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, |
351 | unsigned long address, unsigned int *flags, int *nonblocking) | 355 | unsigned long address, unsigned int *flags, int *nonblocking) |
352 | { | 356 | { |
353 | struct mm_struct *mm = vma->vm_mm; | ||
354 | unsigned int fault_flags = 0; | 357 | unsigned int fault_flags = 0; |
355 | int ret; | 358 | int ret; |
356 | 359 | ||
@@ -375,7 +378,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, | |||
375 | fault_flags |= FAULT_FLAG_TRIED; | 378 | fault_flags |= FAULT_FLAG_TRIED; |
376 | } | 379 | } |
377 | 380 | ||
378 | ret = handle_mm_fault(mm, vma, address, fault_flags); | 381 | ret = handle_mm_fault(vma, address, fault_flags); |
379 | if (ret & VM_FAULT_ERROR) { | 382 | if (ret & VM_FAULT_ERROR) { |
380 | if (ret & VM_FAULT_OOM) | 383 | if (ret & VM_FAULT_OOM) |
381 | return -ENOMEM; | 384 | return -ENOMEM; |
@@ -690,7 +693,7 @@ retry: | |||
690 | if (!vma_permits_fault(vma, fault_flags)) | 693 | if (!vma_permits_fault(vma, fault_flags)) |
691 | return -EFAULT; | 694 | return -EFAULT; |
692 | 695 | ||
693 | ret = handle_mm_fault(mm, vma, address, fault_flags); | 696 | ret = handle_mm_fault(vma, address, fault_flags); |
694 | major |= ret & VM_FAULT_MAJOR; | 697 | major |= ret & VM_FAULT_MAJOR; |
695 | if (ret & VM_FAULT_ERROR) { | 698 | if (ret & VM_FAULT_ERROR) { |
696 | if (ret & VM_FAULT_OOM) | 699 | if (ret & VM_FAULT_OOM) |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 343a2b7e57aa..3647334c2ef9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -18,7 +18,6 @@ | |||
18 | #include <linux/mm_inline.h> | 18 | #include <linux/mm_inline.h> |
19 | #include <linux/swapops.h> | 19 | #include <linux/swapops.h> |
20 | #include <linux/dax.h> | 20 | #include <linux/dax.h> |
21 | #include <linux/kthread.h> | ||
22 | #include <linux/khugepaged.h> | 21 | #include <linux/khugepaged.h> |
23 | #include <linux/freezer.h> | 22 | #include <linux/freezer.h> |
24 | #include <linux/pfn_t.h> | 23 | #include <linux/pfn_t.h> |
@@ -30,39 +29,12 @@ | |||
30 | #include <linux/hashtable.h> | 29 | #include <linux/hashtable.h> |
31 | #include <linux/userfaultfd_k.h> | 30 | #include <linux/userfaultfd_k.h> |
32 | #include <linux/page_idle.h> | 31 | #include <linux/page_idle.h> |
32 | #include <linux/shmem_fs.h> | ||
33 | 33 | ||
34 | #include <asm/tlb.h> | 34 | #include <asm/tlb.h> |
35 | #include <asm/pgalloc.h> | 35 | #include <asm/pgalloc.h> |
36 | #include "internal.h" | 36 | #include "internal.h" |
37 | 37 | ||
38 | enum scan_result { | ||
39 | SCAN_FAIL, | ||
40 | SCAN_SUCCEED, | ||
41 | SCAN_PMD_NULL, | ||
42 | SCAN_EXCEED_NONE_PTE, | ||
43 | SCAN_PTE_NON_PRESENT, | ||
44 | SCAN_PAGE_RO, | ||
45 | SCAN_NO_REFERENCED_PAGE, | ||
46 | SCAN_PAGE_NULL, | ||
47 | SCAN_SCAN_ABORT, | ||
48 | SCAN_PAGE_COUNT, | ||
49 | SCAN_PAGE_LRU, | ||
50 | SCAN_PAGE_LOCK, | ||
51 | SCAN_PAGE_ANON, | ||
52 | SCAN_PAGE_COMPOUND, | ||
53 | SCAN_ANY_PROCESS, | ||
54 | SCAN_VMA_NULL, | ||
55 | SCAN_VMA_CHECK, | ||
56 | SCAN_ADDRESS_RANGE, | ||
57 | SCAN_SWAP_CACHE_PAGE, | ||
58 | SCAN_DEL_PAGE_LRU, | ||
59 | SCAN_ALLOC_HUGE_PAGE_FAIL, | ||
60 | SCAN_CGROUP_CHARGE_FAIL | ||
61 | }; | ||
62 | |||
63 | #define CREATE_TRACE_POINTS | ||
64 | #include <trace/events/huge_memory.h> | ||
65 | |||
66 | /* | 38 | /* |
67 | * By default transparent hugepage support is disabled in order that avoid | 39 | * By default transparent hugepage support is disabled in order that avoid |
68 | * to risk increase the memory footprint of applications without a guaranteed | 40 | * to risk increase the memory footprint of applications without a guaranteed |
@@ -82,127 +54,8 @@ unsigned long transparent_hugepage_flags __read_mostly = | |||
82 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| | 54 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| |
83 | (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | 55 | (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); |
84 | 56 | ||
85 | /* default scan 8*512 pte (or vmas) every 30 second */ | ||
86 | static unsigned int khugepaged_pages_to_scan __read_mostly; | ||
87 | static unsigned int khugepaged_pages_collapsed; | ||
88 | static unsigned int khugepaged_full_scans; | ||
89 | static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; | ||
90 | /* during fragmentation poll the hugepage allocator once every minute */ | ||
91 | static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; | ||
92 | static unsigned long khugepaged_sleep_expire; | ||
93 | static struct task_struct *khugepaged_thread __read_mostly; | ||
94 | static DEFINE_MUTEX(khugepaged_mutex); | ||
95 | static DEFINE_SPINLOCK(khugepaged_mm_lock); | ||
96 | static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); | ||
97 | /* | ||
98 | * default collapse hugepages if there is at least one pte mapped like | ||
99 | * it would have happened if the vma was large enough during page | ||
100 | * fault. | ||
101 | */ | ||
102 | static unsigned int khugepaged_max_ptes_none __read_mostly; | ||
103 | |||
104 | static int khugepaged(void *none); | ||
105 | static int khugepaged_slab_init(void); | ||
106 | static void khugepaged_slab_exit(void); | ||
107 | |||
108 | #define MM_SLOTS_HASH_BITS 10 | ||
109 | static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); | ||
110 | |||
111 | static struct kmem_cache *mm_slot_cache __read_mostly; | ||
112 | |||
113 | /** | ||
114 | * struct mm_slot - hash lookup from mm to mm_slot | ||
115 | * @hash: hash collision list | ||
116 | * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head | ||
117 | * @mm: the mm that this information is valid for | ||
118 | */ | ||
119 | struct mm_slot { | ||
120 | struct hlist_node hash; | ||
121 | struct list_head mm_node; | ||
122 | struct mm_struct *mm; | ||
123 | }; | ||
124 | |||
125 | /** | ||
126 | * struct khugepaged_scan - cursor for scanning | ||
127 | * @mm_head: the head of the mm list to scan | ||
128 | * @mm_slot: the current mm_slot we are scanning | ||
129 | * @address: the next address inside that to be scanned | ||
130 | * | ||
131 | * There is only the one khugepaged_scan instance of this cursor structure. | ||
132 | */ | ||
133 | struct khugepaged_scan { | ||
134 | struct list_head mm_head; | ||
135 | struct mm_slot *mm_slot; | ||
136 | unsigned long address; | ||
137 | }; | ||
138 | static struct khugepaged_scan khugepaged_scan = { | ||
139 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), | ||
140 | }; | ||
141 | |||
142 | static struct shrinker deferred_split_shrinker; | 57 | static struct shrinker deferred_split_shrinker; |
143 | 58 | ||
144 | static void set_recommended_min_free_kbytes(void) | ||
145 | { | ||
146 | struct zone *zone; | ||
147 | int nr_zones = 0; | ||
148 | unsigned long recommended_min; | ||
149 | |||
150 | for_each_populated_zone(zone) | ||
151 | nr_zones++; | ||
152 | |||
153 | /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ | ||
154 | recommended_min = pageblock_nr_pages * nr_zones * 2; | ||
155 | |||
156 | /* | ||
157 | * Make sure that on average at least two pageblocks are almost free | ||
158 | * of another type, one for a migratetype to fall back to and a | ||
159 | * second to avoid subsequent fallbacks of other types. There are 3 | ||
160 | * MIGRATE_TYPES we care about. | ||
161 | */ | ||
162 | recommended_min += pageblock_nr_pages * nr_zones * | ||
163 | MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; | ||
164 | |||
165 | /* never allow reserving more than 5% of the lowmem */ | ||
166 | recommended_min = min(recommended_min, | ||
167 | (unsigned long) nr_free_buffer_pages() / 20); | ||
168 | recommended_min <<= (PAGE_SHIFT-10); | ||
169 | |||
170 | if (recommended_min > min_free_kbytes) { | ||
171 | if (user_min_free_kbytes >= 0) | ||
172 | pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", | ||
173 | min_free_kbytes, recommended_min); | ||
174 | |||
175 | min_free_kbytes = recommended_min; | ||
176 | } | ||
177 | setup_per_zone_wmarks(); | ||
178 | } | ||
179 | |||
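The watermark bump performed by set_recommended_min_free_kbytes() above is plain arithmetic over a handful of zone constants. Below is a minimal userspace sketch of the same calculation, assuming 4 KiB pages, 512-page pageblocks and MIGRATE_PCPTYPES == 3; the zone count and free-page figure are made up for illustration and are not read from a live kernel.

	#include <stdio.h>

	int main(void)
	{
		const unsigned long pageblock_nr_pages = 512;	/* assumed: 2 MiB pageblock / 4 KiB page */
		const unsigned long migrate_pcptypes = 3;	/* assumed MIGRATE_PCPTYPES */
		const unsigned long nr_zones = 2;		/* e.g. DMA32 + Normal, illustrative */
		const unsigned long free_buffer_pages = 4000000; /* illustrative lowmem size in pages */
		const unsigned long page_shift = 12;		/* 4 KiB pages */
		unsigned long recommended_min;

		/* two free pageblocks per zone to assist fragmentation avoidance */
		recommended_min = pageblock_nr_pages * nr_zones * 2;

		/* pageblocks kept almost free of other migratetypes */
		recommended_min += pageblock_nr_pages * nr_zones *
				   migrate_pcptypes * migrate_pcptypes;

		/* cap at 5% of lowmem, then convert pages to kilobytes */
		if (recommended_min > free_buffer_pages / 20)
			recommended_min = free_buffer_pages / 20;
		recommended_min <<= (page_shift - 10);

		printf("recommended min_free_kbytes: %lu\n", recommended_min);
		return 0;
	}

With these assumed numbers the program prints 45056, i.e. the watermark would be raised to roughly 44 MiB.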
180 | static int start_stop_khugepaged(void) | ||
181 | { | ||
182 | int err = 0; | ||
183 | if (khugepaged_enabled()) { | ||
184 | if (!khugepaged_thread) | ||
185 | khugepaged_thread = kthread_run(khugepaged, NULL, | ||
186 | "khugepaged"); | ||
187 | if (IS_ERR(khugepaged_thread)) { | ||
188 | pr_err("khugepaged: kthread_run(khugepaged) failed\n"); | ||
189 | err = PTR_ERR(khugepaged_thread); | ||
190 | khugepaged_thread = NULL; | ||
191 | goto fail; | ||
192 | } | ||
193 | |||
194 | if (!list_empty(&khugepaged_scan.mm_head)) | ||
195 | wake_up_interruptible(&khugepaged_wait); | ||
196 | |||
197 | set_recommended_min_free_kbytes(); | ||
198 | } else if (khugepaged_thread) { | ||
199 | kthread_stop(khugepaged_thread); | ||
200 | khugepaged_thread = NULL; | ||
201 | } | ||
202 | fail: | ||
203 | return err; | ||
204 | } | ||
205 | |||
206 | static atomic_t huge_zero_refcount; | 59 | static atomic_t huge_zero_refcount; |
207 | struct page *huge_zero_page __read_mostly; | 60 | struct page *huge_zero_page __read_mostly; |
208 | 61 | ||
@@ -328,12 +181,7 @@ static ssize_t enabled_store(struct kobject *kobj, | |||
328 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | 181 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); |
329 | 182 | ||
330 | if (ret > 0) { | 183 | if (ret > 0) { |
331 | int err; | 184 | int err = start_stop_khugepaged(); |
332 | |||
333 | mutex_lock(&khugepaged_mutex); | ||
334 | err = start_stop_khugepaged(); | ||
335 | mutex_unlock(&khugepaged_mutex); | ||
336 | |||
337 | if (err) | 185 | if (err) |
338 | ret = err; | 186 | ret = err; |
339 | } | 187 | } |
@@ -343,7 +191,7 @@ static ssize_t enabled_store(struct kobject *kobj, | |||
343 | static struct kobj_attribute enabled_attr = | 191 | static struct kobj_attribute enabled_attr = |
344 | __ATTR(enabled, 0644, enabled_show, enabled_store); | 192 | __ATTR(enabled, 0644, enabled_show, enabled_store); |
345 | 193 | ||
346 | static ssize_t single_flag_show(struct kobject *kobj, | 194 | ssize_t single_hugepage_flag_show(struct kobject *kobj, |
347 | struct kobj_attribute *attr, char *buf, | 195 | struct kobj_attribute *attr, char *buf, |
348 | enum transparent_hugepage_flag flag) | 196 | enum transparent_hugepage_flag flag) |
349 | { | 197 | { |
@@ -351,7 +199,7 @@ static ssize_t single_flag_show(struct kobject *kobj, | |||
351 | !!test_bit(flag, &transparent_hugepage_flags)); | 199 | !!test_bit(flag, &transparent_hugepage_flags)); |
352 | } | 200 | } |
353 | 201 | ||
354 | static ssize_t single_flag_store(struct kobject *kobj, | 202 | ssize_t single_hugepage_flag_store(struct kobject *kobj, |
355 | struct kobj_attribute *attr, | 203 | struct kobj_attribute *attr, |
356 | const char *buf, size_t count, | 204 | const char *buf, size_t count, |
357 | enum transparent_hugepage_flag flag) | 205 | enum transparent_hugepage_flag flag) |
@@ -406,13 +254,13 @@ static struct kobj_attribute defrag_attr = | |||
406 | static ssize_t use_zero_page_show(struct kobject *kobj, | 254 | static ssize_t use_zero_page_show(struct kobject *kobj, |
407 | struct kobj_attribute *attr, char *buf) | 255 | struct kobj_attribute *attr, char *buf) |
408 | { | 256 | { |
409 | return single_flag_show(kobj, attr, buf, | 257 | return single_hugepage_flag_show(kobj, attr, buf, |
410 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | 258 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); |
411 | } | 259 | } |
412 | static ssize_t use_zero_page_store(struct kobject *kobj, | 260 | static ssize_t use_zero_page_store(struct kobject *kobj, |
413 | struct kobj_attribute *attr, const char *buf, size_t count) | 261 | struct kobj_attribute *attr, const char *buf, size_t count) |
414 | { | 262 | { |
415 | return single_flag_store(kobj, attr, buf, count, | 263 | return single_hugepage_flag_store(kobj, attr, buf, count, |
416 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | 264 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); |
417 | } | 265 | } |
418 | static struct kobj_attribute use_zero_page_attr = | 266 | static struct kobj_attribute use_zero_page_attr = |
@@ -421,14 +269,14 @@ static struct kobj_attribute use_zero_page_attr = | |||
421 | static ssize_t debug_cow_show(struct kobject *kobj, | 269 | static ssize_t debug_cow_show(struct kobject *kobj, |
422 | struct kobj_attribute *attr, char *buf) | 270 | struct kobj_attribute *attr, char *buf) |
423 | { | 271 | { |
424 | return single_flag_show(kobj, attr, buf, | 272 | return single_hugepage_flag_show(kobj, attr, buf, |
425 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); | 273 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); |
426 | } | 274 | } |
427 | static ssize_t debug_cow_store(struct kobject *kobj, | 275 | static ssize_t debug_cow_store(struct kobject *kobj, |
428 | struct kobj_attribute *attr, | 276 | struct kobj_attribute *attr, |
429 | const char *buf, size_t count) | 277 | const char *buf, size_t count) |
430 | { | 278 | { |
431 | return single_flag_store(kobj, attr, buf, count, | 279 | return single_hugepage_flag_store(kobj, attr, buf, count, |
432 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); | 280 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG); |
433 | } | 281 | } |
434 | static struct kobj_attribute debug_cow_attr = | 282 | static struct kobj_attribute debug_cow_attr = |
@@ -439,6 +287,9 @@ static struct attribute *hugepage_attr[] = { | |||
439 | &enabled_attr.attr, | 287 | &enabled_attr.attr, |
440 | &defrag_attr.attr, | 288 | &defrag_attr.attr, |
441 | &use_zero_page_attr.attr, | 289 | &use_zero_page_attr.attr, |
290 | #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) | ||
291 | &shmem_enabled_attr.attr, | ||
292 | #endif | ||
442 | #ifdef CONFIG_DEBUG_VM | 293 | #ifdef CONFIG_DEBUG_VM |
443 | &debug_cow_attr.attr, | 294 | &debug_cow_attr.attr, |
444 | #endif | 295 | #endif |
@@ -449,171 +300,6 @@ static struct attribute_group hugepage_attr_group = { | |||
449 | .attrs = hugepage_attr, | 300 | .attrs = hugepage_attr, |
450 | }; | 301 | }; |
451 | 302 | ||
452 | static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, | ||
453 | struct kobj_attribute *attr, | ||
454 | char *buf) | ||
455 | { | ||
456 | return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); | ||
457 | } | ||
458 | |||
459 | static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, | ||
460 | struct kobj_attribute *attr, | ||
461 | const char *buf, size_t count) | ||
462 | { | ||
463 | unsigned long msecs; | ||
464 | int err; | ||
465 | |||
466 | err = kstrtoul(buf, 10, &msecs); | ||
467 | if (err || msecs > UINT_MAX) | ||
468 | return -EINVAL; | ||
469 | |||
470 | khugepaged_scan_sleep_millisecs = msecs; | ||
471 | khugepaged_sleep_expire = 0; | ||
472 | wake_up_interruptible(&khugepaged_wait); | ||
473 | |||
474 | return count; | ||
475 | } | ||
476 | static struct kobj_attribute scan_sleep_millisecs_attr = | ||
477 | __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, | ||
478 | scan_sleep_millisecs_store); | ||
479 | |||
480 | static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, | ||
481 | struct kobj_attribute *attr, | ||
482 | char *buf) | ||
483 | { | ||
484 | return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); | ||
485 | } | ||
486 | |||
487 | static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, | ||
488 | struct kobj_attribute *attr, | ||
489 | const char *buf, size_t count) | ||
490 | { | ||
491 | unsigned long msecs; | ||
492 | int err; | ||
493 | |||
494 | err = kstrtoul(buf, 10, &msecs); | ||
495 | if (err || msecs > UINT_MAX) | ||
496 | return -EINVAL; | ||
497 | |||
498 | khugepaged_alloc_sleep_millisecs = msecs; | ||
499 | khugepaged_sleep_expire = 0; | ||
500 | wake_up_interruptible(&khugepaged_wait); | ||
501 | |||
502 | return count; | ||
503 | } | ||
504 | static struct kobj_attribute alloc_sleep_millisecs_attr = | ||
505 | __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, | ||
506 | alloc_sleep_millisecs_store); | ||
507 | |||
508 | static ssize_t pages_to_scan_show(struct kobject *kobj, | ||
509 | struct kobj_attribute *attr, | ||
510 | char *buf) | ||
511 | { | ||
512 | return sprintf(buf, "%u\n", khugepaged_pages_to_scan); | ||
513 | } | ||
514 | static ssize_t pages_to_scan_store(struct kobject *kobj, | ||
515 | struct kobj_attribute *attr, | ||
516 | const char *buf, size_t count) | ||
517 | { | ||
518 | int err; | ||
519 | unsigned long pages; | ||
520 | |||
521 | err = kstrtoul(buf, 10, &pages); | ||
522 | if (err || !pages || pages > UINT_MAX) | ||
523 | return -EINVAL; | ||
524 | |||
525 | khugepaged_pages_to_scan = pages; | ||
526 | |||
527 | return count; | ||
528 | } | ||
529 | static struct kobj_attribute pages_to_scan_attr = | ||
530 | __ATTR(pages_to_scan, 0644, pages_to_scan_show, | ||
531 | pages_to_scan_store); | ||
532 | |||
533 | static ssize_t pages_collapsed_show(struct kobject *kobj, | ||
534 | struct kobj_attribute *attr, | ||
535 | char *buf) | ||
536 | { | ||
537 | return sprintf(buf, "%u\n", khugepaged_pages_collapsed); | ||
538 | } | ||
539 | static struct kobj_attribute pages_collapsed_attr = | ||
540 | __ATTR_RO(pages_collapsed); | ||
541 | |||
542 | static ssize_t full_scans_show(struct kobject *kobj, | ||
543 | struct kobj_attribute *attr, | ||
544 | char *buf) | ||
545 | { | ||
546 | return sprintf(buf, "%u\n", khugepaged_full_scans); | ||
547 | } | ||
548 | static struct kobj_attribute full_scans_attr = | ||
549 | __ATTR_RO(full_scans); | ||
550 | |||
551 | static ssize_t khugepaged_defrag_show(struct kobject *kobj, | ||
552 | struct kobj_attribute *attr, char *buf) | ||
553 | { | ||
554 | return single_flag_show(kobj, attr, buf, | ||
555 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
556 | } | ||
557 | static ssize_t khugepaged_defrag_store(struct kobject *kobj, | ||
558 | struct kobj_attribute *attr, | ||
559 | const char *buf, size_t count) | ||
560 | { | ||
561 | return single_flag_store(kobj, attr, buf, count, | ||
562 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
563 | } | ||
564 | static struct kobj_attribute khugepaged_defrag_attr = | ||
565 | __ATTR(defrag, 0644, khugepaged_defrag_show, | ||
566 | khugepaged_defrag_store); | ||
567 | |||
568 | /* | ||
569 | * max_ptes_none controls whether khugepaged should collapse hugepages over | ||
570 | * any unmapped ptes, in turn potentially increasing the memory | ||
571 | * footprint of the vmas. When max_ptes_none is 0, khugepaged will not | ||
572 | * reduce the available free memory in the system as it | ||
573 | * runs. Increasing max_ptes_none will instead potentially reduce the | ||
574 | * free memory in the system during the khugepaged scan. | ||
575 | */ | ||
576 | static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, | ||
577 | struct kobj_attribute *attr, | ||
578 | char *buf) | ||
579 | { | ||
580 | return sprintf(buf, "%u\n", khugepaged_max_ptes_none); | ||
581 | } | ||
582 | static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, | ||
583 | struct kobj_attribute *attr, | ||
584 | const char *buf, size_t count) | ||
585 | { | ||
586 | int err; | ||
587 | unsigned long max_ptes_none; | ||
588 | |||
589 | err = kstrtoul(buf, 10, &max_ptes_none); | ||
590 | if (err || max_ptes_none > HPAGE_PMD_NR-1) | ||
591 | return -EINVAL; | ||
592 | |||
593 | khugepaged_max_ptes_none = max_ptes_none; | ||
594 | |||
595 | return count; | ||
596 | } | ||
597 | static struct kobj_attribute khugepaged_max_ptes_none_attr = | ||
598 | __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, | ||
599 | khugepaged_max_ptes_none_store); | ||
600 | |||
601 | static struct attribute *khugepaged_attr[] = { | ||
602 | &khugepaged_defrag_attr.attr, | ||
603 | &khugepaged_max_ptes_none_attr.attr, | ||
604 | &pages_to_scan_attr.attr, | ||
605 | &pages_collapsed_attr.attr, | ||
606 | &full_scans_attr.attr, | ||
607 | &scan_sleep_millisecs_attr.attr, | ||
608 | &alloc_sleep_millisecs_attr.attr, | ||
609 | NULL, | ||
610 | }; | ||
611 | |||
612 | static struct attribute_group khugepaged_attr_group = { | ||
613 | .attrs = khugepaged_attr, | ||
614 | .name = "khugepaged", | ||
615 | }; | ||
616 | |||
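The khugepaged attribute group removed above is exposed to userspace under /sys/kernel/mm/transparent_hugepage/khugepaged/. As a quick illustration, a small C reader can dump the current tuning; this is only a sketch and silently skips attributes that a given kernel does not provide.

	#include <stdio.h>

	int main(void)
	{
		static const char *names[] = {
			"defrag", "max_ptes_none", "pages_to_scan",
			"pages_collapsed", "full_scans",
			"scan_sleep_millisecs", "alloc_sleep_millisecs",
		};
		char path[256], buf[64];

		for (unsigned int i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
			snprintf(path, sizeof(path),
				 "/sys/kernel/mm/transparent_hugepage/khugepaged/%s",
				 names[i]);
			FILE *f = fopen(path, "r");
			if (!f)
				continue;	/* attribute missing on this kernel */
			if (fgets(buf, sizeof(buf), f))
				printf("%-22s %s", names[i], buf);
			fclose(f);
		}
		return 0;
	}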
617 | static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) | 303 | static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) |
618 | { | 304 | { |
619 | int err; | 305 | int err; |
@@ -672,8 +358,6 @@ static int __init hugepage_init(void) | |||
672 | return -EINVAL; | 358 | return -EINVAL; |
673 | } | 359 | } |
674 | 360 | ||
675 | khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; | ||
676 | khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; | ||
677 | /* | 361 | /* |
678 | * hugepages can't be allocated by the buddy allocator | 362 | * hugepages can't be allocated by the buddy allocator |
679 | */ | 363 | */ |
@@ -688,7 +372,7 @@ static int __init hugepage_init(void) | |||
688 | if (err) | 372 | if (err) |
689 | goto err_sysfs; | 373 | goto err_sysfs; |
690 | 374 | ||
691 | err = khugepaged_slab_init(); | 375 | err = khugepaged_init(); |
692 | if (err) | 376 | if (err) |
693 | goto err_slab; | 377 | goto err_slab; |
694 | 378 | ||
@@ -719,7 +403,7 @@ err_khugepaged: | |||
719 | err_split_shrinker: | 403 | err_split_shrinker: |
720 | unregister_shrinker(&huge_zero_page_shrinker); | 404 | unregister_shrinker(&huge_zero_page_shrinker); |
721 | err_hzp_shrinker: | 405 | err_hzp_shrinker: |
722 | khugepaged_slab_exit(); | 406 | khugepaged_destroy(); |
723 | err_slab: | 407 | err_slab: |
724 | hugepage_exit_sysfs(hugepage_kobj); | 408 | hugepage_exit_sysfs(hugepage_kobj); |
725 | err_sysfs: | 409 | err_sysfs: |
@@ -765,11 +449,6 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | |||
765 | return pmd; | 449 | return pmd; |
766 | } | 450 | } |
767 | 451 | ||
768 | static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) | ||
769 | { | ||
770 | return pmd_mkhuge(mk_pmd(page, prot)); | ||
771 | } | ||
772 | |||
773 | static inline struct list_head *page_deferred_list(struct page *page) | 452 | static inline struct list_head *page_deferred_list(struct page *page) |
774 | { | 453 | { |
775 | /* | 454 | /* |
@@ -790,26 +469,23 @@ void prep_transhuge_page(struct page *page) | |||
790 | set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); | 469 | set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); |
791 | } | 470 | } |
792 | 471 | ||
793 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | 472 | static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, |
794 | struct vm_area_struct *vma, | 473 | gfp_t gfp) |
795 | unsigned long address, pmd_t *pmd, | ||
796 | struct page *page, gfp_t gfp, | ||
797 | unsigned int flags) | ||
798 | { | 474 | { |
475 | struct vm_area_struct *vma = fe->vma; | ||
799 | struct mem_cgroup *memcg; | 476 | struct mem_cgroup *memcg; |
800 | pgtable_t pgtable; | 477 | pgtable_t pgtable; |
801 | spinlock_t *ptl; | 478 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; |
802 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
803 | 479 | ||
804 | VM_BUG_ON_PAGE(!PageCompound(page), page); | 480 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
805 | 481 | ||
806 | if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) { | 482 | if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { |
807 | put_page(page); | 483 | put_page(page); |
808 | count_vm_event(THP_FAULT_FALLBACK); | 484 | count_vm_event(THP_FAULT_FALLBACK); |
809 | return VM_FAULT_FALLBACK; | 485 | return VM_FAULT_FALLBACK; |
810 | } | 486 | } |
811 | 487 | ||
812 | pgtable = pte_alloc_one(mm, haddr); | 488 | pgtable = pte_alloc_one(vma->vm_mm, haddr); |
813 | if (unlikely(!pgtable)) { | 489 | if (unlikely(!pgtable)) { |
814 | mem_cgroup_cancel_charge(page, memcg, true); | 490 | mem_cgroup_cancel_charge(page, memcg, true); |
815 | put_page(page); | 491 | put_page(page); |
@@ -824,12 +500,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
824 | */ | 500 | */ |
825 | __SetPageUptodate(page); | 501 | __SetPageUptodate(page); |
826 | 502 | ||
827 | ptl = pmd_lock(mm, pmd); | 503 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); |
828 | if (unlikely(!pmd_none(*pmd))) { | 504 | if (unlikely(!pmd_none(*fe->pmd))) { |
829 | spin_unlock(ptl); | 505 | spin_unlock(fe->ptl); |
830 | mem_cgroup_cancel_charge(page, memcg, true); | 506 | mem_cgroup_cancel_charge(page, memcg, true); |
831 | put_page(page); | 507 | put_page(page); |
832 | pte_free(mm, pgtable); | 508 | pte_free(vma->vm_mm, pgtable); |
833 | } else { | 509 | } else { |
834 | pmd_t entry; | 510 | pmd_t entry; |
835 | 511 | ||
@@ -837,12 +513,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
837 | if (userfaultfd_missing(vma)) { | 513 | if (userfaultfd_missing(vma)) { |
838 | int ret; | 514 | int ret; |
839 | 515 | ||
840 | spin_unlock(ptl); | 516 | spin_unlock(fe->ptl); |
841 | mem_cgroup_cancel_charge(page, memcg, true); | 517 | mem_cgroup_cancel_charge(page, memcg, true); |
842 | put_page(page); | 518 | put_page(page); |
843 | pte_free(mm, pgtable); | 519 | pte_free(vma->vm_mm, pgtable); |
844 | ret = handle_userfault(vma, address, flags, | 520 | ret = handle_userfault(fe, VM_UFFD_MISSING); |
845 | VM_UFFD_MISSING); | ||
846 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); | 521 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); |
847 | return ret; | 522 | return ret; |
848 | } | 523 | } |
@@ -852,11 +527,11 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
852 | page_add_new_anon_rmap(page, vma, haddr, true); | 527 | page_add_new_anon_rmap(page, vma, haddr, true); |
853 | mem_cgroup_commit_charge(page, memcg, false, true); | 528 | mem_cgroup_commit_charge(page, memcg, false, true); |
854 | lru_cache_add_active_or_unevictable(page, vma); | 529 | lru_cache_add_active_or_unevictable(page, vma); |
855 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 530 | pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); |
856 | set_pmd_at(mm, haddr, pmd, entry); | 531 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); |
857 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 532 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
858 | atomic_long_inc(&mm->nr_ptes); | 533 | atomic_long_inc(&vma->vm_mm->nr_ptes); |
859 | spin_unlock(ptl); | 534 | spin_unlock(fe->ptl); |
860 | count_vm_event(THP_FAULT_ALLOC); | 535 | count_vm_event(THP_FAULT_ALLOC); |
861 | } | 536 | } |
862 | 537 | ||
@@ -883,12 +558,6 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) | |||
883 | return GFP_TRANSHUGE | reclaim_flags; | 558 | return GFP_TRANSHUGE | reclaim_flags; |
884 | } | 559 | } |
885 | 560 | ||
886 | /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ | ||
887 | static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) | ||
888 | { | ||
889 | return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); | ||
890 | } | ||
891 | |||
892 | /* Caller must hold page table lock. */ | 561 | /* Caller must hold page table lock. */ |
893 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | 562 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, |
894 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | 563 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, |
@@ -906,13 +575,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | |||
906 | return true; | 575 | return true; |
907 | } | 576 | } |
908 | 577 | ||
909 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 578 | int do_huge_pmd_anonymous_page(struct fault_env *fe) |
910 | unsigned long address, pmd_t *pmd, | ||
911 | unsigned int flags) | ||
912 | { | 579 | { |
580 | struct vm_area_struct *vma = fe->vma; | ||
913 | gfp_t gfp; | 581 | gfp_t gfp; |
914 | struct page *page; | 582 | struct page *page; |
915 | unsigned long haddr = address & HPAGE_PMD_MASK; | 583 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; |
916 | 584 | ||
917 | if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) | 585 | if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) |
918 | return VM_FAULT_FALLBACK; | 586 | return VM_FAULT_FALLBACK; |
@@ -920,42 +588,40 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
920 | return VM_FAULT_OOM; | 588 | return VM_FAULT_OOM; |
921 | if (unlikely(khugepaged_enter(vma, vma->vm_flags))) | 589 | if (unlikely(khugepaged_enter(vma, vma->vm_flags))) |
922 | return VM_FAULT_OOM; | 590 | return VM_FAULT_OOM; |
923 | if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) && | 591 | if (!(fe->flags & FAULT_FLAG_WRITE) && |
592 | !mm_forbids_zeropage(vma->vm_mm) && | ||
924 | transparent_hugepage_use_zero_page()) { | 593 | transparent_hugepage_use_zero_page()) { |
925 | spinlock_t *ptl; | ||
926 | pgtable_t pgtable; | 594 | pgtable_t pgtable; |
927 | struct page *zero_page; | 595 | struct page *zero_page; |
928 | bool set; | 596 | bool set; |
929 | int ret; | 597 | int ret; |
930 | pgtable = pte_alloc_one(mm, haddr); | 598 | pgtable = pte_alloc_one(vma->vm_mm, haddr); |
931 | if (unlikely(!pgtable)) | 599 | if (unlikely(!pgtable)) |
932 | return VM_FAULT_OOM; | 600 | return VM_FAULT_OOM; |
933 | zero_page = get_huge_zero_page(); | 601 | zero_page = get_huge_zero_page(); |
934 | if (unlikely(!zero_page)) { | 602 | if (unlikely(!zero_page)) { |
935 | pte_free(mm, pgtable); | 603 | pte_free(vma->vm_mm, pgtable); |
936 | count_vm_event(THP_FAULT_FALLBACK); | 604 | count_vm_event(THP_FAULT_FALLBACK); |
937 | return VM_FAULT_FALLBACK; | 605 | return VM_FAULT_FALLBACK; |
938 | } | 606 | } |
939 | ptl = pmd_lock(mm, pmd); | 607 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); |
940 | ret = 0; | 608 | ret = 0; |
941 | set = false; | 609 | set = false; |
942 | if (pmd_none(*pmd)) { | 610 | if (pmd_none(*fe->pmd)) { |
943 | if (userfaultfd_missing(vma)) { | 611 | if (userfaultfd_missing(vma)) { |
944 | spin_unlock(ptl); | 612 | spin_unlock(fe->ptl); |
945 | ret = handle_userfault(vma, address, flags, | 613 | ret = handle_userfault(fe, VM_UFFD_MISSING); |
946 | VM_UFFD_MISSING); | ||
947 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); | 614 | VM_BUG_ON(ret & VM_FAULT_FALLBACK); |
948 | } else { | 615 | } else { |
949 | set_huge_zero_page(pgtable, mm, vma, | 616 | set_huge_zero_page(pgtable, vma->vm_mm, vma, |
950 | haddr, pmd, | 617 | haddr, fe->pmd, zero_page); |
951 | zero_page); | 618 | spin_unlock(fe->ptl); |
952 | spin_unlock(ptl); | ||
953 | set = true; | 619 | set = true; |
954 | } | 620 | } |
955 | } else | 621 | } else |
956 | spin_unlock(ptl); | 622 | spin_unlock(fe->ptl); |
957 | if (!set) { | 623 | if (!set) { |
958 | pte_free(mm, pgtable); | 624 | pte_free(vma->vm_mm, pgtable); |
959 | put_huge_zero_page(); | 625 | put_huge_zero_page(); |
960 | } | 626 | } |
961 | return ret; | 627 | return ret; |
@@ -967,8 +633,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
967 | return VM_FAULT_FALLBACK; | 633 | return VM_FAULT_FALLBACK; |
968 | } | 634 | } |
969 | prep_transhuge_page(page); | 635 | prep_transhuge_page(page); |
970 | return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, | 636 | return __do_huge_pmd_anonymous_page(fe, page, gfp); |
971 | flags); | ||
972 | } | 637 | } |
973 | 638 | ||
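The hunks above convert the anonymous-THP fault paths from long (mm, vma, address, pmd, flags) argument lists to a single struct fault_env cursor. The sketch below shows the shape of that refactoring in isolation; the struct and its fields here are illustrative stand-ins, not the kernel's definitions.

	#include <stdio.h>

	/* illustrative stand-in for the kernel's fault cursor */
	struct fault_env {
		void *vma;		/* faulting VMA */
		unsigned long address;	/* faulting address */
		unsigned int flags;	/* FAULT_FLAG_*-style bits */
		void *pmd;		/* pmd entry being handled */
	};

	/* before: every helper repeats the full parameter list */
	static int handle_fault_old(void *vma, unsigned long address,
				    unsigned int flags, void *pmd)
	{
		(void)vma; (void)pmd;
		printf("old style: fault at %#lx, flags %#x\n", address, flags);
		return 0;
	}

	/* after: helpers take one cursor and read the fields they need */
	static int handle_fault_new(struct fault_env *fe)
	{
		printf("new style: fault at %#lx, flags %#x\n", fe->address, fe->flags);
		return 0;
	}

	int main(void)
	{
		struct fault_env fe = { .address = 0x200000, .flags = 0x1 };

		handle_fault_old(fe.vma, fe.address, fe.flags, fe.pmd);
		handle_fault_new(&fe);
		return 0;
	}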
974 | static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, | 639 | static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, |
@@ -1080,14 +745,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1080 | struct page *src_page; | 745 | struct page *src_page; |
1081 | pmd_t pmd; | 746 | pmd_t pmd; |
1082 | pgtable_t pgtable = NULL; | 747 | pgtable_t pgtable = NULL; |
1083 | int ret; | 748 | int ret = -ENOMEM; |
1084 | 749 | ||
1085 | if (!vma_is_dax(vma)) { | 750 | /* Skip if it can be re-filled on fault */ |
1086 | ret = -ENOMEM; | 751 | if (!vma_is_anonymous(vma)) |
1087 | pgtable = pte_alloc_one(dst_mm, addr); | 752 | return 0; |
1088 | if (unlikely(!pgtable)) | 753 | |
1089 | goto out; | 754 | pgtable = pte_alloc_one(dst_mm, addr); |
1090 | } | 755 | if (unlikely(!pgtable)) |
756 | goto out; | ||
1091 | 757 | ||
1092 | dst_ptl = pmd_lock(dst_mm, dst_pmd); | 758 | dst_ptl = pmd_lock(dst_mm, dst_pmd); |
1093 | src_ptl = pmd_lockptr(src_mm, src_pmd); | 759 | src_ptl = pmd_lockptr(src_mm, src_pmd); |
@@ -1095,7 +761,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1095 | 761 | ||
1096 | ret = -EAGAIN; | 762 | ret = -EAGAIN; |
1097 | pmd = *src_pmd; | 763 | pmd = *src_pmd; |
1098 | if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) { | 764 | if (unlikely(!pmd_trans_huge(pmd))) { |
1099 | pte_free(dst_mm, pgtable); | 765 | pte_free(dst_mm, pgtable); |
1100 | goto out_unlock; | 766 | goto out_unlock; |
1101 | } | 767 | } |
@@ -1118,16 +784,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1118 | goto out_unlock; | 784 | goto out_unlock; |
1119 | } | 785 | } |
1120 | 786 | ||
1121 | if (!vma_is_dax(vma)) { | 787 | src_page = pmd_page(pmd); |
1122 | /* thp accounting separate from pmd_devmap accounting */ | 788 | VM_BUG_ON_PAGE(!PageHead(src_page), src_page); |
1123 | src_page = pmd_page(pmd); | 789 | get_page(src_page); |
1124 | VM_BUG_ON_PAGE(!PageHead(src_page), src_page); | 790 | page_dup_rmap(src_page, true); |
1125 | get_page(src_page); | 791 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
1126 | page_dup_rmap(src_page, true); | 792 | atomic_long_inc(&dst_mm->nr_ptes); |
1127 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); | 793 | pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); |
1128 | atomic_long_inc(&dst_mm->nr_ptes); | ||
1129 | pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); | ||
1130 | } | ||
1131 | 794 | ||
1132 | pmdp_set_wrprotect(src_mm, addr, src_pmd); | 795 | pmdp_set_wrprotect(src_mm, addr, src_pmd); |
1133 | pmd = pmd_mkold(pmd_wrprotect(pmd)); | 796 | pmd = pmd_mkold(pmd_wrprotect(pmd)); |
@@ -1141,38 +804,31 @@ out: | |||
1141 | return ret; | 804 | return ret; |
1142 | } | 805 | } |
1143 | 806 | ||
1144 | void huge_pmd_set_accessed(struct mm_struct *mm, | 807 | void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) |
1145 | struct vm_area_struct *vma, | ||
1146 | unsigned long address, | ||
1147 | pmd_t *pmd, pmd_t orig_pmd, | ||
1148 | int dirty) | ||
1149 | { | 808 | { |
1150 | spinlock_t *ptl; | ||
1151 | pmd_t entry; | 809 | pmd_t entry; |
1152 | unsigned long haddr; | 810 | unsigned long haddr; |
1153 | 811 | ||
1154 | ptl = pmd_lock(mm, pmd); | 812 | fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); |
1155 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 813 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) |
1156 | goto unlock; | 814 | goto unlock; |
1157 | 815 | ||
1158 | entry = pmd_mkyoung(orig_pmd); | 816 | entry = pmd_mkyoung(orig_pmd); |
1159 | haddr = address & HPAGE_PMD_MASK; | 817 | haddr = fe->address & HPAGE_PMD_MASK; |
1160 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) | 818 | if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, |
1161 | update_mmu_cache_pmd(vma, address, pmd); | 819 | fe->flags & FAULT_FLAG_WRITE)) |
820 | update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); | ||
1162 | 821 | ||
1163 | unlock: | 822 | unlock: |
1164 | spin_unlock(ptl); | 823 | spin_unlock(fe->ptl); |
1165 | } | 824 | } |
1166 | 825 | ||
1167 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | 826 | static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, |
1168 | struct vm_area_struct *vma, | 827 | struct page *page) |
1169 | unsigned long address, | ||
1170 | pmd_t *pmd, pmd_t orig_pmd, | ||
1171 | struct page *page, | ||
1172 | unsigned long haddr) | ||
1173 | { | 828 | { |
829 | struct vm_area_struct *vma = fe->vma; | ||
830 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | ||
1174 | struct mem_cgroup *memcg; | 831 | struct mem_cgroup *memcg; |
1175 | spinlock_t *ptl; | ||
1176 | pgtable_t pgtable; | 832 | pgtable_t pgtable; |
1177 | pmd_t _pmd; | 833 | pmd_t _pmd; |
1178 | int ret = 0, i; | 834 | int ret = 0, i; |
@@ -1189,11 +845,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
1189 | 845 | ||
1190 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 846 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
1191 | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | | 847 | pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | |
1192 | __GFP_OTHER_NODE, | 848 | __GFP_OTHER_NODE, vma, |
1193 | vma, address, page_to_nid(page)); | 849 | fe->address, page_to_nid(page)); |
1194 | if (unlikely(!pages[i] || | 850 | if (unlikely(!pages[i] || |
1195 | mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, | 851 | mem_cgroup_try_charge(pages[i], vma->vm_mm, |
1196 | &memcg, false))) { | 852 | GFP_KERNEL, &memcg, false))) { |
1197 | if (pages[i]) | 853 | if (pages[i]) |
1198 | put_page(pages[i]); | 854 | put_page(pages[i]); |
1199 | while (--i >= 0) { | 855 | while (--i >= 0) { |
@@ -1219,41 +875,41 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
1219 | 875 | ||
1220 | mmun_start = haddr; | 876 | mmun_start = haddr; |
1221 | mmun_end = haddr + HPAGE_PMD_SIZE; | 877 | mmun_end = haddr + HPAGE_PMD_SIZE; |
1222 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 878 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); |
1223 | 879 | ||
1224 | ptl = pmd_lock(mm, pmd); | 880 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); |
1225 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 881 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) |
1226 | goto out_free_pages; | 882 | goto out_free_pages; |
1227 | VM_BUG_ON_PAGE(!PageHead(page), page); | 883 | VM_BUG_ON_PAGE(!PageHead(page), page); |
1228 | 884 | ||
1229 | pmdp_huge_clear_flush_notify(vma, haddr, pmd); | 885 | pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); |
1230 | /* leave pmd empty until pte is filled */ | 886 | /* leave pmd empty until pte is filled */ |
1231 | 887 | ||
1232 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | 888 | pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); |
1233 | pmd_populate(mm, &_pmd, pgtable); | 889 | pmd_populate(vma->vm_mm, &_pmd, pgtable); |
1234 | 890 | ||
1235 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | 891 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { |
1236 | pte_t *pte, entry; | 892 | pte_t entry; |
1237 | entry = mk_pte(pages[i], vma->vm_page_prot); | 893 | entry = mk_pte(pages[i], vma->vm_page_prot); |
1238 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 894 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1239 | memcg = (void *)page_private(pages[i]); | 895 | memcg = (void *)page_private(pages[i]); |
1240 | set_page_private(pages[i], 0); | 896 | set_page_private(pages[i], 0); |
1241 | page_add_new_anon_rmap(pages[i], vma, haddr, false); | 897 | page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); |
1242 | mem_cgroup_commit_charge(pages[i], memcg, false, false); | 898 | mem_cgroup_commit_charge(pages[i], memcg, false, false); |
1243 | lru_cache_add_active_or_unevictable(pages[i], vma); | 899 | lru_cache_add_active_or_unevictable(pages[i], vma); |
1244 | pte = pte_offset_map(&_pmd, haddr); | 900 | fe->pte = pte_offset_map(&_pmd, haddr); |
1245 | VM_BUG_ON(!pte_none(*pte)); | 901 | VM_BUG_ON(!pte_none(*fe->pte)); |
1246 | set_pte_at(mm, haddr, pte, entry); | 902 | set_pte_at(vma->vm_mm, haddr, fe->pte, entry); |
1247 | pte_unmap(pte); | 903 | pte_unmap(fe->pte); |
1248 | } | 904 | } |
1249 | kfree(pages); | 905 | kfree(pages); |
1250 | 906 | ||
1251 | smp_wmb(); /* make pte visible before pmd */ | 907 | smp_wmb(); /* make pte visible before pmd */ |
1252 | pmd_populate(mm, pmd, pgtable); | 908 | pmd_populate(vma->vm_mm, fe->pmd, pgtable); |
1253 | page_remove_rmap(page, true); | 909 | page_remove_rmap(page, true); |
1254 | spin_unlock(ptl); | 910 | spin_unlock(fe->ptl); |
1255 | 911 | ||
1256 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 912 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
1257 | 913 | ||
1258 | ret |= VM_FAULT_WRITE; | 914 | ret |= VM_FAULT_WRITE; |
1259 | put_page(page); | 915 | put_page(page); |
@@ -1262,8 +918,8 @@ out: | |||
1262 | return ret; | 918 | return ret; |
1263 | 919 | ||
1264 | out_free_pages: | 920 | out_free_pages: |
1265 | spin_unlock(ptl); | 921 | spin_unlock(fe->ptl); |
1266 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 922 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
1267 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 923 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
1268 | memcg = (void *)page_private(pages[i]); | 924 | memcg = (void *)page_private(pages[i]); |
1269 | set_page_private(pages[i], 0); | 925 | set_page_private(pages[i], 0); |
@@ -1274,25 +930,23 @@ out_free_pages: | |||
1274 | goto out; | 930 | goto out; |
1275 | } | 931 | } |
1276 | 932 | ||
1277 | int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 933 | int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) |
1278 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) | ||
1279 | { | 934 | { |
1280 | spinlock_t *ptl; | 935 | struct vm_area_struct *vma = fe->vma; |
1281 | int ret = 0; | ||
1282 | struct page *page = NULL, *new_page; | 936 | struct page *page = NULL, *new_page; |
1283 | struct mem_cgroup *memcg; | 937 | struct mem_cgroup *memcg; |
1284 | unsigned long haddr; | 938 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; |
1285 | unsigned long mmun_start; /* For mmu_notifiers */ | 939 | unsigned long mmun_start; /* For mmu_notifiers */ |
1286 | unsigned long mmun_end; /* For mmu_notifiers */ | 940 | unsigned long mmun_end; /* For mmu_notifiers */ |
1287 | gfp_t huge_gfp; /* for allocation and charge */ | 941 | gfp_t huge_gfp; /* for allocation and charge */ |
942 | int ret = 0; | ||
1288 | 943 | ||
1289 | ptl = pmd_lockptr(mm, pmd); | 944 | fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); |
1290 | VM_BUG_ON_VMA(!vma->anon_vma, vma); | 945 | VM_BUG_ON_VMA(!vma->anon_vma, vma); |
1291 | haddr = address & HPAGE_PMD_MASK; | ||
1292 | if (is_huge_zero_pmd(orig_pmd)) | 946 | if (is_huge_zero_pmd(orig_pmd)) |
1293 | goto alloc; | 947 | goto alloc; |
1294 | spin_lock(ptl); | 948 | spin_lock(fe->ptl); |
1295 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 949 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) |
1296 | goto out_unlock; | 950 | goto out_unlock; |
1297 | 951 | ||
1298 | page = pmd_page(orig_pmd); | 952 | page = pmd_page(orig_pmd); |
@@ -1305,13 +959,13 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1305 | pmd_t entry; | 959 | pmd_t entry; |
1306 | entry = pmd_mkyoung(orig_pmd); | 960 | entry = pmd_mkyoung(orig_pmd); |
1307 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 961 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
1308 | if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) | 962 | if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) |
1309 | update_mmu_cache_pmd(vma, address, pmd); | 963 | update_mmu_cache_pmd(vma, fe->address, fe->pmd); |
1310 | ret |= VM_FAULT_WRITE; | 964 | ret |= VM_FAULT_WRITE; |
1311 | goto out_unlock; | 965 | goto out_unlock; |
1312 | } | 966 | } |
1313 | get_page(page); | 967 | get_page(page); |
1314 | spin_unlock(ptl); | 968 | spin_unlock(fe->ptl); |
1315 | alloc: | 969 | alloc: |
1316 | if (transparent_hugepage_enabled(vma) && | 970 | if (transparent_hugepage_enabled(vma) && |
1317 | !transparent_hugepage_debug_cow()) { | 971 | !transparent_hugepage_debug_cow()) { |
@@ -1324,13 +978,12 @@ alloc: | |||
1324 | prep_transhuge_page(new_page); | 978 | prep_transhuge_page(new_page); |
1325 | } else { | 979 | } else { |
1326 | if (!page) { | 980 | if (!page) { |
1327 | split_huge_pmd(vma, pmd, address); | 981 | split_huge_pmd(vma, fe->pmd, fe->address); |
1328 | ret |= VM_FAULT_FALLBACK; | 982 | ret |= VM_FAULT_FALLBACK; |
1329 | } else { | 983 | } else { |
1330 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | 984 | ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); |
1331 | pmd, orig_pmd, page, haddr); | ||
1332 | if (ret & VM_FAULT_OOM) { | 985 | if (ret & VM_FAULT_OOM) { |
1333 | split_huge_pmd(vma, pmd, address); | 986 | split_huge_pmd(vma, fe->pmd, fe->address); |
1334 | ret |= VM_FAULT_FALLBACK; | 987 | ret |= VM_FAULT_FALLBACK; |
1335 | } | 988 | } |
1336 | put_page(page); | 989 | put_page(page); |
@@ -1339,14 +992,12 @@ alloc: | |||
1339 | goto out; | 992 | goto out; |
1340 | } | 993 | } |
1341 | 994 | ||
1342 | if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg, | 995 | if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, |
1343 | true))) { | 996 | huge_gfp, &memcg, true))) { |
1344 | put_page(new_page); | 997 | put_page(new_page); |
1345 | if (page) { | 998 | split_huge_pmd(vma, fe->pmd, fe->address); |
1346 | split_huge_pmd(vma, pmd, address); | 999 | if (page) |
1347 | put_page(page); | 1000 | put_page(page); |
1348 | } else | ||
1349 | split_huge_pmd(vma, pmd, address); | ||
1350 | ret |= VM_FAULT_FALLBACK; | 1001 | ret |= VM_FAULT_FALLBACK; |
1351 | count_vm_event(THP_FAULT_FALLBACK); | 1002 | count_vm_event(THP_FAULT_FALLBACK); |
1352 | goto out; | 1003 | goto out; |
@@ -1362,13 +1013,13 @@ alloc: | |||
1362 | 1013 | ||
1363 | mmun_start = haddr; | 1014 | mmun_start = haddr; |
1364 | mmun_end = haddr + HPAGE_PMD_SIZE; | 1015 | mmun_end = haddr + HPAGE_PMD_SIZE; |
1365 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1016 | mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); |
1366 | 1017 | ||
1367 | spin_lock(ptl); | 1018 | spin_lock(fe->ptl); |
1368 | if (page) | 1019 | if (page) |
1369 | put_page(page); | 1020 | put_page(page); |
1370 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 1021 | if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { |
1371 | spin_unlock(ptl); | 1022 | spin_unlock(fe->ptl); |
1372 | mem_cgroup_cancel_charge(new_page, memcg, true); | 1023 | mem_cgroup_cancel_charge(new_page, memcg, true); |
1373 | put_page(new_page); | 1024 | put_page(new_page); |
1374 | goto out_mn; | 1025 | goto out_mn; |
@@ -1376,14 +1027,14 @@ alloc: | |||
1376 | pmd_t entry; | 1027 | pmd_t entry; |
1377 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); | 1028 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); |
1378 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1029 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
1379 | pmdp_huge_clear_flush_notify(vma, haddr, pmd); | 1030 | pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); |
1380 | page_add_new_anon_rmap(new_page, vma, haddr, true); | 1031 | page_add_new_anon_rmap(new_page, vma, haddr, true); |
1381 | mem_cgroup_commit_charge(new_page, memcg, false, true); | 1032 | mem_cgroup_commit_charge(new_page, memcg, false, true); |
1382 | lru_cache_add_active_or_unevictable(new_page, vma); | 1033 | lru_cache_add_active_or_unevictable(new_page, vma); |
1383 | set_pmd_at(mm, haddr, pmd, entry); | 1034 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); |
1384 | update_mmu_cache_pmd(vma, address, pmd); | 1035 | update_mmu_cache_pmd(vma, fe->address, fe->pmd); |
1385 | if (!page) { | 1036 | if (!page) { |
1386 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); | 1037 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
1387 | put_huge_zero_page(); | 1038 | put_huge_zero_page(); |
1388 | } else { | 1039 | } else { |
1389 | VM_BUG_ON_PAGE(!PageHead(page), page); | 1040 | VM_BUG_ON_PAGE(!PageHead(page), page); |
@@ -1392,13 +1043,13 @@ alloc: | |||
1392 | } | 1043 | } |
1393 | ret |= VM_FAULT_WRITE; | 1044 | ret |= VM_FAULT_WRITE; |
1394 | } | 1045 | } |
1395 | spin_unlock(ptl); | 1046 | spin_unlock(fe->ptl); |
1396 | out_mn: | 1047 | out_mn: |
1397 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1048 | mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); |
1398 | out: | 1049 | out: |
1399 | return ret; | 1050 | return ret; |
1400 | out_unlock: | 1051 | out_unlock: |
1401 | spin_unlock(ptl); | 1052 | spin_unlock(fe->ptl); |
1402 | return ret; | 1053 | return ret; |
1403 | } | 1054 | } |
1404 | 1055 | ||
@@ -1432,6 +1083,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
1432 | * We don't mlock() pte-mapped THPs. This way we can avoid | 1083 | * We don't mlock() pte-mapped THPs. This way we can avoid |
1433 | * leaking mlocked pages into non-VM_LOCKED VMAs. | 1084 | * leaking mlocked pages into non-VM_LOCKED VMAs. |
1434 | * | 1085 | * |
1086 | * For anon THP: | ||
1087 | * | ||
1435 | * In most cases the pmd is the only mapping of the page as we | 1088 | * In most cases the pmd is the only mapping of the page as we |
1436 | * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for | 1089 | * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for |
1437 | * writable private mappings in populate_vma_page_range(). | 1090 | * writable private mappings in populate_vma_page_range(). |
@@ -1439,15 +1092,26 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
1439 | * The only scenario in which we have the page shared here is if we are | 1092 | * The only scenario in which we have the page shared here is if we are |
1440 | * mlocking a read-only mapping shared over fork(). We skip | 1093 | * mlocking a read-only mapping shared over fork(). We skip |
1441 | * mlocking such pages. | 1094 | * mlocking such pages. |
1095 | * | ||
1096 | * For file THP: | ||
1097 | * | ||
1098 | * We can expect PageDoubleMap() to be stable under page lock: | ||
1099 | * for file pages we set it in page_add_file_rmap(), which | ||
1100 | * requires page to be locked. | ||
1442 | */ | 1101 | */ |
1443 | if (compound_mapcount(page) == 1 && !PageDoubleMap(page) && | 1102 | |
1444 | page->mapping && trylock_page(page)) { | 1103 | if (PageAnon(page) && compound_mapcount(page) != 1) |
1445 | lru_add_drain(); | 1104 | goto skip_mlock; |
1446 | if (page->mapping) | 1105 | if (PageDoubleMap(page) || !page->mapping) |
1447 | mlock_vma_page(page); | 1106 | goto skip_mlock; |
1448 | unlock_page(page); | 1107 | if (!trylock_page(page)) |
1449 | } | 1108 | goto skip_mlock; |
1109 | lru_add_drain(); | ||
1110 | if (page->mapping && !PageDoubleMap(page)) | ||
1111 | mlock_vma_page(page); | ||
1112 | unlock_page(page); | ||
1450 | } | 1113 | } |
1114 | skip_mlock: | ||
1451 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | 1115 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
1452 | VM_BUG_ON_PAGE(!PageCompound(page), page); | 1116 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
1453 | if (flags & FOLL_GET) | 1117 | if (flags & FOLL_GET) |
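The follow_trans_huge_pmd() change above replaces one deeply nested mlock condition with a series of early goto skip_mlock exits followed by a common tail. A self-contained sketch of that control-flow transformation, with made-up predicates standing in for the page checks:

	#include <stdbool.h>
	#include <stdio.h>

	/* made-up predicates standing in for the PageAnon/PageDoubleMap/trylock tests */
	static bool page_is_anon(void)       { return true; }
	static bool mapped_only_once(void)   { return true; }
	static bool page_double_mapped(void) { return false; }
	static bool try_lock_page(void)      { return true; }

	static void maybe_mlock_page(void)
	{
		/* each disqualifying condition bails out instead of nesting further */
		if (page_is_anon() && !mapped_only_once())
			goto skip_mlock;
		if (page_double_mapped())
			goto skip_mlock;
		if (!try_lock_page())
			goto skip_mlock;

		printf("page mlocked\n");	/* stands in for mlock_vma_page() + unlock */
	skip_mlock:
		printf("common tail runs either way\n");
	}

	int main(void)
	{
		maybe_mlock_page();
		return 0;
	}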
@@ -1458,13 +1122,12 @@ out: | |||
1458 | } | 1122 | } |
1459 | 1123 | ||
1460 | /* NUMA hinting page fault entry point for trans huge pmds */ | 1124 | /* NUMA hinting page fault entry point for trans huge pmds */ |
1461 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1125 | int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) |
1462 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | ||
1463 | { | 1126 | { |
1464 | spinlock_t *ptl; | 1127 | struct vm_area_struct *vma = fe->vma; |
1465 | struct anon_vma *anon_vma = NULL; | 1128 | struct anon_vma *anon_vma = NULL; |
1466 | struct page *page; | 1129 | struct page *page; |
1467 | unsigned long haddr = addr & HPAGE_PMD_MASK; | 1130 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; |
1468 | int page_nid = -1, this_nid = numa_node_id(); | 1131 | int page_nid = -1, this_nid = numa_node_id(); |
1469 | int target_nid, last_cpupid = -1; | 1132 | int target_nid, last_cpupid = -1; |
1470 | bool page_locked; | 1133 | bool page_locked; |
@@ -1475,8 +1138,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1475 | /* A PROT_NONE fault should not end up here */ | 1138 | /* A PROT_NONE fault should not end up here */ |
1476 | BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); | 1139 | BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); |
1477 | 1140 | ||
1478 | ptl = pmd_lock(mm, pmdp); | 1141 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); |
1479 | if (unlikely(!pmd_same(pmd, *pmdp))) | 1142 | if (unlikely(!pmd_same(pmd, *fe->pmd))) |
1480 | goto out_unlock; | 1143 | goto out_unlock; |
1481 | 1144 | ||
1482 | /* | 1145 | /* |
@@ -1484,9 +1147,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1484 | * without disrupting NUMA hinting information. Do not relock and | 1147 | * without disrupting NUMA hinting information. Do not relock and |
1485 | * check_same as the page may no longer be mapped. | 1148 | * check_same as the page may no longer be mapped. |
1486 | */ | 1149 | */ |
1487 | if (unlikely(pmd_trans_migrating(*pmdp))) { | 1150 | if (unlikely(pmd_trans_migrating(*fe->pmd))) { |
1488 | page = pmd_page(*pmdp); | 1151 | page = pmd_page(*fe->pmd); |
1489 | spin_unlock(ptl); | 1152 | spin_unlock(fe->ptl); |
1490 | wait_on_page_locked(page); | 1153 | wait_on_page_locked(page); |
1491 | goto out; | 1154 | goto out; |
1492 | } | 1155 | } |
@@ -1519,7 +1182,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1519 | 1182 | ||
1520 | /* Migration could have started since the pmd_trans_migrating check */ | 1183 | /* Migration could have started since the pmd_trans_migrating check */ |
1521 | if (!page_locked) { | 1184 | if (!page_locked) { |
1522 | spin_unlock(ptl); | 1185 | spin_unlock(fe->ptl); |
1523 | wait_on_page_locked(page); | 1186 | wait_on_page_locked(page); |
1524 | page_nid = -1; | 1187 | page_nid = -1; |
1525 | goto out; | 1188 | goto out; |
@@ -1530,12 +1193,12 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1530 | * to serialise splits | 1193 | * to serialise splits |
1531 | */ | 1194 | */ |
1532 | get_page(page); | 1195 | get_page(page); |
1533 | spin_unlock(ptl); | 1196 | spin_unlock(fe->ptl); |
1534 | anon_vma = page_lock_anon_vma_read(page); | 1197 | anon_vma = page_lock_anon_vma_read(page); |
1535 | 1198 | ||
1536 | /* Confirm the PMD did not change while page_table_lock was released */ | 1199 | /* Confirm the PMD did not change while page_table_lock was released */ |
1537 | spin_lock(ptl); | 1200 | spin_lock(fe->ptl); |
1538 | if (unlikely(!pmd_same(pmd, *pmdp))) { | 1201 | if (unlikely(!pmd_same(pmd, *fe->pmd))) { |
1539 | unlock_page(page); | 1202 | unlock_page(page); |
1540 | put_page(page); | 1203 | put_page(page); |
1541 | page_nid = -1; | 1204 | page_nid = -1; |
@@ -1553,9 +1216,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1553 | * Migrate the THP to the requested node, returns with page unlocked | 1216 | * Migrate the THP to the requested node, returns with page unlocked |
1554 | * and access rights restored. | 1217 | * and access rights restored. |
1555 | */ | 1218 | */ |
1556 | spin_unlock(ptl); | 1219 | spin_unlock(fe->ptl); |
1557 | migrated = migrate_misplaced_transhuge_page(mm, vma, | 1220 | migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, |
1558 | pmdp, pmd, addr, page, target_nid); | 1221 | fe->pmd, pmd, fe->address, page, target_nid); |
1559 | if (migrated) { | 1222 | if (migrated) { |
1560 | flags |= TNF_MIGRATED; | 1223 | flags |= TNF_MIGRATED; |
1561 | page_nid = target_nid; | 1224 | page_nid = target_nid; |
@@ -1570,18 +1233,18 @@ clear_pmdnuma: | |||
1570 | pmd = pmd_mkyoung(pmd); | 1233 | pmd = pmd_mkyoung(pmd); |
1571 | if (was_writable) | 1234 | if (was_writable) |
1572 | pmd = pmd_mkwrite(pmd); | 1235 | pmd = pmd_mkwrite(pmd); |
1573 | set_pmd_at(mm, haddr, pmdp, pmd); | 1236 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); |
1574 | update_mmu_cache_pmd(vma, addr, pmdp); | 1237 | update_mmu_cache_pmd(vma, fe->address, fe->pmd); |
1575 | unlock_page(page); | 1238 | unlock_page(page); |
1576 | out_unlock: | 1239 | out_unlock: |
1577 | spin_unlock(ptl); | 1240 | spin_unlock(fe->ptl); |
1578 | 1241 | ||
1579 | out: | 1242 | out: |
1580 | if (anon_vma) | 1243 | if (anon_vma) |
1581 | page_unlock_anon_vma_read(anon_vma); | 1244 | page_unlock_anon_vma_read(anon_vma); |
1582 | 1245 | ||
1583 | if (page_nid != -1) | 1246 | if (page_nid != -1) |
1584 | task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); | 1247 | task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); |
1585 | 1248 | ||
1586 | return 0; | 1249 | return 0; |
1587 | } | 1250 | } |
@@ -1684,12 +1347,18 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1684 | struct page *page = pmd_page(orig_pmd); | 1347 | struct page *page = pmd_page(orig_pmd); |
1685 | page_remove_rmap(page, true); | 1348 | page_remove_rmap(page, true); |
1686 | VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); | 1349 | VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); |
1687 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | ||
1688 | VM_BUG_ON_PAGE(!PageHead(page), page); | 1350 | VM_BUG_ON_PAGE(!PageHead(page), page); |
1689 | pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); | 1351 | if (PageAnon(page)) { |
1690 | atomic_long_dec(&tlb->mm->nr_ptes); | 1352 | pgtable_t pgtable; |
1353 | pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); | ||
1354 | pte_free(tlb->mm, pgtable); | ||
1355 | atomic_long_dec(&tlb->mm->nr_ptes); | ||
1356 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | ||
1357 | } else { | ||
1358 | add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); | ||
1359 | } | ||
1691 | spin_unlock(ptl); | 1360 | spin_unlock(ptl); |
1692 | tlb_remove_page(tlb, page); | 1361 | tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); |
1693 | } | 1362 | } |
1694 | return 1; | 1363 | return 1; |
1695 | } | 1364 | } |
@@ -1779,7 +1448,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1779 | entry = pmd_mkwrite(entry); | 1448 | entry = pmd_mkwrite(entry); |
1780 | ret = HPAGE_PMD_NR; | 1449 | ret = HPAGE_PMD_NR; |
1781 | set_pmd_at(mm, addr, pmd, entry); | 1450 | set_pmd_at(mm, addr, pmd, entry); |
1782 | BUG_ON(!preserve_write && pmd_write(entry)); | 1451 | BUG_ON(vma_is_anonymous(vma) && !preserve_write && |
1452 | pmd_write(entry)); | ||
1783 | } | 1453 | } |
1784 | spin_unlock(ptl); | 1454 | spin_unlock(ptl); |
1785 | } | 1455 | } |
@@ -1788,10 +1458,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1788 | } | 1458 | } |
1789 | 1459 | ||
1790 | /* | 1460 | /* |
1791 | * Returns true if a given pmd maps a thp, false otherwise. | 1461 | * Returns the page table lock pointer if a given pmd maps a thp, NULL otherwise. |
1792 | * | 1462 | * |
1793 | * Note that if it returns true, this routine returns without unlocking page | 1463 | * Note that if it returns a page table lock pointer, this routine returns without |
1794 | * table lock. So callers must unlock it. | 1464 | * unlocking the page table lock, so callers must unlock it. |
1795 | */ | 1465 | */ |
1796 | spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) | 1466 | spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) |
1797 | { | 1467 | { |
@@ -1803,1040 +1473,6 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) | |||
1803 | return NULL; | 1473 | return NULL; |
1804 | } | 1474 | } |
1805 | 1475 | ||
1806 | #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) | ||
1807 | |||
1808 | int hugepage_madvise(struct vm_area_struct *vma, | ||
1809 | unsigned long *vm_flags, int advice) | ||
1810 | { | ||
1811 | switch (advice) { | ||
1812 | case MADV_HUGEPAGE: | ||
1813 | #ifdef CONFIG_S390 | ||
1814 | /* | ||
1815 | * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 | ||
1816 | * can't handle this properly after s390_enable_sie, so we simply | ||
1817 | * ignore the madvise to prevent qemu from causing a SIGSEGV. | ||
1818 | */ | ||
1819 | if (mm_has_pgste(vma->vm_mm)) | ||
1820 | return 0; | ||
1821 | #endif | ||
1822 | /* | ||
1823 | * Be somewhat over-protective like KSM for now! | ||
1824 | */ | ||
1825 | if (*vm_flags & VM_NO_THP) | ||
1826 | return -EINVAL; | ||
1827 | *vm_flags &= ~VM_NOHUGEPAGE; | ||
1828 | *vm_flags |= VM_HUGEPAGE; | ||
1829 | /* | ||
1830 | * If the vma becomes good for khugepaged to scan, | ||
1831 | * register it here without waiting for a page fault that | ||
1832 | * may not happen any time soon. | ||
1833 | */ | ||
1834 | if (unlikely(khugepaged_enter_vma_merge(vma, *vm_flags))) | ||
1835 | return -ENOMEM; | ||
1836 | break; | ||
1837 | case MADV_NOHUGEPAGE: | ||
1838 | /* | ||
1839 | * Be somewhat over-protective like KSM for now! | ||
1840 | */ | ||
1841 | if (*vm_flags & VM_NO_THP) | ||
1842 | return -EINVAL; | ||
1843 | *vm_flags &= ~VM_HUGEPAGE; | ||
1844 | *vm_flags |= VM_NOHUGEPAGE; | ||
1845 | /* | ||
1846 | * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning | ||
1847 | * this vma even if the mm remains registered in khugepaged because | ||
1848 | * it got registered before VM_NOHUGEPAGE was set. | ||
1849 | */ | ||
1850 | break; | ||
1851 | } | ||
1852 | |||
1853 | return 0; | ||
1854 | } | ||
1855 | |||
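hugepage_madvise(), now moved out of this file, is the kernel side of MADV_HUGEPAGE and MADV_NOHUGEPAGE. A minimal userspace caller is sketched below; the 2 MiB alignment is an assumption about the huge page size, and the madvise() call may legitimately fail where THP is disabled.

	#define _DEFAULT_SOURCE
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h>

	int main(void)
	{
		const size_t len = 16UL << 20;		/* 16 MiB region */
		const size_t align = 2UL << 20;		/* assumed 2 MiB huge page size */
		void *p;

		/* huge-page-aligned anonymous memory gives khugepaged something to work on */
		if (posix_memalign(&p, align, len) != 0) {
			perror("posix_memalign");
			return 1;
		}

		if (madvise(p, len, MADV_HUGEPAGE) != 0)
			perror("madvise(MADV_HUGEPAGE)");

		/* touch the range so the fault and collapse paths actually see it */
		for (size_t off = 0; off < len; off += 4096)
			((char *)p)[off] = 1;

		printf("advised %zu bytes at %p\n", len, p);
		free(p);
		return 0;
	}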
1856 | static int __init khugepaged_slab_init(void) | ||
1857 | { | ||
1858 | mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", | ||
1859 | sizeof(struct mm_slot), | ||
1860 | __alignof__(struct mm_slot), 0, NULL); | ||
1861 | if (!mm_slot_cache) | ||
1862 | return -ENOMEM; | ||
1863 | |||
1864 | return 0; | ||
1865 | } | ||
1866 | |||
1867 | static void __init khugepaged_slab_exit(void) | ||
1868 | { | ||
1869 | kmem_cache_destroy(mm_slot_cache); | ||
1870 | } | ||
1871 | |||
1872 | static inline struct mm_slot *alloc_mm_slot(void) | ||
1873 | { | ||
1874 | if (!mm_slot_cache) /* initialization failed */ | ||
1875 | return NULL; | ||
1876 | return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); | ||
1877 | } | ||
1878 | |||
1879 | static inline void free_mm_slot(struct mm_slot *mm_slot) | ||
1880 | { | ||
1881 | kmem_cache_free(mm_slot_cache, mm_slot); | ||
1882 | } | ||
1883 | |||
1884 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | ||
1885 | { | ||
1886 | struct mm_slot *mm_slot; | ||
1887 | |||
1888 | hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) | ||
1889 | if (mm == mm_slot->mm) | ||
1890 | return mm_slot; | ||
1891 | |||
1892 | return NULL; | ||
1893 | } | ||
1894 | |||
1895 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | ||
1896 | struct mm_slot *mm_slot) | ||
1897 | { | ||
1898 | mm_slot->mm = mm; | ||
1899 | hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); | ||
1900 | } | ||
1901 | |||
1902 | static inline int khugepaged_test_exit(struct mm_struct *mm) | ||
1903 | { | ||
1904 | return atomic_read(&mm->mm_users) == 0; | ||
1905 | } | ||
1906 | |||
1907 | int __khugepaged_enter(struct mm_struct *mm) | ||
1908 | { | ||
1909 | struct mm_slot *mm_slot; | ||
1910 | int wakeup; | ||
1911 | |||
1912 | mm_slot = alloc_mm_slot(); | ||
1913 | if (!mm_slot) | ||
1914 | return -ENOMEM; | ||
1915 | |||
1916 | /* __khugepaged_exit() must not run from under us */ | ||
1917 | VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); | ||
1918 | if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { | ||
1919 | free_mm_slot(mm_slot); | ||
1920 | return 0; | ||
1921 | } | ||
1922 | |||
1923 | spin_lock(&khugepaged_mm_lock); | ||
1924 | insert_to_mm_slots_hash(mm, mm_slot); | ||
1925 | /* | ||
1926 | * Insert just behind the scanning cursor, to let the area settle | ||
1927 | * down a little. | ||
1928 | */ | ||
1929 | wakeup = list_empty(&khugepaged_scan.mm_head); | ||
1930 | list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); | ||
1931 | spin_unlock(&khugepaged_mm_lock); | ||
1932 | |||
1933 | atomic_inc(&mm->mm_count); | ||
1934 | if (wakeup) | ||
1935 | wake_up_interruptible(&khugepaged_wait); | ||
1936 | |||
1937 | return 0; | ||
1938 | } | ||
1939 | |||
1940 | int khugepaged_enter_vma_merge(struct vm_area_struct *vma, | ||
1941 | unsigned long vm_flags) | ||
1942 | { | ||
1943 | unsigned long hstart, hend; | ||
1944 | if (!vma->anon_vma) | ||
1945 | /* | ||
1946 | * Not yet faulted in so we will register later in the | ||
1947 | * page fault if needed. | ||
1948 | */ | ||
1949 | return 0; | ||
1950 | if (vma->vm_ops || (vm_flags & VM_NO_THP)) | ||
1951 | /* khugepaged not yet working on file or special mappings */ | ||
1952 | return 0; | ||
1953 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
1954 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
1955 | if (hstart < hend) | ||
1956 | return khugepaged_enter(vma, vm_flags); | ||
1957 | return 0; | ||
1958 | } | ||
1959 | |||
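For reference, the hstart/hend rounding above can be sketched as a standalone userspace program (illustrative only, not kernel code; the 2 MiB PMD size and the sample addresses are assumptions). It shows how a vma is only registered when at least one PMD-aligned, PMD-sized range fits inside it:

    #include <stdio.h>

    #define HPAGE_PMD_SIZE (2UL << 20)              /* assumed: 2 MiB huge pages */
    #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1))

    int main(void)
    {
            unsigned long vm_start = 0x1ff000, vm_end = 0x5ff000;
            /* round the start up and the end down to PMD boundaries */
            unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
            unsigned long hend = vm_end & HPAGE_PMD_MASK;

            /* hstart = 0x200000, hend = 0x400000: one PMD-sized range fits,
             * so such a vma would be registered with khugepaged */
            printf("hstart=%#lx hend=%#lx eligible=%d\n",
                   hstart, hend, hstart < hend);
            return 0;
    }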
1960 | void __khugepaged_exit(struct mm_struct *mm) | ||
1961 | { | ||
1962 | struct mm_slot *mm_slot; | ||
1963 | int free = 0; | ||
1964 | |||
1965 | spin_lock(&khugepaged_mm_lock); | ||
1966 | mm_slot = get_mm_slot(mm); | ||
1967 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { | ||
1968 | hash_del(&mm_slot->hash); | ||
1969 | list_del(&mm_slot->mm_node); | ||
1970 | free = 1; | ||
1971 | } | ||
1972 | spin_unlock(&khugepaged_mm_lock); | ||
1973 | |||
1974 | if (free) { | ||
1975 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | ||
1976 | free_mm_slot(mm_slot); | ||
1977 | mmdrop(mm); | ||
1978 | } else if (mm_slot) { | ||
1979 | /* | ||
1980 | * This is required to serialize against | ||
1981 | * khugepaged_test_exit() (which is guaranteed to run | ||
1982 | * under mmap sem read mode). Stop here (after we | ||
1983 | * return, all pagetables will be destroyed) until | ||
1984 | * khugepaged has finished working on the pagetables | ||
1985 | * under the mmap_sem. | ||
1986 | */ | ||
1987 | down_write(&mm->mmap_sem); | ||
1988 | up_write(&mm->mmap_sem); | ||
1989 | } | ||
1990 | } | ||
1991 | |||
1992 | static void release_pte_page(struct page *page) | ||
1993 | { | ||
1994 | /* 0 stands for page_is_file_cache(page) == false */ | ||
1995 | dec_zone_page_state(page, NR_ISOLATED_ANON + 0); | ||
1996 | unlock_page(page); | ||
1997 | putback_lru_page(page); | ||
1998 | } | ||
1999 | |||
2000 | static void release_pte_pages(pte_t *pte, pte_t *_pte) | ||
2001 | { | ||
2002 | while (--_pte >= pte) { | ||
2003 | pte_t pteval = *_pte; | ||
2004 | if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) | ||
2005 | release_pte_page(pte_page(pteval)); | ||
2006 | } | ||
2007 | } | ||
2008 | |||
2009 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | ||
2010 | unsigned long address, | ||
2011 | pte_t *pte) | ||
2012 | { | ||
2013 | struct page *page = NULL; | ||
2014 | pte_t *_pte; | ||
2015 | int none_or_zero = 0, result = 0; | ||
2016 | bool referenced = false, writable = false; | ||
2017 | |||
2018 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | ||
2019 | _pte++, address += PAGE_SIZE) { | ||
2020 | pte_t pteval = *_pte; | ||
2021 | if (pte_none(pteval) || (pte_present(pteval) && | ||
2022 | is_zero_pfn(pte_pfn(pteval)))) { | ||
2023 | if (!userfaultfd_armed(vma) && | ||
2024 | ++none_or_zero <= khugepaged_max_ptes_none) { | ||
2025 | continue; | ||
2026 | } else { | ||
2027 | result = SCAN_EXCEED_NONE_PTE; | ||
2028 | goto out; | ||
2029 | } | ||
2030 | } | ||
2031 | if (!pte_present(pteval)) { | ||
2032 | result = SCAN_PTE_NON_PRESENT; | ||
2033 | goto out; | ||
2034 | } | ||
2035 | page = vm_normal_page(vma, address, pteval); | ||
2036 | if (unlikely(!page)) { | ||
2037 | result = SCAN_PAGE_NULL; | ||
2038 | goto out; | ||
2039 | } | ||
2040 | |||
2041 | VM_BUG_ON_PAGE(PageCompound(page), page); | ||
2042 | VM_BUG_ON_PAGE(!PageAnon(page), page); | ||
2043 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | ||
2044 | |||
2045 | /* | ||
2046 | * We can do it before isolate_lru_page because the | ||
2047 | * page can't be freed from under us. NOTE: PG_lock | ||
2048 | * is needed to serialize against split_huge_page | ||
2049 | * when invoked from the VM. | ||
2050 | */ | ||
2051 | if (!trylock_page(page)) { | ||
2052 | result = SCAN_PAGE_LOCK; | ||
2053 | goto out; | ||
2054 | } | ||
2055 | |||
2056 | /* | ||
2057 | * cannot use mapcount: can't collapse if there's a gup pin. | ||
2058 | * The page must only be referenced by the scanned process | ||
2059 | * and page swap cache. | ||
2060 | */ | ||
2061 | if (page_count(page) != 1 + !!PageSwapCache(page)) { | ||
2062 | unlock_page(page); | ||
2063 | result = SCAN_PAGE_COUNT; | ||
2064 | goto out; | ||
2065 | } | ||
2066 | if (pte_write(pteval)) { | ||
2067 | writable = true; | ||
2068 | } else { | ||
2069 | if (PageSwapCache(page) && | ||
2070 | !reuse_swap_page(page, NULL)) { | ||
2071 | unlock_page(page); | ||
2072 | result = SCAN_SWAP_CACHE_PAGE; | ||
2073 | goto out; | ||
2074 | } | ||
2075 | /* | ||
2076 | * Page is not in the swap cache. It can be collapsed | ||
2077 | * into a THP. | ||
2078 | */ | ||
2079 | } | ||
2080 | |||
2081 | /* | ||
2082 | * Isolate the page to avoid collapsing a hugepage | ||
2083 | * currently in use by the VM. | ||
2084 | */ | ||
2085 | if (isolate_lru_page(page)) { | ||
2086 | unlock_page(page); | ||
2087 | result = SCAN_DEL_PAGE_LRU; | ||
2088 | goto out; | ||
2089 | } | ||
2090 | /* 0 stands for page_is_file_cache(page) == false */ | ||
2091 | inc_zone_page_state(page, NR_ISOLATED_ANON + 0); | ||
2092 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
2093 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
2094 | |||
2095 | /* If there is no young mapped pte, don't collapse the page */ | ||
2096 | if (pte_young(pteval) || | ||
2097 | page_is_young(page) || PageReferenced(page) || | ||
2098 | mmu_notifier_test_young(vma->vm_mm, address)) | ||
2099 | referenced = true; | ||
2100 | } | ||
2101 | if (likely(writable)) { | ||
2102 | if (likely(referenced)) { | ||
2103 | result = SCAN_SUCCEED; | ||
2104 | trace_mm_collapse_huge_page_isolate(page, none_or_zero, | ||
2105 | referenced, writable, result); | ||
2106 | return 1; | ||
2107 | } | ||
2108 | } else { | ||
2109 | result = SCAN_PAGE_RO; | ||
2110 | } | ||
2111 | |||
2112 | out: | ||
2113 | release_pte_pages(pte, _pte); | ||
2114 | trace_mm_collapse_huge_page_isolate(page, none_or_zero, | ||
2115 | referenced, writable, result); | ||
2116 | return 0; | ||
2117 | } | ||
2118 | |||
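The pin-count test in the loop above (page_count(page) != 1 + !!PageSwapCache(page)) reduces to simple arithmetic: one expected reference from the scanned process's pte, plus one if the page also sits in the swap cache. A minimal userspace sketch (illustrative only; the helper is a stand-in, not a kernel API):

    #include <stdbool.h>
    #include <stdio.h>

    /* a collapse candidate may only be pinned by the scanned pte plus,
     * possibly, the swap cache; any extra pin (e.g. gup) blocks collapse */
    static bool pin_count_ok(int page_count, bool in_swap_cache)
    {
            return page_count == 1 + (in_swap_cache ? 1 : 0);
    }

    int main(void)
    {
            printf("%d\n", pin_count_ok(2, true));  /* 1: pte + swap cache only */
            printf("%d\n", pin_count_ok(3, true));  /* 0: extra (gup) pin present */
            return 0;
    }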
2119 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | ||
2120 | struct vm_area_struct *vma, | ||
2121 | unsigned long address, | ||
2122 | spinlock_t *ptl) | ||
2123 | { | ||
2124 | pte_t *_pte; | ||
2125 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { | ||
2126 | pte_t pteval = *_pte; | ||
2127 | struct page *src_page; | ||
2128 | |||
2129 | if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { | ||
2130 | clear_user_highpage(page, address); | ||
2131 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); | ||
2132 | if (is_zero_pfn(pte_pfn(pteval))) { | ||
2133 | /* | ||
2134 | * ptl mostly unnecessary. | ||
2135 | */ | ||
2136 | spin_lock(ptl); | ||
2137 | /* | ||
2138 | * paravirt calls inside pte_clear here are | ||
2139 | * superfluous. | ||
2140 | */ | ||
2141 | pte_clear(vma->vm_mm, address, _pte); | ||
2142 | spin_unlock(ptl); | ||
2143 | } | ||
2144 | } else { | ||
2145 | src_page = pte_page(pteval); | ||
2146 | copy_user_highpage(page, src_page, address, vma); | ||
2147 | VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); | ||
2148 | release_pte_page(src_page); | ||
2149 | /* | ||
2150 | * ptl mostly unnecessary, but preempt has to | ||
2151 | * be disabled to update the per-cpu stats | ||
2152 | * inside page_remove_rmap(). | ||
2153 | */ | ||
2154 | spin_lock(ptl); | ||
2155 | /* | ||
2156 | * paravirt calls inside pte_clear here are | ||
2157 | * superfluous. | ||
2158 | */ | ||
2159 | pte_clear(vma->vm_mm, address, _pte); | ||
2160 | page_remove_rmap(src_page, false); | ||
2161 | spin_unlock(ptl); | ||
2162 | free_page_and_swap_cache(src_page); | ||
2163 | } | ||
2164 | |||
2165 | address += PAGE_SIZE; | ||
2166 | page++; | ||
2167 | } | ||
2168 | } | ||
2169 | |||
2170 | static void khugepaged_alloc_sleep(void) | ||
2171 | { | ||
2172 | DEFINE_WAIT(wait); | ||
2173 | |||
2174 | add_wait_queue(&khugepaged_wait, &wait); | ||
2175 | freezable_schedule_timeout_interruptible( | ||
2176 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); | ||
2177 | remove_wait_queue(&khugepaged_wait, &wait); | ||
2178 | } | ||
2179 | |||
2180 | static int khugepaged_node_load[MAX_NUMNODES]; | ||
2181 | |||
2182 | static bool khugepaged_scan_abort(int nid) | ||
2183 | { | ||
2184 | int i; | ||
2185 | |||
2186 | /* | ||
2187 | * If zone_reclaim_mode is disabled, then no extra effort is made to | ||
2188 | * allocate memory locally. | ||
2189 | */ | ||
2190 | if (!zone_reclaim_mode) | ||
2191 | return false; | ||
2192 | |||
2193 | /* If there is a count for this node already, it must be acceptable */ | ||
2194 | if (khugepaged_node_load[nid]) | ||
2195 | return false; | ||
2196 | |||
2197 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
2198 | if (!khugepaged_node_load[i]) | ||
2199 | continue; | ||
2200 | if (node_distance(nid, i) > RECLAIM_DISTANCE) | ||
2201 | return true; | ||
2202 | } | ||
2203 | return false; | ||
2204 | } | ||
2205 | |||
2206 | #ifdef CONFIG_NUMA | ||
2207 | static int khugepaged_find_target_node(void) | ||
2208 | { | ||
2209 | static int last_khugepaged_target_node = NUMA_NO_NODE; | ||
2210 | int nid, target_node = 0, max_value = 0; | ||
2211 | |||
2212 | /* find first node with max normal pages hit */ | ||
2213 | for (nid = 0; nid < MAX_NUMNODES; nid++) | ||
2214 | if (khugepaged_node_load[nid] > max_value) { | ||
2215 | max_value = khugepaged_node_load[nid]; | ||
2216 | target_node = nid; | ||
2217 | } | ||
2218 | |||
2219 | /* do some balancing if several nodes have the same hit record */ | ||
2220 | if (target_node <= last_khugepaged_target_node) | ||
2221 | for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; | ||
2222 | nid++) | ||
2223 | if (max_value == khugepaged_node_load[nid]) { | ||
2224 | target_node = nid; | ||
2225 | break; | ||
2226 | } | ||
2227 | |||
2228 | last_khugepaged_target_node = target_node; | ||
2229 | return target_node; | ||
2230 | } | ||
2231 | |||
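The selection policy above (pick the node with the most hits, and rotate among nodes that tie so one node is not always preferred) can be mimicked in a small userspace sketch; the MAX_NUMNODES value and the load figures below are made up for illustration:

    #include <stdio.h>

    #define MAX_NUMNODES 4

    static int last_target = -1;    /* NUMA_NO_NODE analogue */

    static int find_target_node(const int load[])
    {
            int nid, target = 0, max_value = 0;

            /* find the first node with the max hit count */
            for (nid = 0; nid < MAX_NUMNODES; nid++)
                    if (load[nid] > max_value) {
                            max_value = load[nid];
                            target = nid;
                    }

            /* rotate between nodes sharing the same hit count */
            if (target <= last_target)
                    for (nid = last_target + 1; nid < MAX_NUMNODES; nid++)
                            if (load[nid] == max_value) {
                                    target = nid;
                                    break;
                            }

            last_target = target;
            return target;
    }

    int main(void)
    {
            int load[MAX_NUMNODES] = { 5, 5, 0, 0 };
            int first = find_target_node(load);
            int second = find_target_node(load);

            /* first pass picks node 0, second pass rotates to node 1 */
            printf("%d then %d\n", first, second);
            return 0;
    }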
2232 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | ||
2233 | { | ||
2234 | if (IS_ERR(*hpage)) { | ||
2235 | if (!*wait) | ||
2236 | return false; | ||
2237 | |||
2238 | *wait = false; | ||
2239 | *hpage = NULL; | ||
2240 | khugepaged_alloc_sleep(); | ||
2241 | } else if (*hpage) { | ||
2242 | put_page(*hpage); | ||
2243 | *hpage = NULL; | ||
2244 | } | ||
2245 | |||
2246 | return true; | ||
2247 | } | ||
2248 | |||
2249 | static struct page * | ||
2250 | khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, | ||
2251 | unsigned long address, int node) | ||
2252 | { | ||
2253 | VM_BUG_ON_PAGE(*hpage, *hpage); | ||
2254 | |||
2255 | /* | ||
2256 | * Before allocating the hugepage, release the mmap_sem read lock. | ||
2257 | * The allocation can take potentially a long time if it involves | ||
2258 | * sync compaction, and we do not need to hold the mmap_sem during | ||
2259 | * that. We will recheck the vma after taking it again in write mode. | ||
2260 | */ | ||
2261 | up_read(&mm->mmap_sem); | ||
2262 | |||
2263 | *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); | ||
2264 | if (unlikely(!*hpage)) { | ||
2265 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2266 | *hpage = ERR_PTR(-ENOMEM); | ||
2267 | return NULL; | ||
2268 | } | ||
2269 | |||
2270 | prep_transhuge_page(*hpage); | ||
2271 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2272 | return *hpage; | ||
2273 | } | ||
2274 | #else | ||
2275 | static int khugepaged_find_target_node(void) | ||
2276 | { | ||
2277 | return 0; | ||
2278 | } | ||
2279 | |||
2280 | static inline struct page *alloc_khugepaged_hugepage(void) | ||
2281 | { | ||
2282 | struct page *page; | ||
2283 | |||
2284 | page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), | ||
2285 | HPAGE_PMD_ORDER); | ||
2286 | if (page) | ||
2287 | prep_transhuge_page(page); | ||
2288 | return page; | ||
2289 | } | ||
2290 | |||
2291 | static struct page *khugepaged_alloc_hugepage(bool *wait) | ||
2292 | { | ||
2293 | struct page *hpage; | ||
2294 | |||
2295 | do { | ||
2296 | hpage = alloc_khugepaged_hugepage(); | ||
2297 | if (!hpage) { | ||
2298 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
2299 | if (!*wait) | ||
2300 | return NULL; | ||
2301 | |||
2302 | *wait = false; | ||
2303 | khugepaged_alloc_sleep(); | ||
2304 | } else | ||
2305 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
2306 | } while (unlikely(!hpage) && likely(khugepaged_enabled())); | ||
2307 | |||
2308 | return hpage; | ||
2309 | } | ||
2310 | |||
2311 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | ||
2312 | { | ||
2313 | if (!*hpage) | ||
2314 | *hpage = khugepaged_alloc_hugepage(wait); | ||
2315 | |||
2316 | if (unlikely(!*hpage)) | ||
2317 | return false; | ||
2318 | |||
2319 | return true; | ||
2320 | } | ||
2321 | |||
2322 | static struct page * | ||
2323 | khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, | ||
2324 | unsigned long address, int node) | ||
2325 | { | ||
2326 | up_read(&mm->mmap_sem); | ||
2327 | VM_BUG_ON(!*hpage); | ||
2328 | |||
2329 | return *hpage; | ||
2330 | } | ||
2331 | #endif | ||
2332 | |||
2333 | static bool hugepage_vma_check(struct vm_area_struct *vma) | ||
2334 | { | ||
2335 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || | ||
2336 | (vma->vm_flags & VM_NOHUGEPAGE)) | ||
2337 | return false; | ||
2338 | if (!vma->anon_vma || vma->vm_ops) | ||
2339 | return false; | ||
2340 | if (is_vma_temporary_stack(vma)) | ||
2341 | return false; | ||
2342 | return !(vma->vm_flags & VM_NO_THP); | ||
2343 | } | ||
2344 | |||
2345 | static void collapse_huge_page(struct mm_struct *mm, | ||
2346 | unsigned long address, | ||
2347 | struct page **hpage, | ||
2348 | struct vm_area_struct *vma, | ||
2349 | int node) | ||
2350 | { | ||
2351 | pmd_t *pmd, _pmd; | ||
2352 | pte_t *pte; | ||
2353 | pgtable_t pgtable; | ||
2354 | struct page *new_page; | ||
2355 | spinlock_t *pmd_ptl, *pte_ptl; | ||
2356 | int isolated = 0, result = 0; | ||
2357 | unsigned long hstart, hend; | ||
2358 | struct mem_cgroup *memcg; | ||
2359 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
2360 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
2361 | gfp_t gfp; | ||
2362 | |||
2363 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
2364 | |||
2365 | /* Only allocate from the target node */ | ||
2366 | gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; | ||
2367 | |||
2368 | /* release the mmap_sem read lock. */ | ||
2369 | new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); | ||
2370 | if (!new_page) { | ||
2371 | result = SCAN_ALLOC_HUGE_PAGE_FAIL; | ||
2372 | goto out_nolock; | ||
2373 | } | ||
2374 | |||
2375 | if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { | ||
2376 | result = SCAN_CGROUP_CHARGE_FAIL; | ||
2377 | goto out_nolock; | ||
2378 | } | ||
2379 | |||
2380 | /* | ||
2381 | * Prevent all access to pagetables with the exception of | ||
2382 | * gup_fast later handled by the ptep_clear_flush and the VM | ||
2383 | * handled by the anon_vma lock + PG_lock. | ||
2384 | */ | ||
2385 | down_write(&mm->mmap_sem); | ||
2386 | if (unlikely(khugepaged_test_exit(mm))) { | ||
2387 | result = SCAN_ANY_PROCESS; | ||
2388 | goto out; | ||
2389 | } | ||
2390 | |||
2391 | vma = find_vma(mm, address); | ||
2392 | if (!vma) { | ||
2393 | result = SCAN_VMA_NULL; | ||
2394 | goto out; | ||
2395 | } | ||
2396 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
2397 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
2398 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) { | ||
2399 | result = SCAN_ADDRESS_RANGE; | ||
2400 | goto out; | ||
2401 | } | ||
2402 | if (!hugepage_vma_check(vma)) { | ||
2403 | result = SCAN_VMA_CHECK; | ||
2404 | goto out; | ||
2405 | } | ||
2406 | pmd = mm_find_pmd(mm, address); | ||
2407 | if (!pmd) { | ||
2408 | result = SCAN_PMD_NULL; | ||
2409 | goto out; | ||
2410 | } | ||
2411 | |||
2412 | anon_vma_lock_write(vma->anon_vma); | ||
2413 | |||
2414 | pte = pte_offset_map(pmd, address); | ||
2415 | pte_ptl = pte_lockptr(mm, pmd); | ||
2416 | |||
2417 | mmun_start = address; | ||
2418 | mmun_end = address + HPAGE_PMD_SIZE; | ||
2419 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
2420 | pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ | ||
2421 | /* | ||
2422 | * After this gup_fast can't run anymore. This also removes | ||
2423 | * any huge TLB entry from the CPU so we won't allow | ||
2424 | * huge and small TLB entries for the same virtual address | ||
2425 | * to avoid the risk of CPU bugs in that area. | ||
2426 | */ | ||
2427 | _pmd = pmdp_collapse_flush(vma, address, pmd); | ||
2428 | spin_unlock(pmd_ptl); | ||
2429 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
2430 | |||
2431 | spin_lock(pte_ptl); | ||
2432 | isolated = __collapse_huge_page_isolate(vma, address, pte); | ||
2433 | spin_unlock(pte_ptl); | ||
2434 | |||
2435 | if (unlikely(!isolated)) { | ||
2436 | pte_unmap(pte); | ||
2437 | spin_lock(pmd_ptl); | ||
2438 | BUG_ON(!pmd_none(*pmd)); | ||
2439 | /* | ||
2440 | * We can only use set_pmd_at when establishing | ||
2441 | * hugepmds and never for establishing regular pmds that | ||
2442 | * point to regular pagetables. Use pmd_populate for that. | ||
2443 | */ | ||
2444 | pmd_populate(mm, pmd, pmd_pgtable(_pmd)); | ||
2445 | spin_unlock(pmd_ptl); | ||
2446 | anon_vma_unlock_write(vma->anon_vma); | ||
2447 | result = SCAN_FAIL; | ||
2448 | goto out; | ||
2449 | } | ||
2450 | |||
2451 | /* | ||
2452 | * All pages are isolated and locked so anon_vma rmap | ||
2453 | * can't run anymore. | ||
2454 | */ | ||
2455 | anon_vma_unlock_write(vma->anon_vma); | ||
2456 | |||
2457 | __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); | ||
2458 | pte_unmap(pte); | ||
2459 | __SetPageUptodate(new_page); | ||
2460 | pgtable = pmd_pgtable(_pmd); | ||
2461 | |||
2462 | _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); | ||
2463 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | ||
2464 | |||
2465 | /* | ||
2466 | * spin_lock() below is not the equivalent of smp_wmb(), so | ||
2467 | * this is needed to prevent the copy_huge_page writes from | ||
2468 | * becoming visible after the set_pmd_at() write. | ||
2469 | */ | ||
2470 | smp_wmb(); | ||
2471 | |||
2472 | spin_lock(pmd_ptl); | ||
2473 | BUG_ON(!pmd_none(*pmd)); | ||
2474 | page_add_new_anon_rmap(new_page, vma, address, true); | ||
2475 | mem_cgroup_commit_charge(new_page, memcg, false, true); | ||
2476 | lru_cache_add_active_or_unevictable(new_page, vma); | ||
2477 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | ||
2478 | set_pmd_at(mm, address, pmd, _pmd); | ||
2479 | update_mmu_cache_pmd(vma, address, pmd); | ||
2480 | spin_unlock(pmd_ptl); | ||
2481 | |||
2482 | *hpage = NULL; | ||
2483 | |||
2484 | khugepaged_pages_collapsed++; | ||
2485 | result = SCAN_SUCCEED; | ||
2486 | out_up_write: | ||
2487 | up_write(&mm->mmap_sem); | ||
2488 | trace_mm_collapse_huge_page(mm, isolated, result); | ||
2489 | return; | ||
2490 | |||
2491 | out_nolock: | ||
2492 | trace_mm_collapse_huge_page(mm, isolated, result); | ||
2493 | return; | ||
2494 | out: | ||
2495 | mem_cgroup_cancel_charge(new_page, memcg, true); | ||
2496 | goto out_up_write; | ||
2497 | } | ||
2498 | |||
2499 | static int khugepaged_scan_pmd(struct mm_struct *mm, | ||
2500 | struct vm_area_struct *vma, | ||
2501 | unsigned long address, | ||
2502 | struct page **hpage) | ||
2503 | { | ||
2504 | pmd_t *pmd; | ||
2505 | pte_t *pte, *_pte; | ||
2506 | int ret = 0, none_or_zero = 0, result = 0; | ||
2507 | struct page *page = NULL; | ||
2508 | unsigned long _address; | ||
2509 | spinlock_t *ptl; | ||
2510 | int node = NUMA_NO_NODE; | ||
2511 | bool writable = false, referenced = false; | ||
2512 | |||
2513 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
2514 | |||
2515 | pmd = mm_find_pmd(mm, address); | ||
2516 | if (!pmd) { | ||
2517 | result = SCAN_PMD_NULL; | ||
2518 | goto out; | ||
2519 | } | ||
2520 | |||
2521 | memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); | ||
2522 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
2523 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; | ||
2524 | _pte++, _address += PAGE_SIZE) { | ||
2525 | pte_t pteval = *_pte; | ||
2526 | if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { | ||
2527 | if (!userfaultfd_armed(vma) && | ||
2528 | ++none_or_zero <= khugepaged_max_ptes_none) { | ||
2529 | continue; | ||
2530 | } else { | ||
2531 | result = SCAN_EXCEED_NONE_PTE; | ||
2532 | goto out_unmap; | ||
2533 | } | ||
2534 | } | ||
2535 | if (!pte_present(pteval)) { | ||
2536 | result = SCAN_PTE_NON_PRESENT; | ||
2537 | goto out_unmap; | ||
2538 | } | ||
2539 | if (pte_write(pteval)) | ||
2540 | writable = true; | ||
2541 | |||
2542 | page = vm_normal_page(vma, _address, pteval); | ||
2543 | if (unlikely(!page)) { | ||
2544 | result = SCAN_PAGE_NULL; | ||
2545 | goto out_unmap; | ||
2546 | } | ||
2547 | |||
2548 | /* TODO: teach khugepaged to collapse THP mapped with pte */ | ||
2549 | if (PageCompound(page)) { | ||
2550 | result = SCAN_PAGE_COMPOUND; | ||
2551 | goto out_unmap; | ||
2552 | } | ||
2553 | |||
2554 | /* | ||
2555 | * Record which node the original page is from and save this | ||
2556 | * information to khugepaged_node_load[]. | ||
2557 | * Khugepaged will allocate the hugepage from the node that | ||
2558 | * has the max hit record. | ||
2559 | */ | ||
2560 | node = page_to_nid(page); | ||
2561 | if (khugepaged_scan_abort(node)) { | ||
2562 | result = SCAN_SCAN_ABORT; | ||
2563 | goto out_unmap; | ||
2564 | } | ||
2565 | khugepaged_node_load[node]++; | ||
2566 | if (!PageLRU(page)) { | ||
2567 | result = SCAN_PAGE_LRU; | ||
2568 | goto out_unmap; | ||
2569 | } | ||
2570 | if (PageLocked(page)) { | ||
2571 | result = SCAN_PAGE_LOCK; | ||
2572 | goto out_unmap; | ||
2573 | } | ||
2574 | if (!PageAnon(page)) { | ||
2575 | result = SCAN_PAGE_ANON; | ||
2576 | goto out_unmap; | ||
2577 | } | ||
2578 | |||
2579 | /* | ||
2580 | * cannot use mapcount: can't collapse if there's a gup pin. | ||
2581 | * The page must only be referenced by the scanned process | ||
2582 | * and page swap cache. | ||
2583 | */ | ||
2584 | if (page_count(page) != 1 + !!PageSwapCache(page)) { | ||
2585 | result = SCAN_PAGE_COUNT; | ||
2586 | goto out_unmap; | ||
2587 | } | ||
2588 | if (pte_young(pteval) || | ||
2589 | page_is_young(page) || PageReferenced(page) || | ||
2590 | mmu_notifier_test_young(vma->vm_mm, address)) | ||
2591 | referenced = true; | ||
2592 | } | ||
2593 | if (writable) { | ||
2594 | if (referenced) { | ||
2595 | result = SCAN_SUCCEED; | ||
2596 | ret = 1; | ||
2597 | } else { | ||
2598 | result = SCAN_NO_REFERENCED_PAGE; | ||
2599 | } | ||
2600 | } else { | ||
2601 | result = SCAN_PAGE_RO; | ||
2602 | } | ||
2603 | out_unmap: | ||
2604 | pte_unmap_unlock(pte, ptl); | ||
2605 | if (ret) { | ||
2606 | node = khugepaged_find_target_node(); | ||
2607 | /* collapse_huge_page will return with the mmap_sem released */ | ||
2608 | collapse_huge_page(mm, address, hpage, vma, node); | ||
2609 | } | ||
2610 | out: | ||
2611 | trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, | ||
2612 | none_or_zero, result); | ||
2613 | return ret; | ||
2614 | } | ||
2615 | |||
2616 | static void collect_mm_slot(struct mm_slot *mm_slot) | ||
2617 | { | ||
2618 | struct mm_struct *mm = mm_slot->mm; | ||
2619 | |||
2620 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); | ||
2621 | |||
2622 | if (khugepaged_test_exit(mm)) { | ||
2623 | /* free mm_slot */ | ||
2624 | hash_del(&mm_slot->hash); | ||
2625 | list_del(&mm_slot->mm_node); | ||
2626 | |||
2627 | /* | ||
2628 | * Not strictly needed because the mm exited already. | ||
2629 | * | ||
2630 | * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | ||
2631 | */ | ||
2632 | |||
2633 | /* khugepaged_mm_lock actually not necessary for the below */ | ||
2634 | free_mm_slot(mm_slot); | ||
2635 | mmdrop(mm); | ||
2636 | } | ||
2637 | } | ||
2638 | |||
2639 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | ||
2640 | struct page **hpage) | ||
2641 | __releases(&khugepaged_mm_lock) | ||
2642 | __acquires(&khugepaged_mm_lock) | ||
2643 | { | ||
2644 | struct mm_slot *mm_slot; | ||
2645 | struct mm_struct *mm; | ||
2646 | struct vm_area_struct *vma; | ||
2647 | int progress = 0; | ||
2648 | |||
2649 | VM_BUG_ON(!pages); | ||
2650 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); | ||
2651 | |||
2652 | if (khugepaged_scan.mm_slot) | ||
2653 | mm_slot = khugepaged_scan.mm_slot; | ||
2654 | else { | ||
2655 | mm_slot = list_entry(khugepaged_scan.mm_head.next, | ||
2656 | struct mm_slot, mm_node); | ||
2657 | khugepaged_scan.address = 0; | ||
2658 | khugepaged_scan.mm_slot = mm_slot; | ||
2659 | } | ||
2660 | spin_unlock(&khugepaged_mm_lock); | ||
2661 | |||
2662 | mm = mm_slot->mm; | ||
2663 | down_read(&mm->mmap_sem); | ||
2664 | if (unlikely(khugepaged_test_exit(mm))) | ||
2665 | vma = NULL; | ||
2666 | else | ||
2667 | vma = find_vma(mm, khugepaged_scan.address); | ||
2668 | |||
2669 | progress++; | ||
2670 | for (; vma; vma = vma->vm_next) { | ||
2671 | unsigned long hstart, hend; | ||
2672 | |||
2673 | cond_resched(); | ||
2674 | if (unlikely(khugepaged_test_exit(mm))) { | ||
2675 | progress++; | ||
2676 | break; | ||
2677 | } | ||
2678 | if (!hugepage_vma_check(vma)) { | ||
2679 | skip: | ||
2680 | progress++; | ||
2681 | continue; | ||
2682 | } | ||
2683 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
2684 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
2685 | if (hstart >= hend) | ||
2686 | goto skip; | ||
2687 | if (khugepaged_scan.address > hend) | ||
2688 | goto skip; | ||
2689 | if (khugepaged_scan.address < hstart) | ||
2690 | khugepaged_scan.address = hstart; | ||
2691 | VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); | ||
2692 | |||
2693 | while (khugepaged_scan.address < hend) { | ||
2694 | int ret; | ||
2695 | cond_resched(); | ||
2696 | if (unlikely(khugepaged_test_exit(mm))) | ||
2697 | goto breakouterloop; | ||
2698 | |||
2699 | VM_BUG_ON(khugepaged_scan.address < hstart || | ||
2700 | khugepaged_scan.address + HPAGE_PMD_SIZE > | ||
2701 | hend); | ||
2702 | ret = khugepaged_scan_pmd(mm, vma, | ||
2703 | khugepaged_scan.address, | ||
2704 | hpage); | ||
2705 | /* move to next address */ | ||
2706 | khugepaged_scan.address += HPAGE_PMD_SIZE; | ||
2707 | progress += HPAGE_PMD_NR; | ||
2708 | if (ret) | ||
2709 | /* we released mmap_sem so break loop */ | ||
2710 | goto breakouterloop_mmap_sem; | ||
2711 | if (progress >= pages) | ||
2712 | goto breakouterloop; | ||
2713 | } | ||
2714 | } | ||
2715 | breakouterloop: | ||
2716 | up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ | ||
2717 | breakouterloop_mmap_sem: | ||
2718 | |||
2719 | spin_lock(&khugepaged_mm_lock); | ||
2720 | VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); | ||
2721 | /* | ||
2722 | * Release the current mm_slot if this mm is about to die, or | ||
2723 | * if we scanned all vmas of this mm. | ||
2724 | */ | ||
2725 | if (khugepaged_test_exit(mm) || !vma) { | ||
2726 | /* | ||
2727 | * Make sure that if mm_users is reaching zero while | ||
2728 | * khugepaged runs here, khugepaged_exit will find | ||
2729 | * mm_slot not pointing to the exiting mm. | ||
2730 | */ | ||
2731 | if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { | ||
2732 | khugepaged_scan.mm_slot = list_entry( | ||
2733 | mm_slot->mm_node.next, | ||
2734 | struct mm_slot, mm_node); | ||
2735 | khugepaged_scan.address = 0; | ||
2736 | } else { | ||
2737 | khugepaged_scan.mm_slot = NULL; | ||
2738 | khugepaged_full_scans++; | ||
2739 | } | ||
2740 | |||
2741 | collect_mm_slot(mm_slot); | ||
2742 | } | ||
2743 | |||
2744 | return progress; | ||
2745 | } | ||
2746 | |||
2747 | static int khugepaged_has_work(void) | ||
2748 | { | ||
2749 | return !list_empty(&khugepaged_scan.mm_head) && | ||
2750 | khugepaged_enabled(); | ||
2751 | } | ||
2752 | |||
2753 | static int khugepaged_wait_event(void) | ||
2754 | { | ||
2755 | return !list_empty(&khugepaged_scan.mm_head) || | ||
2756 | kthread_should_stop(); | ||
2757 | } | ||
2758 | |||
2759 | static void khugepaged_do_scan(void) | ||
2760 | { | ||
2761 | struct page *hpage = NULL; | ||
2762 | unsigned int progress = 0, pass_through_head = 0; | ||
2763 | unsigned int pages = khugepaged_pages_to_scan; | ||
2764 | bool wait = true; | ||
2765 | |||
2766 | barrier(); /* write khugepaged_pages_to_scan to local stack */ | ||
2767 | |||
2768 | while (progress < pages) { | ||
2769 | if (!khugepaged_prealloc_page(&hpage, &wait)) | ||
2770 | break; | ||
2771 | |||
2772 | cond_resched(); | ||
2773 | |||
2774 | if (unlikely(kthread_should_stop() || try_to_freeze())) | ||
2775 | break; | ||
2776 | |||
2777 | spin_lock(&khugepaged_mm_lock); | ||
2778 | if (!khugepaged_scan.mm_slot) | ||
2779 | pass_through_head++; | ||
2780 | if (khugepaged_has_work() && | ||
2781 | pass_through_head < 2) | ||
2782 | progress += khugepaged_scan_mm_slot(pages - progress, | ||
2783 | &hpage); | ||
2784 | else | ||
2785 | progress = pages; | ||
2786 | spin_unlock(&khugepaged_mm_lock); | ||
2787 | } | ||
2788 | |||
2789 | if (!IS_ERR_OR_NULL(hpage)) | ||
2790 | put_page(hpage); | ||
2791 | } | ||
2792 | |||
2793 | static bool khugepaged_should_wakeup(void) | ||
2794 | { | ||
2795 | return kthread_should_stop() || | ||
2796 | time_after_eq(jiffies, khugepaged_sleep_expire); | ||
2797 | } | ||
2798 | |||
2799 | static void khugepaged_wait_work(void) | ||
2800 | { | ||
2801 | if (khugepaged_has_work()) { | ||
2802 | const unsigned long scan_sleep_jiffies = | ||
2803 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs); | ||
2804 | |||
2805 | if (!scan_sleep_jiffies) | ||
2806 | return; | ||
2807 | |||
2808 | khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; | ||
2809 | wait_event_freezable_timeout(khugepaged_wait, | ||
2810 | khugepaged_should_wakeup(), | ||
2811 | scan_sleep_jiffies); | ||
2812 | return; | ||
2813 | } | ||
2814 | |||
2815 | if (khugepaged_enabled()) | ||
2816 | wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); | ||
2817 | } | ||
2818 | |||
2819 | static int khugepaged(void *none) | ||
2820 | { | ||
2821 | struct mm_slot *mm_slot; | ||
2822 | |||
2823 | set_freezable(); | ||
2824 | set_user_nice(current, MAX_NICE); | ||
2825 | |||
2826 | while (!kthread_should_stop()) { | ||
2827 | khugepaged_do_scan(); | ||
2828 | khugepaged_wait_work(); | ||
2829 | } | ||
2830 | |||
2831 | spin_lock(&khugepaged_mm_lock); | ||
2832 | mm_slot = khugepaged_scan.mm_slot; | ||
2833 | khugepaged_scan.mm_slot = NULL; | ||
2834 | if (mm_slot) | ||
2835 | collect_mm_slot(mm_slot); | ||
2836 | spin_unlock(&khugepaged_mm_lock); | ||
2837 | return 0; | ||
2838 | } | ||
2839 | |||
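The khugepaged() thread above follows a simple daemon pattern: do one bounded scan pass, then sleep until more work arrives or a stop is requested. A userspace sketch of that pattern using pthreads (illustrative only; the names are stand-ins for khugepaged_do_scan() and kthread_should_stop(), and the budget/sleep values are assumptions):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    static atomic_bool should_stop;         /* kthread_should_stop() stand-in */

    static void do_scan_pass(unsigned int budget)
    {
            /* stand-in for khugepaged_do_scan(): scan at most 'budget' pages */
            printf("scanning up to %u pages\n", budget);
    }

    static void *scanner(void *arg)
    {
            (void)arg;
            while (!atomic_load(&should_stop)) {
                    do_scan_pass(4096);         /* pages_to_scan analogue */
                    usleep(10 * 1000);          /* scan_sleep_millisecs analogue */
            }
            return NULL;
    }

    int main(void)
    {
            pthread_t t;

            pthread_create(&t, NULL, scanner, NULL);
            sleep(1);
            atomic_store(&should_stop, true);   /* ask the scanner to exit */
            pthread_join(t, NULL);
            return 0;
    }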
2840 | static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, | 1476 | static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, |
2841 | unsigned long haddr, pmd_t *pmd) | 1477 | unsigned long haddr, pmd_t *pmd) |
2842 | { | 1478 | { |
@@ -2883,10 +1519,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, | |||
2883 | 1519 | ||
2884 | count_vm_event(THP_SPLIT_PMD); | 1520 | count_vm_event(THP_SPLIT_PMD); |
2885 | 1521 | ||
2886 | if (vma_is_dax(vma)) { | 1522 | if (!vma_is_anonymous(vma)) { |
2887 | pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); | 1523 | _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); |
2888 | if (is_huge_zero_pmd(_pmd)) | 1524 | if (is_huge_zero_pmd(_pmd)) |
2889 | put_huge_zero_page(); | 1525 | put_huge_zero_page(); |
1526 | if (vma_is_dax(vma)) | ||
1527 | return; | ||
1528 | page = pmd_page(_pmd); | ||
1529 | if (!PageReferenced(page) && pmd_young(_pmd)) | ||
1530 | SetPageReferenced(page); | ||
1531 | page_remove_rmap(page, true); | ||
1532 | put_page(page); | ||
1533 | add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); | ||
2890 | return; | 1534 | return; |
2891 | } else if (is_huge_zero_pmd(*pmd)) { | 1535 | } else if (is_huge_zero_pmd(*pmd)) { |
2892 | return __split_huge_zero_page_pmd(vma, haddr, pmd); | 1536 | return __split_huge_zero_page_pmd(vma, haddr, pmd); |
@@ -2942,7 +1586,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, | |||
2942 | 1586 | ||
2943 | if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { | 1587 | if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { |
2944 | /* Last compound_mapcount is gone. */ | 1588 | /* Last compound_mapcount is gone. */ |
2945 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1589 | __dec_zone_page_state(page, NR_ANON_THPS); |
2946 | if (TestClearPageDoubleMap(page)) { | 1590 | if (TestClearPageDoubleMap(page)) { |
2947 | /* No need in mapcount reference anymore */ | 1591 | /* No need in mapcount reference anymore */ |
2948 | for (i = 0; i < HPAGE_PMD_NR; i++) | 1592 | for (i = 0; i < HPAGE_PMD_NR; i++) |
@@ -3076,12 +1720,15 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, | |||
3076 | 1720 | ||
3077 | static void freeze_page(struct page *page) | 1721 | static void freeze_page(struct page *page) |
3078 | { | 1722 | { |
3079 | enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | | 1723 | enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | |
3080 | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED; | 1724 | TTU_RMAP_LOCKED; |
3081 | int i, ret; | 1725 | int i, ret; |
3082 | 1726 | ||
3083 | VM_BUG_ON_PAGE(!PageHead(page), page); | 1727 | VM_BUG_ON_PAGE(!PageHead(page), page); |
3084 | 1728 | ||
1729 | if (PageAnon(page)) | ||
1730 | ttu_flags |= TTU_MIGRATION; | ||
1731 | |||
3085 | /* We only need TTU_SPLIT_HUGE_PMD once */ | 1732 | /* We only need TTU_SPLIT_HUGE_PMD once */ |
3086 | ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); | 1733 | ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); |
3087 | for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { | 1734 | for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { |
@@ -3091,7 +1738,7 @@ static void freeze_page(struct page *page) | |||
3091 | 1738 | ||
3092 | ret = try_to_unmap(page + i, ttu_flags); | 1739 | ret = try_to_unmap(page + i, ttu_flags); |
3093 | } | 1740 | } |
3094 | VM_BUG_ON(ret); | 1741 | VM_BUG_ON_PAGE(ret, page + i - 1); |
3095 | } | 1742 | } |
3096 | 1743 | ||
3097 | static void unfreeze_page(struct page *page) | 1744 | static void unfreeze_page(struct page *page) |
@@ -3113,15 +1760,20 @@ static void __split_huge_page_tail(struct page *head, int tail, | |||
3113 | /* | 1760 | /* |
3114 | * tail_page->_refcount is zero and not changing from under us. But | 1761 | * tail_page->_refcount is zero and not changing from under us. But |
3115 | * get_page_unless_zero() may be running from under us on the | 1762 | * get_page_unless_zero() may be running from under us on the |
3116 | * tail_page. If we used atomic_set() below instead of atomic_inc(), we | 1763 | * tail_page. If we used atomic_set() below instead of atomic_inc() or |
3117 | * would then run atomic_set() concurrently with | 1764 | * atomic_add(), we would then run atomic_set() concurrently with |
3118 | * get_page_unless_zero(), and atomic_set() is implemented in C not | 1765 | * get_page_unless_zero(), and atomic_set() is implemented in C not |
3119 | * using locked ops. spin_unlock on x86 sometime uses locked ops | 1766 | * using locked ops. spin_unlock on x86 sometime uses locked ops |
3120 | * because of PPro errata 66, 92, so unless somebody can guarantee | 1767 | * because of PPro errata 66, 92, so unless somebody can guarantee |
3121 | * atomic_set() here would be safe on all archs (and not only on x86), | 1768 | * atomic_set() here would be safe on all archs (and not only on x86), |
3122 | * it's safer to use atomic_inc(). | 1769 | * it's safer to use atomic_inc()/atomic_add(). |
3123 | */ | 1770 | */ |
3124 | page_ref_inc(page_tail); | 1771 | if (PageAnon(head)) { |
1772 | page_ref_inc(page_tail); | ||
1773 | } else { | ||
1774 | /* Additional pin to radix tree */ | ||
1775 | page_ref_add(page_tail, 2); | ||
1776 | } | ||
3125 | 1777 | ||
3126 | page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 1778 | page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
3127 | page_tail->flags |= (head->flags & | 1779 | page_tail->flags |= (head->flags & |
@@ -3157,25 +1809,46 @@ static void __split_huge_page_tail(struct page *head, int tail, | |||
3157 | lru_add_page_tail(head, page_tail, lruvec, list); | 1809 | lru_add_page_tail(head, page_tail, lruvec, list); |
3158 | } | 1810 | } |
3159 | 1811 | ||
3160 | static void __split_huge_page(struct page *page, struct list_head *list) | 1812 | static void __split_huge_page(struct page *page, struct list_head *list, |
1813 | unsigned long flags) | ||
3161 | { | 1814 | { |
3162 | struct page *head = compound_head(page); | 1815 | struct page *head = compound_head(page); |
3163 | struct zone *zone = page_zone(head); | 1816 | struct zone *zone = page_zone(head); |
3164 | struct lruvec *lruvec; | 1817 | struct lruvec *lruvec; |
1818 | pgoff_t end = -1; | ||
3165 | int i; | 1819 | int i; |
3166 | 1820 | ||
3167 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | ||
3168 | spin_lock_irq(&zone->lru_lock); | ||
3169 | lruvec = mem_cgroup_page_lruvec(head, zone); | 1821 | lruvec = mem_cgroup_page_lruvec(head, zone); |
3170 | 1822 | ||
3171 | /* complete memcg works before add pages to LRU */ | 1823 | /* complete memcg works before add pages to LRU */ |
3172 | mem_cgroup_split_huge_fixup(head); | 1824 | mem_cgroup_split_huge_fixup(head); |
3173 | 1825 | ||
3174 | for (i = HPAGE_PMD_NR - 1; i >= 1; i--) | 1826 | if (!PageAnon(page)) |
1827 | end = DIV_ROUND_UP(i_size_read(head->mapping->host), PAGE_SIZE); | ||
1828 | |||
1829 | for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { | ||
3175 | __split_huge_page_tail(head, i, lruvec, list); | 1830 | __split_huge_page_tail(head, i, lruvec, list); |
1831 | /* Some pages can be beyond i_size: drop them from page cache */ | ||
1832 | if (head[i].index >= end) { | ||
1833 | __ClearPageDirty(head + i); | ||
1834 | __delete_from_page_cache(head + i, NULL); | ||
1835 | if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) | ||
1836 | shmem_uncharge(head->mapping->host, 1); | ||
1837 | put_page(head + i); | ||
1838 | } | ||
1839 | } | ||
3176 | 1840 | ||
3177 | ClearPageCompound(head); | 1841 | ClearPageCompound(head); |
3178 | spin_unlock_irq(&zone->lru_lock); | 1842 | /* See comment in __split_huge_page_tail() */ |
1843 | if (PageAnon(head)) { | ||
1844 | page_ref_inc(head); | ||
1845 | } else { | ||
1846 | /* Additional pin to radix tree */ | ||
1847 | page_ref_add(head, 2); | ||
1848 | spin_unlock(&head->mapping->tree_lock); | ||
1849 | } | ||
1850 | |||
1851 | spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags); | ||
3179 | 1852 | ||
3180 | unfreeze_page(head); | 1853 | unfreeze_page(head); |
3181 | 1854 | ||
@@ -3198,18 +1871,22 @@ static void __split_huge_page(struct page *page, struct list_head *list) | |||
3198 | 1871 | ||
3199 | int total_mapcount(struct page *page) | 1872 | int total_mapcount(struct page *page) |
3200 | { | 1873 | { |
3201 | int i, ret; | 1874 | int i, compound, ret; |
3202 | 1875 | ||
3203 | VM_BUG_ON_PAGE(PageTail(page), page); | 1876 | VM_BUG_ON_PAGE(PageTail(page), page); |
3204 | 1877 | ||
3205 | if (likely(!PageCompound(page))) | 1878 | if (likely(!PageCompound(page))) |
3206 | return atomic_read(&page->_mapcount) + 1; | 1879 | return atomic_read(&page->_mapcount) + 1; |
3207 | 1880 | ||
3208 | ret = compound_mapcount(page); | 1881 | compound = compound_mapcount(page); |
3209 | if (PageHuge(page)) | 1882 | if (PageHuge(page)) |
3210 | return ret; | 1883 | return compound; |
1884 | ret = compound; | ||
3211 | for (i = 0; i < HPAGE_PMD_NR; i++) | 1885 | for (i = 0; i < HPAGE_PMD_NR; i++) |
3212 | ret += atomic_read(&page[i]._mapcount) + 1; | 1886 | ret += atomic_read(&page[i]._mapcount) + 1; |
1887 | /* File pages have compound_mapcount included in _mapcount */ | ||
1888 | if (!PageAnon(page)) | ||
1889 | return ret - compound * HPAGE_PMD_NR; | ||
3213 | if (PageDoubleMap(page)) | 1890 | if (PageDoubleMap(page)) |
3214 | ret -= HPAGE_PMD_NR; | 1891 | ret -= HPAGE_PMD_NR; |
3215 | return ret; | 1892 | return ret; |
@@ -3296,36 +1973,54 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |||
3296 | { | 1973 | { |
3297 | struct page *head = compound_head(page); | 1974 | struct page *head = compound_head(page); |
3298 | struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); | 1975 | struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); |
3299 | struct anon_vma *anon_vma; | 1976 | struct anon_vma *anon_vma = NULL; |
3300 | int count, mapcount, ret; | 1977 | struct address_space *mapping = NULL; |
1978 | int count, mapcount, extra_pins, ret; | ||
3301 | bool mlocked; | 1979 | bool mlocked; |
3302 | unsigned long flags; | 1980 | unsigned long flags; |
3303 | 1981 | ||
3304 | VM_BUG_ON_PAGE(is_huge_zero_page(page), page); | 1982 | VM_BUG_ON_PAGE(is_huge_zero_page(page), page); |
3305 | VM_BUG_ON_PAGE(!PageAnon(page), page); | ||
3306 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 1983 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
3307 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | 1984 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
3308 | VM_BUG_ON_PAGE(!PageCompound(page), page); | 1985 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
3309 | 1986 | ||
3310 | /* | 1987 | if (PageAnon(head)) { |
3311 | * The caller does not necessarily hold an mmap_sem that would prevent | 1988 | /* |
3312 | * the anon_vma disappearing so we first take a reference to it | 1989 | * The caller does not necessarily hold an mmap_sem that would |
3313 | * and then lock the anon_vma for write. This is similar to | 1990 | * prevent the anon_vma disappearing so we first take a |
3314 | * page_lock_anon_vma_read except the write lock is taken to serialise | 1991 | * reference to it and then lock the anon_vma for write. This |
3315 | * against parallel split or collapse operations. | 1992 | * is similar to page_lock_anon_vma_read except the write lock |
3316 | */ | 1993 | * is taken to serialise against parallel split or collapse |
3317 | anon_vma = page_get_anon_vma(head); | 1994 | * operations. |
3318 | if (!anon_vma) { | 1995 | */ |
3319 | ret = -EBUSY; | 1996 | anon_vma = page_get_anon_vma(head); |
3320 | goto out; | 1997 | if (!anon_vma) { |
1998 | ret = -EBUSY; | ||
1999 | goto out; | ||
2000 | } | ||
2001 | extra_pins = 0; | ||
2002 | mapping = NULL; | ||
2003 | anon_vma_lock_write(anon_vma); | ||
2004 | } else { | ||
2005 | mapping = head->mapping; | ||
2006 | |||
2007 | /* Truncated ? */ | ||
2008 | if (!mapping) { | ||
2009 | ret = -EBUSY; | ||
2010 | goto out; | ||
2011 | } | ||
2012 | |||
2013 | /* Additional pins from radix tree */ | ||
2014 | extra_pins = HPAGE_PMD_NR; | ||
2015 | anon_vma = NULL; | ||
2016 | i_mmap_lock_read(mapping); | ||
3321 | } | 2017 | } |
3322 | anon_vma_lock_write(anon_vma); | ||
3323 | 2018 | ||
3324 | /* | 2019 | /* |
3325 | * Racy check if we can split the page, before freeze_page() will | 2020 | * Racy check if we can split the page, before freeze_page() will |
3326 | * split PMDs | 2021 | * split PMDs |
3327 | */ | 2022 | */ |
3328 | if (total_mapcount(head) != page_count(head) - 1) { | 2023 | if (total_mapcount(head) != page_count(head) - extra_pins - 1) { |
3329 | ret = -EBUSY; | 2024 | ret = -EBUSY; |
3330 | goto out_unlock; | 2025 | goto out_unlock; |
3331 | } | 2026 | } |
@@ -3338,35 +2033,62 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |||
3338 | if (mlocked) | 2033 | if (mlocked) |
3339 | lru_add_drain(); | 2034 | lru_add_drain(); |
3340 | 2035 | ||
2036 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | ||
2037 | spin_lock_irqsave(&page_zone(head)->lru_lock, flags); | ||
2038 | |||
2039 | if (mapping) { | ||
2040 | void **pslot; | ||
2041 | |||
2042 | spin_lock(&mapping->tree_lock); | ||
2043 | pslot = radix_tree_lookup_slot(&mapping->page_tree, | ||
2044 | page_index(head)); | ||
2045 | /* | ||
2046 | * Check if the head page is present in radix tree. | ||
2047 | * We assume all tail pages are present too, if the head is there. | ||
2048 | */ | ||
2049 | if (radix_tree_deref_slot_protected(pslot, | ||
2050 | &mapping->tree_lock) != head) | ||
2051 | goto fail; | ||
2052 | } | ||
2053 | |||
3341 | /* Prevent deferred_split_scan() touching ->_refcount */ | 2054 | /* Prevent deferred_split_scan() touching ->_refcount */ |
3342 | spin_lock_irqsave(&pgdata->split_queue_lock, flags); | 2055 | spin_lock(&pgdata->split_queue_lock); |
3343 | count = page_count(head); | 2056 | count = page_count(head); |
3344 | mapcount = total_mapcount(head); | 2057 | mapcount = total_mapcount(head); |
3345 | if (!mapcount && count == 1) { | 2058 | if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { |
3346 | if (!list_empty(page_deferred_list(head))) { | 2059 | if (!list_empty(page_deferred_list(head))) { |
3347 | pgdata->split_queue_len--; | 2060 | pgdata->split_queue_len--; |
3348 | list_del(page_deferred_list(head)); | 2061 | list_del(page_deferred_list(head)); |
3349 | } | 2062 | } |
3350 | spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); | 2063 | if (mapping) |
3351 | __split_huge_page(page, list); | 2064 | __dec_zone_page_state(page, NR_SHMEM_THPS); |
2065 | spin_unlock(&pgdata->split_queue_lock); | ||
2066 | __split_huge_page(page, list, flags); | ||
3352 | ret = 0; | 2067 | ret = 0; |
3353 | } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { | ||
3354 | spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); | ||
3355 | pr_alert("total_mapcount: %u, page_count(): %u\n", | ||
3356 | mapcount, count); | ||
3357 | if (PageTail(page)) | ||
3358 | dump_page(head, NULL); | ||
3359 | dump_page(page, "total_mapcount(head) > 0"); | ||
3360 | BUG(); | ||
3361 | } else { | 2068 | } else { |
3362 | spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); | 2069 | if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { |
2070 | pr_alert("total_mapcount: %u, page_count(): %u\n", | ||
2071 | mapcount, count); | ||
2072 | if (PageTail(page)) | ||
2073 | dump_page(head, NULL); | ||
2074 | dump_page(page, "total_mapcount(head) > 0"); | ||
2075 | BUG(); | ||
2076 | } | ||
2077 | spin_unlock(&pgdata->split_queue_lock); | ||
2078 | fail: if (mapping) | ||
2079 | spin_unlock(&mapping->tree_lock); | ||
2080 | spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags); | ||
3363 | unfreeze_page(head); | 2081 | unfreeze_page(head); |
3364 | ret = -EBUSY; | 2082 | ret = -EBUSY; |
3365 | } | 2083 | } |
3366 | 2084 | ||
3367 | out_unlock: | 2085 | out_unlock: |
3368 | anon_vma_unlock_write(anon_vma); | 2086 | if (anon_vma) { |
3369 | put_anon_vma(anon_vma); | 2087 | anon_vma_unlock_write(anon_vma); |
2088 | put_anon_vma(anon_vma); | ||
2089 | } | ||
2090 | if (mapping) | ||
2091 | i_mmap_unlock_read(mapping); | ||
3370 | out: | 2092 | out: |
3371 | count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); | 2093 | count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); |
3372 | return ret; | 2094 | return ret; |
@@ -3489,8 +2211,7 @@ static int split_huge_pages_set(void *data, u64 val) | |||
3489 | if (zone != page_zone(page)) | 2211 | if (zone != page_zone(page)) |
3490 | goto next; | 2212 | goto next; |
3491 | 2213 | ||
3492 | if (!PageHead(page) || !PageAnon(page) || | 2214 | if (!PageHead(page) || PageHuge(page) || !PageLRU(page)) |
3493 | PageHuge(page)) | ||
3494 | goto next; | 2215 | goto next; |
3495 | 2216 | ||
3496 | total++; | 2217 | total++; |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cc2a99e9cbc8..abc1c5fb7222 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -3179,7 +3179,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
3179 | unsigned long start, unsigned long end, | 3179 | unsigned long start, unsigned long end, |
3180 | struct page *ref_page) | 3180 | struct page *ref_page) |
3181 | { | 3181 | { |
3182 | int force_flush = 0; | ||
3183 | struct mm_struct *mm = vma->vm_mm; | 3182 | struct mm_struct *mm = vma->vm_mm; |
3184 | unsigned long address; | 3183 | unsigned long address; |
3185 | pte_t *ptep; | 3184 | pte_t *ptep; |
@@ -3198,19 +3197,22 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
3198 | tlb_start_vma(tlb, vma); | 3197 | tlb_start_vma(tlb, vma); |
3199 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 3198 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
3200 | address = start; | 3199 | address = start; |
3201 | again: | ||
3202 | for (; address < end; address += sz) { | 3200 | for (; address < end; address += sz) { |
3203 | ptep = huge_pte_offset(mm, address); | 3201 | ptep = huge_pte_offset(mm, address); |
3204 | if (!ptep) | 3202 | if (!ptep) |
3205 | continue; | 3203 | continue; |
3206 | 3204 | ||
3207 | ptl = huge_pte_lock(h, mm, ptep); | 3205 | ptl = huge_pte_lock(h, mm, ptep); |
3208 | if (huge_pmd_unshare(mm, &address, ptep)) | 3206 | if (huge_pmd_unshare(mm, &address, ptep)) { |
3209 | goto unlock; | 3207 | spin_unlock(ptl); |
3208 | continue; | ||
3209 | } | ||
3210 | 3210 | ||
3211 | pte = huge_ptep_get(ptep); | 3211 | pte = huge_ptep_get(ptep); |
3212 | if (huge_pte_none(pte)) | 3212 | if (huge_pte_none(pte)) { |
3213 | goto unlock; | 3213 | spin_unlock(ptl); |
3214 | continue; | ||
3215 | } | ||
3214 | 3216 | ||
3215 | /* | 3217 | /* |
3216 | * Migrating hugepage or HWPoisoned hugepage is already | 3218 | * Migrating hugepage or HWPoisoned hugepage is already |
@@ -3218,7 +3220,8 @@ again: | |||
3218 | */ | 3220 | */ |
3219 | if (unlikely(!pte_present(pte))) { | 3221 | if (unlikely(!pte_present(pte))) { |
3220 | huge_pte_clear(mm, address, ptep); | 3222 | huge_pte_clear(mm, address, ptep); |
3221 | goto unlock; | 3223 | spin_unlock(ptl); |
3224 | continue; | ||
3222 | } | 3225 | } |
3223 | 3226 | ||
3224 | page = pte_page(pte); | 3227 | page = pte_page(pte); |
@@ -3228,9 +3231,10 @@ again: | |||
3228 | * are about to unmap is the actual page of interest. | 3231 | * are about to unmap is the actual page of interest. |
3229 | */ | 3232 | */ |
3230 | if (ref_page) { | 3233 | if (ref_page) { |
3231 | if (page != ref_page) | 3234 | if (page != ref_page) { |
3232 | goto unlock; | 3235 | spin_unlock(ptl); |
3233 | 3236 | continue; | |
3237 | } | ||
3234 | /* | 3238 | /* |
3235 | * Mark the VMA as having unmapped its page so that | 3239 | * Mark the VMA as having unmapped its page so that |
3236 | * future faults in this VMA will fail rather than | 3240 | * future faults in this VMA will fail rather than |
@@ -3246,30 +3250,14 @@ again: | |||
3246 | 3250 | ||
3247 | hugetlb_count_sub(pages_per_huge_page(h), mm); | 3251 | hugetlb_count_sub(pages_per_huge_page(h), mm); |
3248 | page_remove_rmap(page, true); | 3252 | page_remove_rmap(page, true); |
3249 | force_flush = !__tlb_remove_page(tlb, page); | 3253 | |
3250 | if (force_flush) { | ||
3251 | address += sz; | ||
3252 | spin_unlock(ptl); | ||
3253 | break; | ||
3254 | } | ||
3255 | /* Bail out after unmapping reference page if supplied */ | ||
3256 | if (ref_page) { | ||
3257 | spin_unlock(ptl); | ||
3258 | break; | ||
3259 | } | ||
3260 | unlock: | ||
3261 | spin_unlock(ptl); | 3254 | spin_unlock(ptl); |
3262 | } | 3255 | tlb_remove_page_size(tlb, page, huge_page_size(h)); |
3263 | /* | 3256 | /* |
3264 | * mmu_gather ran out of room to batch pages, we break out of | 3257 | * Bail out after unmapping reference page if supplied |
3265 | * the PTE lock to avoid doing the potential expensive TLB invalidate | 3258 | */ |
3266 | * and page-free while holding it. | 3259 | if (ref_page) |
3267 | */ | 3260 | break; |
3268 | if (force_flush) { | ||
3269 | force_flush = 0; | ||
3270 | tlb_flush_mmu(tlb); | ||
3271 | if (address < end && !ref_page) | ||
3272 | goto again; | ||
3273 | } | 3261 | } |
3274 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 3262 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
3275 | tlb_end_vma(tlb, vma); | 3263 | tlb_end_vma(tlb, vma); |
diff --git a/mm/internal.h b/mm/internal.h index 2524ec880e24..9b6a6c43ac39 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -36,6 +36,8 @@ | |||
36 | /* Do not use these with a slab allocator */ | 36 | /* Do not use these with a slab allocator */ |
37 | #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) | 37 | #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) |
38 | 38 | ||
39 | int do_swap_page(struct fault_env *fe, pte_t orig_pte); | ||
40 | |||
39 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 41 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
40 | unsigned long floor, unsigned long ceiling); | 42 | unsigned long floor, unsigned long ceiling); |
41 | 43 | ||
@@ -150,6 +152,8 @@ extern int __isolate_free_page(struct page *page, unsigned int order); | |||
150 | extern void __free_pages_bootmem(struct page *page, unsigned long pfn, | 152 | extern void __free_pages_bootmem(struct page *page, unsigned long pfn, |
151 | unsigned int order); | 153 | unsigned int order); |
152 | extern void prep_compound_page(struct page *page, unsigned int order); | 154 | extern void prep_compound_page(struct page *page, unsigned int order); |
155 | extern void post_alloc_hook(struct page *page, unsigned int order, | ||
156 | gfp_t gfp_flags); | ||
153 | extern int user_min_free_kbytes; | 157 | extern int user_min_free_kbytes; |
154 | 158 | ||
155 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 159 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
diff --git a/mm/khugepaged.c b/mm/khugepaged.c new file mode 100644 index 000000000000..7dbee698d6aa --- /dev/null +++ b/mm/khugepaged.c | |||
@@ -0,0 +1,1922 @@ | |||
1 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
2 | |||
3 | #include <linux/mm.h> | ||
4 | #include <linux/sched.h> | ||
5 | #include <linux/mmu_notifier.h> | ||
6 | #include <linux/rmap.h> | ||
7 | #include <linux/swap.h> | ||
8 | #include <linux/mm_inline.h> | ||
9 | #include <linux/kthread.h> | ||
10 | #include <linux/khugepaged.h> | ||
11 | #include <linux/freezer.h> | ||
12 | #include <linux/mman.h> | ||
13 | #include <linux/hashtable.h> | ||
14 | #include <linux/userfaultfd_k.h> | ||
15 | #include <linux/page_idle.h> | ||
16 | #include <linux/swapops.h> | ||
17 | #include <linux/shmem_fs.h> | ||
18 | |||
19 | #include <asm/tlb.h> | ||
20 | #include <asm/pgalloc.h> | ||
21 | #include "internal.h" | ||
22 | |||
23 | enum scan_result { | ||
24 | SCAN_FAIL, | ||
25 | SCAN_SUCCEED, | ||
26 | SCAN_PMD_NULL, | ||
27 | SCAN_EXCEED_NONE_PTE, | ||
28 | SCAN_PTE_NON_PRESENT, | ||
29 | SCAN_PAGE_RO, | ||
30 | SCAN_LACK_REFERENCED_PAGE, | ||
31 | SCAN_PAGE_NULL, | ||
32 | SCAN_SCAN_ABORT, | ||
33 | SCAN_PAGE_COUNT, | ||
34 | SCAN_PAGE_LRU, | ||
35 | SCAN_PAGE_LOCK, | ||
36 | SCAN_PAGE_ANON, | ||
37 | SCAN_PAGE_COMPOUND, | ||
38 | SCAN_ANY_PROCESS, | ||
39 | SCAN_VMA_NULL, | ||
40 | SCAN_VMA_CHECK, | ||
41 | SCAN_ADDRESS_RANGE, | ||
42 | SCAN_SWAP_CACHE_PAGE, | ||
43 | SCAN_DEL_PAGE_LRU, | ||
44 | SCAN_ALLOC_HUGE_PAGE_FAIL, | ||
45 | SCAN_CGROUP_CHARGE_FAIL, | ||
46 | SCAN_EXCEED_SWAP_PTE, | ||
47 | SCAN_TRUNCATED, | ||
48 | }; | ||
49 | |||
50 | #define CREATE_TRACE_POINTS | ||
51 | #include <trace/events/huge_memory.h> | ||
52 | |||
53 | /* default scan 8*512 ptes (or vmas) every 30 seconds */ | ||
54 | static unsigned int khugepaged_pages_to_scan __read_mostly; | ||
55 | static unsigned int khugepaged_pages_collapsed; | ||
56 | static unsigned int khugepaged_full_scans; | ||
57 | static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; | ||
58 | /* during fragmentation poll the hugepage allocator once every minute */ | ||
59 | static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; | ||
60 | static unsigned long khugepaged_sleep_expire; | ||
61 | static DEFINE_SPINLOCK(khugepaged_mm_lock); | ||
62 | static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); | ||
63 | /* | ||
64 | * default collapse hugepages if there is at least one pte mapped like | ||
65 | * it would have happened if the vma was large enough during page | ||
66 | * fault. | ||
67 | */ | ||
68 | static unsigned int khugepaged_max_ptes_none __read_mostly; | ||
69 | static unsigned int khugepaged_max_ptes_swap __read_mostly; | ||
70 | |||
71 | #define MM_SLOTS_HASH_BITS 10 | ||
72 | static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); | ||
73 | |||
74 | static struct kmem_cache *mm_slot_cache __read_mostly; | ||
75 | |||
76 | /** | ||
77 | * struct mm_slot - hash lookup from mm to mm_slot | ||
78 | * @hash: hash collision list | ||
79 | * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head | ||
80 | * @mm: the mm that this information is valid for | ||
81 | */ | ||
82 | struct mm_slot { | ||
83 | struct hlist_node hash; | ||
84 | struct list_head mm_node; | ||
85 | struct mm_struct *mm; | ||
86 | }; | ||
87 | |||
88 | /** | ||
89 | * struct khugepaged_scan - cursor for scanning | ||
90 | * @mm_head: the head of the mm list to scan | ||
91 | * @mm_slot: the current mm_slot we are scanning | ||
92 | * @address: the next address inside that to be scanned | ||
93 | * | ||
94 | * There is only the one khugepaged_scan instance of this cursor structure. | ||
95 | */ | ||
96 | struct khugepaged_scan { | ||
97 | struct list_head mm_head; | ||
98 | struct mm_slot *mm_slot; | ||
99 | unsigned long address; | ||
100 | }; | ||
101 | |||
102 | static struct khugepaged_scan khugepaged_scan = { | ||
103 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), | ||
104 | }; | ||
105 | |||
106 | static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, | ||
107 | struct kobj_attribute *attr, | ||
108 | char *buf) | ||
109 | { | ||
110 | return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); | ||
111 | } | ||
112 | |||
113 | static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, | ||
114 | struct kobj_attribute *attr, | ||
115 | const char *buf, size_t count) | ||
116 | { | ||
117 | unsigned long msecs; | ||
118 | int err; | ||
119 | |||
120 | err = kstrtoul(buf, 10, &msecs); | ||
121 | if (err || msecs > UINT_MAX) | ||
122 | return -EINVAL; | ||
123 | |||
124 | khugepaged_scan_sleep_millisecs = msecs; | ||
125 | khugepaged_sleep_expire = 0; | ||
126 | wake_up_interruptible(&khugepaged_wait); | ||
127 | |||
128 | return count; | ||
129 | } | ||
130 | static struct kobj_attribute scan_sleep_millisecs_attr = | ||
131 | __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, | ||
132 | scan_sleep_millisecs_store); | ||
133 | |||
134 | static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, | ||
135 | struct kobj_attribute *attr, | ||
136 | char *buf) | ||
137 | { | ||
138 | return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); | ||
139 | } | ||
140 | |||
141 | static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, | ||
142 | struct kobj_attribute *attr, | ||
143 | const char *buf, size_t count) | ||
144 | { | ||
145 | unsigned long msecs; | ||
146 | int err; | ||
147 | |||
148 | err = kstrtoul(buf, 10, &msecs); | ||
149 | if (err || msecs > UINT_MAX) | ||
150 | return -EINVAL; | ||
151 | |||
152 | khugepaged_alloc_sleep_millisecs = msecs; | ||
153 | khugepaged_sleep_expire = 0; | ||
154 | wake_up_interruptible(&khugepaged_wait); | ||
155 | |||
156 | return count; | ||
157 | } | ||
158 | static struct kobj_attribute alloc_sleep_millisecs_attr = | ||
159 | __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, | ||
160 | alloc_sleep_millisecs_store); | ||
161 | |||
162 | static ssize_t pages_to_scan_show(struct kobject *kobj, | ||
163 | struct kobj_attribute *attr, | ||
164 | char *buf) | ||
165 | { | ||
166 | return sprintf(buf, "%u\n", khugepaged_pages_to_scan); | ||
167 | } | ||
168 | static ssize_t pages_to_scan_store(struct kobject *kobj, | ||
169 | struct kobj_attribute *attr, | ||
170 | const char *buf, size_t count) | ||
171 | { | ||
172 | int err; | ||
173 | unsigned long pages; | ||
174 | |||
175 | err = kstrtoul(buf, 10, &pages); | ||
176 | if (err || !pages || pages > UINT_MAX) | ||
177 | return -EINVAL; | ||
178 | |||
179 | khugepaged_pages_to_scan = pages; | ||
180 | |||
181 | return count; | ||
182 | } | ||
183 | static struct kobj_attribute pages_to_scan_attr = | ||
184 | __ATTR(pages_to_scan, 0644, pages_to_scan_show, | ||
185 | pages_to_scan_store); | ||
186 | |||
187 | static ssize_t pages_collapsed_show(struct kobject *kobj, | ||
188 | struct kobj_attribute *attr, | ||
189 | char *buf) | ||
190 | { | ||
191 | return sprintf(buf, "%u\n", khugepaged_pages_collapsed); | ||
192 | } | ||
193 | static struct kobj_attribute pages_collapsed_attr = | ||
194 | __ATTR_RO(pages_collapsed); | ||
195 | |||
196 | static ssize_t full_scans_show(struct kobject *kobj, | ||
197 | struct kobj_attribute *attr, | ||
198 | char *buf) | ||
199 | { | ||
200 | return sprintf(buf, "%u\n", khugepaged_full_scans); | ||
201 | } | ||
202 | static struct kobj_attribute full_scans_attr = | ||
203 | __ATTR_RO(full_scans); | ||
204 | |||
205 | static ssize_t khugepaged_defrag_show(struct kobject *kobj, | ||
206 | struct kobj_attribute *attr, char *buf) | ||
207 | { | ||
208 | return single_hugepage_flag_show(kobj, attr, buf, | ||
209 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
210 | } | ||
211 | static ssize_t khugepaged_defrag_store(struct kobject *kobj, | ||
212 | struct kobj_attribute *attr, | ||
213 | const char *buf, size_t count) | ||
214 | { | ||
215 | return single_hugepage_flag_store(kobj, attr, buf, count, | ||
216 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
217 | } | ||
218 | static struct kobj_attribute khugepaged_defrag_attr = | ||
219 | __ATTR(defrag, 0644, khugepaged_defrag_show, | ||
220 | khugepaged_defrag_store); | ||
221 | |||
222 | /* | ||
223 | * max_ptes_none controls whether khugepaged may collapse a hugepage over | ||
224 | * unmapped ptes, which in turn can increase the memory | ||
225 | * footprint of the vmas. When max_ptes_none is 0, khugepaged will not | ||
226 | * reduce the available free memory in the system as it | ||
227 | * runs. Increasing max_ptes_none instead potentially reduces the | ||
228 | * free memory in the system during the khugepaged scan. | ||
229 | */ | ||
230 | static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, | ||
231 | struct kobj_attribute *attr, | ||
232 | char *buf) | ||
233 | { | ||
234 | return sprintf(buf, "%u\n", khugepaged_max_ptes_none); | ||
235 | } | ||
236 | static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, | ||
237 | struct kobj_attribute *attr, | ||
238 | const char *buf, size_t count) | ||
239 | { | ||
240 | int err; | ||
241 | unsigned long max_ptes_none; | ||
242 | |||
243 | err = kstrtoul(buf, 10, &max_ptes_none); | ||
244 | if (err || max_ptes_none > HPAGE_PMD_NR-1) | ||
245 | return -EINVAL; | ||
246 | |||
247 | khugepaged_max_ptes_none = max_ptes_none; | ||
248 | |||
249 | return count; | ||
250 | } | ||
251 | static struct kobj_attribute khugepaged_max_ptes_none_attr = | ||
252 | __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, | ||
253 | khugepaged_max_ptes_none_store); | ||
254 | |||
255 | static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj, | ||
256 | struct kobj_attribute *attr, | ||
257 | char *buf) | ||
258 | { | ||
259 | return sprintf(buf, "%u\n", khugepaged_max_ptes_swap); | ||
260 | } | ||
261 | |||
262 | static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj, | ||
263 | struct kobj_attribute *attr, | ||
264 | const char *buf, size_t count) | ||
265 | { | ||
266 | int err; | ||
267 | unsigned long max_ptes_swap; | ||
268 | |||
269 | err = kstrtoul(buf, 10, &max_ptes_swap); | ||
270 | if (err || max_ptes_swap > HPAGE_PMD_NR-1) | ||
271 | return -EINVAL; | ||
272 | |||
273 | khugepaged_max_ptes_swap = max_ptes_swap; | ||
274 | |||
275 | return count; | ||
276 | } | ||
277 | |||
278 | static struct kobj_attribute khugepaged_max_ptes_swap_attr = | ||
279 | __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show, | ||
280 | khugepaged_max_ptes_swap_store); | ||
281 | |||
282 | static struct attribute *khugepaged_attr[] = { | ||
283 | &khugepaged_defrag_attr.attr, | ||
284 | &khugepaged_max_ptes_none_attr.attr, | ||
285 | &pages_to_scan_attr.attr, | ||
286 | &pages_collapsed_attr.attr, | ||
287 | &full_scans_attr.attr, | ||
288 | &scan_sleep_millisecs_attr.attr, | ||
289 | &alloc_sleep_millisecs_attr.attr, | ||
290 | &khugepaged_max_ptes_swap_attr.attr, | ||
291 | NULL, | ||
292 | }; | ||
293 | |||
294 | struct attribute_group khugepaged_attr_group = { | ||
295 | .attrs = khugepaged_attr, | ||
296 | .name = "khugepaged", | ||
297 | }; | ||
298 | |||
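This attribute group is registered under the transparent_hugepage kobject, so the knobs appear to userspace as /sys/kernel/mm/transparent_hugepage/khugepaged/<name>. A minimal userspace sketch for inspecting and tuning them (assuming sysfs mounted at /sys; writes need root):

    #include <stdio.h>

    #define KHUGEPAGED_DIR "/sys/kernel/mm/transparent_hugepage/khugepaged/"

    /* Read a single numeric value from one of the khugepaged knobs. */
    static long read_knob(const char *name)
    {
            char path[256];
            long val = -1;
            FILE *f;

            snprintf(path, sizeof(path), KHUGEPAGED_DIR "%s", name);
            f = fopen(path, "r");
            if (!f)
                    return -1;
            if (fscanf(f, "%ld", &val) != 1)
                    val = -1;
            fclose(f);
            return val;
    }

    /* Write a new value to a knob; returns 0 on success. */
    static int write_knob(const char *name, long val)
    {
            char path[256];
            FILE *f;
            int ret;

            snprintf(path, sizeof(path), KHUGEPAGED_DIR "%s", name);
            f = fopen(path, "w");
            if (!f)
                    return -1;
            ret = (fprintf(f, "%ld\n", val) > 0) ? 0 : -1;
            fclose(f);
            return ret;
    }

    int main(void)
    {
            static const char *knobs[] = {
                    "pages_to_scan", "scan_sleep_millisecs",
                    "alloc_sleep_millisecs", "max_ptes_none",
                    "max_ptes_swap", "pages_collapsed", "full_scans",
            };
            unsigned int i;

            for (i = 0; i < sizeof(knobs) / sizeof(knobs[0]); i++)
                    printf("%-22s %ld\n", knobs[i], read_knob(knobs[i]));

            /* Example: make khugepaged rescan every second (needs root). */
            if (write_knob("scan_sleep_millisecs", 1000))
                    perror("scan_sleep_millisecs");
            return 0;
    }

Note that only the two *_sleep_millisecs store handlers reset khugepaged_sleep_expire and wake khugepaged_wait; a new pages_to_scan or max_ptes_* value simply takes effect on the next pass.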
299 | #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) | ||
300 | |||
301 | int hugepage_madvise(struct vm_area_struct *vma, | ||
302 | unsigned long *vm_flags, int advice) | ||
303 | { | ||
304 | switch (advice) { | ||
305 | case MADV_HUGEPAGE: | ||
306 | #ifdef CONFIG_S390 | ||
307 | /* | ||
308 | * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390 | ||
309 | * can't handle this properly after s390_enable_sie, so we simply | ||
310 | * ignore the madvise to prevent qemu from causing a SIGSEGV. | ||
311 | */ | ||
312 | if (mm_has_pgste(vma->vm_mm)) | ||
313 | return 0; | ||
314 | #endif | ||
315 | *vm_flags &= ~VM_NOHUGEPAGE; | ||
316 | *vm_flags |= VM_HUGEPAGE; | ||
317 | /* | ||
318 | * If the vma becomes suitable for khugepaged to scan, | ||
319 | * register it here without waiting for a page fault that | ||
320 | * may not happen any time soon. | ||
321 | */ | ||
322 | if (!(*vm_flags & VM_NO_KHUGEPAGED) && | ||
323 | khugepaged_enter_vma_merge(vma, *vm_flags)) | ||
324 | return -ENOMEM; | ||
325 | break; | ||
326 | case MADV_NOHUGEPAGE: | ||
327 | *vm_flags &= ~VM_HUGEPAGE; | ||
328 | *vm_flags |= VM_NOHUGEPAGE; | ||
329 | /* | ||
330 | * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning | ||
331 | * this vma even if we leave the mm registered in khugepaged if | ||
332 | * it got registered before VM_NOHUGEPAGE was set. | ||
333 | */ | ||
334 | break; | ||
335 | } | ||
336 | |||
337 | return 0; | ||
338 | } | ||
339 | |||
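hugepage_madvise() above is the kernel side of MADV_HUGEPAGE/MADV_NOHUGEPAGE. A minimal userspace sketch of opting an anonymous mapping in, assuming an x86_64-style 2MB PMD huge page and THP set to "madvise" or "always":

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/mman.h>

    #define LEN (16UL << 20)        /* 16MB: several PMD-sized units */

    int main(void)
    {
            /* Anonymous private mapping; khugepaged works on these
             * (and, after this patchset, on shmem/tmpfs mappings too). */
            char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (p == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }

            /* Sets VM_HUGEPAGE on the vma and registers the mm with
             * khugepaged, as in hugepage_madvise() above. */
            if (madvise(p, LEN, MADV_HUGEPAGE))
                    perror("madvise(MADV_HUGEPAGE)");

            /* Touch every small page; khugepaged may later collapse the
             * PMD-aligned 2MB ranges into huge pages in the background. */
            memset(p, 0x5a, LEN);

            pause();        /* keep the mapping alive while khugepaged runs */
            return 0;
    }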
340 | int __init khugepaged_init(void) | ||
341 | { | ||
342 | mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", | ||
343 | sizeof(struct mm_slot), | ||
344 | __alignof__(struct mm_slot), 0, NULL); | ||
345 | if (!mm_slot_cache) | ||
346 | return -ENOMEM; | ||
347 | |||
348 | khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; | ||
349 | khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; | ||
350 | khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; | ||
351 | |||
352 | return 0; | ||
353 | } | ||
354 | |||
355 | void __init khugepaged_destroy(void) | ||
356 | { | ||
357 | kmem_cache_destroy(mm_slot_cache); | ||
358 | } | ||
359 | |||
360 | static inline struct mm_slot *alloc_mm_slot(void) | ||
361 | { | ||
362 | if (!mm_slot_cache) /* initialization failed */ | ||
363 | return NULL; | ||
364 | return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); | ||
365 | } | ||
366 | |||
367 | static inline void free_mm_slot(struct mm_slot *mm_slot) | ||
368 | { | ||
369 | kmem_cache_free(mm_slot_cache, mm_slot); | ||
370 | } | ||
371 | |||
372 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | ||
373 | { | ||
374 | struct mm_slot *mm_slot; | ||
375 | |||
376 | hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) | ||
377 | if (mm == mm_slot->mm) | ||
378 | return mm_slot; | ||
379 | |||
380 | return NULL; | ||
381 | } | ||
382 | |||
383 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | ||
384 | struct mm_slot *mm_slot) | ||
385 | { | ||
386 | mm_slot->mm = mm; | ||
387 | hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); | ||
388 | } | ||
389 | |||
390 | static inline int khugepaged_test_exit(struct mm_struct *mm) | ||
391 | { | ||
392 | return atomic_read(&mm->mm_users) == 0; | ||
393 | } | ||
394 | |||
395 | int __khugepaged_enter(struct mm_struct *mm) | ||
396 | { | ||
397 | struct mm_slot *mm_slot; | ||
398 | int wakeup; | ||
399 | |||
400 | mm_slot = alloc_mm_slot(); | ||
401 | if (!mm_slot) | ||
402 | return -ENOMEM; | ||
403 | |||
404 | /* __khugepaged_exit() must not run from under us */ | ||
405 | VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); | ||
406 | if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { | ||
407 | free_mm_slot(mm_slot); | ||
408 | return 0; | ||
409 | } | ||
410 | |||
411 | spin_lock(&khugepaged_mm_lock); | ||
412 | insert_to_mm_slots_hash(mm, mm_slot); | ||
413 | /* | ||
414 | * Insert just behind the scanning cursor, to let the area settle | ||
415 | * down a little. | ||
416 | */ | ||
417 | wakeup = list_empty(&khugepaged_scan.mm_head); | ||
418 | list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); | ||
419 | spin_unlock(&khugepaged_mm_lock); | ||
420 | |||
421 | atomic_inc(&mm->mm_count); | ||
422 | if (wakeup) | ||
423 | wake_up_interruptible(&khugepaged_wait); | ||
424 | |||
425 | return 0; | ||
426 | } | ||
427 | |||
428 | int khugepaged_enter_vma_merge(struct vm_area_struct *vma, | ||
429 | unsigned long vm_flags) | ||
430 | { | ||
431 | unsigned long hstart, hend; | ||
432 | if (!vma->anon_vma) | ||
433 | /* | ||
434 | * Not yet faulted in so we will register later in the | ||
435 | * page fault if needed. | ||
436 | */ | ||
437 | return 0; | ||
438 | if (vma->vm_ops || (vm_flags & VM_NO_KHUGEPAGED)) | ||
439 | /* khugepaged not yet working on file or special mappings */ | ||
440 | return 0; | ||
441 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
442 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
443 | if (hstart < hend) | ||
444 | return khugepaged_enter(vma, vm_flags); | ||
445 | return 0; | ||
446 | } | ||
447 | |||
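The hstart/hend computation above rounds the vma inward to huge page boundaries: vm_start is rounded up, vm_end is rounded down, and registration is only worthwhile if at least one full huge page fits in between. A standalone sketch of the same arithmetic, assuming a 2MB HPAGE_PMD_SIZE purely for illustration:

    #include <stdio.h>

    #define HPAGE_PMD_SIZE  (2UL << 20)             /* assume 2MB PMD pages */
    #define HPAGE_PMD_MASK  (~(HPAGE_PMD_SIZE - 1))

    int main(void)
    {
            unsigned long vm_start = 0x7f0000123000UL;  /* arbitrary example vma */
            unsigned long vm_end   = 0x7f0000923000UL;

            /* Round the start up and the end down to huge page boundaries,
             * exactly as khugepaged_enter_vma_merge() does. */
            unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
            unsigned long hend   = vm_end & HPAGE_PMD_MASK;

            printf("vma    [%#lx, %#lx)\n", vm_start, vm_end);
            printf("usable [%#lx, %#lx)\n", hstart, hend);
            printf("fits %lu huge page(s)\n",
                   hstart < hend ? (hend - hstart) / HPAGE_PMD_SIZE : 0);
            return 0;
    }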
448 | void __khugepaged_exit(struct mm_struct *mm) | ||
449 | { | ||
450 | struct mm_slot *mm_slot; | ||
451 | int free = 0; | ||
452 | |||
453 | spin_lock(&khugepaged_mm_lock); | ||
454 | mm_slot = get_mm_slot(mm); | ||
455 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { | ||
456 | hash_del(&mm_slot->hash); | ||
457 | list_del(&mm_slot->mm_node); | ||
458 | free = 1; | ||
459 | } | ||
460 | spin_unlock(&khugepaged_mm_lock); | ||
461 | |||
462 | if (free) { | ||
463 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | ||
464 | free_mm_slot(mm_slot); | ||
465 | mmdrop(mm); | ||
466 | } else if (mm_slot) { | ||
467 | /* | ||
468 | * This is required to serialize against | ||
469 | * khugepaged_test_exit() (which is guaranteed to run | ||
470 | * under mmap sem read mode). Stop here (after we | ||
471 | * return all pagetables will be destroyed) until | ||
472 | * khugepaged has finished working on the pagetables | ||
473 | * under the mmap_sem. | ||
474 | */ | ||
475 | down_write(&mm->mmap_sem); | ||
476 | up_write(&mm->mmap_sem); | ||
477 | } | ||
478 | } | ||
479 | |||
480 | static void release_pte_page(struct page *page) | ||
481 | { | ||
482 | /* 0 stands for page_is_file_cache(page) == false */ | ||
483 | dec_zone_page_state(page, NR_ISOLATED_ANON + 0); | ||
484 | unlock_page(page); | ||
485 | putback_lru_page(page); | ||
486 | } | ||
487 | |||
488 | static void release_pte_pages(pte_t *pte, pte_t *_pte) | ||
489 | { | ||
490 | while (--_pte >= pte) { | ||
491 | pte_t pteval = *_pte; | ||
492 | if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) | ||
493 | release_pte_page(pte_page(pteval)); | ||
494 | } | ||
495 | } | ||
496 | |||
497 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | ||
498 | unsigned long address, | ||
499 | pte_t *pte) | ||
500 | { | ||
501 | struct page *page = NULL; | ||
502 | pte_t *_pte; | ||
503 | int none_or_zero = 0, result = 0, referenced = 0; | ||
504 | bool writable = false; | ||
505 | |||
506 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | ||
507 | _pte++, address += PAGE_SIZE) { | ||
508 | pte_t pteval = *_pte; | ||
509 | if (pte_none(pteval) || (pte_present(pteval) && | ||
510 | is_zero_pfn(pte_pfn(pteval)))) { | ||
511 | if (!userfaultfd_armed(vma) && | ||
512 | ++none_or_zero <= khugepaged_max_ptes_none) { | ||
513 | continue; | ||
514 | } else { | ||
515 | result = SCAN_EXCEED_NONE_PTE; | ||
516 | goto out; | ||
517 | } | ||
518 | } | ||
519 | if (!pte_present(pteval)) { | ||
520 | result = SCAN_PTE_NON_PRESENT; | ||
521 | goto out; | ||
522 | } | ||
523 | page = vm_normal_page(vma, address, pteval); | ||
524 | if (unlikely(!page)) { | ||
525 | result = SCAN_PAGE_NULL; | ||
526 | goto out; | ||
527 | } | ||
528 | |||
529 | VM_BUG_ON_PAGE(PageCompound(page), page); | ||
530 | VM_BUG_ON_PAGE(!PageAnon(page), page); | ||
531 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | ||
532 | |||
533 | /* | ||
534 | * We can do it before isolate_lru_page because the | ||
535 | * page can't be freed from under us. NOTE: PG_lock | ||
536 | * is needed to serialize against split_huge_page | ||
537 | * when invoked from the VM. | ||
538 | */ | ||
539 | if (!trylock_page(page)) { | ||
540 | result = SCAN_PAGE_LOCK; | ||
541 | goto out; | ||
542 | } | ||
543 | |||
544 | /* | ||
545 | * cannot use mapcount: can't collapse if there's a gup pin. | ||
546 | * The page must only be referenced by the scanned process | ||
547 | * and page swap cache. | ||
548 | */ | ||
549 | if (page_count(page) != 1 + !!PageSwapCache(page)) { | ||
550 | unlock_page(page); | ||
551 | result = SCAN_PAGE_COUNT; | ||
552 | goto out; | ||
553 | } | ||
554 | if (pte_write(pteval)) { | ||
555 | writable = true; | ||
556 | } else { | ||
557 | if (PageSwapCache(page) && | ||
558 | !reuse_swap_page(page, NULL)) { | ||
559 | unlock_page(page); | ||
560 | result = SCAN_SWAP_CACHE_PAGE; | ||
561 | goto out; | ||
562 | } | ||
563 | /* | ||
564 | * Page is not in the swap cache. It can be collapsed | ||
565 | * into a THP. | ||
566 | */ | ||
567 | } | ||
568 | |||
569 | /* | ||
570 | * Isolate the page to avoid collapsing a hugepage | ||
571 | * currently in use by the VM. | ||
572 | */ | ||
573 | if (isolate_lru_page(page)) { | ||
574 | unlock_page(page); | ||
575 | result = SCAN_DEL_PAGE_LRU; | ||
576 | goto out; | ||
577 | } | ||
578 | /* 0 stands for page_is_file_cache(page) == false */ | ||
579 | inc_zone_page_state(page, NR_ISOLATED_ANON + 0); | ||
580 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
581 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
582 | |||
583 | /* There should be enough young ptes to collapse the page */ | ||
584 | if (pte_young(pteval) || | ||
585 | page_is_young(page) || PageReferenced(page) || | ||
586 | mmu_notifier_test_young(vma->vm_mm, address)) | ||
587 | referenced++; | ||
588 | } | ||
589 | if (likely(writable)) { | ||
590 | if (likely(referenced)) { | ||
591 | result = SCAN_SUCCEED; | ||
592 | trace_mm_collapse_huge_page_isolate(page, none_or_zero, | ||
593 | referenced, writable, result); | ||
594 | return 1; | ||
595 | } | ||
596 | } else { | ||
597 | result = SCAN_PAGE_RO; | ||
598 | } | ||
599 | |||
600 | out: | ||
601 | release_pte_pages(pte, _pte); | ||
602 | trace_mm_collapse_huge_page_isolate(page, none_or_zero, | ||
603 | referenced, writable, result); | ||
604 | return 0; | ||
605 | } | ||
606 | |||
607 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | ||
608 | struct vm_area_struct *vma, | ||
609 | unsigned long address, | ||
610 | spinlock_t *ptl) | ||
611 | { | ||
612 | pte_t *_pte; | ||
613 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { | ||
614 | pte_t pteval = *_pte; | ||
615 | struct page *src_page; | ||
616 | |||
617 | if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { | ||
618 | clear_user_highpage(page, address); | ||
619 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); | ||
620 | if (is_zero_pfn(pte_pfn(pteval))) { | ||
621 | /* | ||
622 | * ptl mostly unnecessary. | ||
623 | */ | ||
624 | spin_lock(ptl); | ||
625 | /* | ||
626 | * paravirt calls inside pte_clear here are | ||
627 | * superfluous. | ||
628 | */ | ||
629 | pte_clear(vma->vm_mm, address, _pte); | ||
630 | spin_unlock(ptl); | ||
631 | } | ||
632 | } else { | ||
633 | src_page = pte_page(pteval); | ||
634 | copy_user_highpage(page, src_page, address, vma); | ||
635 | VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page); | ||
636 | release_pte_page(src_page); | ||
637 | /* | ||
638 | * ptl mostly unnecessary, but preempt has to | ||
639 | * be disabled to update the per-cpu stats | ||
640 | * inside page_remove_rmap(). | ||
641 | */ | ||
642 | spin_lock(ptl); | ||
643 | /* | ||
644 | * paravirt calls inside pte_clear here are | ||
645 | * superfluous. | ||
646 | */ | ||
647 | pte_clear(vma->vm_mm, address, _pte); | ||
648 | page_remove_rmap(src_page, false); | ||
649 | spin_unlock(ptl); | ||
650 | free_page_and_swap_cache(src_page); | ||
651 | } | ||
652 | |||
653 | address += PAGE_SIZE; | ||
654 | page++; | ||
655 | } | ||
656 | } | ||
657 | |||
658 | static void khugepaged_alloc_sleep(void) | ||
659 | { | ||
660 | DEFINE_WAIT(wait); | ||
661 | |||
662 | add_wait_queue(&khugepaged_wait, &wait); | ||
663 | freezable_schedule_timeout_interruptible( | ||
664 | msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); | ||
665 | remove_wait_queue(&khugepaged_wait, &wait); | ||
666 | } | ||
667 | |||
668 | static int khugepaged_node_load[MAX_NUMNODES]; | ||
669 | |||
670 | static bool khugepaged_scan_abort(int nid) | ||
671 | { | ||
672 | int i; | ||
673 | |||
674 | /* | ||
675 | * If zone_reclaim_mode is disabled, then no extra effort is made to | ||
676 | * allocate memory locally. | ||
677 | */ | ||
678 | if (!zone_reclaim_mode) | ||
679 | return false; | ||
680 | |||
681 | /* If there is a count for this node already, it must be acceptable */ | ||
682 | if (khugepaged_node_load[nid]) | ||
683 | return false; | ||
684 | |||
685 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
686 | if (!khugepaged_node_load[i]) | ||
687 | continue; | ||
688 | if (node_distance(nid, i) > RECLAIM_DISTANCE) | ||
689 | return true; | ||
690 | } | ||
691 | return false; | ||
692 | } | ||
693 | |||
694 | /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ | ||
695 | static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) | ||
696 | { | ||
697 | return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); | ||
698 | } | ||
699 | |||
700 | #ifdef CONFIG_NUMA | ||
701 | static int khugepaged_find_target_node(void) | ||
702 | { | ||
703 | static int last_khugepaged_target_node = NUMA_NO_NODE; | ||
704 | int nid, target_node = 0, max_value = 0; | ||
705 | |||
706 | /* find first node with max normal pages hit */ | ||
707 | for (nid = 0; nid < MAX_NUMNODES; nid++) | ||
708 | if (khugepaged_node_load[nid] > max_value) { | ||
709 | max_value = khugepaged_node_load[nid]; | ||
710 | target_node = nid; | ||
711 | } | ||
712 | |||
713 | /* do some balancing if several nodes have the same hit count */ | ||
714 | if (target_node <= last_khugepaged_target_node) | ||
715 | for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; | ||
716 | nid++) | ||
717 | if (max_value == khugepaged_node_load[nid]) { | ||
718 | target_node = nid; | ||
719 | break; | ||
720 | } | ||
721 | |||
722 | last_khugepaged_target_node = target_node; | ||
723 | return target_node; | ||
724 | } | ||
725 | |||
726 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | ||
727 | { | ||
728 | if (IS_ERR(*hpage)) { | ||
729 | if (!*wait) | ||
730 | return false; | ||
731 | |||
732 | *wait = false; | ||
733 | *hpage = NULL; | ||
734 | khugepaged_alloc_sleep(); | ||
735 | } else if (*hpage) { | ||
736 | put_page(*hpage); | ||
737 | *hpage = NULL; | ||
738 | } | ||
739 | |||
740 | return true; | ||
741 | } | ||
742 | |||
743 | static struct page * | ||
744 | khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) | ||
745 | { | ||
746 | VM_BUG_ON_PAGE(*hpage, *hpage); | ||
747 | |||
748 | *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); | ||
749 | if (unlikely(!*hpage)) { | ||
750 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
751 | *hpage = ERR_PTR(-ENOMEM); | ||
752 | return NULL; | ||
753 | } | ||
754 | |||
755 | prep_transhuge_page(*hpage); | ||
756 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
757 | return *hpage; | ||
758 | } | ||
759 | #else | ||
760 | static int khugepaged_find_target_node(void) | ||
761 | { | ||
762 | return 0; | ||
763 | } | ||
764 | |||
765 | static inline struct page *alloc_khugepaged_hugepage(void) | ||
766 | { | ||
767 | struct page *page; | ||
768 | |||
769 | page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), | ||
770 | HPAGE_PMD_ORDER); | ||
771 | if (page) | ||
772 | prep_transhuge_page(page); | ||
773 | return page; | ||
774 | } | ||
775 | |||
776 | static struct page *khugepaged_alloc_hugepage(bool *wait) | ||
777 | { | ||
778 | struct page *hpage; | ||
779 | |||
780 | do { | ||
781 | hpage = alloc_khugepaged_hugepage(); | ||
782 | if (!hpage) { | ||
783 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | ||
784 | if (!*wait) | ||
785 | return NULL; | ||
786 | |||
787 | *wait = false; | ||
788 | khugepaged_alloc_sleep(); | ||
789 | } else | ||
790 | count_vm_event(THP_COLLAPSE_ALLOC); | ||
791 | } while (unlikely(!hpage) && likely(khugepaged_enabled())); | ||
792 | |||
793 | return hpage; | ||
794 | } | ||
795 | |||
796 | static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | ||
797 | { | ||
798 | if (!*hpage) | ||
799 | *hpage = khugepaged_alloc_hugepage(wait); | ||
800 | |||
801 | if (unlikely(!*hpage)) | ||
802 | return false; | ||
803 | |||
804 | return true; | ||
805 | } | ||
806 | |||
807 | static struct page * | ||
808 | khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) | ||
809 | { | ||
810 | VM_BUG_ON(!*hpage); | ||
811 | |||
812 | return *hpage; | ||
813 | } | ||
814 | #endif | ||
815 | |||
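The THP_COLLAPSE_ALLOC and THP_COLLAPSE_ALLOC_FAILED events counted in both allocation paths above are exported through /proc/vmstat (as thp_collapse_alloc and thp_collapse_alloc_failed on kernels with CONFIG_TRANSPARENT_HUGEPAGE). A small sketch that reads them back:

    #include <stdio.h>
    #include <string.h>

    /* Look up one counter by name in /proc/vmstat; returns -1 if absent. */
    static long vmstat_counter(const char *name)
    {
            FILE *f = fopen("/proc/vmstat", "r");
            char key[64];
            long val;

            if (!f)
                    return -1;
            while (fscanf(f, "%63s %ld", key, &val) == 2) {
                    if (!strcmp(key, name)) {
                            fclose(f);
                            return val;
                    }
            }
            fclose(f);
            return -1;
    }

    int main(void)
    {
            printf("thp_collapse_alloc        = %ld\n",
                   vmstat_counter("thp_collapse_alloc"));
            printf("thp_collapse_alloc_failed = %ld\n",
                   vmstat_counter("thp_collapse_alloc_failed"));
            return 0;
    }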
816 | static bool hugepage_vma_check(struct vm_area_struct *vma) | ||
817 | { | ||
818 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || | ||
819 | (vma->vm_flags & VM_NOHUGEPAGE)) | ||
820 | return false; | ||
821 | if (shmem_file(vma->vm_file)) { | ||
822 | if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) | ||
823 | return false; | ||
824 | return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, | ||
825 | HPAGE_PMD_NR); | ||
826 | } | ||
827 | if (!vma->anon_vma || vma->vm_ops) | ||
828 | return false; | ||
829 | if (is_vma_temporary_stack(vma)) | ||
830 | return false; | ||
831 | return !(vma->vm_flags & VM_NO_KHUGEPAGED); | ||
832 | } | ||
833 | |||
834 | /* | ||
835 | * If the mmap_sem was temporarily dropped, revalidate the vma | ||
836 | * before taking mmap_sem again. | ||
837 | * Returns 0 on success, otherwise a non-zero | ||
838 | * scan result code. | ||
839 | */ | ||
840 | |||
841 | static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address) | ||
842 | { | ||
843 | struct vm_area_struct *vma; | ||
844 | unsigned long hstart, hend; | ||
845 | |||
846 | if (unlikely(khugepaged_test_exit(mm))) | ||
847 | return SCAN_ANY_PROCESS; | ||
848 | |||
849 | vma = find_vma(mm, address); | ||
850 | if (!vma) | ||
851 | return SCAN_VMA_NULL; | ||
852 | |||
853 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
854 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
855 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) | ||
856 | return SCAN_ADDRESS_RANGE; | ||
857 | if (!hugepage_vma_check(vma)) | ||
858 | return SCAN_VMA_CHECK; | ||
859 | return 0; | ||
860 | } | ||
861 | |||
862 | /* | ||
863 | * Bring missing pages in from swap, to complete THP collapse. | ||
864 | * Only done if khugepaged_scan_pmd believes it is worthwhile. | ||
865 | * | ||
866 | * Called and returns without pte mapped or spinlocks held, | ||
867 | * but with mmap_sem held to protect against vma changes. | ||
868 | */ | ||
869 | |||
870 | static bool __collapse_huge_page_swapin(struct mm_struct *mm, | ||
871 | struct vm_area_struct *vma, | ||
872 | unsigned long address, pmd_t *pmd, | ||
873 | int referenced) | ||
874 | { | ||
875 | pte_t pteval; | ||
876 | int swapped_in = 0, ret = 0; | ||
877 | struct fault_env fe = { | ||
878 | .vma = vma, | ||
879 | .address = address, | ||
880 | .flags = FAULT_FLAG_ALLOW_RETRY, | ||
881 | .pmd = pmd, | ||
882 | }; | ||
883 | |||
884 | fe.pte = pte_offset_map(pmd, address); | ||
885 | for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; | ||
886 | fe.pte++, fe.address += PAGE_SIZE) { | ||
887 | pteval = *fe.pte; | ||
888 | if (!is_swap_pte(pteval)) | ||
889 | continue; | ||
890 | swapped_in++; | ||
891 | /* we only decide to swap in if there are enough young ptes */ | ||
892 | if (referenced < HPAGE_PMD_NR/2) { | ||
893 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); | ||
894 | return false; | ||
895 | } | ||
896 | ret = do_swap_page(&fe, pteval); | ||
897 | |||
898 | /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ | ||
899 | if (ret & VM_FAULT_RETRY) { | ||
900 | down_read(&mm->mmap_sem); | ||
901 | if (hugepage_vma_revalidate(mm, address)) { | ||
902 | /* vma is no longer available, don't continue to swapin */ | ||
903 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); | ||
904 | return false; | ||
905 | } | ||
906 | /* check if the pmd is still valid */ | ||
907 | if (mm_find_pmd(mm, address) != pmd) | ||
908 | return false; | ||
909 | } | ||
910 | if (ret & VM_FAULT_ERROR) { | ||
911 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); | ||
912 | return false; | ||
913 | } | ||
914 | /* pte is unmapped now, we need to map it */ | ||
915 | fe.pte = pte_offset_map(pmd, fe.address); | ||
916 | } | ||
917 | fe.pte--; | ||
918 | pte_unmap(fe.pte); | ||
919 | trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); | ||
920 | return true; | ||
921 | } | ||
922 | |||
923 | static void collapse_huge_page(struct mm_struct *mm, | ||
924 | unsigned long address, | ||
925 | struct page **hpage, | ||
926 | struct vm_area_struct *vma, | ||
927 | int node, int referenced) | ||
928 | { | ||
929 | pmd_t *pmd, _pmd; | ||
930 | pte_t *pte; | ||
931 | pgtable_t pgtable; | ||
932 | struct page *new_page; | ||
933 | spinlock_t *pmd_ptl, *pte_ptl; | ||
934 | int isolated = 0, result = 0; | ||
935 | struct mem_cgroup *memcg; | ||
936 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
937 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
938 | gfp_t gfp; | ||
939 | |||
940 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
941 | |||
942 | /* Only allocate from the target node */ | ||
943 | gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; | ||
944 | |||
945 | /* | ||
946 | * Before allocating the hugepage, release the mmap_sem read lock. | ||
947 | * The allocation can take potentially a long time if it involves | ||
948 | * sync compaction, and we do not need to hold the mmap_sem during | ||
949 | * that. We will recheck the vma after taking it again in write mode. | ||
950 | */ | ||
951 | up_read(&mm->mmap_sem); | ||
952 | new_page = khugepaged_alloc_page(hpage, gfp, node); | ||
953 | if (!new_page) { | ||
954 | result = SCAN_ALLOC_HUGE_PAGE_FAIL; | ||
955 | goto out_nolock; | ||
956 | } | ||
957 | |||
958 | if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { | ||
959 | result = SCAN_CGROUP_CHARGE_FAIL; | ||
960 | goto out_nolock; | ||
961 | } | ||
962 | |||
963 | down_read(&mm->mmap_sem); | ||
964 | result = hugepage_vma_revalidate(mm, address); | ||
965 | if (result) { | ||
966 | mem_cgroup_cancel_charge(new_page, memcg, true); | ||
967 | up_read(&mm->mmap_sem); | ||
968 | goto out_nolock; | ||
969 | } | ||
970 | |||
971 | pmd = mm_find_pmd(mm, address); | ||
972 | if (!pmd) { | ||
973 | result = SCAN_PMD_NULL; | ||
974 | mem_cgroup_cancel_charge(new_page, memcg, true); | ||
975 | up_read(&mm->mmap_sem); | ||
976 | goto out_nolock; | ||
977 | } | ||
978 | |||
979 | /* | ||
980 | * __collapse_huge_page_swapin always returns with mmap_sem locked. | ||
981 | * If it fails, we release mmap_sem and jump to out_nolock. | ||
982 | * Continuing to collapse causes inconsistency. | ||
983 | */ | ||
984 | if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) { | ||
985 | mem_cgroup_cancel_charge(new_page, memcg, true); | ||
986 | up_read(&mm->mmap_sem); | ||
987 | goto out_nolock; | ||
988 | } | ||
989 | |||
990 | up_read(&mm->mmap_sem); | ||
991 | /* | ||
992 | * Prevent all access to pagetables with the exception of | ||
993 | * gup_fast later handled by the ptep_clear_flush and the VM | ||
994 | * handled by the anon_vma lock + PG_lock. | ||
995 | */ | ||
996 | down_write(&mm->mmap_sem); | ||
997 | result = hugepage_vma_revalidate(mm, address); | ||
998 | if (result) | ||
999 | goto out; | ||
1000 | /* check if the pmd is still valid */ | ||
1001 | if (mm_find_pmd(mm, address) != pmd) | ||
1002 | goto out; | ||
1003 | |||
1004 | anon_vma_lock_write(vma->anon_vma); | ||
1005 | |||
1006 | pte = pte_offset_map(pmd, address); | ||
1007 | pte_ptl = pte_lockptr(mm, pmd); | ||
1008 | |||
1009 | mmun_start = address; | ||
1010 | mmun_end = address + HPAGE_PMD_SIZE; | ||
1011 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1012 | pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ | ||
1013 | /* | ||
1014 | * After this gup_fast can't run anymore. This also removes | ||
1015 | * any huge TLB entry from the CPU so we won't allow | ||
1016 | * huge and small TLB entries for the same virtual address | ||
1017 | * to avoid the risk of CPU bugs in that area. | ||
1018 | */ | ||
1019 | _pmd = pmdp_collapse_flush(vma, address, pmd); | ||
1020 | spin_unlock(pmd_ptl); | ||
1021 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1022 | |||
1023 | spin_lock(pte_ptl); | ||
1024 | isolated = __collapse_huge_page_isolate(vma, address, pte); | ||
1025 | spin_unlock(pte_ptl); | ||
1026 | |||
1027 | if (unlikely(!isolated)) { | ||
1028 | pte_unmap(pte); | ||
1029 | spin_lock(pmd_ptl); | ||
1030 | BUG_ON(!pmd_none(*pmd)); | ||
1031 | /* | ||
1032 | * We can only use set_pmd_at when establishing | ||
1033 | * hugepmds and never for establishing regular pmds that | ||
1034 | * points to regular pagetables. Use pmd_populate for that | ||
1035 | */ | ||
1036 | pmd_populate(mm, pmd, pmd_pgtable(_pmd)); | ||
1037 | spin_unlock(pmd_ptl); | ||
1038 | anon_vma_unlock_write(vma->anon_vma); | ||
1039 | result = SCAN_FAIL; | ||
1040 | goto out; | ||
1041 | } | ||
1042 | |||
1043 | /* | ||
1044 | * All pages are isolated and locked so anon_vma rmap | ||
1045 | * can't run anymore. | ||
1046 | */ | ||
1047 | anon_vma_unlock_write(vma->anon_vma); | ||
1048 | |||
1049 | __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); | ||
1050 | pte_unmap(pte); | ||
1051 | __SetPageUptodate(new_page); | ||
1052 | pgtable = pmd_pgtable(_pmd); | ||
1053 | |||
1054 | _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); | ||
1055 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | ||
1056 | |||
1057 | /* | ||
1058 | * spin_lock() below is not the equivalent of smp_wmb(), so | ||
1059 | * this is needed to prevent the copy_huge_page writes from becoming | ||
1060 | * visible after the set_pmd_at() write. | ||
1061 | */ | ||
1062 | smp_wmb(); | ||
1063 | |||
1064 | spin_lock(pmd_ptl); | ||
1065 | BUG_ON(!pmd_none(*pmd)); | ||
1066 | page_add_new_anon_rmap(new_page, vma, address, true); | ||
1067 | mem_cgroup_commit_charge(new_page, memcg, false, true); | ||
1068 | lru_cache_add_active_or_unevictable(new_page, vma); | ||
1069 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | ||
1070 | set_pmd_at(mm, address, pmd, _pmd); | ||
1071 | update_mmu_cache_pmd(vma, address, pmd); | ||
1072 | spin_unlock(pmd_ptl); | ||
1073 | |||
1074 | *hpage = NULL; | ||
1075 | |||
1076 | khugepaged_pages_collapsed++; | ||
1077 | result = SCAN_SUCCEED; | ||
1078 | out_up_write: | ||
1079 | up_write(&mm->mmap_sem); | ||
1080 | out_nolock: | ||
1081 | trace_mm_collapse_huge_page(mm, isolated, result); | ||
1082 | return; | ||
1083 | out: | ||
1084 | mem_cgroup_cancel_charge(new_page, memcg, true); | ||
1085 | goto out_up_write; | ||
1086 | } | ||
1087 | |||
1088 | static int khugepaged_scan_pmd(struct mm_struct *mm, | ||
1089 | struct vm_area_struct *vma, | ||
1090 | unsigned long address, | ||
1091 | struct page **hpage) | ||
1092 | { | ||
1093 | pmd_t *pmd; | ||
1094 | pte_t *pte, *_pte; | ||
1095 | int ret = 0, none_or_zero = 0, result = 0, referenced = 0; | ||
1096 | struct page *page = NULL; | ||
1097 | unsigned long _address; | ||
1098 | spinlock_t *ptl; | ||
1099 | int node = NUMA_NO_NODE, unmapped = 0; | ||
1100 | bool writable = false; | ||
1101 | |||
1102 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
1103 | |||
1104 | pmd = mm_find_pmd(mm, address); | ||
1105 | if (!pmd) { | ||
1106 | result = SCAN_PMD_NULL; | ||
1107 | goto out; | ||
1108 | } | ||
1109 | |||
1110 | memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); | ||
1111 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
1112 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; | ||
1113 | _pte++, _address += PAGE_SIZE) { | ||
1114 | pte_t pteval = *_pte; | ||
1115 | if (is_swap_pte(pteval)) { | ||
1116 | if (++unmapped <= khugepaged_max_ptes_swap) { | ||
1117 | continue; | ||
1118 | } else { | ||
1119 | result = SCAN_EXCEED_SWAP_PTE; | ||
1120 | goto out_unmap; | ||
1121 | } | ||
1122 | } | ||
1123 | if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { | ||
1124 | if (!userfaultfd_armed(vma) && | ||
1125 | ++none_or_zero <= khugepaged_max_ptes_none) { | ||
1126 | continue; | ||
1127 | } else { | ||
1128 | result = SCAN_EXCEED_NONE_PTE; | ||
1129 | goto out_unmap; | ||
1130 | } | ||
1131 | } | ||
1132 | if (!pte_present(pteval)) { | ||
1133 | result = SCAN_PTE_NON_PRESENT; | ||
1134 | goto out_unmap; | ||
1135 | } | ||
1136 | if (pte_write(pteval)) | ||
1137 | writable = true; | ||
1138 | |||
1139 | page = vm_normal_page(vma, _address, pteval); | ||
1140 | if (unlikely(!page)) { | ||
1141 | result = SCAN_PAGE_NULL; | ||
1142 | goto out_unmap; | ||
1143 | } | ||
1144 | |||
1145 | /* TODO: teach khugepaged to collapse THP mapped with pte */ | ||
1146 | if (PageCompound(page)) { | ||
1147 | result = SCAN_PAGE_COMPOUND; | ||
1148 | goto out_unmap; | ||
1149 | } | ||
1150 | |||
1151 | /* | ||
1152 | * Record which node the original page is from and save this | ||
1153 | * information to khugepaged_node_load[]. | ||
1154 | * Khugepaged will allocate the hugepage from the node with | ||
1155 | * the most hits. | ||
1156 | */ | ||
1157 | node = page_to_nid(page); | ||
1158 | if (khugepaged_scan_abort(node)) { | ||
1159 | result = SCAN_SCAN_ABORT; | ||
1160 | goto out_unmap; | ||
1161 | } | ||
1162 | khugepaged_node_load[node]++; | ||
1163 | if (!PageLRU(page)) { | ||
1164 | result = SCAN_PAGE_LRU; | ||
1165 | goto out_unmap; | ||
1166 | } | ||
1167 | if (PageLocked(page)) { | ||
1168 | result = SCAN_PAGE_LOCK; | ||
1169 | goto out_unmap; | ||
1170 | } | ||
1171 | if (!PageAnon(page)) { | ||
1172 | result = SCAN_PAGE_ANON; | ||
1173 | goto out_unmap; | ||
1174 | } | ||
1175 | |||
1176 | /* | ||
1177 | * cannot use mapcount: can't collapse if there's a gup pin. | ||
1178 | * The page must only be referenced by the scanned process | ||
1179 | * and page swap cache. | ||
1180 | */ | ||
1181 | if (page_count(page) != 1 + !!PageSwapCache(page)) { | ||
1182 | result = SCAN_PAGE_COUNT; | ||
1183 | goto out_unmap; | ||
1184 | } | ||
1185 | if (pte_young(pteval) || | ||
1186 | page_is_young(page) || PageReferenced(page) || | ||
1187 | mmu_notifier_test_young(vma->vm_mm, address)) | ||
1188 | referenced++; | ||
1189 | } | ||
1190 | if (writable) { | ||
1191 | if (referenced) { | ||
1192 | result = SCAN_SUCCEED; | ||
1193 | ret = 1; | ||
1194 | } else { | ||
1195 | result = SCAN_LACK_REFERENCED_PAGE; | ||
1196 | } | ||
1197 | } else { | ||
1198 | result = SCAN_PAGE_RO; | ||
1199 | } | ||
1200 | out_unmap: | ||
1201 | pte_unmap_unlock(pte, ptl); | ||
1202 | if (ret) { | ||
1203 | node = khugepaged_find_target_node(); | ||
1204 | /* collapse_huge_page will return with the mmap_sem released */ | ||
1205 | collapse_huge_page(mm, address, hpage, vma, node, referenced); | ||
1206 | } | ||
1207 | out: | ||
1208 | trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, | ||
1209 | none_or_zero, result, unmapped); | ||
1210 | return ret; | ||
1211 | } | ||
1212 | |||
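khugepaged_scan_pmd() only returns whether a collapse was attempted; from userspace the net effect is easiest to observe through the AnonHugePages field of /proc/<pid>/smaps (or the pages_collapsed counter shown earlier). A small sketch that sums AnonHugePages for the current process:

    #include <stdio.h>

    /* Sum every "AnonHugePages:" line in /proc/self/smaps, in kB. */
    static long anon_huge_kb(void)
    {
            FILE *f = fopen("/proc/self/smaps", "r");
            char line[256];
            long total = 0, kb;

            if (!f)
                    return -1;
            while (fgets(line, sizeof(line), f)) {
                    if (sscanf(line, "AnonHugePages: %ld kB", &kb) == 1)
                            total += kb;
            }
            fclose(f);
            return total;
    }

    int main(void)
    {
            printf("AnonHugePages total: %ld kB\n", anon_huge_kb());
            return 0;
    }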
1213 | static void collect_mm_slot(struct mm_slot *mm_slot) | ||
1214 | { | ||
1215 | struct mm_struct *mm = mm_slot->mm; | ||
1216 | |||
1217 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); | ||
1218 | |||
1219 | if (khugepaged_test_exit(mm)) { | ||
1220 | /* free mm_slot */ | ||
1221 | hash_del(&mm_slot->hash); | ||
1222 | list_del(&mm_slot->mm_node); | ||
1223 | |||
1224 | /* | ||
1225 | * Not strictly needed because the mm exited already. | ||
1226 | * | ||
1227 | * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | ||
1228 | */ | ||
1229 | |||
1230 | /* khugepaged_mm_lock actually not necessary for the below */ | ||
1231 | free_mm_slot(mm_slot); | ||
1232 | mmdrop(mm); | ||
1233 | } | ||
1234 | } | ||
1235 | |||
1236 | #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) | ||
1237 | static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) | ||
1238 | { | ||
1239 | struct vm_area_struct *vma; | ||
1240 | unsigned long addr; | ||
1241 | pmd_t *pmd, _pmd; | ||
1242 | |||
1243 | i_mmap_lock_write(mapping); | ||
1244 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { | ||
1245 | /* probably overkill */ | ||
1246 | if (vma->anon_vma) | ||
1247 | continue; | ||
1248 | addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | ||
1249 | if (addr & ~HPAGE_PMD_MASK) | ||
1250 | continue; | ||
1251 | if (vma->vm_end < addr + HPAGE_PMD_SIZE) | ||
1252 | continue; | ||
1253 | pmd = mm_find_pmd(vma->vm_mm, addr); | ||
1254 | if (!pmd) | ||
1255 | continue; | ||
1256 | /* | ||
1257 | * We need the exclusive mmap_sem to retract the page table. | ||
1258 | * If the trylock fails we would end up with a pte-mapped THP after | ||
1259 | * re-fault. Not ideal, but it's more important to not disturb | ||
1260 | * the system too much. | ||
1261 | */ | ||
1262 | if (down_write_trylock(&vma->vm_mm->mmap_sem)) { | ||
1263 | spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); | ||
1264 | /* assume page table is clear */ | ||
1265 | _pmd = pmdp_collapse_flush(vma, addr, pmd); | ||
1266 | spin_unlock(ptl); | ||
1267 | up_write(&vma->vm_mm->mmap_sem); | ||
1268 | atomic_long_dec(&vma->vm_mm->nr_ptes); | ||
1269 | pte_free(vma->vm_mm, pmd_pgtable(_pmd)); | ||
1270 | } | ||
1271 | } | ||
1272 | i_mmap_unlock_write(mapping); | ||
1273 | } | ||
1274 | |||
1275 | /** | ||
1276 | * collapse_shmem - collapse small tmpfs/shmem pages into a huge one. | ||
1277 | * | ||
1278 | * Basic scheme is simple, details are more complex: | ||
1279 | * - allocate and freeze a new huge page; | ||
1280 | * - scan over the radix tree, replacing old pages with the new one | ||
1281 | * + swap in pages if necessary; | ||
1282 | * + fill in gaps; | ||
1283 | * + keep old pages around in case rollback is required; | ||
1284 | * - if replacing succeeds: | ||
1285 | * + copy data over; | ||
1286 | * + free old pages; | ||
1287 | * + unfreeze huge page; | ||
1288 | * - if replacing fails: | ||
1289 | * + put all pages back and unfreeze them; | ||
1290 | * + restore gaps in the radix-tree; | ||
1291 | * + free huge page; | ||
1292 | */ | ||
1293 | static void collapse_shmem(struct mm_struct *mm, | ||
1294 | struct address_space *mapping, pgoff_t start, | ||
1295 | struct page **hpage, int node) | ||
1296 | { | ||
1297 | gfp_t gfp; | ||
1298 | struct page *page, *new_page, *tmp; | ||
1299 | struct mem_cgroup *memcg; | ||
1300 | pgoff_t index, end = start + HPAGE_PMD_NR; | ||
1301 | LIST_HEAD(pagelist); | ||
1302 | struct radix_tree_iter iter; | ||
1303 | void **slot; | ||
1304 | int nr_none = 0, result = SCAN_SUCCEED; | ||
1305 | |||
1306 | VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); | ||
1307 | |||
1308 | /* Only allocate from the target node */ | ||
1309 | gfp = alloc_hugepage_khugepaged_gfpmask() | | ||
1310 | __GFP_OTHER_NODE | __GFP_THISNODE; | ||
1311 | |||
1312 | new_page = khugepaged_alloc_page(hpage, gfp, node); | ||
1313 | if (!new_page) { | ||
1314 | result = SCAN_ALLOC_HUGE_PAGE_FAIL; | ||
1315 | goto out; | ||
1316 | } | ||
1317 | |||
1318 | if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { | ||
1319 | result = SCAN_CGROUP_CHARGE_FAIL; | ||
1320 | goto out; | ||
1321 | } | ||
1322 | |||
1323 | new_page->index = start; | ||
1324 | new_page->mapping = mapping; | ||
1325 | __SetPageSwapBacked(new_page); | ||
1326 | __SetPageLocked(new_page); | ||
1327 | BUG_ON(!page_ref_freeze(new_page, 1)); | ||
1328 | |||
1329 | |||
1330 | /* | ||
1331 | * At this point the new_page is 'frozen' (page_count() is zero), locked | ||
1332 | * and not up-to-date. It's safe to insert it into the radix tree, because | ||
1333 | * nobody will be able to map it or use it in any other way until we | ||
1334 | * unfreeze it. | ||
1335 | */ | ||
1336 | |||
1337 | index = start; | ||
1338 | spin_lock_irq(&mapping->tree_lock); | ||
1339 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { | ||
1340 | int n = min(iter.index, end) - index; | ||
1341 | |||
1342 | /* | ||
1343 | * Handle holes in the radix tree: charge them to shmem and | ||
1344 | * insert the relevant subpage of new_page into the radix tree. | ||
1345 | */ | ||
1346 | if (n && !shmem_charge(mapping->host, n)) { | ||
1347 | result = SCAN_FAIL; | ||
1348 | break; | ||
1349 | } | ||
1350 | nr_none += n; | ||
1351 | for (; index < min(iter.index, end); index++) { | ||
1352 | radix_tree_insert(&mapping->page_tree, index, | ||
1353 | new_page + (index % HPAGE_PMD_NR)); | ||
1354 | } | ||
1355 | |||
1356 | /* We are done. */ | ||
1357 | if (index >= end) | ||
1358 | break; | ||
1359 | |||
1360 | page = radix_tree_deref_slot_protected(slot, | ||
1361 | &mapping->tree_lock); | ||
1362 | if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { | ||
1363 | spin_unlock_irq(&mapping->tree_lock); | ||
1364 | /* swap in or instantiate fallocated page */ | ||
1365 | if (shmem_getpage(mapping->host, index, &page, | ||
1366 | SGP_NOHUGE)) { | ||
1367 | result = SCAN_FAIL; | ||
1368 | goto tree_unlocked; | ||
1369 | } | ||
1370 | spin_lock_irq(&mapping->tree_lock); | ||
1371 | } else if (trylock_page(page)) { | ||
1372 | get_page(page); | ||
1373 | } else { | ||
1374 | result = SCAN_PAGE_LOCK; | ||
1375 | break; | ||
1376 | } | ||
1377 | |||
1378 | /* | ||
1379 | * The page must be locked, so we can drop the tree_lock | ||
1380 | * without racing with truncate. | ||
1381 | */ | ||
1382 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
1383 | VM_BUG_ON_PAGE(!PageUptodate(page), page); | ||
1384 | VM_BUG_ON_PAGE(PageTransCompound(page), page); | ||
1385 | |||
1386 | if (page_mapping(page) != mapping) { | ||
1387 | result = SCAN_TRUNCATED; | ||
1388 | goto out_unlock; | ||
1389 | } | ||
1390 | spin_unlock_irq(&mapping->tree_lock); | ||
1391 | |||
1392 | if (isolate_lru_page(page)) { | ||
1393 | result = SCAN_DEL_PAGE_LRU; | ||
1394 | goto out_isolate_failed; | ||
1395 | } | ||
1396 | |||
1397 | if (page_mapped(page)) | ||
1398 | unmap_mapping_range(mapping, index << PAGE_SHIFT, | ||
1399 | PAGE_SIZE, 0); | ||
1400 | |||
1401 | spin_lock_irq(&mapping->tree_lock); | ||
1402 | |||
1403 | VM_BUG_ON_PAGE(page_mapped(page), page); | ||
1404 | |||
1405 | /* | ||
1406 | * The page is expected to have page_count() == 3: | ||
1407 | * - we hold a pin on it; | ||
1408 | * - one reference from radix tree; | ||
1409 | * - one from isolate_lru_page; | ||
1410 | */ | ||
1411 | if (!page_ref_freeze(page, 3)) { | ||
1412 | result = SCAN_PAGE_COUNT; | ||
1413 | goto out_lru; | ||
1414 | } | ||
1415 | |||
1416 | /* | ||
1417 | * Add the page to the list to be able to undo the collapse if | ||
1418 | * something goes wrong. | ||
1419 | */ | ||
1420 | list_add_tail(&page->lru, &pagelist); | ||
1421 | |||
1422 | /* Finally, replace with the new page. */ | ||
1423 | radix_tree_replace_slot(slot, | ||
1424 | new_page + (index % HPAGE_PMD_NR)); | ||
1425 | |||
1426 | index++; | ||
1427 | continue; | ||
1428 | out_lru: | ||
1429 | spin_unlock_irq(&mapping->tree_lock); | ||
1430 | putback_lru_page(page); | ||
1431 | out_isolate_failed: | ||
1432 | unlock_page(page); | ||
1433 | put_page(page); | ||
1434 | goto tree_unlocked; | ||
1435 | out_unlock: | ||
1436 | unlock_page(page); | ||
1437 | put_page(page); | ||
1438 | break; | ||
1439 | } | ||
1440 | |||
1441 | /* | ||
1442 | * Handle hole in radix tree at the end of the range. | ||
1443 | * This code only triggers if there's nothing in radix tree | ||
1444 | * beyond 'end'. | ||
1445 | */ | ||
1446 | if (result == SCAN_SUCCEED && index < end) { | ||
1447 | int n = end - index; | ||
1448 | |||
1449 | if (!shmem_charge(mapping->host, n)) { | ||
1450 | result = SCAN_FAIL; | ||
1451 | goto tree_locked; | ||
1452 | } | ||
1453 | |||
1454 | for (; index < end; index++) { | ||
1455 | radix_tree_insert(&mapping->page_tree, index, | ||
1456 | new_page + (index % HPAGE_PMD_NR)); | ||
1457 | } | ||
1458 | nr_none += n; | ||
1459 | } | ||
1460 | |||
1461 | tree_locked: | ||
1462 | spin_unlock_irq(&mapping->tree_lock); | ||
1463 | tree_unlocked: | ||
1464 | |||
1465 | if (result == SCAN_SUCCEED) { | ||
1466 | unsigned long flags; | ||
1467 | struct zone *zone = page_zone(new_page); | ||
1468 | |||
1469 | /* | ||
1470 | * Replacing the old pages with the new one has succeeded, now we need to | ||
1471 | * copy the contents over and free the old pages. | ||
1472 | */ | ||
1473 | list_for_each_entry_safe(page, tmp, &pagelist, lru) { | ||
1474 | copy_highpage(new_page + (page->index % HPAGE_PMD_NR), | ||
1475 | page); | ||
1476 | list_del(&page->lru); | ||
1477 | unlock_page(page); | ||
1478 | page_ref_unfreeze(page, 1); | ||
1479 | page->mapping = NULL; | ||
1480 | ClearPageActive(page); | ||
1481 | ClearPageUnevictable(page); | ||
1482 | put_page(page); | ||
1483 | } | ||
1484 | |||
1485 | local_irq_save(flags); | ||
1486 | __inc_zone_page_state(new_page, NR_SHMEM_THPS); | ||
1487 | if (nr_none) { | ||
1488 | __mod_zone_page_state(zone, NR_FILE_PAGES, nr_none); | ||
1489 | __mod_zone_page_state(zone, NR_SHMEM, nr_none); | ||
1490 | } | ||
1491 | local_irq_restore(flags); | ||
1492 | |||
1493 | /* | ||
1494 | * Remove the pte page tables, so we can re-fault | ||
1495 | * the page as huge. | ||
1496 | */ | ||
1497 | retract_page_tables(mapping, start); | ||
1498 | |||
1499 | /* Everything is ready, let's unfreeze the new_page */ | ||
1500 | set_page_dirty(new_page); | ||
1501 | SetPageUptodate(new_page); | ||
1502 | page_ref_unfreeze(new_page, HPAGE_PMD_NR); | ||
1503 | mem_cgroup_commit_charge(new_page, memcg, false, true); | ||
1504 | lru_cache_add_anon(new_page); | ||
1505 | unlock_page(new_page); | ||
1506 | |||
1507 | *hpage = NULL; | ||
1508 | } else { | ||
1509 | /* Something went wrong: rollback changes to the radix-tree */ | ||
1510 | shmem_uncharge(mapping->host, nr_none); | ||
1511 | spin_lock_irq(&mapping->tree_lock); | ||
1512 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, | ||
1513 | start) { | ||
1514 | if (iter.index >= end) | ||
1515 | break; | ||
1516 | page = list_first_entry_or_null(&pagelist, | ||
1517 | struct page, lru); | ||
1518 | if (!page || iter.index < page->index) { | ||
1519 | if (!nr_none) | ||
1520 | break; | ||
1521 | /* Put holes back where they were */ | ||
1522 | radix_tree_replace_slot(slot, NULL); | ||
1523 | nr_none--; | ||
1524 | continue; | ||
1525 | } | ||
1526 | |||
1527 | VM_BUG_ON_PAGE(page->index != iter.index, page); | ||
1528 | |||
1529 | /* Unfreeze the page. */ | ||
1530 | list_del(&page->lru); | ||
1531 | page_ref_unfreeze(page, 2); | ||
1532 | radix_tree_replace_slot(slot, page); | ||
1533 | spin_unlock_irq(&mapping->tree_lock); | ||
1534 | putback_lru_page(page); | ||
1535 | unlock_page(page); | ||
1536 | spin_lock_irq(&mapping->tree_lock); | ||
1537 | } | ||
1538 | VM_BUG_ON(nr_none); | ||
1539 | spin_unlock_irq(&mapping->tree_lock); | ||
1540 | |||
1541 | /* Unfreeze new_page, the caller will take care of freeing it */ | ||
1542 | page_ref_unfreeze(new_page, 1); | ||
1543 | mem_cgroup_cancel_charge(new_page, memcg, true); | ||
1544 | unlock_page(new_page); | ||
1545 | new_page->mapping = NULL; | ||
1546 | } | ||
1547 | out: | ||
1548 | VM_BUG_ON(!list_empty(&pagelist)); | ||
1549 | /* TODO: tracepoints */ | ||
1550 | } | ||
1551 | |||
1552 | static void khugepaged_scan_shmem(struct mm_struct *mm, | ||
1553 | struct address_space *mapping, | ||
1554 | pgoff_t start, struct page **hpage) | ||
1555 | { | ||
1556 | struct page *page = NULL; | ||
1557 | struct radix_tree_iter iter; | ||
1558 | void **slot; | ||
1559 | int present, swap; | ||
1560 | int node = NUMA_NO_NODE; | ||
1561 | int result = SCAN_SUCCEED; | ||
1562 | |||
1563 | present = 0; | ||
1564 | swap = 0; | ||
1565 | memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); | ||
1566 | rcu_read_lock(); | ||
1567 | radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { | ||
1568 | if (iter.index >= start + HPAGE_PMD_NR) | ||
1569 | break; | ||
1570 | |||
1571 | page = radix_tree_deref_slot(slot); | ||
1572 | if (radix_tree_deref_retry(page)) { | ||
1573 | slot = radix_tree_iter_retry(&iter); | ||
1574 | continue; | ||
1575 | } | ||
1576 | |||
1577 | if (radix_tree_exception(page)) { | ||
1578 | if (++swap > khugepaged_max_ptes_swap) { | ||
1579 | result = SCAN_EXCEED_SWAP_PTE; | ||
1580 | break; | ||
1581 | } | ||
1582 | continue; | ||
1583 | } | ||
1584 | |||
1585 | if (PageTransCompound(page)) { | ||
1586 | result = SCAN_PAGE_COMPOUND; | ||
1587 | break; | ||
1588 | } | ||
1589 | |||
1590 | node = page_to_nid(page); | ||
1591 | if (khugepaged_scan_abort(node)) { | ||
1592 | result = SCAN_SCAN_ABORT; | ||
1593 | break; | ||
1594 | } | ||
1595 | khugepaged_node_load[node]++; | ||
1596 | |||
1597 | if (!PageLRU(page)) { | ||
1598 | result = SCAN_PAGE_LRU; | ||
1599 | break; | ||
1600 | } | ||
1601 | |||
1602 | if (page_count(page) != 1 + page_mapcount(page)) { | ||
1603 | result = SCAN_PAGE_COUNT; | ||
1604 | break; | ||
1605 | } | ||
1606 | |||
1607 | /* | ||
1608 | * We probably should check if the page is referenced here, but | ||
1609 | * nothing transfers pte_young() into PageReferenced() for us, | ||
1610 | * and an rmap walk here is just too costly... | ||
1611 | */ | ||
1612 | |||
1613 | present++; | ||
1614 | |||
1615 | if (need_resched()) { | ||
1616 | cond_resched_rcu(); | ||
1617 | slot = radix_tree_iter_next(&iter); | ||
1618 | } | ||
1619 | } | ||
1620 | rcu_read_unlock(); | ||
1621 | |||
1622 | if (result == SCAN_SUCCEED) { | ||
1623 | if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { | ||
1624 | result = SCAN_EXCEED_NONE_PTE; | ||
1625 | } else { | ||
1626 | node = khugepaged_find_target_node(); | ||
1627 | collapse_shmem(mm, mapping, start, hpage, node); | ||
1628 | } | ||
1629 | } | ||
1630 | |||
1631 | /* TODO: tracepoints */ | ||
1632 | } | ||
1633 | #else | ||
1634 | static void khugepaged_scan_shmem(struct mm_struct *mm, | ||
1635 | struct address_space *mapping, | ||
1636 | pgoff_t start, struct page **hpage) | ||
1637 | { | ||
1638 | BUILD_BUG(); | ||
1639 | } | ||
1640 | #endif | ||
1641 | |||
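With CONFIG_SHMEM and CONFIG_TRANSPARENT_HUGE_PAGECACHE, the same background collapse applies to shmem/tmpfs mappings via khugepaged_scan_shmem()/collapse_shmem(). A hedged userspace sketch using memfd_create() (declared in <sys/mman.h> with glibc 2.27 or later); whether the range is actually collapsed still depends on the shmem huge policy (the huge= mount option or the shmem_enabled sysfs knob) and MADV_HUGEPAGE:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/mman.h>

    #define LEN (4UL << 20)         /* 4MB: two 2MB-aligned units (assumed size) */

    int main(void)
    {
            int fd = memfd_create("thp-shmem-demo", 0);
            char *p;

            if (fd < 0) {
                    perror("memfd_create");
                    return 1;
            }
            if (ftruncate(fd, LEN)) {
                    perror("ftruncate");
                    return 1;
            }

            p = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }

            /* Ask for huge pages on this file mapping; khugepaged_scan_shmem()
             * may later collapse the backing pages into a huge page. */
            if (madvise(p, LEN, MADV_HUGEPAGE))
                    perror("madvise(MADV_HUGEPAGE)");

            memset(p, 0, LEN);      /* populate the small pages */
            pause();                /* keep the mapping alive for khugepaged */
            return 0;
    }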
1642 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | ||
1643 | struct page **hpage) | ||
1644 | __releases(&khugepaged_mm_lock) | ||
1645 | __acquires(&khugepaged_mm_lock) | ||
1646 | { | ||
1647 | struct mm_slot *mm_slot; | ||
1648 | struct mm_struct *mm; | ||
1649 | struct vm_area_struct *vma; | ||
1650 | int progress = 0; | ||
1651 | |||
1652 | VM_BUG_ON(!pages); | ||
1653 | VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock)); | ||
1654 | |||
1655 | if (khugepaged_scan.mm_slot) | ||
1656 | mm_slot = khugepaged_scan.mm_slot; | ||
1657 | else { | ||
1658 | mm_slot = list_entry(khugepaged_scan.mm_head.next, | ||
1659 | struct mm_slot, mm_node); | ||
1660 | khugepaged_scan.address = 0; | ||
1661 | khugepaged_scan.mm_slot = mm_slot; | ||
1662 | } | ||
1663 | spin_unlock(&khugepaged_mm_lock); | ||
1664 | |||
1665 | mm = mm_slot->mm; | ||
1666 | down_read(&mm->mmap_sem); | ||
1667 | if (unlikely(khugepaged_test_exit(mm))) | ||
1668 | vma = NULL; | ||
1669 | else | ||
1670 | vma = find_vma(mm, khugepaged_scan.address); | ||
1671 | |||
1672 | progress++; | ||
1673 | for (; vma; vma = vma->vm_next) { | ||
1674 | unsigned long hstart, hend; | ||
1675 | |||
1676 | cond_resched(); | ||
1677 | if (unlikely(khugepaged_test_exit(mm))) { | ||
1678 | progress++; | ||
1679 | break; | ||
1680 | } | ||
1681 | if (!hugepage_vma_check(vma)) { | ||
1682 | skip: | ||
1683 | progress++; | ||
1684 | continue; | ||
1685 | } | ||
1686 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
1687 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
1688 | if (hstart >= hend) | ||
1689 | goto skip; | ||
1690 | if (khugepaged_scan.address > hend) | ||
1691 | goto skip; | ||
1692 | if (khugepaged_scan.address < hstart) | ||
1693 | khugepaged_scan.address = hstart; | ||
1694 | VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); | ||
1695 | |||
1696 | while (khugepaged_scan.address < hend) { | ||
1697 | int ret; | ||
1698 | cond_resched(); | ||
1699 | if (unlikely(khugepaged_test_exit(mm))) | ||
1700 | goto breakouterloop; | ||
1701 | |||
1702 | VM_BUG_ON(khugepaged_scan.address < hstart || | ||
1703 | khugepaged_scan.address + HPAGE_PMD_SIZE > | ||
1704 | hend); | ||
1705 | if (shmem_file(vma->vm_file)) { | ||
1706 | struct file *file; | ||
1707 | pgoff_t pgoff = linear_page_index(vma, | ||
1708 | khugepaged_scan.address); | ||
1709 | if (!shmem_huge_enabled(vma)) | ||
1710 | goto skip; | ||
1711 | file = get_file(vma->vm_file); | ||
1712 | up_read(&mm->mmap_sem); | ||
1713 | ret = 1; | ||
1714 | khugepaged_scan_shmem(mm, file->f_mapping, | ||
1715 | pgoff, hpage); | ||
1716 | fput(file); | ||
1717 | } else { | ||
1718 | ret = khugepaged_scan_pmd(mm, vma, | ||
1719 | khugepaged_scan.address, | ||
1720 | hpage); | ||
1721 | } | ||
1722 | /* move to next address */ | ||
1723 | khugepaged_scan.address += HPAGE_PMD_SIZE; | ||
1724 | progress += HPAGE_PMD_NR; | ||
1725 | if (ret) | ||
1726 | /* we released mmap_sem so break loop */ | ||
1727 | goto breakouterloop_mmap_sem; | ||
1728 | if (progress >= pages) | ||
1729 | goto breakouterloop; | ||
1730 | } | ||
1731 | } | ||
1732 | breakouterloop: | ||
1733 | up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ | ||
1734 | breakouterloop_mmap_sem: | ||
1735 | |||
1736 | spin_lock(&khugepaged_mm_lock); | ||
1737 | VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot); | ||
1738 | /* | ||
1739 | * Release the current mm_slot if this mm is about to die, or | ||
1740 | * if we scanned all vmas of this mm. | ||
1741 | */ | ||
1742 | if (khugepaged_test_exit(mm) || !vma) { | ||
1743 | /* | ||
1744 | * Make sure that if mm_users is reaching zero while | ||
1745 | * khugepaged runs here, khugepaged_exit will find | ||
1746 | * mm_slot not pointing to the exiting mm. | ||
1747 | */ | ||
1748 | if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { | ||
1749 | khugepaged_scan.mm_slot = list_entry( | ||
1750 | mm_slot->mm_node.next, | ||
1751 | struct mm_slot, mm_node); | ||
1752 | khugepaged_scan.address = 0; | ||
1753 | } else { | ||
1754 | khugepaged_scan.mm_slot = NULL; | ||
1755 | khugepaged_full_scans++; | ||
1756 | } | ||
1757 | |||
1758 | collect_mm_slot(mm_slot); | ||
1759 | } | ||
1760 | |||
1761 | return progress; | ||
1762 | } | ||
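The hstart/hend computation in the loop above trims each VMA to the PMD-aligned slice that could actually hold huge pages: the start is rounded up and the end rounded down to HPAGE_PMD_SIZE boundaries. A stand-alone sketch, assuming the common 2MB PMD size; the addresses are made up for illustration.

/*
 * User-space sketch of the hstart/hend computation above, assuming
 * 2MB PMD-sized huge pages.
 */
#include <stdio.h>

#define HPAGE_PMD_SIZE  (2UL << 20)
#define HPAGE_PMD_MASK  (~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long vm_start = 0x00301000;	/* not 2MB aligned */
	unsigned long vm_end   = 0x00a23000;

	/* round the start up, the end down, to 2MB boundaries */
	unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	unsigned long hend   = vm_end & HPAGE_PMD_MASK;

	printf("scan %#lx-%#lx\n", hstart, hend);	/* 0x400000-0xa00000 */
	return 0;
}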
1763 | |||
1764 | static int khugepaged_has_work(void) | ||
1765 | { | ||
1766 | return !list_empty(&khugepaged_scan.mm_head) && | ||
1767 | khugepaged_enabled(); | ||
1768 | } | ||
1769 | |||
1770 | static int khugepaged_wait_event(void) | ||
1771 | { | ||
1772 | return !list_empty(&khugepaged_scan.mm_head) || | ||
1773 | kthread_should_stop(); | ||
1774 | } | ||
1775 | |||
1776 | static void khugepaged_do_scan(void) | ||
1777 | { | ||
1778 | struct page *hpage = NULL; | ||
1779 | unsigned int progress = 0, pass_through_head = 0; | ||
1780 | unsigned int pages = khugepaged_pages_to_scan; | ||
1781 | bool wait = true; | ||
1782 | |||
1783 | barrier(); /* write khugepaged_pages_to_scan to local stack */ | ||
1784 | |||
1785 | while (progress < pages) { | ||
1786 | if (!khugepaged_prealloc_page(&hpage, &wait)) | ||
1787 | break; | ||
1788 | |||
1789 | cond_resched(); | ||
1790 | |||
1791 | if (unlikely(kthread_should_stop() || try_to_freeze())) | ||
1792 | break; | ||
1793 | |||
1794 | spin_lock(&khugepaged_mm_lock); | ||
1795 | if (!khugepaged_scan.mm_slot) | ||
1796 | pass_through_head++; | ||
1797 | if (khugepaged_has_work() && | ||
1798 | pass_through_head < 2) | ||
1799 | progress += khugepaged_scan_mm_slot(pages - progress, | ||
1800 | &hpage); | ||
1801 | else | ||
1802 | progress = pages; | ||
1803 | spin_unlock(&khugepaged_mm_lock); | ||
1804 | } | ||
1805 | |||
1806 | if (!IS_ERR_OR_NULL(hpage)) | ||
1807 | put_page(hpage); | ||
1808 | } | ||
1809 | |||
1810 | static bool khugepaged_should_wakeup(void) | ||
1811 | { | ||
1812 | return kthread_should_stop() || | ||
1813 | time_after_eq(jiffies, khugepaged_sleep_expire); | ||
1814 | } | ||
1815 | |||
1816 | static void khugepaged_wait_work(void) | ||
1817 | { | ||
1818 | if (khugepaged_has_work()) { | ||
1819 | const unsigned long scan_sleep_jiffies = | ||
1820 | msecs_to_jiffies(khugepaged_scan_sleep_millisecs); | ||
1821 | |||
1822 | if (!scan_sleep_jiffies) | ||
1823 | return; | ||
1824 | |||
1825 | khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; | ||
1826 | wait_event_freezable_timeout(khugepaged_wait, | ||
1827 | khugepaged_should_wakeup(), | ||
1828 | scan_sleep_jiffies); | ||
1829 | return; | ||
1830 | } | ||
1831 | |||
1832 | if (khugepaged_enabled()) | ||
1833 | wait_event_freezable(khugepaged_wait, khugepaged_wait_event()); | ||
1834 | } | ||
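khugepaged_should_wakeup() compares jiffies against the khugepaged_sleep_expire deadline set above with time_after_eq(), which stays correct across jiffies wrap-around by reducing the comparison to a signed subtraction. A user-space model of that check; the macro body is reproduced from memory of include/linux/jiffies.h and should be treated as an assumption.

/*
 * User-space model of the wakeup check above.
 */
#include <stdbool.h>
#include <stdio.h>

static bool time_after_eq(unsigned long a, unsigned long b)
{
	return (long)(a - b) >= 0;	/* wrap-safe on unsigned overflow */
}

int main(void)
{
	unsigned long jiffies = (unsigned long)-5;	/* about to wrap */
	unsigned long sleep_expire = jiffies + 10;	/* wraps past zero */

	printf("%d\n", time_after_eq(jiffies, sleep_expire));		/* 0: keep sleeping */
	printf("%d\n", time_after_eq(jiffies + 10, sleep_expire));	/* 1: deadline hit */
	return 0;
}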
1835 | |||
1836 | static int khugepaged(void *none) | ||
1837 | { | ||
1838 | struct mm_slot *mm_slot; | ||
1839 | |||
1840 | set_freezable(); | ||
1841 | set_user_nice(current, MAX_NICE); | ||
1842 | |||
1843 | while (!kthread_should_stop()) { | ||
1844 | khugepaged_do_scan(); | ||
1845 | khugepaged_wait_work(); | ||
1846 | } | ||
1847 | |||
1848 | spin_lock(&khugepaged_mm_lock); | ||
1849 | mm_slot = khugepaged_scan.mm_slot; | ||
1850 | khugepaged_scan.mm_slot = NULL; | ||
1851 | if (mm_slot) | ||
1852 | collect_mm_slot(mm_slot); | ||
1853 | spin_unlock(&khugepaged_mm_lock); | ||
1854 | return 0; | ||
1855 | } | ||
1856 | |||
1857 | static void set_recommended_min_free_kbytes(void) | ||
1858 | { | ||
1859 | struct zone *zone; | ||
1860 | int nr_zones = 0; | ||
1861 | unsigned long recommended_min; | ||
1862 | |||
1863 | for_each_populated_zone(zone) | ||
1864 | nr_zones++; | ||
1865 | |||
1866 | /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ | ||
1867 | recommended_min = pageblock_nr_pages * nr_zones * 2; | ||
1868 | |||
1869 | /* | ||
1870 | * Make sure that on average at least two pageblocks are almost free | ||
1871 | * of another type, one for a migratetype to fall back to and a | ||
1872 | * second to avoid subsequent fallbacks of other types. There are 3 | ||
1873 | * MIGRATE_TYPES we care about. | ||
1874 | */ | ||
1875 | recommended_min += pageblock_nr_pages * nr_zones * | ||
1876 | MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; | ||
1877 | |||
1878 | /* don't ever allow reserving more than 5% of the lowmem */ | ||
1879 | recommended_min = min(recommended_min, | ||
1880 | (unsigned long) nr_free_buffer_pages() / 20); | ||
1881 | recommended_min <<= (PAGE_SHIFT-10); | ||
1882 | |||
1883 | if (recommended_min > min_free_kbytes) { | ||
1884 | if (user_min_free_kbytes >= 0) | ||
1885 | pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", | ||
1886 | min_free_kbytes, recommended_min); | ||
1887 | |||
1888 | min_free_kbytes = recommended_min; | ||
1889 | } | ||
1890 | setup_per_zone_wmarks(); | ||
1891 | } | ||
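A worked instance of the calculation above, packaged as a stand-alone program. All inputs (4KB pages, 512-page pageblocks, MIGRATE_PCPTYPES == 3, two populated zones, roughly 4GB of lowmem) are illustrative assumptions rather than values read from a real machine; with them the watermark floor comes out to 45056 kB (44 MB).

/*
 * Worked example of the min_free_kbytes calculation above.  The inputs
 * are illustrative assumptions, not values from any particular machine.
 */
#include <stdio.h>

int main(void)
{
	unsigned long pageblock_nr_pages = 512;		/* 2MB / 4KB */
	unsigned long nr_zones = 2;
	unsigned long migrate_pcptypes = 3;
	unsigned long nr_free_buffer_pages = 4UL << 18;	/* ~4GB of 4KB pages */
	int page_shift = 12;

	/* two free pageblocks per zone ... */
	unsigned long recommended_min = pageblock_nr_pages * nr_zones * 2;
	/* ... plus headroom for migratetype fallbacks */
	recommended_min += pageblock_nr_pages * nr_zones *
			   migrate_pcptypes * migrate_pcptypes;
	/* never more than 5% of lowmem */
	if (recommended_min > nr_free_buffer_pages / 20)
		recommended_min = nr_free_buffer_pages / 20;
	recommended_min <<= page_shift - 10;		/* pages -> kB */

	printf("recommended min_free_kbytes: %lu\n", recommended_min);	/* 45056 */
	return 0;
}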
1892 | |||
1893 | int start_stop_khugepaged(void) | ||
1894 | { | ||
1895 | static struct task_struct *khugepaged_thread __read_mostly; | ||
1896 | static DEFINE_MUTEX(khugepaged_mutex); | ||
1897 | int err = 0; | ||
1898 | |||
1899 | mutex_lock(&khugepaged_mutex); | ||
1900 | if (khugepaged_enabled()) { | ||
1901 | if (!khugepaged_thread) | ||
1902 | khugepaged_thread = kthread_run(khugepaged, NULL, | ||
1903 | "khugepaged"); | ||
1904 | if (IS_ERR(khugepaged_thread)) { | ||
1905 | pr_err("khugepaged: kthread_run(khugepaged) failed\n"); | ||
1906 | err = PTR_ERR(khugepaged_thread); | ||
1907 | khugepaged_thread = NULL; | ||
1908 | goto fail; | ||
1909 | } | ||
1910 | |||
1911 | if (!list_empty(&khugepaged_scan.mm_head)) | ||
1912 | wake_up_interruptible(&khugepaged_wait); | ||
1913 | |||
1914 | set_recommended_min_free_kbytes(); | ||
1915 | } else if (khugepaged_thread) { | ||
1916 | kthread_stop(khugepaged_thread); | ||
1917 | khugepaged_thread = NULL; | ||
1918 | } | ||
1919 | fail: | ||
1920 | mutex_unlock(&khugepaged_mutex); | ||
1921 | return err; | ||
1922 | } | ||
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -376,9 +376,8 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
376 | if (IS_ERR_OR_NULL(page)) | 376 | if (IS_ERR_OR_NULL(page)) |
377 | break; | 377 | break; |
378 | if (PageKsm(page)) | 378 | if (PageKsm(page)) |
379 | ret = handle_mm_fault(vma->vm_mm, vma, addr, | 379 | ret = handle_mm_fault(vma, addr, |
380 | FAULT_FLAG_WRITE | | 380 | FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); |
381 | FAULT_FLAG_REMOTE); | ||
382 | else | 381 | else |
383 | ret = VM_FAULT_WRITE; | 382 | ret = VM_FAULT_WRITE; |
384 | put_page(page); | 383 | put_page(page); |
@@ -532,8 +531,8 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) | |||
532 | void *expected_mapping; | 531 | void *expected_mapping; |
533 | unsigned long kpfn; | 532 | unsigned long kpfn; |
534 | 533 | ||
535 | expected_mapping = (void *)stable_node + | 534 | expected_mapping = (void *)((unsigned long)stable_node | |
536 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | 535 | PAGE_MAPPING_KSM); |
537 | again: | 536 | again: |
538 | kpfn = READ_ONCE(stable_node->kpfn); | 537 | kpfn = READ_ONCE(stable_node->kpfn); |
539 | page = pfn_to_page(kpfn); | 538 | page = pfn_to_page(kpfn); |
diff --git a/mm/memblock.c b/mm/memblock.c index ac1248933b31..ca099159b45a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -584,6 +584,9 @@ repeat: | |||
584 | nid, flags); | 584 | nid, flags); |
585 | } | 585 | } |
586 | 586 | ||
587 | if (!nr_new) | ||
588 | return 0; | ||
589 | |||
587 | /* | 590 | /* |
588 | * If this was the first round, resize array and repeat for actual | 591 | * If this was the first round, resize array and repeat for actual |
589 | * insertions; otherwise, merge and return. | 592 | * insertions; otherwise, merge and return. |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5339c89dff63..f3a84c64f35c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -1259,6 +1259,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1259 | struct oom_control oc = { | 1259 | struct oom_control oc = { |
1260 | .zonelist = NULL, | 1260 | .zonelist = NULL, |
1261 | .nodemask = NULL, | 1261 | .nodemask = NULL, |
1262 | .memcg = memcg, | ||
1262 | .gfp_mask = gfp_mask, | 1263 | .gfp_mask = gfp_mask, |
1263 | .order = order, | 1264 | .order = order, |
1264 | }; | 1265 | }; |
@@ -1281,7 +1282,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1281 | goto unlock; | 1282 | goto unlock; |
1282 | } | 1283 | } |
1283 | 1284 | ||
1284 | check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg); | 1285 | check_panic_on_oom(&oc, CONSTRAINT_MEMCG); |
1285 | totalpages = mem_cgroup_get_limit(memcg) ? : 1; | 1286 | totalpages = mem_cgroup_get_limit(memcg) ? : 1; |
1286 | for_each_mem_cgroup_tree(iter, memcg) { | 1287 | for_each_mem_cgroup_tree(iter, memcg) { |
1287 | struct css_task_iter it; | 1288 | struct css_task_iter it; |
@@ -1289,7 +1290,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1289 | 1290 | ||
1290 | css_task_iter_start(&iter->css, &it); | 1291 | css_task_iter_start(&iter->css, &it); |
1291 | while ((task = css_task_iter_next(&it))) { | 1292 | while ((task = css_task_iter_next(&it))) { |
1292 | switch (oom_scan_process_thread(&oc, task, totalpages)) { | 1293 | switch (oom_scan_process_thread(&oc, task)) { |
1293 | case OOM_SCAN_SELECT: | 1294 | case OOM_SCAN_SELECT: |
1294 | if (chosen) | 1295 | if (chosen) |
1295 | put_task_struct(chosen); | 1296 | put_task_struct(chosen); |
@@ -1329,7 +1330,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1329 | 1330 | ||
1330 | if (chosen) { | 1331 | if (chosen) { |
1331 | points = chosen_points * 1000 / totalpages; | 1332 | points = chosen_points * 1000 / totalpages; |
1332 | oom_kill_process(&oc, chosen, points, totalpages, memcg, | 1333 | oom_kill_process(&oc, chosen, points, totalpages, |
1333 | "Memory cgroup out of memory"); | 1334 | "Memory cgroup out of memory"); |
1334 | } | 1335 | } |
1335 | unlock: | 1336 | unlock: |
@@ -2272,20 +2273,30 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, | |||
2272 | current->memcg_kmem_skip_account = 0; | 2273 | current->memcg_kmem_skip_account = 0; |
2273 | } | 2274 | } |
2274 | 2275 | ||
2275 | /* | 2276 | static inline bool memcg_kmem_bypass(void) |
2277 | { | ||
2278 | if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) | ||
2279 | return true; | ||
2280 | return false; | ||
2281 | } | ||
2282 | |||
2283 | /** | ||
2284 | * memcg_kmem_get_cache: select the correct per-memcg cache for allocation | ||
2285 | * @cachep: the original global kmem cache | ||
2286 | * | ||
2276 | * Return the kmem_cache we're supposed to use for a slab allocation. | 2287 | * Return the kmem_cache we're supposed to use for a slab allocation. |
2277 | * We try to use the current memcg's version of the cache. | 2288 | * We try to use the current memcg's version of the cache. |
2278 | * | 2289 | * |
2279 | * If the cache does not exist yet, if we are the first user of it, | 2290 | * If the cache does not exist yet, if we are the first user of it, we |
2280 | * we either create it immediately, if possible, or create it asynchronously | 2291 | * create it asynchronously in a workqueue and let the current allocation |
2281 | * in a workqueue. | 2292 | * go through with the original cache. |
2282 | * In the latter case, we will let the current allocation go through with | ||
2283 | * the original cache. | ||
2284 | * | 2293 | * |
2285 | * Can't be called in interrupt context or from kernel threads. | 2294 | * This function takes a reference to the cache it returns to assure it |
2286 | * This function needs to be called with rcu_read_lock() held. | 2295 | * won't get destroyed while we are working with it. Once the caller is |
2296 | * done with it, memcg_kmem_put_cache() must be called to release the | ||
2297 | * reference. | ||
2287 | */ | 2298 | */ |
2288 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) | 2299 | struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) |
2289 | { | 2300 | { |
2290 | struct mem_cgroup *memcg; | 2301 | struct mem_cgroup *memcg; |
2291 | struct kmem_cache *memcg_cachep; | 2302 | struct kmem_cache *memcg_cachep; |
@@ -2293,10 +2304,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2293 | 2304 | ||
2294 | VM_BUG_ON(!is_root_cache(cachep)); | 2305 | VM_BUG_ON(!is_root_cache(cachep)); |
2295 | 2306 | ||
2296 | if (cachep->flags & SLAB_ACCOUNT) | 2307 | if (memcg_kmem_bypass()) |
2297 | gfp |= __GFP_ACCOUNT; | ||
2298 | |||
2299 | if (!(gfp & __GFP_ACCOUNT)) | ||
2300 | return cachep; | 2308 | return cachep; |
2301 | 2309 | ||
2302 | if (current->memcg_kmem_skip_account) | 2310 | if (current->memcg_kmem_skip_account) |
@@ -2329,14 +2337,27 @@ out: | |||
2329 | return cachep; | 2337 | return cachep; |
2330 | } | 2338 | } |
2331 | 2339 | ||
2332 | void __memcg_kmem_put_cache(struct kmem_cache *cachep) | 2340 | /** |
2341 | * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache | ||
2342 | * @cachep: the cache returned by memcg_kmem_get_cache | ||
2343 | */ | ||
2344 | void memcg_kmem_put_cache(struct kmem_cache *cachep) | ||
2333 | { | 2345 | { |
2334 | if (!is_root_cache(cachep)) | 2346 | if (!is_root_cache(cachep)) |
2335 | css_put(&cachep->memcg_params.memcg->css); | 2347 | css_put(&cachep->memcg_params.memcg->css); |
2336 | } | 2348 | } |
2337 | 2349 | ||
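The kernel-doc above spells out a get/put contract: whatever memcg_kmem_get_cache() hands back stays pinned until the caller invokes memcg_kmem_put_cache(). The toy user-space model below shows only that refcounting shape; the names and types are illustrative and are not kernel APIs.

/*
 * User-space model of the get/put contract documented above; purely
 * illustrative, these are not kernel interfaces.
 */
#include <assert.h>
#include <stdio.h>

struct cache { int refcount; };

static struct cache *cache_get(struct cache *c)
{
	c->refcount++;			/* pin while the caller allocates from it */
	return c;
}

static void cache_put(struct cache *c)
{
	assert(c->refcount > 0);
	c->refcount--;			/* safe to destroy once this reaches zero */
}

int main(void)
{
	struct cache root = { .refcount = 1 };
	struct cache *c = cache_get(&root);
	/* ... allocate from c here ... */
	cache_put(c);
	printf("refcount back to %d\n", root.refcount);	/* 1 */
	return 0;
}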
2338 | int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, | 2350 | /** |
2339 | struct mem_cgroup *memcg) | 2351 | * memcg_kmem_charge_memcg: charge a kmem page to the given memcg |
2352 | * @page: page to charge | ||
2353 | * @gfp: reclaim mode | ||
2354 | * @order: allocation order | ||
2355 | * @memcg: memory cgroup to charge | ||
2356 | * | ||
2357 | * Returns 0 on success, an error code on failure. | ||
2358 | */ | ||
2359 | int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, | ||
2360 | struct mem_cgroup *memcg) | ||
2340 | { | 2361 | { |
2341 | unsigned int nr_pages = 1 << order; | 2362 | unsigned int nr_pages = 1 << order; |
2342 | struct page_counter *counter; | 2363 | struct page_counter *counter; |
@@ -2357,19 +2378,34 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, | |||
2357 | return 0; | 2378 | return 0; |
2358 | } | 2379 | } |
2359 | 2380 | ||
2360 | int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) | 2381 | /** |
2382 | * memcg_kmem_charge: charge a kmem page to the current memory cgroup | ||
2383 | * @page: page to charge | ||
2384 | * @gfp: reclaim mode | ||
2385 | * @order: allocation order | ||
2386 | * | ||
2387 | * Returns 0 on success, an error code on failure. | ||
2388 | */ | ||
2389 | int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) | ||
2361 | { | 2390 | { |
2362 | struct mem_cgroup *memcg; | 2391 | struct mem_cgroup *memcg; |
2363 | int ret = 0; | 2392 | int ret = 0; |
2364 | 2393 | ||
2394 | if (memcg_kmem_bypass()) | ||
2395 | return 0; | ||
2396 | |||
2365 | memcg = get_mem_cgroup_from_mm(current->mm); | 2397 | memcg = get_mem_cgroup_from_mm(current->mm); |
2366 | if (!mem_cgroup_is_root(memcg)) | 2398 | if (!mem_cgroup_is_root(memcg)) |
2367 | ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); | 2399 | ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); |
2368 | css_put(&memcg->css); | 2400 | css_put(&memcg->css); |
2369 | return ret; | 2401 | return ret; |
2370 | } | 2402 | } |
2371 | 2403 | /** | |
2372 | void __memcg_kmem_uncharge(struct page *page, int order) | 2404 | * memcg_kmem_uncharge: uncharge a kmem page |
2405 | * @page: page to uncharge | ||
2406 | * @order: allocation order | ||
2407 | */ | ||
2408 | void memcg_kmem_uncharge(struct page *page, int order) | ||
2373 | { | 2409 | { |
2374 | struct mem_cgroup *memcg = page->mem_cgroup; | 2410 | struct mem_cgroup *memcg = page->mem_cgroup; |
2375 | unsigned int nr_pages = 1 << order; | 2411 | unsigned int nr_pages = 1 << order; |
@@ -4409,7 +4445,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | |||
4409 | 4445 | ||
4410 | #ifdef CONFIG_SWAP | 4446 | #ifdef CONFIG_SWAP |
4411 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | 4447 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, |
4412 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | 4448 | pte_t ptent, swp_entry_t *entry) |
4413 | { | 4449 | { |
4414 | struct page *page = NULL; | 4450 | struct page *page = NULL; |
4415 | swp_entry_t ent = pte_to_swp_entry(ptent); | 4451 | swp_entry_t ent = pte_to_swp_entry(ptent); |
@@ -4428,7 +4464,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | |||
4428 | } | 4464 | } |
4429 | #else | 4465 | #else |
4430 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | 4466 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, |
4431 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | 4467 | pte_t ptent, swp_entry_t *entry) |
4432 | { | 4468 | { |
4433 | return NULL; | 4469 | return NULL; |
4434 | } | 4470 | } |
@@ -4471,7 +4507,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
4471 | /** | 4507 | /** |
4472 | * mem_cgroup_move_account - move account of the page | 4508 | * mem_cgroup_move_account - move account of the page |
4473 | * @page: the page | 4509 | * @page: the page |
4474 | * @nr_pages: number of regular pages (>1 for huge pages) | 4510 | * @compound: charge the page as compound or small page |
4475 | * @from: mem_cgroup which the page is moved from. | 4511 | * @from: mem_cgroup which the page is moved from. |
4476 | * @to: mem_cgroup which the page is moved to. @from != @to. | 4512 | * @to: mem_cgroup which the page is moved to. @from != @to. |
4477 | * | 4513 | * |
@@ -4593,7 +4629,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | |||
4593 | if (pte_present(ptent)) | 4629 | if (pte_present(ptent)) |
4594 | page = mc_handle_present_pte(vma, addr, ptent); | 4630 | page = mc_handle_present_pte(vma, addr, ptent); |
4595 | else if (is_swap_pte(ptent)) | 4631 | else if (is_swap_pte(ptent)) |
4596 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); | 4632 | page = mc_handle_swap_pte(vma, ptent, &ent); |
4597 | else if (pte_none(ptent)) | 4633 | else if (pte_none(ptent)) |
4598 | page = mc_handle_file_pte(vma, addr, ptent, &ent); | 4634 | page = mc_handle_file_pte(vma, addr, ptent, &ent); |
4599 | 4635 | ||
@@ -5333,6 +5369,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) | |||
5333 | * @mm: mm context of the victim | 5369 | * @mm: mm context of the victim |
5334 | * @gfp_mask: reclaim mode | 5370 | * @gfp_mask: reclaim mode |
5335 | * @memcgp: charged memcg return | 5371 | * @memcgp: charged memcg return |
5372 | * @compound: charge the page as compound or small page | ||
5336 | * | 5373 | * |
5337 | * Try to charge @page to the memcg that @mm belongs to, reclaiming | 5374 | * Try to charge @page to the memcg that @mm belongs to, reclaiming |
5338 | * pages according to @gfp_mask if necessary. | 5375 | * pages according to @gfp_mask if necessary. |
@@ -5395,6 +5432,7 @@ out: | |||
5395 | * @page: page to charge | 5432 | * @page: page to charge |
5396 | * @memcg: memcg to charge the page to | 5433 | * @memcg: memcg to charge the page to |
5397 | * @lrucare: page might be on LRU already | 5434 | * @lrucare: page might be on LRU already |
5435 | * @compound: charge the page as compound or small page | ||
5398 | * | 5436 | * |
5399 | * Finalize a charge transaction started by mem_cgroup_try_charge(), | 5437 | * Finalize a charge transaction started by mem_cgroup_try_charge(), |
5400 | * after page->mapping has been set up. This must happen atomically | 5438 | * after page->mapping has been set up. This must happen atomically |
@@ -5446,6 +5484,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, | |||
5446 | * mem_cgroup_cancel_charge - cancel a page charge | 5484 | * mem_cgroup_cancel_charge - cancel a page charge |
5447 | * @page: page to charge | 5485 | * @page: page to charge |
5448 | * @memcg: memcg to charge the page to | 5486 | * @memcg: memcg to charge the page to |
5487 | * @compound: charge the page as compound or small page | ||
5449 | * | 5488 | * |
5450 | * Cancel a charge transaction started by mem_cgroup_try_charge(). | 5489 | * Cancel a charge transaction started by mem_cgroup_try_charge(). |
5451 | */ | 5490 | */ |
@@ -5469,15 +5508,18 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, | |||
5469 | 5508 | ||
5470 | static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, | 5509 | static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, |
5471 | unsigned long nr_anon, unsigned long nr_file, | 5510 | unsigned long nr_anon, unsigned long nr_file, |
5472 | unsigned long nr_huge, struct page *dummy_page) | 5511 | unsigned long nr_huge, unsigned long nr_kmem, |
5512 | struct page *dummy_page) | ||
5473 | { | 5513 | { |
5474 | unsigned long nr_pages = nr_anon + nr_file; | 5514 | unsigned long nr_pages = nr_anon + nr_file + nr_kmem; |
5475 | unsigned long flags; | 5515 | unsigned long flags; |
5476 | 5516 | ||
5477 | if (!mem_cgroup_is_root(memcg)) { | 5517 | if (!mem_cgroup_is_root(memcg)) { |
5478 | page_counter_uncharge(&memcg->memory, nr_pages); | 5518 | page_counter_uncharge(&memcg->memory, nr_pages); |
5479 | if (do_memsw_account()) | 5519 | if (do_memsw_account()) |
5480 | page_counter_uncharge(&memcg->memsw, nr_pages); | 5520 | page_counter_uncharge(&memcg->memsw, nr_pages); |
5521 | if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem) | ||
5522 | page_counter_uncharge(&memcg->kmem, nr_kmem); | ||
5481 | memcg_oom_recover(memcg); | 5523 | memcg_oom_recover(memcg); |
5482 | } | 5524 | } |
5483 | 5525 | ||
@@ -5500,6 +5542,7 @@ static void uncharge_list(struct list_head *page_list) | |||
5500 | unsigned long nr_anon = 0; | 5542 | unsigned long nr_anon = 0; |
5501 | unsigned long nr_file = 0; | 5543 | unsigned long nr_file = 0; |
5502 | unsigned long nr_huge = 0; | 5544 | unsigned long nr_huge = 0; |
5545 | unsigned long nr_kmem = 0; | ||
5503 | unsigned long pgpgout = 0; | 5546 | unsigned long pgpgout = 0; |
5504 | struct list_head *next; | 5547 | struct list_head *next; |
5505 | struct page *page; | 5548 | struct page *page; |
@@ -5510,8 +5553,6 @@ static void uncharge_list(struct list_head *page_list) | |||
5510 | */ | 5553 | */ |
5511 | next = page_list->next; | 5554 | next = page_list->next; |
5512 | do { | 5555 | do { |
5513 | unsigned int nr_pages = 1; | ||
5514 | |||
5515 | page = list_entry(next, struct page, lru); | 5556 | page = list_entry(next, struct page, lru); |
5516 | next = page->lru.next; | 5557 | next = page->lru.next; |
5517 | 5558 | ||
@@ -5530,31 +5571,34 @@ static void uncharge_list(struct list_head *page_list) | |||
5530 | if (memcg != page->mem_cgroup) { | 5571 | if (memcg != page->mem_cgroup) { |
5531 | if (memcg) { | 5572 | if (memcg) { |
5532 | uncharge_batch(memcg, pgpgout, nr_anon, nr_file, | 5573 | uncharge_batch(memcg, pgpgout, nr_anon, nr_file, |
5533 | nr_huge, page); | 5574 | nr_huge, nr_kmem, page); |
5534 | pgpgout = nr_anon = nr_file = nr_huge = 0; | 5575 | pgpgout = nr_anon = nr_file = |
5576 | nr_huge = nr_kmem = 0; | ||
5535 | } | 5577 | } |
5536 | memcg = page->mem_cgroup; | 5578 | memcg = page->mem_cgroup; |
5537 | } | 5579 | } |
5538 | 5580 | ||
5539 | if (PageTransHuge(page)) { | 5581 | if (!PageKmemcg(page)) { |
5540 | nr_pages <<= compound_order(page); | 5582 | unsigned int nr_pages = 1; |
5541 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | ||
5542 | nr_huge += nr_pages; | ||
5543 | } | ||
5544 | 5583 | ||
5545 | if (PageAnon(page)) | 5584 | if (PageTransHuge(page)) { |
5546 | nr_anon += nr_pages; | 5585 | nr_pages <<= compound_order(page); |
5547 | else | 5586 | nr_huge += nr_pages; |
5548 | nr_file += nr_pages; | 5587 | } |
5588 | if (PageAnon(page)) | ||
5589 | nr_anon += nr_pages; | ||
5590 | else | ||
5591 | nr_file += nr_pages; | ||
5592 | pgpgout++; | ||
5593 | } else | ||
5594 | nr_kmem += 1 << compound_order(page); | ||
5549 | 5595 | ||
5550 | page->mem_cgroup = NULL; | 5596 | page->mem_cgroup = NULL; |
5551 | |||
5552 | pgpgout++; | ||
5553 | } while (next != page_list); | 5597 | } while (next != page_list); |
5554 | 5598 | ||
5555 | if (memcg) | 5599 | if (memcg) |
5556 | uncharge_batch(memcg, pgpgout, nr_anon, nr_file, | 5600 | uncharge_batch(memcg, pgpgout, nr_anon, nr_file, |
5557 | nr_huge, page); | 5601 | nr_huge, nr_kmem, page); |
5558 | } | 5602 | } |
5559 | 5603 | ||
5560 | /** | 5604 | /** |
diff --git a/mm/memory.c b/mm/memory.c index 9e046819e619..4425b6059339 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -233,6 +233,7 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long | |||
233 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE | 233 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE |
234 | tlb->batch = NULL; | 234 | tlb->batch = NULL; |
235 | #endif | 235 | #endif |
236 | tlb->page_size = 0; | ||
236 | 237 | ||
237 | __tlb_reset_range(tlb); | 238 | __tlb_reset_range(tlb); |
238 | } | 239 | } |
@@ -292,23 +293,31 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e | |||
292 | * handling the additional races in SMP caused by other CPUs caching valid | 293 | * handling the additional races in SMP caused by other CPUs caching valid |
293 | * mappings in their TLBs. Returns the number of free page slots left. | 294 | * mappings in their TLBs. Returns the number of free page slots left. |
294 | * When out of page slots we must call tlb_flush_mmu(). | 295 | * When out of page slots we must call tlb_flush_mmu(). |
296 | * Returns true if the caller should flush. | ||
295 | */ | 297 | */ |
296 | int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) | 298 | bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) |
297 | { | 299 | { |
298 | struct mmu_gather_batch *batch; | 300 | struct mmu_gather_batch *batch; |
299 | 301 | ||
300 | VM_BUG_ON(!tlb->end); | 302 | VM_BUG_ON(!tlb->end); |
301 | 303 | ||
304 | if (!tlb->page_size) | ||
305 | tlb->page_size = page_size; | ||
306 | else { | ||
307 | if (page_size != tlb->page_size) | ||
308 | return true; | ||
309 | } | ||
310 | |||
302 | batch = tlb->active; | 311 | batch = tlb->active; |
303 | batch->pages[batch->nr++] = page; | ||
304 | if (batch->nr == batch->max) { | 312 | if (batch->nr == batch->max) { |
305 | if (!tlb_next_batch(tlb)) | 313 | if (!tlb_next_batch(tlb)) |
306 | return 0; | 314 | return true; |
307 | batch = tlb->active; | 315 | batch = tlb->active; |
308 | } | 316 | } |
309 | VM_BUG_ON_PAGE(batch->nr > batch->max, page); | 317 | VM_BUG_ON_PAGE(batch->nr > batch->max, page); |
310 | 318 | ||
311 | return batch->max - batch->nr; | 319 | batch->pages[batch->nr++] = page; |
320 | return false; | ||
312 | } | 321 | } |
313 | 322 | ||
314 | #endif /* HAVE_GENERIC_MMU_GATHER */ | 323 | #endif /* HAVE_GENERIC_MMU_GATHER */ |
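With this change __tlb_remove_page_size() queues the page and returns true when the caller must flush first, either because the batch is full or because a page of a different size arrived. The user-space model below mirrors that return contract in simplified form (the real code can also grow extra batches before giving up); the batch size and page sizes are invented for illustration.

/*
 * User-space model of the new return contract: "true" means flush
 * before retrying the page.  Simplified; not kernel code.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define BATCH_MAX 4

struct gather {
	size_t nr;
	int page_size;			/* 0 until the first page is queued */
	const char *pages[BATCH_MAX];
};

/* mirrors __tlb_remove_page_size: true means the caller should flush */
static bool remove_page(struct gather *tlb, const char *page, int page_size)
{
	if (!tlb->page_size)
		tlb->page_size = page_size;
	else if (page_size != tlb->page_size)
		return true;		/* mixed sizes: flush first */

	if (tlb->nr == BATCH_MAX)
		return true;		/* batch full: flush first */

	tlb->pages[tlb->nr++] = page;
	return false;
}

static void flush(struct gather *tlb)
{
	printf("flushing %zu page(s)\n", tlb->nr);
	tlb->nr = 0;
	tlb->page_size = 0;
}

int main(void)
{
	struct gather tlb = { 0 };
	const char *pages[] = { "a", "b", "c", "d", "e" };

	for (size_t i = 0; i < 5; i++) {
		while (remove_page(&tlb, pages[i], 4096))
			flush(&tlb);	/* flush, then retry the same page */
	}
	flush(&tlb);
	return 0;
}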
@@ -1109,6 +1118,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
1109 | pte_t *start_pte; | 1118 | pte_t *start_pte; |
1110 | pte_t *pte; | 1119 | pte_t *pte; |
1111 | swp_entry_t entry; | 1120 | swp_entry_t entry; |
1121 | struct page *pending_page = NULL; | ||
1112 | 1122 | ||
1113 | again: | 1123 | again: |
1114 | init_rss_vec(rss); | 1124 | init_rss_vec(rss); |
@@ -1132,7 +1142,7 @@ again: | |||
1132 | * unmap shared but keep private pages. | 1142 | * unmap shared but keep private pages. |
1133 | */ | 1143 | */ |
1134 | if (details->check_mapping && | 1144 | if (details->check_mapping && |
1135 | details->check_mapping != page->mapping) | 1145 | details->check_mapping != page_rmapping(page)) |
1136 | continue; | 1146 | continue; |
1137 | } | 1147 | } |
1138 | ptent = ptep_get_and_clear_full(mm, addr, pte, | 1148 | ptent = ptep_get_and_clear_full(mm, addr, pte, |
@@ -1160,8 +1170,9 @@ again: | |||
1160 | page_remove_rmap(page, false); | 1170 | page_remove_rmap(page, false); |
1161 | if (unlikely(page_mapcount(page) < 0)) | 1171 | if (unlikely(page_mapcount(page) < 0)) |
1162 | print_bad_pte(vma, addr, ptent, page); | 1172 | print_bad_pte(vma, addr, ptent, page); |
1163 | if (unlikely(!__tlb_remove_page(tlb, page))) { | 1173 | if (unlikely(__tlb_remove_page(tlb, page))) { |
1164 | force_flush = 1; | 1174 | force_flush = 1; |
1175 | pending_page = page; | ||
1165 | addr += PAGE_SIZE; | 1176 | addr += PAGE_SIZE; |
1166 | break; | 1177 | break; |
1167 | } | 1178 | } |
@@ -1202,7 +1213,11 @@ again: | |||
1202 | if (force_flush) { | 1213 | if (force_flush) { |
1203 | force_flush = 0; | 1214 | force_flush = 0; |
1204 | tlb_flush_mmu_free(tlb); | 1215 | tlb_flush_mmu_free(tlb); |
1205 | 1216 | if (pending_page) { | |
1217 | /* remove the page with new size */ | ||
1218 | __tlb_remove_pte_page(tlb, pending_page); | ||
1219 | pending_page = NULL; | ||
1220 | } | ||
1206 | if (addr != end) | 1221 | if (addr != end) |
1207 | goto again; | 1222 | goto again; |
1208 | } | 1223 | } |
@@ -1479,7 +1494,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
1479 | /* Ok, finally just insert the thing.. */ | 1494 | /* Ok, finally just insert the thing.. */ |
1480 | get_page(page); | 1495 | get_page(page); |
1481 | inc_mm_counter_fast(mm, mm_counter_file(page)); | 1496 | inc_mm_counter_fast(mm, mm_counter_file(page)); |
1482 | page_add_file_rmap(page); | 1497 | page_add_file_rmap(page, false); |
1483 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | 1498 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); |
1484 | 1499 | ||
1485 | retval = 0; | 1500 | retval = 0; |
@@ -2055,13 +2070,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | |||
2055 | * case, all we need to do here is to mark the page as writable and update | 2070 | * case, all we need to do here is to mark the page as writable and update |
2056 | * any related book-keeping. | 2071 | * any related book-keeping. |
2057 | */ | 2072 | */ |
2058 | static inline int wp_page_reuse(struct mm_struct *mm, | 2073 | static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, |
2059 | struct vm_area_struct *vma, unsigned long address, | 2074 | struct page *page, int page_mkwrite, int dirty_shared) |
2060 | pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, | 2075 | __releases(fe->ptl) |
2061 | struct page *page, int page_mkwrite, | ||
2062 | int dirty_shared) | ||
2063 | __releases(ptl) | ||
2064 | { | 2076 | { |
2077 | struct vm_area_struct *vma = fe->vma; | ||
2065 | pte_t entry; | 2078 | pte_t entry; |
2066 | /* | 2079 | /* |
2067 | * Clear the pages cpupid information as the existing | 2080 | * Clear the pages cpupid information as the existing |
@@ -2071,12 +2084,12 @@ static inline int wp_page_reuse(struct mm_struct *mm, | |||
2071 | if (page) | 2084 | if (page) |
2072 | page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); | 2085 | page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); |
2073 | 2086 | ||
2074 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2087 | flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); |
2075 | entry = pte_mkyoung(orig_pte); | 2088 | entry = pte_mkyoung(orig_pte); |
2076 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2089 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2077 | if (ptep_set_access_flags(vma, address, page_table, entry, 1)) | 2090 | if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) |
2078 | update_mmu_cache(vma, address, page_table); | 2091 | update_mmu_cache(vma, fe->address, fe->pte); |
2079 | pte_unmap_unlock(page_table, ptl); | 2092 | pte_unmap_unlock(fe->pte, fe->ptl); |
2080 | 2093 | ||
2081 | if (dirty_shared) { | 2094 | if (dirty_shared) { |
2082 | struct address_space *mapping; | 2095 | struct address_space *mapping; |
@@ -2122,30 +2135,31 @@ static inline int wp_page_reuse(struct mm_struct *mm, | |||
2122 | * held to the old page, as well as updating the rmap. | 2135 | * held to the old page, as well as updating the rmap. |
2123 | * - In any case, unlock the PTL and drop the reference we took to the old page. | 2136 | * - In any case, unlock the PTL and drop the reference we took to the old page. |
2124 | */ | 2137 | */ |
2125 | static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | 2138 | static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, |
2126 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2139 | struct page *old_page) |
2127 | pte_t orig_pte, struct page *old_page) | ||
2128 | { | 2140 | { |
2141 | struct vm_area_struct *vma = fe->vma; | ||
2142 | struct mm_struct *mm = vma->vm_mm; | ||
2129 | struct page *new_page = NULL; | 2143 | struct page *new_page = NULL; |
2130 | spinlock_t *ptl = NULL; | ||
2131 | pte_t entry; | 2144 | pte_t entry; |
2132 | int page_copied = 0; | 2145 | int page_copied = 0; |
2133 | const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */ | 2146 | const unsigned long mmun_start = fe->address & PAGE_MASK; |
2134 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */ | 2147 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; |
2135 | struct mem_cgroup *memcg; | 2148 | struct mem_cgroup *memcg; |
2136 | 2149 | ||
2137 | if (unlikely(anon_vma_prepare(vma))) | 2150 | if (unlikely(anon_vma_prepare(vma))) |
2138 | goto oom; | 2151 | goto oom; |
2139 | 2152 | ||
2140 | if (is_zero_pfn(pte_pfn(orig_pte))) { | 2153 | if (is_zero_pfn(pte_pfn(orig_pte))) { |
2141 | new_page = alloc_zeroed_user_highpage_movable(vma, address); | 2154 | new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); |
2142 | if (!new_page) | 2155 | if (!new_page) |
2143 | goto oom; | 2156 | goto oom; |
2144 | } else { | 2157 | } else { |
2145 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 2158 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, |
2159 | fe->address); | ||
2146 | if (!new_page) | 2160 | if (!new_page) |
2147 | goto oom; | 2161 | goto oom; |
2148 | cow_user_page(new_page, old_page, address, vma); | 2162 | cow_user_page(new_page, old_page, fe->address, vma); |
2149 | } | 2163 | } |
2150 | 2164 | ||
2151 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) | 2165 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) |
@@ -2158,8 +2172,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2158 | /* | 2172 | /* |
2159 | * Re-check the pte - we dropped the lock | 2173 | * Re-check the pte - we dropped the lock |
2160 | */ | 2174 | */ |
2161 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2175 | fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); |
2162 | if (likely(pte_same(*page_table, orig_pte))) { | 2176 | if (likely(pte_same(*fe->pte, orig_pte))) { |
2163 | if (old_page) { | 2177 | if (old_page) { |
2164 | if (!PageAnon(old_page)) { | 2178 | if (!PageAnon(old_page)) { |
2165 | dec_mm_counter_fast(mm, | 2179 | dec_mm_counter_fast(mm, |
@@ -2169,7 +2183,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2169 | } else { | 2183 | } else { |
2170 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2184 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2171 | } | 2185 | } |
2172 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2186 | flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); |
2173 | entry = mk_pte(new_page, vma->vm_page_prot); | 2187 | entry = mk_pte(new_page, vma->vm_page_prot); |
2174 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2188 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2175 | /* | 2189 | /* |
@@ -2178,8 +2192,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2178 | * seen in the presence of one thread doing SMC and another | 2192 | * seen in the presence of one thread doing SMC and another |
2179 | * thread doing COW. | 2193 | * thread doing COW. |
2180 | */ | 2194 | */ |
2181 | ptep_clear_flush_notify(vma, address, page_table); | 2195 | ptep_clear_flush_notify(vma, fe->address, fe->pte); |
2182 | page_add_new_anon_rmap(new_page, vma, address, false); | 2196 | page_add_new_anon_rmap(new_page, vma, fe->address, false); |
2183 | mem_cgroup_commit_charge(new_page, memcg, false, false); | 2197 | mem_cgroup_commit_charge(new_page, memcg, false, false); |
2184 | lru_cache_add_active_or_unevictable(new_page, vma); | 2198 | lru_cache_add_active_or_unevictable(new_page, vma); |
2185 | /* | 2199 | /* |
@@ -2187,8 +2201,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2187 | * mmu page tables (such as kvm shadow page tables), we want the | 2201 | * mmu page tables (such as kvm shadow page tables), we want the |
2188 | * new page to be mapped directly into the secondary page table. | 2202 | * new page to be mapped directly into the secondary page table. |
2189 | */ | 2203 | */ |
2190 | set_pte_at_notify(mm, address, page_table, entry); | 2204 | set_pte_at_notify(mm, fe->address, fe->pte, entry); |
2191 | update_mmu_cache(vma, address, page_table); | 2205 | update_mmu_cache(vma, fe->address, fe->pte); |
2192 | if (old_page) { | 2206 | if (old_page) { |
2193 | /* | 2207 | /* |
2194 | * Only after switching the pte to the new page may | 2208 | * Only after switching the pte to the new page may |
@@ -2225,7 +2239,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2225 | if (new_page) | 2239 | if (new_page) |
2226 | put_page(new_page); | 2240 | put_page(new_page); |
2227 | 2241 | ||
2228 | pte_unmap_unlock(page_table, ptl); | 2242 | pte_unmap_unlock(fe->pte, fe->ptl); |
2229 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 2243 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
2230 | if (old_page) { | 2244 | if (old_page) { |
2231 | /* | 2245 | /* |
@@ -2253,44 +2267,43 @@ oom: | |||
2253 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED | 2267 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED |
2254 | * mapping | 2268 | * mapping |
2255 | */ | 2269 | */ |
2256 | static int wp_pfn_shared(struct mm_struct *mm, | 2270 | static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) |
2257 | struct vm_area_struct *vma, unsigned long address, | ||
2258 | pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, | ||
2259 | pmd_t *pmd) | ||
2260 | { | 2271 | { |
2272 | struct vm_area_struct *vma = fe->vma; | ||
2273 | |||
2261 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { | 2274 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { |
2262 | struct vm_fault vmf = { | 2275 | struct vm_fault vmf = { |
2263 | .page = NULL, | 2276 | .page = NULL, |
2264 | .pgoff = linear_page_index(vma, address), | 2277 | .pgoff = linear_page_index(vma, fe->address), |
2265 | .virtual_address = (void __user *)(address & PAGE_MASK), | 2278 | .virtual_address = |
2279 | (void __user *)(fe->address & PAGE_MASK), | ||
2266 | .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, | 2280 | .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, |
2267 | }; | 2281 | }; |
2268 | int ret; | 2282 | int ret; |
2269 | 2283 | ||
2270 | pte_unmap_unlock(page_table, ptl); | 2284 | pte_unmap_unlock(fe->pte, fe->ptl); |
2271 | ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); | 2285 | ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); |
2272 | if (ret & VM_FAULT_ERROR) | 2286 | if (ret & VM_FAULT_ERROR) |
2273 | return ret; | 2287 | return ret; |
2274 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2288 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2289 | &fe->ptl); | ||
2275 | /* | 2290 | /* |
2276 | * We might have raced with another page fault while we | 2291 | * We might have raced with another page fault while we |
2277 | * released the pte_offset_map_lock. | 2292 | * released the pte_offset_map_lock. |
2278 | */ | 2293 | */ |
2279 | if (!pte_same(*page_table, orig_pte)) { | 2294 | if (!pte_same(*fe->pte, orig_pte)) { |
2280 | pte_unmap_unlock(page_table, ptl); | 2295 | pte_unmap_unlock(fe->pte, fe->ptl); |
2281 | return 0; | 2296 | return 0; |
2282 | } | 2297 | } |
2283 | } | 2298 | } |
2284 | return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, | 2299 | return wp_page_reuse(fe, orig_pte, NULL, 0, 0); |
2285 | NULL, 0, 0); | ||
2286 | } | 2300 | } |
2287 | 2301 | ||
2288 | static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | 2302 | static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, |
2289 | unsigned long address, pte_t *page_table, | 2303 | struct page *old_page) |
2290 | pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, | 2304 | __releases(fe->ptl) |
2291 | struct page *old_page) | ||
2292 | __releases(ptl) | ||
2293 | { | 2305 | { |
2306 | struct vm_area_struct *vma = fe->vma; | ||
2294 | int page_mkwrite = 0; | 2307 | int page_mkwrite = 0; |
2295 | 2308 | ||
2296 | get_page(old_page); | 2309 | get_page(old_page); |
@@ -2298,8 +2311,8 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2298 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 2311 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
2299 | int tmp; | 2312 | int tmp; |
2300 | 2313 | ||
2301 | pte_unmap_unlock(page_table, ptl); | 2314 | pte_unmap_unlock(fe->pte, fe->ptl); |
2302 | tmp = do_page_mkwrite(vma, old_page, address); | 2315 | tmp = do_page_mkwrite(vma, old_page, fe->address); |
2303 | if (unlikely(!tmp || (tmp & | 2316 | if (unlikely(!tmp || (tmp & |
2304 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | 2317 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { |
2305 | put_page(old_page); | 2318 | put_page(old_page); |
@@ -2311,19 +2324,18 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2311 | * they did, we just return, as we can count on the | 2324 | * they did, we just return, as we can count on the |
2312 | * MMU to tell us if they didn't also make it writable. | 2325 | * MMU to tell us if they didn't also make it writable. |
2313 | */ | 2326 | */ |
2314 | page_table = pte_offset_map_lock(mm, pmd, address, | 2327 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2315 | &ptl); | 2328 | &fe->ptl); |
2316 | if (!pte_same(*page_table, orig_pte)) { | 2329 | if (!pte_same(*fe->pte, orig_pte)) { |
2317 | unlock_page(old_page); | 2330 | unlock_page(old_page); |
2318 | pte_unmap_unlock(page_table, ptl); | 2331 | pte_unmap_unlock(fe->pte, fe->ptl); |
2319 | put_page(old_page); | 2332 | put_page(old_page); |
2320 | return 0; | 2333 | return 0; |
2321 | } | 2334 | } |
2322 | page_mkwrite = 1; | 2335 | page_mkwrite = 1; |
2323 | } | 2336 | } |
2324 | 2337 | ||
2325 | return wp_page_reuse(mm, vma, address, page_table, ptl, | 2338 | return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); |
2326 | orig_pte, old_page, page_mkwrite, 1); | ||
2327 | } | 2339 | } |
2328 | 2340 | ||
2329 | /* | 2341 | /* |
@@ -2344,14 +2356,13 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2344 | * but allow concurrent faults), with pte both mapped and locked. | 2356 | * but allow concurrent faults), with pte both mapped and locked. |
2345 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2357 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2346 | */ | 2358 | */ |
2347 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2359 | static int do_wp_page(struct fault_env *fe, pte_t orig_pte) |
2348 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2360 | __releases(fe->ptl) |
2349 | spinlock_t *ptl, pte_t orig_pte) | ||
2350 | __releases(ptl) | ||
2351 | { | 2361 | { |
2362 | struct vm_area_struct *vma = fe->vma; | ||
2352 | struct page *old_page; | 2363 | struct page *old_page; |
2353 | 2364 | ||
2354 | old_page = vm_normal_page(vma, address, orig_pte); | 2365 | old_page = vm_normal_page(vma, fe->address, orig_pte); |
2355 | if (!old_page) { | 2366 | if (!old_page) { |
2356 | /* | 2367 | /* |
2357 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a | 2368 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a |
@@ -2362,12 +2373,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2362 | */ | 2373 | */ |
2363 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2374 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2364 | (VM_WRITE|VM_SHARED)) | 2375 | (VM_WRITE|VM_SHARED)) |
2365 | return wp_pfn_shared(mm, vma, address, page_table, ptl, | 2376 | return wp_pfn_shared(fe, orig_pte); |
2366 | orig_pte, pmd); | ||
2367 | 2377 | ||
2368 | pte_unmap_unlock(page_table, ptl); | 2378 | pte_unmap_unlock(fe->pte, fe->ptl); |
2369 | return wp_page_copy(mm, vma, address, page_table, pmd, | 2379 | return wp_page_copy(fe, orig_pte, old_page); |
2370 | orig_pte, old_page); | ||
2371 | } | 2380 | } |
2372 | 2381 | ||
2373 | /* | 2382 | /* |
@@ -2378,13 +2387,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2378 | int total_mapcount; | 2387 | int total_mapcount; |
2379 | if (!trylock_page(old_page)) { | 2388 | if (!trylock_page(old_page)) { |
2380 | get_page(old_page); | 2389 | get_page(old_page); |
2381 | pte_unmap_unlock(page_table, ptl); | 2390 | pte_unmap_unlock(fe->pte, fe->ptl); |
2382 | lock_page(old_page); | 2391 | lock_page(old_page); |
2383 | page_table = pte_offset_map_lock(mm, pmd, address, | 2392 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, |
2384 | &ptl); | 2393 | fe->address, &fe->ptl); |
2385 | if (!pte_same(*page_table, orig_pte)) { | 2394 | if (!pte_same(*fe->pte, orig_pte)) { |
2386 | unlock_page(old_page); | 2395 | unlock_page(old_page); |
2387 | pte_unmap_unlock(page_table, ptl); | 2396 | pte_unmap_unlock(fe->pte, fe->ptl); |
2388 | put_page(old_page); | 2397 | put_page(old_page); |
2389 | return 0; | 2398 | return 0; |
2390 | } | 2399 | } |
@@ -2402,14 +2411,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2402 | page_move_anon_rmap(old_page, vma); | 2411 | page_move_anon_rmap(old_page, vma); |
2403 | } | 2412 | } |
2404 | unlock_page(old_page); | 2413 | unlock_page(old_page); |
2405 | return wp_page_reuse(mm, vma, address, page_table, ptl, | 2414 | return wp_page_reuse(fe, orig_pte, old_page, 0, 0); |
2406 | orig_pte, old_page, 0, 0); | ||
2407 | } | 2415 | } |
2408 | unlock_page(old_page); | 2416 | unlock_page(old_page); |
2409 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2417 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2410 | (VM_WRITE|VM_SHARED))) { | 2418 | (VM_WRITE|VM_SHARED))) { |
2411 | return wp_page_shared(mm, vma, address, page_table, pmd, | 2419 | return wp_page_shared(fe, orig_pte, old_page); |
2412 | ptl, orig_pte, old_page); | ||
2413 | } | 2420 | } |
2414 | 2421 | ||
2415 | /* | 2422 | /* |
@@ -2417,9 +2424,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2417 | */ | 2424 | */ |
2418 | get_page(old_page); | 2425 | get_page(old_page); |
2419 | 2426 | ||
2420 | pte_unmap_unlock(page_table, ptl); | 2427 | pte_unmap_unlock(fe->pte, fe->ptl); |
2421 | return wp_page_copy(mm, vma, address, page_table, pmd, | 2428 | return wp_page_copy(fe, orig_pte, old_page); |
2422 | orig_pte, old_page); | ||
2423 | } | 2429 | } |
2424 | 2430 | ||
2425 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, | 2431 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, |
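The hunks above are part of a mechanical conversion: the long parameter lists (mm, vma, address, page_table, pmd, ptl, flags) threaded through the fault handlers are replaced by a single struct fault_env passed by pointer. Its full definition is introduced elsewhere in this series and is not shown here, so the sketch below uses a reduced stand-in purely to show the shape of the refactor.

/*
 * Reduced user-space illustration of the parameter-object refactor
 * applied above.  The field set is a stand-in, not the real fault_env.
 */
#include <stdio.h>

struct fault_env {
	unsigned long address;
	unsigned int flags;
	/* in the kernel: vma, pmd, pte, ptl, ... */
};

/* before: handler(mm, vma, address, pte, pmd, ptl, flags, orig_pte) */
/* after:  handler(fe, orig_pte) */
static int handle_write_fault(struct fault_env *fe, unsigned long orig_pte)
{
	printf("write fault at %#lx (flags %#x, pte %#lx)\n",
	       fe->address, fe->flags, orig_pte);
	return 0;
}

int main(void)
{
	struct fault_env fe = { .address = 0x400000, .flags = 0x1 };
	return handle_write_fault(&fe, 0);
}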
@@ -2507,11 +2513,9 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
2507 | * We return with the mmap_sem locked or unlocked in the same cases | 2513 | * We return with the mmap_sem locked or unlocked in the same cases |
2508 | * as does filemap_fault(). | 2514 | * as does filemap_fault(). |
2509 | */ | 2515 | */ |
2510 | static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2516 | int do_swap_page(struct fault_env *fe, pte_t orig_pte) |
2511 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
2512 | unsigned int flags, pte_t orig_pte) | ||
2513 | { | 2517 | { |
2514 | spinlock_t *ptl; | 2518 | struct vm_area_struct *vma = fe->vma; |
2515 | struct page *page, *swapcache; | 2519 | struct page *page, *swapcache; |
2516 | struct mem_cgroup *memcg; | 2520 | struct mem_cgroup *memcg; |
2517 | swp_entry_t entry; | 2521 | swp_entry_t entry; |
@@ -2520,17 +2524,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2520 | int exclusive = 0; | 2524 | int exclusive = 0; |
2521 | int ret = 0; | 2525 | int ret = 0; |
2522 | 2526 | ||
2523 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 2527 | if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) |
2524 | goto out; | 2528 | goto out; |
2525 | 2529 | ||
2526 | entry = pte_to_swp_entry(orig_pte); | 2530 | entry = pte_to_swp_entry(orig_pte); |
2527 | if (unlikely(non_swap_entry(entry))) { | 2531 | if (unlikely(non_swap_entry(entry))) { |
2528 | if (is_migration_entry(entry)) { | 2532 | if (is_migration_entry(entry)) { |
2529 | migration_entry_wait(mm, pmd, address); | 2533 | migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); |
2530 | } else if (is_hwpoison_entry(entry)) { | 2534 | } else if (is_hwpoison_entry(entry)) { |
2531 | ret = VM_FAULT_HWPOISON; | 2535 | ret = VM_FAULT_HWPOISON; |
2532 | } else { | 2536 | } else { |
2533 | print_bad_pte(vma, address, orig_pte, NULL); | 2537 | print_bad_pte(vma, fe->address, orig_pte, NULL); |
2534 | ret = VM_FAULT_SIGBUS; | 2538 | ret = VM_FAULT_SIGBUS; |
2535 | } | 2539 | } |
2536 | goto out; | 2540 | goto out; |
@@ -2539,14 +2543,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2539 | page = lookup_swap_cache(entry); | 2543 | page = lookup_swap_cache(entry); |
2540 | if (!page) { | 2544 | if (!page) { |
2541 | page = swapin_readahead(entry, | 2545 | page = swapin_readahead(entry, |
2542 | GFP_HIGHUSER_MOVABLE, vma, address); | 2546 | GFP_HIGHUSER_MOVABLE, vma, fe->address); |
2543 | if (!page) { | 2547 | if (!page) { |
2544 | /* | 2548 | /* |
2545 | * Back out if somebody else faulted in this pte | 2549 | * Back out if somebody else faulted in this pte |
2546 | * while we released the pte lock. | 2550 | * while we released the pte lock. |
2547 | */ | 2551 | */ |
2548 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2552 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, |
2549 | if (likely(pte_same(*page_table, orig_pte))) | 2553 | fe->address, &fe->ptl); |
2554 | if (likely(pte_same(*fe->pte, orig_pte))) | ||
2550 | ret = VM_FAULT_OOM; | 2555 | ret = VM_FAULT_OOM; |
2551 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2556 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2552 | goto unlock; | 2557 | goto unlock; |
@@ -2555,7 +2560,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2555 | /* Had to read the page from swap area: Major fault */ | 2560 | /* Had to read the page from swap area: Major fault */ |
2556 | ret = VM_FAULT_MAJOR; | 2561 | ret = VM_FAULT_MAJOR; |
2557 | count_vm_event(PGMAJFAULT); | 2562 | count_vm_event(PGMAJFAULT); |
2558 | mem_cgroup_count_vm_event(mm, PGMAJFAULT); | 2563 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
2559 | } else if (PageHWPoison(page)) { | 2564 | } else if (PageHWPoison(page)) { |
2560 | /* | 2565 | /* |
2561 | * hwpoisoned dirty swapcache pages are kept for killing | 2566 | * hwpoisoned dirty swapcache pages are kept for killing |
@@ -2568,7 +2573,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2568 | } | 2573 | } |
2569 | 2574 | ||
2570 | swapcache = page; | 2575 | swapcache = page; |
2571 | locked = lock_page_or_retry(page, mm, flags); | 2576 | locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); |
2572 | 2577 | ||
2573 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2578 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2574 | if (!locked) { | 2579 | if (!locked) { |
@@ -2585,14 +2590,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2585 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) | 2590 | if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) |
2586 | goto out_page; | 2591 | goto out_page; |
2587 | 2592 | ||
2588 | page = ksm_might_need_to_copy(page, vma, address); | 2593 | page = ksm_might_need_to_copy(page, vma, fe->address); |
2589 | if (unlikely(!page)) { | 2594 | if (unlikely(!page)) { |
2590 | ret = VM_FAULT_OOM; | 2595 | ret = VM_FAULT_OOM; |
2591 | page = swapcache; | 2596 | page = swapcache; |
2592 | goto out_page; | 2597 | goto out_page; |
2593 | } | 2598 | } |
2594 | 2599 | ||
2595 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) { | 2600 | if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, |
2601 | &memcg, false)) { | ||
2596 | ret = VM_FAULT_OOM; | 2602 | ret = VM_FAULT_OOM; |
2597 | goto out_page; | 2603 | goto out_page; |
2598 | } | 2604 | } |
@@ -2600,8 +2606,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2600 | /* | 2606 | /* |
2601 | * Back out if somebody else already faulted in this pte. | 2607 | * Back out if somebody else already faulted in this pte. |
2602 | */ | 2608 | */ |
2603 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2609 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2604 | if (unlikely(!pte_same(*page_table, orig_pte))) | 2610 | &fe->ptl); |
2611 | if (unlikely(!pte_same(*fe->pte, orig_pte))) | ||
2605 | goto out_nomap; | 2612 | goto out_nomap; |
2606 | 2613 | ||
2607 | if (unlikely(!PageUptodate(page))) { | 2614 | if (unlikely(!PageUptodate(page))) { |
@@ -2619,24 +2626,24 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2619 | * must be called after the swap_free(), or it will never succeed. | 2626 | * must be called after the swap_free(), or it will never succeed. |
2620 | */ | 2627 | */ |
2621 | 2628 | ||
2622 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2629 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2623 | dec_mm_counter_fast(mm, MM_SWAPENTS); | 2630 | dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); |
2624 | pte = mk_pte(page, vma->vm_page_prot); | 2631 | pte = mk_pte(page, vma->vm_page_prot); |
2625 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { | 2632 | if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { |
2626 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2633 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
2627 | flags &= ~FAULT_FLAG_WRITE; | 2634 | fe->flags &= ~FAULT_FLAG_WRITE; |
2628 | ret |= VM_FAULT_WRITE; | 2635 | ret |= VM_FAULT_WRITE; |
2629 | exclusive = RMAP_EXCLUSIVE; | 2636 | exclusive = RMAP_EXCLUSIVE; |
2630 | } | 2637 | } |
2631 | flush_icache_page(vma, page); | 2638 | flush_icache_page(vma, page); |
2632 | if (pte_swp_soft_dirty(orig_pte)) | 2639 | if (pte_swp_soft_dirty(orig_pte)) |
2633 | pte = pte_mksoft_dirty(pte); | 2640 | pte = pte_mksoft_dirty(pte); |
2634 | set_pte_at(mm, address, page_table, pte); | 2641 | set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); |
2635 | if (page == swapcache) { | 2642 | if (page == swapcache) { |
2636 | do_page_add_anon_rmap(page, vma, address, exclusive); | 2643 | do_page_add_anon_rmap(page, vma, fe->address, exclusive); |
2637 | mem_cgroup_commit_charge(page, memcg, true, false); | 2644 | mem_cgroup_commit_charge(page, memcg, true, false); |
2638 | } else { /* ksm created a completely new copy */ | 2645 | } else { /* ksm created a completely new copy */ |
2639 | page_add_new_anon_rmap(page, vma, address, false); | 2646 | page_add_new_anon_rmap(page, vma, fe->address, false); |
2640 | mem_cgroup_commit_charge(page, memcg, false, false); | 2647 | mem_cgroup_commit_charge(page, memcg, false, false); |
2641 | lru_cache_add_active_or_unevictable(page, vma); | 2648 | lru_cache_add_active_or_unevictable(page, vma); |
2642 | } | 2649 | } |
@@ -2659,22 +2666,22 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2659 | put_page(swapcache); | 2666 | put_page(swapcache); |
2660 | } | 2667 | } |
2661 | 2668 | ||
2662 | if (flags & FAULT_FLAG_WRITE) { | 2669 | if (fe->flags & FAULT_FLAG_WRITE) { |
2663 | ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); | 2670 | ret |= do_wp_page(fe, pte); |
2664 | if (ret & VM_FAULT_ERROR) | 2671 | if (ret & VM_FAULT_ERROR) |
2665 | ret &= VM_FAULT_ERROR; | 2672 | ret &= VM_FAULT_ERROR; |
2666 | goto out; | 2673 | goto out; |
2667 | } | 2674 | } |
2668 | 2675 | ||
2669 | /* No need to invalidate - it was non-present before */ | 2676 | /* No need to invalidate - it was non-present before */ |
2670 | update_mmu_cache(vma, address, page_table); | 2677 | update_mmu_cache(vma, fe->address, fe->pte); |
2671 | unlock: | 2678 | unlock: |
2672 | pte_unmap_unlock(page_table, ptl); | 2679 | pte_unmap_unlock(fe->pte, fe->ptl); |
2673 | out: | 2680 | out: |
2674 | return ret; | 2681 | return ret; |
2675 | out_nomap: | 2682 | out_nomap: |
2676 | mem_cgroup_cancel_charge(page, memcg, false); | 2683 | mem_cgroup_cancel_charge(page, memcg, false); |
2677 | pte_unmap_unlock(page_table, ptl); | 2684 | pte_unmap_unlock(fe->pte, fe->ptl); |
2678 | out_page: | 2685 | out_page: |
2679 | unlock_page(page); | 2686 | unlock_page(page); |
2680 | out_release: | 2687 | out_release: |
@@ -2725,37 +2732,51 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo | |||
2725 | * but allow concurrent faults), and pte mapped but not yet locked. | 2732 | * but allow concurrent faults), and pte mapped but not yet locked. |
2726 | * We return with mmap_sem still held, but pte unmapped and unlocked. | 2733 | * We return with mmap_sem still held, but pte unmapped and unlocked. |
2727 | */ | 2734 | */ |
2728 | static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2735 | static int do_anonymous_page(struct fault_env *fe) |
2729 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
2730 | unsigned int flags) | ||
2731 | { | 2736 | { |
2737 | struct vm_area_struct *vma = fe->vma; | ||
2732 | struct mem_cgroup *memcg; | 2738 | struct mem_cgroup *memcg; |
2733 | struct page *page; | 2739 | struct page *page; |
2734 | spinlock_t *ptl; | ||
2735 | pte_t entry; | 2740 | pte_t entry; |
2736 | 2741 | ||
2737 | pte_unmap(page_table); | ||
2738 | |||
2739 | /* File mapping without ->vm_ops ? */ | 2742 | /* File mapping without ->vm_ops ? */ |
2740 | if (vma->vm_flags & VM_SHARED) | 2743 | if (vma->vm_flags & VM_SHARED) |
2741 | return VM_FAULT_SIGBUS; | 2744 | return VM_FAULT_SIGBUS; |
2742 | 2745 | ||
2743 | /* Check if we need to add a guard page to the stack */ | 2746 | /* Check if we need to add a guard page to the stack */ |
2744 | if (check_stack_guard_page(vma, address) < 0) | 2747 | if (check_stack_guard_page(vma, fe->address) < 0) |
2745 | return VM_FAULT_SIGSEGV; | 2748 | return VM_FAULT_SIGSEGV; |
2746 | 2749 | ||
2750 | /* | ||
2751 | * Use pte_alloc() instead of pte_alloc_map(). We can't run | ||
2752 | * pte_offset_map() on pmds where a huge pmd might be created | ||
2753 | * from a different thread. | ||
2754 | * | ||
2755 | * pte_alloc_map() is safe to use under down_write(mmap_sem) or when | ||
2756 | * parallel threads are excluded by other means. | ||
2757 | * | ||
2758 | * Here we only have down_read(mmap_sem). | ||
2759 | */ | ||
2760 | if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) | ||
2761 | return VM_FAULT_OOM; | ||
2762 | |||
2763 | /* See the comment in pte_alloc_one_map() */ | ||
2764 | if (unlikely(pmd_trans_unstable(fe->pmd))) | ||
2765 | return 0; | ||
2766 | |||
2747 | /* Use the zero-page for reads */ | 2767 | /* Use the zero-page for reads */ |
2748 | if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) { | 2768 | if (!(fe->flags & FAULT_FLAG_WRITE) && |
2749 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), | 2769 | !mm_forbids_zeropage(vma->vm_mm)) { |
2770 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), | ||
2750 | vma->vm_page_prot)); | 2771 | vma->vm_page_prot)); |
2751 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2772 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2752 | if (!pte_none(*page_table)) | 2773 | &fe->ptl); |
2774 | if (!pte_none(*fe->pte)) | ||
2753 | goto unlock; | 2775 | goto unlock; |
2754 | /* Deliver the page fault to userland, check inside PT lock */ | 2776 | /* Deliver the page fault to userland, check inside PT lock */ |
2755 | if (userfaultfd_missing(vma)) { | 2777 | if (userfaultfd_missing(vma)) { |
2756 | pte_unmap_unlock(page_table, ptl); | 2778 | pte_unmap_unlock(fe->pte, fe->ptl); |
2757 | return handle_userfault(vma, address, flags, | 2779 | return handle_userfault(fe, VM_UFFD_MISSING); |
2758 | VM_UFFD_MISSING); | ||
2759 | } | 2780 | } |
2760 | goto setpte; | 2781 | goto setpte; |
2761 | } | 2782 | } |
@@ -2763,11 +2784,11 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2763 | /* Allocate our own private page. */ | 2784 | /* Allocate our own private page. */ |
2764 | if (unlikely(anon_vma_prepare(vma))) | 2785 | if (unlikely(anon_vma_prepare(vma))) |
2765 | goto oom; | 2786 | goto oom; |
2766 | page = alloc_zeroed_user_highpage_movable(vma, address); | 2787 | page = alloc_zeroed_user_highpage_movable(vma, fe->address); |
2767 | if (!page) | 2788 | if (!page) |
2768 | goto oom; | 2789 | goto oom; |
2769 | 2790 | ||
2770 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) | 2791 | if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false)) |
2771 | goto oom_free_page; | 2792 | goto oom_free_page; |
2772 | 2793 | ||
2773 | /* | 2794 | /* |
@@ -2781,30 +2802,30 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2781 | if (vma->vm_flags & VM_WRITE) | 2802 | if (vma->vm_flags & VM_WRITE) |
2782 | entry = pte_mkwrite(pte_mkdirty(entry)); | 2803 | entry = pte_mkwrite(pte_mkdirty(entry)); |
2783 | 2804 | ||
2784 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2805 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, |
2785 | if (!pte_none(*page_table)) | 2806 | &fe->ptl); |
2807 | if (!pte_none(*fe->pte)) | ||
2786 | goto release; | 2808 | goto release; |
2787 | 2809 | ||
2788 | /* Deliver the page fault to userland, check inside PT lock */ | 2810 | /* Deliver the page fault to userland, check inside PT lock */ |
2789 | if (userfaultfd_missing(vma)) { | 2811 | if (userfaultfd_missing(vma)) { |
2790 | pte_unmap_unlock(page_table, ptl); | 2812 | pte_unmap_unlock(fe->pte, fe->ptl); |
2791 | mem_cgroup_cancel_charge(page, memcg, false); | 2813 | mem_cgroup_cancel_charge(page, memcg, false); |
2792 | put_page(page); | 2814 | put_page(page); |
2793 | return handle_userfault(vma, address, flags, | 2815 | return handle_userfault(fe, VM_UFFD_MISSING); |
2794 | VM_UFFD_MISSING); | ||
2795 | } | 2816 | } |
2796 | 2817 | ||
2797 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2818 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2798 | page_add_new_anon_rmap(page, vma, address, false); | 2819 | page_add_new_anon_rmap(page, vma, fe->address, false); |
2799 | mem_cgroup_commit_charge(page, memcg, false, false); | 2820 | mem_cgroup_commit_charge(page, memcg, false, false); |
2800 | lru_cache_add_active_or_unevictable(page, vma); | 2821 | lru_cache_add_active_or_unevictable(page, vma); |
2801 | setpte: | 2822 | setpte: |
2802 | set_pte_at(mm, address, page_table, entry); | 2823 | set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); |
2803 | 2824 | ||
2804 | /* No need to invalidate - it was non-present before */ | 2825 | /* No need to invalidate - it was non-present before */ |
2805 | update_mmu_cache(vma, address, page_table); | 2826 | update_mmu_cache(vma, fe->address, fe->pte); |
2806 | unlock: | 2827 | unlock: |
2807 | pte_unmap_unlock(page_table, ptl); | 2828 | pte_unmap_unlock(fe->pte, fe->ptl); |
2808 | return 0; | 2829 | return 0; |
2809 | release: | 2830 | release: |
2810 | mem_cgroup_cancel_charge(page, memcg, false); | 2831 | mem_cgroup_cancel_charge(page, memcg, false); |
@@ -2821,17 +2842,16 @@ oom: | |||
2821 | * released depending on flags and vma->vm_ops->fault() return value. | 2842 | * released depending on flags and vma->vm_ops->fault() return value. |
2822 | * See filemap_fault() and __lock_page_retry(). | 2843 | * See filemap_fault() and __lock_page_retry(). |
2823 | */ | 2844 | */ |
2824 | static int __do_fault(struct vm_area_struct *vma, unsigned long address, | 2845 | static int __do_fault(struct fault_env *fe, pgoff_t pgoff, |
2825 | pgoff_t pgoff, unsigned int flags, | 2846 | struct page *cow_page, struct page **page, void **entry) |
2826 | struct page *cow_page, struct page **page, | ||
2827 | void **entry) | ||
2828 | { | 2847 | { |
2848 | struct vm_area_struct *vma = fe->vma; | ||
2829 | struct vm_fault vmf; | 2849 | struct vm_fault vmf; |
2830 | int ret; | 2850 | int ret; |
2831 | 2851 | ||
2832 | vmf.virtual_address = (void __user *)(address & PAGE_MASK); | 2852 | vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); |
2833 | vmf.pgoff = pgoff; | 2853 | vmf.pgoff = pgoff; |
2834 | vmf.flags = flags; | 2854 | vmf.flags = fe->flags; |
2835 | vmf.page = NULL; | 2855 | vmf.page = NULL; |
2836 | vmf.gfp_mask = __get_fault_gfp_mask(vma); | 2856 | vmf.gfp_mask = __get_fault_gfp_mask(vma); |
2837 | vmf.cow_page = cow_page; | 2857 | vmf.cow_page = cow_page; |
@@ -2860,41 +2880,168 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, | |||
2860 | return ret; | 2880 | return ret; |
2861 | } | 2881 | } |
2862 | 2882 | ||
2883 | static int pte_alloc_one_map(struct fault_env *fe) | ||
2884 | { | ||
2885 | struct vm_area_struct *vma = fe->vma; | ||
2886 | |||
2887 | if (!pmd_none(*fe->pmd)) | ||
2888 | goto map_pte; | ||
2889 | if (fe->prealloc_pte) { | ||
2890 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | ||
2891 | if (unlikely(!pmd_none(*fe->pmd))) { | ||
2892 | spin_unlock(fe->ptl); | ||
2893 | goto map_pte; | ||
2894 | } | ||
2895 | |||
2896 | atomic_long_inc(&vma->vm_mm->nr_ptes); | ||
2897 | pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); | ||
2898 | spin_unlock(fe->ptl); | ||
2899 | fe->prealloc_pte = 0; | ||
2900 | } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { | ||
2901 | return VM_FAULT_OOM; | ||
2902 | } | ||
2903 | map_pte: | ||
2904 | /* | ||
2905 | * If a huge pmd materialized under us just retry later. Use | ||
2906 | * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd | ||
2907 | * didn't become pmd_trans_huge under us and then back to pmd_none, as | ||
2908 | * a result of MADV_DONTNEED running immediately after a huge pmd fault | ||
2909 | * in a different thread of this mm, in turn leading to a misleading | ||
2910 | * pmd_trans_huge() retval. All we have to ensure is that it is a | ||
2911 | * regular pmd that we can walk with pte_offset_map() and we can do that | ||
2912 | * through an atomic read in C, which is what pmd_trans_unstable() | ||
2913 | * provides. | ||
2914 | */ | ||
2915 | if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) | ||
2916 | return VM_FAULT_NOPAGE; | ||
2917 | |||
2918 | fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, | ||
2919 | &fe->ptl); | ||
2920 | return 0; | ||
2921 | } | ||
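Not part of the patch — a sketch of the fault_env structure these helpers pass around, reconstructed from the fields referenced in the surrounding hunks (the authoritative definition lands in include/linux/mm.h elsewhere in this series):

	struct fault_env {
		struct vm_area_struct *vma;	/* target VMA */
		unsigned long address;		/* faulting virtual address */
		unsigned int flags;		/* FAULT_FLAG_xxx */
		pmd_t *pmd;			/* pmd covering the address */
		pte_t *pte;			/* pte mapped and locked, or NULL */
		spinlock_t *ptl;		/* page table lock for pte */
		pgtable_t prealloc_pte;		/* preallocated pte page table */
	};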
2922 | |||
2923 | #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE | ||
2924 | |||
2925 | #define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1) | ||
2926 | static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, | ||
2927 | unsigned long haddr) | ||
2928 | { | ||
2929 | if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) != | ||
2930 | (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK)) | ||
2931 | return false; | ||
2932 | if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) | ||
2933 | return false; | ||
2934 | return true; | ||
2935 | } | ||
2936 | |||
2937 | static int do_set_pmd(struct fault_env *fe, struct page *page) | ||
2938 | { | ||
2939 | struct vm_area_struct *vma = fe->vma; | ||
2940 | bool write = fe->flags & FAULT_FLAG_WRITE; | ||
2941 | unsigned long haddr = fe->address & HPAGE_PMD_MASK; | ||
2942 | pmd_t entry; | ||
2943 | int i, ret; | ||
2944 | |||
2945 | if (!transhuge_vma_suitable(vma, haddr)) | ||
2946 | return VM_FAULT_FALLBACK; | ||
2947 | |||
2948 | ret = VM_FAULT_FALLBACK; | ||
2949 | page = compound_head(page); | ||
2950 | |||
2951 | fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); | ||
2952 | if (unlikely(!pmd_none(*fe->pmd))) | ||
2953 | goto out; | ||
2954 | |||
2955 | for (i = 0; i < HPAGE_PMD_NR; i++) | ||
2956 | flush_icache_page(vma, page + i); | ||
2957 | |||
2958 | entry = mk_huge_pmd(page, vma->vm_page_prot); | ||
2959 | if (write) | ||
2960 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
2961 | |||
2962 | add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR); | ||
2963 | page_add_file_rmap(page, true); | ||
2964 | |||
2965 | set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); | ||
2966 | |||
2967 | update_mmu_cache_pmd(vma, haddr, fe->pmd); | ||
2968 | |||
2969 | /* fault is handled */ | ||
2970 | ret = 0; | ||
2971 | count_vm_event(THP_FILE_MAPPED); | ||
2972 | out: | ||
2973 | spin_unlock(fe->ptl); | ||
2974 | return ret; | ||
2975 | } | ||
2976 | #else | ||
2977 | static int do_set_pmd(struct fault_env *fe, struct page *page) | ||
2978 | { | ||
2979 | BUILD_BUG(); | ||
2980 | return 0; | ||
2981 | } | ||
2982 | #endif | ||
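transhuge_vma_suitable() above only admits a PMD-sized mapping when the VMA's start address and its file offset are congruent modulo HPAGE_PMD_NR pages and the aligned huge-page range lies wholly inside the VMA. A worked instance, assuming 4K pages and 2M huge pages (HPAGE_PMD_NR = 512), with vm_start = 0x00600000, vm_end = 0x00800000, vm_pgoff = 0x200 and a fault at 0x007a3000:

	(vm_start >> PAGE_SHIFT) & 511 = 0x600 & 0x1ff = 0
	vm_pgoff & 511                 = 0x200 & 0x1ff = 0	/* offsets agree */
	haddr = 0x007a3000 & ~(2M - 1) = 0x00600000
	haddr >= vm_start && haddr + 2M <= vm_end		/* huge mapping allowed */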
2983 | |||
2863 | /** | 2984 | /** |
2864 | * do_set_pte - setup new PTE entry for given page and add reverse page mapping. | 2985 | * alloc_set_pte - setup new PTE entry for given page and add reverse page |
2986 | * mapping. If needed, the function allocates a page table or uses a pre-allocated one. | ||
2865 | * | 2987 | * |
2866 | * @vma: virtual memory area | 2988 | * @fe: fault environment |
2867 | * @address: user virtual address | 2989 | * @memcg: memcg to charge page (only for private mappings) |
2868 | * @page: page to map | 2990 | * @page: page to map |
2869 | * @pte: pointer to target page table entry | ||
2870 | * @write: true, if new entry is writable | ||
2871 | * @anon: true, if it's anonymous page | ||
2872 | * | 2991 | * |
2873 | * Caller must hold page table lock relevant for @pte. | 2992 | * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. |
2874 | * | 2993 | * |
2875 | * Target users are page handler itself and implementations of | 2994 | * Target users are page handler itself and implementations of |
2876 | * vm_ops->map_pages. | 2995 | * vm_ops->map_pages. |
2877 | */ | 2996 | */ |
2878 | void do_set_pte(struct vm_area_struct *vma, unsigned long address, | 2997 | int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, |
2879 | struct page *page, pte_t *pte, bool write, bool anon) | 2998 | struct page *page) |
2880 | { | 2999 | { |
3000 | struct vm_area_struct *vma = fe->vma; | ||
3001 | bool write = fe->flags & FAULT_FLAG_WRITE; | ||
2881 | pte_t entry; | 3002 | pte_t entry; |
3003 | int ret; | ||
3004 | |||
3005 | if (pmd_none(*fe->pmd) && PageTransCompound(page) && | ||
3006 | IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { | ||
3007 | /* THP on COW? */ | ||
3008 | VM_BUG_ON_PAGE(memcg, page); | ||
3009 | |||
3010 | ret = do_set_pmd(fe, page); | ||
3011 | if (ret != VM_FAULT_FALLBACK) | ||
3012 | return ret; | ||
3013 | } | ||
3014 | |||
3015 | if (!fe->pte) { | ||
3016 | ret = pte_alloc_one_map(fe); | ||
3017 | if (ret) | ||
3018 | return ret; | ||
3019 | } | ||
3020 | |||
3021 | /* Re-check under ptl */ | ||
3022 | if (unlikely(!pte_none(*fe->pte))) | ||
3023 | return VM_FAULT_NOPAGE; | ||
2882 | 3024 | ||
2883 | flush_icache_page(vma, page); | 3025 | flush_icache_page(vma, page); |
2884 | entry = mk_pte(page, vma->vm_page_prot); | 3026 | entry = mk_pte(page, vma->vm_page_prot); |
2885 | if (write) | 3027 | if (write) |
2886 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 3028 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2887 | if (anon) { | 3029 | /* copy-on-write page */ |
3030 | if (write && !(vma->vm_flags & VM_SHARED)) { | ||
2888 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 3031 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2889 | page_add_new_anon_rmap(page, vma, address, false); | 3032 | page_add_new_anon_rmap(page, vma, fe->address, false); |
3033 | mem_cgroup_commit_charge(page, memcg, false, false); | ||
3034 | lru_cache_add_active_or_unevictable(page, vma); | ||
2890 | } else { | 3035 | } else { |
2891 | inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); | 3036 | inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); |
2892 | page_add_file_rmap(page); | 3037 | page_add_file_rmap(page, false); |
2893 | } | 3038 | } |
2894 | set_pte_at(vma->vm_mm, address, pte, entry); | 3039 | set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); |
2895 | 3040 | ||
2896 | /* no need to invalidate: a not-present page won't be cached */ | 3041 | /* no need to invalidate: a not-present page won't be cached */ |
2897 | update_mmu_cache(vma, address, pte); | 3042 | update_mmu_cache(vma, fe->address, fe->pte); |
3043 | |||
3044 | return 0; | ||
2898 | } | 3045 | } |
2899 | 3046 | ||
2900 | static unsigned long fault_around_bytes __read_mostly = | 3047 | static unsigned long fault_around_bytes __read_mostly = |
@@ -2961,57 +3108,66 @@ late_initcall(fault_around_debugfs); | |||
2961 | * fault_around_pages() value (and therefore to page order). This way it's | 3108 | * fault_around_pages() value (and therefore to page order). This way it's |
2962 | * easier to guarantee that we don't cross page table boundaries. | 3109 | * easier to guarantee that we don't cross page table boundaries. |
2963 | */ | 3110 | */ |
2964 | static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | 3111 | static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) |
2965 | pte_t *pte, pgoff_t pgoff, unsigned int flags) | ||
2966 | { | 3112 | { |
2967 | unsigned long start_addr, nr_pages, mask; | 3113 | unsigned long address = fe->address, nr_pages, mask; |
2968 | pgoff_t max_pgoff; | 3114 | pgoff_t end_pgoff; |
2969 | struct vm_fault vmf; | 3115 | int off, ret = 0; |
2970 | int off; | ||
2971 | 3116 | ||
2972 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; | 3117 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; |
2973 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; | 3118 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; |
2974 | 3119 | ||
2975 | start_addr = max(address & mask, vma->vm_start); | 3120 | fe->address = max(address & mask, fe->vma->vm_start); |
2976 | off = ((address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); | 3121 | off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); |
2977 | pte -= off; | 3122 | start_pgoff -= off; |
2978 | pgoff -= off; | ||
2979 | 3123 | ||
2980 | /* | 3124 | /* |
2981 | * max_pgoff is either end of page table or end of vma | 3125 | * end_pgoff is either end of page table or end of vma |
2982 | * or fault_around_pages() from pgoff, depending what is nearest. | 3126 | * or fault_around_pages() from start_pgoff, depending what is nearest. |
2983 | */ | 3127 | */ |
2984 | max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | 3128 | end_pgoff = start_pgoff - |
3129 | ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + | ||
2985 | PTRS_PER_PTE - 1; | 3130 | PTRS_PER_PTE - 1; |
2986 | max_pgoff = min3(max_pgoff, vma_pages(vma) + vma->vm_pgoff - 1, | 3131 | end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, |
2987 | pgoff + nr_pages - 1); | 3132 | start_pgoff + nr_pages - 1); |
2988 | 3133 | ||
2989 | /* Check if it makes any sense to call ->map_pages */ | 3134 | if (pmd_none(*fe->pmd)) { |
2990 | while (!pte_none(*pte)) { | 3135 | fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); |
2991 | if (++pgoff > max_pgoff) | 3136 | smp_wmb(); /* See comment in __pte_alloc() */ |
2992 | return; | ||
2993 | start_addr += PAGE_SIZE; | ||
2994 | if (start_addr >= vma->vm_end) | ||
2995 | return; | ||
2996 | pte++; | ||
2997 | } | 3137 | } |
2998 | 3138 | ||
2999 | vmf.virtual_address = (void __user *) start_addr; | 3139 | fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); |
3000 | vmf.pte = pte; | 3140 | |
3001 | vmf.pgoff = pgoff; | 3141 | /* preallocated pagetable is unused: free it */ |
3002 | vmf.max_pgoff = max_pgoff; | 3142 | if (fe->prealloc_pte) { |
3003 | vmf.flags = flags; | 3143 | pte_free(fe->vma->vm_mm, fe->prealloc_pte); |
3004 | vmf.gfp_mask = __get_fault_gfp_mask(vma); | 3144 | fe->prealloc_pte = 0; |
3005 | vma->vm_ops->map_pages(vma, &vmf); | 3145 | } |
3146 | /* Huge page is mapped? Page fault is solved */ | ||
3147 | if (pmd_trans_huge(*fe->pmd)) { | ||
3148 | ret = VM_FAULT_NOPAGE; | ||
3149 | goto out; | ||
3150 | } | ||
3151 | |||
3152 | /* ->map_pages() hasn't done anything useful. Cold page cache? */ | ||
3153 | if (!fe->pte) | ||
3154 | goto out; | ||
3155 | |||
3156 | /* check if the page fault is solved */ | ||
3157 | fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); | ||
3158 | if (!pte_none(*fe->pte)) | ||
3159 | ret = VM_FAULT_NOPAGE; | ||
3160 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
3161 | out: | ||
3162 | fe->address = address; | ||
3163 | fe->pte = NULL; | ||
3164 | return ret; | ||
3006 | } | 3165 | } |
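A worked example of the window do_fault_around() computes, assuming 4K pages, PTRS_PER_PTE = 512 and the default fault_around_bytes of 65536: for a read fault at 0x00439000 in a VMA spanning 0x00400000-0x00800000,

	nr_pages    = 65536 >> 12 = 16, mask = ~0xffff
	fe->address = max(0x00439000 & mask, vm_start) = 0x00430000
	off         = ((0x00439000 - 0x00430000) >> 12) & 511 = 9, so start_pgoff -= 9
	end_pgoff   = start_pgoff - (0x430 & 511) + 511 = start_pgoff + 463
	              then clamped by min3() to start_pgoff + 15

so ->map_pages() is asked to populate at most the 16 ptes covering 0x00430000-0x0043ffff, never crossing the page table or the VMA.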
3007 | 3166 | ||
3008 | static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3167 | static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) |
3009 | unsigned long address, pmd_t *pmd, | ||
3010 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | ||
3011 | { | 3168 | { |
3169 | struct vm_area_struct *vma = fe->vma; | ||
3012 | struct page *fault_page; | 3170 | struct page *fault_page; |
3013 | spinlock_t *ptl; | ||
3014 | pte_t *pte; | ||
3015 | int ret = 0; | 3171 | int ret = 0; |
3016 | 3172 | ||
3017 | /* | 3173 | /* |
@@ -3020,85 +3176,64 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3020 | * something). | 3176 | * something). |
3021 | */ | 3177 | */ |
3022 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { | 3178 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { |
3023 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 3179 | ret = do_fault_around(fe, pgoff); |
3024 | do_fault_around(vma, address, pte, pgoff, flags); | 3180 | if (ret) |
3025 | if (!pte_same(*pte, orig_pte)) | 3181 | return ret; |
3026 | goto unlock_out; | ||
3027 | pte_unmap_unlock(pte, ptl); | ||
3028 | } | 3182 | } |
3029 | 3183 | ||
3030 | ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); | 3184 | ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); |
3031 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3185 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3032 | return ret; | 3186 | return ret; |
3033 | 3187 | ||
3034 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 3188 | ret |= alloc_set_pte(fe, NULL, fault_page); |
3035 | if (unlikely(!pte_same(*pte, orig_pte))) { | 3189 | if (fe->pte) |
3036 | pte_unmap_unlock(pte, ptl); | 3190 | pte_unmap_unlock(fe->pte, fe->ptl); |
3037 | unlock_page(fault_page); | ||
3038 | put_page(fault_page); | ||
3039 | return ret; | ||
3040 | } | ||
3041 | do_set_pte(vma, address, fault_page, pte, false, false); | ||
3042 | unlock_page(fault_page); | 3191 | unlock_page(fault_page); |
3043 | unlock_out: | 3192 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3044 | pte_unmap_unlock(pte, ptl); | 3193 | put_page(fault_page); |
3045 | return ret; | 3194 | return ret; |
3046 | } | 3195 | } |
3047 | 3196 | ||
3048 | static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3197 | static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) |
3049 | unsigned long address, pmd_t *pmd, | ||
3050 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | ||
3051 | { | 3198 | { |
3199 | struct vm_area_struct *vma = fe->vma; | ||
3052 | struct page *fault_page, *new_page; | 3200 | struct page *fault_page, *new_page; |
3053 | void *fault_entry; | 3201 | void *fault_entry; |
3054 | struct mem_cgroup *memcg; | 3202 | struct mem_cgroup *memcg; |
3055 | spinlock_t *ptl; | ||
3056 | pte_t *pte; | ||
3057 | int ret; | 3203 | int ret; |
3058 | 3204 | ||
3059 | if (unlikely(anon_vma_prepare(vma))) | 3205 | if (unlikely(anon_vma_prepare(vma))) |
3060 | return VM_FAULT_OOM; | 3206 | return VM_FAULT_OOM; |
3061 | 3207 | ||
3062 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 3208 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); |
3063 | if (!new_page) | 3209 | if (!new_page) |
3064 | return VM_FAULT_OOM; | 3210 | return VM_FAULT_OOM; |
3065 | 3211 | ||
3066 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) { | 3212 | if (mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, |
3213 | &memcg, false)) { | ||
3067 | put_page(new_page); | 3214 | put_page(new_page); |
3068 | return VM_FAULT_OOM; | 3215 | return VM_FAULT_OOM; |
3069 | } | 3216 | } |
3070 | 3217 | ||
3071 | ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page, | 3218 | ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); |
3072 | &fault_entry); | ||
3073 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3219 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3074 | goto uncharge_out; | 3220 | goto uncharge_out; |
3075 | 3221 | ||
3076 | if (!(ret & VM_FAULT_DAX_LOCKED)) | 3222 | if (!(ret & VM_FAULT_DAX_LOCKED)) |
3077 | copy_user_highpage(new_page, fault_page, address, vma); | 3223 | copy_user_highpage(new_page, fault_page, fe->address, vma); |
3078 | __SetPageUptodate(new_page); | 3224 | __SetPageUptodate(new_page); |
3079 | 3225 | ||
3080 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 3226 | ret |= alloc_set_pte(fe, memcg, new_page); |
3081 | if (unlikely(!pte_same(*pte, orig_pte))) { | 3227 | if (fe->pte) |
3082 | pte_unmap_unlock(pte, ptl); | 3228 | pte_unmap_unlock(fe->pte, fe->ptl); |
3083 | if (!(ret & VM_FAULT_DAX_LOCKED)) { | ||
3084 | unlock_page(fault_page); | ||
3085 | put_page(fault_page); | ||
3086 | } else { | ||
3087 | dax_unlock_mapping_entry(vma->vm_file->f_mapping, | ||
3088 | pgoff); | ||
3089 | } | ||
3090 | goto uncharge_out; | ||
3091 | } | ||
3092 | do_set_pte(vma, address, new_page, pte, true, true); | ||
3093 | mem_cgroup_commit_charge(new_page, memcg, false, false); | ||
3094 | lru_cache_add_active_or_unevictable(new_page, vma); | ||
3095 | pte_unmap_unlock(pte, ptl); | ||
3096 | if (!(ret & VM_FAULT_DAX_LOCKED)) { | 3229 | if (!(ret & VM_FAULT_DAX_LOCKED)) { |
3097 | unlock_page(fault_page); | 3230 | unlock_page(fault_page); |
3098 | put_page(fault_page); | 3231 | put_page(fault_page); |
3099 | } else { | 3232 | } else { |
3100 | dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); | 3233 | dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); |
3101 | } | 3234 | } |
3235 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | ||
3236 | goto uncharge_out; | ||
3102 | return ret; | 3237 | return ret; |
3103 | uncharge_out: | 3238 | uncharge_out: |
3104 | mem_cgroup_cancel_charge(new_page, memcg, false); | 3239 | mem_cgroup_cancel_charge(new_page, memcg, false); |
@@ -3106,18 +3241,15 @@ uncharge_out: | |||
3106 | return ret; | 3241 | return ret; |
3107 | } | 3242 | } |
3108 | 3243 | ||
3109 | static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3244 | static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) |
3110 | unsigned long address, pmd_t *pmd, | ||
3111 | pgoff_t pgoff, unsigned int flags, pte_t orig_pte) | ||
3112 | { | 3245 | { |
3246 | struct vm_area_struct *vma = fe->vma; | ||
3113 | struct page *fault_page; | 3247 | struct page *fault_page; |
3114 | struct address_space *mapping; | 3248 | struct address_space *mapping; |
3115 | spinlock_t *ptl; | ||
3116 | pte_t *pte; | ||
3117 | int dirtied = 0; | 3249 | int dirtied = 0; |
3118 | int ret, tmp; | 3250 | int ret, tmp; |
3119 | 3251 | ||
3120 | ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); | 3252 | ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); |
3121 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) | 3253 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) |
3122 | return ret; | 3254 | return ret; |
3123 | 3255 | ||
@@ -3127,7 +3259,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3127 | */ | 3259 | */ |
3128 | if (vma->vm_ops->page_mkwrite) { | 3260 | if (vma->vm_ops->page_mkwrite) { |
3129 | unlock_page(fault_page); | 3261 | unlock_page(fault_page); |
3130 | tmp = do_page_mkwrite(vma, fault_page, address); | 3262 | tmp = do_page_mkwrite(vma, fault_page, fe->address); |
3131 | if (unlikely(!tmp || | 3263 | if (unlikely(!tmp || |
3132 | (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | 3264 | (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { |
3133 | put_page(fault_page); | 3265 | put_page(fault_page); |
@@ -3135,15 +3267,15 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3135 | } | 3267 | } |
3136 | } | 3268 | } |
3137 | 3269 | ||
3138 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 3270 | ret |= alloc_set_pte(fe, NULL, fault_page); |
3139 | if (unlikely(!pte_same(*pte, orig_pte))) { | 3271 | if (fe->pte) |
3140 | pte_unmap_unlock(pte, ptl); | 3272 | pte_unmap_unlock(fe->pte, fe->ptl); |
3273 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | | ||
3274 | VM_FAULT_RETRY))) { | ||
3141 | unlock_page(fault_page); | 3275 | unlock_page(fault_page); |
3142 | put_page(fault_page); | 3276 | put_page(fault_page); |
3143 | return ret; | 3277 | return ret; |
3144 | } | 3278 | } |
3145 | do_set_pte(vma, address, fault_page, pte, true, false); | ||
3146 | pte_unmap_unlock(pte, ptl); | ||
3147 | 3279 | ||
3148 | if (set_page_dirty(fault_page)) | 3280 | if (set_page_dirty(fault_page)) |
3149 | dirtied = 1; | 3281 | dirtied = 1; |
@@ -3175,23 +3307,19 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3175 | * The mmap_sem may have been released depending on flags and our | 3307 | * The mmap_sem may have been released depending on flags and our |
3176 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3308 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3177 | */ | 3309 | */ |
3178 | static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3310 | static int do_fault(struct fault_env *fe) |
3179 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
3180 | unsigned int flags, pte_t orig_pte) | ||
3181 | { | 3311 | { |
3182 | pgoff_t pgoff = linear_page_index(vma, address); | 3312 | struct vm_area_struct *vma = fe->vma; |
3313 | pgoff_t pgoff = linear_page_index(vma, fe->address); | ||
3183 | 3314 | ||
3184 | pte_unmap(page_table); | ||
3185 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ | 3315 | /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ |
3186 | if (!vma->vm_ops->fault) | 3316 | if (!vma->vm_ops->fault) |
3187 | return VM_FAULT_SIGBUS; | 3317 | return VM_FAULT_SIGBUS; |
3188 | if (!(flags & FAULT_FLAG_WRITE)) | 3318 | if (!(fe->flags & FAULT_FLAG_WRITE)) |
3189 | return do_read_fault(mm, vma, address, pmd, pgoff, flags, | 3319 | return do_read_fault(fe, pgoff); |
3190 | orig_pte); | ||
3191 | if (!(vma->vm_flags & VM_SHARED)) | 3320 | if (!(vma->vm_flags & VM_SHARED)) |
3192 | return do_cow_fault(mm, vma, address, pmd, pgoff, flags, | 3321 | return do_cow_fault(fe, pgoff); |
3193 | orig_pte); | 3322 | return do_shared_fault(fe, pgoff); |
3194 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | ||
3195 | } | 3323 | } |
3196 | 3324 | ||
3197 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | 3325 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, |
@@ -3209,11 +3337,10 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | |||
3209 | return mpol_misplaced(page, vma, addr); | 3337 | return mpol_misplaced(page, vma, addr); |
3210 | } | 3338 | } |
3211 | 3339 | ||
3212 | static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 3340 | static int do_numa_page(struct fault_env *fe, pte_t pte) |
3213 | unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) | ||
3214 | { | 3341 | { |
3342 | struct vm_area_struct *vma = fe->vma; | ||
3215 | struct page *page = NULL; | 3343 | struct page *page = NULL; |
3216 | spinlock_t *ptl; | ||
3217 | int page_nid = -1; | 3344 | int page_nid = -1; |
3218 | int last_cpupid; | 3345 | int last_cpupid; |
3219 | int target_nid; | 3346 | int target_nid; |
@@ -3233,10 +3360,10 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3233 | * page table entry is not accessible, so there would be no | 3360 | * page table entry is not accessible, so there would be no |
3234 | * concurrent hardware modifications to the PTE. | 3361 | * concurrent hardware modifications to the PTE. |
3235 | */ | 3362 | */ |
3236 | ptl = pte_lockptr(mm, pmd); | 3363 | fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); |
3237 | spin_lock(ptl); | 3364 | spin_lock(fe->ptl); |
3238 | if (unlikely(!pte_same(*ptep, pte))) { | 3365 | if (unlikely(!pte_same(*fe->pte, pte))) { |
3239 | pte_unmap_unlock(ptep, ptl); | 3366 | pte_unmap_unlock(fe->pte, fe->ptl); |
3240 | goto out; | 3367 | goto out; |
3241 | } | 3368 | } |
3242 | 3369 | ||
@@ -3245,18 +3372,18 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3245 | pte = pte_mkyoung(pte); | 3372 | pte = pte_mkyoung(pte); |
3246 | if (was_writable) | 3373 | if (was_writable) |
3247 | pte = pte_mkwrite(pte); | 3374 | pte = pte_mkwrite(pte); |
3248 | set_pte_at(mm, addr, ptep, pte); | 3375 | set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); |
3249 | update_mmu_cache(vma, addr, ptep); | 3376 | update_mmu_cache(vma, fe->address, fe->pte); |
3250 | 3377 | ||
3251 | page = vm_normal_page(vma, addr, pte); | 3378 | page = vm_normal_page(vma, fe->address, pte); |
3252 | if (!page) { | 3379 | if (!page) { |
3253 | pte_unmap_unlock(ptep, ptl); | 3380 | pte_unmap_unlock(fe->pte, fe->ptl); |
3254 | return 0; | 3381 | return 0; |
3255 | } | 3382 | } |
3256 | 3383 | ||
3257 | /* TODO: handle PTE-mapped THP */ | 3384 | /* TODO: handle PTE-mapped THP */ |
3258 | if (PageCompound(page)) { | 3385 | if (PageCompound(page)) { |
3259 | pte_unmap_unlock(ptep, ptl); | 3386 | pte_unmap_unlock(fe->pte, fe->ptl); |
3260 | return 0; | 3387 | return 0; |
3261 | } | 3388 | } |
3262 | 3389 | ||
@@ -3280,8 +3407,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3280 | 3407 | ||
3281 | last_cpupid = page_cpupid_last(page); | 3408 | last_cpupid = page_cpupid_last(page); |
3282 | page_nid = page_to_nid(page); | 3409 | page_nid = page_to_nid(page); |
3283 | target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); | 3410 | target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, |
3284 | pte_unmap_unlock(ptep, ptl); | 3411 | &flags); |
3412 | pte_unmap_unlock(fe->pte, fe->ptl); | ||
3285 | if (target_nid == -1) { | 3413 | if (target_nid == -1) { |
3286 | put_page(page); | 3414 | put_page(page); |
3287 | goto out; | 3415 | goto out; |
@@ -3301,24 +3429,29 @@ out: | |||
3301 | return 0; | 3429 | return 0; |
3302 | } | 3430 | } |
3303 | 3431 | ||
3304 | static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | 3432 | static int create_huge_pmd(struct fault_env *fe) |
3305 | unsigned long address, pmd_t *pmd, unsigned int flags) | ||
3306 | { | 3433 | { |
3434 | struct vm_area_struct *vma = fe->vma; | ||
3307 | if (vma_is_anonymous(vma)) | 3435 | if (vma_is_anonymous(vma)) |
3308 | return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags); | 3436 | return do_huge_pmd_anonymous_page(fe); |
3309 | if (vma->vm_ops->pmd_fault) | 3437 | if (vma->vm_ops->pmd_fault) |
3310 | return vma->vm_ops->pmd_fault(vma, address, pmd, flags); | 3438 | return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, |
3439 | fe->flags); | ||
3311 | return VM_FAULT_FALLBACK; | 3440 | return VM_FAULT_FALLBACK; |
3312 | } | 3441 | } |
3313 | 3442 | ||
3314 | static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | 3443 | static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) |
3315 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd, | ||
3316 | unsigned int flags) | ||
3317 | { | 3444 | { |
3318 | if (vma_is_anonymous(vma)) | 3445 | if (vma_is_anonymous(fe->vma)) |
3319 | return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); | 3446 | return do_huge_pmd_wp_page(fe, orig_pmd); |
3320 | if (vma->vm_ops->pmd_fault) | 3447 | if (fe->vma->vm_ops->pmd_fault) |
3321 | return vma->vm_ops->pmd_fault(vma, address, pmd, flags); | 3448 | return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, |
3449 | fe->flags); | ||
3450 | |||
3451 | /* COW handled on pte level: split pmd */ | ||
3452 | VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); | ||
3453 | split_huge_pmd(fe->vma, fe->pmd, fe->address); | ||
3454 | |||
3322 | return VM_FAULT_FALLBACK; | 3455 | return VM_FAULT_FALLBACK; |
3323 | } | 3456 | } |
3324 | 3457 | ||
@@ -3331,59 +3464,79 @@ static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3331 | * with external mmu caches can use to update those (ie the Sparc or | 3464 | * with external mmu caches can use to update those (ie the Sparc or |
3332 | * PowerPC hashed page tables that act as extended TLBs). | 3465 | * PowerPC hashed page tables that act as extended TLBs). |
3333 | * | 3466 | * |
3334 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | 3467 | * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow |
3335 | * but allow concurrent faults), and pte mapped but not yet locked. | 3468 | * concurrent faults). |
3336 | * We return with pte unmapped and unlocked. | ||
3337 | * | 3469 | * |
3338 | * The mmap_sem may have been released depending on flags and our | 3470 | * The mmap_sem may have been released depending on flags and our return value. |
3339 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3471 | * See filemap_fault() and __lock_page_or_retry(). |
3340 | */ | 3472 | */ |
3341 | static int handle_pte_fault(struct mm_struct *mm, | 3473 | static int handle_pte_fault(struct fault_env *fe) |
3342 | struct vm_area_struct *vma, unsigned long address, | ||
3343 | pte_t *pte, pmd_t *pmd, unsigned int flags) | ||
3344 | { | 3474 | { |
3345 | pte_t entry; | 3475 | pte_t entry; |
3346 | spinlock_t *ptl; | ||
3347 | 3476 | ||
3348 | /* | 3477 | if (unlikely(pmd_none(*fe->pmd))) { |
3349 | * some architectures can have larger ptes than wordsize, | 3478 | /* |
3350 | * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y, | 3479 | * Leave __pte_alloc() until later: because vm_ops->fault may |
3351 | * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses. | 3480 | * want to allocate huge page, and if we expose page table |
3352 | * The code below just needs a consistent view for the ifs and | 3481 | * for an instant, it will be difficult to retract from |
3353 | * we later double check anyway with the ptl lock held. So here | 3482 | * concurrent faults and from rmap lookups. |
3354 | * a barrier will do. | 3483 | */ |
3355 | */ | 3484 | fe->pte = NULL; |
3356 | entry = *pte; | 3485 | } else { |
3357 | barrier(); | 3486 | /* See comment in pte_alloc_one_map() */ |
3358 | if (!pte_present(entry)) { | 3487 | if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) |
3488 | return 0; | ||
3489 | /* | ||
3490 | * A regular pmd is established and it can't morph into a huge | ||
3491 | * pmd from under us anymore at this point because we hold the | ||
3492 | * mmap_sem read mode and khugepaged takes it in write mode. | ||
3493 | * So now it's safe to run pte_offset_map(). | ||
3494 | */ | ||
3495 | fe->pte = pte_offset_map(fe->pmd, fe->address); | ||
3496 | |||
3497 | entry = *fe->pte; | ||
3498 | |||
3499 | /* | ||
3500 | * some architectures can have larger ptes than wordsize, | ||
3501 | * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and | ||
3502 | * CONFIG_32BIT=y, so READ_ONCE or ACCESS_ONCE cannot guarantee | ||
3503 | * atomic accesses. The code below just needs a consistent | ||
3504 | * view for the ifs and we later double check anyway with the | ||
3505 | * ptl lock held. So here a barrier will do. | ||
3506 | */ | ||
3507 | barrier(); | ||
3359 | if (pte_none(entry)) { | 3508 | if (pte_none(entry)) { |
3360 | if (vma_is_anonymous(vma)) | 3509 | pte_unmap(fe->pte); |
3361 | return do_anonymous_page(mm, vma, address, | 3510 | fe->pte = NULL; |
3362 | pte, pmd, flags); | ||
3363 | else | ||
3364 | return do_fault(mm, vma, address, pte, pmd, | ||
3365 | flags, entry); | ||
3366 | } | 3511 | } |
3367 | return do_swap_page(mm, vma, address, | ||
3368 | pte, pmd, flags, entry); | ||
3369 | } | 3512 | } |
3370 | 3513 | ||
3514 | if (!fe->pte) { | ||
3515 | if (vma_is_anonymous(fe->vma)) | ||
3516 | return do_anonymous_page(fe); | ||
3517 | else | ||
3518 | return do_fault(fe); | ||
3519 | } | ||
3520 | |||
3521 | if (!pte_present(entry)) | ||
3522 | return do_swap_page(fe, entry); | ||
3523 | |||
3371 | if (pte_protnone(entry)) | 3524 | if (pte_protnone(entry)) |
3372 | return do_numa_page(mm, vma, address, entry, pte, pmd); | 3525 | return do_numa_page(fe, entry); |
3373 | 3526 | ||
3374 | ptl = pte_lockptr(mm, pmd); | 3527 | fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); |
3375 | spin_lock(ptl); | 3528 | spin_lock(fe->ptl); |
3376 | if (unlikely(!pte_same(*pte, entry))) | 3529 | if (unlikely(!pte_same(*fe->pte, entry))) |
3377 | goto unlock; | 3530 | goto unlock; |
3378 | if (flags & FAULT_FLAG_WRITE) { | 3531 | if (fe->flags & FAULT_FLAG_WRITE) { |
3379 | if (!pte_write(entry)) | 3532 | if (!pte_write(entry)) |
3380 | return do_wp_page(mm, vma, address, | 3533 | return do_wp_page(fe, entry); |
3381 | pte, pmd, ptl, entry); | ||
3382 | entry = pte_mkdirty(entry); | 3534 | entry = pte_mkdirty(entry); |
3383 | } | 3535 | } |
3384 | entry = pte_mkyoung(entry); | 3536 | entry = pte_mkyoung(entry); |
3385 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { | 3537 | if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, |
3386 | update_mmu_cache(vma, address, pte); | 3538 | fe->flags & FAULT_FLAG_WRITE)) { |
3539 | update_mmu_cache(fe->vma, fe->address, fe->pte); | ||
3387 | } else { | 3540 | } else { |
3388 | /* | 3541 | /* |
3389 | * This is needed only for protection faults but the arch code | 3542 | * This is needed only for protection faults but the arch code |
@@ -3391,11 +3544,11 @@ static int handle_pte_fault(struct mm_struct *mm, | |||
3391 | * This still avoids useless tlb flushes for .text page faults | 3544 | * This still avoids useless tlb flushes for .text page faults |
3392 | * with threads. | 3545 | * with threads. |
3393 | */ | 3546 | */ |
3394 | if (flags & FAULT_FLAG_WRITE) | 3547 | if (fe->flags & FAULT_FLAG_WRITE) |
3395 | flush_tlb_fix_spurious_fault(vma, address); | 3548 | flush_tlb_fix_spurious_fault(fe->vma, fe->address); |
3396 | } | 3549 | } |
3397 | unlock: | 3550 | unlock: |
3398 | pte_unmap_unlock(pte, ptl); | 3551 | pte_unmap_unlock(fe->pte, fe->ptl); |
3399 | return 0; | 3552 | return 0; |
3400 | } | 3553 | } |
3401 | 3554 | ||
@@ -3405,87 +3558,51 @@ unlock: | |||
3405 | * The mmap_sem may have been released depending on flags and our | 3558 | * The mmap_sem may have been released depending on flags and our |
3406 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3559 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3407 | */ | 3560 | */ |
3408 | static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3561 | static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, |
3409 | unsigned long address, unsigned int flags) | 3562 | unsigned int flags) |
3410 | { | 3563 | { |
3564 | struct fault_env fe = { | ||
3565 | .vma = vma, | ||
3566 | .address = address, | ||
3567 | .flags = flags, | ||
3568 | }; | ||
3569 | struct mm_struct *mm = vma->vm_mm; | ||
3411 | pgd_t *pgd; | 3570 | pgd_t *pgd; |
3412 | pud_t *pud; | 3571 | pud_t *pud; |
3413 | pmd_t *pmd; | ||
3414 | pte_t *pte; | ||
3415 | |||
3416 | if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, | ||
3417 | flags & FAULT_FLAG_INSTRUCTION, | ||
3418 | flags & FAULT_FLAG_REMOTE)) | ||
3419 | return VM_FAULT_SIGSEGV; | ||
3420 | |||
3421 | if (unlikely(is_vm_hugetlb_page(vma))) | ||
3422 | return hugetlb_fault(mm, vma, address, flags); | ||
3423 | 3572 | ||
3424 | pgd = pgd_offset(mm, address); | 3573 | pgd = pgd_offset(mm, address); |
3425 | pud = pud_alloc(mm, pgd, address); | 3574 | pud = pud_alloc(mm, pgd, address); |
3426 | if (!pud) | 3575 | if (!pud) |
3427 | return VM_FAULT_OOM; | 3576 | return VM_FAULT_OOM; |
3428 | pmd = pmd_alloc(mm, pud, address); | 3577 | fe.pmd = pmd_alloc(mm, pud, address); |
3429 | if (!pmd) | 3578 | if (!fe.pmd) |
3430 | return VM_FAULT_OOM; | 3579 | return VM_FAULT_OOM; |
3431 | if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { | 3580 | if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { |
3432 | int ret = create_huge_pmd(mm, vma, address, pmd, flags); | 3581 | int ret = create_huge_pmd(&fe); |
3433 | if (!(ret & VM_FAULT_FALLBACK)) | 3582 | if (!(ret & VM_FAULT_FALLBACK)) |
3434 | return ret; | 3583 | return ret; |
3435 | } else { | 3584 | } else { |
3436 | pmd_t orig_pmd = *pmd; | 3585 | pmd_t orig_pmd = *fe.pmd; |
3437 | int ret; | 3586 | int ret; |
3438 | 3587 | ||
3439 | barrier(); | 3588 | barrier(); |
3440 | if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { | 3589 | if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { |
3441 | unsigned int dirty = flags & FAULT_FLAG_WRITE; | ||
3442 | |||
3443 | if (pmd_protnone(orig_pmd)) | 3590 | if (pmd_protnone(orig_pmd)) |
3444 | return do_huge_pmd_numa_page(mm, vma, address, | 3591 | return do_huge_pmd_numa_page(&fe, orig_pmd); |
3445 | orig_pmd, pmd); | ||
3446 | 3592 | ||
3447 | if (dirty && !pmd_write(orig_pmd)) { | 3593 | if ((fe.flags & FAULT_FLAG_WRITE) && |
3448 | ret = wp_huge_pmd(mm, vma, address, pmd, | 3594 | !pmd_write(orig_pmd)) { |
3449 | orig_pmd, flags); | 3595 | ret = wp_huge_pmd(&fe, orig_pmd); |
3450 | if (!(ret & VM_FAULT_FALLBACK)) | 3596 | if (!(ret & VM_FAULT_FALLBACK)) |
3451 | return ret; | 3597 | return ret; |
3452 | } else { | 3598 | } else { |
3453 | huge_pmd_set_accessed(mm, vma, address, pmd, | 3599 | huge_pmd_set_accessed(&fe, orig_pmd); |
3454 | orig_pmd, dirty); | ||
3455 | return 0; | 3600 | return 0; |
3456 | } | 3601 | } |
3457 | } | 3602 | } |
3458 | } | 3603 | } |
3459 | 3604 | ||
3460 | /* | 3605 | return handle_pte_fault(&fe); |
3461 | * Use pte_alloc() instead of pte_alloc_map, because we can't | ||
3462 | * run pte_offset_map on the pmd, if an huge pmd could | ||
3463 | * materialize from under us from a different thread. | ||
3464 | */ | ||
3465 | if (unlikely(pte_alloc(mm, pmd, address))) | ||
3466 | return VM_FAULT_OOM; | ||
3467 | /* | ||
3468 | * If a huge pmd materialized under us just retry later. Use | ||
3469 | * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd | ||
3470 | * didn't become pmd_trans_huge under us and then back to pmd_none, as | ||
3471 | * a result of MADV_DONTNEED running immediately after a huge pmd fault | ||
3472 | * in a different thread of this mm, in turn leading to a misleading | ||
3473 | * pmd_trans_huge() retval. All we have to ensure is that it is a | ||
3474 | * regular pmd that we can walk with pte_offset_map() and we can do that | ||
3475 | * through an atomic read in C, which is what pmd_trans_unstable() | ||
3476 | * provides. | ||
3477 | */ | ||
3478 | if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd))) | ||
3479 | return 0; | ||
3480 | /* | ||
3481 | * A regular pmd is established and it can't morph into a huge pmd | ||
3482 | * from under us anymore at this point because we hold the mmap_sem | ||
3483 | * read mode and khugepaged takes it in write mode. So now it's | ||
3484 | * safe to run pte_offset_map(). | ||
3485 | */ | ||
3486 | pte = pte_offset_map(pmd, address); | ||
3487 | |||
3488 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); | ||
3489 | } | 3606 | } |
3490 | 3607 | ||
3491 | /* | 3608 | /* |
@@ -3494,15 +3611,15 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3494 | * The mmap_sem may have been released depending on flags and our | 3611 | * The mmap_sem may have been released depending on flags and our |
3495 | * return value. See filemap_fault() and __lock_page_or_retry(). | 3612 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3496 | */ | 3613 | */ |
3497 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 3614 | int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, |
3498 | unsigned long address, unsigned int flags) | 3615 | unsigned int flags) |
3499 | { | 3616 | { |
3500 | int ret; | 3617 | int ret; |
3501 | 3618 | ||
3502 | __set_current_state(TASK_RUNNING); | 3619 | __set_current_state(TASK_RUNNING); |
3503 | 3620 | ||
3504 | count_vm_event(PGFAULT); | 3621 | count_vm_event(PGFAULT); |
3505 | mem_cgroup_count_vm_event(mm, PGFAULT); | 3622 | mem_cgroup_count_vm_event(vma->vm_mm, PGFAULT); |
3506 | 3623 | ||
3507 | /* do counter updates before entering really critical section. */ | 3624 | /* do counter updates before entering really critical section. */ |
3508 | check_sync_rss_stat(current); | 3625 | check_sync_rss_stat(current); |
@@ -3514,7 +3631,15 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3514 | if (flags & FAULT_FLAG_USER) | 3631 | if (flags & FAULT_FLAG_USER) |
3515 | mem_cgroup_oom_enable(); | 3632 | mem_cgroup_oom_enable(); |
3516 | 3633 | ||
3517 | ret = __handle_mm_fault(mm, vma, address, flags); | 3634 | if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, |
3635 | flags & FAULT_FLAG_INSTRUCTION, | ||
3636 | flags & FAULT_FLAG_REMOTE)) | ||
3637 | return VM_FAULT_SIGSEGV; | ||
3638 | |||
3639 | if (unlikely(is_vm_hugetlb_page(vma))) | ||
3640 | ret = hugetlb_fault(vma->vm_mm, vma, address, flags); | ||
3641 | else | ||
3642 | ret = __handle_mm_fault(vma, address, flags); | ||
3518 | 3643 | ||
3519 | if (flags & FAULT_FLAG_USER) { | 3644 | if (flags & FAULT_FLAG_USER) { |
3520 | mem_cgroup_oom_disable(); | 3645 | mem_cgroup_oom_disable(); |
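The new prototype ripples out to every caller: handle_mm_fault() now takes the VMA (the mm is reached through vma->vm_mm), and the hugetlb and arch access-permission checks move out of __handle_mm_fault(). A caller-side sketch, illustrative only and simplified (real arch fault handlers also deal with stack expansion and retry):

	/* in an arch page-fault handler, under down_read(&mm->mmap_sem) */
	struct vm_area_struct *vma = find_vma(mm, address);

	if (vma && vma->vm_start <= address)
		fault = handle_mm_fault(vma, address, flags);	/* was (mm, vma, address, flags) */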
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e3cbdcaff2a5..82d0b98d27f8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -449,6 +449,25 @@ out_fail: | |||
449 | return -1; | 449 | return -1; |
450 | } | 450 | } |
451 | 451 | ||
452 | static struct zone * __meminit move_pfn_range(int zone_shift, | ||
453 | unsigned long start_pfn, unsigned long end_pfn) | ||
454 | { | ||
455 | struct zone *zone = page_zone(pfn_to_page(start_pfn)); | ||
456 | int ret = 0; | ||
457 | |||
458 | if (zone_shift < 0) | ||
459 | ret = move_pfn_range_left(zone + zone_shift, zone, | ||
460 | start_pfn, end_pfn); | ||
461 | else if (zone_shift) | ||
462 | ret = move_pfn_range_right(zone, zone + zone_shift, | ||
463 | start_pfn, end_pfn); | ||
464 | |||
465 | if (ret) | ||
466 | return NULL; | ||
467 | |||
468 | return zone + zone_shift; | ||
469 | } | ||
470 | |||
452 | static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, | 471 | static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, |
453 | unsigned long end_pfn) | 472 | unsigned long end_pfn) |
454 | { | 473 | { |
@@ -1028,6 +1047,37 @@ static void node_states_set_node(int node, struct memory_notify *arg) | |||
1028 | node_set_state(node, N_MEMORY); | 1047 | node_set_state(node, N_MEMORY); |
1029 | } | 1048 | } |
1030 | 1049 | ||
1050 | int zone_can_shift(unsigned long pfn, unsigned long nr_pages, | ||
1051 | enum zone_type target) | ||
1052 | { | ||
1053 | struct zone *zone = page_zone(pfn_to_page(pfn)); | ||
1054 | enum zone_type idx = zone_idx(zone); | ||
1055 | int i; | ||
1056 | |||
1057 | if (idx < target) { | ||
1058 | /* pages must be at end of current zone */ | ||
1059 | if (pfn + nr_pages != zone_end_pfn(zone)) | ||
1060 | return 0; | ||
1061 | |||
1062 | /* no zones in use between current zone and target */ | ||
1063 | for (i = idx + 1; i < target; i++) | ||
1064 | if (zone_is_initialized(zone - idx + i)) | ||
1065 | return 0; | ||
1066 | } | ||
1067 | |||
1068 | if (target < idx) { | ||
1069 | /* pages must be at beginning of current zone */ | ||
1070 | if (pfn != zone->zone_start_pfn) | ||
1071 | return 0; | ||
1072 | |||
1073 | /* no zones in use between current zone and target */ | ||
1074 | for (i = target + 1; i < idx; i++) | ||
1075 | if (zone_is_initialized(zone - idx + i)) | ||
1076 | return 0; | ||
1077 | } | ||
1078 | |||
1079 | return target - idx; | ||
1080 | } | ||
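zone_can_shift() returns the signed zone-index delta (target - idx) when the pfn range can be rezoned, or 0 when no move is possible (or necessary); online_pages() below feeds that delta straight into move_pfn_range(). For example, on a configuration where ZONE_MOVABLE immediately follows ZONE_NORMAL, onlining pages that sit at the end of ZONE_NORMAL as movable yields a shift of +1 and move_pfn_range() takes the move_pfn_range_right() path, while onlining the first pages of ZONE_MOVABLE as kernel memory yields -1 and the left variant.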
1031 | 1081 | ||
1032 | /* Must be protected by mem_hotplug_begin() */ | 1082 | /* Must be protected by mem_hotplug_begin() */ |
1033 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) | 1083 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) |
@@ -1039,6 +1089,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
1039 | int nid; | 1089 | int nid; |
1040 | int ret; | 1090 | int ret; |
1041 | struct memory_notify arg; | 1091 | struct memory_notify arg; |
1092 | int zone_shift = 0; | ||
1042 | 1093 | ||
1043 | /* | 1094 | /* |
1044 | * This doesn't need a lock to do pfn_to_page(). | 1095 | * This doesn't need a lock to do pfn_to_page(). |
@@ -1052,19 +1103,14 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
1052 | !can_online_high_movable(zone)) | 1103 | !can_online_high_movable(zone)) |
1053 | return -EINVAL; | 1104 | return -EINVAL; |
1054 | 1105 | ||
1055 | if (online_type == MMOP_ONLINE_KERNEL && | 1106 | if (online_type == MMOP_ONLINE_KERNEL) |
1056 | zone_idx(zone) == ZONE_MOVABLE) { | 1107 | zone_shift = zone_can_shift(pfn, nr_pages, ZONE_NORMAL); |
1057 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) | 1108 | else if (online_type == MMOP_ONLINE_MOVABLE) |
1058 | return -EINVAL; | 1109 | zone_shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE); |
1059 | } | ||
1060 | if (online_type == MMOP_ONLINE_MOVABLE && | ||
1061 | zone_idx(zone) == ZONE_MOVABLE - 1) { | ||
1062 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) | ||
1063 | return -EINVAL; | ||
1064 | } | ||
1065 | 1110 | ||
1066 | /* Previous code may changed the zone of the pfn range */ | 1111 | zone = move_pfn_range(zone_shift, pfn, pfn + nr_pages); |
1067 | zone = page_zone(pfn_to_page(pfn)); | 1112 | if (!zone) |
1113 | return -EINVAL; | ||
1068 | 1114 | ||
1069 | arg.start_pfn = pfn; | 1115 | arg.start_pfn = pfn; |
1070 | arg.nr_pages = nr_pages; | 1116 | arg.nr_pages = nr_pages; |
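A reading aid for the online_pages() hunk above (illustrative only, with demo_can_online_movable() being a made-up name): zone_can_shift() reports how far, and in which direction, the zone holding the range may be shifted, and move_pfn_range() performs the resize, returning NULL when the shift cannot be honoured.

    /*
     * Sketch, not part of the patch: "can this pfn range end up as movable
     * memory?"  zone_can_shift() returns a signed zone offset: positive to
     * move toward a higher zone (e.g. ZONE_NORMAL -> ZONE_MOVABLE), negative
     * toward a lower one, and 0 either when the range is already in the
     * target zone or when it is pinned where it is (not at the zone edge,
     * or an initialized zone sits in between).
     */
    static bool demo_can_online_movable(unsigned long pfn, unsigned long nr_pages)
    {
            enum zone_type idx = zone_idx(page_zone(pfn_to_page(pfn)));
            int shift = zone_can_shift(pfn, nr_pages, ZONE_MOVABLE);

            return shift != 0 || idx == ZONE_MOVABLE;
    }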
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 297d6854f849..53e40d3f3933 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -512,6 +512,8 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, | |||
512 | } | 512 | } |
513 | } | 513 | } |
514 | 514 | ||
515 | if (pmd_trans_unstable(pmd)) | ||
516 | return 0; | ||
515 | retry: | 517 | retry: |
516 | pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); | 518 | pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); |
517 | for (; addr != end; pte++, addr += PAGE_SIZE) { | 519 | for (; addr != end; pte++, addr += PAGE_SIZE) { |
@@ -529,7 +531,7 @@ retry: | |||
529 | nid = page_to_nid(page); | 531 | nid = page_to_nid(page); |
530 | if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) | 532 | if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) |
531 | continue; | 533 | continue; |
532 | if (PageTransCompound(page) && PageAnon(page)) { | 534 | if (PageTransCompound(page)) { |
533 | get_page(page); | 535 | get_page(page); |
534 | pte_unmap_unlock(pte, ptl); | 536 | pte_unmap_unlock(pte, ptl); |
535 | lock_page(page); | 537 | lock_page(page); |
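The queue_pages_pte_range() hunk above, together with the mm/mprotect.c and mm/mremap.c hunks further down, settles on one guard for pte-table walkers: after an optional split, test pmd_trans_unstable() rather than pmd_none() before mapping the pte table, since the pmd may be none, huge again, or hold a migration entry. A minimal sketch of that pattern; demo_walk_pmd() and the per-pte body are illustrative only:

    static int demo_walk_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                             unsigned long addr, unsigned long end)
    {
            spinlock_t *ptl;
            pte_t *pte;

            if (pmd_trans_huge(*pmd))
                    split_huge_pmd(vma, pmd, addr);

            /* none, under collapse/split, or a migration entry: skip it */
            if (pmd_trans_unstable(pmd))
                    return 0;

            pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
            for (; addr != end; pte++, addr += PAGE_SIZE) {
                    /* ... per-pte work ... */
            }
            pte_unmap_unlock(pte - 1, ptl);
            return 0;
    }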
diff --git a/mm/migrate.c b/mm/migrate.c index bd3fdc202e8b..2232f6923cc7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/vmalloc.h> | 31 | #include <linux/vmalloc.h> |
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | #include <linux/backing-dev.h> | 33 | #include <linux/backing-dev.h> |
34 | #include <linux/compaction.h> | ||
34 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
35 | #include <linux/hugetlb.h> | 36 | #include <linux/hugetlb.h> |
36 | #include <linux/hugetlb_cgroup.h> | 37 | #include <linux/hugetlb_cgroup.h> |
@@ -73,6 +74,81 @@ int migrate_prep_local(void) | |||
73 | return 0; | 74 | return 0; |
74 | } | 75 | } |
75 | 76 | ||
77 | bool isolate_movable_page(struct page *page, isolate_mode_t mode) | ||
78 | { | ||
79 | struct address_space *mapping; | ||
80 | |||
81 | /* | ||
82 | * Avoid burning cycles with pages that are yet under __free_pages(), | ||
83 | * or just got freed under us. | ||
84 | * | ||
85 | * In case we 'win' a race for a movable page being freed under us and | ||
86 | * raise its refcount, preventing __free_pages() from doing its job, | ||
87 | * the put_page() at the end of this block will take care of | ||
88 | * releasing this page, thus avoiding a nasty leakage. | ||
89 | */ | ||
90 | if (unlikely(!get_page_unless_zero(page))) | ||
91 | goto out; | ||
92 | |||
93 | /* | ||
94 | * Check PageMovable before taking the PG_lock because the page's owner | ||
95 | * assumes nobody touches the PG_lock of a newly allocated page, so | ||
96 | * unconditionally grabbing the lock would break the owner's assumptions. | ||
97 | */ | ||
98 | if (unlikely(!__PageMovable(page))) | ||
99 | goto out_putpage; | ||
100 | /* | ||
101 | * As movable pages are not isolated from LRU lists, concurrent | ||
102 | * compaction threads can race against page migration functions | ||
103 | * as well as against the release of a page. | ||
104 | * | ||
105 | * In order to avoid having an already isolated movable page | ||
106 | * being (wrongly) re-isolated while it is under migration, | ||
107 | * or to avoid attempting to isolate pages being released, | ||
108 | * let's be sure we have the page lock | ||
109 | * before proceeding with the movable page isolation steps. | ||
110 | */ | ||
111 | if (unlikely(!trylock_page(page))) | ||
112 | goto out_putpage; | ||
113 | |||
114 | if (!PageMovable(page) || PageIsolated(page)) | ||
115 | goto out_no_isolated; | ||
116 | |||
117 | mapping = page_mapping(page); | ||
118 | VM_BUG_ON_PAGE(!mapping, page); | ||
119 | |||
120 | if (!mapping->a_ops->isolate_page(page, mode)) | ||
121 | goto out_no_isolated; | ||
122 | |||
123 | /* Drivers shouldn't use the PG_isolated bit of page->flags */ | ||
124 | WARN_ON_ONCE(PageIsolated(page)); | ||
125 | __SetPageIsolated(page); | ||
126 | unlock_page(page); | ||
127 | |||
128 | return true; | ||
129 | |||
130 | out_no_isolated: | ||
131 | unlock_page(page); | ||
132 | out_putpage: | ||
133 | put_page(page); | ||
134 | out: | ||
135 | return false; | ||
136 | } | ||
137 | |||
138 | /* It should be called on a page which is PG_movable */ | ||
139 | void putback_movable_page(struct page *page) | ||
140 | { | ||
141 | struct address_space *mapping; | ||
142 | |||
143 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
144 | VM_BUG_ON_PAGE(!PageMovable(page), page); | ||
145 | VM_BUG_ON_PAGE(!PageIsolated(page), page); | ||
146 | |||
147 | mapping = page_mapping(page); | ||
148 | mapping->a_ops->putback_page(page); | ||
149 | __ClearPageIsolated(page); | ||
150 | } | ||
151 | |||
76 | /* | 152 | /* |
77 | * Put previously isolated pages back onto the appropriate lists | 153 | * Put previously isolated pages back onto the appropriate lists |
78 | * from where they were once taken off for compaction/migration. | 154 | * from where they were once taken off for compaction/migration. |
@@ -94,10 +170,23 @@ void putback_movable_pages(struct list_head *l) | |||
94 | list_del(&page->lru); | 170 | list_del(&page->lru); |
95 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 171 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
96 | page_is_file_cache(page)); | 172 | page_is_file_cache(page)); |
97 | if (unlikely(isolated_balloon_page(page))) | 173 | /* |
98 | balloon_page_putback(page); | 174 | * We isolated a non-LRU movable page, so here we can use
99 | else | 175 | * __PageMovable because an LRU page's mapping cannot have
176 | * PAGE_MAPPING_MOVABLE set. | ||
177 | */ | ||
178 | if (unlikely(__PageMovable(page))) { | ||
179 | VM_BUG_ON_PAGE(!PageIsolated(page), page); | ||
180 | lock_page(page); | ||
181 | if (PageMovable(page)) | ||
182 | putback_movable_page(page); | ||
183 | else | ||
184 | __ClearPageIsolated(page); | ||
185 | unlock_page(page); | ||
186 | put_page(page); | ||
187 | } else { | ||
100 | putback_lru_page(page); | 188 | putback_lru_page(page); |
189 | } | ||
101 | } | 190 | } |
102 | } | 191 | } |
103 | 192 | ||
@@ -170,7 +259,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
170 | } else if (PageAnon(new)) | 259 | } else if (PageAnon(new)) |
171 | page_add_anon_rmap(new, vma, addr, false); | 260 | page_add_anon_rmap(new, vma, addr, false); |
172 | else | 261 | else |
173 | page_add_file_rmap(new); | 262 | page_add_file_rmap(new, false); |
174 | 263 | ||
175 | if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) | 264 | if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) |
176 | mlock_vma_page(new); | 265 | mlock_vma_page(new); |
@@ -594,7 +683,7 @@ EXPORT_SYMBOL(migrate_page_copy); | |||
594 | ***********************************************************/ | 683 | ***********************************************************/ |
595 | 684 | ||
596 | /* | 685 | /* |
597 | * Common logic to directly migrate a single page suitable for | 686 | * Common logic to directly migrate a single LRU page suitable for |
598 | * pages that do not use PagePrivate/PagePrivate2. | 687 | * pages that do not use PagePrivate/PagePrivate2. |
599 | * | 688 | * |
600 | * Pages are locked upon entry and exit. | 689 | * Pages are locked upon entry and exit. |
@@ -757,33 +846,72 @@ static int move_to_new_page(struct page *newpage, struct page *page, | |||
757 | enum migrate_mode mode) | 846 | enum migrate_mode mode) |
758 | { | 847 | { |
759 | struct address_space *mapping; | 848 | struct address_space *mapping; |
760 | int rc; | 849 | int rc = -EAGAIN; |
850 | bool is_lru = !__PageMovable(page); | ||
761 | 851 | ||
762 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 852 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
763 | VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); | 853 | VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); |
764 | 854 | ||
765 | mapping = page_mapping(page); | 855 | mapping = page_mapping(page); |
766 | if (!mapping) | 856 | |
767 | rc = migrate_page(mapping, newpage, page, mode); | 857 | if (likely(is_lru)) { |
768 | else if (mapping->a_ops->migratepage) | 858 | if (!mapping) |
859 | rc = migrate_page(mapping, newpage, page, mode); | ||
860 | else if (mapping->a_ops->migratepage) | ||
861 | /* | ||
862 | * Most pages have a mapping and most filesystems | ||
863 | * provide a migratepage callback. Anonymous pages | ||
864 | * are part of swap space which also has its own | ||
865 | * migratepage callback. This is the most common path | ||
866 | * for page migration. | ||
867 | */ | ||
868 | rc = mapping->a_ops->migratepage(mapping, newpage, | ||
869 | page, mode); | ||
870 | else | ||
871 | rc = fallback_migrate_page(mapping, newpage, | ||
872 | page, mode); | ||
873 | } else { | ||
769 | /* | 874 | /* |
770 | * Most pages have a mapping and most filesystems provide a | 875 | * A non-LRU page could have been released after the
771 | * migratepage callback. Anonymous pages are part of swap | 876 | * isolation step. In that case, we shouldn't try migration. |
772 | * space which also has its own migratepage callback. This | ||
773 | * is the most common path for page migration. | ||
774 | */ | 877 | */ |
775 | rc = mapping->a_ops->migratepage(mapping, newpage, page, mode); | 878 | VM_BUG_ON_PAGE(!PageIsolated(page), page); |
776 | else | 879 | if (!PageMovable(page)) { |
777 | rc = fallback_migrate_page(mapping, newpage, page, mode); | 880 | rc = MIGRATEPAGE_SUCCESS; |
881 | __ClearPageIsolated(page); | ||
882 | goto out; | ||
883 | } | ||
884 | |||
885 | rc = mapping->a_ops->migratepage(mapping, newpage, | ||
886 | page, mode); | ||
887 | WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS && | ||
888 | !PageIsolated(page)); | ||
889 | } | ||
778 | 890 | ||
779 | /* | 891 | /* |
780 | * When successful, old pagecache page->mapping must be cleared before | 892 | * When successful, old pagecache page->mapping must be cleared before |
781 | * page is freed; but stats require that PageAnon be left as PageAnon. | 893 | * page is freed; but stats require that PageAnon be left as PageAnon. |
782 | */ | 894 | */ |
783 | if (rc == MIGRATEPAGE_SUCCESS) { | 895 | if (rc == MIGRATEPAGE_SUCCESS) { |
784 | if (!PageAnon(page)) | 896 | if (__PageMovable(page)) { |
897 | VM_BUG_ON_PAGE(!PageIsolated(page), page); | ||
898 | |||
899 | /* | ||
900 | * We clear PG_movable under the page lock so that no compactor | ||
901 | * can try to migrate this page. | ||
902 | */ | ||
903 | __ClearPageIsolated(page); | ||
904 | } | ||
905 | |||
906 | /* | ||
907 | * Anonymous and movable page->mapping will be cleared by | ||
908 | * free_pages_prepare, so don't reset it here; keeping it | ||
909 | * lets type checks such as PageAnon keep working. | ||
910 | */ | ||
911 | if (!PageMappingFlags(page)) | ||
785 | page->mapping = NULL; | 912 | page->mapping = NULL; |
786 | } | 913 | } |
914 | out: | ||
787 | return rc; | 915 | return rc; |
788 | } | 916 | } |
789 | 917 | ||
@@ -793,6 +921,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
793 | int rc = -EAGAIN; | 921 | int rc = -EAGAIN; |
794 | int page_was_mapped = 0; | 922 | int page_was_mapped = 0; |
795 | struct anon_vma *anon_vma = NULL; | 923 | struct anon_vma *anon_vma = NULL; |
924 | bool is_lru = !__PageMovable(page); | ||
796 | 925 | ||
797 | if (!trylock_page(page)) { | 926 | if (!trylock_page(page)) { |
798 | if (!force || mode == MIGRATE_ASYNC) | 927 | if (!force || mode == MIGRATE_ASYNC) |
@@ -861,15 +990,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
861 | if (unlikely(!trylock_page(newpage))) | 990 | if (unlikely(!trylock_page(newpage))) |
862 | goto out_unlock; | 991 | goto out_unlock; |
863 | 992 | ||
864 | if (unlikely(isolated_balloon_page(page))) { | 993 | if (unlikely(!is_lru)) { |
865 | /* | 994 | rc = move_to_new_page(newpage, page, mode); |
866 | * A ballooned page does not need any special attention from | ||
867 | * physical to virtual reverse mapping procedures. | ||
868 | * Skip any attempt to unmap PTEs or to remap swap cache, | ||
869 | * in order to avoid burning cycles at rmap level, and perform | ||
870 | * the page migration right away (proteced by page lock). | ||
871 | */ | ||
872 | rc = balloon_page_migrate(newpage, page, mode); | ||
873 | goto out_unlock_both; | 995 | goto out_unlock_both; |
874 | } | 996 | } |
875 | 997 | ||
@@ -915,6 +1037,19 @@ out_unlock: | |||
915 | put_anon_vma(anon_vma); | 1037 | put_anon_vma(anon_vma); |
916 | unlock_page(page); | 1038 | unlock_page(page); |
917 | out: | 1039 | out: |
1040 | /* | ||
1041 | * If migration is successful, drop the refcount of newpage; this | ||
1042 | * will not free the page because the new page owner has raised | ||
1043 | * the refcount. Also, if it is an LRU page, add it to the LRU | ||
1044 | * list here. | ||
1045 | */ | ||
1046 | if (rc == MIGRATEPAGE_SUCCESS) { | ||
1047 | if (unlikely(__PageMovable(newpage))) | ||
1048 | put_page(newpage); | ||
1049 | else | ||
1050 | putback_lru_page(newpage); | ||
1051 | } | ||
1052 | |||
918 | return rc; | 1053 | return rc; |
919 | } | 1054 | } |
920 | 1055 | ||
@@ -948,6 +1083,18 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, | |||
948 | 1083 | ||
949 | if (page_count(page) == 1) { | 1084 | if (page_count(page) == 1) { |
950 | /* page was freed from under us. So we are done. */ | 1085 | /* page was freed from under us. So we are done. */ |
1086 | ClearPageActive(page); | ||
1087 | ClearPageUnevictable(page); | ||
1088 | if (unlikely(__PageMovable(page))) { | ||
1089 | lock_page(page); | ||
1090 | if (!PageMovable(page)) | ||
1091 | __ClearPageIsolated(page); | ||
1092 | unlock_page(page); | ||
1093 | } | ||
1094 | if (put_new_page) | ||
1095 | put_new_page(newpage, private); | ||
1096 | else | ||
1097 | put_page(newpage); | ||
951 | goto out; | 1098 | goto out; |
952 | } | 1099 | } |
953 | 1100 | ||
@@ -960,10 +1107,8 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, | |||
960 | } | 1107 | } |
961 | 1108 | ||
962 | rc = __unmap_and_move(page, newpage, force, mode); | 1109 | rc = __unmap_and_move(page, newpage, force, mode); |
963 | if (rc == MIGRATEPAGE_SUCCESS) { | 1110 | if (rc == MIGRATEPAGE_SUCCESS) |
964 | put_new_page = NULL; | ||
965 | set_page_owner_migrate_reason(newpage, reason); | 1111 | set_page_owner_migrate_reason(newpage, reason); |
966 | } | ||
967 | 1112 | ||
968 | out: | 1113 | out: |
969 | if (rc != -EAGAIN) { | 1114 | if (rc != -EAGAIN) { |
@@ -976,33 +1121,45 @@ out: | |||
976 | list_del(&page->lru); | 1121 | list_del(&page->lru); |
977 | dec_zone_page_state(page, NR_ISOLATED_ANON + | 1122 | dec_zone_page_state(page, NR_ISOLATED_ANON + |
978 | page_is_file_cache(page)); | 1123 | page_is_file_cache(page)); |
979 | /* Soft-offlined page shouldn't go through lru cache list */ | 1124 | } |
980 | if (reason == MR_MEMORY_FAILURE && rc == MIGRATEPAGE_SUCCESS) { | 1125 | |
1126 | /* | ||
1127 | * If migration is successful, release the reference grabbed during | ||
1128 | * isolation. Otherwise, restore the page to the right list unless | ||
1129 | * we want to retry. | ||
1130 | */ | ||
1131 | if (rc == MIGRATEPAGE_SUCCESS) { | ||
1132 | put_page(page); | ||
1133 | if (reason == MR_MEMORY_FAILURE) { | ||
981 | /* | 1134 | /* |
982 | * With this release, we free successfully migrated | 1135 | * Set PG_HWPoison on just freed page |
983 | * page and set PG_HWPoison on just freed page | 1136 | * intentionally. Although it's rather weird, |
984 | * intentionally. Although it's rather weird, it's how | 1137 | * it's how HWPoison flag works at the moment. |
985 | * HWPoison flag works at the moment. | ||
986 | */ | 1138 | */ |
987 | put_page(page); | ||
988 | if (!test_set_page_hwpoison(page)) | 1139 | if (!test_set_page_hwpoison(page)) |
989 | num_poisoned_pages_inc(); | 1140 | num_poisoned_pages_inc(); |
990 | } else | 1141 | } |
991 | putback_lru_page(page); | 1142 | } else { |
992 | } | 1143 | if (rc != -EAGAIN) { |
1144 | if (likely(!__PageMovable(page))) { | ||
1145 | putback_lru_page(page); | ||
1146 | goto put_new; | ||
1147 | } | ||
993 | 1148 | ||
994 | /* | 1149 | lock_page(page); |
995 | * If migration was not successful and there's a freeing callback, use | 1150 | if (PageMovable(page)) |
996 | * it. Otherwise, putback_lru_page() will drop the reference grabbed | 1151 | putback_movable_page(page); |
997 | * during isolation. | 1152 | else |
998 | */ | 1153 | __ClearPageIsolated(page); |
999 | if (put_new_page) | 1154 | unlock_page(page); |
1000 | put_new_page(newpage, private); | 1155 | put_page(page); |
1001 | else if (unlikely(__is_movable_balloon_page(newpage))) { | 1156 | } |
1002 | /* drop our reference, page already in the balloon */ | 1157 | put_new: |
1003 | put_page(newpage); | 1158 | if (put_new_page) |
1004 | } else | 1159 | put_new_page(newpage, private); |
1005 | putback_lru_page(newpage); | 1160 | else |
1161 | put_page(newpage); | ||
1162 | } | ||
1006 | 1163 | ||
1007 | if (result) { | 1164 | if (result) { |
1008 | if (rc) | 1165 | if (rc) |
@@ -1829,8 +1986,7 @@ fail_putback: | |||
1829 | } | 1986 | } |
1830 | 1987 | ||
1831 | orig_entry = *pmd; | 1988 | orig_entry = *pmd; |
1832 | entry = mk_pmd(new_page, vma->vm_page_prot); | 1989 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); |
1833 | entry = pmd_mkhuge(entry); | ||
1834 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1990 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
1835 | 1991 | ||
1836 | /* | 1992 | /* |
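The mm/migrate.c hunks above teach the generic migration path about non-LRU movable pages purely through address_space_operations: isolate_page() is called under the page lock, the usual migratepage() moves the contents, and putback_page() undoes a failed isolation. A minimal driver-side sketch of those callbacks; every demo_movable_* name is illustrative, and __SetPageMovable() is the helper this series adds outside the hunks shown here:

    static bool demo_movable_isolate(struct page *page, isolate_mode_t mode)
    {
            /* called with the page locked; take it off driver-private lists */
            return true;            /* false means "don't migrate this one" */
    }

    static int demo_movable_migrate(struct address_space *mapping,
                                    struct page *newpage, struct page *page,
                                    enum migrate_mode mode)
    {
            /*
             * Copy driver-private state from page to newpage and repoint the
             * driver's bookkeeping at newpage, all under both page locks.
             */
            return MIGRATEPAGE_SUCCESS;
    }

    static void demo_movable_putback(struct page *page)
    {
            /* migration failed or was aborted; reinsert into driver lists */
    }

    static const struct address_space_operations demo_movable_aops = {
            .isolate_page   = demo_movable_isolate,
            .migratepage    = demo_movable_migrate,
            .putback_page   = demo_movable_putback,
    };

    static void demo_movable_add_page(struct address_space *mapping,
                                      struct page *page)
    {
            /* mapping->a_ops must point at demo_movable_aops */
            __SetPageMovable(page, mapping);
    }

Note that the core, not the driver, manages PG_isolated: isolate_movable_page() sets it, and the WARN_ON_ONCE() in move_to_new_page() above fires if a driver clears it on a successful migratepage().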
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/personality.h> | 25 | #include <linux/personality.h> |
26 | #include <linux/security.h> | 26 | #include <linux/security.h> |
27 | #include <linux/hugetlb.h> | 27 | #include <linux/hugetlb.h> |
28 | #include <linux/shmem_fs.h> | ||
28 | #include <linux/profile.h> | 29 | #include <linux/profile.h> |
29 | #include <linux/export.h> | 30 | #include <linux/export.h> |
30 | #include <linux/mount.h> | 31 | #include <linux/mount.h> |
@@ -675,6 +676,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
675 | } | 676 | } |
676 | } | 677 | } |
677 | 678 | ||
679 | vma_adjust_trans_huge(vma, start, end, adjust_next); | ||
680 | |||
678 | if (file) { | 681 | if (file) { |
679 | mapping = file->f_mapping; | 682 | mapping = file->f_mapping; |
680 | root = &mapping->i_mmap; | 683 | root = &mapping->i_mmap; |
@@ -695,8 +698,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
695 | } | 698 | } |
696 | } | 699 | } |
697 | 700 | ||
698 | vma_adjust_trans_huge(vma, start, end, adjust_next); | ||
699 | |||
700 | anon_vma = vma->anon_vma; | 701 | anon_vma = vma->anon_vma; |
701 | if (!anon_vma && adjust_next) | 702 | if (!anon_vma && adjust_next) |
702 | anon_vma = next->anon_vma; | 703 | anon_vma = next->anon_vma; |
@@ -1897,8 +1898,19 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | |||
1897 | return -ENOMEM; | 1898 | return -ENOMEM; |
1898 | 1899 | ||
1899 | get_area = current->mm->get_unmapped_area; | 1900 | get_area = current->mm->get_unmapped_area; |
1900 | if (file && file->f_op->get_unmapped_area) | 1901 | if (file) { |
1901 | get_area = file->f_op->get_unmapped_area; | 1902 | if (file->f_op->get_unmapped_area) |
1903 | get_area = file->f_op->get_unmapped_area; | ||
1904 | } else if (flags & MAP_SHARED) { | ||
1905 | /* | ||
1906 | * mmap_region() will call shmem_zero_setup() to create a file, | ||
1907 | * so use shmem's get_unmapped_area in case it can be huge. | ||
1908 | * do_mmap_pgoff() will clear pgoff, so match alignment. | ||
1909 | */ | ||
1910 | pgoff = 0; | ||
1911 | get_area = shmem_get_unmapped_area; | ||
1912 | } | ||
1913 | |||
1902 | addr = get_area(file, addr, len, pgoff, flags); | 1914 | addr = get_area(file, addr, len, pgoff, flags); |
1903 | if (IS_ERR_VALUE(addr)) | 1915 | if (IS_ERR_VALUE(addr)) |
1904 | return addr; | 1916 | return addr; |
@@ -2591,6 +2603,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | |||
2591 | /* drop PG_Mlocked flag for over-mapped range */ | 2603 | /* drop PG_Mlocked flag for over-mapped range */ |
2592 | for (tmp = vma; tmp->vm_start >= start + size; | 2604 | for (tmp = vma; tmp->vm_start >= start + size; |
2593 | tmp = tmp->vm_next) { | 2605 | tmp = tmp->vm_next) { |
2606 | /* | ||
2607 | * Split pmd and munlock page on the border | ||
2608 | * of the range. | ||
2609 | */ | ||
2610 | vma_adjust_trans_huge(tmp, start, start + size, 0); | ||
2611 | |||
2594 | munlock_vma_pages_range(tmp, | 2612 | munlock_vma_pages_range(tmp, |
2595 | max(tmp->vm_start, start), | 2613 | max(tmp->vm_start, start), |
2596 | min(tmp->vm_end, start + size)); | 2614 | min(tmp->vm_end, start + size)); |
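The get_unmapped_area() hunk above routes file-less MAP_SHARED requests through shmem_get_unmapped_area(), so a shared anonymous mapping can be handed a PMD-aligned address and, depending on the system's shmem huge-page configuration, be backed by huge pages. A small userspace check (illustrative; whether a 2MB-aligned address is actually returned depends on that configuration):

    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
            size_t len = 8UL << 20; /* 8MB: room for several PMD-sized pieces */
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_SHARED | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            printf("mapping at %p, 2MB-aligned: %s\n", p,
                   ((unsigned long)p & ((2UL << 20) - 1)) ? "no" : "yes");
            munmap(p, len);
            return 0;
    }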
diff --git a/mm/mprotect.c b/mm/mprotect.c index 5019a1ef2848..a4830f0325fe 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -163,7 +163,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | |||
163 | if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { | 163 | if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { |
164 | if (next - addr != HPAGE_PMD_SIZE) { | 164 | if (next - addr != HPAGE_PMD_SIZE) { |
165 | split_huge_pmd(vma, pmd, addr); | 165 | split_huge_pmd(vma, pmd, addr); |
166 | if (pmd_none(*pmd)) | 166 | if (pmd_trans_unstable(pmd)) |
167 | continue; | 167 | continue; |
168 | } else { | 168 | } else { |
169 | int nr_ptes = change_huge_pmd(vma, pmd, addr, | 169 | int nr_ptes = change_huge_pmd(vma, pmd, addr, |
diff --git a/mm/mremap.c b/mm/mremap.c index 1f157adfdaf9..da22ad2a5678 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -210,9 +210,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
210 | } | 210 | } |
211 | } | 211 | } |
212 | split_huge_pmd(vma, old_pmd, old_addr); | 212 | split_huge_pmd(vma, old_pmd, old_addr); |
213 | if (pmd_none(*old_pmd)) | 213 | if (pmd_trans_unstable(old_pmd)) |
214 | continue; | 214 | continue; |
215 | VM_BUG_ON(pmd_trans_huge(*old_pmd)); | ||
216 | } | 215 | } |
217 | if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) | 216 | if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) |
218 | break; | 217 | break; |
diff --git a/mm/nommu.c b/mm/nommu.c index c2e58880207f..95daf81a4855 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1809,7 +1809,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1809 | } | 1809 | } |
1810 | EXPORT_SYMBOL(filemap_fault); | 1810 | EXPORT_SYMBOL(filemap_fault); |
1811 | 1811 | ||
1812 | void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) | 1812 | void filemap_map_pages(struct fault_env *fe, |
1813 | pgoff_t start_pgoff, pgoff_t end_pgoff) | ||
1813 | { | 1814 | { |
1814 | BUG(); | 1815 | BUG(); |
1815 | } | 1816 | } |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ddf74487f848..d4a929d79470 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -274,7 +274,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc, | |||
274 | #endif | 274 | #endif |
275 | 275 | ||
276 | enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, | 276 | enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, |
277 | struct task_struct *task, unsigned long totalpages) | 277 | struct task_struct *task) |
278 | { | 278 | { |
279 | if (oom_unkillable_task(task, NULL, oc->nodemask)) | 279 | if (oom_unkillable_task(task, NULL, oc->nodemask)) |
280 | return OOM_SCAN_CONTINUE; | 280 | return OOM_SCAN_CONTINUE; |
@@ -311,7 +311,7 @@ static struct task_struct *select_bad_process(struct oom_control *oc, | |||
311 | for_each_process(p) { | 311 | for_each_process(p) { |
312 | unsigned int points; | 312 | unsigned int points; |
313 | 313 | ||
314 | switch (oom_scan_process_thread(oc, p, totalpages)) { | 314 | switch (oom_scan_process_thread(oc, p)) { |
315 | case OOM_SCAN_SELECT: | 315 | case OOM_SCAN_SELECT: |
316 | chosen = p; | 316 | chosen = p; |
317 | chosen_points = ULONG_MAX; | 317 | chosen_points = ULONG_MAX; |
@@ -383,8 +383,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
383 | rcu_read_unlock(); | 383 | rcu_read_unlock(); |
384 | } | 384 | } |
385 | 385 | ||
386 | static void dump_header(struct oom_control *oc, struct task_struct *p, | 386 | static void dump_header(struct oom_control *oc, struct task_struct *p) |
387 | struct mem_cgroup *memcg) | ||
388 | { | 387 | { |
389 | pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", | 388 | pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", |
390 | current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, | 389 | current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, |
@@ -392,12 +391,12 @@ static void dump_header(struct oom_control *oc, struct task_struct *p, | |||
392 | 391 | ||
393 | cpuset_print_current_mems_allowed(); | 392 | cpuset_print_current_mems_allowed(); |
394 | dump_stack(); | 393 | dump_stack(); |
395 | if (memcg) | 394 | if (oc->memcg) |
396 | mem_cgroup_print_oom_info(memcg, p); | 395 | mem_cgroup_print_oom_info(oc->memcg, p); |
397 | else | 396 | else |
398 | show_mem(SHOW_MEM_FILTER_NODES); | 397 | show_mem(SHOW_MEM_FILTER_NODES); |
399 | if (sysctl_oom_dump_tasks) | 398 | if (sysctl_oom_dump_tasks) |
400 | dump_tasks(memcg, oc->nodemask); | 399 | dump_tasks(oc->memcg, oc->nodemask); |
401 | } | 400 | } |
402 | 401 | ||
403 | /* | 402 | /* |
@@ -453,7 +452,7 @@ static bool __oom_reap_task(struct task_struct *tsk) | |||
453 | * We have to make sure to not race with the victim exit path | 452 | * We have to make sure to not race with the victim exit path |
454 | * and cause premature new oom victim selection: | 453 | * and cause premature new oom victim selection: |
455 | * __oom_reap_task exit_mm | 454 | * __oom_reap_task exit_mm |
456 | * atomic_inc_not_zero | 455 | * mmget_not_zero |
457 | * mmput | 456 | * mmput |
458 | * atomic_dec_and_test | 457 | * atomic_dec_and_test |
459 | * exit_oom_victim | 458 | * exit_oom_victim |
@@ -475,12 +474,22 @@ static bool __oom_reap_task(struct task_struct *tsk) | |||
475 | if (!p) | 474 | if (!p) |
476 | goto unlock_oom; | 475 | goto unlock_oom; |
477 | mm = p->mm; | 476 | mm = p->mm; |
478 | atomic_inc(&mm->mm_users); | 477 | atomic_inc(&mm->mm_count); |
479 | task_unlock(p); | 478 | task_unlock(p); |
480 | 479 | ||
481 | if (!down_read_trylock(&mm->mmap_sem)) { | 480 | if (!down_read_trylock(&mm->mmap_sem)) { |
482 | ret = false; | 481 | ret = false; |
483 | goto unlock_oom; | 482 | goto mm_drop; |
483 | } | ||
484 | |||
485 | /* | ||
486 | * Increase mm_users only after we know we will reap something, so | ||
487 | * that mmput_async() is called only when we have reaped something | ||
488 | * and the delayed __mmput() doesn't matter that much. | ||
489 | */ | ||
490 | if (!mmget_not_zero(mm)) { | ||
491 | up_read(&mm->mmap_sem); | ||
492 | goto mm_drop; | ||
484 | } | 493 | } |
485 | 494 | ||
486 | tlb_gather_mmu(&tlb, mm, 0, -1); | 495 | tlb_gather_mmu(&tlb, mm, 0, -1); |
@@ -522,15 +531,16 @@ static bool __oom_reap_task(struct task_struct *tsk) | |||
522 | * to release its memory. | 531 | * to release its memory. |
523 | */ | 532 | */ |
524 | set_bit(MMF_OOM_REAPED, &mm->flags); | 533 | set_bit(MMF_OOM_REAPED, &mm->flags); |
525 | unlock_oom: | ||
526 | mutex_unlock(&oom_lock); | ||
527 | /* | 534 | /* |
528 | * Drop our reference but make sure the mmput slow path is called from a | 535 | * Drop our reference but make sure the mmput slow path is called from a |
529 | * different context because we shouldn't risk we get stuck there and | 536 | * different context because we shouldn't risk we get stuck there and |
530 | * put the oom_reaper out of the way. | 537 | * put the oom_reaper out of the way. |
531 | */ | 538 | */ |
532 | if (mm) | 539 | mmput_async(mm); |
533 | mmput_async(mm); | 540 | mm_drop: |
541 | mmdrop(mm); | ||
542 | unlock_oom: | ||
543 | mutex_unlock(&oom_lock); | ||
534 | return ret; | 544 | return ret; |
535 | } | 545 | } |
536 | 546 | ||
@@ -739,7 +749,7 @@ void oom_killer_enable(void) | |||
739 | */ | 749 | */ |
740 | void oom_kill_process(struct oom_control *oc, struct task_struct *p, | 750 | void oom_kill_process(struct oom_control *oc, struct task_struct *p, |
741 | unsigned int points, unsigned long totalpages, | 751 | unsigned int points, unsigned long totalpages, |
742 | struct mem_cgroup *memcg, const char *message) | 752 | const char *message) |
743 | { | 753 | { |
744 | struct task_struct *victim = p; | 754 | struct task_struct *victim = p; |
745 | struct task_struct *child; | 755 | struct task_struct *child; |
@@ -765,7 +775,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
765 | task_unlock(p); | 775 | task_unlock(p); |
766 | 776 | ||
767 | if (__ratelimit(&oom_rs)) | 777 | if (__ratelimit(&oom_rs)) |
768 | dump_header(oc, p, memcg); | 778 | dump_header(oc, p); |
769 | 779 | ||
770 | pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", | 780 | pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", |
771 | message, task_pid_nr(p), p->comm, points); | 781 | message, task_pid_nr(p), p->comm, points); |
@@ -786,8 +796,8 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
786 | /* | 796 | /* |
787 | * oom_badness() returns 0 if the thread is unkillable | 797 | * oom_badness() returns 0 if the thread is unkillable |
788 | */ | 798 | */ |
789 | child_points = oom_badness(child, memcg, oc->nodemask, | 799 | child_points = oom_badness(child, |
790 | totalpages); | 800 | oc->memcg, oc->nodemask, totalpages); |
791 | if (child_points > victim_points) { | 801 | if (child_points > victim_points) { |
792 | put_task_struct(victim); | 802 | put_task_struct(victim); |
793 | victim = child; | 803 | victim = child; |
@@ -865,8 +875,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, | |||
865 | /* | 875 | /* |
866 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. | 876 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. |
867 | */ | 877 | */ |
868 | void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, | 878 | void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint) |
869 | struct mem_cgroup *memcg) | ||
870 | { | 879 | { |
871 | if (likely(!sysctl_panic_on_oom)) | 880 | if (likely(!sysctl_panic_on_oom)) |
872 | return; | 881 | return; |
@@ -882,7 +891,7 @@ void check_panic_on_oom(struct oom_control *oc, enum oom_constraint constraint, | |||
882 | /* Do not panic for oom kills triggered by sysrq */ | 891 | /* Do not panic for oom kills triggered by sysrq */ |
883 | if (is_sysrq_oom(oc)) | 892 | if (is_sysrq_oom(oc)) |
884 | return; | 893 | return; |
885 | dump_header(oc, NULL, memcg); | 894 | dump_header(oc, NULL); |
886 | panic("Out of memory: %s panic_on_oom is enabled\n", | 895 | panic("Out of memory: %s panic_on_oom is enabled\n", |
887 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); | 896 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); |
888 | } | 897 | } |
@@ -957,13 +966,13 @@ bool out_of_memory(struct oom_control *oc) | |||
957 | constraint = constrained_alloc(oc, &totalpages); | 966 | constraint = constrained_alloc(oc, &totalpages); |
958 | if (constraint != CONSTRAINT_MEMORY_POLICY) | 967 | if (constraint != CONSTRAINT_MEMORY_POLICY) |
959 | oc->nodemask = NULL; | 968 | oc->nodemask = NULL; |
960 | check_panic_on_oom(oc, constraint, NULL); | 969 | check_panic_on_oom(oc, constraint); |
961 | 970 | ||
962 | if (sysctl_oom_kill_allocating_task && current->mm && | 971 | if (sysctl_oom_kill_allocating_task && current->mm && |
963 | !oom_unkillable_task(current, NULL, oc->nodemask) && | 972 | !oom_unkillable_task(current, NULL, oc->nodemask) && |
964 | current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { | 973 | current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { |
965 | get_task_struct(current); | 974 | get_task_struct(current); |
966 | oom_kill_process(oc, current, 0, totalpages, NULL, | 975 | oom_kill_process(oc, current, 0, totalpages, |
967 | "Out of memory (oom_kill_allocating_task)"); | 976 | "Out of memory (oom_kill_allocating_task)"); |
968 | return true; | 977 | return true; |
969 | } | 978 | } |
@@ -971,12 +980,11 @@ bool out_of_memory(struct oom_control *oc) | |||
971 | p = select_bad_process(oc, &points, totalpages); | 980 | p = select_bad_process(oc, &points, totalpages); |
972 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 981 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
973 | if (!p && !is_sysrq_oom(oc)) { | 982 | if (!p && !is_sysrq_oom(oc)) { |
974 | dump_header(oc, NULL, NULL); | 983 | dump_header(oc, NULL); |
975 | panic("Out of memory and no killable processes...\n"); | 984 | panic("Out of memory and no killable processes...\n"); |
976 | } | 985 | } |
977 | if (p && p != (void *)-1UL) { | 986 | if (p && p != (void *)-1UL) { |
978 | oom_kill_process(oc, p, points, totalpages, NULL, | 987 | oom_kill_process(oc, p, points, totalpages, "Out of memory"); |
979 | "Out of memory"); | ||
980 | /* | 988 | /* |
981 | * Give the killed process a good chance to exit before trying | 989 | * Give the killed process a good chance to exit before trying |
982 | * to allocate memory again. | 990 | * to allocate memory again. |
@@ -988,14 +996,15 @@ bool out_of_memory(struct oom_control *oc) | |||
988 | 996 | ||
989 | /* | 997 | /* |
990 | * The pagefault handler calls here because it is out of memory, so kill a | 998 | * The pagefault handler calls here because it is out of memory, so kill a |
991 | * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a | 999 | * memory-hogging task. If oom_lock is held by somebody else, a parallel oom |
992 | * parallel oom killing is already in progress so do nothing. | 1000 | * killing is already in progress so do nothing. |
993 | */ | 1001 | */ |
994 | void pagefault_out_of_memory(void) | 1002 | void pagefault_out_of_memory(void) |
995 | { | 1003 | { |
996 | struct oom_control oc = { | 1004 | struct oom_control oc = { |
997 | .zonelist = NULL, | 1005 | .zonelist = NULL, |
998 | .nodemask = NULL, | 1006 | .nodemask = NULL, |
1007 | .memcg = NULL, | ||
999 | .gfp_mask = 0, | 1008 | .gfp_mask = 0, |
1000 | .order = 0, | 1009 | .order = 0, |
1001 | }; | 1010 | }; |
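With memcg folded into struct oom_control, callers stop threading a mem_cgroup through dump_header(), check_panic_on_oom() and oom_kill_process() and instead stash it in the control structure once. A hedged sketch of how a memcg OOM invocation is expected to look after this change; memcg, gfp_mask, order, task, points and totalpages stand in for the caller's values, and this is not code from the patch:

    static void demo_memcg_oom(struct mem_cgroup *memcg, gfp_t gfp_mask, int order,
                               struct task_struct *task, unsigned int points,
                               unsigned long totalpages)
    {
            struct oom_control oc = {
                    .zonelist       = NULL,
                    .nodemask       = NULL,
                    .memcg          = memcg,        /* consumed by dump_header()/dump_tasks() */
                    .gfp_mask       = gfp_mask,
                    .order          = order,
            };

            check_panic_on_oom(&oc, CONSTRAINT_MEMCG);
            oom_kill_process(&oc, task, points, totalpages,
                             "Memory cgroup out of memory");
    }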
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e2481949494c..d578d2a56b19 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -2563,6 +2563,7 @@ int set_page_dirty(struct page *page) | |||
2563 | { | 2563 | { |
2564 | struct address_space *mapping = page_mapping(page); | 2564 | struct address_space *mapping = page_mapping(page); |
2565 | 2565 | ||
2566 | page = compound_head(page); | ||
2566 | if (likely(mapping)) { | 2567 | if (likely(mapping)) { |
2567 | int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; | 2568 | int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; |
2568 | /* | 2569 | /* |
@@ -2747,6 +2748,11 @@ int test_clear_page_writeback(struct page *page) | |||
2747 | __wb_writeout_inc(wb); | 2748 | __wb_writeout_inc(wb); |
2748 | } | 2749 | } |
2749 | } | 2750 | } |
2751 | |||
2752 | if (mapping->host && !mapping_tagged(mapping, | ||
2753 | PAGECACHE_TAG_WRITEBACK)) | ||
2754 | sb_clear_inode_writeback(mapping->host); | ||
2755 | |||
2750 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 2756 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
2751 | } else { | 2757 | } else { |
2752 | ret = TestClearPageWriteback(page); | 2758 | ret = TestClearPageWriteback(page); |
@@ -2774,11 +2780,24 @@ int __test_set_page_writeback(struct page *page, bool keep_write) | |||
2774 | spin_lock_irqsave(&mapping->tree_lock, flags); | 2780 | spin_lock_irqsave(&mapping->tree_lock, flags); |
2775 | ret = TestSetPageWriteback(page); | 2781 | ret = TestSetPageWriteback(page); |
2776 | if (!ret) { | 2782 | if (!ret) { |
2783 | bool on_wblist; | ||
2784 | |||
2785 | on_wblist = mapping_tagged(mapping, | ||
2786 | PAGECACHE_TAG_WRITEBACK); | ||
2787 | |||
2777 | radix_tree_tag_set(&mapping->page_tree, | 2788 | radix_tree_tag_set(&mapping->page_tree, |
2778 | page_index(page), | 2789 | page_index(page), |
2779 | PAGECACHE_TAG_WRITEBACK); | 2790 | PAGECACHE_TAG_WRITEBACK); |
2780 | if (bdi_cap_account_writeback(bdi)) | 2791 | if (bdi_cap_account_writeback(bdi)) |
2781 | __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); | 2792 | __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); |
2793 | |||
2794 | /* | ||
2795 | * We can come through here when swapping anonymous | ||
2796 | * pages, so we don't necessarily have an inode to track | ||
2797 | * for sync. | ||
2798 | */ | ||
2799 | if (mapping->host && !on_wblist) | ||
2800 | sb_mark_inode_writeback(mapping->host); | ||
2782 | } | 2801 | } |
2783 | if (!PageDirty(page)) | 2802 | if (!PageDirty(page)) |
2784 | radix_tree_tag_clear(&mapping->page_tree, | 2803 | radix_tree_tag_clear(&mapping->page_tree, |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8b3e1341b754..452513bf02ce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -63,6 +63,7 @@ | |||
63 | #include <linux/sched/rt.h> | 63 | #include <linux/sched/rt.h> |
64 | #include <linux/page_owner.h> | 64 | #include <linux/page_owner.h> |
65 | #include <linux/kthread.h> | 65 | #include <linux/kthread.h> |
66 | #include <linux/memcontrol.h> | ||
66 | 67 | ||
67 | #include <asm/sections.h> | 68 | #include <asm/sections.h> |
68 | #include <asm/tlbflush.h> | 69 | #include <asm/tlbflush.h> |
@@ -1006,6 +1007,8 @@ static __always_inline bool free_pages_prepare(struct page *page, | |||
1006 | 1007 | ||
1007 | VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); | 1008 | VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); |
1008 | 1009 | ||
1010 | if (compound) | ||
1011 | ClearPageDoubleMap(page); | ||
1009 | for (i = 1; i < (1 << order); i++) { | 1012 | for (i = 1; i < (1 << order); i++) { |
1010 | if (compound) | 1013 | if (compound) |
1011 | bad += free_tail_pages_check(page, page + i); | 1014 | bad += free_tail_pages_check(page, page + i); |
@@ -1016,8 +1019,12 @@ static __always_inline bool free_pages_prepare(struct page *page, | |||
1016 | (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 1019 | (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
1017 | } | 1020 | } |
1018 | } | 1021 | } |
1019 | if (PageAnonHead(page)) | 1022 | if (PageMappingFlags(page)) |
1020 | page->mapping = NULL; | 1023 | page->mapping = NULL; |
1024 | if (memcg_kmem_enabled() && PageKmemcg(page)) { | ||
1025 | memcg_kmem_uncharge(page, order); | ||
1026 | __ClearPageKmemcg(page); | ||
1027 | } | ||
1021 | if (check_free) | 1028 | if (check_free) |
1022 | bad += free_pages_check(page); | 1029 | bad += free_pages_check(page); |
1023 | if (bad) | 1030 | if (bad) |
@@ -1724,6 +1731,19 @@ static bool check_new_pages(struct page *page, unsigned int order) | |||
1724 | return false; | 1731 | return false; |
1725 | } | 1732 | } |
1726 | 1733 | ||
1734 | inline void post_alloc_hook(struct page *page, unsigned int order, | ||
1735 | gfp_t gfp_flags) | ||
1736 | { | ||
1737 | set_page_private(page, 0); | ||
1738 | set_page_refcounted(page); | ||
1739 | |||
1740 | arch_alloc_page(page, order); | ||
1741 | kernel_map_pages(page, 1 << order, 1); | ||
1742 | kernel_poison_pages(page, 1 << order, 1); | ||
1743 | kasan_alloc_pages(page, order); | ||
1744 | set_page_owner(page, order, gfp_flags); | ||
1745 | } | ||
1746 | |||
1727 | static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, | 1747 | static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, |
1728 | unsigned int alloc_flags) | 1748 | unsigned int alloc_flags) |
1729 | { | 1749 | { |
@@ -1736,13 +1756,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags | |||
1736 | poisoned &= page_is_poisoned(p); | 1756 | poisoned &= page_is_poisoned(p); |
1737 | } | 1757 | } |
1738 | 1758 | ||
1739 | set_page_private(page, 0); | 1759 | post_alloc_hook(page, order, gfp_flags); |
1740 | set_page_refcounted(page); | ||
1741 | |||
1742 | arch_alloc_page(page, order); | ||
1743 | kernel_map_pages(page, 1 << order, 1); | ||
1744 | kernel_poison_pages(page, 1 << order, 1); | ||
1745 | kasan_alloc_pages(page, order); | ||
1746 | 1760 | ||
1747 | if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) | 1761 | if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) |
1748 | for (i = 0; i < (1 << order); i++) | 1762 | for (i = 0; i < (1 << order); i++) |
@@ -1751,8 +1765,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags | |||
1751 | if (order && (gfp_flags & __GFP_COMP)) | 1765 | if (order && (gfp_flags & __GFP_COMP)) |
1752 | prep_compound_page(page, order); | 1766 | prep_compound_page(page, order); |
1753 | 1767 | ||
1754 | set_page_owner(page, order, gfp_flags); | ||
1755 | |||
1756 | /* | 1768 | /* |
1757 | * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to | 1769 | * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to |
1758 | * allocate the page. The expectation is that the caller is taking | 1770 | * allocate the page. The expectation is that the caller is taking |
@@ -2461,7 +2473,6 @@ void free_hot_cold_page_list(struct list_head *list, bool cold) | |||
2461 | void split_page(struct page *page, unsigned int order) | 2473 | void split_page(struct page *page, unsigned int order) |
2462 | { | 2474 | { |
2463 | int i; | 2475 | int i; |
2464 | gfp_t gfp_mask; | ||
2465 | 2476 | ||
2466 | VM_BUG_ON_PAGE(PageCompound(page), page); | 2477 | VM_BUG_ON_PAGE(PageCompound(page), page); |
2467 | VM_BUG_ON_PAGE(!page_count(page), page); | 2478 | VM_BUG_ON_PAGE(!page_count(page), page); |
@@ -2475,12 +2486,9 @@ void split_page(struct page *page, unsigned int order) | |||
2475 | split_page(virt_to_page(page[0].shadow), order); | 2486 | split_page(virt_to_page(page[0].shadow), order); |
2476 | #endif | 2487 | #endif |
2477 | 2488 | ||
2478 | gfp_mask = get_page_owner_gfp(page); | 2489 | for (i = 1; i < (1 << order); i++) |
2479 | set_page_owner(page, 0, gfp_mask); | ||
2480 | for (i = 1; i < (1 << order); i++) { | ||
2481 | set_page_refcounted(page + i); | 2490 | set_page_refcounted(page + i); |
2482 | set_page_owner(page + i, 0, gfp_mask); | 2491 | split_page_owner(page, order); |
2483 | } | ||
2484 | } | 2492 | } |
2485 | EXPORT_SYMBOL_GPL(split_page); | 2493 | EXPORT_SYMBOL_GPL(split_page); |
2486 | 2494 | ||
@@ -2509,8 +2517,6 @@ int __isolate_free_page(struct page *page, unsigned int order) | |||
2509 | zone->free_area[order].nr_free--; | 2517 | zone->free_area[order].nr_free--; |
2510 | rmv_page_order(page); | 2518 | rmv_page_order(page); |
2511 | 2519 | ||
2512 | set_page_owner(page, order, __GFP_MOVABLE); | ||
2513 | |||
2514 | /* Set the pageblock if the isolated page is at least a pageblock */ | 2520 | /* Set the pageblock if the isolated page is at least a pageblock */ |
2515 | if (order >= pageblock_order - 1) { | 2521 | if (order >= pageblock_order - 1) { |
2516 | struct page *endpage = page + (1 << order) - 1; | 2522 | struct page *endpage = page + (1 << order) - 1; |
@@ -2527,33 +2533,6 @@ int __isolate_free_page(struct page *page, unsigned int order) | |||
2527 | } | 2533 | } |
2528 | 2534 | ||
2529 | /* | 2535 | /* |
2530 | * Similar to split_page except the page is already free. As this is only | ||
2531 | * being used for migration, the migratetype of the block also changes. | ||
2532 | * As this is called with interrupts disabled, the caller is responsible | ||
2533 | * for calling arch_alloc_page() and kernel_map_page() after interrupts | ||
2534 | * are enabled. | ||
2535 | * | ||
2536 | * Note: this is probably too low level an operation for use in drivers. | ||
2537 | * Please consult with lkml before using this in your driver. | ||
2538 | */ | ||
2539 | int split_free_page(struct page *page) | ||
2540 | { | ||
2541 | unsigned int order; | ||
2542 | int nr_pages; | ||
2543 | |||
2544 | order = page_order(page); | ||
2545 | |||
2546 | nr_pages = __isolate_free_page(page, order); | ||
2547 | if (!nr_pages) | ||
2548 | return 0; | ||
2549 | |||
2550 | /* Split into individual pages */ | ||
2551 | set_page_refcounted(page); | ||
2552 | split_page(page, order); | ||
2553 | return nr_pages; | ||
2554 | } | ||
2555 | |||
2556 | /* | ||
2557 | * Update NUMA hit/miss statistics | 2536 | * Update NUMA hit/miss statistics |
2558 | * | 2537 | * |
2559 | * Must be called with interrupts disabled. | 2538 | * Must be called with interrupts disabled. |
@@ -3105,6 +3084,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
3105 | struct oom_control oc = { | 3084 | struct oom_control oc = { |
3106 | .zonelist = ac->zonelist, | 3085 | .zonelist = ac->zonelist, |
3107 | .nodemask = ac->nodemask, | 3086 | .nodemask = ac->nodemask, |
3087 | .memcg = NULL, | ||
3108 | .gfp_mask = gfp_mask, | 3088 | .gfp_mask = gfp_mask, |
3109 | .order = order, | 3089 | .order = order, |
3110 | }; | 3090 | }; |
@@ -3868,6 +3848,14 @@ no_zone: | |||
3868 | } | 3848 | } |
3869 | 3849 | ||
3870 | out: | 3850 | out: |
3851 | if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page) { | ||
3852 | if (unlikely(memcg_kmem_charge(page, gfp_mask, order))) { | ||
3853 | __free_pages(page, order); | ||
3854 | page = NULL; | ||
3855 | } else | ||
3856 | __SetPageKmemcg(page); | ||
3857 | } | ||
3858 | |||
3871 | if (kmemcheck_enabled && page) | 3859 | if (kmemcheck_enabled && page) |
3872 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | 3860 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); |
3873 | 3861 | ||
@@ -4023,56 +4011,6 @@ void __free_page_frag(void *addr) | |||
4023 | } | 4011 | } |
4024 | EXPORT_SYMBOL(__free_page_frag); | 4012 | EXPORT_SYMBOL(__free_page_frag); |
4025 | 4013 | ||
4026 | /* | ||
4027 | * alloc_kmem_pages charges newly allocated pages to the kmem resource counter | ||
4028 | * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is | ||
4029 | * equivalent to alloc_pages. | ||
4030 | * | ||
4031 | * It should be used when the caller would like to use kmalloc, but since the | ||
4032 | * allocation is large, it has to fall back to the page allocator. | ||
4033 | */ | ||
4034 | struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) | ||
4035 | { | ||
4036 | struct page *page; | ||
4037 | |||
4038 | page = alloc_pages(gfp_mask, order); | ||
4039 | if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { | ||
4040 | __free_pages(page, order); | ||
4041 | page = NULL; | ||
4042 | } | ||
4043 | return page; | ||
4044 | } | ||
4045 | |||
4046 | struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) | ||
4047 | { | ||
4048 | struct page *page; | ||
4049 | |||
4050 | page = alloc_pages_node(nid, gfp_mask, order); | ||
4051 | if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { | ||
4052 | __free_pages(page, order); | ||
4053 | page = NULL; | ||
4054 | } | ||
4055 | return page; | ||
4056 | } | ||
4057 | |||
4058 | /* | ||
4059 | * __free_kmem_pages and free_kmem_pages will free pages allocated with | ||
4060 | * alloc_kmem_pages. | ||
4061 | */ | ||
4062 | void __free_kmem_pages(struct page *page, unsigned int order) | ||
4063 | { | ||
4064 | memcg_kmem_uncharge(page, order); | ||
4065 | __free_pages(page, order); | ||
4066 | } | ||
4067 | |||
4068 | void free_kmem_pages(unsigned long addr, unsigned int order) | ||
4069 | { | ||
4070 | if (addr != 0) { | ||
4071 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | ||
4072 | __free_kmem_pages(virt_to_page((void *)addr), order); | ||
4073 | } | ||
4074 | } | ||
4075 | |||
4076 | static void *make_alloc_exact(unsigned long addr, unsigned int order, | 4014 | static void *make_alloc_exact(unsigned long addr, unsigned int order, |
4077 | size_t size) | 4015 | size_t size) |
4078 | { | 4016 | { |
@@ -4374,6 +4312,9 @@ void show_free_areas(unsigned int filter) | |||
4374 | " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" | 4312 | " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" |
4375 | " slab_reclaimable:%lu slab_unreclaimable:%lu\n" | 4313 | " slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
4376 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" | 4314 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" |
4315 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
4316 | " anon_thp: %lu shmem_thp: %lu shmem_pmdmapped: %lu\n" | ||
4317 | #endif | ||
4377 | " free:%lu free_pcp:%lu free_cma:%lu\n", | 4318 | " free:%lu free_pcp:%lu free_cma:%lu\n", |
4378 | global_page_state(NR_ACTIVE_ANON), | 4319 | global_page_state(NR_ACTIVE_ANON), |
4379 | global_page_state(NR_INACTIVE_ANON), | 4320 | global_page_state(NR_INACTIVE_ANON), |
@@ -4391,6 +4332,11 @@ void show_free_areas(unsigned int filter) | |||
4391 | global_page_state(NR_SHMEM), | 4332 | global_page_state(NR_SHMEM), |
4392 | global_page_state(NR_PAGETABLE), | 4333 | global_page_state(NR_PAGETABLE), |
4393 | global_page_state(NR_BOUNCE), | 4334 | global_page_state(NR_BOUNCE), |
4335 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
4336 | global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR, | ||
4337 | global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR, | ||
4338 | global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR, | ||
4339 | #endif | ||
4394 | global_page_state(NR_FREE_PAGES), | 4340 | global_page_state(NR_FREE_PAGES), |
4395 | free_pcp, | 4341 | free_pcp, |
4396 | global_page_state(NR_FREE_CMA_PAGES)); | 4342 | global_page_state(NR_FREE_CMA_PAGES)); |
@@ -4425,6 +4371,11 @@ void show_free_areas(unsigned int filter) | |||
4425 | " writeback:%lukB" | 4371 | " writeback:%lukB" |
4426 | " mapped:%lukB" | 4372 | " mapped:%lukB" |
4427 | " shmem:%lukB" | 4373 | " shmem:%lukB" |
4374 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
4375 | " shmem_thp: %lukB" | ||
4376 | " shmem_pmdmapped: %lukB" | ||
4377 | " anon_thp: %lukB" | ||
4378 | #endif | ||
4428 | " slab_reclaimable:%lukB" | 4379 | " slab_reclaimable:%lukB" |
4429 | " slab_unreclaimable:%lukB" | 4380 | " slab_unreclaimable:%lukB" |
4430 | " kernel_stack:%lukB" | 4381 | " kernel_stack:%lukB" |
@@ -4457,6 +4408,12 @@ void show_free_areas(unsigned int filter) | |||
4457 | K(zone_page_state(zone, NR_WRITEBACK)), | 4408 | K(zone_page_state(zone, NR_WRITEBACK)), |
4458 | K(zone_page_state(zone, NR_FILE_MAPPED)), | 4409 | K(zone_page_state(zone, NR_FILE_MAPPED)), |
4459 | K(zone_page_state(zone, NR_SHMEM)), | 4410 | K(zone_page_state(zone, NR_SHMEM)), |
4411 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
4412 | K(zone_page_state(zone, NR_SHMEM_THPS) * HPAGE_PMD_NR), | ||
4413 | K(zone_page_state(zone, NR_SHMEM_PMDMAPPED) | ||
4414 | * HPAGE_PMD_NR), | ||
4415 | K(zone_page_state(zone, NR_ANON_THPS) * HPAGE_PMD_NR), | ||
4416 | #endif | ||
4460 | K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), | 4417 | K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), |
4461 | K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), | 4418 | K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), |
4462 | zone_page_state(zone, NR_KERNEL_STACK) * | 4419 | zone_page_state(zone, NR_KERNEL_STACK) * |
@@ -6467,15 +6424,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
6467 | sizeof(arch_zone_lowest_possible_pfn)); | 6424 | sizeof(arch_zone_lowest_possible_pfn)); |
6468 | memset(arch_zone_highest_possible_pfn, 0, | 6425 | memset(arch_zone_highest_possible_pfn, 0, |
6469 | sizeof(arch_zone_highest_possible_pfn)); | 6426 | sizeof(arch_zone_highest_possible_pfn)); |
6470 | arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); | 6427 | |
6471 | arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; | 6428 | start_pfn = find_min_pfn_with_active_regions(); |
6472 | for (i = 1; i < MAX_NR_ZONES; i++) { | 6429 | |
6430 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
6473 | if (i == ZONE_MOVABLE) | 6431 | if (i == ZONE_MOVABLE) |
6474 | continue; | 6432 | continue; |
6475 | arch_zone_lowest_possible_pfn[i] = | 6433 | |
6476 | arch_zone_highest_possible_pfn[i-1]; | 6434 | end_pfn = max(max_zone_pfn[i], start_pfn); |
6477 | arch_zone_highest_possible_pfn[i] = | 6435 | arch_zone_lowest_possible_pfn[i] = start_pfn; |
6478 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); | 6436 | arch_zone_highest_possible_pfn[i] = end_pfn; |
6437 | |||
6438 | start_pfn = end_pfn; | ||
6479 | } | 6439 | } |
6480 | arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; | 6440 | arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; |
6481 | arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; | 6441 | arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; |
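The mm/page_alloc.c hunks above fold kmemcg charging into the page allocator itself: __GFP_ACCOUNT allocations are charged on the out: path of __alloc_pages_nodemask() and uncharged in free_pages_prepare() via PageKmemcg, which is why the alloc_kmem_pages()/free_kmem_pages() family can be removed. A minimal sketch of what a caller of the removed helpers switches to; the demo_* names are illustrative:

    static void *demo_alloc_accounted(unsigned int order)
    {
            /* the allocator now charges the current memcg for __GFP_ACCOUNT */
            struct page *page = alloc_pages(GFP_KERNEL | __GFP_ACCOUNT, order);

            return page ? page_address(page) : NULL;
    }

    static void demo_free_accounted(void *addr, unsigned int order)
    {
            if (addr)
                    __free_pages(virt_to_page(addr), order); /* uncharged in free_pages_prepare() */
    }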
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 612122bf6a42..064b7fb6e0b5 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -7,6 +7,7 @@ | |||
7 | #include <linux/pageblock-flags.h> | 7 | #include <linux/pageblock-flags.h> |
8 | #include <linux/memory.h> | 8 | #include <linux/memory.h> |
9 | #include <linux/hugetlb.h> | 9 | #include <linux/hugetlb.h> |
10 | #include <linux/page_owner.h> | ||
10 | #include "internal.h" | 11 | #include "internal.h" |
11 | 12 | ||
12 | #define CREATE_TRACE_POINTS | 13 | #define CREATE_TRACE_POINTS |
@@ -80,7 +81,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) | |||
80 | { | 81 | { |
81 | struct zone *zone; | 82 | struct zone *zone; |
82 | unsigned long flags, nr_pages; | 83 | unsigned long flags, nr_pages; |
83 | struct page *isolated_page = NULL; | 84 | bool isolated_page = false; |
84 | unsigned int order; | 85 | unsigned int order; |
85 | unsigned long page_idx, buddy_idx; | 86 | unsigned long page_idx, buddy_idx; |
86 | struct page *buddy; | 87 | struct page *buddy; |
@@ -108,9 +109,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) | |||
108 | if (pfn_valid_within(page_to_pfn(buddy)) && | 109 | if (pfn_valid_within(page_to_pfn(buddy)) && |
109 | !is_migrate_isolate_page(buddy)) { | 110 | !is_migrate_isolate_page(buddy)) { |
110 | __isolate_free_page(page, order); | 111 | __isolate_free_page(page, order); |
111 | kernel_map_pages(page, (1 << order), 1); | 112 | isolated_page = true; |
112 | set_page_refcounted(page); | ||
113 | isolated_page = page; | ||
114 | } | 113 | } |
115 | } | 114 | } |
116 | } | 115 | } |
@@ -128,8 +127,10 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) | |||
128 | zone->nr_isolate_pageblock--; | 127 | zone->nr_isolate_pageblock--; |
129 | out: | 128 | out: |
130 | spin_unlock_irqrestore(&zone->lock, flags); | 129 | spin_unlock_irqrestore(&zone->lock, flags); |
131 | if (isolated_page) | 130 | if (isolated_page) { |
132 | __free_pages(isolated_page, order); | 131 | post_alloc_hook(page, order, __GFP_MOVABLE); |
132 | __free_pages(page, order); | ||
133 | } | ||
133 | } | 134 | } |
134 | 135 | ||
135 | static inline struct page * | 136 | static inline struct page * |
diff --git a/mm/page_owner.c b/mm/page_owner.c index fedeba88c9cb..ec6dc1886f71 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c | |||
@@ -7,11 +7,22 @@ | |||
7 | #include <linux/page_owner.h> | 7 | #include <linux/page_owner.h> |
8 | #include <linux/jump_label.h> | 8 | #include <linux/jump_label.h> |
9 | #include <linux/migrate.h> | 9 | #include <linux/migrate.h> |
10 | #include <linux/stackdepot.h> | ||
11 | |||
10 | #include "internal.h" | 12 | #include "internal.h" |
11 | 13 | ||
14 | /* | ||
15 | * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack) | ||
16 | * to use off-stack temporary storage | ||
17 | */ | ||
18 | #define PAGE_OWNER_STACK_DEPTH (16) | ||
19 | |||
12 | static bool page_owner_disabled = true; | 20 | static bool page_owner_disabled = true; |
13 | DEFINE_STATIC_KEY_FALSE(page_owner_inited); | 21 | DEFINE_STATIC_KEY_FALSE(page_owner_inited); |
14 | 22 | ||
23 | static depot_stack_handle_t dummy_handle; | ||
24 | static depot_stack_handle_t failure_handle; | ||
25 | |||
15 | static void init_early_allocated_pages(void); | 26 | static void init_early_allocated_pages(void); |
16 | 27 | ||
17 | static int early_page_owner_param(char *buf) | 28 | static int early_page_owner_param(char *buf) |
@@ -34,11 +45,41 @@ static bool need_page_owner(void) | |||
34 | return true; | 45 | return true; |
35 | } | 46 | } |
36 | 47 | ||
48 | static noinline void register_dummy_stack(void) | ||
49 | { | ||
50 | unsigned long entries[4]; | ||
51 | struct stack_trace dummy; | ||
52 | |||
53 | dummy.nr_entries = 0; | ||
54 | dummy.max_entries = ARRAY_SIZE(entries); | ||
55 | dummy.entries = &entries[0]; | ||
56 | dummy.skip = 0; | ||
57 | |||
58 | save_stack_trace(&dummy); | ||
59 | dummy_handle = depot_save_stack(&dummy, GFP_KERNEL); | ||
60 | } | ||
61 | |||
62 | static noinline void register_failure_stack(void) | ||
63 | { | ||
64 | unsigned long entries[4]; | ||
65 | struct stack_trace failure; | ||
66 | |||
67 | failure.nr_entries = 0; | ||
68 | failure.max_entries = ARRAY_SIZE(entries); | ||
69 | failure.entries = &entries[0]; | ||
70 | failure.skip = 0; | ||
71 | |||
72 | save_stack_trace(&failure); | ||
73 | failure_handle = depot_save_stack(&failure, GFP_KERNEL); | ||
74 | } | ||
75 | |||
37 | static void init_page_owner(void) | 76 | static void init_page_owner(void) |
38 | { | 77 | { |
39 | if (page_owner_disabled) | 78 | if (page_owner_disabled) |
40 | return; | 79 | return; |
41 | 80 | ||
81 | register_dummy_stack(); | ||
82 | register_failure_stack(); | ||
42 | static_branch_enable(&page_owner_inited); | 83 | static_branch_enable(&page_owner_inited); |
43 | init_early_allocated_pages(); | 84 | init_early_allocated_pages(); |
44 | } | 85 | } |
@@ -61,25 +102,66 @@ void __reset_page_owner(struct page *page, unsigned int order) | |||
61 | } | 102 | } |
62 | } | 103 | } |
63 | 104 | ||
64 | void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) | 105 | static inline bool check_recursive_alloc(struct stack_trace *trace, |
106 | unsigned long ip) | ||
65 | { | 107 | { |
66 | struct page_ext *page_ext = lookup_page_ext(page); | 108 | int i, count; |
109 | |||
110 | if (!trace->nr_entries) | ||
111 | return false; | ||
112 | |||
113 | for (i = 0, count = 0; i < trace->nr_entries; i++) { | ||
114 | if (trace->entries[i] == ip && ++count == 2) | ||
115 | return true; | ||
116 | } | ||
117 | |||
118 | return false; | ||
119 | } | ||
67 | 120 | ||
121 | static noinline depot_stack_handle_t save_stack(gfp_t flags) | ||
122 | { | ||
123 | unsigned long entries[PAGE_OWNER_STACK_DEPTH]; | ||
68 | struct stack_trace trace = { | 124 | struct stack_trace trace = { |
69 | .nr_entries = 0, | 125 | .nr_entries = 0, |
70 | .max_entries = ARRAY_SIZE(page_ext->trace_entries), | 126 | .entries = entries, |
71 | .entries = &page_ext->trace_entries[0], | 127 | .max_entries = PAGE_OWNER_STACK_DEPTH, |
72 | .skip = 3, | 128 | .skip = 0 |
73 | }; | 129 | }; |
130 | depot_stack_handle_t handle; | ||
131 | |||
132 | save_stack_trace(&trace); | ||
133 | if (trace.nr_entries != 0 && | ||
134 | trace.entries[trace.nr_entries-1] == ULONG_MAX) | ||
135 | trace.nr_entries--; | ||
136 | |||
137 | /* | ||
138 | * We need to check for recursion here because our request to stackdepot | ||
139 | * could trigger a memory allocation to save the new entry. That | ||
140 | * allocation would reach this point and call depot_save_stack() again | ||
141 | * if we didn't catch it. Stackdepot would still be short of memory, | ||
142 | * so it would try to allocate again and loop forever. | ||
143 | */ | ||
144 | if (check_recursive_alloc(&trace, _RET_IP_)) | ||
145 | return dummy_handle; | ||
146 | |||
147 | handle = depot_save_stack(&trace, flags); | ||
148 | if (!handle) | ||
149 | handle = failure_handle; | ||
150 | |||
151 | return handle; | ||
152 | } | ||
153 | |||
154 | noinline void __set_page_owner(struct page *page, unsigned int order, | ||
155 | gfp_t gfp_mask) | ||
156 | { | ||
157 | struct page_ext *page_ext = lookup_page_ext(page); | ||
74 | 158 | ||
75 | if (unlikely(!page_ext)) | 159 | if (unlikely(!page_ext)) |
76 | return; | 160 | return; |
77 | 161 | ||
78 | save_stack_trace(&trace); | 162 | page_ext->handle = save_stack(gfp_mask); |
79 | |||
80 | page_ext->order = order; | 163 | page_ext->order = order; |
81 | page_ext->gfp_mask = gfp_mask; | 164 | page_ext->gfp_mask = gfp_mask; |
82 | page_ext->nr_entries = trace.nr_entries; | ||
83 | page_ext->last_migrate_reason = -1; | 165 | page_ext->last_migrate_reason = -1; |
84 | 166 | ||
85 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); | 167 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); |
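With this hunk, page_owner stops embedding a fixed trace_entries[] array in struct page_ext and instead stores a compact depot_stack_handle_t from the stack depot; the full trace is rebuilt on demand with depot_fetch_stack(). A minimal sketch of that round trip, using the same <linux/stackdepot.h> calls as the code above (illustrative helper names, error handling trimmed):

	static depot_stack_handle_t record_current_stack(gfp_t gfp)
	{
		unsigned long entries[PAGE_OWNER_STACK_DEPTH];
		struct stack_trace trace = {
			.entries	= entries,
			.max_entries	= PAGE_OWNER_STACK_DEPTH,
		};

		save_stack_trace(&trace);
		return depot_save_stack(&trace, gfp);	/* deduplicated handle */
	}

	static void print_recorded_stack(depot_stack_handle_t handle)
	{
		unsigned long entries[PAGE_OWNER_STACK_DEPTH];
		struct stack_trace trace = {
			.entries	= entries,
			.max_entries	= PAGE_OWNER_STACK_DEPTH,
		};

		depot_fetch_stack(handle, &trace);	/* expand handle to entries */
		print_stack_trace(&trace, 0);
	}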
@@ -94,34 +176,31 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) | |||
94 | page_ext->last_migrate_reason = reason; | 176 | page_ext->last_migrate_reason = reason; |
95 | } | 177 | } |
96 | 178 | ||
97 | gfp_t __get_page_owner_gfp(struct page *page) | 179 | void __split_page_owner(struct page *page, unsigned int order) |
98 | { | 180 | { |
181 | int i; | ||
99 | struct page_ext *page_ext = lookup_page_ext(page); | 182 | struct page_ext *page_ext = lookup_page_ext(page); |
183 | |||
100 | if (unlikely(!page_ext)) | 184 | if (unlikely(!page_ext)) |
101 | /* | 185 | return; |
102 | * The caller just returns 0 if no valid gfp | ||
103 | * So return 0 here too. | ||
104 | */ | ||
105 | return 0; | ||
106 | 186 | ||
107 | return page_ext->gfp_mask; | 187 | page_ext->order = 0; |
188 | for (i = 1; i < (1 << order); i++) | ||
189 | __copy_page_owner(page, page + i); | ||
108 | } | 190 | } |
109 | 191 | ||
110 | void __copy_page_owner(struct page *oldpage, struct page *newpage) | 192 | void __copy_page_owner(struct page *oldpage, struct page *newpage) |
111 | { | 193 | { |
112 | struct page_ext *old_ext = lookup_page_ext(oldpage); | 194 | struct page_ext *old_ext = lookup_page_ext(oldpage); |
113 | struct page_ext *new_ext = lookup_page_ext(newpage); | 195 | struct page_ext *new_ext = lookup_page_ext(newpage); |
114 | int i; | ||
115 | 196 | ||
116 | if (unlikely(!old_ext || !new_ext)) | 197 | if (unlikely(!old_ext || !new_ext)) |
117 | return; | 198 | return; |
118 | 199 | ||
119 | new_ext->order = old_ext->order; | 200 | new_ext->order = old_ext->order; |
120 | new_ext->gfp_mask = old_ext->gfp_mask; | 201 | new_ext->gfp_mask = old_ext->gfp_mask; |
121 | new_ext->nr_entries = old_ext->nr_entries; | 202 | new_ext->last_migrate_reason = old_ext->last_migrate_reason; |
122 | 203 | new_ext->handle = old_ext->handle; | |
123 | for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++) | ||
124 | new_ext->trace_entries[i] = old_ext->trace_entries[i]; | ||
125 | 204 | ||
126 | /* | 205 | /* |
127 | * We don't clear the bit on the oldpage as it's going to be freed | 206 | * We don't clear the bit on the oldpage as it's going to be freed |
@@ -137,14 +216,18 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) | |||
137 | 216 | ||
138 | static ssize_t | 217 | static ssize_t |
139 | print_page_owner(char __user *buf, size_t count, unsigned long pfn, | 218 | print_page_owner(char __user *buf, size_t count, unsigned long pfn, |
140 | struct page *page, struct page_ext *page_ext) | 219 | struct page *page, struct page_ext *page_ext, |
220 | depot_stack_handle_t handle) | ||
141 | { | 221 | { |
142 | int ret; | 222 | int ret; |
143 | int pageblock_mt, page_mt; | 223 | int pageblock_mt, page_mt; |
144 | char *kbuf; | 224 | char *kbuf; |
225 | unsigned long entries[PAGE_OWNER_STACK_DEPTH]; | ||
145 | struct stack_trace trace = { | 226 | struct stack_trace trace = { |
146 | .nr_entries = page_ext->nr_entries, | 227 | .nr_entries = 0, |
147 | .entries = &page_ext->trace_entries[0], | 228 | .entries = entries, |
229 | .max_entries = PAGE_OWNER_STACK_DEPTH, | ||
230 | .skip = 0 | ||
148 | }; | 231 | }; |
149 | 232 | ||
150 | kbuf = kmalloc(count, GFP_KERNEL); | 233 | kbuf = kmalloc(count, GFP_KERNEL); |
@@ -173,6 +256,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
173 | if (ret >= count) | 256 | if (ret >= count) |
174 | goto err; | 257 | goto err; |
175 | 258 | ||
259 | depot_fetch_stack(handle, &trace); | ||
176 | ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); | 260 | ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); |
177 | if (ret >= count) | 261 | if (ret >= count) |
178 | goto err; | 262 | goto err; |
@@ -203,10 +287,14 @@ err: | |||
203 | void __dump_page_owner(struct page *page) | 287 | void __dump_page_owner(struct page *page) |
204 | { | 288 | { |
205 | struct page_ext *page_ext = lookup_page_ext(page); | 289 | struct page_ext *page_ext = lookup_page_ext(page); |
290 | unsigned long entries[PAGE_OWNER_STACK_DEPTH]; | ||
206 | struct stack_trace trace = { | 291 | struct stack_trace trace = { |
207 | .nr_entries = page_ext->nr_entries, | 292 | .nr_entries = 0, |
208 | .entries = &page_ext->trace_entries[0], | 293 | .entries = entries, |
294 | .max_entries = PAGE_OWNER_STACK_DEPTH, | ||
295 | .skip = 0 | ||
209 | }; | 296 | }; |
297 | depot_stack_handle_t handle; | ||
210 | gfp_t gfp_mask; | 298 | gfp_t gfp_mask; |
211 | int mt; | 299 | int mt; |
212 | 300 | ||
@@ -222,6 +310,13 @@ void __dump_page_owner(struct page *page) | |||
222 | return; | 310 | return; |
223 | } | 311 | } |
224 | 312 | ||
313 | handle = READ_ONCE(page_ext->handle); | ||
314 | if (!handle) { | ||
315 | pr_alert("page_owner info is not active (free page?)\n"); | ||
316 | return; | ||
317 | } | ||
318 | |||
319 | depot_fetch_stack(handle, &trace); | ||
225 | pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", | 320 | pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", |
226 | page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask); | 321 | page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask); |
227 | print_stack_trace(&trace, 0); | 322 | print_stack_trace(&trace, 0); |
@@ -237,6 +332,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
237 | unsigned long pfn; | 332 | unsigned long pfn; |
238 | struct page *page; | 333 | struct page *page; |
239 | struct page_ext *page_ext; | 334 | struct page_ext *page_ext; |
335 | depot_stack_handle_t handle; | ||
240 | 336 | ||
241 | if (!static_branch_unlikely(&page_owner_inited)) | 337 | if (!static_branch_unlikely(&page_owner_inited)) |
242 | return -EINVAL; | 338 | return -EINVAL; |
@@ -285,10 +381,19 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
285 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) | 381 | if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) |
286 | continue; | 382 | continue; |
287 | 383 | ||
384 | /* | ||
385 | * Access to page_ext->handle isn't synchronized, so we must | ||
386 | * be careful when accessing it. | ||
387 | */ | ||
388 | handle = READ_ONCE(page_ext->handle); | ||
389 | if (!handle) | ||
390 | continue; | ||
391 | |||
288 | /* Record the next PFN to read in the file offset */ | 392 | /* Record the next PFN to read in the file offset */ |
289 | *ppos = (pfn - min_low_pfn) + 1; | 393 | *ppos = (pfn - min_low_pfn) + 1; |
290 | 394 | ||
291 | return print_page_owner(buf, count, pfn, page, page_ext); | 395 | return print_page_owner(buf, count, pfn, page, |
396 | page_ext, handle); | ||
292 | } | 397 | } |
293 | 398 | ||
294 | return 0; | 399 | return 0; |
diff --git a/mm/readahead.c b/mm/readahead.c index 40be3ae0afe3..65ec288dc057 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -89,7 +89,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, | |||
89 | page = lru_to_page(pages); | 89 | page = lru_to_page(pages); |
90 | list_del(&page->lru); | 90 | list_del(&page->lru); |
91 | if (add_to_page_cache_lru(page, mapping, page->index, | 91 | if (add_to_page_cache_lru(page, mapping, page->index, |
92 | mapping_gfp_constraint(mapping, GFP_KERNEL))) { | 92 | readahead_gfp_mask(mapping))) { |
93 | read_cache_pages_invalidate_page(mapping, page); | 93 | read_cache_pages_invalidate_page(mapping, page); |
94 | continue; | 94 | continue; |
95 | } | 95 | } |
@@ -108,7 +108,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, | |||
108 | EXPORT_SYMBOL(read_cache_pages); | 108 | EXPORT_SYMBOL(read_cache_pages); |
109 | 109 | ||
110 | static int read_pages(struct address_space *mapping, struct file *filp, | 110 | static int read_pages(struct address_space *mapping, struct file *filp, |
111 | struct list_head *pages, unsigned nr_pages) | 111 | struct list_head *pages, unsigned int nr_pages, gfp_t gfp) |
112 | { | 112 | { |
113 | struct blk_plug plug; | 113 | struct blk_plug plug; |
114 | unsigned page_idx; | 114 | unsigned page_idx; |
@@ -126,10 +126,8 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
126 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { | 126 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { |
127 | struct page *page = lru_to_page(pages); | 127 | struct page *page = lru_to_page(pages); |
128 | list_del(&page->lru); | 128 | list_del(&page->lru); |
129 | if (!add_to_page_cache_lru(page, mapping, page->index, | 129 | if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) |
130 | mapping_gfp_constraint(mapping, GFP_KERNEL))) { | ||
131 | mapping->a_ops->readpage(filp, page); | 130 | mapping->a_ops->readpage(filp, page); |
132 | } | ||
133 | put_page(page); | 131 | put_page(page); |
134 | } | 132 | } |
135 | ret = 0; | 133 | ret = 0; |
@@ -159,6 +157,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
159 | int page_idx; | 157 | int page_idx; |
160 | int ret = 0; | 158 | int ret = 0; |
161 | loff_t isize = i_size_read(inode); | 159 | loff_t isize = i_size_read(inode); |
160 | gfp_t gfp_mask = readahead_gfp_mask(mapping); | ||
162 | 161 | ||
163 | if (isize == 0) | 162 | if (isize == 0) |
164 | goto out; | 163 | goto out; |
@@ -180,7 +179,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
180 | if (page && !radix_tree_exceptional_entry(page)) | 179 | if (page && !radix_tree_exceptional_entry(page)) |
181 | continue; | 180 | continue; |
182 | 181 | ||
183 | page = page_cache_alloc_readahead(mapping); | 182 | page = __page_cache_alloc(gfp_mask); |
184 | if (!page) | 183 | if (!page) |
185 | break; | 184 | break; |
186 | page->index = page_offset; | 185 | page->index = page_offset; |
@@ -196,7 +195,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
196 | * will then handle the error. | 195 | * will then handle the error. |
197 | */ | 196 | */ |
198 | if (ret) | 197 | if (ret) |
199 | read_pages(mapping, filp, &page_pool, ret); | 198 | read_pages(mapping, filp, &page_pool, ret, gfp_mask); |
200 | BUG_ON(!list_empty(&page_pool)); | 199 | BUG_ON(!list_empty(&page_pool)); |
201 | out: | 200 | out: |
202 | return ret; | 201 | return ret; |
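Both readahead hunks switch page-cache allocation from mapping_gfp_constraint(mapping, GFP_KERNEL) to readahead_gfp_mask(mapping), threading the mask through read_pages() so every allocation in the readahead path uses the same flags. The helper is added to <linux/pagemap.h> by this series; its definition is along these lines (reconstructed, so treat the exact flag set as an approximation):

	/* Readahead allocations may fail quietly rather than retry or warn. */
	#define readahead_gfp_mask(x) \
		(mapping_gfp_mask(x) | __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN)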
@@ -1212,10 +1212,8 @@ void do_page_add_anon_rmap(struct page *page, | |||
1212 | * pte lock(a spinlock) is held, which implies preemption | 1212 | * pte lock(a spinlock) is held, which implies preemption |
1213 | * disabled. | 1213 | * disabled. |
1214 | */ | 1214 | */ |
1215 | if (compound) { | 1215 | if (compound) |
1216 | __inc_zone_page_state(page, | 1216 | __inc_zone_page_state(page, NR_ANON_THPS); |
1217 | NR_ANON_TRANSPARENT_HUGEPAGES); | ||
1218 | } | ||
1219 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); | 1217 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); |
1220 | } | 1218 | } |
1221 | if (unlikely(PageKsm(page))) | 1219 | if (unlikely(PageKsm(page))) |
@@ -1253,7 +1251,7 @@ void page_add_new_anon_rmap(struct page *page, | |||
1253 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | 1251 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
1254 | /* increment count (starts at -1) */ | 1252 | /* increment count (starts at -1) */ |
1255 | atomic_set(compound_mapcount_ptr(page), 0); | 1253 | atomic_set(compound_mapcount_ptr(page), 0); |
1256 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1254 | __inc_zone_page_state(page, NR_ANON_THPS); |
1257 | } else { | 1255 | } else { |
1258 | /* Anon THP always mapped first with PMD */ | 1256 | /* Anon THP always mapped first with PMD */ |
1259 | VM_BUG_ON_PAGE(PageTransCompound(page), page); | 1257 | VM_BUG_ON_PAGE(PageTransCompound(page), page); |
@@ -1270,18 +1268,42 @@ void page_add_new_anon_rmap(struct page *page, | |||
1270 | * | 1268 | * |
1271 | * The caller needs to hold the pte lock. | 1269 | * The caller needs to hold the pte lock. |
1272 | */ | 1270 | */ |
1273 | void page_add_file_rmap(struct page *page) | 1271 | void page_add_file_rmap(struct page *page, bool compound) |
1274 | { | 1272 | { |
1273 | int i, nr = 1; | ||
1274 | |||
1275 | VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); | ||
1275 | lock_page_memcg(page); | 1276 | lock_page_memcg(page); |
1276 | if (atomic_inc_and_test(&page->_mapcount)) { | 1277 | if (compound && PageTransHuge(page)) { |
1277 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1278 | for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { |
1278 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); | 1279 | if (atomic_inc_and_test(&page[i]._mapcount)) |
1280 | nr++; | ||
1281 | } | ||
1282 | if (!atomic_inc_and_test(compound_mapcount_ptr(page))) | ||
1283 | goto out; | ||
1284 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | ||
1285 | __inc_zone_page_state(page, NR_SHMEM_PMDMAPPED); | ||
1286 | } else { | ||
1287 | if (PageTransCompound(page)) { | ||
1288 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
1289 | SetPageDoubleMap(compound_head(page)); | ||
1290 | if (PageMlocked(page)) | ||
1291 | clear_page_mlock(compound_head(page)); | ||
1292 | } | ||
1293 | if (!atomic_inc_and_test(&page->_mapcount)) | ||
1294 | goto out; | ||
1279 | } | 1295 | } |
1296 | __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, nr); | ||
1297 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); | ||
1298 | out: | ||
1280 | unlock_page_memcg(page); | 1299 | unlock_page_memcg(page); |
1281 | } | 1300 | } |
1282 | 1301 | ||
1283 | static void page_remove_file_rmap(struct page *page) | 1302 | static void page_remove_file_rmap(struct page *page, bool compound) |
1284 | { | 1303 | { |
1304 | int i, nr = 1; | ||
1305 | |||
1306 | VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); | ||
1285 | lock_page_memcg(page); | 1307 | lock_page_memcg(page); |
1286 | 1308 | ||
1287 | /* Hugepages are not counted in NR_FILE_MAPPED for now. */ | 1309 | /* Hugepages are not counted in NR_FILE_MAPPED for now. */ |
@@ -1292,15 +1314,26 @@ static void page_remove_file_rmap(struct page *page) | |||
1292 | } | 1314 | } |
1293 | 1315 | ||
1294 | /* page still mapped by someone else? */ | 1316 | /* page still mapped by someone else? */ |
1295 | if (!atomic_add_negative(-1, &page->_mapcount)) | 1317 | if (compound && PageTransHuge(page)) { |
1296 | goto out; | 1318 | for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { |
1319 | if (atomic_add_negative(-1, &page[i]._mapcount)) | ||
1320 | nr++; | ||
1321 | } | ||
1322 | if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) | ||
1323 | goto out; | ||
1324 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | ||
1325 | __dec_zone_page_state(page, NR_SHMEM_PMDMAPPED); | ||
1326 | } else { | ||
1327 | if (!atomic_add_negative(-1, &page->_mapcount)) | ||
1328 | goto out; | ||
1329 | } | ||
1297 | 1330 | ||
1298 | /* | 1331 | /* |
1299 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because | 1332 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because |
1300 | * these counters are not modified in interrupt context, and | 1333 | * these counters are not modified in interrupt context, and |
1301 | * pte lock(a spinlock) is held, which implies preemption disabled. | 1334 | * pte lock(a spinlock) is held, which implies preemption disabled. |
1302 | */ | 1335 | */ |
1303 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 1336 | __mod_zone_page_state(page_zone(page), NR_FILE_MAPPED, -nr); |
1304 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); | 1337 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); |
1305 | 1338 | ||
1306 | if (unlikely(PageMlocked(page))) | 1339 | if (unlikely(PageMlocked(page))) |
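page_add_file_rmap() and page_remove_file_rmap() now take a compound argument so that a PMD mapping of a file THP adjusts the compound mapcount and NR_SHMEM_PMDMAPPED, while PTE mappings keep using the per-subpage _mapcount and NR_FILE_MAPPED. A hypothetical caller under the new signature (the real call sites are updated elsewhere in the series):

	static void account_file_mapping(struct page *page, bool pmd_mapped)
	{
		/* pmd_mapped is only meaningful for a transparent huge page */
		page_add_file_rmap(page, pmd_mapped && PageTransHuge(page));
	}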
@@ -1323,7 +1356,7 @@ static void page_remove_anon_compound_rmap(struct page *page) | |||
1323 | if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) | 1356 | if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) |
1324 | return; | 1357 | return; |
1325 | 1358 | ||
1326 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1359 | __dec_zone_page_state(page, NR_ANON_THPS); |
1327 | 1360 | ||
1328 | if (TestClearPageDoubleMap(page)) { | 1361 | if (TestClearPageDoubleMap(page)) { |
1329 | /* | 1362 | /* |
@@ -1356,11 +1389,8 @@ static void page_remove_anon_compound_rmap(struct page *page) | |||
1356 | */ | 1389 | */ |
1357 | void page_remove_rmap(struct page *page, bool compound) | 1390 | void page_remove_rmap(struct page *page, bool compound) |
1358 | { | 1391 | { |
1359 | if (!PageAnon(page)) { | 1392 | if (!PageAnon(page)) |
1360 | VM_BUG_ON_PAGE(compound && !PageHuge(page), page); | 1393 | return page_remove_file_rmap(page, compound); |
1361 | page_remove_file_rmap(page); | ||
1362 | return; | ||
1363 | } | ||
1364 | 1394 | ||
1365 | if (compound) | 1395 | if (compound) |
1366 | return page_remove_anon_compound_rmap(page); | 1396 | return page_remove_anon_compound_rmap(page); |
@@ -1436,8 +1466,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1436 | */ | 1466 | */ |
1437 | if (!(flags & TTU_IGNORE_MLOCK)) { | 1467 | if (!(flags & TTU_IGNORE_MLOCK)) { |
1438 | if (vma->vm_flags & VM_LOCKED) { | 1468 | if (vma->vm_flags & VM_LOCKED) { |
1439 | /* Holding pte lock, we do *not* need mmap_sem here */ | 1469 | /* PTE-mapped THP are never mlocked */ |
1440 | mlock_vma_page(page); | 1470 | if (!PageTransCompound(page)) { |
1471 | /* | ||
1472 | * Holding pte lock, we do *not* need | ||
1473 | * mmap_sem here | ||
1474 | */ | ||
1475 | mlock_vma_page(page); | ||
1476 | } | ||
1441 | ret = SWAP_MLOCK; | 1477 | ret = SWAP_MLOCK; |
1442 | goto out_unmap; | 1478 | goto out_unmap; |
1443 | } | 1479 | } |
diff --git a/mm/shmem.c b/mm/shmem.c index 171dee7a131f..62e42c7d544c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/export.h> | 32 | #include <linux/export.h> |
33 | #include <linux/swap.h> | 33 | #include <linux/swap.h> |
34 | #include <linux/uio.h> | 34 | #include <linux/uio.h> |
35 | #include <linux/khugepaged.h> | ||
35 | 36 | ||
36 | static struct vfsmount *shm_mnt; | 37 | static struct vfsmount *shm_mnt; |
37 | 38 | ||
@@ -97,14 +98,6 @@ struct shmem_falloc { | |||
97 | pgoff_t nr_unswapped; /* how often writepage refused to swap out */ | 98 | pgoff_t nr_unswapped; /* how often writepage refused to swap out */ |
98 | }; | 99 | }; |
99 | 100 | ||
100 | /* Flag allocation requirements to shmem_getpage */ | ||
101 | enum sgp_type { | ||
102 | SGP_READ, /* don't exceed i_size, don't allocate page */ | ||
103 | SGP_CACHE, /* don't exceed i_size, may allocate page */ | ||
104 | SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ | ||
105 | SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ | ||
106 | }; | ||
107 | |||
108 | #ifdef CONFIG_TMPFS | 101 | #ifdef CONFIG_TMPFS |
109 | static unsigned long shmem_default_max_blocks(void) | 102 | static unsigned long shmem_default_max_blocks(void) |
110 | { | 103 | { |
@@ -124,7 +117,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | |||
124 | struct page **pagep, enum sgp_type sgp, | 117 | struct page **pagep, enum sgp_type sgp, |
125 | gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); | 118 | gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); |
126 | 119 | ||
127 | static inline int shmem_getpage(struct inode *inode, pgoff_t index, | 120 | int shmem_getpage(struct inode *inode, pgoff_t index, |
128 | struct page **pagep, enum sgp_type sgp) | 121 | struct page **pagep, enum sgp_type sgp) |
129 | { | 122 | { |
130 | return shmem_getpage_gfp(inode, index, pagep, sgp, | 123 | return shmem_getpage_gfp(inode, index, pagep, sgp, |
@@ -173,10 +166,13 @@ static inline int shmem_reacct_size(unsigned long flags, | |||
173 | * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, | 166 | * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, |
174 | * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. | 167 | * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. |
175 | */ | 168 | */ |
176 | static inline int shmem_acct_block(unsigned long flags) | 169 | static inline int shmem_acct_block(unsigned long flags, long pages) |
177 | { | 170 | { |
178 | return (flags & VM_NORESERVE) ? | 171 | if (!(flags & VM_NORESERVE)) |
179 | security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_SIZE)) : 0; | 172 | return 0; |
173 | |||
174 | return security_vm_enough_memory_mm(current->mm, | ||
175 | pages * VM_ACCT(PAGE_SIZE)); | ||
180 | } | 176 | } |
181 | 177 | ||
182 | static inline void shmem_unacct_blocks(unsigned long flags, long pages) | 178 | static inline void shmem_unacct_blocks(unsigned long flags, long pages) |
@@ -192,6 +188,7 @@ static const struct inode_operations shmem_inode_operations; | |||
192 | static const struct inode_operations shmem_dir_inode_operations; | 188 | static const struct inode_operations shmem_dir_inode_operations; |
193 | static const struct inode_operations shmem_special_inode_operations; | 189 | static const struct inode_operations shmem_special_inode_operations; |
194 | static const struct vm_operations_struct shmem_vm_ops; | 190 | static const struct vm_operations_struct shmem_vm_ops; |
191 | static struct file_system_type shmem_fs_type; | ||
195 | 192 | ||
196 | static LIST_HEAD(shmem_swaplist); | 193 | static LIST_HEAD(shmem_swaplist); |
197 | static DEFINE_MUTEX(shmem_swaplist_mutex); | 194 | static DEFINE_MUTEX(shmem_swaplist_mutex); |
@@ -249,6 +246,53 @@ static void shmem_recalc_inode(struct inode *inode) | |||
249 | } | 246 | } |
250 | } | 247 | } |
251 | 248 | ||
249 | bool shmem_charge(struct inode *inode, long pages) | ||
250 | { | ||
251 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
252 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
253 | unsigned long flags; | ||
254 | |||
255 | if (shmem_acct_block(info->flags, pages)) | ||
256 | return false; | ||
257 | spin_lock_irqsave(&info->lock, flags); | ||
258 | info->alloced += pages; | ||
259 | inode->i_blocks += pages * BLOCKS_PER_PAGE; | ||
260 | shmem_recalc_inode(inode); | ||
261 | spin_unlock_irqrestore(&info->lock, flags); | ||
262 | inode->i_mapping->nrpages += pages; | ||
263 | |||
264 | if (!sbinfo->max_blocks) | ||
265 | return true; | ||
266 | if (percpu_counter_compare(&sbinfo->used_blocks, | ||
267 | sbinfo->max_blocks - pages) > 0) { | ||
268 | inode->i_mapping->nrpages -= pages; | ||
269 | spin_lock_irqsave(&info->lock, flags); | ||
270 | info->alloced -= pages; | ||
271 | shmem_recalc_inode(inode); | ||
272 | spin_unlock_irqrestore(&info->lock, flags); | ||
273 | |||
274 | return false; | ||
275 | } | ||
276 | percpu_counter_add(&sbinfo->used_blocks, pages); | ||
277 | return true; | ||
278 | } | ||
279 | |||
280 | void shmem_uncharge(struct inode *inode, long pages) | ||
281 | { | ||
282 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
283 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
284 | unsigned long flags; | ||
285 | |||
286 | spin_lock_irqsave(&info->lock, flags); | ||
287 | info->alloced -= pages; | ||
288 | inode->i_blocks -= pages * BLOCKS_PER_PAGE; | ||
289 | shmem_recalc_inode(inode); | ||
290 | spin_unlock_irqrestore(&info->lock, flags); | ||
291 | |||
292 | if (sbinfo->max_blocks) | ||
293 | percpu_counter_sub(&sbinfo->used_blocks, pages); | ||
294 | } | ||
295 | |||
252 | /* | 296 | /* |
253 | * Replace item expected in radix tree by a new item, while holding tree lock. | 297 | * Replace item expected in radix tree by a new item, while holding tree lock. |
254 | */ | 298 | */ |
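shmem_charge() and shmem_uncharge() expose shmem's block accounting so that khugepaged's new tmpfs collapse path (see the commit list) can reserve space before building a huge page and release it again if the collapse fails. A hypothetical caller pairing the two (sketch only; err stands in for the outcome of the page-cache insertion):

	static int reserve_shmem_blocks(struct inode *inode, long nr, int err)
	{
		if (!shmem_charge(inode, nr))		/* accounting + used_blocks */
			return -ENOSPC;
		/* ... allocate the compound page and insert it into the cache ... */
		if (err)
			shmem_uncharge(inode, nr);	/* roll the reservation back */
		return err;
	}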
@@ -289,36 +333,256 @@ static bool shmem_confirm_swap(struct address_space *mapping, | |||
289 | } | 333 | } |
290 | 334 | ||
291 | /* | 335 | /* |
336 | * Definitions for "huge tmpfs": tmpfs mounted with the huge= option | ||
337 | * | ||
338 | * SHMEM_HUGE_NEVER: | ||
339 | * disables huge pages for the mount; | ||
340 | * SHMEM_HUGE_ALWAYS: | ||
341 | * enables huge pages for the mount; | ||
342 | * SHMEM_HUGE_WITHIN_SIZE: | ||
343 | * only allocate huge pages if the page will be fully within i_size, | ||
344 | * also respect fadvise()/madvise() hints; | ||
345 | * SHMEM_HUGE_ADVISE: | ||
346 | * only allocate huge pages if requested with fadvise()/madvise(); | ||
347 | */ | ||
348 | |||
349 | #define SHMEM_HUGE_NEVER 0 | ||
350 | #define SHMEM_HUGE_ALWAYS 1 | ||
351 | #define SHMEM_HUGE_WITHIN_SIZE 2 | ||
352 | #define SHMEM_HUGE_ADVISE 3 | ||
353 | |||
354 | /* | ||
355 | * Special values. | ||
356 | * Can only be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled: | ||
357 | * | ||
358 | * SHMEM_HUGE_DENY: | ||
359 | * disables huge on shm_mnt and all mounts, for emergency use; | ||
360 | * SHMEM_HUGE_FORCE: | ||
361 | * enables huge on shm_mnt and all mounts, w/o needing option, for testing; | ||
362 | * | ||
363 | */ | ||
364 | #define SHMEM_HUGE_DENY (-1) | ||
365 | #define SHMEM_HUGE_FORCE (-2) | ||
366 | |||
367 | #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE | ||
368 | /* ifdef here to avoid bloating shmem.o when not necessary */ | ||
369 | |||
370 | int shmem_huge __read_mostly; | ||
371 | |||
372 | static int shmem_parse_huge(const char *str) | ||
373 | { | ||
374 | if (!strcmp(str, "never")) | ||
375 | return SHMEM_HUGE_NEVER; | ||
376 | if (!strcmp(str, "always")) | ||
377 | return SHMEM_HUGE_ALWAYS; | ||
378 | if (!strcmp(str, "within_size")) | ||
379 | return SHMEM_HUGE_WITHIN_SIZE; | ||
380 | if (!strcmp(str, "advise")) | ||
381 | return SHMEM_HUGE_ADVISE; | ||
382 | if (!strcmp(str, "deny")) | ||
383 | return SHMEM_HUGE_DENY; | ||
384 | if (!strcmp(str, "force")) | ||
385 | return SHMEM_HUGE_FORCE; | ||
386 | return -EINVAL; | ||
387 | } | ||
388 | |||
389 | static const char *shmem_format_huge(int huge) | ||
390 | { | ||
391 | switch (huge) { | ||
392 | case SHMEM_HUGE_NEVER: | ||
393 | return "never"; | ||
394 | case SHMEM_HUGE_ALWAYS: | ||
395 | return "always"; | ||
396 | case SHMEM_HUGE_WITHIN_SIZE: | ||
397 | return "within_size"; | ||
398 | case SHMEM_HUGE_ADVISE: | ||
399 | return "advise"; | ||
400 | case SHMEM_HUGE_DENY: | ||
401 | return "deny"; | ||
402 | case SHMEM_HUGE_FORCE: | ||
403 | return "force"; | ||
404 | default: | ||
405 | VM_BUG_ON(1); | ||
406 | return "bad_val"; | ||
407 | } | ||
408 | } | ||
409 | |||
410 | static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, | ||
411 | struct shrink_control *sc, unsigned long nr_to_split) | ||
412 | { | ||
413 | LIST_HEAD(list), *pos, *next; | ||
414 | struct inode *inode; | ||
415 | struct shmem_inode_info *info; | ||
416 | struct page *page; | ||
417 | unsigned long batch = sc ? sc->nr_to_scan : 128; | ||
418 | int removed = 0, split = 0; | ||
419 | |||
420 | if (list_empty(&sbinfo->shrinklist)) | ||
421 | return SHRINK_STOP; | ||
422 | |||
423 | spin_lock(&sbinfo->shrinklist_lock); | ||
424 | list_for_each_safe(pos, next, &sbinfo->shrinklist) { | ||
425 | info = list_entry(pos, struct shmem_inode_info, shrinklist); | ||
426 | |||
427 | /* pin the inode */ | ||
428 | inode = igrab(&info->vfs_inode); | ||
429 | |||
430 | /* inode is about to be evicted */ | ||
431 | if (!inode) { | ||
432 | list_del_init(&info->shrinklist); | ||
433 | removed++; | ||
434 | goto next; | ||
435 | } | ||
436 | |||
437 | /* Check if there's anything to gain */ | ||
438 | if (round_up(inode->i_size, PAGE_SIZE) == | ||
439 | round_up(inode->i_size, HPAGE_PMD_SIZE)) { | ||
440 | list_del_init(&info->shrinklist); | ||
441 | removed++; | ||
442 | iput(inode); | ||
443 | goto next; | ||
444 | } | ||
445 | |||
446 | list_move(&info->shrinklist, &list); | ||
447 | next: | ||
448 | if (!--batch) | ||
449 | break; | ||
450 | } | ||
451 | spin_unlock(&sbinfo->shrinklist_lock); | ||
452 | |||
453 | list_for_each_safe(pos, next, &list) { | ||
454 | int ret; | ||
455 | |||
456 | info = list_entry(pos, struct shmem_inode_info, shrinklist); | ||
457 | inode = &info->vfs_inode; | ||
458 | |||
459 | if (nr_to_split && split >= nr_to_split) { | ||
460 | iput(inode); | ||
461 | continue; | ||
462 | } | ||
463 | |||
464 | page = find_lock_page(inode->i_mapping, | ||
465 | (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); | ||
466 | if (!page) | ||
467 | goto drop; | ||
468 | |||
469 | if (!PageTransHuge(page)) { | ||
470 | unlock_page(page); | ||
471 | put_page(page); | ||
472 | goto drop; | ||
473 | } | ||
474 | |||
475 | ret = split_huge_page(page); | ||
476 | unlock_page(page); | ||
477 | put_page(page); | ||
478 | |||
479 | if (ret) { | ||
480 | /* split failed: leave it on the list */ | ||
481 | iput(inode); | ||
482 | continue; | ||
483 | } | ||
484 | |||
485 | split++; | ||
486 | drop: | ||
487 | list_del_init(&info->shrinklist); | ||
488 | removed++; | ||
489 | iput(inode); | ||
490 | } | ||
491 | |||
492 | spin_lock(&sbinfo->shrinklist_lock); | ||
493 | list_splice_tail(&list, &sbinfo->shrinklist); | ||
494 | sbinfo->shrinklist_len -= removed; | ||
495 | spin_unlock(&sbinfo->shrinklist_lock); | ||
496 | |||
497 | return split; | ||
498 | } | ||
499 | |||
500 | static long shmem_unused_huge_scan(struct super_block *sb, | ||
501 | struct shrink_control *sc) | ||
502 | { | ||
503 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | ||
504 | |||
505 | if (!READ_ONCE(sbinfo->shrinklist_len)) | ||
506 | return SHRINK_STOP; | ||
507 | |||
508 | return shmem_unused_huge_shrink(sbinfo, sc, 0); | ||
509 | } | ||
510 | |||
511 | static long shmem_unused_huge_count(struct super_block *sb, | ||
512 | struct shrink_control *sc) | ||
513 | { | ||
514 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | ||
515 | return READ_ONCE(sbinfo->shrinklist_len); | ||
516 | } | ||
517 | #else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */ | ||
518 | |||
519 | #define shmem_huge SHMEM_HUGE_DENY | ||
520 | |||
521 | static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, | ||
522 | struct shrink_control *sc, unsigned long nr_to_split) | ||
523 | { | ||
524 | return 0; | ||
525 | } | ||
526 | #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ | ||
527 | |||
528 | /* | ||
292 | * Like add_to_page_cache_locked, but error if expected item has gone. | 529 | * Like add_to_page_cache_locked, but error if expected item has gone. |
293 | */ | 530 | */ |
294 | static int shmem_add_to_page_cache(struct page *page, | 531 | static int shmem_add_to_page_cache(struct page *page, |
295 | struct address_space *mapping, | 532 | struct address_space *mapping, |
296 | pgoff_t index, void *expected) | 533 | pgoff_t index, void *expected) |
297 | { | 534 | { |
298 | int error; | 535 | int error, nr = hpage_nr_pages(page); |
299 | 536 | ||
537 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
538 | VM_BUG_ON_PAGE(index != round_down(index, nr), page); | ||
300 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 539 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
301 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | 540 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
541 | VM_BUG_ON(expected && PageTransHuge(page)); | ||
302 | 542 | ||
303 | get_page(page); | 543 | page_ref_add(page, nr); |
304 | page->mapping = mapping; | 544 | page->mapping = mapping; |
305 | page->index = index; | 545 | page->index = index; |
306 | 546 | ||
307 | spin_lock_irq(&mapping->tree_lock); | 547 | spin_lock_irq(&mapping->tree_lock); |
308 | if (!expected) | 548 | if (PageTransHuge(page)) { |
549 | void __rcu **results; | ||
550 | pgoff_t idx; | ||
551 | int i; | ||
552 | |||
553 | error = 0; | ||
554 | if (radix_tree_gang_lookup_slot(&mapping->page_tree, | ||
555 | &results, &idx, index, 1) && | ||
556 | idx < index + HPAGE_PMD_NR) { | ||
557 | error = -EEXIST; | ||
558 | } | ||
559 | |||
560 | if (!error) { | ||
561 | for (i = 0; i < HPAGE_PMD_NR; i++) { | ||
562 | error = radix_tree_insert(&mapping->page_tree, | ||
563 | index + i, page + i); | ||
564 | VM_BUG_ON(error); | ||
565 | } | ||
566 | count_vm_event(THP_FILE_ALLOC); | ||
567 | } | ||
568 | } else if (!expected) { | ||
309 | error = radix_tree_insert(&mapping->page_tree, index, page); | 569 | error = radix_tree_insert(&mapping->page_tree, index, page); |
310 | else | 570 | } else { |
311 | error = shmem_radix_tree_replace(mapping, index, expected, | 571 | error = shmem_radix_tree_replace(mapping, index, expected, |
312 | page); | 572 | page); |
573 | } | ||
574 | |||
313 | if (!error) { | 575 | if (!error) { |
314 | mapping->nrpages++; | 576 | mapping->nrpages += nr; |
315 | __inc_zone_page_state(page, NR_FILE_PAGES); | 577 | if (PageTransHuge(page)) |
316 | __inc_zone_page_state(page, NR_SHMEM); | 578 | __inc_zone_page_state(page, NR_SHMEM_THPS); |
579 | __mod_zone_page_state(page_zone(page), NR_FILE_PAGES, nr); | ||
580 | __mod_zone_page_state(page_zone(page), NR_SHMEM, nr); | ||
317 | spin_unlock_irq(&mapping->tree_lock); | 581 | spin_unlock_irq(&mapping->tree_lock); |
318 | } else { | 582 | } else { |
319 | page->mapping = NULL; | 583 | page->mapping = NULL; |
320 | spin_unlock_irq(&mapping->tree_lock); | 584 | spin_unlock_irq(&mapping->tree_lock); |
321 | put_page(page); | 585 | page_ref_sub(page, nr); |
322 | } | 586 | } |
323 | return error; | 587 | return error; |
324 | } | 588 | } |
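The SHMEM_HUGE_* values defined at the top of this hunk back the new huge= mount option and the /sys/kernel/mm/transparent_hugepage/shmem_enabled knob, and shmem_add_to_page_cache() learns to insert a compound page into HPAGE_PMD_NR consecutive radix-tree slots. The mount-option side presumably consumes shmem_parse_huge() along these lines (hypothetical helper; the real parsing lives in shmem_parse_options(), which is not part of this excerpt):

	static int shmem_set_huge_option(struct shmem_sb_info *sbinfo,
					 const char *value)
	{
		int huge = shmem_parse_huge(value);

		if (huge < 0)
			return -EINVAL;		/* unknown keyword */
		sbinfo->huge = huge;		/* picked up by shmem_getpage_gfp() */
		return 0;
	}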
@@ -331,6 +595,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) | |||
331 | struct address_space *mapping = page->mapping; | 595 | struct address_space *mapping = page->mapping; |
332 | int error; | 596 | int error; |
333 | 597 | ||
598 | VM_BUG_ON_PAGE(PageCompound(page), page); | ||
599 | |||
334 | spin_lock_irq(&mapping->tree_lock); | 600 | spin_lock_irq(&mapping->tree_lock); |
335 | error = shmem_radix_tree_replace(mapping, page->index, page, radswap); | 601 | error = shmem_radix_tree_replace(mapping, page->index, page, radswap); |
336 | page->mapping = NULL; | 602 | page->mapping = NULL; |
@@ -510,10 +776,33 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
510 | continue; | 776 | continue; |
511 | } | 777 | } |
512 | 778 | ||
779 | VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page); | ||
780 | |||
513 | if (!trylock_page(page)) | 781 | if (!trylock_page(page)) |
514 | continue; | 782 | continue; |
783 | |||
784 | if (PageTransTail(page)) { | ||
785 | /* Middle of THP: zero out the page */ | ||
786 | clear_highpage(page); | ||
787 | unlock_page(page); | ||
788 | continue; | ||
789 | } else if (PageTransHuge(page)) { | ||
790 | if (index == round_down(end, HPAGE_PMD_NR)) { | ||
791 | /* | ||
792 | * Range ends in the middle of THP: | ||
793 | * zero out the page | ||
794 | */ | ||
795 | clear_highpage(page); | ||
796 | unlock_page(page); | ||
797 | continue; | ||
798 | } | ||
799 | index += HPAGE_PMD_NR - 1; | ||
800 | i += HPAGE_PMD_NR - 1; | ||
801 | } | ||
802 | |||
515 | if (!unfalloc || !PageUptodate(page)) { | 803 | if (!unfalloc || !PageUptodate(page)) { |
516 | if (page->mapping == mapping) { | 804 | VM_BUG_ON_PAGE(PageTail(page), page); |
805 | if (page_mapping(page) == mapping) { | ||
517 | VM_BUG_ON_PAGE(PageWriteback(page), page); | 806 | VM_BUG_ON_PAGE(PageWriteback(page), page); |
518 | truncate_inode_page(mapping, page); | 807 | truncate_inode_page(mapping, page); |
519 | } | 808 | } |
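In shmem_undo_range(), a THP that straddles the end of the truncated range is only zeroed, while one that lies fully inside the range is truncated and its tail pages skipped. The test is whether the head's index equals round_down(end, HPAGE_PMD_NR); with 4K pages (HPAGE_PMD_NR == 512), a THP headed at index 512 straddles end == 1000 but is fully covered by end == 2048. Restated as a standalone predicate (illustrative helper, not part of the patch):

	static bool shmem_thp_straddles_end(struct page *head, pgoff_t end)
	{
		/* head->index is always HPAGE_PMD_NR-aligned for a file THP */
		return head->index == round_down(end, HPAGE_PMD_NR);
	}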
@@ -589,8 +878,36 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
589 | } | 878 | } |
590 | 879 | ||
591 | lock_page(page); | 880 | lock_page(page); |
881 | |||
882 | if (PageTransTail(page)) { | ||
883 | /* Middle of THP: zero out the page */ | ||
884 | clear_highpage(page); | ||
885 | unlock_page(page); | ||
886 | /* | ||
887 | * Partial THP truncate due to 'start' falling in | ||
888 | * the middle of the THP: no need to look at these | ||
889 | * pages again on a !pvec.nr restart. | ||
890 | */ | ||
891 | if (index != round_down(end, HPAGE_PMD_NR)) | ||
892 | start++; | ||
893 | continue; | ||
894 | } else if (PageTransHuge(page)) { | ||
895 | if (index == round_down(end, HPAGE_PMD_NR)) { | ||
896 | /* | ||
897 | * Range ends in the middle of THP: | ||
898 | * zero out the page | ||
899 | */ | ||
900 | clear_highpage(page); | ||
901 | unlock_page(page); | ||
902 | continue; | ||
903 | } | ||
904 | index += HPAGE_PMD_NR - 1; | ||
905 | i += HPAGE_PMD_NR - 1; | ||
906 | } | ||
907 | |||
592 | if (!unfalloc || !PageUptodate(page)) { | 908 | if (!unfalloc || !PageUptodate(page)) { |
593 | if (page->mapping == mapping) { | 909 | VM_BUG_ON_PAGE(PageTail(page), page); |
910 | if (page_mapping(page) == mapping) { | ||
594 | VM_BUG_ON_PAGE(PageWriteback(page), page); | 911 | VM_BUG_ON_PAGE(PageWriteback(page), page); |
595 | truncate_inode_page(mapping, page); | 912 | truncate_inode_page(mapping, page); |
596 | } else { | 913 | } else { |
@@ -607,10 +924,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, | |||
607 | index++; | 924 | index++; |
608 | } | 925 | } |
609 | 926 | ||
610 | spin_lock(&info->lock); | 927 | spin_lock_irq(&info->lock); |
611 | info->swapped -= nr_swaps_freed; | 928 | info->swapped -= nr_swaps_freed; |
612 | shmem_recalc_inode(inode); | 929 | shmem_recalc_inode(inode); |
613 | spin_unlock(&info->lock); | 930 | spin_unlock_irq(&info->lock); |
614 | } | 931 | } |
615 | 932 | ||
616 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | 933 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) |
@@ -627,9 +944,9 @@ static int shmem_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
627 | struct shmem_inode_info *info = SHMEM_I(inode); | 944 | struct shmem_inode_info *info = SHMEM_I(inode); |
628 | 945 | ||
629 | if (info->alloced - info->swapped != inode->i_mapping->nrpages) { | 946 | if (info->alloced - info->swapped != inode->i_mapping->nrpages) { |
630 | spin_lock(&info->lock); | 947 | spin_lock_irq(&info->lock); |
631 | shmem_recalc_inode(inode); | 948 | shmem_recalc_inode(inode); |
632 | spin_unlock(&info->lock); | 949 | spin_unlock_irq(&info->lock); |
633 | } | 950 | } |
634 | generic_fillattr(inode, stat); | 951 | generic_fillattr(inode, stat); |
635 | return 0; | 952 | return 0; |
@@ -639,6 +956,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
639 | { | 956 | { |
640 | struct inode *inode = d_inode(dentry); | 957 | struct inode *inode = d_inode(dentry); |
641 | struct shmem_inode_info *info = SHMEM_I(inode); | 958 | struct shmem_inode_info *info = SHMEM_I(inode); |
959 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
642 | int error; | 960 | int error; |
643 | 961 | ||
644 | error = inode_change_ok(inode, attr); | 962 | error = inode_change_ok(inode, attr); |
@@ -674,6 +992,20 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
674 | if (oldsize > holebegin) | 992 | if (oldsize > holebegin) |
675 | unmap_mapping_range(inode->i_mapping, | 993 | unmap_mapping_range(inode->i_mapping, |
676 | holebegin, 0, 1); | 994 | holebegin, 0, 1); |
995 | |||
996 | /* | ||
997 | * Part of the huge page can be beyond i_size: subject | ||
998 | * to shrink under memory pressure. | ||
999 | */ | ||
1000 | if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { | ||
1001 | spin_lock(&sbinfo->shrinklist_lock); | ||
1002 | if (list_empty(&info->shrinklist)) { | ||
1003 | list_add_tail(&info->shrinklist, | ||
1004 | &sbinfo->shrinklist); | ||
1005 | sbinfo->shrinklist_len++; | ||
1006 | } | ||
1007 | spin_unlock(&sbinfo->shrinklist_lock); | ||
1008 | } | ||
677 | } | 1009 | } |
678 | } | 1010 | } |
679 | 1011 | ||
@@ -686,11 +1018,20 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | |||
686 | static void shmem_evict_inode(struct inode *inode) | 1018 | static void shmem_evict_inode(struct inode *inode) |
687 | { | 1019 | { |
688 | struct shmem_inode_info *info = SHMEM_I(inode); | 1020 | struct shmem_inode_info *info = SHMEM_I(inode); |
1021 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
689 | 1022 | ||
690 | if (inode->i_mapping->a_ops == &shmem_aops) { | 1023 | if (inode->i_mapping->a_ops == &shmem_aops) { |
691 | shmem_unacct_size(info->flags, inode->i_size); | 1024 | shmem_unacct_size(info->flags, inode->i_size); |
692 | inode->i_size = 0; | 1025 | inode->i_size = 0; |
693 | shmem_truncate_range(inode, 0, (loff_t)-1); | 1026 | shmem_truncate_range(inode, 0, (loff_t)-1); |
1027 | if (!list_empty(&info->shrinklist)) { | ||
1028 | spin_lock(&sbinfo->shrinklist_lock); | ||
1029 | if (!list_empty(&info->shrinklist)) { | ||
1030 | list_del_init(&info->shrinklist); | ||
1031 | sbinfo->shrinklist_len--; | ||
1032 | } | ||
1033 | spin_unlock(&sbinfo->shrinklist_lock); | ||
1034 | } | ||
694 | if (!list_empty(&info->swaplist)) { | 1035 | if (!list_empty(&info->swaplist)) { |
695 | mutex_lock(&shmem_swaplist_mutex); | 1036 | mutex_lock(&shmem_swaplist_mutex); |
696 | list_del_init(&info->swaplist); | 1037 | list_del_init(&info->swaplist); |
@@ -773,9 +1114,9 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, | |||
773 | delete_from_swap_cache(*pagep); | 1114 | delete_from_swap_cache(*pagep); |
774 | set_page_dirty(*pagep); | 1115 | set_page_dirty(*pagep); |
775 | if (!error) { | 1116 | if (!error) { |
776 | spin_lock(&info->lock); | 1117 | spin_lock_irq(&info->lock); |
777 | info->swapped--; | 1118 | info->swapped--; |
778 | spin_unlock(&info->lock); | 1119 | spin_unlock_irq(&info->lock); |
779 | swap_free(swap); | 1120 | swap_free(swap); |
780 | } | 1121 | } |
781 | } | 1122 | } |
@@ -848,6 +1189,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
848 | swp_entry_t swap; | 1189 | swp_entry_t swap; |
849 | pgoff_t index; | 1190 | pgoff_t index; |
850 | 1191 | ||
1192 | VM_BUG_ON_PAGE(PageCompound(page), page); | ||
851 | BUG_ON(!PageLocked(page)); | 1193 | BUG_ON(!PageLocked(page)); |
852 | mapping = page->mapping; | 1194 | mapping = page->mapping; |
853 | index = page->index; | 1195 | index = page->index; |
@@ -922,10 +1264,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
922 | list_add_tail(&info->swaplist, &shmem_swaplist); | 1264 | list_add_tail(&info->swaplist, &shmem_swaplist); |
923 | 1265 | ||
924 | if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { | 1266 | if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { |
925 | spin_lock(&info->lock); | 1267 | spin_lock_irq(&info->lock); |
926 | shmem_recalc_inode(inode); | 1268 | shmem_recalc_inode(inode); |
927 | info->swapped++; | 1269 | info->swapped++; |
928 | spin_unlock(&info->lock); | 1270 | spin_unlock_irq(&info->lock); |
929 | 1271 | ||
930 | swap_shmem_alloc(swap); | 1272 | swap_shmem_alloc(swap); |
931 | shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); | 1273 | shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); |
@@ -984,24 +1326,63 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
984 | #define vm_policy vm_private_data | 1326 | #define vm_policy vm_private_data |
985 | #endif | 1327 | #endif |
986 | 1328 | ||
1329 | static void shmem_pseudo_vma_init(struct vm_area_struct *vma, | ||
1330 | struct shmem_inode_info *info, pgoff_t index) | ||
1331 | { | ||
1332 | /* Create a pseudo vma that just contains the policy */ | ||
1333 | vma->vm_start = 0; | ||
1334 | /* Bias interleave by inode number to distribute better across nodes */ | ||
1335 | vma->vm_pgoff = index + info->vfs_inode.i_ino; | ||
1336 | vma->vm_ops = NULL; | ||
1337 | vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); | ||
1338 | } | ||
1339 | |||
1340 | static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) | ||
1341 | { | ||
1342 | /* Drop reference taken by mpol_shared_policy_lookup() */ | ||
1343 | mpol_cond_put(vma->vm_policy); | ||
1344 | } | ||
1345 | |||
987 | static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, | 1346 | static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, |
988 | struct shmem_inode_info *info, pgoff_t index) | 1347 | struct shmem_inode_info *info, pgoff_t index) |
989 | { | 1348 | { |
990 | struct vm_area_struct pvma; | 1349 | struct vm_area_struct pvma; |
991 | struct page *page; | 1350 | struct page *page; |
992 | 1351 | ||
993 | /* Create a pseudo vma that just contains the policy */ | 1352 | shmem_pseudo_vma_init(&pvma, info, index); |
994 | pvma.vm_start = 0; | ||
995 | /* Bias interleave by inode number to distribute better across nodes */ | ||
996 | pvma.vm_pgoff = index + info->vfs_inode.i_ino; | ||
997 | pvma.vm_ops = NULL; | ||
998 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); | ||
999 | |||
1000 | page = swapin_readahead(swap, gfp, &pvma, 0); | 1353 | page = swapin_readahead(swap, gfp, &pvma, 0); |
1354 | shmem_pseudo_vma_destroy(&pvma); | ||
1001 | 1355 | ||
1002 | /* Drop reference taken by mpol_shared_policy_lookup() */ | 1356 | return page; |
1003 | mpol_cond_put(pvma.vm_policy); | 1357 | } |
1358 | |||
1359 | static struct page *shmem_alloc_hugepage(gfp_t gfp, | ||
1360 | struct shmem_inode_info *info, pgoff_t index) | ||
1361 | { | ||
1362 | struct vm_area_struct pvma; | ||
1363 | struct inode *inode = &info->vfs_inode; | ||
1364 | struct address_space *mapping = inode->i_mapping; | ||
1365 | pgoff_t idx, hindex = round_down(index, HPAGE_PMD_NR); | ||
1366 | void __rcu **results; | ||
1367 | struct page *page; | ||
1004 | 1368 | ||
1369 | if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) | ||
1370 | return NULL; | ||
1371 | |||
1372 | rcu_read_lock(); | ||
1373 | if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx, | ||
1374 | hindex, 1) && idx < hindex + HPAGE_PMD_NR) { | ||
1375 | rcu_read_unlock(); | ||
1376 | return NULL; | ||
1377 | } | ||
1378 | rcu_read_unlock(); | ||
1379 | |||
1380 | shmem_pseudo_vma_init(&pvma, info, hindex); | ||
1381 | page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, | ||
1382 | HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); | ||
1383 | shmem_pseudo_vma_destroy(&pvma); | ||
1384 | if (page) | ||
1385 | prep_transhuge_page(page); | ||
1005 | return page; | 1386 | return page; |
1006 | } | 1387 | } |
1007 | 1388 | ||
@@ -1011,23 +1392,51 @@ static struct page *shmem_alloc_page(gfp_t gfp, | |||
1011 | struct vm_area_struct pvma; | 1392 | struct vm_area_struct pvma; |
1012 | struct page *page; | 1393 | struct page *page; |
1013 | 1394 | ||
1014 | /* Create a pseudo vma that just contains the policy */ | 1395 | shmem_pseudo_vma_init(&pvma, info, index); |
1015 | pvma.vm_start = 0; | 1396 | page = alloc_page_vma(gfp, &pvma, 0); |
1016 | /* Bias interleave by inode number to distribute better across nodes */ | 1397 | shmem_pseudo_vma_destroy(&pvma); |
1017 | pvma.vm_pgoff = index + info->vfs_inode.i_ino; | 1398 | |
1018 | pvma.vm_ops = NULL; | 1399 | return page; |
1019 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); | 1400 | } |
1401 | |||
1402 | static struct page *shmem_alloc_and_acct_page(gfp_t gfp, | ||
1403 | struct shmem_inode_info *info, struct shmem_sb_info *sbinfo, | ||
1404 | pgoff_t index, bool huge) | ||
1405 | { | ||
1406 | struct page *page; | ||
1407 | int nr; | ||
1408 | int err = -ENOSPC; | ||
1409 | |||
1410 | if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) | ||
1411 | huge = false; | ||
1412 | nr = huge ? HPAGE_PMD_NR : 1; | ||
1413 | |||
1414 | if (shmem_acct_block(info->flags, nr)) | ||
1415 | goto failed; | ||
1416 | if (sbinfo->max_blocks) { | ||
1417 | if (percpu_counter_compare(&sbinfo->used_blocks, | ||
1418 | sbinfo->max_blocks - nr) > 0) | ||
1419 | goto unacct; | ||
1420 | percpu_counter_add(&sbinfo->used_blocks, nr); | ||
1421 | } | ||
1020 | 1422 | ||
1021 | page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false); | 1423 | if (huge) |
1424 | page = shmem_alloc_hugepage(gfp, info, index); | ||
1425 | else | ||
1426 | page = shmem_alloc_page(gfp, info, index); | ||
1022 | if (page) { | 1427 | if (page) { |
1023 | __SetPageLocked(page); | 1428 | __SetPageLocked(page); |
1024 | __SetPageSwapBacked(page); | 1429 | __SetPageSwapBacked(page); |
1430 | return page; | ||
1025 | } | 1431 | } |
1026 | 1432 | ||
1027 | /* Drop reference taken by mpol_shared_policy_lookup() */ | 1433 | err = -ENOMEM; |
1028 | mpol_cond_put(pvma.vm_policy); | 1434 | if (sbinfo->max_blocks) |
1029 | 1435 | percpu_counter_add(&sbinfo->used_blocks, -nr); | |
1030 | return page; | 1436 | unacct: |
1437 | shmem_unacct_blocks(info->flags, nr); | ||
1438 | failed: | ||
1439 | return ERR_PTR(err); | ||
1031 | } | 1440 | } |
1032 | 1441 | ||
1033 | /* | 1442 | /* |
@@ -1132,12 +1541,16 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | |||
1132 | struct mem_cgroup *memcg; | 1541 | struct mem_cgroup *memcg; |
1133 | struct page *page; | 1542 | struct page *page; |
1134 | swp_entry_t swap; | 1543 | swp_entry_t swap; |
1544 | enum sgp_type sgp_huge = sgp; | ||
1545 | pgoff_t hindex = index; | ||
1135 | int error; | 1546 | int error; |
1136 | int once = 0; | 1547 | int once = 0; |
1137 | int alloced = 0; | 1548 | int alloced = 0; |
1138 | 1549 | ||
1139 | if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) | 1550 | if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) |
1140 | return -EFBIG; | 1551 | return -EFBIG; |
1552 | if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) | ||
1553 | sgp = SGP_CACHE; | ||
1141 | repeat: | 1554 | repeat: |
1142 | swap.val = 0; | 1555 | swap.val = 0; |
1143 | page = find_lock_entry(mapping, index); | 1556 | page = find_lock_entry(mapping, index); |
@@ -1240,10 +1653,10 @@ repeat: | |||
1240 | 1653 | ||
1241 | mem_cgroup_commit_charge(page, memcg, true, false); | 1654 | mem_cgroup_commit_charge(page, memcg, true, false); |
1242 | 1655 | ||
1243 | spin_lock(&info->lock); | 1656 | spin_lock_irq(&info->lock); |
1244 | info->swapped--; | 1657 | info->swapped--; |
1245 | shmem_recalc_inode(inode); | 1658 | shmem_recalc_inode(inode); |
1246 | spin_unlock(&info->lock); | 1659 | spin_unlock_irq(&info->lock); |
1247 | 1660 | ||
1248 | if (sgp == SGP_WRITE) | 1661 | if (sgp == SGP_WRITE) |
1249 | mark_page_accessed(page); | 1662 | mark_page_accessed(page); |
@@ -1253,51 +1666,111 @@ repeat: | |||
1253 | swap_free(swap); | 1666 | swap_free(swap); |
1254 | 1667 | ||
1255 | } else { | 1668 | } else { |
1256 | if (shmem_acct_block(info->flags)) { | 1669 | /* shmem_symlink() */ |
1257 | error = -ENOSPC; | 1670 | if (mapping->a_ops != &shmem_aops) |
1258 | goto failed; | 1671 | goto alloc_nohuge; |
1672 | if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) | ||
1673 | goto alloc_nohuge; | ||
1674 | if (shmem_huge == SHMEM_HUGE_FORCE) | ||
1675 | goto alloc_huge; | ||
1676 | switch (sbinfo->huge) { | ||
1677 | loff_t i_size; | ||
1678 | pgoff_t off; | ||
1679 | case SHMEM_HUGE_NEVER: | ||
1680 | goto alloc_nohuge; | ||
1681 | case SHMEM_HUGE_WITHIN_SIZE: | ||
1682 | off = round_up(index, HPAGE_PMD_NR); | ||
1683 | i_size = round_up(i_size_read(inode), PAGE_SIZE); | ||
1684 | if (i_size >= HPAGE_PMD_SIZE && | ||
1685 | i_size >> PAGE_SHIFT >= off) | ||
1686 | goto alloc_huge; | ||
1687 | /* fallthrough */ | ||
1688 | case SHMEM_HUGE_ADVISE: | ||
1689 | if (sgp_huge == SGP_HUGE) | ||
1690 | goto alloc_huge; | ||
1691 | /* TODO: implement fadvise() hints */ | ||
1692 | goto alloc_nohuge; | ||
1259 | } | 1693 | } |
1260 | if (sbinfo->max_blocks) { | 1694 | |
1261 | if (percpu_counter_compare(&sbinfo->used_blocks, | 1695 | alloc_huge: |
1262 | sbinfo->max_blocks) >= 0) { | 1696 | page = shmem_alloc_and_acct_page(gfp, info, sbinfo, |
1263 | error = -ENOSPC; | 1697 | index, true); |
1264 | goto unacct; | 1698 | if (IS_ERR(page)) { |
1699 | alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, info, sbinfo, | ||
1700 | index, false); | ||
1701 | } | ||
1702 | if (IS_ERR(page)) { | ||
1703 | int retry = 5; | ||
1704 | error = PTR_ERR(page); | ||
1705 | page = NULL; | ||
1706 | if (error != -ENOSPC) | ||
1707 | goto failed; | ||
1708 | /* | ||
1709 | * Try to reclaim some space by splitting a huge page | ||
1710 | * beyond i_size on the filesystem. | ||
1711 | */ | ||
1712 | while (retry--) { | ||
1713 | int ret; | ||
1714 | ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); | ||
1715 | if (ret == SHRINK_STOP) | ||
1716 | break; | ||
1717 | if (ret) | ||
1718 | goto alloc_nohuge; | ||
1265 | } | 1719 | } |
1266 | percpu_counter_inc(&sbinfo->used_blocks); | 1720 | goto failed; |
1267 | } | 1721 | } |
1268 | 1722 | ||
1269 | page = shmem_alloc_page(gfp, info, index); | 1723 | if (PageTransHuge(page)) |
1270 | if (!page) { | 1724 | hindex = round_down(index, HPAGE_PMD_NR); |
1271 | error = -ENOMEM; | 1725 | else |
1272 | goto decused; | 1726 | hindex = index; |
1273 | } | 1727 | |
1274 | if (sgp == SGP_WRITE) | 1728 | if (sgp == SGP_WRITE) |
1275 | __SetPageReferenced(page); | 1729 | __SetPageReferenced(page); |
1276 | 1730 | ||
1277 | error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, | 1731 | error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, |
1278 | false); | 1732 | PageTransHuge(page)); |
1279 | if (error) | 1733 | if (error) |
1280 | goto decused; | 1734 | goto unacct; |
1281 | error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); | 1735 | error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK, |
1736 | compound_order(page)); | ||
1282 | if (!error) { | 1737 | if (!error) { |
1283 | error = shmem_add_to_page_cache(page, mapping, index, | 1738 | error = shmem_add_to_page_cache(page, mapping, hindex, |
1284 | NULL); | 1739 | NULL); |
1285 | radix_tree_preload_end(); | 1740 | radix_tree_preload_end(); |
1286 | } | 1741 | } |
1287 | if (error) { | 1742 | if (error) { |
1288 | mem_cgroup_cancel_charge(page, memcg, false); | 1743 | mem_cgroup_cancel_charge(page, memcg, |
1289 | goto decused; | 1744 | PageTransHuge(page)); |
1745 | goto unacct; | ||
1290 | } | 1746 | } |
1291 | mem_cgroup_commit_charge(page, memcg, false, false); | 1747 | mem_cgroup_commit_charge(page, memcg, false, |
1748 | PageTransHuge(page)); | ||
1292 | lru_cache_add_anon(page); | 1749 | lru_cache_add_anon(page); |
1293 | 1750 | ||
1294 | spin_lock(&info->lock); | 1751 | spin_lock_irq(&info->lock); |
1295 | info->alloced++; | 1752 | info->alloced += 1 << compound_order(page); |
1296 | inode->i_blocks += BLOCKS_PER_PAGE; | 1753 | inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); |
1297 | shmem_recalc_inode(inode); | 1754 | shmem_recalc_inode(inode); |
1298 | spin_unlock(&info->lock); | 1755 | spin_unlock_irq(&info->lock); |
1299 | alloced = true; | 1756 | alloced = true; |
1300 | 1757 | ||
1758 | if (PageTransHuge(page) && | ||
1759 | DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < | ||
1760 | hindex + HPAGE_PMD_NR - 1) { | ||
1761 | /* | ||
1762 | * Part of the huge page is beyond i_size: subject | ||
1763 | * to shrink under memory pressure. | ||
1764 | */ | ||
1765 | spin_lock(&sbinfo->shrinklist_lock); | ||
1766 | if (list_empty(&info->shrinklist)) { | ||
1767 | list_add_tail(&info->shrinklist, | ||
1768 | &sbinfo->shrinklist); | ||
1769 | sbinfo->shrinklist_len++; | ||
1770 | } | ||
1771 | spin_unlock(&sbinfo->shrinklist_lock); | ||
1772 | } | ||
1773 | |||
1301 | /* | 1774 | /* |
1302 | * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. | 1775 | * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. |
1303 | */ | 1776 | */ |
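The huge=within_size case in this hunk only picks a huge page when the rounded-up i_size covers the PMD-sized block containing the faulting index. A minimal userspace sketch of that test, assuming 4 KiB pages and a 2 MiB PMD size (so HPAGE_PMD_NR is 512); round_up_ul() stands in for the kernel's round_up():

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)         /* 4 KiB, assumed */
#define HPAGE_PMD_NR    512                         /* 2 MiB / 4 KiB, assumed */
#define HPAGE_PMD_SIZE  (HPAGE_PMD_NR * PAGE_SIZE)

static unsigned long round_up_ul(unsigned long x, unsigned long to)
{
	return (x + to - 1) / to * to;
}

/* Mirror of the within_size test above: i_size (rounded up to a page) must
 * span at least one PMD and reach index rounded up to a PMD boundary. */
static bool within_size_ok(unsigned long index, unsigned long i_size)
{
	unsigned long off = round_up_ul(index, HPAGE_PMD_NR);

	i_size = round_up_ul(i_size, PAGE_SIZE);
	return i_size >= HPAGE_PMD_SIZE && (i_size >> PAGE_SHIFT) >= off;
}

int main(void)
{
	/* 3 MiB file: index 0 sits in a fully covered 2 MiB block... */
	printf("%d\n", within_size_ok(0, 3UL << 20));   /* 1 */
	/* ...but index 600 (past 2 MiB) does not. */
	printf("%d\n", within_size_ok(600, 3UL << 20)); /* 0 */
	return 0;
}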
@@ -1309,10 +1782,15 @@ clear: | |||
1309 | * but SGP_FALLOC on a page fallocated earlier must initialize | 1782 | * but SGP_FALLOC on a page fallocated earlier must initialize |
1310 | * it now, lest undo on failure cancel our earlier guarantee. | 1783 | * it now, lest undo on failure cancel our earlier guarantee. |
1311 | */ | 1784 | */ |
1312 | if (sgp != SGP_WRITE) { | 1785 | if (sgp != SGP_WRITE && !PageUptodate(page)) { |
1313 | clear_highpage(page); | 1786 | struct page *head = compound_head(page); |
1314 | flush_dcache_page(page); | 1787 | int i; |
1315 | SetPageUptodate(page); | 1788 | |
1789 | for (i = 0; i < (1 << compound_order(head)); i++) { | ||
1790 | clear_highpage(head + i); | ||
1791 | flush_dcache_page(head + i); | ||
1792 | } | ||
1793 | SetPageUptodate(head); | ||
1316 | } | 1794 | } |
1317 | } | 1795 | } |
1318 | 1796 | ||
@@ -1322,24 +1800,30 @@ clear: | |||
1322 | if (alloced) { | 1800 | if (alloced) { |
1323 | ClearPageDirty(page); | 1801 | ClearPageDirty(page); |
1324 | delete_from_page_cache(page); | 1802 | delete_from_page_cache(page); |
1325 | spin_lock(&info->lock); | 1803 | spin_lock_irq(&info->lock); |
1326 | shmem_recalc_inode(inode); | 1804 | shmem_recalc_inode(inode); |
1327 | spin_unlock(&info->lock); | 1805 | spin_unlock_irq(&info->lock); |
1328 | } | 1806 | } |
1329 | error = -EINVAL; | 1807 | error = -EINVAL; |
1330 | goto unlock; | 1808 | goto unlock; |
1331 | } | 1809 | } |
1332 | *pagep = page; | 1810 | *pagep = page + index - hindex; |
1333 | return 0; | 1811 | return 0; |
1334 | 1812 | ||
1335 | /* | 1813 | /* |
1336 | * Error recovery. | 1814 | * Error recovery. |
1337 | */ | 1815 | */ |
1338 | decused: | ||
1339 | if (sbinfo->max_blocks) | ||
1340 | percpu_counter_add(&sbinfo->used_blocks, -1); | ||
1341 | unacct: | 1816 | unacct: |
1342 | shmem_unacct_blocks(info->flags, 1); | 1817 | if (sbinfo->max_blocks) |
1818 | percpu_counter_sub(&sbinfo->used_blocks, | ||
1819 | 1 << compound_order(page)); | ||
1820 | shmem_unacct_blocks(info->flags, 1 << compound_order(page)); | ||
1821 | |||
1822 | if (PageTransHuge(page)) { | ||
1823 | unlock_page(page); | ||
1824 | put_page(page); | ||
1825 | goto alloc_nohuge; | ||
1826 | } | ||
1343 | failed: | 1827 | failed: |
1344 | if (swap.val && !shmem_confirm_swap(mapping, index, swap)) | 1828 | if (swap.val && !shmem_confirm_swap(mapping, index, swap)) |
1345 | error = -EEXIST; | 1829 | error = -EEXIST; |
@@ -1350,9 +1834,9 @@ unlock: | |||
1350 | } | 1834 | } |
1351 | if (error == -ENOSPC && !once++) { | 1835 | if (error == -ENOSPC && !once++) { |
1352 | info = SHMEM_I(inode); | 1836 | info = SHMEM_I(inode); |
1353 | spin_lock(&info->lock); | 1837 | spin_lock_irq(&info->lock); |
1354 | shmem_recalc_inode(inode); | 1838 | shmem_recalc_inode(inode); |
1355 | spin_unlock(&info->lock); | 1839 | spin_unlock_irq(&info->lock); |
1356 | goto repeat; | 1840 | goto repeat; |
1357 | } | 1841 | } |
1358 | if (error == -EEXIST) /* from above or from radix_tree_insert */ | 1842 | if (error == -EEXIST) /* from above or from radix_tree_insert */ |
@@ -1364,6 +1848,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1364 | { | 1848 | { |
1365 | struct inode *inode = file_inode(vma->vm_file); | 1849 | struct inode *inode = file_inode(vma->vm_file); |
1366 | gfp_t gfp = mapping_gfp_mask(inode->i_mapping); | 1850 | gfp_t gfp = mapping_gfp_mask(inode->i_mapping); |
1851 | enum sgp_type sgp; | ||
1367 | int error; | 1852 | int error; |
1368 | int ret = VM_FAULT_LOCKED; | 1853 | int ret = VM_FAULT_LOCKED; |
1369 | 1854 | ||
@@ -1425,13 +1910,107 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1425 | spin_unlock(&inode->i_lock); | 1910 | spin_unlock(&inode->i_lock); |
1426 | } | 1911 | } |
1427 | 1912 | ||
1428 | error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, | 1913 | sgp = SGP_CACHE; |
1914 | if (vma->vm_flags & VM_HUGEPAGE) | ||
1915 | sgp = SGP_HUGE; | ||
1916 | else if (vma->vm_flags & VM_NOHUGEPAGE) | ||
1917 | sgp = SGP_NOHUGE; | ||
1918 | |||
1919 | error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, | ||
1429 | gfp, vma->vm_mm, &ret); | 1920 | gfp, vma->vm_mm, &ret); |
1430 | if (error) | 1921 | if (error) |
1431 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); | 1922 | return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); |
1432 | return ret; | 1923 | return ret; |
1433 | } | 1924 | } |
1434 | 1925 | ||
1926 | unsigned long shmem_get_unmapped_area(struct file *file, | ||
1927 | unsigned long uaddr, unsigned long len, | ||
1928 | unsigned long pgoff, unsigned long flags) | ||
1929 | { | ||
1930 | unsigned long (*get_area)(struct file *, | ||
1931 | unsigned long, unsigned long, unsigned long, unsigned long); | ||
1932 | unsigned long addr; | ||
1933 | unsigned long offset; | ||
1934 | unsigned long inflated_len; | ||
1935 | unsigned long inflated_addr; | ||
1936 | unsigned long inflated_offset; | ||
1937 | |||
1938 | if (len > TASK_SIZE) | ||
1939 | return -ENOMEM; | ||
1940 | |||
1941 | get_area = current->mm->get_unmapped_area; | ||
1942 | addr = get_area(file, uaddr, len, pgoff, flags); | ||
1943 | |||
1944 | if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) | ||
1945 | return addr; | ||
1946 | if (IS_ERR_VALUE(addr)) | ||
1947 | return addr; | ||
1948 | if (addr & ~PAGE_MASK) | ||
1949 | return addr; | ||
1950 | if (addr > TASK_SIZE - len) | ||
1951 | return addr; | ||
1952 | |||
1953 | if (shmem_huge == SHMEM_HUGE_DENY) | ||
1954 | return addr; | ||
1955 | if (len < HPAGE_PMD_SIZE) | ||
1956 | return addr; | ||
1957 | if (flags & MAP_FIXED) | ||
1958 | return addr; | ||
1959 | /* | ||
1960 | * Our priority is to support MAP_SHARED mapped hugely; | ||
1961 | * and support MAP_PRIVATE mapped hugely too, until it is COWed. | ||
1962 | * But if caller specified an address hint, respect that as before. | ||
1963 | */ | ||
1964 | if (uaddr) | ||
1965 | return addr; | ||
1966 | |||
1967 | if (shmem_huge != SHMEM_HUGE_FORCE) { | ||
1968 | struct super_block *sb; | ||
1969 | |||
1970 | if (file) { | ||
1971 | VM_BUG_ON(file->f_op != &shmem_file_operations); | ||
1972 | sb = file_inode(file)->i_sb; | ||
1973 | } else { | ||
1974 | /* | ||
1975 | * Called directly from mm/mmap.c, or drivers/char/mem.c | ||
1976 | * for "/dev/zero", to create a shared anonymous object. | ||
1977 | */ | ||
1978 | if (IS_ERR(shm_mnt)) | ||
1979 | return addr; | ||
1980 | sb = shm_mnt->mnt_sb; | ||
1981 | } | ||
1982 | if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER) | ||
1983 | return addr; | ||
1984 | } | ||
1985 | |||
1986 | offset = (pgoff << PAGE_SHIFT) & (HPAGE_PMD_SIZE-1); | ||
1987 | if (offset && offset + len < 2 * HPAGE_PMD_SIZE) | ||
1988 | return addr; | ||
1989 | if ((addr & (HPAGE_PMD_SIZE-1)) == offset) | ||
1990 | return addr; | ||
1991 | |||
1992 | inflated_len = len + HPAGE_PMD_SIZE - PAGE_SIZE; | ||
1993 | if (inflated_len > TASK_SIZE) | ||
1994 | return addr; | ||
1995 | if (inflated_len < len) | ||
1996 | return addr; | ||
1997 | |||
1998 | inflated_addr = get_area(NULL, 0, inflated_len, 0, flags); | ||
1999 | if (IS_ERR_VALUE(inflated_addr)) | ||
2000 | return addr; | ||
2001 | if (inflated_addr & ~PAGE_MASK) | ||
2002 | return addr; | ||
2003 | |||
2004 | inflated_offset = inflated_addr & (HPAGE_PMD_SIZE-1); | ||
2005 | inflated_addr += offset - inflated_offset; | ||
2006 | if (inflated_offset > offset) | ||
2007 | inflated_addr += HPAGE_PMD_SIZE; | ||
2008 | |||
2009 | if (inflated_addr > TASK_SIZE - len) | ||
2010 | return addr; | ||
2011 | return inflated_addr; | ||
2012 | } | ||
2013 | |||
1435 | #ifdef CONFIG_NUMA | 2014 | #ifdef CONFIG_NUMA |
1436 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) | 2015 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) |
1437 | { | 2016 | { |
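shmem_get_unmapped_area() above widens the search by HPAGE_PMD_SIZE - PAGE_SIZE and then shifts the returned address so that its offset within a 2 MiB block matches the file offset, which lets the fault path map whole PMDs. A rough userspace sketch of just that adjustment step; the 2 MiB PMD size and the sample address are assumptions:

#include <stdio.h>

#define PAGE_SIZE       4096UL                /* assumed */
#define HPAGE_PMD_SIZE  (512 * PAGE_SIZE)     /* 2 MiB, assumed */

/* Shift @inflated_addr (a result from the widened search) upward so that
 * its offset inside a 2 MiB block equals @offset, as the hunk above does. */
static unsigned long align_for_huge(unsigned long inflated_addr,
				    unsigned long offset)
{
	unsigned long inflated_offset = inflated_addr & (HPAGE_PMD_SIZE - 1);

	inflated_addr += offset - inflated_offset;
	if (inflated_offset > offset)
		inflated_addr += HPAGE_PMD_SIZE;
	return inflated_addr;
}

int main(void)
{
	/* e.g. widened search returned 0x7f0000123000, pgoff maps to offset 0 */
	unsigned long addr = align_for_huge(0x7f0000123000UL, 0);

	printf("%#lx\n", addr);  /* 0x7f0000200000: PMD-aligned */
	return 0;
}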
@@ -1456,7 +2035,7 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) | |||
1456 | struct shmem_inode_info *info = SHMEM_I(inode); | 2035 | struct shmem_inode_info *info = SHMEM_I(inode); |
1457 | int retval = -ENOMEM; | 2036 | int retval = -ENOMEM; |
1458 | 2037 | ||
1459 | spin_lock(&info->lock); | 2038 | spin_lock_irq(&info->lock); |
1460 | if (lock && !(info->flags & VM_LOCKED)) { | 2039 | if (lock && !(info->flags & VM_LOCKED)) { |
1461 | if (!user_shm_lock(inode->i_size, user)) | 2040 | if (!user_shm_lock(inode->i_size, user)) |
1462 | goto out_nomem; | 2041 | goto out_nomem; |
@@ -1471,7 +2050,7 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) | |||
1471 | retval = 0; | 2050 | retval = 0; |
1472 | 2051 | ||
1473 | out_nomem: | 2052 | out_nomem: |
1474 | spin_unlock(&info->lock); | 2053 | spin_unlock_irq(&info->lock); |
1475 | return retval; | 2054 | return retval; |
1476 | } | 2055 | } |
1477 | 2056 | ||
@@ -1479,6 +2058,11 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) | |||
1479 | { | 2058 | { |
1480 | file_accessed(file); | 2059 | file_accessed(file); |
1481 | vma->vm_ops = &shmem_vm_ops; | 2060 | vma->vm_ops = &shmem_vm_ops; |
2061 | if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && | ||
2062 | ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < | ||
2063 | (vma->vm_end & HPAGE_PMD_MASK)) { | ||
2064 | khugepaged_enter(vma, vma->vm_flags); | ||
2065 | } | ||
1482 | return 0; | 2066 | return 0; |
1483 | } | 2067 | } |
1484 | 2068 | ||
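The khugepaged_enter() guard added to shmem_mmap() fires only when the VMA spans at least one whole PMD-aligned block: vm_start rounded up to 2 MiB must fall below vm_end rounded down. A short sketch of that bit trick, with the 2 MiB PMD size assumed:

#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_SIZE  (2UL << 20)              /* 2 MiB, assumed */
#define HPAGE_PMD_MASK  (~(HPAGE_PMD_SIZE - 1))

/* Same test as above: adding ~HPAGE_PMD_MASK and masking rounds start up,
 * masking alone rounds end down; a full block fits iff the first is lower. */
static bool spans_full_pmd(unsigned long start, unsigned long end)
{
	return ((start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
	       (end & HPAGE_PMD_MASK);
}

int main(void)
{
	printf("%d\n", spans_full_pmd(0x100000, 0x500000)); /* 1: [2M,4M) fits */
	printf("%d\n", spans_full_pmd(0x100000, 0x300000)); /* 0: no full block */
	return 0;
}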
@@ -1504,6 +2088,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1504 | spin_lock_init(&info->lock); | 2088 | spin_lock_init(&info->lock); |
1505 | info->seals = F_SEAL_SEAL; | 2089 | info->seals = F_SEAL_SEAL; |
1506 | info->flags = flags & VM_NORESERVE; | 2090 | info->flags = flags & VM_NORESERVE; |
2091 | INIT_LIST_HEAD(&info->shrinklist); | ||
1507 | INIT_LIST_HEAD(&info->swaplist); | 2092 | INIT_LIST_HEAD(&info->swaplist); |
1508 | simple_xattrs_init(&info->xattrs); | 2093 | simple_xattrs_init(&info->xattrs); |
1509 | cache_no_acl(inode); | 2094 | cache_no_acl(inode); |
@@ -1589,12 +2174,23 @@ shmem_write_end(struct file *file, struct address_space *mapping, | |||
1589 | i_size_write(inode, pos + copied); | 2174 | i_size_write(inode, pos + copied); |
1590 | 2175 | ||
1591 | if (!PageUptodate(page)) { | 2176 | if (!PageUptodate(page)) { |
2177 | struct page *head = compound_head(page); | ||
2178 | if (PageTransCompound(page)) { | ||
2179 | int i; | ||
2180 | |||
2181 | for (i = 0; i < HPAGE_PMD_NR; i++) { | ||
2182 | if (head + i == page) | ||
2183 | continue; | ||
2184 | clear_highpage(head + i); | ||
2185 | flush_dcache_page(head + i); | ||
2186 | } | ||
2187 | } | ||
1592 | if (copied < PAGE_SIZE) { | 2188 | if (copied < PAGE_SIZE) { |
1593 | unsigned from = pos & (PAGE_SIZE - 1); | 2189 | unsigned from = pos & (PAGE_SIZE - 1); |
1594 | zero_user_segments(page, 0, from, | 2190 | zero_user_segments(page, 0, from, |
1595 | from + copied, PAGE_SIZE); | 2191 | from + copied, PAGE_SIZE); |
1596 | } | 2192 | } |
1597 | SetPageUptodate(page); | 2193 | SetPageUptodate(head); |
1598 | } | 2194 | } |
1599 | set_page_dirty(page); | 2195 | set_page_dirty(page); |
1600 | unlock_page(page); | 2196 | unlock_page(page); |
@@ -2860,11 +3456,24 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, | |||
2860 | sbinfo->gid = make_kgid(current_user_ns(), gid); | 3456 | sbinfo->gid = make_kgid(current_user_ns(), gid); |
2861 | if (!gid_valid(sbinfo->gid)) | 3457 | if (!gid_valid(sbinfo->gid)) |
2862 | goto bad_val; | 3458 | goto bad_val; |
3459 | #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE | ||
3460 | } else if (!strcmp(this_char, "huge")) { | ||
3461 | int huge; | ||
3462 | huge = shmem_parse_huge(value); | ||
3463 | if (huge < 0) | ||
3464 | goto bad_val; | ||
3465 | if (!has_transparent_hugepage() && | ||
3466 | huge != SHMEM_HUGE_NEVER) | ||
3467 | goto bad_val; | ||
3468 | sbinfo->huge = huge; | ||
3469 | #endif | ||
3470 | #ifdef CONFIG_NUMA | ||
2863 | } else if (!strcmp(this_char,"mpol")) { | 3471 | } else if (!strcmp(this_char,"mpol")) { |
2864 | mpol_put(mpol); | 3472 | mpol_put(mpol); |
2865 | mpol = NULL; | 3473 | mpol = NULL; |
2866 | if (mpol_parse_str(value, &mpol)) | 3474 | if (mpol_parse_str(value, &mpol)) |
2867 | goto bad_val; | 3475 | goto bad_val; |
3476 | #endif | ||
2868 | } else { | 3477 | } else { |
2869 | pr_err("tmpfs: Bad mount option %s\n", this_char); | 3478 | pr_err("tmpfs: Bad mount option %s\n", this_char); |
2870 | goto error; | 3479 | goto error; |
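For usage, the new huge= string is handled like any other tmpfs mount option, so it can be passed straight through mount(2); a hedged sketch, where the target directory and size are arbitrary examples and CONFIG_TRANSPARENT_HUGE_PAGECACHE plus root privileges are assumed:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* huge= accepts the keywords parsed by shmem_parse_huge() */
	if (mount("tmpfs", "/mnt/test", "tmpfs", 0,
		  "size=1G,huge=within_size") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}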
@@ -2910,6 +3519,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) | |||
2910 | goto out; | 3519 | goto out; |
2911 | 3520 | ||
2912 | error = 0; | 3521 | error = 0; |
3522 | sbinfo->huge = config.huge; | ||
2913 | sbinfo->max_blocks = config.max_blocks; | 3523 | sbinfo->max_blocks = config.max_blocks; |
2914 | sbinfo->max_inodes = config.max_inodes; | 3524 | sbinfo->max_inodes = config.max_inodes; |
2915 | sbinfo->free_inodes = config.max_inodes - inodes; | 3525 | sbinfo->free_inodes = config.max_inodes - inodes; |
@@ -2943,6 +3553,11 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) | |||
2943 | if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) | 3553 | if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) |
2944 | seq_printf(seq, ",gid=%u", | 3554 | seq_printf(seq, ",gid=%u", |
2945 | from_kgid_munged(&init_user_ns, sbinfo->gid)); | 3555 | from_kgid_munged(&init_user_ns, sbinfo->gid)); |
3556 | #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE | ||
3557 | /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ | ||
3558 | if (sbinfo->huge) | ||
3559 | seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); | ||
3560 | #endif | ||
2946 | shmem_show_mpol(seq, sbinfo->mpol); | 3561 | shmem_show_mpol(seq, sbinfo->mpol); |
2947 | return 0; | 3562 | return 0; |
2948 | } | 3563 | } |
@@ -3072,6 +3687,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) | |||
3072 | if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) | 3687 | if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) |
3073 | goto failed; | 3688 | goto failed; |
3074 | sbinfo->free_inodes = sbinfo->max_inodes; | 3689 | sbinfo->free_inodes = sbinfo->max_inodes; |
3690 | spin_lock_init(&sbinfo->shrinklist_lock); | ||
3691 | INIT_LIST_HEAD(&sbinfo->shrinklist); | ||
3075 | 3692 | ||
3076 | sb->s_maxbytes = MAX_LFS_FILESIZE; | 3693 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
3077 | sb->s_blocksize = PAGE_SIZE; | 3694 | sb->s_blocksize = PAGE_SIZE; |
@@ -3161,6 +3778,7 @@ static const struct address_space_operations shmem_aops = { | |||
3161 | 3778 | ||
3162 | static const struct file_operations shmem_file_operations = { | 3779 | static const struct file_operations shmem_file_operations = { |
3163 | .mmap = shmem_mmap, | 3780 | .mmap = shmem_mmap, |
3781 | .get_unmapped_area = shmem_get_unmapped_area, | ||
3164 | #ifdef CONFIG_TMPFS | 3782 | #ifdef CONFIG_TMPFS |
3165 | .llseek = shmem_file_llseek, | 3783 | .llseek = shmem_file_llseek, |
3166 | .read_iter = shmem_file_read_iter, | 3784 | .read_iter = shmem_file_read_iter, |
@@ -3233,6 +3851,10 @@ static const struct super_operations shmem_ops = { | |||
3233 | .evict_inode = shmem_evict_inode, | 3851 | .evict_inode = shmem_evict_inode, |
3234 | .drop_inode = generic_delete_inode, | 3852 | .drop_inode = generic_delete_inode, |
3235 | .put_super = shmem_put_super, | 3853 | .put_super = shmem_put_super, |
3854 | #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE | ||
3855 | .nr_cached_objects = shmem_unused_huge_count, | ||
3856 | .free_cached_objects = shmem_unused_huge_scan, | ||
3857 | #endif | ||
3236 | }; | 3858 | }; |
3237 | 3859 | ||
3238 | static const struct vm_operations_struct shmem_vm_ops = { | 3860 | static const struct vm_operations_struct shmem_vm_ops = { |
@@ -3282,6 +3904,13 @@ int __init shmem_init(void) | |||
3282 | pr_err("Could not kern_mount tmpfs\n"); | 3904 | pr_err("Could not kern_mount tmpfs\n"); |
3283 | goto out1; | 3905 | goto out1; |
3284 | } | 3906 | } |
3907 | |||
3908 | #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE | ||
3909 | if (has_transparent_hugepage() && shmem_huge < SHMEM_HUGE_DENY) | ||
3910 | SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; | ||
3911 | else | ||
3912 | shmem_huge = 0; /* just in case it was patched */ | ||
3913 | #endif | ||
3285 | return 0; | 3914 | return 0; |
3286 | 3915 | ||
3287 | out1: | 3916 | out1: |
@@ -3293,6 +3922,91 @@ out3: | |||
3293 | return error; | 3922 | return error; |
3294 | } | 3923 | } |
3295 | 3924 | ||
3925 | #if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS) | ||
3926 | static ssize_t shmem_enabled_show(struct kobject *kobj, | ||
3927 | struct kobj_attribute *attr, char *buf) | ||
3928 | { | ||
3929 | int values[] = { | ||
3930 | SHMEM_HUGE_ALWAYS, | ||
3931 | SHMEM_HUGE_WITHIN_SIZE, | ||
3932 | SHMEM_HUGE_ADVISE, | ||
3933 | SHMEM_HUGE_NEVER, | ||
3934 | SHMEM_HUGE_DENY, | ||
3935 | SHMEM_HUGE_FORCE, | ||
3936 | }; | ||
3937 | int i, count; | ||
3938 | |||
3939 | for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) { | ||
3940 | const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s "; | ||
3941 | |||
3942 | count += sprintf(buf + count, fmt, | ||
3943 | shmem_format_huge(values[i])); | ||
3944 | } | ||
3945 | buf[count - 1] = '\n'; | ||
3946 | return count; | ||
3947 | } | ||
3948 | |||
3949 | static ssize_t shmem_enabled_store(struct kobject *kobj, | ||
3950 | struct kobj_attribute *attr, const char *buf, size_t count) | ||
3951 | { | ||
3952 | char tmp[16]; | ||
3953 | int huge; | ||
3954 | |||
3955 | if (count + 1 > sizeof(tmp)) | ||
3956 | return -EINVAL; | ||
3957 | memcpy(tmp, buf, count); | ||
3958 | tmp[count] = '\0'; | ||
3959 | if (count && tmp[count - 1] == '\n') | ||
3960 | tmp[count - 1] = '\0'; | ||
3961 | |||
3962 | huge = shmem_parse_huge(tmp); | ||
3963 | if (huge == -EINVAL) | ||
3964 | return -EINVAL; | ||
3965 | if (!has_transparent_hugepage() && | ||
3966 | huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) | ||
3967 | return -EINVAL; | ||
3968 | |||
3969 | shmem_huge = huge; | ||
3970 | if (shmem_huge < SHMEM_HUGE_DENY) | ||
3971 | SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; | ||
3972 | return count; | ||
3973 | } | ||
3974 | |||
3975 | struct kobj_attribute shmem_enabled_attr = | ||
3976 | __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); | ||
3977 | |||
3978 | bool shmem_huge_enabled(struct vm_area_struct *vma) | ||
3979 | { | ||
3980 | struct inode *inode = file_inode(vma->vm_file); | ||
3981 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
3982 | loff_t i_size; | ||
3983 | pgoff_t off; | ||
3984 | |||
3985 | if (shmem_huge == SHMEM_HUGE_FORCE) | ||
3986 | return true; | ||
3987 | if (shmem_huge == SHMEM_HUGE_DENY) | ||
3988 | return false; | ||
3989 | switch (sbinfo->huge) { | ||
3990 | case SHMEM_HUGE_NEVER: | ||
3991 | return false; | ||
3992 | case SHMEM_HUGE_ALWAYS: | ||
3993 | return true; | ||
3994 | case SHMEM_HUGE_WITHIN_SIZE: | ||
3995 | off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); | ||
3996 | i_size = round_up(i_size_read(inode), PAGE_SIZE); | ||
3997 | if (i_size >= HPAGE_PMD_SIZE && | ||
3998 | i_size >> PAGE_SHIFT >= off) | ||
3999 | return true; | ||
4000 | case SHMEM_HUGE_ADVISE: | ||
4001 | /* TODO: implement fadvise() hints */ | ||
4002 | return (vma->vm_flags & VM_HUGEPAGE); | ||
4003 | default: | ||
4004 | VM_BUG_ON(1); | ||
4005 | return false; | ||
4006 | } | ||
4007 | } | ||
4008 | #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ | ||
4009 | |||
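shmem_enabled above is an ordinary sysfs knob: the show method prints all keywords with the current one in brackets, and the store method feeds the input to shmem_parse_huge(). A sketch of driving it from userspace, assuming the attribute is exposed under the usual /sys/kernel/mm/transparent_hugepage/ directory (the registration itself is not part of this hunk):

#include <stdio.h>

int main(void)
{
	const char *knob = "/sys/kernel/mm/transparent_hugepage/shmem_enabled";
	char line[128];
	FILE *f = fopen(knob, "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		fputs(line, stdout);   /* current value shown in [brackets] */
	fclose(f);

	f = fopen(knob, "w");          /* needs root */
	if (f) {
		fputs("advise\n", f);  /* any keyword shmem_parse_huge() accepts */
		fclose(f);
	}
	return 0;
}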
3296 | #else /* !CONFIG_SHMEM */ | 4010 | #else /* !CONFIG_SHMEM */ |
3297 | 4011 | ||
3298 | /* | 4012 | /* |
@@ -3335,6 +4049,15 @@ void shmem_unlock_mapping(struct address_space *mapping) | |||
3335 | { | 4049 | { |
3336 | } | 4050 | } |
3337 | 4051 | ||
4052 | #ifdef CONFIG_MMU | ||
4053 | unsigned long shmem_get_unmapped_area(struct file *file, | ||
4054 | unsigned long addr, unsigned long len, | ||
4055 | unsigned long pgoff, unsigned long flags) | ||
4056 | { | ||
4057 | return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); | ||
4058 | } | ||
4059 | #endif | ||
4060 | |||
3338 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | 4061 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) |
3339 | { | 4062 | { |
3340 | truncate_inode_pages_range(inode->i_mapping, lstart, lend); | 4063 | truncate_inode_pages_range(inode->i_mapping, lstart, lend); |
@@ -3461,6 +4184,13 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
3461 | fput(vma->vm_file); | 4184 | fput(vma->vm_file); |
3462 | vma->vm_file = file; | 4185 | vma->vm_file = file; |
3463 | vma->vm_ops = &shmem_vm_ops; | 4186 | vma->vm_ops = &shmem_vm_ops; |
4187 | |||
4188 | if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && | ||
4189 | ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < | ||
4190 | (vma->vm_end & HPAGE_PMD_MASK)) { | ||
4191 | khugepaged_enter(vma, vma->vm_flags); | ||
4192 | } | ||
4193 | |||
3464 | return 0; | 4194 | return 0; |
3465 | } | 4195 | } |
3466 | 4196 | ||
@@ -1236,61 +1236,6 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) | |||
1236 | } | 1236 | } |
1237 | } | 1237 | } |
1238 | 1238 | ||
1239 | #ifdef CONFIG_SLAB_FREELIST_RANDOM | ||
1240 | static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list, | ||
1241 | size_t count) | ||
1242 | { | ||
1243 | size_t i; | ||
1244 | unsigned int rand; | ||
1245 | |||
1246 | for (i = 0; i < count; i++) | ||
1247 | list[i] = i; | ||
1248 | |||
1249 | /* Fisher-Yates shuffle */ | ||
1250 | for (i = count - 1; i > 0; i--) { | ||
1251 | rand = prandom_u32_state(state); | ||
1252 | rand %= (i + 1); | ||
1253 | swap(list[i], list[rand]); | ||
1254 | } | ||
1255 | } | ||
1256 | |||
1257 | /* Create a random sequence per cache */ | ||
1258 | static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) | ||
1259 | { | ||
1260 | unsigned int seed, count = cachep->num; | ||
1261 | struct rnd_state state; | ||
1262 | |||
1263 | if (count < 2) | ||
1264 | return 0; | ||
1265 | |||
1266 | /* If it fails, we will just use the global lists */ | ||
1267 | cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp); | ||
1268 | if (!cachep->random_seq) | ||
1269 | return -ENOMEM; | ||
1270 | |||
1271 | /* Get best entropy at this stage */ | ||
1272 | get_random_bytes_arch(&seed, sizeof(seed)); | ||
1273 | prandom_seed_state(&state, seed); | ||
1274 | |||
1275 | freelist_randomize(&state, cachep->random_seq, count); | ||
1276 | return 0; | ||
1277 | } | ||
1278 | |||
1279 | /* Destroy the per-cache random freelist sequence */ | ||
1280 | static void cache_random_seq_destroy(struct kmem_cache *cachep) | ||
1281 | { | ||
1282 | kfree(cachep->random_seq); | ||
1283 | cachep->random_seq = NULL; | ||
1284 | } | ||
1285 | #else | ||
1286 | static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) | ||
1287 | { | ||
1288 | return 0; | ||
1289 | } | ||
1290 | static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } | ||
1291 | #endif /* CONFIG_SLAB_FREELIST_RANDOM */ | ||
1292 | |||
1293 | |||
1294 | /* | 1239 | /* |
1295 | * Initialisation. Called after the page allocator have been initialised and | 1240 | * Initialisation. Called after the page allocator have been initialised and |
1296 | * before smp_init(). | 1241 | * before smp_init(). |
@@ -2535,7 +2480,7 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) | |||
2535 | union freelist_init_state { | 2480 | union freelist_init_state { |
2536 | struct { | 2481 | struct { |
2537 | unsigned int pos; | 2482 | unsigned int pos; |
2538 | freelist_idx_t *list; | 2483 | unsigned int *list; |
2539 | unsigned int count; | 2484 | unsigned int count; |
2540 | unsigned int rand; | 2485 | unsigned int rand; |
2541 | }; | 2486 | }; |
@@ -2554,7 +2499,7 @@ static bool freelist_state_initialize(union freelist_init_state *state, | |||
2554 | unsigned int rand; | 2499 | unsigned int rand; |
2555 | 2500 | ||
2556 | /* Use best entropy available to define a random shift */ | 2501 | /* Use best entropy available to define a random shift */ |
2557 | get_random_bytes_arch(&rand, sizeof(rand)); | 2502 | rand = get_random_int(); |
2558 | 2503 | ||
2559 | /* Use a random state if the pre-computed list is not available */ | 2504 | /* Use a random state if the pre-computed list is not available */ |
2560 | if (!cachep->random_seq) { | 2505 | if (!cachep->random_seq) { |
@@ -2576,13 +2521,20 @@ static freelist_idx_t next_random_slot(union freelist_init_state *state) | |||
2576 | return (state->list[state->pos++] + state->rand) % state->count; | 2521 | return (state->list[state->pos++] + state->rand) % state->count; |
2577 | } | 2522 | } |
2578 | 2523 | ||
2524 | /* Swap two freelist entries */ | ||
2525 | static void swap_free_obj(struct page *page, unsigned int a, unsigned int b) | ||
2526 | { | ||
2527 | swap(((freelist_idx_t *)page->freelist)[a], | ||
2528 | ((freelist_idx_t *)page->freelist)[b]); | ||
2529 | } | ||
2530 | |||
2579 | /* | 2531 | /* |
2580 | * Shuffle the freelist initialization state based on pre-computed lists. | 2532 | * Shuffle the freelist initialization state based on pre-computed lists. |
2581 | * return true if the list was successfully shuffled, false otherwise. | 2533 | * return true if the list was successfully shuffled, false otherwise. |
2582 | */ | 2534 | */ |
2583 | static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) | 2535 | static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) |
2584 | { | 2536 | { |
2585 | unsigned int objfreelist = 0, i, count = cachep->num; | 2537 | unsigned int objfreelist = 0, i, rand, count = cachep->num; |
2586 | union freelist_init_state state; | 2538 | union freelist_init_state state; |
2587 | bool precomputed; | 2539 | bool precomputed; |
2588 | 2540 | ||
@@ -2607,7 +2559,15 @@ static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) | |||
2607 | * Later use a pre-computed list for speed. | 2559 | * Later use a pre-computed list for speed. |
2608 | */ | 2560 | */ |
2609 | if (!precomputed) { | 2561 | if (!precomputed) { |
2610 | freelist_randomize(&state.rnd_state, page->freelist, count); | 2562 | for (i = 0; i < count; i++) |
2563 | set_free_obj(page, i, i); | ||
2564 | |||
2565 | /* Fisher-Yates shuffle */ | ||
2566 | for (i = count - 1; i > 0; i--) { | ||
2567 | rand = prandom_u32_state(&state.rnd_state); | ||
2568 | rand %= (i + 1); | ||
2569 | swap_free_obj(page, i, rand); | ||
2570 | } | ||
2611 | } else { | 2571 | } else { |
2612 | for (i = 0; i < count; i++) | 2572 | for (i = 0; i < count; i++) |
2613 | set_free_obj(page, i, next_random_slot(&state)); | 2573 | set_free_obj(page, i, next_random_slot(&state)); |
@@ -2726,8 +2686,11 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, | |||
2726 | * critical path in kmem_cache_alloc(). | 2686 | * critical path in kmem_cache_alloc(). |
2727 | */ | 2687 | */ |
2728 | if (unlikely(flags & GFP_SLAB_BUG_MASK)) { | 2688 | if (unlikely(flags & GFP_SLAB_BUG_MASK)) { |
2729 | pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); | 2689 | gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; |
2730 | BUG(); | 2690 | flags &= ~GFP_SLAB_BUG_MASK; |
2691 | pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", | ||
2692 | invalid_mask, &invalid_mask, flags, &flags); | ||
2693 | dump_stack(); | ||
2731 | } | 2694 | } |
2732 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | 2695 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
2733 | 2696 | ||
@@ -3489,8 +3452,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, | |||
3489 | n->free_objects -= cachep->num; | 3452 | n->free_objects -= cachep->num; |
3490 | 3453 | ||
3491 | page = list_last_entry(&n->slabs_free, struct page, lru); | 3454 | page = list_last_entry(&n->slabs_free, struct page, lru); |
3492 | list_del(&page->lru); | 3455 | list_move(&page->lru, list); |
3493 | list_add(&page->lru, list); | ||
3494 | } | 3456 | } |
3495 | } | 3457 | } |
3496 | 3458 | ||
@@ -3979,7 +3941,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | |||
3979 | int shared = 0; | 3941 | int shared = 0; |
3980 | int batchcount = 0; | 3942 | int batchcount = 0; |
3981 | 3943 | ||
3982 | err = cache_random_seq_create(cachep, gfp); | 3944 | err = cache_random_seq_create(cachep, cachep->num, gfp); |
3983 | if (err) | 3945 | if (err) |
3984 | goto end; | 3946 | goto end; |
3985 | 3947 | ||
@@ -42,6 +42,7 @@ struct kmem_cache { | |||
42 | #include <linux/kmemcheck.h> | 42 | #include <linux/kmemcheck.h> |
43 | #include <linux/kasan.h> | 43 | #include <linux/kasan.h> |
44 | #include <linux/kmemleak.h> | 44 | #include <linux/kmemleak.h> |
45 | #include <linux/random.h> | ||
45 | 46 | ||
46 | /* | 47 | /* |
47 | * State of the slab allocator. | 48 | * State of the slab allocator. |
@@ -253,8 +254,7 @@ static __always_inline int memcg_charge_slab(struct page *page, | |||
253 | if (is_root_cache(s)) | 254 | if (is_root_cache(s)) |
254 | return 0; | 255 | return 0; |
255 | 256 | ||
256 | ret = __memcg_kmem_charge_memcg(page, gfp, order, | 257 | ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg); |
257 | s->memcg_params.memcg); | ||
258 | if (ret) | 258 | if (ret) |
259 | return ret; | 259 | return ret; |
260 | 260 | ||
@@ -268,6 +268,9 @@ static __always_inline int memcg_charge_slab(struct page *page, | |||
268 | static __always_inline void memcg_uncharge_slab(struct page *page, int order, | 268 | static __always_inline void memcg_uncharge_slab(struct page *page, int order, |
269 | struct kmem_cache *s) | 269 | struct kmem_cache *s) |
270 | { | 270 | { |
271 | if (!memcg_kmem_enabled()) | ||
272 | return; | ||
273 | |||
271 | memcg_kmem_update_page_stat(page, | 274 | memcg_kmem_update_page_stat(page, |
272 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 275 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
273 | MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, | 276 | MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, |
@@ -390,7 +393,11 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, | |||
390 | if (should_failslab(s, flags)) | 393 | if (should_failslab(s, flags)) |
391 | return NULL; | 394 | return NULL; |
392 | 395 | ||
393 | return memcg_kmem_get_cache(s, flags); | 396 | if (memcg_kmem_enabled() && |
397 | ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT))) | ||
398 | return memcg_kmem_get_cache(s); | ||
399 | |||
400 | return s; | ||
394 | } | 401 | } |
395 | 402 | ||
396 | static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, | 403 | static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, |
@@ -407,7 +414,9 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, | |||
407 | s->flags, flags); | 414 | s->flags, flags); |
408 | kasan_slab_alloc(s, object, flags); | 415 | kasan_slab_alloc(s, object, flags); |
409 | } | 416 | } |
410 | memcg_kmem_put_cache(s); | 417 | |
418 | if (memcg_kmem_enabled()) | ||
419 | memcg_kmem_put_cache(s); | ||
411 | } | 420 | } |
412 | 421 | ||
413 | #ifndef CONFIG_SLOB | 422 | #ifndef CONFIG_SLOB |
@@ -464,4 +473,17 @@ int memcg_slab_show(struct seq_file *m, void *p); | |||
464 | 473 | ||
465 | void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); | 474 | void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); |
466 | 475 | ||
476 | #ifdef CONFIG_SLAB_FREELIST_RANDOM | ||
477 | int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, | ||
478 | gfp_t gfp); | ||
479 | void cache_random_seq_destroy(struct kmem_cache *cachep); | ||
480 | #else | ||
481 | static inline int cache_random_seq_create(struct kmem_cache *cachep, | ||
482 | unsigned int count, gfp_t gfp) | ||
483 | { | ||
484 | return 0; | ||
485 | } | ||
486 | static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } | ||
487 | #endif /* CONFIG_SLAB_FREELIST_RANDOM */ | ||
488 | |||
467 | #endif /* MM_SLAB_H */ | 489 | #endif /* MM_SLAB_H */ |
diff --git a/mm/slab_common.c b/mm/slab_common.c index 82317abb03ed..71f0b28a1bec 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -1012,7 +1012,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) | |||
1012 | struct page *page; | 1012 | struct page *page; |
1013 | 1013 | ||
1014 | flags |= __GFP_COMP; | 1014 | flags |= __GFP_COMP; |
1015 | page = alloc_kmem_pages(flags, order); | 1015 | page = alloc_pages(flags, order); |
1016 | ret = page ? page_address(page) : NULL; | 1016 | ret = page ? page_address(page) : NULL; |
1017 | kmemleak_alloc(ret, size, 1, flags); | 1017 | kmemleak_alloc(ret, size, 1, flags); |
1018 | kasan_kmalloc_large(ret, size, flags); | 1018 | kasan_kmalloc_large(ret, size, flags); |
@@ -1030,6 +1030,53 @@ void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) | |||
1030 | EXPORT_SYMBOL(kmalloc_order_trace); | 1030 | EXPORT_SYMBOL(kmalloc_order_trace); |
1031 | #endif | 1031 | #endif |
1032 | 1032 | ||
1033 | #ifdef CONFIG_SLAB_FREELIST_RANDOM | ||
1034 | /* Randomize a generic freelist */ | ||
1035 | static void freelist_randomize(struct rnd_state *state, unsigned int *list, | ||
1036 | size_t count) | ||
1037 | { | ||
1038 | size_t i; | ||
1039 | unsigned int rand; | ||
1040 | |||
1041 | for (i = 0; i < count; i++) | ||
1042 | list[i] = i; | ||
1043 | |||
1044 | /* Fisher-Yates shuffle */ | ||
1045 | for (i = count - 1; i > 0; i--) { | ||
1046 | rand = prandom_u32_state(state); | ||
1047 | rand %= (i + 1); | ||
1048 | swap(list[i], list[rand]); | ||
1049 | } | ||
1050 | } | ||
1051 | |||
1052 | /* Create a random sequence per cache */ | ||
1053 | int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, | ||
1054 | gfp_t gfp) | ||
1055 | { | ||
1056 | struct rnd_state state; | ||
1057 | |||
1058 | if (count < 2 || cachep->random_seq) | ||
1059 | return 0; | ||
1060 | |||
1061 | cachep->random_seq = kcalloc(count, sizeof(unsigned int), gfp); | ||
1062 | if (!cachep->random_seq) | ||
1063 | return -ENOMEM; | ||
1064 | |||
1065 | /* Get best entropy at this stage of boot */ | ||
1066 | prandom_seed_state(&state, get_random_long()); | ||
1067 | |||
1068 | freelist_randomize(&state, cachep->random_seq, count); | ||
1069 | return 0; | ||
1070 | } | ||
1071 | |||
1072 | /* Destroy the per-cache random freelist sequence */ | ||
1073 | void cache_random_seq_destroy(struct kmem_cache *cachep) | ||
1074 | { | ||
1075 | kfree(cachep->random_seq); | ||
1076 | cachep->random_seq = NULL; | ||
1077 | } | ||
1078 | #endif /* CONFIG_SLAB_FREELIST_RANDOM */ | ||
1079 | |||
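The freelist_randomize() moved into slab_common.c above is a plain in-place Fisher-Yates shuffle seeded from get_random_long(). A standalone sketch of the same walk, with rand()/srand() standing in for the kernel's seeded prandom state:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Fill list with 0..count-1, then swap each slot with a randomly chosen
 * earlier (or same) slot, exactly as the kernel helper does. */
static void freelist_randomize(unsigned int *list, size_t count)
{
	size_t i, j;
	unsigned int tmp;

	for (i = 0; i < count; i++)
		list[i] = i;

	for (i = count - 1; i > 0; i--) {
		j = (size_t)rand() % (i + 1);
		tmp = list[i];
		list[i] = list[j];
		list[j] = tmp;
	}
}

int main(void)
{
	unsigned int seq[8];
	size_t i;

	srand((unsigned int)time(NULL));
	freelist_randomize(seq, 8);
	for (i = 0; i < 8; i++)
		printf("%u ", seq[i]);  /* a permutation of 0..7 */
	putchar('\n');
	return 0;
}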
1033 | #ifdef CONFIG_SLABINFO | 1080 | #ifdef CONFIG_SLABINFO |
1034 | 1081 | ||
1035 | #ifdef CONFIG_SLAB | 1082 | #ifdef CONFIG_SLAB |
@@ -1405,6 +1405,109 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, | |||
1405 | return page; | 1405 | return page; |
1406 | } | 1406 | } |
1407 | 1407 | ||
1408 | #ifdef CONFIG_SLAB_FREELIST_RANDOM | ||
1409 | /* Pre-initialize the random sequence cache */ | ||
1410 | static int init_cache_random_seq(struct kmem_cache *s) | ||
1411 | { | ||
1412 | int err; | ||
1413 | unsigned long i, count = oo_objects(s->oo); | ||
1414 | |||
1415 | err = cache_random_seq_create(s, count, GFP_KERNEL); | ||
1416 | if (err) { | ||
1417 | pr_err("SLUB: Unable to initialize free list for %s\n", | ||
1418 | s->name); | ||
1419 | return err; | ||
1420 | } | ||
1421 | |||
1422 | /* Transform to an offset on the set of pages */ | ||
1423 | if (s->random_seq) { | ||
1424 | for (i = 0; i < count; i++) | ||
1425 | s->random_seq[i] *= s->size; | ||
1426 | } | ||
1427 | return 0; | ||
1428 | } | ||
1429 | |||
1430 | /* Initialize each random sequence freelist per cache */ | ||
1431 | static void __init init_freelist_randomization(void) | ||
1432 | { | ||
1433 | struct kmem_cache *s; | ||
1434 | |||
1435 | mutex_lock(&slab_mutex); | ||
1436 | |||
1437 | list_for_each_entry(s, &slab_caches, list) | ||
1438 | init_cache_random_seq(s); | ||
1439 | |||
1440 | mutex_unlock(&slab_mutex); | ||
1441 | } | ||
1442 | |||
1443 | /* Get the next entry on the pre-computed freelist randomized */ | ||
1444 | static void *next_freelist_entry(struct kmem_cache *s, struct page *page, | ||
1445 | unsigned long *pos, void *start, | ||
1446 | unsigned long page_limit, | ||
1447 | unsigned long freelist_count) | ||
1448 | { | ||
1449 | unsigned int idx; | ||
1450 | |||
1451 | /* | ||
1452 | * If the target page allocation failed, the number of objects on the | ||
1453 | * page might be smaller than the usual size defined by the cache. | ||
1454 | */ | ||
1455 | do { | ||
1456 | idx = s->random_seq[*pos]; | ||
1457 | *pos += 1; | ||
1458 | if (*pos >= freelist_count) | ||
1459 | *pos = 0; | ||
1460 | } while (unlikely(idx >= page_limit)); | ||
1461 | |||
1462 | return (char *)start + idx; | ||
1463 | } | ||
1464 | |||
1465 | /* Shuffle the single linked freelist based on a random pre-computed sequence */ | ||
1466 | static bool shuffle_freelist(struct kmem_cache *s, struct page *page) | ||
1467 | { | ||
1468 | void *start; | ||
1469 | void *cur; | ||
1470 | void *next; | ||
1471 | unsigned long idx, pos, page_limit, freelist_count; | ||
1472 | |||
1473 | if (page->objects < 2 || !s->random_seq) | ||
1474 | return false; | ||
1475 | |||
1476 | freelist_count = oo_objects(s->oo); | ||
1477 | pos = get_random_int() % freelist_count; | ||
1478 | |||
1479 | page_limit = page->objects * s->size; | ||
1480 | start = fixup_red_left(s, page_address(page)); | ||
1481 | |||
1482 | /* First entry is used as the base of the freelist */ | ||
1483 | cur = next_freelist_entry(s, page, &pos, start, page_limit, | ||
1484 | freelist_count); | ||
1485 | page->freelist = cur; | ||
1486 | |||
1487 | for (idx = 1; idx < page->objects; idx++) { | ||
1488 | setup_object(s, page, cur); | ||
1489 | next = next_freelist_entry(s, page, &pos, start, page_limit, | ||
1490 | freelist_count); | ||
1491 | set_freepointer(s, cur, next); | ||
1492 | cur = next; | ||
1493 | } | ||
1494 | setup_object(s, page, cur); | ||
1495 | set_freepointer(s, cur, NULL); | ||
1496 | |||
1497 | return true; | ||
1498 | } | ||
1499 | #else | ||
1500 | static inline int init_cache_random_seq(struct kmem_cache *s) | ||
1501 | { | ||
1502 | return 0; | ||
1503 | } | ||
1504 | static inline void init_freelist_randomization(void) { } | ||
1505 | static inline bool shuffle_freelist(struct kmem_cache *s, struct page *page) | ||
1506 | { | ||
1507 | return false; | ||
1508 | } | ||
1509 | #endif /* CONFIG_SLAB_FREELIST_RANDOM */ | ||
1510 | |||
1408 | static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | 1511 | static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) |
1409 | { | 1512 | { |
1410 | struct page *page; | 1513 | struct page *page; |
@@ -1412,6 +1515,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1412 | gfp_t alloc_gfp; | 1515 | gfp_t alloc_gfp; |
1413 | void *start, *p; | 1516 | void *start, *p; |
1414 | int idx, order; | 1517 | int idx, order; |
1518 | bool shuffle; | ||
1415 | 1519 | ||
1416 | flags &= gfp_allowed_mask; | 1520 | flags &= gfp_allowed_mask; |
1417 | 1521 | ||
@@ -1473,15 +1577,19 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1473 | 1577 | ||
1474 | kasan_poison_slab(page); | 1578 | kasan_poison_slab(page); |
1475 | 1579 | ||
1476 | for_each_object_idx(p, idx, s, start, page->objects) { | 1580 | shuffle = shuffle_freelist(s, page); |
1477 | setup_object(s, page, p); | 1581 | |
1478 | if (likely(idx < page->objects)) | 1582 | if (!shuffle) { |
1479 | set_freepointer(s, p, p + s->size); | 1583 | for_each_object_idx(p, idx, s, start, page->objects) { |
1480 | else | 1584 | setup_object(s, page, p); |
1481 | set_freepointer(s, p, NULL); | 1585 | if (likely(idx < page->objects)) |
1586 | set_freepointer(s, p, p + s->size); | ||
1587 | else | ||
1588 | set_freepointer(s, p, NULL); | ||
1589 | } | ||
1590 | page->freelist = fixup_red_left(s, start); | ||
1482 | } | 1591 | } |
1483 | 1592 | ||
1484 | page->freelist = fixup_red_left(s, start); | ||
1485 | page->inuse = page->objects; | 1593 | page->inuse = page->objects; |
1486 | page->frozen = 1; | 1594 | page->frozen = 1; |
1487 | 1595 | ||
@@ -1504,8 +1612,10 @@ out: | |||
1504 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | 1612 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) |
1505 | { | 1613 | { |
1506 | if (unlikely(flags & GFP_SLAB_BUG_MASK)) { | 1614 | if (unlikely(flags & GFP_SLAB_BUG_MASK)) { |
1507 | pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); | 1615 | gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK; |
1508 | BUG(); | 1616 | flags &= ~GFP_SLAB_BUG_MASK; |
1617 | pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n", | ||
1618 | invalid_mask, &invalid_mask, flags, &flags); | ||
1509 | } | 1619 | } |
1510 | 1620 | ||
1511 | return allocate_slab(s, | 1621 | return allocate_slab(s, |
@@ -2867,7 +2977,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size, | |||
2867 | if (unlikely(!PageSlab(page))) { | 2977 | if (unlikely(!PageSlab(page))) { |
2868 | BUG_ON(!PageCompound(page)); | 2978 | BUG_ON(!PageCompound(page)); |
2869 | kfree_hook(object); | 2979 | kfree_hook(object); |
2870 | __free_kmem_pages(page, compound_order(page)); | 2980 | __free_pages(page, compound_order(page)); |
2871 | p[size] = NULL; /* mark object processed */ | 2981 | p[size] = NULL; /* mark object processed */ |
2872 | return size; | 2982 | return size; |
2873 | } | 2983 | } |
@@ -3207,6 +3317,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) | |||
3207 | 3317 | ||
3208 | void __kmem_cache_release(struct kmem_cache *s) | 3318 | void __kmem_cache_release(struct kmem_cache *s) |
3209 | { | 3319 | { |
3320 | cache_random_seq_destroy(s); | ||
3210 | free_percpu(s->cpu_slab); | 3321 | free_percpu(s->cpu_slab); |
3211 | free_kmem_cache_nodes(s); | 3322 | free_kmem_cache_nodes(s); |
3212 | } | 3323 | } |
@@ -3431,6 +3542,13 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) | |||
3431 | #ifdef CONFIG_NUMA | 3542 | #ifdef CONFIG_NUMA |
3432 | s->remote_node_defrag_ratio = 1000; | 3543 | s->remote_node_defrag_ratio = 1000; |
3433 | #endif | 3544 | #endif |
3545 | |||
3546 | /* Initialize the pre-computed randomized freelist if slab is up */ | ||
3547 | if (slab_state >= UP) { | ||
3548 | if (init_cache_random_seq(s)) | ||
3549 | goto error; | ||
3550 | } | ||
3551 | |||
3434 | if (!init_kmem_cache_nodes(s)) | 3552 | if (!init_kmem_cache_nodes(s)) |
3435 | goto error; | 3553 | goto error; |
3436 | 3554 | ||
@@ -3575,7 +3693,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | |||
3575 | void *ptr = NULL; | 3693 | void *ptr = NULL; |
3576 | 3694 | ||
3577 | flags |= __GFP_COMP | __GFP_NOTRACK; | 3695 | flags |= __GFP_COMP | __GFP_NOTRACK; |
3578 | page = alloc_kmem_pages_node(node, flags, get_order(size)); | 3696 | page = alloc_pages_node(node, flags, get_order(size)); |
3579 | if (page) | 3697 | if (page) |
3580 | ptr = page_address(page); | 3698 | ptr = page_address(page); |
3581 | 3699 | ||
@@ -3656,7 +3774,7 @@ void kfree(const void *x) | |||
3656 | if (unlikely(!PageSlab(page))) { | 3774 | if (unlikely(!PageSlab(page))) { |
3657 | BUG_ON(!PageCompound(page)); | 3775 | BUG_ON(!PageCompound(page)); |
3658 | kfree_hook(x); | 3776 | kfree_hook(x); |
3659 | __free_kmem_pages(page, compound_order(page)); | 3777 | __free_pages(page, compound_order(page)); |
3660 | return; | 3778 | return; |
3661 | } | 3779 | } |
3662 | slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); | 3780 | slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); |
@@ -3947,6 +4065,9 @@ void __init kmem_cache_init(void) | |||
3947 | setup_kmalloc_cache_index_table(); | 4065 | setup_kmalloc_cache_index_table(); |
3948 | create_kmalloc_caches(0); | 4066 | create_kmalloc_caches(0); |
3949 | 4067 | ||
4068 | /* Setup random freelists for each cache */ | ||
4069 | init_freelist_randomization(); | ||
4070 | |||
3950 | #ifdef CONFIG_SMP | 4071 | #ifdef CONFIG_SMP |
3951 | register_cpu_notifier(&slab_notifier); | 4072 | register_cpu_notifier(&slab_notifier); |
3952 | #endif | 4073 | #endif |
@@ -292,6 +292,7 @@ static bool need_activate_page_drain(int cpu) | |||
292 | 292 | ||
293 | void activate_page(struct page *page) | 293 | void activate_page(struct page *page) |
294 | { | 294 | { |
295 | page = compound_head(page); | ||
295 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | 296 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
296 | struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); | 297 | struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); |
297 | 298 | ||
@@ -316,6 +317,7 @@ void activate_page(struct page *page) | |||
316 | { | 317 | { |
317 | struct zone *zone = page_zone(page); | 318 | struct zone *zone = page_zone(page); |
318 | 319 | ||
320 | page = compound_head(page); | ||
319 | spin_lock_irq(&zone->lru_lock); | 321 | spin_lock_irq(&zone->lru_lock); |
320 | __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); | 322 | __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL); |
321 | spin_unlock_irq(&zone->lru_lock); | 323 | spin_unlock_irq(&zone->lru_lock); |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 031713ab40ce..78cfa292a29a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -2493,7 +2493,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2493 | goto bad_swap; | 2493 | goto bad_swap; |
2494 | } | 2494 | } |
2495 | /* frontswap enabled? set up bit-per-page map for frontswap */ | 2495 | /* frontswap enabled? set up bit-per-page map for frontswap */ |
2496 | if (frontswap_enabled) | 2496 | if (IS_ENABLED(CONFIG_FRONTSWAP)) |
2497 | frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); | 2497 | frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); |
2498 | 2498 | ||
2499 | if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { | 2499 | if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { |
diff --git a/mm/truncate.c b/mm/truncate.c index 4064f8f53daa..a01cce450a26 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -155,10 +155,14 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
155 | 155 | ||
156 | int truncate_inode_page(struct address_space *mapping, struct page *page) | 156 | int truncate_inode_page(struct address_space *mapping, struct page *page) |
157 | { | 157 | { |
158 | loff_t holelen; | ||
159 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
160 | |||
161 | holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; | ||
158 | if (page_mapped(page)) { | 162 | if (page_mapped(page)) { |
159 | unmap_mapping_range(mapping, | 163 | unmap_mapping_range(mapping, |
160 | (loff_t)page->index << PAGE_SHIFT, | 164 | (loff_t)page->index << PAGE_SHIFT, |
161 | PAGE_SIZE, 0); | 165 | holelen, 0); |
162 | } | 166 | } |
163 | return truncate_complete_page(mapping, page); | 167 | return truncate_complete_page(mapping, page); |
164 | } | 168 | } |
@@ -279,7 +283,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
279 | 283 | ||
280 | if (!trylock_page(page)) | 284 | if (!trylock_page(page)) |
281 | continue; | 285 | continue; |
282 | WARN_ON(page->index != index); | 286 | WARN_ON(page_to_pgoff(page) != index); |
283 | if (PageWriteback(page)) { | 287 | if (PageWriteback(page)) { |
284 | unlock_page(page); | 288 | unlock_page(page); |
285 | continue; | 289 | continue; |
@@ -367,7 +371,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
367 | } | 371 | } |
368 | 372 | ||
369 | lock_page(page); | 373 | lock_page(page); |
370 | WARN_ON(page->index != index); | 374 | WARN_ON(page_to_pgoff(page) != index); |
371 | wait_on_page_writeback(page); | 375 | wait_on_page_writeback(page); |
372 | truncate_inode_page(mapping, page); | 376 | truncate_inode_page(mapping, page); |
373 | unlock_page(page); | 377 | unlock_page(page); |
@@ -487,7 +491,21 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
487 | 491 | ||
488 | if (!trylock_page(page)) | 492 | if (!trylock_page(page)) |
489 | continue; | 493 | continue; |
490 | WARN_ON(page->index != index); | 494 | |
495 | WARN_ON(page_to_pgoff(page) != index); | ||
496 | |||
497 | /* Middle of THP: skip */ | ||
498 | if (PageTransTail(page)) { | ||
499 | unlock_page(page); | ||
500 | continue; | ||
501 | } else if (PageTransHuge(page)) { | ||
502 | index += HPAGE_PMD_NR - 1; | ||
503 | i += HPAGE_PMD_NR - 1; | ||
504 | /* 'end' is in the middle of THP */ | ||
505 | if (index == round_down(end, HPAGE_PMD_NR)) | ||
506 | continue; | ||
507 | } | ||
508 | |||
491 | ret = invalidate_inode_page(page); | 509 | ret = invalidate_inode_page(page); |
492 | unlock_page(page); | 510 | unlock_page(page); |
493 | /* | 511 | /* |
@@ -594,7 +612,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
594 | } | 612 | } |
595 | 613 | ||
596 | lock_page(page); | 614 | lock_page(page); |
597 | WARN_ON(page->index != index); | 615 | WARN_ON(page_to_pgoff(page) != index); |
598 | if (page->mapping != mapping) { | 616 | if (page->mapping != mapping) { |
599 | unlock_page(page); | 617 | unlock_page(page); |
600 | continue; | 618 | continue; |
@@ -399,10 +399,12 @@ struct address_space *page_mapping(struct page *page) | |||
399 | } | 399 | } |
400 | 400 | ||
401 | mapping = page->mapping; | 401 | mapping = page->mapping; |
402 | if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) | 402 | if ((unsigned long)mapping & PAGE_MAPPING_ANON) |
403 | return NULL; | 403 | return NULL; |
404 | return mapping; | 404 | |
405 | return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS); | ||
405 | } | 406 | } |
407 | EXPORT_SYMBOL(page_mapping); | ||
406 | 408 | ||
407 | /* Slow path of page_mapcount() for compound pages */ | 409 | /* Slow path of page_mapcount() for compound pages */ |
408 | int __page_mapcount(struct page *page) | 410 | int __page_mapcount(struct page *page) |
@@ -410,6 +412,12 @@ int __page_mapcount(struct page *page) | |||
410 | int ret; | 412 | int ret; |
411 | 413 | ||
412 | ret = atomic_read(&page->_mapcount) + 1; | 414 | ret = atomic_read(&page->_mapcount) + 1; |
415 | /* | ||
416 | * For file THP page->_mapcount contains total number of mappings | ||
417 | * of the page: no need to look into compound_mapcount. | ||
418 | */ | ||
419 | if (!PageAnon(page) && !PageHuge(page)) | ||
420 | return ret; | ||
413 | page = compound_head(page); | 421 | page = compound_head(page); |
414 | ret += atomic_read(compound_mapcount_ptr(page)) + 1; | 422 | ret += atomic_read(compound_mapcount_ptr(page)) + 1; |
415 | if (PageDoubleMap(page)) | 423 | if (PageDoubleMap(page)) |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e11475cdeb7a..91f44e78c516 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1501,7 +1501,7 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
1501 | struct page *page = area->pages[i]; | 1501 | struct page *page = area->pages[i]; |
1502 | 1502 | ||
1503 | BUG_ON(!page); | 1503 | BUG_ON(!page); |
1504 | __free_kmem_pages(page, 0); | 1504 | __free_pages(page, 0); |
1505 | } | 1505 | } |
1506 | 1506 | ||
1507 | kvfree(area->pages); | 1507 | kvfree(area->pages); |
@@ -1629,9 +1629,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
1629 | struct page *page; | 1629 | struct page *page; |
1630 | 1630 | ||
1631 | if (node == NUMA_NO_NODE) | 1631 | if (node == NUMA_NO_NODE) |
1632 | page = alloc_kmem_pages(alloc_mask, order); | 1632 | page = alloc_pages(alloc_mask, order); |
1633 | else | 1633 | else |
1634 | page = alloc_kmem_pages_node(node, alloc_mask, order); | 1634 | page = alloc_pages_node(node, alloc_mask, order); |
1635 | 1635 | ||
1636 | if (unlikely(!page)) { | 1636 | if (unlikely(!page)) { |
1637 | /* Successfully allocated i pages, free them in __vunmap() */ | 1637 | /* Successfully allocated i pages, free them in __vunmap() */ |
diff --git a/mm/vmscan.c b/mm/vmscan.c index c4a2f4512fca..21d417ccff69 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1055,8 +1055,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1055 | 1055 | ||
1056 | /* Adding to swap updated mapping */ | 1056 | /* Adding to swap updated mapping */ |
1057 | mapping = page_mapping(page); | 1057 | mapping = page_mapping(page); |
1058 | } else if (unlikely(PageTransHuge(page))) { | ||
1059 | /* Split file THP */ | ||
1060 | if (split_huge_page_to_list(page, page_list)) | ||
1061 | goto keep_locked; | ||
1058 | } | 1062 | } |
1059 | 1063 | ||
1064 | VM_BUG_ON_PAGE(PageTransHuge(page), page); | ||
1065 | |||
1060 | /* | 1066 | /* |
1061 | * The page is mapped into the page tables of one or more | 1067 | * The page is mapped into the page tables of one or more |
1062 | * processes. Try to unmap it here. | 1068 | * processes. Try to unmap it here. |
@@ -1254,7 +1260,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
1254 | 1260 | ||
1255 | list_for_each_entry_safe(page, next, page_list, lru) { | 1261 | list_for_each_entry_safe(page, next, page_list, lru) { |
1256 | if (page_is_file_cache(page) && !PageDirty(page) && | 1262 | if (page_is_file_cache(page) && !PageDirty(page) && |
1257 | !isolated_balloon_page(page)) { | 1263 | !__PageMovable(page)) { |
1258 | ClearPageActive(page); | 1264 | ClearPageActive(page); |
1259 | list_move(&page->lru, &clean_pages); | 1265 | list_move(&page->lru, &clean_pages); |
1260 | } | 1266 | } |
diff --git a/mm/vmstat.c b/mm/vmstat.c index cb2a67bb4158..7997f52935c9 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -718,7 +718,9 @@ const char * const vmstat_text[] = { | |||
718 | "nr_dirtied", | 718 | "nr_dirtied", |
719 | "nr_written", | 719 | "nr_written", |
720 | "nr_pages_scanned", | 720 | "nr_pages_scanned", |
721 | 721 | #if IS_ENABLED(CONFIG_ZSMALLOC) | |
722 | "nr_zspages", | ||
723 | #endif | ||
722 | #ifdef CONFIG_NUMA | 724 | #ifdef CONFIG_NUMA |
723 | "numa_hit", | 725 | "numa_hit", |
724 | "numa_miss", | 726 | "numa_miss", |
@@ -731,6 +733,8 @@ const char * const vmstat_text[] = { | |||
731 | "workingset_activate", | 733 | "workingset_activate", |
732 | "workingset_nodereclaim", | 734 | "workingset_nodereclaim", |
733 | "nr_anon_transparent_hugepages", | 735 | "nr_anon_transparent_hugepages", |
736 | "nr_shmem_hugepages", | ||
737 | "nr_shmem_pmdmapped", | ||
734 | "nr_free_cma", | 738 | "nr_free_cma", |
735 | 739 | ||
736 | /* enum writeback_stat_item counters */ | 740 | /* enum writeback_stat_item counters */ |
@@ -815,6 +819,8 @@ const char * const vmstat_text[] = { | |||
815 | "thp_fault_fallback", | 819 | "thp_fault_fallback", |
816 | "thp_collapse_alloc", | 820 | "thp_collapse_alloc", |
817 | "thp_collapse_alloc_failed", | 821 | "thp_collapse_alloc_failed", |
822 | "thp_file_alloc", | ||
823 | "thp_file_mapped", | ||
818 | "thp_split_page", | 824 | "thp_split_page", |
819 | "thp_split_page_failed", | 825 | "thp_split_page_failed", |
820 | "thp_deferred_split_page", | 826 | "thp_deferred_split_page", |
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b6d4f258cb53..04176de6df70 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -16,32 +16,15 @@ | |||
16 | * struct page(s) to form a zspage. | 16 | * struct page(s) to form a zspage. |
17 | * | 17 | * |
18 | * Usage of struct page fields: | 18 | * Usage of struct page fields: |
19 | * page->private: points to the first component (0-order) page | 19 | * page->private: points to zspage |
20 | * page->index (union with page->freelist): offset of the first object | 20 | * page->freelist(index): links together all component pages of a zspage |
21 | * starting in this page. For the first page, this is | 21 | * For the huge page, this is always 0, so we use this field |
22 | * always 0, so we use this field (aka freelist) to point | 22 | * to store handle. |
23 | * to the first free object in zspage. | ||
24 | * page->lru: links together all component pages (except the first page) | ||
25 | * of a zspage | ||
26 | * | ||
27 | * For _first_ page only: | ||
28 | * | ||
29 | * page->private: refers to the component page after the first page | ||
30 | * If the page is first_page for huge object, it stores handle. | ||
31 | * Look at size_class->huge. | ||
32 | * page->freelist: points to the first free object in zspage. | ||
33 | * Free objects are linked together using in-place | ||
34 | * metadata. | ||
35 | * page->objects: maximum number of objects we can store in this | ||
36 | * zspage (class->zspage_order * PAGE_SIZE / class->size) | ||
37 | * page->lru: links together first pages of various zspages. | ||
38 | * Basically forming list of zspages in a fullness group. | ||
39 | * page->mapping: class index and fullness group of the zspage | ||
40 | * page->inuse: the number of objects that are used in this zspage | ||
41 | * | 23 | * |
42 | * Usage of struct page flags: | 24 | * Usage of struct page flags: |
43 | * PG_private: identifies the first component page | 25 | * PG_private: identifies the first component page |
44 | * PG_private2: identifies the last component page | 26 | * PG_private2: identifies the last component page |
27 | * PG_owner_priv_1: identifies the huge component page | ||
45 | * | 28 | * |
46 | */ | 29 | */ |
47 | 30 | ||
@@ -66,6 +49,11 @@ | |||
66 | #include <linux/debugfs.h> | 49 | #include <linux/debugfs.h> |
67 | #include <linux/zsmalloc.h> | 50 | #include <linux/zsmalloc.h> |
68 | #include <linux/zpool.h> | 51 | #include <linux/zpool.h> |
52 | #include <linux/mount.h> | ||
53 | #include <linux/migrate.h> | ||
54 | #include <linux/pagemap.h> | ||
55 | |||
56 | #define ZSPAGE_MAGIC 0x58 | ||
69 | 57 | ||
70 | /* | 58 | /* |
71 | * This must be power of 2 and greater than or equal to sizeof(link_free). | 59 | * This must be power of 2 and greater than or equal to sizeof(link_free). |
@@ -88,9 +76,7 @@ | |||
88 | * Object location (<PFN>, <obj_idx>) is encoded as | 76 | * Object location (<PFN>, <obj_idx>) is encoded as |
89 | * as single (unsigned long) handle value. | 77 | * as single (unsigned long) handle value. |
90 | * | 78 | * |
91 | * Note that object index <obj_idx> is relative to system | 79 | * Note that object index <obj_idx> starts from 0. |
92 | * page <PFN> it is stored in, so for each sub-page belonging | ||
93 | * to a zspage, obj_idx starts with 0. | ||
94 | * | 80 | * |
95 | * This is made more complicated by various memory models and PAE. | 81 | * This is made more complicated by various memory models and PAE. |
96 | */ | 82 | */ |
@@ -149,33 +135,29 @@ | |||
149 | * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN | 135 | * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN |
150 | * (reason above) | 136 | * (reason above) |
151 | */ | 137 | */ |
152 | #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) | 138 | #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) |
153 | 139 | ||
154 | /* | 140 | /* |
155 | * We do not maintain any list for completely empty or full pages | 141 | * We do not maintain any list for completely empty or full pages |
156 | */ | 142 | */ |
157 | enum fullness_group { | 143 | enum fullness_group { |
158 | ZS_ALMOST_FULL, | ||
159 | ZS_ALMOST_EMPTY, | ||
160 | _ZS_NR_FULLNESS_GROUPS, | ||
161 | |||
162 | ZS_EMPTY, | 144 | ZS_EMPTY, |
163 | ZS_FULL | 145 | ZS_ALMOST_EMPTY, |
146 | ZS_ALMOST_FULL, | ||
147 | ZS_FULL, | ||
148 | NR_ZS_FULLNESS, | ||
164 | }; | 149 | }; |
165 | 150 | ||
166 | enum zs_stat_type { | 151 | enum zs_stat_type { |
152 | CLASS_EMPTY, | ||
153 | CLASS_ALMOST_EMPTY, | ||
154 | CLASS_ALMOST_FULL, | ||
155 | CLASS_FULL, | ||
167 | OBJ_ALLOCATED, | 156 | OBJ_ALLOCATED, |
168 | OBJ_USED, | 157 | OBJ_USED, |
169 | CLASS_ALMOST_FULL, | 158 | NR_ZS_STAT_TYPE, |
170 | CLASS_ALMOST_EMPTY, | ||
171 | }; | 159 | }; |
172 | 160 | ||
173 | #ifdef CONFIG_ZSMALLOC_STAT | ||
174 | #define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1) | ||
175 | #else | ||
176 | #define NR_ZS_STAT_TYPE (OBJ_USED + 1) | ||
177 | #endif | ||
178 | |||
179 | struct zs_size_stat { | 161 | struct zs_size_stat { |
180 | unsigned long objs[NR_ZS_STAT_TYPE]; | 162 | unsigned long objs[NR_ZS_STAT_TYPE]; |
181 | }; | 163 | }; |
@@ -184,6 +166,10 @@ struct zs_size_stat { | |||
184 | static struct dentry *zs_stat_root; | 166 | static struct dentry *zs_stat_root; |
185 | #endif | 167 | #endif |
186 | 168 | ||
169 | #ifdef CONFIG_COMPACTION | ||
170 | static struct vfsmount *zsmalloc_mnt; | ||
171 | #endif | ||
172 | |||
187 | /* | 173 | /* |
188 | * number of size_classes | 174 | * number of size_classes |
189 | */ | 175 | */ |
@@ -207,35 +193,49 @@ static const int fullness_threshold_frac = 4; | |||
207 | 193 | ||
208 | struct size_class { | 194 | struct size_class { |
209 | spinlock_t lock; | 195 | spinlock_t lock; |
210 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; | 196 | struct list_head fullness_list[NR_ZS_FULLNESS]; |
211 | /* | 197 | /* |
212 | * Size of objects stored in this class. Must be multiple | 198 | * Size of objects stored in this class. Must be multiple |
213 | * of ZS_ALIGN. | 199 | * of ZS_ALIGN. |
214 | */ | 200 | */ |
215 | int size; | 201 | int size; |
216 | unsigned int index; | 202 | int objs_per_zspage; |
217 | |||
218 | struct zs_size_stat stats; | ||
219 | |||
220 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ | 203 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ |
221 | int pages_per_zspage; | 204 | int pages_per_zspage; |
222 | /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ | 205 | |
223 | bool huge; | 206 | unsigned int index; |
207 | struct zs_size_stat stats; | ||
224 | }; | 208 | }; |
225 | 209 | ||
210 | /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ | ||
211 | static void SetPageHugeObject(struct page *page) | ||
212 | { | ||
213 | SetPageOwnerPriv1(page); | ||
214 | } | ||
215 | |||
216 | static void ClearPageHugeObject(struct page *page) | ||
217 | { | ||
218 | ClearPageOwnerPriv1(page); | ||
219 | } | ||
220 | |||
221 | static int PageHugeObject(struct page *page) | ||
222 | { | ||
223 | return PageOwnerPriv1(page); | ||
224 | } | ||
225 | |||
226 | /* | 226 | /* |
227 | * Placed within free objects to form a singly linked list. | 227 | * Placed within free objects to form a singly linked list. |
228 | * For every zspage, first_page->freelist gives head of this list. | 228 | * For every zspage, zspage->freeobj gives head of this list. |
229 | * | 229 | * |
230 | * This must be power of 2 and less than or equal to ZS_ALIGN | 230 | * This must be power of 2 and less than or equal to ZS_ALIGN |
231 | */ | 231 | */ |
232 | struct link_free { | 232 | struct link_free { |
233 | union { | 233 | union { |
234 | /* | 234 | /* |
235 | * Position of next free chunk (encodes <PFN, obj_idx>) | 235 | * Free object index; |
236 | * It's valid for non-allocated object | 236 | * It's valid for non-allocated object |
237 | */ | 237 | */ |
238 | void *next; | 238 | unsigned long next; |
239 | /* | 239 | /* |
240 | * Handle of allocated object. | 240 | * Handle of allocated object. |
241 | */ | 241 | */ |
@@ -248,6 +248,7 @@ struct zs_pool { | |||
248 | 248 | ||
249 | struct size_class **size_class; | 249 | struct size_class **size_class; |
250 | struct kmem_cache *handle_cachep; | 250 | struct kmem_cache *handle_cachep; |
251 | struct kmem_cache *zspage_cachep; | ||
251 | 252 | ||
252 | atomic_long_t pages_allocated; | 253 | atomic_long_t pages_allocated; |
253 | 254 | ||
@@ -263,16 +264,36 @@ struct zs_pool { | |||
263 | #ifdef CONFIG_ZSMALLOC_STAT | 264 | #ifdef CONFIG_ZSMALLOC_STAT |
264 | struct dentry *stat_dentry; | 265 | struct dentry *stat_dentry; |
265 | #endif | 266 | #endif |
267 | #ifdef CONFIG_COMPACTION | ||
268 | struct inode *inode; | ||
269 | struct work_struct free_work; | ||
270 | #endif | ||
266 | }; | 271 | }; |
267 | 272 | ||
268 | /* | 273 | /* |
269 | * A zspage's class index and fullness group | 274 | * A zspage's class index and fullness group |
270 | * are encoded in its (first)page->mapping | 275 | * are encoded in its (first)page->mapping |
271 | */ | 276 | */ |
272 | #define CLASS_IDX_BITS 28 | 277 | #define FULLNESS_BITS 2 |
273 | #define FULLNESS_BITS 4 | 278 | #define CLASS_BITS 8 |
274 | #define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1) | 279 | #define ISOLATED_BITS 3 |
275 | #define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1) | 280 | #define MAGIC_VAL_BITS 8 |
281 | |||
282 | struct zspage { | ||
283 | struct { | ||
284 | unsigned int fullness:FULLNESS_BITS; | ||
285 | unsigned int class:CLASS_BITS; | ||
286 | unsigned int isolated:ISOLATED_BITS; | ||
287 | unsigned int magic:MAGIC_VAL_BITS; | ||
288 | }; | ||
289 | unsigned int inuse; | ||
290 | unsigned int freeobj; | ||
291 | struct page *first_page; | ||
292 | struct list_head list; /* fullness list */ | ||
293 | #ifdef CONFIG_COMPACTION | ||
294 | rwlock_t lock; | ||
295 | #endif | ||
296 | }; | ||
276 | 297 | ||
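A compilable sketch of the new metadata layout above, with the list and lock members omitted for brevity: the point is that class index and fullness group now sit in bitfields of a separately allocated object rather than being packed into the first page's ->mapping. The field widths are copied from the hunk above; the values set in main() are arbitrary.

/* Off-page zspage metadata, modelled in userspace. */
#include <stdio.h>

#define FULLNESS_BITS	2
#define CLASS_BITS	8
#define ISOLATED_BITS	3
#define MAGIC_VAL_BITS	8
#define ZSPAGE_MAGIC	0x58

struct zspage_meta {
	unsigned int fullness:FULLNESS_BITS;
	unsigned int class:CLASS_BITS;
	unsigned int isolated:ISOLATED_BITS;
	unsigned int magic:MAGIC_VAL_BITS;
	unsigned int inuse;
	unsigned int freeobj;
};

int main(void)
{
	struct zspage_meta z = { .magic = ZSPAGE_MAGIC, .class = 42,
				 .fullness = 1 /* ZS_ALMOST_EMPTY */ };

	printf("sizeof(zspage_meta) = %zu, class=%u fullness=%u magic=%#x\n",
	       sizeof(z), z.class, z.fullness, z.magic);
	return 0;
}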
277 | struct mapping_area { | 298 | struct mapping_area { |
278 | #ifdef CONFIG_PGTABLE_MAPPING | 299 | #ifdef CONFIG_PGTABLE_MAPPING |
@@ -284,29 +305,74 @@ struct mapping_area { | |||
284 | enum zs_mapmode vm_mm; /* mapping mode */ | 305 | enum zs_mapmode vm_mm; /* mapping mode */ |
285 | }; | 306 | }; |
286 | 307 | ||
287 | static int create_handle_cache(struct zs_pool *pool) | 308 | #ifdef CONFIG_COMPACTION |
309 | static int zs_register_migration(struct zs_pool *pool); | ||
310 | static void zs_unregister_migration(struct zs_pool *pool); | ||
311 | static void migrate_lock_init(struct zspage *zspage); | ||
312 | static void migrate_read_lock(struct zspage *zspage); | ||
313 | static void migrate_read_unlock(struct zspage *zspage); | ||
314 | static void kick_deferred_free(struct zs_pool *pool); | ||
315 | static void init_deferred_free(struct zs_pool *pool); | ||
316 | static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); | ||
317 | #else | ||
318 | static int zsmalloc_mount(void) { return 0; } | ||
319 | static void zsmalloc_unmount(void) {} | ||
320 | static int zs_register_migration(struct zs_pool *pool) { return 0; } | ||
321 | static void zs_unregister_migration(struct zs_pool *pool) {} | ||
322 | static void migrate_lock_init(struct zspage *zspage) {} | ||
323 | static void migrate_read_lock(struct zspage *zspage) {} | ||
324 | static void migrate_read_unlock(struct zspage *zspage) {} | ||
325 | static void kick_deferred_free(struct zs_pool *pool) {} | ||
326 | static void init_deferred_free(struct zs_pool *pool) {} | ||
327 | static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} | ||
328 | #endif | ||
329 | |||
330 | static int create_cache(struct zs_pool *pool) | ||
288 | { | 331 | { |
289 | pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, | 332 | pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, |
290 | 0, 0, NULL); | 333 | 0, 0, NULL); |
291 | return pool->handle_cachep ? 0 : 1; | 334 | if (!pool->handle_cachep) |
335 | return 1; | ||
336 | |||
337 | pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage), | ||
338 | 0, 0, NULL); | ||
339 | if (!pool->zspage_cachep) { | ||
340 | kmem_cache_destroy(pool->handle_cachep); | ||
341 | pool->handle_cachep = NULL; | ||
342 | return 1; | ||
343 | } | ||
344 | |||
345 | return 0; | ||
292 | } | 346 | } |
293 | 347 | ||
294 | static void destroy_handle_cache(struct zs_pool *pool) | 348 | static void destroy_cache(struct zs_pool *pool) |
295 | { | 349 | { |
296 | kmem_cache_destroy(pool->handle_cachep); | 350 | kmem_cache_destroy(pool->handle_cachep); |
351 | kmem_cache_destroy(pool->zspage_cachep); | ||
297 | } | 352 | } |
298 | 353 | ||
299 | static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp) | 354 | static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) |
300 | { | 355 | { |
301 | return (unsigned long)kmem_cache_alloc(pool->handle_cachep, | 356 | return (unsigned long)kmem_cache_alloc(pool->handle_cachep, |
302 | gfp & ~__GFP_HIGHMEM); | 357 | gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); |
303 | } | 358 | } |
304 | 359 | ||
305 | static void free_handle(struct zs_pool *pool, unsigned long handle) | 360 | static void cache_free_handle(struct zs_pool *pool, unsigned long handle) |
306 | { | 361 | { |
307 | kmem_cache_free(pool->handle_cachep, (void *)handle); | 362 | kmem_cache_free(pool->handle_cachep, (void *)handle); |
308 | } | 363 | } |
309 | 364 | ||
365 | static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) | ||
366 | { | ||
367 | return kmem_cache_alloc(pool->zspage_cachep, | ||
368 | flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); | ||
369 | }; | ||
370 | |||
371 | static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) | ||
372 | { | ||
373 | kmem_cache_free(pool->zspage_cachep, zspage); | ||
374 | } | ||
375 | |||
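create_cache() above now sets up two kmem caches and unwinds the first one if the second allocation fails, so a pool is never left half-initialized. A userspace sketch of that rollback pattern, with malloc() standing in for kmem_cache_create() and a hypothetical struct pool, is shown below.

/* Paired-allocation rollback, modelled in userspace. */
#include <stdio.h>
#include <stdlib.h>

struct pool {
	void *handle_cachep;
	void *zspage_cachep;
};

static int create_cache(struct pool *pool)
{
	pool->handle_cachep = malloc(64);	/* "zs_handle" cache */
	if (!pool->handle_cachep)
		return 1;

	pool->zspage_cachep = malloc(64);	/* "zspage" cache */
	if (!pool->zspage_cachep) {
		free(pool->handle_cachep);	/* roll back partial setup */
		pool->handle_cachep = NULL;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct pool p = { 0 };

	printf("create_cache -> %d\n", create_cache(&p));
	free(p.handle_cachep);
	free(p.zspage_cachep);
	return 0;
}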
310 | static void record_obj(unsigned long handle, unsigned long obj) | 376 | static void record_obj(unsigned long handle, unsigned long obj) |
311 | { | 377 | { |
312 | /* | 378 | /* |
@@ -409,38 +475,76 @@ static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) | |||
409 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ | 475 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ |
410 | static DEFINE_PER_CPU(struct mapping_area, zs_map_area); | 476 | static DEFINE_PER_CPU(struct mapping_area, zs_map_area); |
411 | 477 | ||
478 | static bool is_zspage_isolated(struct zspage *zspage) | ||
479 | { | ||
480 | return zspage->isolated; | ||
481 | } | ||
482 | |||
412 | static int is_first_page(struct page *page) | 483 | static int is_first_page(struct page *page) |
413 | { | 484 | { |
414 | return PagePrivate(page); | 485 | return PagePrivate(page); |
415 | } | 486 | } |
416 | 487 | ||
417 | static int is_last_page(struct page *page) | 488 | /* Protected by class->lock */ |
489 | static inline int get_zspage_inuse(struct zspage *zspage) | ||
490 | { | ||
491 | return zspage->inuse; | ||
492 | } | ||
493 | |||
494 | static inline void set_zspage_inuse(struct zspage *zspage, int val) | ||
495 | { | ||
496 | zspage->inuse = val; | ||
497 | } | ||
498 | |||
499 | static inline void mod_zspage_inuse(struct zspage *zspage, int val) | ||
500 | { | ||
501 | zspage->inuse += val; | ||
502 | } | ||
503 | |||
504 | static inline struct page *get_first_page(struct zspage *zspage) | ||
505 | { | ||
506 | struct page *first_page = zspage->first_page; | ||
507 | |||
508 | VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); | ||
509 | return first_page; | ||
510 | } | ||
511 | |||
512 | static inline int get_first_obj_offset(struct page *page) | ||
513 | { | ||
514 | return page->units; | ||
515 | } | ||
516 | |||
517 | static inline void set_first_obj_offset(struct page *page, int offset) | ||
518 | { | ||
519 | page->units = offset; | ||
520 | } | ||
521 | |||
522 | static inline unsigned int get_freeobj(struct zspage *zspage) | ||
523 | { | ||
524 | return zspage->freeobj; | ||
525 | } | ||
526 | |||
527 | static inline void set_freeobj(struct zspage *zspage, unsigned int obj) | ||
418 | { | 528 | { |
419 | return PagePrivate2(page); | 529 | zspage->freeobj = obj; |
420 | } | 530 | } |
421 | 531 | ||
422 | static void get_zspage_mapping(struct page *first_page, | 532 | static void get_zspage_mapping(struct zspage *zspage, |
423 | unsigned int *class_idx, | 533 | unsigned int *class_idx, |
424 | enum fullness_group *fullness) | 534 | enum fullness_group *fullness) |
425 | { | 535 | { |
426 | unsigned long m; | 536 | BUG_ON(zspage->magic != ZSPAGE_MAGIC); |
427 | VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); | ||
428 | 537 | ||
429 | m = (unsigned long)first_page->mapping; | 538 | *fullness = zspage->fullness; |
430 | *fullness = m & FULLNESS_MASK; | 539 | *class_idx = zspage->class; |
431 | *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; | ||
432 | } | 540 | } |
433 | 541 | ||
434 | static void set_zspage_mapping(struct page *first_page, | 542 | static void set_zspage_mapping(struct zspage *zspage, |
435 | unsigned int class_idx, | 543 | unsigned int class_idx, |
436 | enum fullness_group fullness) | 544 | enum fullness_group fullness) |
437 | { | 545 | { |
438 | unsigned long m; | 546 | zspage->class = class_idx; |
439 | VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); | 547 | zspage->fullness = fullness; |
440 | |||
441 | m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | | ||
442 | (fullness & FULLNESS_MASK); | ||
443 | first_page->mapping = (struct address_space *)m; | ||
444 | } | 548 | } |
445 | 549 | ||
446 | /* | 550 | /* |
@@ -464,23 +568,19 @@ static int get_size_class_index(int size) | |||
464 | static inline void zs_stat_inc(struct size_class *class, | 568 | static inline void zs_stat_inc(struct size_class *class, |
465 | enum zs_stat_type type, unsigned long cnt) | 569 | enum zs_stat_type type, unsigned long cnt) |
466 | { | 570 | { |
467 | if (type < NR_ZS_STAT_TYPE) | 571 | class->stats.objs[type] += cnt; |
468 | class->stats.objs[type] += cnt; | ||
469 | } | 572 | } |
470 | 573 | ||
471 | static inline void zs_stat_dec(struct size_class *class, | 574 | static inline void zs_stat_dec(struct size_class *class, |
472 | enum zs_stat_type type, unsigned long cnt) | 575 | enum zs_stat_type type, unsigned long cnt) |
473 | { | 576 | { |
474 | if (type < NR_ZS_STAT_TYPE) | 577 | class->stats.objs[type] -= cnt; |
475 | class->stats.objs[type] -= cnt; | ||
476 | } | 578 | } |
477 | 579 | ||
478 | static inline unsigned long zs_stat_get(struct size_class *class, | 580 | static inline unsigned long zs_stat_get(struct size_class *class, |
479 | enum zs_stat_type type) | 581 | enum zs_stat_type type) |
480 | { | 582 | { |
481 | if (type < NR_ZS_STAT_TYPE) | 583 | return class->stats.objs[type]; |
482 | return class->stats.objs[type]; | ||
483 | return 0; | ||
484 | } | 584 | } |
485 | 585 | ||
486 | #ifdef CONFIG_ZSMALLOC_STAT | 586 | #ifdef CONFIG_ZSMALLOC_STAT |
@@ -624,6 +724,7 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool) | |||
624 | } | 724 | } |
625 | #endif | 725 | #endif |
626 | 726 | ||
727 | |||
627 | /* | 728 | /* |
628 | * For each size class, zspages are divided into different groups | 729 | * For each size class, zspages are divided into different groups |
629 | * depending on how "full" they are. This was done so that we could | 730 | * depending on how "full" they are. This was done so that we could |
@@ -631,21 +732,20 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool) | |||
631 | * the pool (not yet implemented). This function returns fullness | 732 | * the pool (not yet implemented). This function returns fullness |
632 | * status of the given page. | 733 | * status of the given page. |
633 | */ | 734 | */ |
634 | static enum fullness_group get_fullness_group(struct page *first_page) | 735 | static enum fullness_group get_fullness_group(struct size_class *class, |
736 | struct zspage *zspage) | ||
635 | { | 737 | { |
636 | int inuse, max_objects; | 738 | int inuse, objs_per_zspage; |
637 | enum fullness_group fg; | 739 | enum fullness_group fg; |
638 | 740 | ||
639 | VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); | 741 | inuse = get_zspage_inuse(zspage); |
640 | 742 | objs_per_zspage = class->objs_per_zspage; | |
641 | inuse = first_page->inuse; | ||
642 | max_objects = first_page->objects; | ||
643 | 743 | ||
644 | if (inuse == 0) | 744 | if (inuse == 0) |
645 | fg = ZS_EMPTY; | 745 | fg = ZS_EMPTY; |
646 | else if (inuse == max_objects) | 746 | else if (inuse == objs_per_zspage) |
647 | fg = ZS_FULL; | 747 | fg = ZS_FULL; |
648 | else if (inuse <= 3 * max_objects / fullness_threshold_frac) | 748 | else if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac) |
649 | fg = ZS_ALMOST_EMPTY; | 749 | fg = ZS_ALMOST_EMPTY; |
650 | else | 750 | else |
651 | fg = ZS_ALMOST_FULL; | 751 | fg = ZS_ALMOST_FULL; |
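As a worked example of the thresholds above (with fullness_threshold_frac = 4, as elsewhere in this file), the sketch below tabulates which group each inuse count lands in for a hypothetical class holding 8 objects per zspage; the class size is made up, only the grouping logic is mirrored.

/* Fullness grouping, reproduced standalone. */
#include <stdio.h>

enum fullness_group { ZS_EMPTY, ZS_ALMOST_EMPTY, ZS_ALMOST_FULL, ZS_FULL };

static const int fullness_threshold_frac = 4;

static enum fullness_group group(int inuse, int objs_per_zspage)
{
	if (inuse == 0)
		return ZS_EMPTY;
	if (inuse == objs_per_zspage)
		return ZS_FULL;
	if (inuse <= 3 * objs_per_zspage / fullness_threshold_frac)
		return ZS_ALMOST_EMPTY;
	return ZS_ALMOST_FULL;
}

int main(void)
{
	static const char *name[] = {
		"ZS_EMPTY", "ZS_ALMOST_EMPTY", "ZS_ALMOST_FULL", "ZS_FULL"
	};
	int objs_per_zspage = 8;

	for (int inuse = 0; inuse <= objs_per_zspage; inuse++)
		printf("inuse=%d -> %s\n", inuse,
		       name[group(inuse, objs_per_zspage)]);
	return 0;
}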
@@ -660,32 +760,25 @@ static enum fullness_group get_fullness_group(struct page *first_page) | |||
660 | * identified by <class, fullness_group>. | 760 | * identified by <class, fullness_group>. |
661 | */ | 761 | */ |
662 | static void insert_zspage(struct size_class *class, | 762 | static void insert_zspage(struct size_class *class, |
663 | enum fullness_group fullness, | 763 | struct zspage *zspage, |
664 | struct page *first_page) | 764 | enum fullness_group fullness) |
665 | { | 765 | { |
666 | struct page **head; | 766 | struct zspage *head; |
667 | |||
668 | VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); | ||
669 | |||
670 | if (fullness >= _ZS_NR_FULLNESS_GROUPS) | ||
671 | return; | ||
672 | |||
673 | zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? | ||
674 | CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); | ||
675 | |||
676 | head = &class->fullness_list[fullness]; | ||
677 | if (!*head) { | ||
678 | *head = first_page; | ||
679 | return; | ||
680 | } | ||
681 | 767 | ||
768 | zs_stat_inc(class, fullness, 1); | ||
769 | head = list_first_entry_or_null(&class->fullness_list[fullness], | ||
770 | struct zspage, list); | ||
682 | /* | 771 | /* |
683 | * We want to see more ZS_FULL pages and less almost | 772 | * We want to see more ZS_FULL pages and less almost empty/full. |
684 | * empty/full. Put pages with higher ->inuse first. | 773 | * Put pages with higher ->inuse first. |
685 | */ | 774 | */ |
686 | list_add_tail(&first_page->lru, &(*head)->lru); | 775 | if (head) { |
687 | if (first_page->inuse >= (*head)->inuse) | 776 | if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) { |
688 | *head = first_page; | 777 | list_add(&zspage->list, &head->list); |
778 | return; | ||
779 | } | ||
780 | } | ||
781 | list_add(&zspage->list, &class->fullness_list[fullness]); | ||
689 | } | 782 | } |
690 | 783 | ||
691 | /* | 784 | /* |
@@ -693,27 +786,14 @@ static void insert_zspage(struct size_class *class, | |||
693 | * by <class, fullness_group>. | 786 | * by <class, fullness_group>. |
694 | */ | 787 | */ |
695 | static void remove_zspage(struct size_class *class, | 788 | static void remove_zspage(struct size_class *class, |
696 | enum fullness_group fullness, | 789 | struct zspage *zspage, |
697 | struct page *first_page) | 790 | enum fullness_group fullness) |
698 | { | 791 | { |
699 | struct page **head; | 792 | VM_BUG_ON(list_empty(&class->fullness_list[fullness])); |
700 | 793 | VM_BUG_ON(is_zspage_isolated(zspage)); | |
701 | VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); | ||
702 | |||
703 | if (fullness >= _ZS_NR_FULLNESS_GROUPS) | ||
704 | return; | ||
705 | |||
706 | head = &class->fullness_list[fullness]; | ||
707 | VM_BUG_ON_PAGE(!*head, first_page); | ||
708 | if (list_empty(&(*head)->lru)) | ||
709 | *head = NULL; | ||
710 | else if (*head == first_page) | ||
711 | *head = (struct page *)list_entry((*head)->lru.next, | ||
712 | struct page, lru); | ||
713 | 794 | ||
714 | list_del_init(&first_page->lru); | 795 | list_del_init(&zspage->list); |
715 | zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? | 796 | zs_stat_dec(class, fullness, 1); |
716 | CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); | ||
717 | } | 797 | } |
718 | 798 | ||
719 | /* | 799 | /* |
@@ -726,19 +806,22 @@ static void remove_zspage(struct size_class *class, | |||
726 | * fullness group. | 806 | * fullness group. |
727 | */ | 807 | */ |
728 | static enum fullness_group fix_fullness_group(struct size_class *class, | 808 | static enum fullness_group fix_fullness_group(struct size_class *class, |
729 | struct page *first_page) | 809 | struct zspage *zspage) |
730 | { | 810 | { |
731 | int class_idx; | 811 | int class_idx; |
732 | enum fullness_group currfg, newfg; | 812 | enum fullness_group currfg, newfg; |
733 | 813 | ||
734 | get_zspage_mapping(first_page, &class_idx, &currfg); | 814 | get_zspage_mapping(zspage, &class_idx, &currfg); |
735 | newfg = get_fullness_group(first_page); | 815 | newfg = get_fullness_group(class, zspage); |
736 | if (newfg == currfg) | 816 | if (newfg == currfg) |
737 | goto out; | 817 | goto out; |
738 | 818 | ||
739 | remove_zspage(class, currfg, first_page); | 819 | if (!is_zspage_isolated(zspage)) { |
740 | insert_zspage(class, newfg, first_page); | 820 | remove_zspage(class, zspage, currfg); |
741 | set_zspage_mapping(first_page, class_idx, newfg); | 821 | insert_zspage(class, zspage, newfg); |
822 | } | ||
823 | |||
824 | set_zspage_mapping(zspage, class_idx, newfg); | ||
742 | 825 | ||
743 | out: | 826 | out: |
744 | return newfg; | 827 | return newfg; |
@@ -780,64 +863,49 @@ static int get_pages_per_zspage(int class_size) | |||
780 | return max_usedpc_order; | 863 | return max_usedpc_order; |
781 | } | 864 | } |
782 | 865 | ||
783 | /* | 866 | static struct zspage *get_zspage(struct page *page) |
784 | * A single 'zspage' is composed of many system pages which are | ||
785 | * linked together using fields in struct page. This function finds | ||
786 | * the first/head page, given any component page of a zspage. | ||
787 | */ | ||
788 | static struct page *get_first_page(struct page *page) | ||
789 | { | 867 | { |
790 | if (is_first_page(page)) | 868 | struct zspage *zspage = (struct zspage *)page->private; |
791 | return page; | 869 | |
792 | else | 870 | BUG_ON(zspage->magic != ZSPAGE_MAGIC); |
793 | return (struct page *)page_private(page); | 871 | return zspage; |
794 | } | 872 | } |
795 | 873 | ||
796 | static struct page *get_next_page(struct page *page) | 874 | static struct page *get_next_page(struct page *page) |
797 | { | 875 | { |
798 | struct page *next; | 876 | if (unlikely(PageHugeObject(page))) |
877 | return NULL; | ||
799 | 878 | ||
800 | if (is_last_page(page)) | 879 | return page->freelist; |
801 | next = NULL; | 880 | } |
802 | else if (is_first_page(page)) | ||
803 | next = (struct page *)page_private(page); | ||
804 | else | ||
805 | next = list_entry(page->lru.next, struct page, lru); | ||
806 | 881 | ||
807 | return next; | 882 | /** |
883 | * obj_to_location - get (<page>, <obj_idx>) from encoded object value | ||
884 | * @page: page object resides in zspage | ||
885 | * @obj_idx: object index | ||
886 | */ | ||
887 | static void obj_to_location(unsigned long obj, struct page **page, | ||
888 | unsigned int *obj_idx) | ||
889 | { | ||
890 | obj >>= OBJ_TAG_BITS; | ||
891 | *page = pfn_to_page(obj >> OBJ_INDEX_BITS); | ||
892 | *obj_idx = (obj & OBJ_INDEX_MASK); | ||
808 | } | 893 | } |
809 | 894 | ||
810 | /* | 895 | /** |
811 | * Encode <page, obj_idx> as a single handle value. | 896 | * location_to_obj - get obj value encoded from (<page>, <obj_idx>) |
812 | * We use the least bit of handle for tagging. | 897 | * @page: page object resides in zspage |
898 | * @obj_idx: object index | ||
813 | */ | 899 | */ |
814 | static void *location_to_obj(struct page *page, unsigned long obj_idx) | 900 | static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) |
815 | { | 901 | { |
816 | unsigned long obj; | 902 | unsigned long obj; |
817 | 903 | ||
818 | if (!page) { | ||
819 | VM_BUG_ON(obj_idx); | ||
820 | return NULL; | ||
821 | } | ||
822 | |||
823 | obj = page_to_pfn(page) << OBJ_INDEX_BITS; | 904 | obj = page_to_pfn(page) << OBJ_INDEX_BITS; |
824 | obj |= ((obj_idx) & OBJ_INDEX_MASK); | 905 | obj |= obj_idx & OBJ_INDEX_MASK; |
825 | obj <<= OBJ_TAG_BITS; | 906 | obj <<= OBJ_TAG_BITS; |
826 | 907 | ||
827 | return (void *)obj; | 908 | return obj; |
828 | } | ||
829 | |||
830 | /* | ||
831 | * Decode <page, obj_idx> pair from the given object handle. We adjust the | ||
832 | * decoded obj_idx back to its original value since it was adjusted in | ||
833 | * location_to_obj(). | ||
834 | */ | ||
835 | static void obj_to_location(unsigned long obj, struct page **page, | ||
836 | unsigned long *obj_idx) | ||
837 | { | ||
838 | obj >>= OBJ_TAG_BITS; | ||
839 | *page = pfn_to_page(obj >> OBJ_INDEX_BITS); | ||
840 | *obj_idx = (obj & OBJ_INDEX_MASK); | ||
841 | } | 909 | } |
842 | 910 | ||
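For illustration, here is a standalone round-trip of the <PFN, obj_idx> encoding mirrored from the two helpers above. OBJ_TAG_BITS and OBJ_INDEX_BITS below are assumed, illustrative widths rather than the kernel's configuration-dependent values; only the shift-and-mask structure matches the code.

/* Encode/decode round-trip for an object handle value. */
#include <stdio.h>
#include <assert.h>

#define OBJ_TAG_BITS	1UL			/* allocated-tag bit	*/
#define OBJ_INDEX_BITS	16UL			/* assumed index width	*/
#define OBJ_INDEX_MASK	((1UL << OBJ_INDEX_BITS) - 1)

static unsigned long location_to_obj(unsigned long pfn, unsigned int obj_idx)
{
	unsigned long obj = pfn << OBJ_INDEX_BITS;

	obj |= obj_idx & OBJ_INDEX_MASK;
	obj <<= OBJ_TAG_BITS;
	return obj;
}

static void obj_to_location(unsigned long obj, unsigned long *pfn,
			    unsigned int *obj_idx)
{
	obj >>= OBJ_TAG_BITS;
	*pfn = obj >> OBJ_INDEX_BITS;
	*obj_idx = obj & OBJ_INDEX_MASK;
}

int main(void)
{
	unsigned long pfn;
	unsigned int idx;
	unsigned long obj = location_to_obj(0x1234, 7);

	obj_to_location(obj, &pfn, &idx);
	assert(pfn == 0x1234 && idx == 7);
	printf("obj=%#lx -> pfn=%#lx idx=%u\n", obj, pfn, idx);
	return 0;
}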
843 | static unsigned long handle_to_obj(unsigned long handle) | 911 | static unsigned long handle_to_obj(unsigned long handle) |
@@ -845,109 +913,147 @@ static unsigned long handle_to_obj(unsigned long handle) | |||
845 | return *(unsigned long *)handle; | 913 | return *(unsigned long *)handle; |
846 | } | 914 | } |
847 | 915 | ||
848 | static unsigned long obj_to_head(struct size_class *class, struct page *page, | 916 | static unsigned long obj_to_head(struct page *page, void *obj) |
849 | void *obj) | ||
850 | { | 917 | { |
851 | if (class->huge) { | 918 | if (unlikely(PageHugeObject(page))) { |
852 | VM_BUG_ON_PAGE(!is_first_page(page), page); | 919 | VM_BUG_ON_PAGE(!is_first_page(page), page); |
853 | return page_private(page); | 920 | return page->index; |
854 | } else | 921 | } else |
855 | return *(unsigned long *)obj; | 922 | return *(unsigned long *)obj; |
856 | } | 923 | } |
857 | 924 | ||
858 | static unsigned long obj_idx_to_offset(struct page *page, | 925 | static inline int testpin_tag(unsigned long handle) |
859 | unsigned long obj_idx, int class_size) | ||
860 | { | 926 | { |
861 | unsigned long off = 0; | 927 | return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); |
862 | |||
863 | if (!is_first_page(page)) | ||
864 | off = page->index; | ||
865 | |||
866 | return off + obj_idx * class_size; | ||
867 | } | 928 | } |
868 | 929 | ||
869 | static inline int trypin_tag(unsigned long handle) | 930 | static inline int trypin_tag(unsigned long handle) |
870 | { | 931 | { |
871 | unsigned long *ptr = (unsigned long *)handle; | 932 | return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); |
872 | |||
873 | return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); | ||
874 | } | 933 | } |
875 | 934 | ||
876 | static void pin_tag(unsigned long handle) | 935 | static void pin_tag(unsigned long handle) |
877 | { | 936 | { |
878 | while (!trypin_tag(handle)); | 937 | bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); |
879 | } | 938 | } |
880 | 939 | ||
881 | static void unpin_tag(unsigned long handle) | 940 | static void unpin_tag(unsigned long handle) |
882 | { | 941 | { |
883 | unsigned long *ptr = (unsigned long *)handle; | 942 | bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); |
884 | |||
885 | clear_bit_unlock(HANDLE_PIN_BIT, ptr); | ||
886 | } | 943 | } |
887 | 944 | ||
888 | static void reset_page(struct page *page) | 945 | static void reset_page(struct page *page) |
889 | { | 946 | { |
947 | __ClearPageMovable(page); | ||
890 | clear_bit(PG_private, &page->flags); | 948 | clear_bit(PG_private, &page->flags); |
891 | clear_bit(PG_private_2, &page->flags); | 949 | clear_bit(PG_private_2, &page->flags); |
892 | set_page_private(page, 0); | 950 | set_page_private(page, 0); |
893 | page->mapping = NULL; | ||
894 | page->freelist = NULL; | ||
895 | page_mapcount_reset(page); | 951 | page_mapcount_reset(page); |
952 | ClearPageHugeObject(page); | ||
953 | page->freelist = NULL; | ||
954 | } | ||
955 | |||
956 | /* | ||
957 | * To prevent zspage destroy during migration, zspage freeing should | ||
958 | * hold locks of all pages in the zspage. | ||
959 | */ | ||
960 | void lock_zspage(struct zspage *zspage) | ||
961 | { | ||
962 | struct page *page = get_first_page(zspage); | ||
963 | |||
964 | do { | ||
965 | lock_page(page); | ||
966 | } while ((page = get_next_page(page)) != NULL); | ||
896 | } | 967 | } |
897 | 968 | ||
898 | static void free_zspage(struct page *first_page) | 969 | int trylock_zspage(struct zspage *zspage) |
899 | { | 970 | { |
900 | struct page *nextp, *tmp, *head_extra; | 971 | struct page *cursor, *fail; |
901 | 972 | ||
902 | VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); | 973 | for (cursor = get_first_page(zspage); cursor != NULL; cursor = |
903 | VM_BUG_ON_PAGE(first_page->inuse, first_page); | 974 | get_next_page(cursor)) { |
975 | if (!trylock_page(cursor)) { | ||
976 | fail = cursor; | ||
977 | goto unlock; | ||
978 | } | ||
979 | } | ||
904 | 980 | ||
905 | head_extra = (struct page *)page_private(first_page); | 981 | return 1; |
982 | unlock: | ||
983 | for (cursor = get_first_page(zspage); cursor != fail; cursor = | ||
984 | get_next_page(cursor)) | ||
985 | unlock_page(cursor); | ||
906 | 986 | ||
907 | reset_page(first_page); | 987 | return 0; |
908 | __free_page(first_page); | 988 | } |
909 | 989 | ||
910 | /* zspage with only 1 system page */ | 990 | static void __free_zspage(struct zs_pool *pool, struct size_class *class, |
911 | if (!head_extra) | 991 | struct zspage *zspage) |
912 | return; | 992 | { |
993 | struct page *page, *next; | ||
994 | enum fullness_group fg; | ||
995 | unsigned int class_idx; | ||
913 | 996 | ||
914 | list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { | 997 | get_zspage_mapping(zspage, &class_idx, &fg); |
915 | list_del(&nextp->lru); | 998 | |
916 | reset_page(nextp); | 999 | assert_spin_locked(&class->lock); |
917 | __free_page(nextp); | 1000 | |
1001 | VM_BUG_ON(get_zspage_inuse(zspage)); | ||
1002 | VM_BUG_ON(fg != ZS_EMPTY); | ||
1003 | |||
1004 | next = page = get_first_page(zspage); | ||
1005 | do { | ||
1006 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
1007 | next = get_next_page(page); | ||
1008 | reset_page(page); | ||
1009 | unlock_page(page); | ||
1010 | dec_zone_page_state(page, NR_ZSPAGES); | ||
1011 | put_page(page); | ||
1012 | page = next; | ||
1013 | } while (page != NULL); | ||
1014 | |||
1015 | cache_free_zspage(pool, zspage); | ||
1016 | |||
1017 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | ||
1018 | class->size, class->pages_per_zspage)); | ||
1019 | atomic_long_sub(class->pages_per_zspage, | ||
1020 | &pool->pages_allocated); | ||
1021 | } | ||
1022 | |||
1023 | static void free_zspage(struct zs_pool *pool, struct size_class *class, | ||
1024 | struct zspage *zspage) | ||
1025 | { | ||
1026 | VM_BUG_ON(get_zspage_inuse(zspage)); | ||
1027 | VM_BUG_ON(list_empty(&zspage->list)); | ||
1028 | |||
1029 | if (!trylock_zspage(zspage)) { | ||
1030 | kick_deferred_free(pool); | ||
1031 | return; | ||
918 | } | 1032 | } |
919 | reset_page(head_extra); | 1033 | |
920 | __free_page(head_extra); | 1034 | remove_zspage(class, zspage, ZS_EMPTY); |
1035 | __free_zspage(pool, class, zspage); | ||
921 | } | 1036 | } |
922 | 1037 | ||
923 | /* Initialize a newly allocated zspage */ | 1038 | /* Initialize a newly allocated zspage */ |
924 | static void init_zspage(struct size_class *class, struct page *first_page) | 1039 | static void init_zspage(struct size_class *class, struct zspage *zspage) |
925 | { | 1040 | { |
1041 | unsigned int freeobj = 1; | ||
926 | unsigned long off = 0; | 1042 | unsigned long off = 0; |
927 | struct page *page = first_page; | 1043 | struct page *page = get_first_page(zspage); |
928 | |||
929 | VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); | ||
930 | 1044 | ||
931 | while (page) { | 1045 | while (page) { |
932 | struct page *next_page; | 1046 | struct page *next_page; |
933 | struct link_free *link; | 1047 | struct link_free *link; |
934 | unsigned int i = 1; | ||
935 | void *vaddr; | 1048 | void *vaddr; |
936 | 1049 | ||
937 | /* | 1050 | set_first_obj_offset(page, off); |
938 | * page->index stores offset of first object starting | ||
939 | * in the page. For the first page, this is always 0, | ||
940 | * so we use first_page->index (aka ->freelist) to store | ||
941 | * head of corresponding zspage's freelist. | ||
942 | */ | ||
943 | if (page != first_page) | ||
944 | page->index = off; | ||
945 | 1051 | ||
946 | vaddr = kmap_atomic(page); | 1052 | vaddr = kmap_atomic(page); |
947 | link = (struct link_free *)vaddr + off / sizeof(*link); | 1053 | link = (struct link_free *)vaddr + off / sizeof(*link); |
948 | 1054 | ||
949 | while ((off += class->size) < PAGE_SIZE) { | 1055 | while ((off += class->size) < PAGE_SIZE) { |
950 | link->next = location_to_obj(page, i++); | 1056 | link->next = freeobj++ << OBJ_TAG_BITS; |
951 | link += class->size / sizeof(*link); | 1057 | link += class->size / sizeof(*link); |
952 | } | 1058 | } |
953 | 1059 | ||
@@ -957,87 +1063,112 @@ static void init_zspage(struct size_class *class, struct page *first_page) | |||
957 | * page (if present) | 1063 | * page (if present) |
958 | */ | 1064 | */ |
959 | next_page = get_next_page(page); | 1065 | next_page = get_next_page(page); |
960 | link->next = location_to_obj(next_page, 0); | 1066 | if (next_page) { |
1067 | link->next = freeobj++ << OBJ_TAG_BITS; | ||
1068 | } else { | ||
1069 | /* | ||
1070 | * Reset OBJ_TAG_BITS bit to last link to tell | ||
1071 | * whether it's allocated object or not. | ||
1072 | */ | ||
1073 | link->next = -1 << OBJ_TAG_BITS; | ||
1074 | } | ||
961 | kunmap_atomic(vaddr); | 1075 | kunmap_atomic(vaddr); |
962 | page = next_page; | 1076 | page = next_page; |
963 | off %= PAGE_SIZE; | 1077 | off %= PAGE_SIZE; |
964 | } | 1078 | } |
1079 | |||
1080 | set_freeobj(zspage, 0); | ||
965 | } | 1081 | } |
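The rewritten init_zspage() above seeds the freelist with plain object indices (shifted by OBJ_TAG_BITS) instead of encoded <PFN, obj_idx> values. Below is a simplified, array-backed sketch of such a freelist and of how an allocation pops its head, as obj_malloc() does later in this patch; the four-slot array and the fake handle are illustrative only.

/* Index-based freelist, modelled as a flat array. */
#include <stdio.h>

#define OBJ_TAG_BITS		1
#define OBJ_ALLOCATED_TAG	1UL

int main(void)
{
	unsigned long slot[4];		/* link_free.next of four objects */
	unsigned int freeobj = 0;	/* zspage->freeobj		  */
	unsigned int nr = 4;

	/* init_zspage(): chain 0 -> 1 -> 2 -> 3, terminate the last link */
	for (unsigned int i = 0; i < nr - 1; i++)
		slot[i] = (unsigned long)(i + 1) << OBJ_TAG_BITS;
	slot[nr - 1] = (unsigned long)-1 << OBJ_TAG_BITS;

	/* obj_malloc(): pop the head twice, stamp the slot as allocated */
	for (int n = 0; n < 2; n++) {
		unsigned int obj = freeobj;

		freeobj = slot[obj] >> OBJ_TAG_BITS;
		slot[obj] = 0xdead0000UL | OBJ_ALLOCATED_TAG; /* fake handle */
		printf("allocated obj_idx %u, next free is %u\n", obj, freeobj);
	}
	return 0;
}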
966 | 1082 | ||
967 | /* | 1083 | static void create_page_chain(struct size_class *class, struct zspage *zspage, |
968 | * Allocate a zspage for the given size class | 1084 | struct page *pages[]) |
969 | */ | ||
970 | static struct page *alloc_zspage(struct size_class *class, gfp_t flags) | ||
971 | { | 1085 | { |
972 | int i, error; | 1086 | int i; |
973 | struct page *first_page = NULL, *uninitialized_var(prev_page); | 1087 | struct page *page; |
1088 | struct page *prev_page = NULL; | ||
1089 | int nr_pages = class->pages_per_zspage; | ||
974 | 1090 | ||
975 | /* | 1091 | /* |
976 | * Allocate individual pages and link them together as: | 1092 | * Allocate individual pages and link them together as: |
977 | * 1. first page->private = first sub-page | 1093 | * 1. all pages are linked together using page->freelist |
978 | * 2. all sub-pages are linked together using page->lru | 1094 | * 2. each sub-page point to zspage using page->private |
979 | * 3. each sub-page is linked to the first page using page->private | ||
980 | * | 1095 | * |
981 | * For each size class, First/Head pages are linked together using | 1096 | * we set PG_private to identify the first page (i.e. no other sub-page |
982 | * page->lru. Also, we set PG_private to identify the first page | 1097 | * has this flag set) and PG_private_2 to identify the last page. |
983 | * (i.e. no other sub-page has this flag set) and PG_private_2 to | ||
984 | * identify the last page. | ||
985 | */ | 1098 | */ |
986 | error = -ENOMEM; | 1099 | for (i = 0; i < nr_pages; i++) { |
987 | for (i = 0; i < class->pages_per_zspage; i++) { | 1100 | page = pages[i]; |
988 | struct page *page; | 1101 | set_page_private(page, (unsigned long)zspage); |
989 | 1102 | page->freelist = NULL; | |
990 | page = alloc_page(flags); | 1103 | if (i == 0) { |
991 | if (!page) | 1104 | zspage->first_page = page; |
992 | goto cleanup; | ||
993 | |||
994 | INIT_LIST_HEAD(&page->lru); | ||
995 | if (i == 0) { /* first page */ | ||
996 | SetPagePrivate(page); | 1105 | SetPagePrivate(page); |
997 | set_page_private(page, 0); | 1106 | if (unlikely(class->objs_per_zspage == 1 && |
998 | first_page = page; | 1107 | class->pages_per_zspage == 1)) |
999 | first_page->inuse = 0; | 1108 | SetPageHugeObject(page); |
1109 | } else { | ||
1110 | prev_page->freelist = page; | ||
1000 | } | 1111 | } |
1001 | if (i == 1) | 1112 | if (i == nr_pages - 1) |
1002 | set_page_private(first_page, (unsigned long)page); | ||
1003 | if (i >= 1) | ||
1004 | set_page_private(page, (unsigned long)first_page); | ||
1005 | if (i >= 2) | ||
1006 | list_add(&page->lru, &prev_page->lru); | ||
1007 | if (i == class->pages_per_zspage - 1) /* last page */ | ||
1008 | SetPagePrivate2(page); | 1113 | SetPagePrivate2(page); |
1009 | prev_page = page; | 1114 | prev_page = page; |
1010 | } | 1115 | } |
1116 | } | ||
1117 | |||
1118 | /* | ||
1119 | * Allocate a zspage for the given size class | ||
1120 | */ | ||
1121 | static struct zspage *alloc_zspage(struct zs_pool *pool, | ||
1122 | struct size_class *class, | ||
1123 | gfp_t gfp) | ||
1124 | { | ||
1125 | int i; | ||
1126 | struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE]; | ||
1127 | struct zspage *zspage = cache_alloc_zspage(pool, gfp); | ||
1128 | |||
1129 | if (!zspage) | ||
1130 | return NULL; | ||
1011 | 1131 | ||
1012 | init_zspage(class, first_page); | 1132 | memset(zspage, 0, sizeof(struct zspage)); |
1133 | zspage->magic = ZSPAGE_MAGIC; | ||
1134 | migrate_lock_init(zspage); | ||
1013 | 1135 | ||
1014 | first_page->freelist = location_to_obj(first_page, 0); | 1136 | for (i = 0; i < class->pages_per_zspage; i++) { |
1015 | /* Maximum number of objects we can store in this zspage */ | 1137 | struct page *page; |
1016 | first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; | ||
1017 | 1138 | ||
1018 | error = 0; /* Success */ | 1139 | page = alloc_page(gfp); |
1140 | if (!page) { | ||
1141 | while (--i >= 0) { | ||
1142 | dec_zone_page_state(pages[i], NR_ZSPAGES); | ||
1143 | __free_page(pages[i]); | ||
1144 | } | ||
1145 | cache_free_zspage(pool, zspage); | ||
1146 | return NULL; | ||
1147 | } | ||
1019 | 1148 | ||
1020 | cleanup: | 1149 | inc_zone_page_state(page, NR_ZSPAGES); |
1021 | if (unlikely(error) && first_page) { | 1150 | pages[i] = page; |
1022 | free_zspage(first_page); | ||
1023 | first_page = NULL; | ||
1024 | } | 1151 | } |
1025 | 1152 | ||
1026 | return first_page; | 1153 | create_page_chain(class, zspage, pages); |
1154 | init_zspage(class, zspage); | ||
1155 | |||
1156 | return zspage; | ||
1027 | } | 1157 | } |
1028 | 1158 | ||
1029 | static struct page *find_get_zspage(struct size_class *class) | 1159 | static struct zspage *find_get_zspage(struct size_class *class) |
1030 | { | 1160 | { |
1031 | int i; | 1161 | int i; |
1032 | struct page *page; | 1162 | struct zspage *zspage; |
1033 | 1163 | ||
1034 | for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { | 1164 | for (i = ZS_ALMOST_FULL; i >= ZS_EMPTY; i--) { |
1035 | page = class->fullness_list[i]; | 1165 | zspage = list_first_entry_or_null(&class->fullness_list[i], |
1036 | if (page) | 1166 | struct zspage, list); |
1167 | if (zspage) | ||
1037 | break; | 1168 | break; |
1038 | } | 1169 | } |
1039 | 1170 | ||
1040 | return page; | 1171 | return zspage; |
1041 | } | 1172 | } |
1042 | 1173 | ||
1043 | #ifdef CONFIG_PGTABLE_MAPPING | 1174 | #ifdef CONFIG_PGTABLE_MAPPING |
@@ -1242,11 +1373,9 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) | |||
1242 | return true; | 1373 | return true; |
1243 | } | 1374 | } |
1244 | 1375 | ||
1245 | static bool zspage_full(struct page *first_page) | 1376 | static bool zspage_full(struct size_class *class, struct zspage *zspage) |
1246 | { | 1377 | { |
1247 | VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); | 1378 | return get_zspage_inuse(zspage) == class->objs_per_zspage; |
1248 | |||
1249 | return first_page->inuse == first_page->objects; | ||
1250 | } | 1379 | } |
1251 | 1380 | ||
1252 | unsigned long zs_get_total_pages(struct zs_pool *pool) | 1381 | unsigned long zs_get_total_pages(struct zs_pool *pool) |
@@ -1272,8 +1401,10 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages); | |||
1272 | void *zs_map_object(struct zs_pool *pool, unsigned long handle, | 1401 | void *zs_map_object(struct zs_pool *pool, unsigned long handle, |
1273 | enum zs_mapmode mm) | 1402 | enum zs_mapmode mm) |
1274 | { | 1403 | { |
1404 | struct zspage *zspage; | ||
1275 | struct page *page; | 1405 | struct page *page; |
1276 | unsigned long obj, obj_idx, off; | 1406 | unsigned long obj, off; |
1407 | unsigned int obj_idx; | ||
1277 | 1408 | ||
1278 | unsigned int class_idx; | 1409 | unsigned int class_idx; |
1279 | enum fullness_group fg; | 1410 | enum fullness_group fg; |
@@ -1294,9 +1425,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
1294 | 1425 | ||
1295 | obj = handle_to_obj(handle); | 1426 | obj = handle_to_obj(handle); |
1296 | obj_to_location(obj, &page, &obj_idx); | 1427 | obj_to_location(obj, &page, &obj_idx); |
1297 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | 1428 | zspage = get_zspage(page); |
1429 | |||
1430 | /* migration cannot move any subpage in this zspage */ | ||
1431 | migrate_read_lock(zspage); | ||
1432 | |||
1433 | get_zspage_mapping(zspage, &class_idx, &fg); | ||
1298 | class = pool->size_class[class_idx]; | 1434 | class = pool->size_class[class_idx]; |
1299 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1435 | off = (class->size * obj_idx) & ~PAGE_MASK; |
1300 | 1436 | ||
1301 | area = &get_cpu_var(zs_map_area); | 1437 | area = &get_cpu_var(zs_map_area); |
1302 | area->vm_mm = mm; | 1438 | area->vm_mm = mm; |
@@ -1314,7 +1450,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
1314 | 1450 | ||
1315 | ret = __zs_map_object(area, pages, off, class->size); | 1451 | ret = __zs_map_object(area, pages, off, class->size); |
1316 | out: | 1452 | out: |
1317 | if (!class->huge) | 1453 | if (likely(!PageHugeObject(page))) |
1318 | ret += ZS_HANDLE_SIZE; | 1454 | ret += ZS_HANDLE_SIZE; |
1319 | 1455 | ||
1320 | return ret; | 1456 | return ret; |
@@ -1323,8 +1459,10 @@ EXPORT_SYMBOL_GPL(zs_map_object); | |||
1323 | 1459 | ||
1324 | void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | 1460 | void zs_unmap_object(struct zs_pool *pool, unsigned long handle) |
1325 | { | 1461 | { |
1462 | struct zspage *zspage; | ||
1326 | struct page *page; | 1463 | struct page *page; |
1327 | unsigned long obj, obj_idx, off; | 1464 | unsigned long obj, off; |
1465 | unsigned int obj_idx; | ||
1328 | 1466 | ||
1329 | unsigned int class_idx; | 1467 | unsigned int class_idx; |
1330 | enum fullness_group fg; | 1468 | enum fullness_group fg; |
@@ -1333,9 +1471,10 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
1333 | 1471 | ||
1334 | obj = handle_to_obj(handle); | 1472 | obj = handle_to_obj(handle); |
1335 | obj_to_location(obj, &page, &obj_idx); | 1473 | obj_to_location(obj, &page, &obj_idx); |
1336 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | 1474 | zspage = get_zspage(page); |
1475 | get_zspage_mapping(zspage, &class_idx, &fg); | ||
1337 | class = pool->size_class[class_idx]; | 1476 | class = pool->size_class[class_idx]; |
1338 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1477 | off = (class->size * obj_idx) & ~PAGE_MASK; |
1339 | 1478 | ||
1340 | area = this_cpu_ptr(&zs_map_area); | 1479 | area = this_cpu_ptr(&zs_map_area); |
1341 | if (off + class->size <= PAGE_SIZE) | 1480 | if (off + class->size <= PAGE_SIZE) |
@@ -1350,38 +1489,50 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
1350 | __zs_unmap_object(area, pages, off, class->size); | 1489 | __zs_unmap_object(area, pages, off, class->size); |
1351 | } | 1490 | } |
1352 | put_cpu_var(zs_map_area); | 1491 | put_cpu_var(zs_map_area); |
1492 | |||
1493 | migrate_read_unlock(zspage); | ||
1353 | unpin_tag(handle); | 1494 | unpin_tag(handle); |
1354 | } | 1495 | } |
1355 | EXPORT_SYMBOL_GPL(zs_unmap_object); | 1496 | EXPORT_SYMBOL_GPL(zs_unmap_object); |
1356 | 1497 | ||
1357 | static unsigned long obj_malloc(struct size_class *class, | 1498 | static unsigned long obj_malloc(struct size_class *class, |
1358 | struct page *first_page, unsigned long handle) | 1499 | struct zspage *zspage, unsigned long handle) |
1359 | { | 1500 | { |
1501 | int i, nr_page, offset; | ||
1360 | unsigned long obj; | 1502 | unsigned long obj; |
1361 | struct link_free *link; | 1503 | struct link_free *link; |
1362 | 1504 | ||
1363 | struct page *m_page; | 1505 | struct page *m_page; |
1364 | unsigned long m_objidx, m_offset; | 1506 | unsigned long m_offset; |
1365 | void *vaddr; | 1507 | void *vaddr; |
1366 | 1508 | ||
1367 | handle |= OBJ_ALLOCATED_TAG; | 1509 | handle |= OBJ_ALLOCATED_TAG; |
1368 | obj = (unsigned long)first_page->freelist; | 1510 | obj = get_freeobj(zspage); |
1369 | obj_to_location(obj, &m_page, &m_objidx); | 1511 | |
1370 | m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); | 1512 | offset = obj * class->size; |
1513 | nr_page = offset >> PAGE_SHIFT; | ||
1514 | m_offset = offset & ~PAGE_MASK; | ||
1515 | m_page = get_first_page(zspage); | ||
1516 | |||
1517 | for (i = 0; i < nr_page; i++) | ||
1518 | m_page = get_next_page(m_page); | ||
1371 | 1519 | ||
1372 | vaddr = kmap_atomic(m_page); | 1520 | vaddr = kmap_atomic(m_page); |
1373 | link = (struct link_free *)vaddr + m_offset / sizeof(*link); | 1521 | link = (struct link_free *)vaddr + m_offset / sizeof(*link); |
1374 | first_page->freelist = link->next; | 1522 | set_freeobj(zspage, link->next >> OBJ_TAG_BITS); |
1375 | if (!class->huge) | 1523 | if (likely(!PageHugeObject(m_page))) |
1376 | /* record handle in the header of allocated chunk */ | 1524 | /* record handle in the header of allocated chunk */ |
1377 | link->handle = handle; | 1525 | link->handle = handle; |
1378 | else | 1526 | else |
1379 | /* record handle in first_page->private */ | 1527 | /* record handle to page->index */ |
1380 | set_page_private(first_page, handle); | 1528 | zspage->first_page->index = handle; |
1529 | |||
1381 | kunmap_atomic(vaddr); | 1530 | kunmap_atomic(vaddr); |
1382 | first_page->inuse++; | 1531 | mod_zspage_inuse(zspage, 1); |
1383 | zs_stat_inc(class, OBJ_USED, 1); | 1532 | zs_stat_inc(class, OBJ_USED, 1); |
1384 | 1533 | ||
1534 | obj = location_to_obj(m_page, obj); | ||
1535 | |||
1385 | return obj; | 1536 | return obj; |
1386 | } | 1537 | } |
1387 | 1538 | ||
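Since the in-page offset is now derived purely from the object index and the class size, here is a small worked example of the arithmetic used in obj_malloc() above. The class size of 2448 bytes and the 3-page zspage are arbitrary illustrative values; the example shows an object landing partway into the second component page.

/* (obj_idx, class_size) -> (component page, in-page offset). */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long class_size = 2448;
	unsigned long objs_per_zspage = 3 * PAGE_SIZE / class_size;	/* 5 */

	for (unsigned long obj = 0; obj < objs_per_zspage; obj++) {
		unsigned long offset  = obj * class_size;
		unsigned long nr_page = offset >> PAGE_SHIFT;
		unsigned long m_off   = offset & ~PAGE_MASK;

		printf("obj %lu -> page %lu, offset %lu\n", obj, nr_page, m_off);
	}
	return 0;
}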
@@ -1399,12 +1550,13 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) | |||
1399 | { | 1550 | { |
1400 | unsigned long handle, obj; | 1551 | unsigned long handle, obj; |
1401 | struct size_class *class; | 1552 | struct size_class *class; |
1402 | struct page *first_page; | 1553 | enum fullness_group newfg; |
1554 | struct zspage *zspage; | ||
1403 | 1555 | ||
1404 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) | 1556 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) |
1405 | return 0; | 1557 | return 0; |
1406 | 1558 | ||
1407 | handle = alloc_handle(pool, gfp); | 1559 | handle = cache_alloc_handle(pool, gfp); |
1408 | if (!handle) | 1560 | if (!handle) |
1409 | return 0; | 1561 | return 0; |
1410 | 1562 | ||
@@ -1413,29 +1565,38 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) | |||
1413 | class = pool->size_class[get_size_class_index(size)]; | 1565 | class = pool->size_class[get_size_class_index(size)]; |
1414 | 1566 | ||
1415 | spin_lock(&class->lock); | 1567 | spin_lock(&class->lock); |
1416 | first_page = find_get_zspage(class); | 1568 | zspage = find_get_zspage(class); |
1417 | 1569 | if (likely(zspage)) { | |
1418 | if (!first_page) { | 1570 | obj = obj_malloc(class, zspage, handle); |
1571 | /* Now move the zspage to another fullness group, if required */ | ||
1572 | fix_fullness_group(class, zspage); | ||
1573 | record_obj(handle, obj); | ||
1419 | spin_unlock(&class->lock); | 1574 | spin_unlock(&class->lock); |
1420 | first_page = alloc_zspage(class, gfp); | ||
1421 | if (unlikely(!first_page)) { | ||
1422 | free_handle(pool, handle); | ||
1423 | return 0; | ||
1424 | } | ||
1425 | 1575 | ||
1426 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); | 1576 | return handle; |
1427 | atomic_long_add(class->pages_per_zspage, | 1577 | } |
1428 | &pool->pages_allocated); | ||
1429 | 1578 | ||
1430 | spin_lock(&class->lock); | 1579 | spin_unlock(&class->lock); |
1431 | zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | 1580 | |
1432 | class->size, class->pages_per_zspage)); | 1581 | zspage = alloc_zspage(pool, class, gfp); |
1582 | if (!zspage) { | ||
1583 | cache_free_handle(pool, handle); | ||
1584 | return 0; | ||
1433 | } | 1585 | } |
1434 | 1586 | ||
1435 | obj = obj_malloc(class, first_page, handle); | 1587 | spin_lock(&class->lock); |
1436 | /* Now move the zspage to another fullness group, if required */ | 1588 | obj = obj_malloc(class, zspage, handle); |
1437 | fix_fullness_group(class, first_page); | 1589 | newfg = get_fullness_group(class, zspage); |
1590 | insert_zspage(class, zspage, newfg); | ||
1591 | set_zspage_mapping(zspage, class->index, newfg); | ||
1438 | record_obj(handle, obj); | 1592 | record_obj(handle, obj); |
1593 | atomic_long_add(class->pages_per_zspage, | ||
1594 | &pool->pages_allocated); | ||
1595 | zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | ||
1596 | class->size, class->pages_per_zspage)); | ||
1597 | |||
1598 | /* We completely set up zspage so mark them as movable */ | ||
1599 | SetZsPageMovable(pool, zspage); | ||
1439 | spin_unlock(&class->lock); | 1600 | spin_unlock(&class->lock); |
1440 | 1601 | ||
1441 | return handle; | 1602 | return handle; |
@@ -1445,36 +1606,38 @@ EXPORT_SYMBOL_GPL(zs_malloc); | |||
1445 | static void obj_free(struct size_class *class, unsigned long obj) | 1606 | static void obj_free(struct size_class *class, unsigned long obj) |
1446 | { | 1607 | { |
1447 | struct link_free *link; | 1608 | struct link_free *link; |
1448 | struct page *first_page, *f_page; | 1609 | struct zspage *zspage; |
1449 | unsigned long f_objidx, f_offset; | 1610 | struct page *f_page; |
1611 | unsigned long f_offset; | ||
1612 | unsigned int f_objidx; | ||
1450 | void *vaddr; | 1613 | void *vaddr; |
1451 | 1614 | ||
1452 | obj &= ~OBJ_ALLOCATED_TAG; | 1615 | obj &= ~OBJ_ALLOCATED_TAG; |
1453 | obj_to_location(obj, &f_page, &f_objidx); | 1616 | obj_to_location(obj, &f_page, &f_objidx); |
1454 | first_page = get_first_page(f_page); | 1617 | f_offset = (class->size * f_objidx) & ~PAGE_MASK; |
1455 | 1618 | zspage = get_zspage(f_page); | |
1456 | f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); | ||
1457 | 1619 | ||
1458 | vaddr = kmap_atomic(f_page); | 1620 | vaddr = kmap_atomic(f_page); |
1459 | 1621 | ||
1460 | /* Insert this object in containing zspage's freelist */ | 1622 | /* Insert this object in containing zspage's freelist */ |
1461 | link = (struct link_free *)(vaddr + f_offset); | 1623 | link = (struct link_free *)(vaddr + f_offset); |
1462 | link->next = first_page->freelist; | 1624 | link->next = get_freeobj(zspage) << OBJ_TAG_BITS; |
1463 | if (class->huge) | ||
1464 | set_page_private(first_page, 0); | ||
1465 | kunmap_atomic(vaddr); | 1625 | kunmap_atomic(vaddr); |
1466 | first_page->freelist = (void *)obj; | 1626 | set_freeobj(zspage, f_objidx); |
1467 | first_page->inuse--; | 1627 | mod_zspage_inuse(zspage, -1); |
1468 | zs_stat_dec(class, OBJ_USED, 1); | 1628 | zs_stat_dec(class, OBJ_USED, 1); |
1469 | } | 1629 | } |
1470 | 1630 | ||
1471 | void zs_free(struct zs_pool *pool, unsigned long handle) | 1631 | void zs_free(struct zs_pool *pool, unsigned long handle) |
1472 | { | 1632 | { |
1473 | struct page *first_page, *f_page; | 1633 | struct zspage *zspage; |
1474 | unsigned long obj, f_objidx; | 1634 | struct page *f_page; |
1635 | unsigned long obj; | ||
1636 | unsigned int f_objidx; | ||
1475 | int class_idx; | 1637 | int class_idx; |
1476 | struct size_class *class; | 1638 | struct size_class *class; |
1477 | enum fullness_group fullness; | 1639 | enum fullness_group fullness; |
1640 | bool isolated; | ||
1478 | 1641 | ||
1479 | if (unlikely(!handle)) | 1642 | if (unlikely(!handle)) |
1480 | return; | 1643 | return; |
@@ -1482,25 +1645,31 @@ void zs_free(struct zs_pool *pool, unsigned long handle) | |||
1482 | pin_tag(handle); | 1645 | pin_tag(handle); |
1483 | obj = handle_to_obj(handle); | 1646 | obj = handle_to_obj(handle); |
1484 | obj_to_location(obj, &f_page, &f_objidx); | 1647 | obj_to_location(obj, &f_page, &f_objidx); |
1485 | first_page = get_first_page(f_page); | 1648 | zspage = get_zspage(f_page); |
1486 | 1649 | ||
1487 | get_zspage_mapping(first_page, &class_idx, &fullness); | 1650 | migrate_read_lock(zspage); |
1651 | |||
1652 | get_zspage_mapping(zspage, &class_idx, &fullness); | ||
1488 | class = pool->size_class[class_idx]; | 1653 | class = pool->size_class[class_idx]; |
1489 | 1654 | ||
1490 | spin_lock(&class->lock); | 1655 | spin_lock(&class->lock); |
1491 | obj_free(class, obj); | 1656 | obj_free(class, obj); |
1492 | fullness = fix_fullness_group(class, first_page); | 1657 | fullness = fix_fullness_group(class, zspage); |
1493 | if (fullness == ZS_EMPTY) { | 1658 | if (fullness != ZS_EMPTY) { |
1494 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | 1659 | migrate_read_unlock(zspage); |
1495 | class->size, class->pages_per_zspage)); | 1660 | goto out; |
1496 | atomic_long_sub(class->pages_per_zspage, | ||
1497 | &pool->pages_allocated); | ||
1498 | free_zspage(first_page); | ||
1499 | } | 1661 | } |
1662 | |||
1663 | isolated = is_zspage_isolated(zspage); | ||
1664 | migrate_read_unlock(zspage); | ||
1665 | /* If zspage is isolated, zs_page_putback will free the zspage */ | ||
1666 | if (likely(!isolated)) | ||
1667 | free_zspage(pool, class, zspage); | ||
1668 | out: | ||
1669 | |||
1500 | spin_unlock(&class->lock); | 1670 | spin_unlock(&class->lock); |
1501 | unpin_tag(handle); | 1671 | unpin_tag(handle); |
1502 | 1672 | cache_free_handle(pool, handle); | |
1503 | free_handle(pool, handle); | ||
1504 | } | 1673 | } |
1505 | EXPORT_SYMBOL_GPL(zs_free); | 1674 | EXPORT_SYMBOL_GPL(zs_free); |
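zs_free() above now checks, under the migrate read lock, whether the zspage is isolated for page migration; if it is, the actual release is left to zs_page_putback(). The rule is "whoever finishes last cleans up". A simplified user-space sketch of that decision, with invented names and a plain mutex standing in for the real locking:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_zspage {
        pthread_mutex_t lock;
        int isolated;          /* subpages currently taken by migration */
        bool empty;            /* no objects left                       */
};

static void toy_release(struct toy_zspage *z)
{
        printf("zspage %p really freed\n", (void *)z);
}

/* called when the last object in the zspage is freed */
static void toy_obj_free_last(struct toy_zspage *z)
{
        bool do_free;

        pthread_mutex_lock(&z->lock);
        z->empty = true;
        do_free = (z->isolated == 0);   /* migration still owns it? */
        pthread_mutex_unlock(&z->lock);

        if (do_free)
                toy_release(z);
        /* else: toy_putback() will free it later */
}

/* called when migration puts an isolated subpage back */
static void toy_putback(struct toy_zspage *z)
{
        bool do_free;

        pthread_mutex_lock(&z->lock);
        z->isolated--;
        do_free = (z->isolated == 0 && z->empty);
        pthread_mutex_unlock(&z->lock);

        if (do_free)
                toy_release(z);
}

int main(void)
{
        struct toy_zspage z = { .lock = PTHREAD_MUTEX_INITIALIZER,
                                .isolated = 1, .empty = false };

        toy_obj_free_last(&z);  /* deferred: still isolated */
        toy_putback(&z);        /* last putback performs the free */
        return 0;
}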
1506 | 1675 | ||
@@ -1508,7 +1677,7 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, | |||
1508 | unsigned long src) | 1677 | unsigned long src) |
1509 | { | 1678 | { |
1510 | struct page *s_page, *d_page; | 1679 | struct page *s_page, *d_page; |
1511 | unsigned long s_objidx, d_objidx; | 1680 | unsigned int s_objidx, d_objidx; |
1512 | unsigned long s_off, d_off; | 1681 | unsigned long s_off, d_off; |
1513 | void *s_addr, *d_addr; | 1682 | void *s_addr, *d_addr; |
1514 | int s_size, d_size, size; | 1683 | int s_size, d_size, size; |
@@ -1519,8 +1688,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, | |||
1519 | obj_to_location(src, &s_page, &s_objidx); | 1688 | obj_to_location(src, &s_page, &s_objidx); |
1520 | obj_to_location(dst, &d_page, &d_objidx); | 1689 | obj_to_location(dst, &d_page, &d_objidx); |
1521 | 1690 | ||
1522 | s_off = obj_idx_to_offset(s_page, s_objidx, class->size); | 1691 | s_off = (class->size * s_objidx) & ~PAGE_MASK; |
1523 | d_off = obj_idx_to_offset(d_page, d_objidx, class->size); | 1692 | d_off = (class->size * d_objidx) & ~PAGE_MASK; |
1524 | 1693 | ||
1525 | if (s_off + class->size > PAGE_SIZE) | 1694 | if (s_off + class->size > PAGE_SIZE) |
1526 | s_size = PAGE_SIZE - s_off; | 1695 | s_size = PAGE_SIZE - s_off; |
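The new offset computation above, "(class->size * objidx) & ~PAGE_MASK", works because PAGE_MASK is ~(PAGE_SIZE - 1) and PAGE_SIZE is a power of two, so masking with ~PAGE_MASK is the remainder modulo PAGE_SIZE. A stand-alone check with example constants (the class size here is hypothetical):

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long size = 208;          /* hypothetical class->size */

        for (unsigned long idx = 0; idx < 100; idx++) {
                unsigned long linear = size * idx;
                unsigned long off   = linear & ~PAGE_MASK;

                assert(off == linear % PAGE_SIZE);   /* offset within the page */
        }
        printf("masking with ~PAGE_MASK == mod PAGE_SIZE, as expected\n");
        return 0;
}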
@@ -1579,12 +1748,11 @@ static unsigned long find_alloced_obj(struct size_class *class, | |||
1579 | unsigned long handle = 0; | 1748 | unsigned long handle = 0; |
1580 | void *addr = kmap_atomic(page); | 1749 | void *addr = kmap_atomic(page); |
1581 | 1750 | ||
1582 | if (!is_first_page(page)) | 1751 | offset = get_first_obj_offset(page); |
1583 | offset = page->index; | ||
1584 | offset += class->size * index; | 1752 | offset += class->size * index; |
1585 | 1753 | ||
1586 | while (offset < PAGE_SIZE) { | 1754 | while (offset < PAGE_SIZE) { |
1587 | head = obj_to_head(class, page, addr + offset); | 1755 | head = obj_to_head(page, addr + offset); |
1588 | if (head & OBJ_ALLOCATED_TAG) { | 1756 | if (head & OBJ_ALLOCATED_TAG) { |
1589 | handle = head & ~OBJ_ALLOCATED_TAG; | 1757 | handle = head & ~OBJ_ALLOCATED_TAG; |
1590 | if (trypin_tag(handle)) | 1758 | if (trypin_tag(handle)) |
@@ -1601,7 +1769,7 @@ static unsigned long find_alloced_obj(struct size_class *class, | |||
1601 | } | 1769 | } |
1602 | 1770 | ||
1603 | struct zs_compact_control { | 1771 | struct zs_compact_control { |
1604 | /* Source page for migration which could be a subpage of zspage. */ | 1772 | /* Source spage for migration which could be a subpage of zspage */ |
1605 | struct page *s_page; | 1773 | struct page *s_page; |
1606 | /* Destination page for migration which should be a first page | 1774 | /* Destination page for migration which should be a first page |
1607 | * of zspage. */ | 1775 | * of zspage. */ |
@@ -1632,14 +1800,14 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, | |||
1632 | } | 1800 | } |
1633 | 1801 | ||
1634 | /* Stop if there is no more space */ | 1802 | /* Stop if there is no more space */ |
1635 | if (zspage_full(d_page)) { | 1803 | if (zspage_full(class, get_zspage(d_page))) { |
1636 | unpin_tag(handle); | 1804 | unpin_tag(handle); |
1637 | ret = -ENOMEM; | 1805 | ret = -ENOMEM; |
1638 | break; | 1806 | break; |
1639 | } | 1807 | } |
1640 | 1808 | ||
1641 | used_obj = handle_to_obj(handle); | 1809 | used_obj = handle_to_obj(handle); |
1642 | free_obj = obj_malloc(class, d_page, handle); | 1810 | free_obj = obj_malloc(class, get_zspage(d_page), handle); |
1643 | zs_object_copy(class, free_obj, used_obj); | 1811 | zs_object_copy(class, free_obj, used_obj); |
1644 | index++; | 1812 | index++; |
1645 | /* | 1813 | /* |
@@ -1661,68 +1829,422 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, | |||
1661 | return ret; | 1829 | return ret; |
1662 | } | 1830 | } |
1663 | 1831 | ||
1664 | static struct page *isolate_target_page(struct size_class *class) | 1832 | static struct zspage *isolate_zspage(struct size_class *class, bool source) |
1665 | { | 1833 | { |
1666 | int i; | 1834 | int i; |
1667 | struct page *page; | 1835 | struct zspage *zspage; |
1836 | enum fullness_group fg[2] = {ZS_ALMOST_EMPTY, ZS_ALMOST_FULL}; | ||
1668 | 1837 | ||
1669 | for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { | 1838 | if (!source) { |
1670 | page = class->fullness_list[i]; | 1839 | fg[0] = ZS_ALMOST_FULL; |
1671 | if (page) { | 1840 | fg[1] = ZS_ALMOST_EMPTY; |
1672 | remove_zspage(class, i, page); | 1841 | } |
1673 | break; | 1842 | |
1843 | for (i = 0; i < 2; i++) { | ||
1844 | zspage = list_first_entry_or_null(&class->fullness_list[fg[i]], | ||
1845 | struct zspage, list); | ||
1846 | if (zspage) { | ||
1847 | VM_BUG_ON(is_zspage_isolated(zspage)); | ||
1848 | remove_zspage(class, zspage, fg[i]); | ||
1849 | return zspage; | ||
1674 | } | 1850 | } |
1675 | } | 1851 | } |
1676 | 1852 | ||
1677 | return page; | 1853 | return zspage; |
1678 | } | 1854 | } |
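isolate_zspage() above replaces the separate source/target helpers with a single routine that just flips the preference order: compaction wants to drain almost-empty zspages (source) and fill almost-full ones (destination). The selection logic in miniature, with invented names:

#include <stdbool.h>
#include <stdio.h>

enum fg { FG_ALMOST_EMPTY, FG_ALMOST_FULL, FG_NR };

static const char *pick(const char *lists[FG_NR], bool source)
{
        enum fg order[2] = { FG_ALMOST_EMPTY, FG_ALMOST_FULL };

        if (!source) {                     /* destination: prefer fuller pages */
                order[0] = FG_ALMOST_FULL;
                order[1] = FG_ALMOST_EMPTY;
        }

        for (int i = 0; i < 2; i++)
                if (lists[order[i]])       /* first non-empty list wins */
                        return lists[order[i]];
        return NULL;
}

int main(void)
{
        const char *lists[FG_NR] = { "zspage-A (almost empty)",
                                     "zspage-B (almost full)" };

        printf("source:      %s\n", pick(lists, true));
        printf("destination: %s\n", pick(lists, false));
        return 0;
}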
1679 | 1855 | ||
1680 | /* | 1856 | /* |
1681 | * putback_zspage - add @first_page into right class's fullness list | 1857 | * putback_zspage - add @zspage into right class's fullness list |
1682 | * @pool: target pool | ||
1683 | * @class: destination class | 1858 | * @class: destination class |
1684 | * @first_page: target page | 1859 | * @zspage: target page |
1685 | * | 1860 | * |
1686 | * Return @fist_page's fullness_group | 1861 | * Return @zspage's fullness_group |
1687 | */ | 1862 | */ |
1688 | static enum fullness_group putback_zspage(struct zs_pool *pool, | 1863 | static enum fullness_group putback_zspage(struct size_class *class, |
1689 | struct size_class *class, | 1864 | struct zspage *zspage) |
1690 | struct page *first_page) | ||
1691 | { | 1865 | { |
1692 | enum fullness_group fullness; | 1866 | enum fullness_group fullness; |
1693 | 1867 | ||
1694 | fullness = get_fullness_group(first_page); | 1868 | VM_BUG_ON(is_zspage_isolated(zspage)); |
1695 | insert_zspage(class, fullness, first_page); | ||
1696 | set_zspage_mapping(first_page, class->index, fullness); | ||
1697 | 1869 | ||
1698 | if (fullness == ZS_EMPTY) { | 1870 | fullness = get_fullness_group(class, zspage); |
1699 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | 1871 | insert_zspage(class, zspage, fullness); |
1700 | class->size, class->pages_per_zspage)); | 1872 | set_zspage_mapping(zspage, class->index, fullness); |
1701 | atomic_long_sub(class->pages_per_zspage, | 1873 | |
1702 | &pool->pages_allocated); | 1874 | return fullness; |
1875 | } | ||
1876 | |||
1877 | #ifdef CONFIG_COMPACTION | ||
1878 | static struct dentry *zs_mount(struct file_system_type *fs_type, | ||
1879 | int flags, const char *dev_name, void *data) | ||
1880 | { | ||
1881 | static const struct dentry_operations ops = { | ||
1882 | .d_dname = simple_dname, | ||
1883 | }; | ||
1884 | |||
1885 | return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC); | ||
1886 | } | ||
1887 | |||
1888 | static struct file_system_type zsmalloc_fs = { | ||
1889 | .name = "zsmalloc", | ||
1890 | .mount = zs_mount, | ||
1891 | .kill_sb = kill_anon_super, | ||
1892 | }; | ||
1893 | |||
1894 | static int zsmalloc_mount(void) | ||
1895 | { | ||
1896 | int ret = 0; | ||
1703 | 1897 | ||
1704 | free_zspage(first_page); | 1898 | zsmalloc_mnt = kern_mount(&zsmalloc_fs); |
1899 | if (IS_ERR(zsmalloc_mnt)) | ||
1900 | ret = PTR_ERR(zsmalloc_mnt); | ||
1901 | |||
1902 | return ret; | ||
1903 | } | ||
1904 | |||
1905 | static void zsmalloc_unmount(void) | ||
1906 | { | ||
1907 | kern_unmount(zsmalloc_mnt); | ||
1908 | } | ||
1909 | |||
1910 | static void migrate_lock_init(struct zspage *zspage) | ||
1911 | { | ||
1912 | rwlock_init(&zspage->lock); | ||
1913 | } | ||
1914 | |||
1915 | static void migrate_read_lock(struct zspage *zspage) | ||
1916 | { | ||
1917 | read_lock(&zspage->lock); | ||
1918 | } | ||
1919 | |||
1920 | static void migrate_read_unlock(struct zspage *zspage) | ||
1921 | { | ||
1922 | read_unlock(&zspage->lock); | ||
1923 | } | ||
1924 | |||
1925 | static void migrate_write_lock(struct zspage *zspage) | ||
1926 | { | ||
1927 | write_lock(&zspage->lock); | ||
1928 | } | ||
1929 | |||
1930 | static void migrate_write_unlock(struct zspage *zspage) | ||
1931 | { | ||
1932 | write_unlock(&zspage->lock); | ||
1933 | } | ||
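The helpers above are thin wrappers around a per-zspage rwlock: object paths such as zs_free() take the read side and may run in parallel, while zs_page_migrate() takes the write side so a subpage is only moved when no object in it is being touched. The same pattern in user space with pthreads, names invented:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t zspage_lock = PTHREAD_RWLOCK_INITIALIZER;

static void object_access(const char *who)
{
        pthread_rwlock_rdlock(&zspage_lock);   /* many readers allowed */
        printf("%s touches an object\n", who);
        pthread_rwlock_unlock(&zspage_lock);
}

static void migrate_page(void)
{
        pthread_rwlock_wrlock(&zspage_lock);   /* excludes every reader */
        printf("page copied and handles rewritten\n");
        pthread_rwlock_unlock(&zspage_lock);
}

int main(void)
{
        object_access("zs_map_object");
        object_access("zs_free");
        migrate_page();
        return 0;
}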
1934 | |||
1935 | /* Number of isolated subpage for *page migration* in this zspage */ | ||
1936 | static void inc_zspage_isolation(struct zspage *zspage) | ||
1937 | { | ||
1938 | zspage->isolated++; | ||
1939 | } | ||
1940 | |||
1941 | static void dec_zspage_isolation(struct zspage *zspage) | ||
1942 | { | ||
1943 | zspage->isolated--; | ||
1944 | } | ||
1945 | |||
1946 | static void replace_sub_page(struct size_class *class, struct zspage *zspage, | ||
1947 | struct page *newpage, struct page *oldpage) | ||
1948 | { | ||
1949 | struct page *page; | ||
1950 | struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; | ||
1951 | int idx = 0; | ||
1952 | |||
1953 | page = get_first_page(zspage); | ||
1954 | do { | ||
1955 | if (page == oldpage) | ||
1956 | pages[idx] = newpage; | ||
1957 | else | ||
1958 | pages[idx] = page; | ||
1959 | idx++; | ||
1960 | } while ((page = get_next_page(page)) != NULL); | ||
1961 | |||
1962 | create_page_chain(class, zspage, pages); | ||
1963 | set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); | ||
1964 | if (unlikely(PageHugeObject(oldpage))) | ||
1965 | newpage->index = oldpage->index; | ||
1966 | __SetPageMovable(newpage, page_mapping(oldpage)); | ||
1967 | } | ||
1968 | |||
1969 | bool zs_page_isolate(struct page *page, isolate_mode_t mode) | ||
1970 | { | ||
1971 | struct zs_pool *pool; | ||
1972 | struct size_class *class; | ||
1973 | int class_idx; | ||
1974 | enum fullness_group fullness; | ||
1975 | struct zspage *zspage; | ||
1976 | struct address_space *mapping; | ||
1977 | |||
1978 | /* | ||
1979 | * Page is locked so zspage couldn't be destroyed. For detail, look at | ||
1980 | * lock_zspage in free_zspage. | ||
1981 | */ | ||
1982 | VM_BUG_ON_PAGE(!PageMovable(page), page); | ||
1983 | VM_BUG_ON_PAGE(PageIsolated(page), page); | ||
1984 | |||
1985 | zspage = get_zspage(page); | ||
1986 | |||
1987 | /* | ||
1988 | * Without class lock, fullness could be stale while class_idx is okay | ||
1989 | * because class_idx is constant unless page is freed so we should get | ||
1990 | * fullness again under class lock. | ||
1991 | */ | ||
1992 | get_zspage_mapping(zspage, &class_idx, &fullness); | ||
1993 | mapping = page_mapping(page); | ||
1994 | pool = mapping->private_data; | ||
1995 | class = pool->size_class[class_idx]; | ||
1996 | |||
1997 | spin_lock(&class->lock); | ||
1998 | if (get_zspage_inuse(zspage) == 0) { | ||
1999 | spin_unlock(&class->lock); | ||
2000 | return false; | ||
1705 | } | 2001 | } |
1706 | 2002 | ||
1707 | return fullness; | 2003 | /* zspage is isolated for object migration */ |
2004 | if (list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { | ||
2005 | spin_unlock(&class->lock); | ||
2006 | return false; | ||
2007 | } | ||
2008 | |||
2009 | /* | ||
2010 | * If this is first time isolation for the zspage, isolate zspage from | ||
2011 | * size_class to prevent further object allocation from the zspage. | ||
2012 | */ | ||
2013 | if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { | ||
2014 | get_zspage_mapping(zspage, &class_idx, &fullness); | ||
2015 | remove_zspage(class, zspage, fullness); | ||
2016 | } | ||
2017 | |||
2018 | inc_zspage_isolation(zspage); | ||
2019 | spin_unlock(&class->lock); | ||
2020 | |||
2021 | return true; | ||
1708 | } | 2022 | } |
1709 | 2023 | ||
1710 | static struct page *isolate_source_page(struct size_class *class) | 2024 | int zs_page_migrate(struct address_space *mapping, struct page *newpage, |
2025 | struct page *page, enum migrate_mode mode) | ||
2026 | { | ||
2027 | struct zs_pool *pool; | ||
2028 | struct size_class *class; | ||
2029 | int class_idx; | ||
2030 | enum fullness_group fullness; | ||
2031 | struct zspage *zspage; | ||
2032 | struct page *dummy; | ||
2033 | void *s_addr, *d_addr, *addr; | ||
2034 | int offset, pos; | ||
2035 | unsigned long handle, head; | ||
2036 | unsigned long old_obj, new_obj; | ||
2037 | unsigned int obj_idx; | ||
2038 | int ret = -EAGAIN; | ||
2039 | |||
2040 | VM_BUG_ON_PAGE(!PageMovable(page), page); | ||
2041 | VM_BUG_ON_PAGE(!PageIsolated(page), page); | ||
2042 | |||
2043 | zspage = get_zspage(page); | ||
2044 | |||
2045 | /* Concurrent compactor cannot migrate any subpage in zspage */ | ||
2046 | migrate_write_lock(zspage); | ||
2047 | get_zspage_mapping(zspage, &class_idx, &fullness); | ||
2048 | pool = mapping->private_data; | ||
2049 | class = pool->size_class[class_idx]; | ||
2050 | offset = get_first_obj_offset(page); | ||
2051 | |||
2052 | spin_lock(&class->lock); | ||
2053 | if (!get_zspage_inuse(zspage)) { | ||
2054 | ret = -EBUSY; | ||
2055 | goto unlock_class; | ||
2056 | } | ||
2057 | |||
2058 | pos = offset; | ||
2059 | s_addr = kmap_atomic(page); | ||
2060 | while (pos < PAGE_SIZE) { | ||
2061 | head = obj_to_head(page, s_addr + pos); | ||
2062 | if (head & OBJ_ALLOCATED_TAG) { | ||
2063 | handle = head & ~OBJ_ALLOCATED_TAG; | ||
2064 | if (!trypin_tag(handle)) | ||
2065 | goto unpin_objects; | ||
2066 | } | ||
2067 | pos += class->size; | ||
2068 | } | ||
2069 | |||
2070 | /* | ||
2071 | * Here, any user cannot access all objects in the zspage so let's move. | ||
2072 | */ | ||
2073 | d_addr = kmap_atomic(newpage); | ||
2074 | memcpy(d_addr, s_addr, PAGE_SIZE); | ||
2075 | kunmap_atomic(d_addr); | ||
2076 | |||
2077 | for (addr = s_addr + offset; addr < s_addr + pos; | ||
2078 | addr += class->size) { | ||
2079 | head = obj_to_head(page, addr); | ||
2080 | if (head & OBJ_ALLOCATED_TAG) { | ||
2081 | handle = head & ~OBJ_ALLOCATED_TAG; | ||
2082 | if (!testpin_tag(handle)) | ||
2083 | BUG(); | ||
2084 | |||
2085 | old_obj = handle_to_obj(handle); | ||
2086 | obj_to_location(old_obj, &dummy, &obj_idx); | ||
2087 | new_obj = (unsigned long)location_to_obj(newpage, | ||
2088 | obj_idx); | ||
2089 | new_obj |= BIT(HANDLE_PIN_BIT); | ||
2090 | record_obj(handle, new_obj); | ||
2091 | } | ||
2092 | } | ||
2093 | |||
2094 | replace_sub_page(class, zspage, newpage, page); | ||
2095 | get_page(newpage); | ||
2096 | |||
2097 | dec_zspage_isolation(zspage); | ||
2098 | |||
2099 | /* | ||
2100 | * Page migration is done so let's putback isolated zspage to | ||
2101 | * the list if @page is final isolated subpage in the zspage. | ||
2102 | */ | ||
2103 | if (!is_zspage_isolated(zspage)) | ||
2104 | putback_zspage(class, zspage); | ||
2105 | |||
2106 | reset_page(page); | ||
2107 | put_page(page); | ||
2108 | page = newpage; | ||
2109 | |||
2110 | ret = MIGRATEPAGE_SUCCESS; | ||
2111 | unpin_objects: | ||
2112 | for (addr = s_addr + offset; addr < s_addr + pos; | ||
2113 | addr += class->size) { | ||
2114 | head = obj_to_head(page, addr); | ||
2115 | if (head & OBJ_ALLOCATED_TAG) { | ||
2116 | handle = head & ~OBJ_ALLOCATED_TAG; | ||
2117 | if (!testpin_tag(handle)) | ||
2118 | BUG(); | ||
2119 | unpin_tag(handle); | ||
2120 | } | ||
2121 | } | ||
2122 | kunmap_atomic(s_addr); | ||
2123 | unlock_class: | ||
2124 | spin_unlock(&class->lock); | ||
2125 | migrate_write_unlock(zspage); | ||
2126 | |||
2127 | return ret; | ||
2128 | } | ||
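Stripped of the pinning and locking, the core of zs_page_migrate() above is: copy the whole page, then rewrite every handle so it refers to the same object index inside the new page. A user-space reduction of that step; plain pointers stand in for the real obj/handle packing:

#include <stdio.h>
#include <string.h>

#define PAGE_SZ   4096
#define OBJ_SZ    128
#define NR_OBJ    (PAGE_SZ / OBJ_SZ)

int main(void)
{
        static char old_page[PAGE_SZ], new_page[PAGE_SZ];
        char *handle[NR_OBJ];

        /* set up some "allocated objects" and handles into the old page */
        for (int i = 0; i < NR_OBJ; i++) {
                handle[i] = old_page + i * OBJ_SZ;
                snprintf(handle[i], OBJ_SZ, "object %d", i);
        }

        /* 1) copy the page contents wholesale */
        memcpy(new_page, old_page, PAGE_SZ);

        /* 2) retarget each handle to the same offset in the new page */
        for (int i = 0; i < NR_OBJ; i++) {
                size_t off = (size_t)(handle[i] - old_page);
                handle[i] = new_page + off;
        }

        printf("handle[3] now reads \"%s\" from the new page\n", handle[3]);
        return 0;
}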
2129 | |||
2130 | void zs_page_putback(struct page *page) | ||
2131 | { | ||
2132 | struct zs_pool *pool; | ||
2133 | struct size_class *class; | ||
2134 | int class_idx; | ||
2135 | enum fullness_group fg; | ||
2136 | struct address_space *mapping; | ||
2137 | struct zspage *zspage; | ||
2138 | |||
2139 | VM_BUG_ON_PAGE(!PageMovable(page), page); | ||
2140 | VM_BUG_ON_PAGE(!PageIsolated(page), page); | ||
2141 | |||
2142 | zspage = get_zspage(page); | ||
2143 | get_zspage_mapping(zspage, &class_idx, &fg); | ||
2144 | mapping = page_mapping(page); | ||
2145 | pool = mapping->private_data; | ||
2146 | class = pool->size_class[class_idx]; | ||
2147 | |||
2148 | spin_lock(&class->lock); | ||
2149 | dec_zspage_isolation(zspage); | ||
2150 | if (!is_zspage_isolated(zspage)) { | ||
2151 | fg = putback_zspage(class, zspage); | ||
2152 | /* | ||
2153 | * Due to page_lock, we cannot free zspage immediately | ||
2154 | * so let's defer. | ||
2155 | */ | ||
2156 | if (fg == ZS_EMPTY) | ||
2157 | schedule_work(&pool->free_work); | ||
2158 | } | ||
2159 | spin_unlock(&class->lock); | ||
2160 | } | ||
2161 | |||
2162 | const struct address_space_operations zsmalloc_aops = { | ||
2163 | .isolate_page = zs_page_isolate, | ||
2164 | .migratepage = zs_page_migrate, | ||
2165 | .putback_page = zs_page_putback, | ||
2166 | }; | ||
2167 | |||
2168 | static int zs_register_migration(struct zs_pool *pool) | ||
2169 | { | ||
2170 | pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb); | ||
2171 | if (IS_ERR(pool->inode)) { | ||
2172 | pool->inode = NULL; | ||
2173 | return 1; | ||
2174 | } | ||
2175 | |||
2176 | pool->inode->i_mapping->private_data = pool; | ||
2177 | pool->inode->i_mapping->a_ops = &zsmalloc_aops; | ||
2178 | return 0; | ||
2179 | } | ||
2180 | |||
2181 | static void zs_unregister_migration(struct zs_pool *pool) | ||
2182 | { | ||
2183 | flush_work(&pool->free_work); | ||
2184 | if (pool->inode) | ||
2185 | iput(pool->inode); | ||
2186 | } | ||
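zs_register_migration() above hangs the zsmalloc_aops callback table and a back-pointer to the pool off a per-pool mapping, so the generic migration code can reach zsmalloc from a page alone. The shape of that hook-up, stripped down to a plain ops table and a private pointer (all types here are stand-ins, not kernel structures):

#include <stdio.h>

struct mapping;                         /* stand-in for struct address_space */

struct mapping_ops {
        int (*migrate)(struct mapping *m);
};

struct mapping {
        const struct mapping_ops *ops;
        void *private_data;             /* back-pointer to the owning pool */
};

struct pool {
        const char *name;
        struct mapping mapping;
};

static int toy_migrate(struct mapping *m)
{
        struct pool *p = m->private_data;

        printf("migration callback reached pool \"%s\"\n", p->name);
        return 0;
}

static const struct mapping_ops toy_ops = { .migrate = toy_migrate };

int main(void)
{
        struct pool p = { .name = "zram0" };

        p.mapping.ops = &toy_ops;           /* like a_ops = &zsmalloc_aops */
        p.mapping.private_data = &p;        /* like private_data = pool    */

        return p.mapping.ops->migrate(&p.mapping);
}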
2187 | |||
2188 | /* | ||
2189 | * Caller should hold page_lock of all pages in the zspage | ||
2190 | * In here, we cannot use zspage meta data. | ||
2191 | */ | ||
2192 | static void async_free_zspage(struct work_struct *work) | ||
1711 | { | 2193 | { |
1712 | int i; | 2194 | int i; |
1713 | struct page *page = NULL; | 2195 | struct size_class *class; |
2196 | unsigned int class_idx; | ||
2197 | enum fullness_group fullness; | ||
2198 | struct zspage *zspage, *tmp; | ||
2199 | LIST_HEAD(free_pages); | ||
2200 | struct zs_pool *pool = container_of(work, struct zs_pool, | ||
2201 | free_work); | ||
1714 | 2202 | ||
1715 | for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) { | 2203 | for (i = 0; i < zs_size_classes; i++) { |
1716 | page = class->fullness_list[i]; | 2204 | class = pool->size_class[i]; |
1717 | if (!page) | 2205 | if (class->index != i) |
1718 | continue; | 2206 | continue; |
1719 | 2207 | ||
1720 | remove_zspage(class, i, page); | 2208 | spin_lock(&class->lock); |
1721 | break; | 2209 | list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages); |
2210 | spin_unlock(&class->lock); | ||
2211 | } | ||
2212 | |||
2213 | |||
2214 | list_for_each_entry_safe(zspage, tmp, &free_pages, list) { | ||
2215 | list_del(&zspage->list); | ||
2216 | lock_zspage(zspage); | ||
2217 | |||
2218 | get_zspage_mapping(zspage, &class_idx, &fullness); | ||
2219 | VM_BUG_ON(fullness != ZS_EMPTY); | ||
2220 | class = pool->size_class[class_idx]; | ||
2221 | spin_lock(&class->lock); | ||
2222 | __free_zspage(pool, pool->size_class[class_idx], zspage); | ||
2223 | spin_unlock(&class->lock); | ||
1722 | } | 2224 | } |
2225 | }; | ||
2226 | |||
2227 | static void kick_deferred_free(struct zs_pool *pool) | ||
2228 | { | ||
2229 | schedule_work(&pool->free_work); | ||
2230 | } | ||
2231 | |||
2232 | static void init_deferred_free(struct zs_pool *pool) | ||
2233 | { | ||
2234 | INIT_WORK(&pool->free_work, async_free_zspage); | ||
2235 | } | ||
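The deferred-free machinery above (INIT_WORK plus schedule_work) exists because the context that notices an empty zspage may hold page locks and cannot free it on the spot; it only queues the zspage and a worker drains the list later. A minimal single-threaded sketch of that split, all names invented:

#include <stdio.h>
#include <stdlib.h>

struct pending {
        struct pending *next;
        int id;
};

static struct pending *pending_head;

/* fast path: cheap, never frees directly */
static void kick_deferred_free(int id)
{
        struct pending *p = malloc(sizeof(*p));

        if (!p)
                return;                  /* sketch: silently drop on OOM */
        p->id = id;
        p->next = pending_head;
        pending_head = p;
}

/* "work function": runs later, in a context that may free */
static void async_free(void)
{
        struct pending *list = pending_head;

        pending_head = NULL;             /* splice out, like list_splice_init */
        while (list) {
                struct pending *next = list->next;

                printf("freeing deferred zspage %d\n", list->id);
                free(list);
                list = next;
        }
}

int main(void)
{
        kick_deferred_free(1);
        kick_deferred_free(2);
        async_free();                    /* stands in for the workqueue run */
        return 0;
}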
2236 | |||
2237 | static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) | ||
2238 | { | ||
2239 | struct page *page = get_first_page(zspage); | ||
1723 | 2240 | ||
1724 | return page; | 2241 | do { |
2242 | WARN_ON(!trylock_page(page)); | ||
2243 | __SetPageMovable(page, pool->inode->i_mapping); | ||
2244 | unlock_page(page); | ||
2245 | } while ((page = get_next_page(page)) != NULL); | ||
1725 | } | 2246 | } |
2247 | #endif | ||
1726 | 2248 | ||
1727 | /* | 2249 | /* |
1728 | * | 2250 | * |
@@ -1748,20 +2270,20 @@ static unsigned long zs_can_compact(struct size_class *class) | |||
1748 | static void __zs_compact(struct zs_pool *pool, struct size_class *class) | 2270 | static void __zs_compact(struct zs_pool *pool, struct size_class *class) |
1749 | { | 2271 | { |
1750 | struct zs_compact_control cc; | 2272 | struct zs_compact_control cc; |
1751 | struct page *src_page; | 2273 | struct zspage *src_zspage; |
1752 | struct page *dst_page = NULL; | 2274 | struct zspage *dst_zspage = NULL; |
1753 | 2275 | ||
1754 | spin_lock(&class->lock); | 2276 | spin_lock(&class->lock); |
1755 | while ((src_page = isolate_source_page(class))) { | 2277 | while ((src_zspage = isolate_zspage(class, true))) { |
1756 | 2278 | ||
1757 | if (!zs_can_compact(class)) | 2279 | if (!zs_can_compact(class)) |
1758 | break; | 2280 | break; |
1759 | 2281 | ||
1760 | cc.index = 0; | 2282 | cc.index = 0; |
1761 | cc.s_page = src_page; | 2283 | cc.s_page = get_first_page(src_zspage); |
1762 | 2284 | ||
1763 | while ((dst_page = isolate_target_page(class))) { | 2285 | while ((dst_zspage = isolate_zspage(class, false))) { |
1764 | cc.d_page = dst_page; | 2286 | cc.d_page = get_first_page(dst_zspage); |
1765 | /* | 2287 | /* |
1766 | * If there is no more space in dst_page, resched | 2288 | * If there is no more space in dst_page, resched |
1767 | * and see if anyone had allocated another zspage. | 2289 | * and see if anyone had allocated another zspage. |
@@ -1769,23 +2291,25 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) | |||
1769 | if (!migrate_zspage(pool, class, &cc)) | 2291 | if (!migrate_zspage(pool, class, &cc)) |
1770 | break; | 2292 | break; |
1771 | 2293 | ||
1772 | putback_zspage(pool, class, dst_page); | 2294 | putback_zspage(class, dst_zspage); |
1773 | } | 2295 | } |
1774 | 2296 | ||
1775 | /* Stop if we couldn't find slot */ | 2297 | /* Stop if we couldn't find slot */ |
1776 | if (dst_page == NULL) | 2298 | if (dst_zspage == NULL) |
1777 | break; | 2299 | break; |
1778 | 2300 | ||
1779 | putback_zspage(pool, class, dst_page); | 2301 | putback_zspage(class, dst_zspage); |
1780 | if (putback_zspage(pool, class, src_page) == ZS_EMPTY) | 2302 | if (putback_zspage(class, src_zspage) == ZS_EMPTY) { |
2303 | free_zspage(pool, class, src_zspage); | ||
1781 | pool->stats.pages_compacted += class->pages_per_zspage; | 2304 | pool->stats.pages_compacted += class->pages_per_zspage; |
2305 | } | ||
1782 | spin_unlock(&class->lock); | 2306 | spin_unlock(&class->lock); |
1783 | cond_resched(); | 2307 | cond_resched(); |
1784 | spin_lock(&class->lock); | 2308 | spin_lock(&class->lock); |
1785 | } | 2309 | } |
1786 | 2310 | ||
1787 | if (src_page) | 2311 | if (src_zspage) |
1788 | putback_zspage(pool, class, src_page); | 2312 | putback_zspage(class, src_zspage); |
1789 | 2313 | ||
1790 | spin_unlock(&class->lock); | 2314 | spin_unlock(&class->lock); |
1791 | } | 2315 | } |
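The reworked __zs_compact() above keeps draining a sparse source zspage into a denser destination and, once the source comes back empty, frees it and counts its pages as compacted. The same loop shape in miniature, purely illustrative:

#include <stdbool.h>
#include <stdio.h>

#define SLOTS 8

struct blk {
        bool used[SLOTS];
        int inuse;
};

/* move one used slot from src into a free slot of dst, if possible */
static bool move_one(struct blk *src, struct blk *dst)
{
        if (!src->inuse || dst->inuse == SLOTS)
                return false;
        for (int s = 0; s < SLOTS; s++) {
                if (!src->used[s])
                        continue;
                for (int d = 0; d < SLOTS; d++) {
                        if (dst->used[d])
                                continue;
                        dst->used[d] = true;       /* "zs_object_copy" */
                        src->used[s] = false;
                        dst->inuse++;
                        src->inuse--;
                        return true;
                }
        }
        return false;
}

int main(void)
{
        struct blk src = { {1, 0, 1, 0, 0, 0, 0, 0}, 2 };   /* almost empty */
        struct blk dst = { {1, 1, 1, 1, 1, 0, 0, 0}, 5 };   /* almost full  */
        int pages_compacted = 0;

        while (move_one(&src, &dst))
                ;
        if (src.inuse == 0)
                pages_compacted++;        /* source block is now reclaimable */

        printf("pages_compacted=%d, dst inuse=%d\n", pages_compacted, dst.inuse);
        return 0;
}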
@@ -1892,6 +2416,7 @@ struct zs_pool *zs_create_pool(const char *name) | |||
1892 | if (!pool) | 2416 | if (!pool) |
1893 | return NULL; | 2417 | return NULL; |
1894 | 2418 | ||
2419 | init_deferred_free(pool); | ||
1895 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), | 2420 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), |
1896 | GFP_KERNEL); | 2421 | GFP_KERNEL); |
1897 | if (!pool->size_class) { | 2422 | if (!pool->size_class) { |
@@ -1903,7 +2428,7 @@ struct zs_pool *zs_create_pool(const char *name) | |||
1903 | if (!pool->name) | 2428 | if (!pool->name) |
1904 | goto err; | 2429 | goto err; |
1905 | 2430 | ||
1906 | if (create_handle_cache(pool)) | 2431 | if (create_cache(pool)) |
1907 | goto err; | 2432 | goto err; |
1908 | 2433 | ||
1909 | /* | 2434 | /* |
@@ -1914,6 +2439,7 @@ struct zs_pool *zs_create_pool(const char *name) | |||
1914 | int size; | 2439 | int size; |
1915 | int pages_per_zspage; | 2440 | int pages_per_zspage; |
1916 | struct size_class *class; | 2441 | struct size_class *class; |
2442 | int fullness = 0; | ||
1917 | 2443 | ||
1918 | size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; | 2444 | size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; |
1919 | if (size > ZS_MAX_ALLOC_SIZE) | 2445 | if (size > ZS_MAX_ALLOC_SIZE) |
@@ -1943,11 +2469,13 @@ struct zs_pool *zs_create_pool(const char *name) | |||
1943 | class->size = size; | 2469 | class->size = size; |
1944 | class->index = i; | 2470 | class->index = i; |
1945 | class->pages_per_zspage = pages_per_zspage; | 2471 | class->pages_per_zspage = pages_per_zspage; |
1946 | if (pages_per_zspage == 1 && | 2472 | class->objs_per_zspage = class->pages_per_zspage * |
1947 | get_maxobj_per_zspage(size, pages_per_zspage) == 1) | 2473 | PAGE_SIZE / class->size; |
1948 | class->huge = true; | ||
1949 | spin_lock_init(&class->lock); | 2474 | spin_lock_init(&class->lock); |
1950 | pool->size_class[i] = class; | 2475 | pool->size_class[i] = class; |
2476 | for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS; | ||
2477 | fullness++) | ||
2478 | INIT_LIST_HEAD(&class->fullness_list[fullness]); | ||
1951 | 2479 | ||
1952 | prev_class = class; | 2480 | prev_class = class; |
1953 | } | 2481 | } |
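objs_per_zspage is now cached per size class as shown above instead of being recomputed by get_maxobj_per_zspage() at every call site. A quick stand-alone version of the same arithmetic; the page size and class size here are example values, not taken from the patch:

#include <stdio.h>

int main(void)
{
        const unsigned long page_size = 4096;
        const unsigned long pages_per_zspage = 2;
        const unsigned long size = 176;      /* hypothetical class->size */

        unsigned long objs = pages_per_zspage * page_size / size;

        printf("a %lu-page zspage of %lu-byte objects holds %lu objects\n",
               pages_per_zspage, size, objs);
        return 0;
}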
@@ -1955,6 +2483,9 @@ struct zs_pool *zs_create_pool(const char *name) | |||
1955 | /* debug only, don't abort if it fails */ | 2483 | /* debug only, don't abort if it fails */ |
1956 | zs_pool_stat_create(pool, name); | 2484 | zs_pool_stat_create(pool, name); |
1957 | 2485 | ||
2486 | if (zs_register_migration(pool)) | ||
2487 | goto err; | ||
2488 | |||
1958 | /* | 2489 | /* |
1959 | * Not critical, we still can use the pool | 2490 | * Not critical, we still can use the pool |
1960 | * and user can trigger compaction manually. | 2491 | * and user can trigger compaction manually. |
@@ -1974,6 +2505,7 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
1974 | int i; | 2505 | int i; |
1975 | 2506 | ||
1976 | zs_unregister_shrinker(pool); | 2507 | zs_unregister_shrinker(pool); |
2508 | zs_unregister_migration(pool); | ||
1977 | zs_pool_stat_destroy(pool); | 2509 | zs_pool_stat_destroy(pool); |
1978 | 2510 | ||
1979 | for (i = 0; i < zs_size_classes; i++) { | 2511 | for (i = 0; i < zs_size_classes; i++) { |
@@ -1986,8 +2518,8 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
1986 | if (class->index != i) | 2518 | if (class->index != i) |
1987 | continue; | 2519 | continue; |
1988 | 2520 | ||
1989 | for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { | 2521 | for (fg = ZS_EMPTY; fg < NR_ZS_FULLNESS; fg++) { |
1990 | if (class->fullness_list[fg]) { | 2522 | if (!list_empty(&class->fullness_list[fg])) { |
1991 | pr_info("Freeing non-empty class with size %db, fullness group %d\n", | 2523 | pr_info("Freeing non-empty class with size %db, fullness group %d\n", |
1992 | class->size, fg); | 2524 | class->size, fg); |
1993 | } | 2525 | } |
@@ -1995,7 +2527,7 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
1995 | kfree(class); | 2527 | kfree(class); |
1996 | } | 2528 | } |
1997 | 2529 | ||
1998 | destroy_handle_cache(pool); | 2530 | destroy_cache(pool); |
1999 | kfree(pool->size_class); | 2531 | kfree(pool->size_class); |
2000 | kfree(pool->name); | 2532 | kfree(pool->name); |
2001 | kfree(pool); | 2533 | kfree(pool); |
@@ -2004,7 +2536,13 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool); | |||
2004 | 2536 | ||
2005 | static int __init zs_init(void) | 2537 | static int __init zs_init(void) |
2006 | { | 2538 | { |
2007 | int ret = zs_register_cpu_notifier(); | 2539 | int ret; |
2540 | |||
2541 | ret = zsmalloc_mount(); | ||
2542 | if (ret) | ||
2543 | goto out; | ||
2544 | |||
2545 | ret = zs_register_cpu_notifier(); | ||
2008 | 2546 | ||
2009 | if (ret) | 2547 | if (ret) |
2010 | goto notifier_fail; | 2548 | goto notifier_fail; |
@@ -2021,7 +2559,8 @@ static int __init zs_init(void) | |||
2021 | 2559 | ||
2022 | notifier_fail: | 2560 | notifier_fail: |
2023 | zs_unregister_cpu_notifier(); | 2561 | zs_unregister_cpu_notifier(); |
2024 | 2562 | zsmalloc_unmount(); | |
2563 | out: | ||
2025 | return ret; | 2564 | return ret; |
2026 | } | 2565 | } |
2027 | 2566 | ||
@@ -2030,6 +2569,7 @@ static void __exit zs_exit(void) | |||
2030 | #ifdef CONFIG_ZPOOL | 2569 | #ifdef CONFIG_ZPOOL |
2031 | zpool_unregister_driver(&zs_zpool_driver); | 2570 | zpool_unregister_driver(&zs_zpool_driver); |
2032 | #endif | 2571 | #endif |
2572 | zsmalloc_unmount(); | ||
2033 | zs_unregister_cpu_notifier(); | 2573 | zs_unregister_cpu_notifier(); |
2034 | 2574 | ||
2035 | zs_stat_exit(); | 2575 | zs_stat_exit(); |
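The init/exit changes above follow the usual goto-unwind convention: zs_init() mounts the pseudo filesystem first, later failures jump to labels that undo only what already succeeded, and zs_exit() tears things down in reverse order. A generic sketch of that pattern with placeholder steps:

#include <stdio.h>

static int step_mount(void)    { puts("mount");    return 0; }
static int step_notifier(void) { puts("notifier"); return 0; }
static void undo_mount(void)   { puts("unmount"); }

static int my_init(void)
{
        int ret;

        ret = step_mount();
        if (ret)
                goto out;              /* nothing to undo yet */

        ret = step_notifier();
        if (ret)
                goto err_unmount;      /* only the mount needs rollback */

        return 0;

err_unmount:
        undo_mount();
out:
        return ret;
}

int main(void)
{
        return my_init();
}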
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 735362c26c8e..f1dffe84f0d5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c | |||
@@ -769,6 +769,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) | |||
769 | lockdep_set_class(&sk->sk_receive_queue.lock, | 769 | lockdep_set_class(&sk->sk_receive_queue.lock, |
770 | &af_unix_sk_receive_queue_lock_key); | 770 | &af_unix_sk_receive_queue_lock_key); |
771 | 771 | ||
772 | sk->sk_allocation = GFP_KERNEL_ACCOUNT; | ||
772 | sk->sk_write_space = unix_write_space; | 773 | sk->sk_write_space = unix_write_space; |
773 | sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; | 774 | sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; |
774 | sk->sk_destruct = unix_sock_destructor; | 775 | sk->sk_destruct = unix_sock_destructor; |
diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index 0254f3ba0dba..19f5adfd877d 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter | |||
@@ -67,5 +67,5 @@ print("%-40s %7s %7s %+7s" % ("function", "old", "new", "delta")) | |||
67 | for d, n in delta: | 67 | for d, n in delta: |
68 | if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d)) | 68 | if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d)) |
69 | 69 | ||
70 | print("Total: Before=%d, After=%d, chg %f%%" % \ | 70 | print("Total: Before=%d, After=%d, chg %+.2f%%" % \ |
71 | (otot, ntot, (ntot - otot)*100/otot)) | 71 | (otot, ntot, (ntot - otot)*100.0/otot)) |
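The bloat-o-meter tweak above forces floating-point division and prints the change with an explicit sign and two decimals, so small deltas no longer round away to 0%. The same pitfall, shown in C for consistency with the other examples here:

#include <stdio.h>

int main(void)
{
        long otot = 100000, ntot = 100123;

        printf("integer division: %ld%%\n", (ntot - otot) * 100 / otot);
        printf("float division:   %+.2f%%\n", (ntot - otot) * 100.0 / otot);
        return 0;
}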
diff --git a/scripts/tags.sh b/scripts/tags.sh index f72f48f638ae..ed7eef24ef89 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh | |||
@@ -185,6 +185,9 @@ regex_c=( | |||
185 | '/\<CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/ClearPage\1/' | 185 | '/\<CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/ClearPage\1/' |
186 | '/\<__CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/__ClearPage\1/' | 186 | '/\<__CLEARPAGEFLAG_NOOP(\([[:alnum:]_]*\).*/__ClearPage\1/' |
187 | '/\<TESTCLEARFLAG_FALSE(\([[:alnum:]_]*\).*/TestClearPage\1/' | 187 | '/\<TESTCLEARFLAG_FALSE(\([[:alnum:]_]*\).*/TestClearPage\1/' |
188 | '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/Page\1/' | ||
189 | '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/__SetPage\1/' | ||
190 | '/^PAGE_MAPCOUNT_OPS(\([[:alnum:]_]*\).*/__ClearPage\1/' | ||
188 | '/^TASK_PFA_TEST([^,]*, *\([[:alnum:]_]*\))/task_\1/' | 191 | '/^TASK_PFA_TEST([^,]*, *\([[:alnum:]_]*\))/task_\1/' |
189 | '/^TASK_PFA_SET([^,]*, *\([[:alnum:]_]*\))/task_set_\1/' | 192 | '/^TASK_PFA_SET([^,]*, *\([[:alnum:]_]*\))/task_set_\1/' |
190 | '/^TASK_PFA_CLEAR([^,]*, *\([[:alnum:]_]*\))/task_clear_\1/' | 193 | '/^TASK_PFA_CLEAR([^,]*, *\([[:alnum:]_]*\))/task_clear_\1/' |
diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 77147b42d598..f1c055f3c243 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c | |||
@@ -79,12 +79,12 @@ static void add_list(char *buf, int len) | |||
79 | } | 79 | } |
80 | } | 80 | } |
81 | 81 | ||
82 | #define BUF_SIZE 1024 | 82 | #define BUF_SIZE (128 * 1024) |
83 | 83 | ||
84 | int main(int argc, char **argv) | 84 | int main(int argc, char **argv) |
85 | { | 85 | { |
86 | FILE *fin, *fout; | 86 | FILE *fin, *fout; |
87 | char buf[BUF_SIZE]; | 87 | char *buf; |
88 | int ret, i, count; | 88 | int ret, i, count; |
89 | struct block_list *list2; | 89 | struct block_list *list2; |
90 | struct stat st; | 90 | struct stat st; |
@@ -107,6 +107,11 @@ int main(int argc, char **argv) | |||
107 | max_size = st.st_size / 100; /* hack ... */ | 107 | max_size = st.st_size / 100; /* hack ... */ |
108 | 108 | ||
109 | list = malloc(max_size * sizeof(*list)); | 109 | list = malloc(max_size * sizeof(*list)); |
110 | buf = malloc(BUF_SIZE); | ||
111 | if (!list || !buf) { | ||
112 | printf("Out of memory\n"); | ||
113 | exit(1); | ||
114 | } | ||
110 | 115 | ||
111 | for ( ; ; ) { | 116 | for ( ; ; ) { |
112 | ret = read_block(buf, BUF_SIZE, fin); | 117 | ret = read_block(buf, BUF_SIZE, fin); |
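The page_owner_sort change above grows BUF_SIZE to 128 KiB, which is too large to keep as a local array, so the buffer moves to the heap and the allocation is checked together with the list. The same pattern in isolation:

#include <stdio.h>
#include <stdlib.h>

#define BUF_SIZE (128 * 1024)

int main(void)
{
        char *buf = malloc(BUF_SIZE);   /* instead of "char buf[BUF_SIZE];" on the stack */

        if (!buf) {
                fprintf(stderr, "Out of memory\n");
                return 1;
        }

        /* ... use buf for buffered reads ... */

        free(buf);
        return 0;
}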