path: root/mm
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig                  6
-rw-r--r--  mm/Makefile                 4
-rw-r--r--  mm/cleancache.c           276
-rw-r--r--  mm/cma.c                   62
-rw-r--r--  mm/cma.h                   24
-rw-r--r--  mm/cma_debug.c            205
-rw-r--r--  mm/compaction.c            75
-rw-r--r--  mm/filemap.c              130
-rw-r--r--  mm/gup.c                  128
-rw-r--r--  mm/huge_memory.c          136
-rw-r--r--  mm/hugetlb.c              250
-rw-r--r--  mm/internal.h               8
-rw-r--r--  mm/iov_iter.c             753
-rw-r--r--  mm/kasan/kasan.c           27
-rw-r--r--  mm/ksm.c                   10
-rw-r--r--  mm/memblock.c              22
-rw-r--r--  mm/memcontrol.c           245
-rw-r--r--  mm/memory-failure.c       122
-rw-r--r--  mm/memory.c               436
-rw-r--r--  mm/memory_hotplug.c        50
-rw-r--r--  mm/mempolicy.c              6
-rw-r--r--  mm/mempool.c              127
-rw-r--r--  mm/memtest.c              118
-rw-r--r--  mm/migrate.c               40
-rw-r--r--  mm/mlock.c                135
-rw-r--r--  mm/mmap.c                  29
-rw-r--r--  mm/mprotect.c               3
-rw-r--r--  mm/mremap.c                35
-rw-r--r--  mm/nommu.c                  5
-rw-r--r--  mm/oom_kill.c               9
-rw-r--r--  mm/page-writeback.c        29
-rw-r--r--  mm/page_alloc.c           256
-rw-r--r--  mm/page_io.c                7
-rw-r--r--  mm/page_isolation.c         1
-rw-r--r--  mm/pagewalk.c               9
-rw-r--r--  mm/percpu.c                 4
-rw-r--r--  mm/process_vm_access.c     35
-rw-r--r--  mm/rmap.c                  13
-rw-r--r--  mm/shmem.c                 34
-rw-r--r--  mm/slab.c                  22
-rw-r--r--  mm/slob.c                   3
-rw-r--r--  mm/slub.c                  38
-rw-r--r--  mm/swap.c                  34
-rw-r--r--  mm/swap_state.c             2
-rw-r--r--  mm/swapfile.c               2
-rw-r--r--  mm/truncate.c              39
-rw-r--r--  mm/util.c                  41
-rw-r--r--  mm/vmalloc.c              104
-rw-r--r--  mm/zsmalloc.c             971
49 files changed, 2807 insertions, 2313 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a03131b6ba8e..390214da4546 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -517,6 +517,12 @@ config CMA_DEBUG
 	  processing calls such as dma_alloc_from_contiguous().
 	  This option does not affect warning and error messages.
 
+config CMA_DEBUGFS
+	bool "CMA debugfs interface"
+	depends on CMA && DEBUG_FS
+	help
+	  Turns on the DebugFS interface for CMA.
+
 config CMA_AREAS
 	int "Maximum count of the CMA areas"
 	depends on CMA
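
The new CMA_DEBUGFS option is only offered when both CMA and DEBUG_FS are already enabled. As an illustrative config fragment (not part of the patch), a kernel that wants the interface would carry something like:

	CONFIG_DEBUG_FS=y
	CONFIG_CMA=y
	CONFIG_CMA_DEBUGFS=y
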
diff --git a/mm/Makefile b/mm/Makefile
index 3c1caa2693bd..98c4eaeabdcb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -21,7 +21,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
 			   compaction.o vmacache.o \
 			   interval_tree.o list_lru.o workingset.o \
-			   iov_iter.o debug.o $(mmu-y)
+			   debug.o $(mmu-y)
 
 obj-y += init-mm.o
 
@@ -55,6 +55,7 @@ obj-$(CONFIG_KMEMCHECK)	+= kmemcheck.o
 obj-$(CONFIG_KASAN)	+= kasan/
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+obj-$(CONFIG_MEMTEST)		+= memtest.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE)	+= huge_memory.o
@@ -76,3 +77,4 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA)	+= cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
 obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
+obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 053bcd8f12fb..8fc50811119b 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -19,7 +19,7 @@
19#include <linux/cleancache.h> 19#include <linux/cleancache.h>
20 20
21/* 21/*
22 * cleancache_ops is set by cleancache_ops_register to contain the pointers 22 * cleancache_ops is set by cleancache_register_ops to contain the pointers
23 * to the cleancache "backend" implementation functions. 23 * to the cleancache "backend" implementation functions.
24 */ 24 */
25static struct cleancache_ops *cleancache_ops __read_mostly; 25static struct cleancache_ops *cleancache_ops __read_mostly;
@@ -34,145 +34,107 @@ static u64 cleancache_failed_gets;
34static u64 cleancache_puts; 34static u64 cleancache_puts;
35static u64 cleancache_invalidates; 35static u64 cleancache_invalidates;
36 36
37/* 37static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
38 * When no backend is registered all calls to init_fs and init_shared_fs 38{
39 * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or 39 switch (sb->cleancache_poolid) {
40 * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array 40 case CLEANCACHE_NO_BACKEND:
41 * [shared_|]fs_poolid_map) are given to the respective super block 41 __cleancache_init_fs(sb);
42 * (sb->cleancache_poolid) and no tmem_pools are created. When a backend 42 break;
43 * registers with cleancache the previous calls to init_fs and init_shared_fs 43 case CLEANCACHE_NO_BACKEND_SHARED:
44 * are executed to create tmem_pools and set the respective poolids. While no 44 __cleancache_init_shared_fs(sb);
45 * backend is registered all "puts", "gets" and "flushes" are ignored or failed. 45 break;
46 */ 46 }
47#define MAX_INITIALIZABLE_FS 32 47}
48#define FAKE_FS_POOLID_OFFSET 1000
49#define FAKE_SHARED_FS_POOLID_OFFSET 2000
50
51#define FS_NO_BACKEND (-1)
52#define FS_UNKNOWN (-2)
53static int fs_poolid_map[MAX_INITIALIZABLE_FS];
54static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS];
55static char *uuids[MAX_INITIALIZABLE_FS];
56/*
57 * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads
58 * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple
59 * threads calling mount (and ending up in __cleancache_init_[shared|]fs).
60 */
61static DEFINE_MUTEX(poolid_mutex);
62/*
63 * When set to false (default) all calls to the cleancache functions, except
64 * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded
65 * by the if (!cleancache_ops) return. This means multiple threads (from
66 * different filesystems) will be checking cleancache_ops. The usage of a
67 * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are
68 * OK if the time between the backend's have been initialized (and
69 * cleancache_ops has been set to not NULL) and when the filesystems start
70 * actually calling the backends. The inverse (when unloading) is obviously
71 * not good - but this shim does not do that (yet).
72 */
73
74/*
75 * The backends and filesystems work all asynchronously. This is b/c the
76 * backends can be built as modules.
77 * The usual sequence of events is:
78 * a) mount / -> __cleancache_init_fs is called. We set the
79 * [shared_|]fs_poolid_map and uuids for.
80 *
81 * b). user does I/Os -> we call the rest of __cleancache_* functions
82 * which return immediately as cleancache_ops is false.
83 *
84 * c). modprobe zcache -> cleancache_register_ops. We init the backend
85 * and set cleancache_ops to true, and for any fs_poolid_map
86 * (which is set by __cleancache_init_fs) we initialize the poolid.
87 *
88 * d). user does I/Os -> now that cleancache_ops is true all the
89 * __cleancache_* functions can call the backend. They all check
90 * that fs_poolid_map is valid and if so invoke the backend.
91 *
92 * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is
93 * reset (which is the second check in the __cleancache_* ops
94 * to call the backend).
95 *
96 * The sequence of event could also be c), followed by a), and d). and e). The
97 * c) would not happen anymore. There is also the chance of c), and one thread
98 * doing a) + d), and another doing e). For that case we depend on the
99 * filesystem calling __cleancache_invalidate_fs in the proper sequence (so
100 * that it handles all I/Os before it invalidates the fs (which is last part
101 * of unmounting process).
102 *
103 * Note: The acute reader will notice that there is no "rmmod zcache" case.
104 * This is b/c the functionality for that is not yet implemented and when
105 * done, will require some extra locking not yet devised.
106 */
107 48
108/* 49/*
109 * Register operations for cleancache, returning previous thus allowing 50 * Register operations for cleancache. Returns 0 on success.
110 * detection of multiple backends and possible nesting.
111 */ 51 */
112struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops) 52int cleancache_register_ops(struct cleancache_ops *ops)
113{ 53{
114 struct cleancache_ops *old = cleancache_ops; 54 if (cmpxchg(&cleancache_ops, NULL, ops))
115 int i; 55 return -EBUSY;
116 56
117 mutex_lock(&poolid_mutex);
118 for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
119 if (fs_poolid_map[i] == FS_NO_BACKEND)
120 fs_poolid_map[i] = ops->init_fs(PAGE_SIZE);
121 if (shared_fs_poolid_map[i] == FS_NO_BACKEND)
122 shared_fs_poolid_map[i] = ops->init_shared_fs
123 (uuids[i], PAGE_SIZE);
124 }
125 /* 57 /*
126 * We MUST set cleancache_ops _after_ we have called the backends 58 * A cleancache backend can be built as a module and hence loaded after
127 * init_fs or init_shared_fs functions. Otherwise the compiler might 59 * a cleancache enabled filesystem has called cleancache_init_fs. To
128 * re-order where cleancache_ops is set in this function. 60 * handle such a scenario, here we call ->init_fs or ->init_shared_fs
61 * for each active super block. To differentiate between local and
62 * shared filesystems, we temporarily initialize sb->cleancache_poolid
63 * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED
64 * respectively in case there is no backend registered at the time
65 * cleancache_init_fs or cleancache_init_shared_fs is called.
66 *
67 * Since filesystems can be mounted concurrently with cleancache
68 * backend registration, we have to be careful to guarantee that all
69 * cleancache enabled filesystems that has been mounted by the time
70 * cleancache_register_ops is called has got and all mounted later will
71 * get cleancache_poolid. This is assured by the following statements
72 * tied together:
73 *
74 * a) iterate_supers skips only those super blocks that has started
75 * ->kill_sb
76 *
77 * b) if iterate_supers encounters a super block that has not finished
78 * ->mount yet, it waits until it is finished
79 *
80 * c) cleancache_init_fs is called from ->mount and
81 * cleancache_invalidate_fs is called from ->kill_sb
82 *
83 * d) we call iterate_supers after cleancache_ops has been set
84 *
85 * From a) it follows that if iterate_supers skips a super block, then
86 * either the super block is already dead, in which case we do not need
87 * to bother initializing cleancache for it, or it was mounted after we
88 * initiated iterate_supers. In the latter case, it must have seen
89 * cleancache_ops set according to d) and initialized cleancache from
90 * ->mount by itself according to c). This proves that we call
91 * ->init_fs at least once for each active super block.
92 *
93 * From b) and c) it follows that if iterate_supers encounters a super
94 * block that has already started ->init_fs, it will wait until ->mount
95 * and hence ->init_fs has finished, then check cleancache_poolid, see
96 * that it has already been set and therefore do nothing. This proves
97 * that we call ->init_fs no more than once for each super block.
98 *
99 * Combined together, the last two paragraphs prove the function
100 * correctness.
101 *
102 * Note that various cleancache callbacks may proceed before this
103 * function is called or even concurrently with it, but since
104 * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop
105 * until the corresponding ->init_fs has been actually called and
106 * cleancache_ops has been set.
129 */ 107 */
130 barrier(); 108 iterate_supers(cleancache_register_ops_sb, NULL);
131 cleancache_ops = ops; 109 return 0;
132 mutex_unlock(&poolid_mutex);
133 return old;
134} 110}
135EXPORT_SYMBOL(cleancache_register_ops); 111EXPORT_SYMBOL(cleancache_register_ops);
136 112
137/* Called by a cleancache-enabled filesystem at time of mount */ 113/* Called by a cleancache-enabled filesystem at time of mount */
138void __cleancache_init_fs(struct super_block *sb) 114void __cleancache_init_fs(struct super_block *sb)
139{ 115{
140 int i; 116 int pool_id = CLEANCACHE_NO_BACKEND;
141 117
142 mutex_lock(&poolid_mutex); 118 if (cleancache_ops) {
143 for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { 119 pool_id = cleancache_ops->init_fs(PAGE_SIZE);
144 if (fs_poolid_map[i] == FS_UNKNOWN) { 120 if (pool_id < 0)
145 sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET; 121 pool_id = CLEANCACHE_NO_POOL;
146 if (cleancache_ops)
147 fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE);
148 else
149 fs_poolid_map[i] = FS_NO_BACKEND;
150 break;
151 }
152 } 122 }
153 mutex_unlock(&poolid_mutex); 123 sb->cleancache_poolid = pool_id;
154} 124}
155EXPORT_SYMBOL(__cleancache_init_fs); 125EXPORT_SYMBOL(__cleancache_init_fs);
156 126
157/* Called by a cleancache-enabled clustered filesystem at time of mount */ 127/* Called by a cleancache-enabled clustered filesystem at time of mount */
158void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) 128void __cleancache_init_shared_fs(struct super_block *sb)
159{ 129{
160 int i; 130 int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
161 131
162 mutex_lock(&poolid_mutex); 132 if (cleancache_ops) {
163 for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { 133 pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE);
164 if (shared_fs_poolid_map[i] == FS_UNKNOWN) { 134 if (pool_id < 0)
165 sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; 135 pool_id = CLEANCACHE_NO_POOL;
166 uuids[i] = uuid;
167 if (cleancache_ops)
168 shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs
169 (uuid, PAGE_SIZE);
170 else
171 shared_fs_poolid_map[i] = FS_NO_BACKEND;
172 break;
173 }
174 } 136 }
175 mutex_unlock(&poolid_mutex); 137 sb->cleancache_poolid = pool_id;
176} 138}
177EXPORT_SYMBOL(__cleancache_init_shared_fs); 139EXPORT_SYMBOL(__cleancache_init_shared_fs);
178 140
@@ -202,19 +164,6 @@ static int cleancache_get_key(struct inode *inode,
202} 164}
203 165
204/* 166/*
205 * Returns a pool_id that is associated with a given fake poolid.
206 */
207static int get_poolid_from_fake(int fake_pool_id)
208{
209 if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET)
210 return shared_fs_poolid_map[fake_pool_id -
211 FAKE_SHARED_FS_POOLID_OFFSET];
212 else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET)
213 return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET];
214 return FS_NO_BACKEND;
215}
216
217/*
218 * "Get" data from cleancache associated with the poolid/inode/index 167 * "Get" data from cleancache associated with the poolid/inode/index
219 * that were specified when the data was put to cleanache and, if 168 * that were specified when the data was put to cleanache and, if
220 * successful, use it to fill the specified page with data and return 0. 169 * successful, use it to fill the specified page with data and return 0.
@@ -229,7 +178,6 @@ int __cleancache_get_page(struct page *page)
229{ 178{
230 int ret = -1; 179 int ret = -1;
231 int pool_id; 180 int pool_id;
232 int fake_pool_id;
233 struct cleancache_filekey key = { .u.key = { 0 } }; 181 struct cleancache_filekey key = { .u.key = { 0 } };
234 182
235 if (!cleancache_ops) { 183 if (!cleancache_ops) {
@@ -238,17 +186,14 @@ int __cleancache_get_page(struct page *page)
238 } 186 }
239 187
240 VM_BUG_ON_PAGE(!PageLocked(page), page); 188 VM_BUG_ON_PAGE(!PageLocked(page), page);
241 fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; 189 pool_id = page->mapping->host->i_sb->cleancache_poolid;
242 if (fake_pool_id < 0) 190 if (pool_id < 0)
243 goto out; 191 goto out;
244 pool_id = get_poolid_from_fake(fake_pool_id);
245 192
246 if (cleancache_get_key(page->mapping->host, &key) < 0) 193 if (cleancache_get_key(page->mapping->host, &key) < 0)
247 goto out; 194 goto out;
248 195
249 if (pool_id >= 0) 196 ret = cleancache_ops->get_page(pool_id, key, page->index, page);
250 ret = cleancache_ops->get_page(pool_id,
251 key, page->index, page);
252 if (ret == 0) 197 if (ret == 0)
253 cleancache_succ_gets++; 198 cleancache_succ_gets++;
254 else 199 else
@@ -271,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page);
271void __cleancache_put_page(struct page *page) 216void __cleancache_put_page(struct page *page)
272{ 217{
273 int pool_id; 218 int pool_id;
274 int fake_pool_id;
275 struct cleancache_filekey key = { .u.key = { 0 } }; 219 struct cleancache_filekey key = { .u.key = { 0 } };
276 220
277 if (!cleancache_ops) { 221 if (!cleancache_ops) {
@@ -280,12 +224,7 @@ void __cleancache_put_page(struct page *page)
280 } 224 }
281 225
282 VM_BUG_ON_PAGE(!PageLocked(page), page); 226 VM_BUG_ON_PAGE(!PageLocked(page), page);
283 fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; 227 pool_id = page->mapping->host->i_sb->cleancache_poolid;
284 if (fake_pool_id < 0)
285 return;
286
287 pool_id = get_poolid_from_fake(fake_pool_id);
288
289 if (pool_id >= 0 && 228 if (pool_id >= 0 &&
290 cleancache_get_key(page->mapping->host, &key) >= 0) { 229 cleancache_get_key(page->mapping->host, &key) >= 0) {
291 cleancache_ops->put_page(pool_id, key, page->index, page); 230 cleancache_ops->put_page(pool_id, key, page->index, page);
@@ -306,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping,
306 struct page *page) 245 struct page *page)
307{ 246{
308 /* careful... page->mapping is NULL sometimes when this is called */ 247 /* careful... page->mapping is NULL sometimes when this is called */
309 int pool_id; 248 int pool_id = mapping->host->i_sb->cleancache_poolid;
310 int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
311 struct cleancache_filekey key = { .u.key = { 0 } }; 249 struct cleancache_filekey key = { .u.key = { 0 } };
312 250
313 if (!cleancache_ops) 251 if (!cleancache_ops)
314 return; 252 return;
315 253
316 if (fake_pool_id >= 0) { 254 if (pool_id >= 0) {
317 pool_id = get_poolid_from_fake(fake_pool_id);
318 if (pool_id < 0)
319 return;
320
321 VM_BUG_ON_PAGE(!PageLocked(page), page); 255 VM_BUG_ON_PAGE(!PageLocked(page), page);
322 if (cleancache_get_key(mapping->host, &key) >= 0) { 256 if (cleancache_get_key(mapping->host, &key) >= 0) {
323 cleancache_ops->invalidate_page(pool_id, 257 cleancache_ops->invalidate_page(pool_id,
@@ -339,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page);
339 */ 273 */
340void __cleancache_invalidate_inode(struct address_space *mapping) 274void __cleancache_invalidate_inode(struct address_space *mapping)
341{ 275{
342 int pool_id; 276 int pool_id = mapping->host->i_sb->cleancache_poolid;
343 int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
344 struct cleancache_filekey key = { .u.key = { 0 } }; 277 struct cleancache_filekey key = { .u.key = { 0 } };
345 278
346 if (!cleancache_ops) 279 if (!cleancache_ops)
347 return; 280 return;
348 281
349 if (fake_pool_id < 0)
350 return;
351
352 pool_id = get_poolid_from_fake(fake_pool_id);
353
354 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) 282 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
355 cleancache_ops->invalidate_inode(pool_id, key); 283 cleancache_ops->invalidate_inode(pool_id, key);
356} 284}
@@ -363,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode);
363 */ 291 */
364void __cleancache_invalidate_fs(struct super_block *sb) 292void __cleancache_invalidate_fs(struct super_block *sb)
365{ 293{
366 int index; 294 int pool_id;
367 int fake_pool_id = sb->cleancache_poolid;
368 int old_poolid = fake_pool_id;
369 295
370 mutex_lock(&poolid_mutex); 296 pool_id = sb->cleancache_poolid;
371 if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) { 297 sb->cleancache_poolid = CLEANCACHE_NO_POOL;
372 index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET; 298
373 old_poolid = shared_fs_poolid_map[index]; 299 if (cleancache_ops && pool_id >= 0)
374 shared_fs_poolid_map[index] = FS_UNKNOWN; 300 cleancache_ops->invalidate_fs(pool_id);
375 uuids[index] = NULL;
376 } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) {
377 index = fake_pool_id - FAKE_FS_POOLID_OFFSET;
378 old_poolid = fs_poolid_map[index];
379 fs_poolid_map[index] = FS_UNKNOWN;
380 }
381 sb->cleancache_poolid = -1;
382 if (cleancache_ops)
383 cleancache_ops->invalidate_fs(old_poolid);
384 mutex_unlock(&poolid_mutex);
385} 301}
386EXPORT_SYMBOL(__cleancache_invalidate_fs); 302EXPORT_SYMBOL(__cleancache_invalidate_fs);
387 303
388static int __init init_cleancache(void) 304static int __init init_cleancache(void)
389{ 305{
390 int i;
391
392#ifdef CONFIG_DEBUG_FS 306#ifdef CONFIG_DEBUG_FS
393 struct dentry *root = debugfs_create_dir("cleancache", NULL); 307 struct dentry *root = debugfs_create_dir("cleancache", NULL);
394 if (root == NULL) 308 if (root == NULL)
@@ -400,10 +314,6 @@ static int __init init_cleancache(void)
400 debugfs_create_u64("invalidates", S_IRUGO, 314 debugfs_create_u64("invalidates", S_IRUGO,
401 root, &cleancache_invalidates); 315 root, &cleancache_invalidates);
402#endif 316#endif
403 for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
404 fs_poolid_map[i] = FS_UNKNOWN;
405 shared_fs_poolid_map[i] = FS_UNKNOWN;
406 }
407 return 0; 317 return 0;
408} 318}
409module_init(init_cleancache) 319module_init(init_cleancache)
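
With the rework above, cleancache_register_ops() no longer returns the previous ops pointer: it claims the single backend slot with cmpxchg(), returns -EBUSY if another backend is already registered, and then walks the existing super blocks via iterate_supers() to run the deferred ->init_fs()/->init_shared_fs() calls itself. A minimal, illustrative sketch of how a backend module might register under the new contract (my_backend_ops and my_backend_init are hypothetical names; the callback prototypes are the ones declared in include/linux/cleancache.h):

#include <linux/module.h>
#include <linux/cleancache.h>

static struct cleancache_ops my_backend_ops = {
	/*
	 * .init_fs, .init_shared_fs, .get_page, .put_page,
	 * .invalidate_page, .invalidate_inode and .invalidate_fs
	 * would be filled in with the backend's callbacks here.
	 */
};

static int __init my_backend_init(void)
{
	int ret;

	/* Claims the single cleancache backend slot; -EBUSY if taken. */
	ret = cleancache_register_ops(&my_backend_ops);
	if (ret)
		return ret;
	pr_info("my_backend: registered as cleancache backend\n");
	return 0;
}
module_init(my_backend_init);
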
diff --git a/mm/cma.c b/mm/cma.c
index 75016fd1de90..3a7a67b93394 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -23,6 +23,7 @@
23# define DEBUG 23# define DEBUG
24#endif 24#endif
25#endif 25#endif
26#define CREATE_TRACE_POINTS
26 27
27#include <linux/memblock.h> 28#include <linux/memblock.h>
28#include <linux/err.h> 29#include <linux/err.h>
@@ -34,59 +35,54 @@
34#include <linux/cma.h> 35#include <linux/cma.h>
35#include <linux/highmem.h> 36#include <linux/highmem.h>
36#include <linux/io.h> 37#include <linux/io.h>
38#include <trace/events/cma.h>
37 39
38struct cma { 40#include "cma.h"
39 unsigned long base_pfn;
40 unsigned long count;
41 unsigned long *bitmap;
42 unsigned int order_per_bit; /* Order of pages represented by one bit */
43 struct mutex lock;
44};
45 41
46static struct cma cma_areas[MAX_CMA_AREAS]; 42struct cma cma_areas[MAX_CMA_AREAS];
47static unsigned cma_area_count; 43unsigned cma_area_count;
48static DEFINE_MUTEX(cma_mutex); 44static DEFINE_MUTEX(cma_mutex);
49 45
50phys_addr_t cma_get_base(struct cma *cma) 46phys_addr_t cma_get_base(const struct cma *cma)
51{ 47{
52 return PFN_PHYS(cma->base_pfn); 48 return PFN_PHYS(cma->base_pfn);
53} 49}
54 50
55unsigned long cma_get_size(struct cma *cma) 51unsigned long cma_get_size(const struct cma *cma)
56{ 52{
57 return cma->count << PAGE_SHIFT; 53 return cma->count << PAGE_SHIFT;
58} 54}
59 55
60static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) 56static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
57 int align_order)
61{ 58{
62 if (align_order <= cma->order_per_bit) 59 if (align_order <= cma->order_per_bit)
63 return 0; 60 return 0;
64 return (1UL << (align_order - cma->order_per_bit)) - 1; 61 return (1UL << (align_order - cma->order_per_bit)) - 1;
65} 62}
66 63
67static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) 64/*
65 * Find a PFN aligned to the specified order and return an offset represented in
66 * order_per_bits.
67 */
68static unsigned long cma_bitmap_aligned_offset(const struct cma *cma,
69 int align_order)
68{ 70{
69 unsigned int alignment;
70
71 if (align_order <= cma->order_per_bit) 71 if (align_order <= cma->order_per_bit)
72 return 0; 72 return 0;
73 alignment = 1UL << (align_order - cma->order_per_bit);
74 return ALIGN(cma->base_pfn, alignment) -
75 (cma->base_pfn >> cma->order_per_bit);
76}
77 73
78static unsigned long cma_bitmap_maxno(struct cma *cma) 74 return (ALIGN(cma->base_pfn, (1UL << align_order))
79{ 75 - cma->base_pfn) >> cma->order_per_bit;
80 return cma->count >> cma->order_per_bit;
81} 76}
82 77
83static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, 78static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma,
84 unsigned long pages) 79 unsigned long pages)
85{ 80{
86 return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; 81 return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit;
87} 82}
88 83
89static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) 84static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
85 unsigned int count)
90{ 86{
91 unsigned long bitmap_no, bitmap_count; 87 unsigned long bitmap_no, bitmap_count;
92 88
@@ -132,6 +128,12 @@ static int __init cma_activate_area(struct cma *cma)
132 } while (--i); 128 } while (--i);
133 129
134 mutex_init(&cma->lock); 130 mutex_init(&cma->lock);
131
132#ifdef CONFIG_CMA_DEBUGFS
133 INIT_HLIST_HEAD(&cma->mem_head);
134 spin_lock_init(&cma->mem_head_lock);
135#endif
136
135 return 0; 137 return 0;
136 138
137err: 139err:
@@ -165,7 +167,8 @@ core_initcall(cma_init_reserved_areas);
165 * This function creates custom contiguous area from already reserved memory. 167 * This function creates custom contiguous area from already reserved memory.
166 */ 168 */
167int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, 169int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
168 int order_per_bit, struct cma **res_cma) 170 unsigned int order_per_bit,
171 struct cma **res_cma)
169{ 172{
170 struct cma *cma; 173 struct cma *cma;
171 phys_addr_t alignment; 174 phys_addr_t alignment;
@@ -356,7 +359,7 @@ err:
356 * This function allocates part of contiguous memory on specific 359 * This function allocates part of contiguous memory on specific
357 * contiguous memory area. 360 * contiguous memory area.
358 */ 361 */
359struct page *cma_alloc(struct cma *cma, int count, unsigned int align) 362struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align)
360{ 363{
361 unsigned long mask, offset, pfn, start = 0; 364 unsigned long mask, offset, pfn, start = 0;
362 unsigned long bitmap_maxno, bitmap_no, bitmap_count; 365 unsigned long bitmap_maxno, bitmap_no, bitmap_count;
@@ -413,6 +416,8 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
413 start = bitmap_no + mask + 1; 416 start = bitmap_no + mask + 1;
414 } 417 }
415 418
419 trace_cma_alloc(page ? pfn : -1UL, page, count, align);
420
416 pr_debug("%s(): returned %p\n", __func__, page); 421 pr_debug("%s(): returned %p\n", __func__, page);
417 return page; 422 return page;
418} 423}
@@ -427,7 +432,7 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align)
427 * It returns false when provided pages do not belong to contiguous area and 432 * It returns false when provided pages do not belong to contiguous area and
428 * true otherwise. 433 * true otherwise.
429 */ 434 */
430bool cma_release(struct cma *cma, struct page *pages, int count) 435bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
431{ 436{
432 unsigned long pfn; 437 unsigned long pfn;
433 438
@@ -445,6 +450,7 @@ bool cma_release(struct cma *cma, struct page *pages, int count)
445 450
446 free_contig_range(pfn, count); 451 free_contig_range(pfn, count);
447 cma_clear_bitmap(cma, pfn, count); 452 cma_clear_bitmap(cma, pfn, count);
453 trace_cma_release(pfn, pages, count);
448 454
449 return true; 455 return true;
450} 456}
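
Besides the new tracepoints, the cma_alloc()/cma_release() prototypes above now take unsigned counts and const-qualified arguments. An illustrative kernel-side sketch of a caller using the updated interface (my_cma_selftest is a hypothetical function; the struct cma pointer would come from wherever the driver reserved its area, for example cma_init_reserved_mem()):

#include <linux/cma.h>
#include <linux/mm.h>
#include <linux/printk.h>

static int my_cma_selftest(struct cma *cma)
{
	unsigned int count = 16;		/* sixteen order-0 pages */
	struct page *pages;

	pages = cma_alloc(cma, count, 0);	/* align = order 0 */
	if (!pages)
		return -ENOMEM;

	pr_info("cma selftest: got pfn %lu\n", page_to_pfn(pages));

	if (!cma_release(cma, pages, count))
		pr_warn("cma selftest: range not part of this area\n");
	return 0;
}

The new trace_cma_alloc()/trace_cma_release() events fire inside these two calls, so a sequence like this becomes visible to ftrace without any driver changes.
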
diff --git a/mm/cma.h b/mm/cma.h
new file mode 100644
index 000000000000..1132d733556d
--- /dev/null
+++ b/mm/cma.h
@@ -0,0 +1,24 @@
+#ifndef __MM_CMA_H__
+#define __MM_CMA_H__
+
+struct cma {
+	unsigned long	base_pfn;
+	unsigned long	count;
+	unsigned long	*bitmap;
+	unsigned int order_per_bit; /* Order of pages represented by one bit */
+	struct mutex	lock;
+#ifdef CONFIG_CMA_DEBUGFS
+	struct hlist_head mem_head;
+	spinlock_t mem_head_lock;
+#endif
+};
+
+extern struct cma cma_areas[MAX_CMA_AREAS];
+extern unsigned cma_area_count;
+
+static unsigned long cma_bitmap_maxno(struct cma *cma)
+{
+	return cma->count >> cma->order_per_bit;
+}
+
+#endif
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
new file mode 100644
index 000000000000..7621ee34daa0
--- /dev/null
+++ b/mm/cma_debug.c
@@ -0,0 +1,205 @@
+/*
+ * CMA DebugFS Interface
+ *
+ * Copyright (c) 2015 Sasha Levin <sasha.levin@oracle.com>
+ */
+
+
+#include <linux/debugfs.h>
+#include <linux/cma.h>
+#include <linux/list.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm_types.h>
+
+#include "cma.h"
+
+struct cma_mem {
+	struct hlist_node node;
+	struct page *p;
+	unsigned long n;
+};
+
+static struct dentry *cma_debugfs_root;
+
+static int cma_debugfs_get(void *data, u64 *val)
+{
+	unsigned long *p = data;
+
+	*val = *p;
+
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n");
+
+static int cma_used_get(void *data, u64 *val)
+{
+	struct cma *cma = data;
+	unsigned long used;
+
+	mutex_lock(&cma->lock);
+	/* pages counter is smaller than sizeof(int) */
+	used = bitmap_weight(cma->bitmap, (int)cma->count);
+	mutex_unlock(&cma->lock);
+	*val = (u64)used << cma->order_per_bit;
+
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n");
+
+static int cma_maxchunk_get(void *data, u64 *val)
+{
+	struct cma *cma = data;
+	unsigned long maxchunk = 0;
+	unsigned long start, end = 0;
+
+	mutex_lock(&cma->lock);
+	for (;;) {
+		start = find_next_zero_bit(cma->bitmap, cma->count, end);
+		if (start >= cma->count)
+			break;
+		end = find_next_bit(cma->bitmap, cma->count, start);
+		maxchunk = max(end - start, maxchunk);
+	}
+	mutex_unlock(&cma->lock);
+	*val = (u64)maxchunk << cma->order_per_bit;
+
+	return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n");
+
+static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem)
+{
+	spin_lock(&cma->mem_head_lock);
+	hlist_add_head(&mem->node, &cma->mem_head);
+	spin_unlock(&cma->mem_head_lock);
+}
+
+static struct cma_mem *cma_get_entry_from_list(struct cma *cma)
+{
+	struct cma_mem *mem = NULL;
+
+	spin_lock(&cma->mem_head_lock);
+	if (!hlist_empty(&cma->mem_head)) {
+		mem = hlist_entry(cma->mem_head.first, struct cma_mem, node);
+		hlist_del_init(&mem->node);
+	}
+	spin_unlock(&cma->mem_head_lock);
+
+	return mem;
+}
+
+static int cma_free_mem(struct cma *cma, int count)
+{
+	struct cma_mem *mem = NULL;
+
+	while (count) {
+		mem = cma_get_entry_from_list(cma);
+		if (mem == NULL)
+			return 0;
+
+		if (mem->n <= count) {
+			cma_release(cma, mem->p, mem->n);
+			count -= mem->n;
+			kfree(mem);
+		} else if (cma->order_per_bit == 0) {
+			cma_release(cma, mem->p, count);
+			mem->p += count;
+			mem->n -= count;
+			count = 0;
+			cma_add_to_cma_mem_list(cma, mem);
+		} else {
+			pr_debug("cma: cannot release partial block when order_per_bit != 0\n");
+			cma_add_to_cma_mem_list(cma, mem);
+			break;
+		}
+	}
+
+	return 0;
+
+}
+
+static int cma_free_write(void *data, u64 val)
+{
+	int pages = val;
+	struct cma *cma = data;
+
+	return cma_free_mem(cma, pages);
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n");
+
+static int cma_alloc_mem(struct cma *cma, int count)
+{
+	struct cma_mem *mem;
+	struct page *p;
+
+	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+	if (!mem)
+		return -ENOMEM;
+
+	p = cma_alloc(cma, count, 0);
+	if (!p) {
+		kfree(mem);
+		return -ENOMEM;
+	}
+
+	mem->p = p;
+	mem->n = count;
+
+	cma_add_to_cma_mem_list(cma, mem);
+
+	return 0;
+}
+
+static int cma_alloc_write(void *data, u64 val)
+{
+	int pages = val;
+	struct cma *cma = data;
+
+	return cma_alloc_mem(cma, pages);
+}
+DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
+
+static void cma_debugfs_add_one(struct cma *cma, int idx)
+{
+	struct dentry *tmp;
+	char name[16];
+	int u32s;
+
+	sprintf(name, "cma-%d", idx);
+
+	tmp = debugfs_create_dir(name, cma_debugfs_root);
+
+	debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma,
+				&cma_alloc_fops);
+
+	debugfs_create_file("free", S_IWUSR, cma_debugfs_root, cma,
+				&cma_free_fops);
+
+	debugfs_create_file("base_pfn", S_IRUGO, tmp,
+				&cma->base_pfn, &cma_debugfs_fops);
+	debugfs_create_file("count", S_IRUGO, tmp,
+				&cma->count, &cma_debugfs_fops);
+	debugfs_create_file("order_per_bit", S_IRUGO, tmp,
+				&cma->order_per_bit, &cma_debugfs_fops);
+	debugfs_create_file("used", S_IRUGO, tmp, cma, &cma_used_fops);
+	debugfs_create_file("maxchunk", S_IRUGO, tmp, cma, &cma_maxchunk_fops);
+
+	u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));
+	debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s);
+}
+
+static int __init cma_debugfs_init(void)
+{
+	int i;
+
+	cma_debugfs_root = debugfs_create_dir("cma", NULL);
+	if (!cma_debugfs_root)
+		return -ENOMEM;
+
+	for (i = 0; i < cma_area_count; i++)
+		cma_debugfs_add_one(&cma_areas[i], i);
+
+	return 0;
+}
+late_initcall(cma_debugfs_init);
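
Taken together with the Kconfig and Makefile hunks, this creates a debugfs tree rooted at cma/ (normally mounted under /sys/kernel/debug) with one cma-<idx> directory per area exposing base_pfn, count, order_per_bit, used, maxchunk and the raw bitmap. Note that in this version the write-only alloc and free trigger files are created in the top-level cma/ directory (cma_debugfs_root) rather than in the per-area directory. A small illustrative userspace reader, assuming debugfs is mounted at /sys/kernel/debug and at least one CMA area exists:

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/cma/cma-0/used";
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	/* cma_used_get() reports the value in pages (bits << order_per_bit). */
	if (fgets(buf, sizeof(buf), f))
		printf("cma-0 pages in use: %s", buf);
	fclose(f);
	return 0;
}
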
diff --git a/mm/compaction.c b/mm/compaction.c
index 8c0d9459b54a..018f08da99a2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -391,28 +391,6 @@ static inline bool compact_should_abort(struct compact_control *cc)
391 return false; 391 return false;
392} 392}
393 393
394/* Returns true if the page is within a block suitable for migration to */
395static bool suitable_migration_target(struct page *page)
396{
397 /* If the page is a large free page, then disallow migration */
398 if (PageBuddy(page)) {
399 /*
400 * We are checking page_order without zone->lock taken. But
401 * the only small danger is that we skip a potentially suitable
402 * pageblock, so it's not worth to check order for valid range.
403 */
404 if (page_order_unsafe(page) >= pageblock_order)
405 return false;
406 }
407
408 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
409 if (migrate_async_suitable(get_pageblock_migratetype(page)))
410 return true;
411
412 /* Otherwise skip the block */
413 return false;
414}
415
416/* 394/*
417 * Isolate free pages onto a private freelist. If @strict is true, will abort 395 * Isolate free pages onto a private freelist. If @strict is true, will abort
418 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock 396 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
@@ -896,6 +874,29 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
896 874
897#endif /* CONFIG_COMPACTION || CONFIG_CMA */ 875#endif /* CONFIG_COMPACTION || CONFIG_CMA */
898#ifdef CONFIG_COMPACTION 876#ifdef CONFIG_COMPACTION
877
878/* Returns true if the page is within a block suitable for migration to */
879static bool suitable_migration_target(struct page *page)
880{
881 /* If the page is a large free page, then disallow migration */
882 if (PageBuddy(page)) {
883 /*
884 * We are checking page_order without zone->lock taken. But
885 * the only small danger is that we skip a potentially suitable
886 * pageblock, so it's not worth to check order for valid range.
887 */
888 if (page_order_unsafe(page) >= pageblock_order)
889 return false;
890 }
891
892 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
893 if (migrate_async_suitable(get_pageblock_migratetype(page)))
894 return true;
895
896 /* Otherwise skip the block */
897 return false;
898}
899
899/* 900/*
900 * Based on information in the current compact_control, find blocks 901 * Based on information in the current compact_control, find blocks
901 * suitable for isolating free pages from and then isolate them. 902 * suitable for isolating free pages from and then isolate them.
@@ -1047,6 +1048,12 @@ typedef enum {
1047} isolate_migrate_t; 1048} isolate_migrate_t;
1048 1049
1049/* 1050/*
1051 * Allow userspace to control policy on scanning the unevictable LRU for
1052 * compactable pages.
1053 */
1054int sysctl_compact_unevictable_allowed __read_mostly = 1;
1055
1056/*
1050 * Isolate all pages that can be migrated from the first suitable block, 1057 * Isolate all pages that can be migrated from the first suitable block,
1051 * starting at the block pointed to by the migrate scanner pfn within 1058 * starting at the block pointed to by the migrate scanner pfn within
1052 * compact_control. 1059 * compact_control.
@@ -1057,6 +1064,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1057 unsigned long low_pfn, end_pfn; 1064 unsigned long low_pfn, end_pfn;
1058 struct page *page; 1065 struct page *page;
1059 const isolate_mode_t isolate_mode = 1066 const isolate_mode_t isolate_mode =
1067 (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
1060 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); 1068 (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
1061 1069
1062 /* 1070 /*
@@ -1174,13 +1182,24 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
1174 /* Direct compactor: Is a suitable page free? */ 1182 /* Direct compactor: Is a suitable page free? */
1175 for (order = cc->order; order < MAX_ORDER; order++) { 1183 for (order = cc->order; order < MAX_ORDER; order++) {
1176 struct free_area *area = &zone->free_area[order]; 1184 struct free_area *area = &zone->free_area[order];
1185 bool can_steal;
1177 1186
1178 /* Job done if page is free of the right migratetype */ 1187 /* Job done if page is free of the right migratetype */
1179 if (!list_empty(&area->free_list[migratetype])) 1188 if (!list_empty(&area->free_list[migratetype]))
1180 return COMPACT_PARTIAL; 1189 return COMPACT_PARTIAL;
1181 1190
1182 /* Job done if allocation would set block type */ 1191#ifdef CONFIG_CMA
1183 if (order >= pageblock_order && area->nr_free) 1192 /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
1193 if (migratetype == MIGRATE_MOVABLE &&
1194 !list_empty(&area->free_list[MIGRATE_CMA]))
1195 return COMPACT_PARTIAL;
1196#endif
1197 /*
1198 * Job done if allocation would steal freepages from
1199 * other migratetype buddy lists.
1200 */
1201 if (find_suitable_fallback(area, order, migratetype,
1202 true, &can_steal) != -1)
1184 return COMPACT_PARTIAL; 1203 return COMPACT_PARTIAL;
1185 } 1204 }
1186 1205
@@ -1587,6 +1606,14 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
1587 INIT_LIST_HEAD(&cc->freepages); 1606 INIT_LIST_HEAD(&cc->freepages);
1588 INIT_LIST_HEAD(&cc->migratepages); 1607 INIT_LIST_HEAD(&cc->migratepages);
1589 1608
1609 /*
1610 * When called via /proc/sys/vm/compact_memory
1611 * this makes sure we compact the whole zone regardless of
1612 * cached scanner positions.
1613 */
1614 if (cc->order == -1)
1615 __reset_isolation_suitable(zone);
1616
1590 if (cc->order == -1 || !compaction_deferred(zone, cc->order)) 1617 if (cc->order == -1 || !compaction_deferred(zone, cc->order))
1591 compact_zone(zone, cc); 1618 compact_zone(zone, cc);
1592 1619
diff --git a/mm/filemap.c b/mm/filemap.c
index ad7242043bdb..6bf5e42d560a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -13,7 +13,6 @@
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/aio.h>
17#include <linux/capability.h> 16#include <linux/capability.h>
18#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
19#include <linux/gfp.h> 18#include <linux/gfp.h>
@@ -203,16 +202,15 @@ void __delete_from_page_cache(struct page *page, void *shadow)
203 BUG_ON(page_mapped(page)); 202 BUG_ON(page_mapped(page));
204 203
205 /* 204 /*
206 * Some filesystems seem to re-dirty the page even after 205 * At this point page must be either written or cleaned by truncate.
207 * the VM has canceled the dirty bit (eg ext3 journaling). 206 * Dirty page here signals a bug and loss of unwritten data.
208 * 207 *
209 * Fix it up by doing a final dirty accounting check after 208 * This fixes dirty accounting after removing the page entirely but
210 * having removed the page entirely. 209 * leaves PageDirty set: it has no effect for truncated page and
210 * anyway will be cleared before returning page into buddy allocator.
211 */ 211 */
212 if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { 212 if (WARN_ON_ONCE(PageDirty(page)))
213 dec_zone_page_state(page, NR_FILE_DIRTY); 213 account_page_cleaned(page, mapping);
214 dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
215 }
216} 214}
217 215
218/** 216/**
@@ -1695,7 +1693,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
1695 loff_t *ppos = &iocb->ki_pos; 1693 loff_t *ppos = &iocb->ki_pos;
1696 loff_t pos = *ppos; 1694 loff_t pos = *ppos;
1697 1695
1698 if (io_is_direct(file)) { 1696 if (iocb->ki_flags & IOCB_DIRECT) {
1699 struct address_space *mapping = file->f_mapping; 1697 struct address_space *mapping = file->f_mapping;
1700 struct inode *inode = mapping->host; 1698 struct inode *inode = mapping->host;
1701 size_t count = iov_iter_count(iter); 1699 size_t count = iov_iter_count(iter);
@@ -1708,7 +1706,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
1708 pos + count - 1); 1706 pos + count - 1);
1709 if (!retval) { 1707 if (!retval) {
1710 struct iov_iter data = *iter; 1708 struct iov_iter data = *iter;
1711 retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos); 1709 retval = mapping->a_ops->direct_IO(iocb, &data, pos);
1712 } 1710 }
1713 1711
1714 if (retval > 0) { 1712 if (retval > 0) {
@@ -2261,41 +2259,38 @@ EXPORT_SYMBOL(read_cache_page_gfp);
2261 * Returns appropriate error code that caller should return or 2259 * Returns appropriate error code that caller should return or
2262 * zero in case that write should be allowed. 2260 * zero in case that write should be allowed.
2263 */ 2261 */
2264inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) 2262inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
2265{ 2263{
2264 struct file *file = iocb->ki_filp;
2266 struct inode *inode = file->f_mapping->host; 2265 struct inode *inode = file->f_mapping->host;
2267 unsigned long limit = rlimit(RLIMIT_FSIZE); 2266 unsigned long limit = rlimit(RLIMIT_FSIZE);
2267 loff_t pos;
2268 2268
2269 if (unlikely(*pos < 0)) 2269 if (!iov_iter_count(from))
2270 return -EINVAL; 2270 return 0;
2271 2271
2272 if (!isblk) { 2272 /* FIXME: this is for backwards compatibility with 2.4 */
2273 /* FIXME: this is for backwards compatibility with 2.4 */ 2273 if (iocb->ki_flags & IOCB_APPEND)
2274 if (file->f_flags & O_APPEND) 2274 iocb->ki_pos = i_size_read(inode);
2275 *pos = i_size_read(inode);
2276 2275
2277 if (limit != RLIM_INFINITY) { 2276 pos = iocb->ki_pos;
2278 if (*pos >= limit) { 2277
2279 send_sig(SIGXFSZ, current, 0); 2278 if (limit != RLIM_INFINITY) {
2280 return -EFBIG; 2279 if (iocb->ki_pos >= limit) {
2281 } 2280 send_sig(SIGXFSZ, current, 0);
2282 if (*count > limit - (typeof(limit))*pos) { 2281 return -EFBIG;
2283 *count = limit - (typeof(limit))*pos;
2284 }
2285 } 2282 }
2283 iov_iter_truncate(from, limit - (unsigned long)pos);
2286 } 2284 }
2287 2285
2288 /* 2286 /*
2289 * LFS rule 2287 * LFS rule
2290 */ 2288 */
2291 if (unlikely(*pos + *count > MAX_NON_LFS && 2289 if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
2292 !(file->f_flags & O_LARGEFILE))) { 2290 !(file->f_flags & O_LARGEFILE))) {
2293 if (*pos >= MAX_NON_LFS) { 2291 if (pos >= MAX_NON_LFS)
2294 return -EFBIG; 2292 return -EFBIG;
2295 } 2293 iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
2296 if (*count > MAX_NON_LFS - (unsigned long)*pos) {
2297 *count = MAX_NON_LFS - (unsigned long)*pos;
2298 }
2299 } 2294 }
2300 2295
2301 /* 2296 /*
@@ -2305,34 +2300,11 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
2305 * exceeded without writing data we send a signal and return EFBIG. 2300 * exceeded without writing data we send a signal and return EFBIG.
2306 * Linus frestrict idea will clean these up nicely.. 2301 * Linus frestrict idea will clean these up nicely..
2307 */ 2302 */
2308 if (likely(!isblk)) { 2303 if (unlikely(pos >= inode->i_sb->s_maxbytes))
2309 if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { 2304 return -EFBIG;
2310 if (*count || *pos > inode->i_sb->s_maxbytes) {
2311 return -EFBIG;
2312 }
2313 /* zero-length writes at ->s_maxbytes are OK */
2314 }
2315
2316 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
2317 *count = inode->i_sb->s_maxbytes - *pos;
2318 } else {
2319#ifdef CONFIG_BLOCK
2320 loff_t isize;
2321 if (bdev_read_only(I_BDEV(inode)))
2322 return -EPERM;
2323 isize = i_size_read(inode);
2324 if (*pos >= isize) {
2325 if (*count || *pos > isize)
2326 return -ENOSPC;
2327 }
2328 2305
2329 if (*pos + *count > isize) 2306 iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
2330 *count = isize - *pos; 2307 return iov_iter_count(from);
2331#else
2332 return -EPERM;
2333#endif
2334 }
2335 return 0;
2336} 2308}
2337EXPORT_SYMBOL(generic_write_checks); 2309EXPORT_SYMBOL(generic_write_checks);
2338 2310
@@ -2396,7 +2368,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
2396 } 2368 }
2397 2369
2398 data = *from; 2370 data = *from;
2399 written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos); 2371 written = mapping->a_ops->direct_IO(iocb, &data, pos);
2400 2372
2401 /* 2373 /*
2402 * Finally, try again to invalidate clean pages which might have been 2374 * Finally, try again to invalidate clean pages which might have been
@@ -2558,23 +2530,12 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2558 struct file *file = iocb->ki_filp; 2530 struct file *file = iocb->ki_filp;
2559 struct address_space * mapping = file->f_mapping; 2531 struct address_space * mapping = file->f_mapping;
2560 struct inode *inode = mapping->host; 2532 struct inode *inode = mapping->host;
2561 loff_t pos = iocb->ki_pos;
2562 ssize_t written = 0; 2533 ssize_t written = 0;
2563 ssize_t err; 2534 ssize_t err;
2564 ssize_t status; 2535 ssize_t status;
2565 size_t count = iov_iter_count(from);
2566 2536
2567 /* We can write back this queue in page reclaim */ 2537 /* We can write back this queue in page reclaim */
2568 current->backing_dev_info = inode_to_bdi(inode); 2538 current->backing_dev_info = inode_to_bdi(inode);
2569 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2570 if (err)
2571 goto out;
2572
2573 if (count == 0)
2574 goto out;
2575
2576 iov_iter_truncate(from, count);
2577
2578 err = file_remove_suid(file); 2539 err = file_remove_suid(file);
2579 if (err) 2540 if (err)
2580 goto out; 2541 goto out;
@@ -2583,10 +2544,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2583 if (err) 2544 if (err)
2584 goto out; 2545 goto out;
2585 2546
2586 if (io_is_direct(file)) { 2547 if (iocb->ki_flags & IOCB_DIRECT) {
2587 loff_t endbyte; 2548 loff_t pos, endbyte;
2588 2549
2589 written = generic_file_direct_write(iocb, from, pos); 2550 written = generic_file_direct_write(iocb, from, iocb->ki_pos);
2590 /* 2551 /*
2591 * If the write stopped short of completing, fall back to 2552 * If the write stopped short of completing, fall back to
2592 * buffered writes. Some filesystems do this for writes to 2553 * buffered writes. Some filesystems do this for writes to
@@ -2594,13 +2555,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2594 * not succeed (even if it did, DAX does not handle dirty 2555 * not succeed (even if it did, DAX does not handle dirty
2595 * page-cache pages correctly). 2556 * page-cache pages correctly).
2596 */ 2557 */
2597 if (written < 0 || written == count || IS_DAX(inode)) 2558 if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
2598 goto out; 2559 goto out;
2599 2560
2600 pos += written; 2561 status = generic_perform_write(file, from, pos = iocb->ki_pos);
2601 count -= written;
2602
2603 status = generic_perform_write(file, from, pos);
2604 /* 2562 /*
2605 * If generic_perform_write() returned a synchronous error 2563 * If generic_perform_write() returned a synchronous error
2606 * then we want to return the number of bytes which were 2564 * then we want to return the number of bytes which were
@@ -2612,15 +2570,15 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2612 err = status; 2570 err = status;
2613 goto out; 2571 goto out;
2614 } 2572 }
2615 iocb->ki_pos = pos + status;
2616 /* 2573 /*
2617 * We need to ensure that the page cache pages are written to 2574 * We need to ensure that the page cache pages are written to
2618 * disk and invalidated to preserve the expected O_DIRECT 2575 * disk and invalidated to preserve the expected O_DIRECT
2619 * semantics. 2576 * semantics.
2620 */ 2577 */
2621 endbyte = pos + status - 1; 2578 endbyte = pos + status - 1;
2622 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); 2579 err = filemap_write_and_wait_range(mapping, pos, endbyte);
2623 if (err == 0) { 2580 if (err == 0) {
2581 iocb->ki_pos = endbyte + 1;
2624 written += status; 2582 written += status;
2625 invalidate_mapping_pages(mapping, 2583 invalidate_mapping_pages(mapping,
2626 pos >> PAGE_CACHE_SHIFT, 2584 pos >> PAGE_CACHE_SHIFT,
@@ -2632,9 +2590,9 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2632 */ 2590 */
2633 } 2591 }
2634 } else { 2592 } else {
2635 written = generic_perform_write(file, from, pos); 2593 written = generic_perform_write(file, from, iocb->ki_pos);
2636 if (likely(written >= 0)) 2594 if (likely(written > 0))
2637 iocb->ki_pos = pos + written; 2595 iocb->ki_pos += written;
2638 } 2596 }
2639out: 2597out:
2640 current->backing_dev_info = NULL; 2598 current->backing_dev_info = NULL;
@@ -2658,7 +2616,9 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2658 ssize_t ret; 2616 ssize_t ret;
2659 2617
2660 mutex_lock(&inode->i_mutex); 2618 mutex_lock(&inode->i_mutex);
2661 ret = __generic_file_write_iter(iocb, from); 2619 ret = generic_write_checks(iocb, from);
2620 if (ret > 0)
2621 ret = __generic_file_write_iter(iocb, from);
2662 mutex_unlock(&inode->i_mutex); 2622 mutex_unlock(&inode->i_mutex);
2663 2623
2664 if (ret > 0) { 2624 if (ret > 0) {
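
The net effect of the filemap.c changes is that generic_write_checks() now works on the (kiocb, iov_iter) pair: it applies the O_APPEND, RLIMIT_FSIZE, LFS and s_maxbytes limits by truncating the iterator, and returns the number of bytes that may be written, 0 for an empty write, or a negative error, while the block-device special case moves out of this helper. An illustrative sketch of the calling convention a filesystem's ->write_iter follows after this change (myfs is a placeholder name; the pattern mirrors the generic_file_write_iter() hunk above):

#include <linux/fs.h>
#include <linux/uio.h>

static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	mutex_lock(&inode->i_mutex);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)			/* something left to write after clamping */
		ret = __generic_file_write_iter(iocb, from);
	mutex_unlock(&inode->i_mutex);
	return ret;
}
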
diff --git a/mm/gup.c b/mm/gup.c
index a6e24e246f86..6297f6bccfb1 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -92,7 +92,7 @@ retry:
92 */ 92 */
93 mark_page_accessed(page); 93 mark_page_accessed(page);
94 } 94 }
95 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 95 if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
96 /* 96 /*
97 * The preliminary mapping check is mainly to avoid the 97 * The preliminary mapping check is mainly to avoid the
98 * pointless overhead of lock_page on the ZERO_PAGE 98 * pointless overhead of lock_page on the ZERO_PAGE
@@ -265,8 +265,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
265 unsigned int fault_flags = 0; 265 unsigned int fault_flags = 0;
266 int ret; 266 int ret;
267 267
268 /* For mlock, just skip the stack guard page. */ 268 /* For mm_populate(), just skip the stack guard page. */
269 if ((*flags & FOLL_MLOCK) && 269 if ((*flags & FOLL_POPULATE) &&
270 (stack_guard_page_start(vma, address) || 270 (stack_guard_page_start(vma, address) ||
271 stack_guard_page_end(vma, address + PAGE_SIZE))) 271 stack_guard_page_end(vma, address + PAGE_SIZE)))
272 return -ENOENT; 272 return -ENOENT;
@@ -819,6 +819,124 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
819EXPORT_SYMBOL(get_user_pages); 819EXPORT_SYMBOL(get_user_pages);
820 820
821/** 821/**
822 * populate_vma_page_range() - populate a range of pages in the vma.
823 * @vma: target vma
824 * @start: start address
825 * @end: end address
826 * @nonblocking:
827 *
828 * This takes care of mlocking the pages too if VM_LOCKED is set.
829 *
830 * return 0 on success, negative error code on error.
831 *
832 * vma->vm_mm->mmap_sem must be held.
833 *
834 * If @nonblocking is NULL, it may be held for read or write and will
835 * be unperturbed.
836 *
837 * If @nonblocking is non-NULL, it must held for read only and may be
838 * released. If it's released, *@nonblocking will be set to 0.
839 */
840long populate_vma_page_range(struct vm_area_struct *vma,
841 unsigned long start, unsigned long end, int *nonblocking)
842{
843 struct mm_struct *mm = vma->vm_mm;
844 unsigned long nr_pages = (end - start) / PAGE_SIZE;
845 int gup_flags;
846
847 VM_BUG_ON(start & ~PAGE_MASK);
848 VM_BUG_ON(end & ~PAGE_MASK);
849 VM_BUG_ON_VMA(start < vma->vm_start, vma);
850 VM_BUG_ON_VMA(end > vma->vm_end, vma);
851 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
852
853 gup_flags = FOLL_TOUCH | FOLL_POPULATE;
854 /*
855 * We want to touch writable mappings with a write fault in order
856 * to break COW, except for shared mappings because these don't COW
857 * and we would not want to dirty them for nothing.
858 */
859 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
860 gup_flags |= FOLL_WRITE;
861
862 /*
863 * We want mlock to succeed for regions that have any permissions
864 * other than PROT_NONE.
865 */
866 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
867 gup_flags |= FOLL_FORCE;
868
869 /*
870 * We made sure addr is within a VMA, so the following will
871 * not result in a stack expansion that recurses back here.
872 */
873 return __get_user_pages(current, mm, start, nr_pages, gup_flags,
874 NULL, NULL, nonblocking);
875}
876
877/*
878 * __mm_populate - populate and/or mlock pages within a range of address space.
879 *
880 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
881 * flags. VMAs must be already marked with the desired vm_flags, and
882 * mmap_sem must not be held.
883 */
884int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
885{
886 struct mm_struct *mm = current->mm;
887 unsigned long end, nstart, nend;
888 struct vm_area_struct *vma = NULL;
889 int locked = 0;
890 long ret = 0;
891
892 VM_BUG_ON(start & ~PAGE_MASK);
893 VM_BUG_ON(len != PAGE_ALIGN(len));
894 end = start + len;
895
896 for (nstart = start; nstart < end; nstart = nend) {
897 /*
898 * We want to fault in pages for [nstart; end) address range.
899 * Find first corresponding VMA.
900 */
901 if (!locked) {
902 locked = 1;
903 down_read(&mm->mmap_sem);
904 vma = find_vma(mm, nstart);
905 } else if (nstart >= vma->vm_end)
906 vma = vma->vm_next;
907 if (!vma || vma->vm_start >= end)
908 break;
909 /*
910 * Set [nstart; nend) to intersection of desired address
911 * range with the first VMA. Also, skip undesirable VMA types.
912 */
913 nend = min(end, vma->vm_end);
914 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
915 continue;
916 if (nstart < vma->vm_start)
917 nstart = vma->vm_start;
918 /*
919 * Now fault in a range of pages. populate_vma_page_range()
920 * double checks the vma flags, so that it won't mlock pages
921 * if the vma was already munlocked.
922 */
923 ret = populate_vma_page_range(vma, nstart, nend, &locked);
924 if (ret < 0) {
925 if (ignore_errors) {
926 ret = 0;
927 continue; /* continue at next VMA */
928 }
929 break;
930 }
931 nend = nstart + ret * PAGE_SIZE;
932 ret = 0;
933 }
934 if (locked)
935 up_read(&mm->mmap_sem);
936 return ret; /* 0 or negative error code */
937}
938
939/**
822 * get_dump_page() - pin user page in memory while writing it to core dump 940 * get_dump_page() - pin user page in memory while writing it to core dump
823 * @addr: user address 941 * @addr: user address
824 * 942 *
@@ -901,7 +1019,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
901 * 1019 *
902 * for an example see gup_get_pte in arch/x86/mm/gup.c 1020 * for an example see gup_get_pte in arch/x86/mm/gup.c
903 */ 1021 */
904 pte_t pte = ACCESS_ONCE(*ptep); 1022 pte_t pte = READ_ONCE(*ptep);
905 struct page *page; 1023 struct page *page;
906 1024
907 /* 1025 /*
@@ -1191,7 +1309,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
1191 local_irq_save(flags); 1309 local_irq_save(flags);
1192 pgdp = pgd_offset(mm, addr); 1310 pgdp = pgd_offset(mm, addr);
1193 do { 1311 do {
1194 pgd_t pgd = ACCESS_ONCE(*pgdp); 1312 pgd_t pgd = READ_ONCE(*pgdp);
1195 1313
1196 next = pgd_addr_end(addr, end); 1314 next = pgd_addr_end(addr, end);
1197 if (pgd_none(pgd)) 1315 if (pgd_none(pgd))
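
As the comments in the new gup.c code state, __mm_populate() backs mlock() and the MAP_POPULATE / MAP_LOCKED mmap flags, with populate_vma_page_range() doing the per-VMA faulting (and mlocking when VM_LOCKED is set). An illustrative userspace sketch of the two entry points that end up in this code:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;
	char *p;

	/* Fault the pages in up front: MAP_POPULATE goes through __mm_populate(). */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Pin the range as well: mlock() also ends up in __mm_populate(). */
	if (mlock(p, len))
		perror("mlock");

	memset(p, 0, len);
	munmap(p, len);
	return 0;
}
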
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fc00c8cb5a82..078832cf3636 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
67 67
68static int khugepaged(void *none); 68static int khugepaged(void *none);
69static int khugepaged_slab_init(void); 69static int khugepaged_slab_init(void);
70static void khugepaged_slab_exit(void);
70 71
71#define MM_SLOTS_HASH_BITS 10 72#define MM_SLOTS_HASH_BITS 10
72static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); 73static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -109,9 +110,6 @@ static int set_recommended_min_free_kbytes(void)
109 int nr_zones = 0; 110 int nr_zones = 0;
110 unsigned long recommended_min; 111 unsigned long recommended_min;
111 112
112 if (!khugepaged_enabled())
113 return 0;
114
115 for_each_populated_zone(zone) 113 for_each_populated_zone(zone)
116 nr_zones++; 114 nr_zones++;
117 115
@@ -143,9 +141,8 @@ static int set_recommended_min_free_kbytes(void)
143 setup_per_zone_wmarks(); 141 setup_per_zone_wmarks();
144 return 0; 142 return 0;
145} 143}
146late_initcall(set_recommended_min_free_kbytes);
147 144
148static int start_khugepaged(void) 145static int start_stop_khugepaged(void)
149{ 146{
150 int err = 0; 147 int err = 0;
151 if (khugepaged_enabled()) { 148 if (khugepaged_enabled()) {
@@ -156,6 +153,7 @@ static int start_khugepaged(void)
156 pr_err("khugepaged: kthread_run(khugepaged) failed\n"); 153 pr_err("khugepaged: kthread_run(khugepaged) failed\n");
157 err = PTR_ERR(khugepaged_thread); 154 err = PTR_ERR(khugepaged_thread);
158 khugepaged_thread = NULL; 155 khugepaged_thread = NULL;
156 goto fail;
159 } 157 }
160 158
161 if (!list_empty(&khugepaged_scan.mm_head)) 159 if (!list_empty(&khugepaged_scan.mm_head))
@@ -166,7 +164,7 @@ static int start_khugepaged(void)
166 kthread_stop(khugepaged_thread); 164 kthread_stop(khugepaged_thread);
167 khugepaged_thread = NULL; 165 khugepaged_thread = NULL;
168 } 166 }
169 167fail:
170 return err; 168 return err;
171} 169}
172 170
@@ -183,7 +181,7 @@ static struct page *get_huge_zero_page(void)
183 struct page *zero_page; 181 struct page *zero_page;
184retry: 182retry:
185 if (likely(atomic_inc_not_zero(&huge_zero_refcount))) 183 if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
186 return ACCESS_ONCE(huge_zero_page); 184 return READ_ONCE(huge_zero_page);
187 185
188 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, 186 zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
189 HPAGE_PMD_ORDER); 187 HPAGE_PMD_ORDER);
@@ -202,7 +200,7 @@ retry:
202 /* We take additional reference here. It will be put back by shrinker */ 200 /* We take additional reference here. It will be put back by shrinker */
203 atomic_set(&huge_zero_refcount, 2); 201 atomic_set(&huge_zero_refcount, 2);
204 preempt_enable(); 202 preempt_enable();
205 return ACCESS_ONCE(huge_zero_page); 203 return READ_ONCE(huge_zero_page);
206} 204}
207 205
208static void put_huge_zero_page(void) 206static void put_huge_zero_page(void)
@@ -300,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj,
300 int err; 298 int err;
301 299
302 mutex_lock(&khugepaged_mutex); 300 mutex_lock(&khugepaged_mutex);
303 err = start_khugepaged(); 301 err = start_stop_khugepaged();
304 mutex_unlock(&khugepaged_mutex); 302 mutex_unlock(&khugepaged_mutex);
305 303
306 if (err) 304 if (err)
@@ -634,27 +632,38 @@ static int __init hugepage_init(void)
634 632
635 err = hugepage_init_sysfs(&hugepage_kobj); 633 err = hugepage_init_sysfs(&hugepage_kobj);
636 if (err) 634 if (err)
637 return err; 635 goto err_sysfs;
638 636
639 err = khugepaged_slab_init(); 637 err = khugepaged_slab_init();
640 if (err) 638 if (err)
641 goto out; 639 goto err_slab;
642 640
643 register_shrinker(&huge_zero_page_shrinker); 641 err = register_shrinker(&huge_zero_page_shrinker);
642 if (err)
643 goto err_hzp_shrinker;
644 644
645 /* 645 /*
646 * By default disable transparent hugepages on smaller systems, 646 * By default disable transparent hugepages on smaller systems,
647 * where the extra memory used could hurt more than TLB overhead 647 * where the extra memory used could hurt more than TLB overhead
648 * is likely to save. The admin can still enable it through /sys. 648 * is likely to save. The admin can still enable it through /sys.
649 */ 649 */
650 if (totalram_pages < (512 << (20 - PAGE_SHIFT))) 650 if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
651 transparent_hugepage_flags = 0; 651 transparent_hugepage_flags = 0;
652 return 0;
653 }
652 654
653 start_khugepaged(); 655 err = start_stop_khugepaged();
656 if (err)
657 goto err_khugepaged;
654 658
655 return 0; 659 return 0;
656out: 660err_khugepaged:
661 unregister_shrinker(&huge_zero_page_shrinker);
662err_hzp_shrinker:
663 khugepaged_slab_exit();
664err_slab:
657 hugepage_exit_sysfs(hugepage_kobj); 665 hugepage_exit_sysfs(hugepage_kobj);
666err_sysfs:
658 return err; 667 return err;
659} 668}
660subsys_initcall(hugepage_init); 669subsys_initcall(hugepage_init);
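hugepage_init() above now unwinds failures with a goto ladder: each label undoes only the steps that had already succeeded, in reverse order. A generic sketch of that shape; the step and teardown functions are placeholders standing in for the sysfs, slab-cache and shrinker steps, not kernel symbols:

#include <linux/init.h>

/* Placeholder steps (assumptions, not kernel symbols). */
static int setup_sysfs(void)     { return 0; }
static int setup_slab(void)      { return 0; }
static int setup_shrinker(void)  { return 0; }
static void teardown_slab(void)  { }
static void teardown_sysfs(void) { }

static int __init example_init(void)
{
	int err;

	err = setup_sysfs();
	if (err)
		goto err_sysfs;
	err = setup_slab();
	if (err)
		goto err_slab;
	err = setup_shrinker();
	if (err)
		goto err_shrinker;
	return 0;

err_shrinker:
	teardown_slab();	/* undo the step before the one that failed */
err_slab:
	teardown_sysfs();
err_sysfs:
	return err;
}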
@@ -708,7 +717,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
708static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 717static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
709 struct vm_area_struct *vma, 718 struct vm_area_struct *vma,
710 unsigned long haddr, pmd_t *pmd, 719 unsigned long haddr, pmd_t *pmd,
711 struct page *page) 720 struct page *page, gfp_t gfp)
712{ 721{
713 struct mem_cgroup *memcg; 722 struct mem_cgroup *memcg;
714 pgtable_t pgtable; 723 pgtable_t pgtable;
@@ -716,7 +725,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
716 725
717 VM_BUG_ON_PAGE(!PageCompound(page), page); 726 VM_BUG_ON_PAGE(!PageCompound(page), page);
718 727
719 if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg)) 728 if (mem_cgroup_try_charge(page, mm, gfp, &memcg))
720 return VM_FAULT_OOM; 729 return VM_FAULT_OOM;
721 730
722 pgtable = pte_alloc_one(mm, haddr); 731 pgtable = pte_alloc_one(mm, haddr);
@@ -822,7 +831,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
822 count_vm_event(THP_FAULT_FALLBACK); 831 count_vm_event(THP_FAULT_FALLBACK);
823 return VM_FAULT_FALLBACK; 832 return VM_FAULT_FALLBACK;
824 } 833 }
825 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { 834 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) {
826 put_page(page); 835 put_page(page);
827 count_vm_event(THP_FAULT_FALLBACK); 836 count_vm_event(THP_FAULT_FALLBACK);
828 return VM_FAULT_FALLBACK; 837 return VM_FAULT_FALLBACK;
@@ -1080,6 +1089,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1080 unsigned long haddr; 1089 unsigned long haddr;
1081 unsigned long mmun_start; /* For mmu_notifiers */ 1090 unsigned long mmun_start; /* For mmu_notifiers */
1082 unsigned long mmun_end; /* For mmu_notifiers */ 1091 unsigned long mmun_end; /* For mmu_notifiers */
1092 gfp_t huge_gfp; /* for allocation and charge */
1083 1093
1084 ptl = pmd_lockptr(mm, pmd); 1094 ptl = pmd_lockptr(mm, pmd);
1085 VM_BUG_ON_VMA(!vma->anon_vma, vma); 1095 VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1106,10 +1116,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1106alloc: 1116alloc:
1107 if (transparent_hugepage_enabled(vma) && 1117 if (transparent_hugepage_enabled(vma) &&
1108 !transparent_hugepage_debug_cow()) { 1118 !transparent_hugepage_debug_cow()) {
1109 gfp_t gfp; 1119 huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
1110 1120 new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
1111 gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
1112 new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
1113 } else 1121 } else
1114 new_page = NULL; 1122 new_page = NULL;
1115 1123
@@ -1130,8 +1138,7 @@ alloc:
1130 goto out; 1138 goto out;
1131 } 1139 }
1132 1140
1133 if (unlikely(mem_cgroup_try_charge(new_page, mm, 1141 if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) {
1134 GFP_TRANSHUGE, &memcg))) {
1135 put_page(new_page); 1142 put_page(new_page);
1136 if (page) { 1143 if (page) {
1137 split_huge_page(page); 1144 split_huge_page(page);
@@ -1231,7 +1238,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1231 pmd, _pmd, 1)) 1238 pmd, _pmd, 1))
1232 update_mmu_cache_pmd(vma, addr, pmd); 1239 update_mmu_cache_pmd(vma, addr, pmd);
1233 } 1240 }
1234 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { 1241 if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
1235 if (page->mapping && trylock_page(page)) { 1242 if (page->mapping && trylock_page(page)) {
1236 lru_add_drain(); 1243 lru_add_drain();
1237 if (page->mapping) 1244 if (page->mapping)
@@ -1260,6 +1267,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1260 int target_nid, last_cpupid = -1; 1267 int target_nid, last_cpupid = -1;
1261 bool page_locked; 1268 bool page_locked;
1262 bool migrated = false; 1269 bool migrated = false;
1270 bool was_writable;
1263 int flags = 0; 1271 int flags = 0;
1264 1272
1265 /* A PROT_NONE fault should not end up here */ 1273 /* A PROT_NONE fault should not end up here */
@@ -1291,12 +1299,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1291 flags |= TNF_FAULT_LOCAL; 1299 flags |= TNF_FAULT_LOCAL;
1292 } 1300 }
1293 1301
1294 /* 1302 /* See similar comment in do_numa_page for explanation */
1295 * Avoid grouping on DSO/COW pages in specific and RO pages 1303 if (!(vma->vm_flags & VM_WRITE))
1296 * in general, RO pages shouldn't hurt as much anyway since
1297 * they can be in shared cache state.
1298 */
1299 if (!pmd_write(pmd))
1300 flags |= TNF_NO_GROUP; 1304 flags |= TNF_NO_GROUP;
1301 1305
1302 /* 1306 /*
@@ -1353,12 +1357,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1353 if (migrated) { 1357 if (migrated) {
1354 flags |= TNF_MIGRATED; 1358 flags |= TNF_MIGRATED;
1355 page_nid = target_nid; 1359 page_nid = target_nid;
1356 } 1360 } else
1361 flags |= TNF_MIGRATE_FAIL;
1357 1362
1358 goto out; 1363 goto out;
1359clear_pmdnuma: 1364clear_pmdnuma:
1360 BUG_ON(!PageLocked(page)); 1365 BUG_ON(!PageLocked(page));
1366 was_writable = pmd_write(pmd);
1361 pmd = pmd_modify(pmd, vma->vm_page_prot); 1367 pmd = pmd_modify(pmd, vma->vm_page_prot);
1368 pmd = pmd_mkyoung(pmd);
1369 if (was_writable)
1370 pmd = pmd_mkwrite(pmd);
1362 set_pmd_at(mm, haddr, pmdp, pmd); 1371 set_pmd_at(mm, haddr, pmdp, pmd);
1363 update_mmu_cache_pmd(vma, addr, pmdp); 1372 update_mmu_cache_pmd(vma, addr, pmdp);
1364 unlock_page(page); 1373 unlock_page(page);
@@ -1482,6 +1491,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1482 1491
1483 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1492 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1484 pmd_t entry; 1493 pmd_t entry;
1494 bool preserve_write = prot_numa && pmd_write(*pmd);
1495 ret = 1;
1485 1496
1486 /* 1497 /*
1487 * Avoid trapping faults against the zero page. The read-only 1498 * Avoid trapping faults against the zero page. The read-only
@@ -1490,16 +1501,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1490 */ 1501 */
1491 if (prot_numa && is_huge_zero_pmd(*pmd)) { 1502 if (prot_numa && is_huge_zero_pmd(*pmd)) {
1492 spin_unlock(ptl); 1503 spin_unlock(ptl);
1493 return 0; 1504 return ret;
1494 } 1505 }
1495 1506
1496 if (!prot_numa || !pmd_protnone(*pmd)) { 1507 if (!prot_numa || !pmd_protnone(*pmd)) {
1497 ret = 1;
1498 entry = pmdp_get_and_clear_notify(mm, addr, pmd); 1508 entry = pmdp_get_and_clear_notify(mm, addr, pmd);
1499 entry = pmd_modify(entry, newprot); 1509 entry = pmd_modify(entry, newprot);
1510 if (preserve_write)
1511 entry = pmd_mkwrite(entry);
1500 ret = HPAGE_PMD_NR; 1512 ret = HPAGE_PMD_NR;
1501 set_pmd_at(mm, addr, pmd, entry); 1513 set_pmd_at(mm, addr, pmd, entry);
1502 BUG_ON(pmd_write(entry)); 1514 BUG_ON(!preserve_write && pmd_write(entry));
1503 } 1515 }
1504 spin_unlock(ptl); 1516 spin_unlock(ptl);
1505 } 1517 }
@@ -1971,6 +1983,11 @@ static int __init khugepaged_slab_init(void)
1971 return 0; 1983 return 0;
1972} 1984}
1973 1985
1986static void __init khugepaged_slab_exit(void)
1987{
1988 kmem_cache_destroy(mm_slot_cache);
1989}
1990
1974static inline struct mm_slot *alloc_mm_slot(void) 1991static inline struct mm_slot *alloc_mm_slot(void)
1975{ 1992{
1976 if (!mm_slot_cache) /* initialization failed */ 1993 if (!mm_slot_cache) /* initialization failed */
@@ -2104,7 +2121,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
2104{ 2121{
2105 while (--_pte >= pte) { 2122 while (--_pte >= pte) {
2106 pte_t pteval = *_pte; 2123 pte_t pteval = *_pte;
2107 if (!pte_none(pteval)) 2124 if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
2108 release_pte_page(pte_page(pteval)); 2125 release_pte_page(pte_page(pteval));
2109 } 2126 }
2110} 2127}
@@ -2115,13 +2132,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2115{ 2132{
2116 struct page *page; 2133 struct page *page;
2117 pte_t *_pte; 2134 pte_t *_pte;
2118 int none = 0; 2135 int none_or_zero = 0;
2119 bool referenced = false, writable = false; 2136 bool referenced = false, writable = false;
2120 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2137 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
2121 _pte++, address += PAGE_SIZE) { 2138 _pte++, address += PAGE_SIZE) {
2122 pte_t pteval = *_pte; 2139 pte_t pteval = *_pte;
2123 if (pte_none(pteval)) { 2140 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2124 if (++none <= khugepaged_max_ptes_none) 2141 if (++none_or_zero <= khugepaged_max_ptes_none)
2125 continue; 2142 continue;
2126 else 2143 else
2127 goto out; 2144 goto out;
@@ -2202,9 +2219,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
2202 pte_t pteval = *_pte; 2219 pte_t pteval = *_pte;
2203 struct page *src_page; 2220 struct page *src_page;
2204 2221
2205 if (pte_none(pteval)) { 2222 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2206 clear_user_highpage(page, address); 2223 clear_user_highpage(page, address);
2207 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); 2224 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
2225 if (is_zero_pfn(pte_pfn(pteval))) {
2226 /*
2227 * ptl mostly unnecessary.
2228 */
2229 spin_lock(ptl);
2230 /*
2231 * paravirt calls inside pte_clear here are
2232 * superfluous.
2233 */
2234 pte_clear(vma->vm_mm, address, _pte);
2235 spin_unlock(ptl);
2236 }
2208 } else { 2237 } else {
2209 src_page = pte_page(pteval); 2238 src_page = pte_page(pteval);
2210 copy_user_highpage(page, src_page, address, vma); 2239 copy_user_highpage(page, src_page, address, vma);
@@ -2306,8 +2335,8 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2306 return true; 2335 return true;
2307} 2336}
2308 2337
2309static struct page 2338static struct page *
2310*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2339khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2311 struct vm_area_struct *vma, unsigned long address, 2340 struct vm_area_struct *vma, unsigned long address,
2312 int node) 2341 int node)
2313{ 2342{
@@ -2321,8 +2350,7 @@ static struct page
2321 */ 2350 */
2322 up_read(&mm->mmap_sem); 2351 up_read(&mm->mmap_sem);
2323 2352
2324 *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( 2353 *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER);
2325 khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
2326 if (unlikely(!*hpage)) { 2354 if (unlikely(!*hpage)) {
2327 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 2355 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2328 *hpage = ERR_PTR(-ENOMEM); 2356 *hpage = ERR_PTR(-ENOMEM);
@@ -2375,13 +2403,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
2375 return true; 2403 return true;
2376} 2404}
2377 2405
2378static struct page 2406static struct page *
2379*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, 2407khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
2380 struct vm_area_struct *vma, unsigned long address, 2408 struct vm_area_struct *vma, unsigned long address,
2381 int node) 2409 int node)
2382{ 2410{
2383 up_read(&mm->mmap_sem); 2411 up_read(&mm->mmap_sem);
2384 VM_BUG_ON(!*hpage); 2412 VM_BUG_ON(!*hpage);
2413
2385 return *hpage; 2414 return *hpage;
2386} 2415}
2387#endif 2416#endif
@@ -2416,16 +2445,21 @@ static void collapse_huge_page(struct mm_struct *mm,
2416 struct mem_cgroup *memcg; 2445 struct mem_cgroup *memcg;
2417 unsigned long mmun_start; /* For mmu_notifiers */ 2446 unsigned long mmun_start; /* For mmu_notifiers */
2418 unsigned long mmun_end; /* For mmu_notifiers */ 2447 unsigned long mmun_end; /* For mmu_notifiers */
2448 gfp_t gfp;
2419 2449
2420 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2450 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2421 2451
2452 /* Only allocate from the target node */
2453 gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) |
2454 __GFP_THISNODE;
2455
2422 /* release the mmap_sem read lock. */ 2456 /* release the mmap_sem read lock. */
2423 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); 2457 new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node);
2424 if (!new_page) 2458 if (!new_page)
2425 return; 2459 return;
2426 2460
2427 if (unlikely(mem_cgroup_try_charge(new_page, mm, 2461 if (unlikely(mem_cgroup_try_charge(new_page, mm,
2428 GFP_TRANSHUGE, &memcg))) 2462 gfp, &memcg)))
2429 return; 2463 return;
2430 2464
2431 /* 2465 /*
@@ -2538,7 +2572,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2538{ 2572{
2539 pmd_t *pmd; 2573 pmd_t *pmd;
2540 pte_t *pte, *_pte; 2574 pte_t *pte, *_pte;
2541 int ret = 0, none = 0; 2575 int ret = 0, none_or_zero = 0;
2542 struct page *page; 2576 struct page *page;
2543 unsigned long _address; 2577 unsigned long _address;
2544 spinlock_t *ptl; 2578 spinlock_t *ptl;
@@ -2556,8 +2590,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2556 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; 2590 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
2557 _pte++, _address += PAGE_SIZE) { 2591 _pte++, _address += PAGE_SIZE) {
2558 pte_t pteval = *_pte; 2592 pte_t pteval = *_pte;
2559 if (pte_none(pteval)) { 2593 if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
2560 if (++none <= khugepaged_max_ptes_none) 2594 if (++none_or_zero <= khugepaged_max_ptes_none)
2561 continue; 2595 continue;
2562 else 2596 else
2563 goto out_unmap; 2597 goto out_unmap;
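The collapse_huge_page() hunk above builds the gfp mask once ("Only allocate from the target node") and passes the same mask to both khugepaged_alloc_page() and mem_cgroup_try_charge(). A sketch of that composition, written as if inside mm/huge_memory.c (alloc_hugepage_gfpmask() is static there) with defrag hard-coded to 1 for illustration:

/*
 * The usual khugepaged allocation mask, restricted to the node the scan
 * selected: __GFP_OTHER_NODE accounts the allocation as made on behalf of
 * another node, and __GFP_THISNODE forbids falling back to other nodes.
 */
static gfp_t example_collapse_gfp(void)
{
	return alloc_hugepage_gfpmask(1 /* defrag */, __GFP_OTHER_NODE) |
	       __GFP_THISNODE;
}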
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0a9ac6c26832..271e4432734c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
61static int num_fault_mutexes; 61static int num_fault_mutexes;
62static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; 62static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
63 63
64/* Forward declaration */
65static int hugetlb_acct_memory(struct hstate *h, long delta);
66
64static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) 67static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
65{ 68{
66 bool free = (spool->count == 0) && (spool->used_hpages == 0); 69 bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,23 +71,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
68 spin_unlock(&spool->lock); 71 spin_unlock(&spool->lock);
69 72
70 /* If no pages are used, and no other handles to the subpool 73 /* If no pages are used, and no other handles to the subpool
71 * remain, free the subpool the subpool remain */ 74 * remain, give up any reservations based on minimum size and
72 if (free) 75 * free the subpool */
76 if (free) {
77 if (spool->min_hpages != -1)
78 hugetlb_acct_memory(spool->hstate,
79 -spool->min_hpages);
73 kfree(spool); 80 kfree(spool);
81 }
74} 82}
75 83
76struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) 84struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
85 long min_hpages)
77{ 86{
78 struct hugepage_subpool *spool; 87 struct hugepage_subpool *spool;
79 88
80 spool = kmalloc(sizeof(*spool), GFP_KERNEL); 89 spool = kzalloc(sizeof(*spool), GFP_KERNEL);
81 if (!spool) 90 if (!spool)
82 return NULL; 91 return NULL;
83 92
84 spin_lock_init(&spool->lock); 93 spin_lock_init(&spool->lock);
85 spool->count = 1; 94 spool->count = 1;
86 spool->max_hpages = nr_blocks; 95 spool->max_hpages = max_hpages;
87 spool->used_hpages = 0; 96 spool->hstate = h;
97 spool->min_hpages = min_hpages;
98
99 if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
100 kfree(spool);
101 return NULL;
102 }
103 spool->rsv_hpages = min_hpages;
88 104
89 return spool; 105 return spool;
90} 106}
@@ -97,36 +113,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
97 unlock_or_release_subpool(spool); 113 unlock_or_release_subpool(spool);
98} 114}
99 115
100static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, 116/*
117 * Subpool accounting for allocating and reserving pages.
118 * Return -ENOMEM if there are not enough resources to satisfy the
119 * request. Otherwise, return the number of pages by which the
120 * global pools must be adjusted (upward). The returned value may
121 * only be different than the passed value (delta) in the case where
122 * a subpool minimum size must be maintained.
123 */
124static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
101 long delta) 125 long delta)
102{ 126{
103 int ret = 0; 127 long ret = delta;
104 128
105 if (!spool) 129 if (!spool)
106 return 0; 130 return ret;
107 131
108 spin_lock(&spool->lock); 132 spin_lock(&spool->lock);
109 if ((spool->used_hpages + delta) <= spool->max_hpages) { 133
110 spool->used_hpages += delta; 134 if (spool->max_hpages != -1) { /* maximum size accounting */
111 } else { 135 if ((spool->used_hpages + delta) <= spool->max_hpages)
112 ret = -ENOMEM; 136 spool->used_hpages += delta;
137 else {
138 ret = -ENOMEM;
139 goto unlock_ret;
140 }
113 } 141 }
114 spin_unlock(&spool->lock);
115 142
143 if (spool->min_hpages != -1) { /* minimum size accounting */
144 if (delta > spool->rsv_hpages) {
145 /*
146 * Asking for more reserves than those already taken on
147 * behalf of subpool. Return difference.
148 */
149 ret = delta - spool->rsv_hpages;
150 spool->rsv_hpages = 0;
151 } else {
152 ret = 0; /* reserves already accounted for */
153 spool->rsv_hpages -= delta;
154 }
155 }
156
157unlock_ret:
158 spin_unlock(&spool->lock);
116 return ret; 159 return ret;
117} 160}
118 161
119static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, 162/*
163 * Subpool accounting for freeing and unreserving pages.
164 * Return the number of global page reservations that must be dropped.
165 * The return value may only be different than the passed value (delta)
166 * in the case where a subpool minimum size must be maintained.
167 */
168static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
120 long delta) 169 long delta)
121{ 170{
171 long ret = delta;
172
122 if (!spool) 173 if (!spool)
123 return; 174 return delta;
124 175
125 spin_lock(&spool->lock); 176 spin_lock(&spool->lock);
126 spool->used_hpages -= delta; 177
127 /* If hugetlbfs_put_super couldn't free spool due to 178 if (spool->max_hpages != -1) /* maximum size accounting */
128 * an outstanding quota reference, free it now. */ 179 spool->used_hpages -= delta;
180
181 if (spool->min_hpages != -1) { /* minimum size accounting */
182 if (spool->rsv_hpages + delta <= spool->min_hpages)
183 ret = 0;
184 else
185 ret = spool->rsv_hpages + delta - spool->min_hpages;
186
187 spool->rsv_hpages += delta;
188 if (spool->rsv_hpages > spool->min_hpages)
189 spool->rsv_hpages = spool->min_hpages;
190 }
191
192 /*
193 * If hugetlbfs_put_super couldn't free spool due to an outstanding
194 * quota reference, free it now.
195 */
129 unlock_or_release_subpool(spool); 196 unlock_or_release_subpool(spool);
197
198 return ret;
130} 199}
131 200
132static inline struct hugepage_subpool *subpool_inode(struct inode *inode) 201static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
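As the comments above state, both subpool helpers now return the adjustment to apply to the global pool, which differs from the passed delta only when a minimum size is set. A worked sketch, written as if inside mm/hugetlb.c (the helpers are static), assuming a subpool created with min_hpages = 10 and no maximum:

/*
 * The values in the comments are what the helpers above return for this
 * particular sequence.
 */
static void example_subpool_accounting(struct hstate *h)
{
	struct hugepage_subpool *spool;
	long gbl;

	spool = hugepage_new_subpool(h, -1, 10);     /* reserves 10 pages globally */

	gbl = hugepage_subpool_get_pages(spool, 4);  /* 4 <= rsv_hpages(10): returns 0 */
	gbl = hugepage_subpool_put_pages(spool, 4);  /* refills the reserve: returns 0 */
	gbl = hugepage_subpool_get_pages(spool, 12); /* only 10 reserved: returns 2    */
	gbl = hugepage_subpool_put_pages(spool, 12); /* back above the min: returns 2  */

	hugepage_put_subpool(spool);                 /* last ref: the 10-page reserve goes back */
}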
@@ -855,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size)
855 return NULL; 924 return NULL;
856} 925}
857 926
927/*
928 * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
929 * to hstate->hugepage_activelist.)
930 *
931 * This function can be called for tail pages, but never returns true for them.
932 */
933bool page_huge_active(struct page *page)
934{
935 VM_BUG_ON_PAGE(!PageHuge(page), page);
936 return PageHead(page) && PagePrivate(&page[1]);
937}
938
939/* never called for tail page */
940static void set_page_huge_active(struct page *page)
941{
942 VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
943 SetPagePrivate(&page[1]);
944}
945
946static void clear_page_huge_active(struct page *page)
947{
948 VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
949 ClearPagePrivate(&page[1]);
950}
951
858void free_huge_page(struct page *page) 952void free_huge_page(struct page *page)
859{ 953{
860 /* 954 /*
@@ -874,7 +968,16 @@ void free_huge_page(struct page *page)
874 restore_reserve = PagePrivate(page); 968 restore_reserve = PagePrivate(page);
875 ClearPagePrivate(page); 969 ClearPagePrivate(page);
876 970
971 /*
972 * A return code of zero implies that the subpool will be under its
973 * minimum size if the reservation is not restored after the page is freed.
974 * Therefore, force restore_reserve operation.
975 */
976 if (hugepage_subpool_put_pages(spool, 1) == 0)
977 restore_reserve = true;
978
877 spin_lock(&hugetlb_lock); 979 spin_lock(&hugetlb_lock);
980 clear_page_huge_active(page);
878 hugetlb_cgroup_uncharge_page(hstate_index(h), 981 hugetlb_cgroup_uncharge_page(hstate_index(h),
879 pages_per_huge_page(h), page); 982 pages_per_huge_page(h), page);
880 if (restore_reserve) 983 if (restore_reserve)
@@ -891,7 +994,6 @@ void free_huge_page(struct page *page)
891 enqueue_huge_page(h, page); 994 enqueue_huge_page(h, page);
892 } 995 }
893 spin_unlock(&hugetlb_lock); 996 spin_unlock(&hugetlb_lock);
894 hugepage_subpool_put_pages(spool, 1);
895} 997}
896 998
897static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 999static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -917,7 +1019,6 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
917 __SetPageHead(page); 1019 __SetPageHead(page);
918 __ClearPageReserved(page); 1020 __ClearPageReserved(page);
919 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { 1021 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
920 __SetPageTail(p);
921 /* 1022 /*
922 * For gigantic hugepages allocated through bootmem at 1023 * For gigantic hugepages allocated through bootmem at
923 * boot, it's safer to be consistent with the not-gigantic 1024 * boot, it's safer to be consistent with the not-gigantic
@@ -933,6 +1034,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
933 __ClearPageReserved(p); 1034 __ClearPageReserved(p);
934 set_page_count(p, 0); 1035 set_page_count(p, 0);
935 p->first_page = page; 1036 p->first_page = page;
1037 /* Make sure p->first_page is always valid for PageTail() */
1038 smp_wmb();
1039 __SetPageTail(p);
936 } 1040 }
937} 1041}
938 1042
@@ -1384,7 +1488,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1384 if (chg < 0) 1488 if (chg < 0)
1385 return ERR_PTR(-ENOMEM); 1489 return ERR_PTR(-ENOMEM);
1386 if (chg || avoid_reserve) 1490 if (chg || avoid_reserve)
1387 if (hugepage_subpool_get_pages(spool, 1)) 1491 if (hugepage_subpool_get_pages(spool, 1) < 0)
1388 return ERR_PTR(-ENOSPC); 1492 return ERR_PTR(-ENOSPC);
1389 1493
1390 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 1494 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
@@ -2452,6 +2556,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2452 struct resv_map *resv = vma_resv_map(vma); 2556 struct resv_map *resv = vma_resv_map(vma);
2453 struct hugepage_subpool *spool = subpool_vma(vma); 2557 struct hugepage_subpool *spool = subpool_vma(vma);
2454 unsigned long reserve, start, end; 2558 unsigned long reserve, start, end;
2559 long gbl_reserve;
2455 2560
2456 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 2561 if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
2457 return; 2562 return;
@@ -2464,8 +2569,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2464 kref_put(&resv->refs, resv_map_release); 2569 kref_put(&resv->refs, resv_map_release);
2465 2570
2466 if (reserve) { 2571 if (reserve) {
2467 hugetlb_acct_memory(h, -reserve); 2572 /*
2468 hugepage_subpool_put_pages(spool, reserve); 2573 * Decrement reserve counts. The global reserve count may be
2574 * adjusted if the subpool has a minimum size.
2575 */
2576 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
2577 hugetlb_acct_memory(h, -gbl_reserve);
2469 } 2578 }
2470} 2579}
2471 2580
@@ -2889,6 +2998,7 @@ retry_avoidcopy:
2889 copy_user_huge_page(new_page, old_page, address, vma, 2998 copy_user_huge_page(new_page, old_page, address, vma,
2890 pages_per_huge_page(h)); 2999 pages_per_huge_page(h));
2891 __SetPageUptodate(new_page); 3000 __SetPageUptodate(new_page);
3001 set_page_huge_active(new_page);
2892 3002
2893 mmun_start = address & huge_page_mask(h); 3003 mmun_start = address & huge_page_mask(h);
2894 mmun_end = mmun_start + huge_page_size(h); 3004 mmun_end = mmun_start + huge_page_size(h);
@@ -3001,6 +3111,7 @@ retry:
3001 } 3111 }
3002 clear_huge_page(page, address, pages_per_huge_page(h)); 3112 clear_huge_page(page, address, pages_per_huge_page(h));
3003 __SetPageUptodate(page); 3113 __SetPageUptodate(page);
3114 set_page_huge_active(page);
3004 3115
3005 if (vma->vm_flags & VM_MAYSHARE) { 3116 if (vma->vm_flags & VM_MAYSHARE) {
3006 int err; 3117 int err;
@@ -3276,6 +3387,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
3276 struct page *page; 3387 struct page *page;
3277 3388
3278 /* 3389 /*
3390 * If we have a pending SIGKILL, don't keep faulting pages and
3391 * potentially allocating memory.
3392 */
3393 if (unlikely(fatal_signal_pending(current))) {
3394 remainder = 0;
3395 break;
3396 }
3397
3398 /*
3279 * Some archs (sparc64, sh*) have multiple pte_ts to 3399 * Some archs (sparc64, sh*) have multiple pte_ts to
3280 * each hugepage. We have to make sure we get the 3400 * each hugepage. We have to make sure we get the
3281 * first, for the page indexing below to work. 3401 * first, for the page indexing below to work.
@@ -3436,6 +3556,7 @@ int hugetlb_reserve_pages(struct inode *inode,
3436 struct hstate *h = hstate_inode(inode); 3556 struct hstate *h = hstate_inode(inode);
3437 struct hugepage_subpool *spool = subpool_inode(inode); 3557 struct hugepage_subpool *spool = subpool_inode(inode);
3438 struct resv_map *resv_map; 3558 struct resv_map *resv_map;
3559 long gbl_reserve;
3439 3560
3440 /* 3561 /*
3441 * Only apply hugepage reservation if asked. At fault time, an 3562 * Only apply hugepage reservation if asked. At fault time, an
@@ -3472,8 +3593,13 @@ int hugetlb_reserve_pages(struct inode *inode,
3472 goto out_err; 3593 goto out_err;
3473 } 3594 }
3474 3595
3475 /* There must be enough pages in the subpool for the mapping */ 3596 /*
3476 if (hugepage_subpool_get_pages(spool, chg)) { 3597 * There must be enough pages in the subpool for the mapping. If
3598 * the subpool has a minimum size, there may be some global
3599 * reservations already in place (gbl_reserve).
3600 */
3601 gbl_reserve = hugepage_subpool_get_pages(spool, chg);
3602 if (gbl_reserve < 0) {
3477 ret = -ENOSPC; 3603 ret = -ENOSPC;
3478 goto out_err; 3604 goto out_err;
3479 } 3605 }
@@ -3482,9 +3608,10 @@ int hugetlb_reserve_pages(struct inode *inode,
3482 * Check enough hugepages are available for the reservation. 3608 * Check enough hugepages are available for the reservation.
3483 * Hand the pages back to the subpool if there are not 3609 * Hand the pages back to the subpool if there are not
3484 */ 3610 */
3485 ret = hugetlb_acct_memory(h, chg); 3611 ret = hugetlb_acct_memory(h, gbl_reserve);
3486 if (ret < 0) { 3612 if (ret < 0) {
3487 hugepage_subpool_put_pages(spool, chg); 3613 /* put back original number of pages, chg */
3614 (void)hugepage_subpool_put_pages(spool, chg);
3488 goto out_err; 3615 goto out_err;
3489 } 3616 }
3490 3617
@@ -3514,6 +3641,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3514 struct resv_map *resv_map = inode_resv_map(inode); 3641 struct resv_map *resv_map = inode_resv_map(inode);
3515 long chg = 0; 3642 long chg = 0;
3516 struct hugepage_subpool *spool = subpool_inode(inode); 3643 struct hugepage_subpool *spool = subpool_inode(inode);
3644 long gbl_reserve;
3517 3645
3518 if (resv_map) 3646 if (resv_map)
3519 chg = region_truncate(resv_map, offset); 3647 chg = region_truncate(resv_map, offset);
@@ -3521,8 +3649,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
3521 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 3649 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
3522 spin_unlock(&inode->i_lock); 3650 spin_unlock(&inode->i_lock);
3523 3651
3524 hugepage_subpool_put_pages(spool, (chg - freed)); 3652 /*
3525 hugetlb_acct_memory(h, -(chg - freed)); 3653 * If the subpool has a minimum size, the number of global
3654 * reservations to be released may be adjusted.
3655 */
3656 gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
3657 hugetlb_acct_memory(h, -gbl_reserve);
3526} 3658}
3527 3659
3528#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE 3660#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3733,8 +3865,7 @@ retry:
3733 if (!pmd_huge(*pmd)) 3865 if (!pmd_huge(*pmd))
3734 goto out; 3866 goto out;
3735 if (pmd_present(*pmd)) { 3867 if (pmd_present(*pmd)) {
3736 page = pte_page(*(pte_t *)pmd) + 3868 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
3737 ((address & ~PMD_MASK) >> PAGE_SHIFT);
3738 if (flags & FOLL_GET) 3869 if (flags & FOLL_GET)
3739 get_page(page); 3870 get_page(page);
3740 } else { 3871 } else {
@@ -3765,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
3765 3896
3766#ifdef CONFIG_MEMORY_FAILURE 3897#ifdef CONFIG_MEMORY_FAILURE
3767 3898
3768/* Should be called in hugetlb_lock */
3769static int is_hugepage_on_freelist(struct page *hpage)
3770{
3771 struct page *page;
3772 struct page *tmp;
3773 struct hstate *h = page_hstate(hpage);
3774 int nid = page_to_nid(hpage);
3775
3776 list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
3777 if (page == hpage)
3778 return 1;
3779 return 0;
3780}
3781
3782/* 3899/*
3783 * This function is called from memory failure code. 3900 * This function is called from memory failure code.
3784 * Assume the caller holds page lock of the head page. 3901 * Assume the caller holds page lock of the head page.
@@ -3790,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3790 int ret = -EBUSY; 3907 int ret = -EBUSY;
3791 3908
3792 spin_lock(&hugetlb_lock); 3909 spin_lock(&hugetlb_lock);
3793 if (is_hugepage_on_freelist(hpage)) { 3910 /*
3911 * Just checking !page_huge_active is not enough, because that could be
3912 * an isolated/hwpoisoned hugepage (which have >0 refcount).
3913 */
3914 if (!page_huge_active(hpage) && !page_count(hpage)) {
3794 /* 3915 /*
3795 * Hwpoisoned hugepage isn't linked to activelist or freelist, 3916 * Hwpoisoned hugepage isn't linked to activelist or freelist,
3796 * but dangling hpage->lru can trigger list-debug warnings 3917 * but dangling hpage->lru can trigger list-debug warnings
@@ -3810,42 +3931,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3810 3931
3811bool isolate_huge_page(struct page *page, struct list_head *list) 3932bool isolate_huge_page(struct page *page, struct list_head *list)
3812{ 3933{
3934 bool ret = true;
3935
3813 VM_BUG_ON_PAGE(!PageHead(page), page); 3936 VM_BUG_ON_PAGE(!PageHead(page), page);
3814 if (!get_page_unless_zero(page))
3815 return false;
3816 spin_lock(&hugetlb_lock); 3937 spin_lock(&hugetlb_lock);
3938 if (!page_huge_active(page) || !get_page_unless_zero(page)) {
3939 ret = false;
3940 goto unlock;
3941 }
3942 clear_page_huge_active(page);
3817 list_move_tail(&page->lru, list); 3943 list_move_tail(&page->lru, list);
3944unlock:
3818 spin_unlock(&hugetlb_lock); 3945 spin_unlock(&hugetlb_lock);
3819 return true; 3946 return ret;
3820} 3947}
3821 3948
3822void putback_active_hugepage(struct page *page) 3949void putback_active_hugepage(struct page *page)
3823{ 3950{
3824 VM_BUG_ON_PAGE(!PageHead(page), page); 3951 VM_BUG_ON_PAGE(!PageHead(page), page);
3825 spin_lock(&hugetlb_lock); 3952 spin_lock(&hugetlb_lock);
3953 set_page_huge_active(page);
3826 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); 3954 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
3827 spin_unlock(&hugetlb_lock); 3955 spin_unlock(&hugetlb_lock);
3828 put_page(page); 3956 put_page(page);
3829} 3957}
3830
3831bool is_hugepage_active(struct page *page)
3832{
3833 VM_BUG_ON_PAGE(!PageHuge(page), page);
3834 /*
3835 * This function can be called for a tail page because the caller,
3836 * scan_movable_pages, scans through a given pfn-range which typically
3837 * covers one memory block. In systems using gigantic hugepage (1GB
3838 * for x86_64,) a hugepage is larger than a memory block, and we don't
3839 * support migrating such large hugepages for now, so return false
3840 * when called for tail pages.
3841 */
3842 if (PageTail(page))
3843 return false;
3844 /*
3845 * Refcount of a hwpoisoned hugepages is 1, but they are not active,
3846 * so we should return false for them.
3847 */
3848 if (unlikely(PageHWPoison(page)))
3849 return false;
3850 return page_count(page) > 0;
3851}
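With the changes above, isolate_huge_page() refuses pages the fault paths have not yet marked active, and putback_active_hugepage() marks the page active again when it is returned. A minimal sketch of how a migration-style caller is expected to use the pair; the actual migrate_pages() call is elided:

#include <linux/hugetlb.h>
#include <linux/list.h>

/*
 * Pull one in-use hugepage off the active list.  If isolation fails, the
 * page was free, already isolated or hwpoisoned.  If migration is not
 * attempted (or fails), putting the page back re-marks it active and
 * drops the isolation reference.
 */
static int example_isolate_then_putback(struct page *hpage)
{
	LIST_HEAD(pagelist);

	if (!isolate_huge_page(hpage, &pagelist))
		return -EBUSY;

	/* a real caller would hand &pagelist to migrate_pages() here */

	putback_active_hugepage(hpage);
	return 0;
}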
diff --git a/mm/internal.h b/mm/internal.h
index a96da5b0029d..a25e359a4039 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -200,6 +200,8 @@ isolate_freepages_range(struct compact_control *cc,
200unsigned long 200unsigned long
201isolate_migratepages_range(struct compact_control *cc, 201isolate_migratepages_range(struct compact_control *cc,
202 unsigned long low_pfn, unsigned long end_pfn); 202 unsigned long low_pfn, unsigned long end_pfn);
203int find_suitable_fallback(struct free_area *area, unsigned int order,
204 int migratetype, bool only_stealable, bool *can_steal);
203 205
204#endif 206#endif
205 207
@@ -222,13 +224,13 @@ static inline unsigned long page_order(struct page *page)
222 * PageBuddy() should be checked first by the caller to minimize race window, 224 * PageBuddy() should be checked first by the caller to minimize race window,
223 * and invalid values must be handled gracefully. 225 * and invalid values must be handled gracefully.
224 * 226 *
225 * ACCESS_ONCE is used so that if the caller assigns the result into a local 227 * READ_ONCE is used so that if the caller assigns the result into a local
226 * variable and e.g. tests it for valid range before using, the compiler cannot 228 * variable and e.g. tests it for valid range before using, the compiler cannot
227 * decide to remove the variable and inline the page_private(page) multiple 229 * decide to remove the variable and inline the page_private(page) multiple
228 * times, potentially observing different values in the tests and the actual 230 * times, potentially observing different values in the tests and the actual
229 * use of the result. 231 * use of the result.
230 */ 232 */
231#define page_order_unsafe(page) ACCESS_ONCE(page_private(page)) 233#define page_order_unsafe(page) READ_ONCE(page_private(page))
232 234
233static inline bool is_cow_mapping(vm_flags_t flags) 235static inline bool is_cow_mapping(vm_flags_t flags)
234{ 236{
@@ -240,7 +242,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
240 struct vm_area_struct *prev, struct rb_node *rb_parent); 242 struct vm_area_struct *prev, struct rb_node *rb_parent);
241 243
242#ifdef CONFIG_MMU 244#ifdef CONFIG_MMU
243extern long __mlock_vma_pages_range(struct vm_area_struct *vma, 245extern long populate_vma_page_range(struct vm_area_struct *vma,
244 unsigned long start, unsigned long end, int *nonblocking); 246 unsigned long start, unsigned long end, int *nonblocking);
245extern void munlock_vma_pages_range(struct vm_area_struct *vma, 247extern void munlock_vma_pages_range(struct vm_area_struct *vma,
246 unsigned long start, unsigned long end); 248 unsigned long start, unsigned long end);
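The page_order_unsafe() comment above explains why READ_ONCE() matters: the snapshot must be taken once and then validated, without the compiler re-reading page_private(). A minimal usage sketch in the compaction style, written as if inside mm/ with mm/internal.h available, assuming the caller has already seen PageBuddy():

#include <linux/mm.h>
#include <linux/mmzone.h>
#include "internal.h"	/* for page_order_unsafe() */

/*
 * Snapshot the (possibly stale) buddy order once, validate the snapshot,
 * and only then use it; a racing free or allocation can make the value
 * nonsense, which is why the range check is mandatory.
 */
static unsigned long example_buddy_skip_hint(struct page *page)
{
	unsigned long order = page_order_unsafe(page);

	if (order > 0 && order < MAX_ORDER)
		return 1UL << order;	/* pages a scanner may skip */
	return 1;			/* fall back to a single page */
}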
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
deleted file mode 100644
index 827732047da1..000000000000
--- a/mm/iov_iter.c
+++ /dev/null
@@ -1,753 +0,0 @@
1#include <linux/export.h>
2#include <linux/uio.h>
3#include <linux/pagemap.h>
4#include <linux/slab.h>
5#include <linux/vmalloc.h>
6#include <net/checksum.h>
7
8#define iterate_iovec(i, n, __v, __p, skip, STEP) { \
9 size_t left; \
10 size_t wanted = n; \
11 __p = i->iov; \
12 __v.iov_len = min(n, __p->iov_len - skip); \
13 if (likely(__v.iov_len)) { \
14 __v.iov_base = __p->iov_base + skip; \
15 left = (STEP); \
16 __v.iov_len -= left; \
17 skip += __v.iov_len; \
18 n -= __v.iov_len; \
19 } else { \
20 left = 0; \
21 } \
22 while (unlikely(!left && n)) { \
23 __p++; \
24 __v.iov_len = min(n, __p->iov_len); \
25 if (unlikely(!__v.iov_len)) \
26 continue; \
27 __v.iov_base = __p->iov_base; \
28 left = (STEP); \
29 __v.iov_len -= left; \
30 skip = __v.iov_len; \
31 n -= __v.iov_len; \
32 } \
33 n = wanted - n; \
34}
35
36#define iterate_kvec(i, n, __v, __p, skip, STEP) { \
37 size_t wanted = n; \
38 __p = i->kvec; \
39 __v.iov_len = min(n, __p->iov_len - skip); \
40 if (likely(__v.iov_len)) { \
41 __v.iov_base = __p->iov_base + skip; \
42 (void)(STEP); \
43 skip += __v.iov_len; \
44 n -= __v.iov_len; \
45 } \
46 while (unlikely(n)) { \
47 __p++; \
48 __v.iov_len = min(n, __p->iov_len); \
49 if (unlikely(!__v.iov_len)) \
50 continue; \
51 __v.iov_base = __p->iov_base; \
52 (void)(STEP); \
53 skip = __v.iov_len; \
54 n -= __v.iov_len; \
55 } \
56 n = wanted; \
57}
58
59#define iterate_bvec(i, n, __v, __p, skip, STEP) { \
60 size_t wanted = n; \
61 __p = i->bvec; \
62 __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \
63 if (likely(__v.bv_len)) { \
64 __v.bv_page = __p->bv_page; \
65 __v.bv_offset = __p->bv_offset + skip; \
66 (void)(STEP); \
67 skip += __v.bv_len; \
68 n -= __v.bv_len; \
69 } \
70 while (unlikely(n)) { \
71 __p++; \
72 __v.bv_len = min_t(size_t, n, __p->bv_len); \
73 if (unlikely(!__v.bv_len)) \
74 continue; \
75 __v.bv_page = __p->bv_page; \
76 __v.bv_offset = __p->bv_offset; \
77 (void)(STEP); \
78 skip = __v.bv_len; \
79 n -= __v.bv_len; \
80 } \
81 n = wanted; \
82}
83
84#define iterate_all_kinds(i, n, v, I, B, K) { \
85 size_t skip = i->iov_offset; \
86 if (unlikely(i->type & ITER_BVEC)) { \
87 const struct bio_vec *bvec; \
88 struct bio_vec v; \
89 iterate_bvec(i, n, v, bvec, skip, (B)) \
90 } else if (unlikely(i->type & ITER_KVEC)) { \
91 const struct kvec *kvec; \
92 struct kvec v; \
93 iterate_kvec(i, n, v, kvec, skip, (K)) \
94 } else { \
95 const struct iovec *iov; \
96 struct iovec v; \
97 iterate_iovec(i, n, v, iov, skip, (I)) \
98 } \
99}
100
101#define iterate_and_advance(i, n, v, I, B, K) { \
102 size_t skip = i->iov_offset; \
103 if (unlikely(i->type & ITER_BVEC)) { \
104 const struct bio_vec *bvec; \
105 struct bio_vec v; \
106 iterate_bvec(i, n, v, bvec, skip, (B)) \
107 if (skip == bvec->bv_len) { \
108 bvec++; \
109 skip = 0; \
110 } \
111 i->nr_segs -= bvec - i->bvec; \
112 i->bvec = bvec; \
113 } else if (unlikely(i->type & ITER_KVEC)) { \
114 const struct kvec *kvec; \
115 struct kvec v; \
116 iterate_kvec(i, n, v, kvec, skip, (K)) \
117 if (skip == kvec->iov_len) { \
118 kvec++; \
119 skip = 0; \
120 } \
121 i->nr_segs -= kvec - i->kvec; \
122 i->kvec = kvec; \
123 } else { \
124 const struct iovec *iov; \
125 struct iovec v; \
126 iterate_iovec(i, n, v, iov, skip, (I)) \
127 if (skip == iov->iov_len) { \
128 iov++; \
129 skip = 0; \
130 } \
131 i->nr_segs -= iov - i->iov; \
132 i->iov = iov; \
133 } \
134 i->count -= n; \
135 i->iov_offset = skip; \
136}
137
138static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
139 struct iov_iter *i)
140{
141 size_t skip, copy, left, wanted;
142 const struct iovec *iov;
143 char __user *buf;
144 void *kaddr, *from;
145
146 if (unlikely(bytes > i->count))
147 bytes = i->count;
148
149 if (unlikely(!bytes))
150 return 0;
151
152 wanted = bytes;
153 iov = i->iov;
154 skip = i->iov_offset;
155 buf = iov->iov_base + skip;
156 copy = min(bytes, iov->iov_len - skip);
157
158 if (!fault_in_pages_writeable(buf, copy)) {
159 kaddr = kmap_atomic(page);
160 from = kaddr + offset;
161
162 /* first chunk, usually the only one */
163 left = __copy_to_user_inatomic(buf, from, copy);
164 copy -= left;
165 skip += copy;
166 from += copy;
167 bytes -= copy;
168
169 while (unlikely(!left && bytes)) {
170 iov++;
171 buf = iov->iov_base;
172 copy = min(bytes, iov->iov_len);
173 left = __copy_to_user_inatomic(buf, from, copy);
174 copy -= left;
175 skip = copy;
176 from += copy;
177 bytes -= copy;
178 }
179 if (likely(!bytes)) {
180 kunmap_atomic(kaddr);
181 goto done;
182 }
183 offset = from - kaddr;
184 buf += copy;
185 kunmap_atomic(kaddr);
186 copy = min(bytes, iov->iov_len - skip);
187 }
188 /* Too bad - revert to non-atomic kmap */
189 kaddr = kmap(page);
190 from = kaddr + offset;
191 left = __copy_to_user(buf, from, copy);
192 copy -= left;
193 skip += copy;
194 from += copy;
195 bytes -= copy;
196 while (unlikely(!left && bytes)) {
197 iov++;
198 buf = iov->iov_base;
199 copy = min(bytes, iov->iov_len);
200 left = __copy_to_user(buf, from, copy);
201 copy -= left;
202 skip = copy;
203 from += copy;
204 bytes -= copy;
205 }
206 kunmap(page);
207done:
208 if (skip == iov->iov_len) {
209 iov++;
210 skip = 0;
211 }
212 i->count -= wanted - bytes;
213 i->nr_segs -= iov - i->iov;
214 i->iov = iov;
215 i->iov_offset = skip;
216 return wanted - bytes;
217}
218
219static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
220 struct iov_iter *i)
221{
222 size_t skip, copy, left, wanted;
223 const struct iovec *iov;
224 char __user *buf;
225 void *kaddr, *to;
226
227 if (unlikely(bytes > i->count))
228 bytes = i->count;
229
230 if (unlikely(!bytes))
231 return 0;
232
233 wanted = bytes;
234 iov = i->iov;
235 skip = i->iov_offset;
236 buf = iov->iov_base + skip;
237 copy = min(bytes, iov->iov_len - skip);
238
239 if (!fault_in_pages_readable(buf, copy)) {
240 kaddr = kmap_atomic(page);
241 to = kaddr + offset;
242
243 /* first chunk, usually the only one */
244 left = __copy_from_user_inatomic(to, buf, copy);
245 copy -= left;
246 skip += copy;
247 to += copy;
248 bytes -= copy;
249
250 while (unlikely(!left && bytes)) {
251 iov++;
252 buf = iov->iov_base;
253 copy = min(bytes, iov->iov_len);
254 left = __copy_from_user_inatomic(to, buf, copy);
255 copy -= left;
256 skip = copy;
257 to += copy;
258 bytes -= copy;
259 }
260 if (likely(!bytes)) {
261 kunmap_atomic(kaddr);
262 goto done;
263 }
264 offset = to - kaddr;
265 buf += copy;
266 kunmap_atomic(kaddr);
267 copy = min(bytes, iov->iov_len - skip);
268 }
269 /* Too bad - revert to non-atomic kmap */
270 kaddr = kmap(page);
271 to = kaddr + offset;
272 left = __copy_from_user(to, buf, copy);
273 copy -= left;
274 skip += copy;
275 to += copy;
276 bytes -= copy;
277 while (unlikely(!left && bytes)) {
278 iov++;
279 buf = iov->iov_base;
280 copy = min(bytes, iov->iov_len);
281 left = __copy_from_user(to, buf, copy);
282 copy -= left;
283 skip = copy;
284 to += copy;
285 bytes -= copy;
286 }
287 kunmap(page);
288done:
289 if (skip == iov->iov_len) {
290 iov++;
291 skip = 0;
292 }
293 i->count -= wanted - bytes;
294 i->nr_segs -= iov - i->iov;
295 i->iov = iov;
296 i->iov_offset = skip;
297 return wanted - bytes;
298}
299
300/*
301 * Fault in the first iovec of the given iov_iter, to a maximum length
302 * of bytes. Returns 0 on success, or non-zero if the memory could not be
303 * accessed (ie. because it is an invalid address).
304 *
305 * writev-intensive code may want this to prefault several iovecs -- that
306 * would be possible (callers must not rely on the fact that _only_ the
307 * first iovec will be faulted with the current implementation).
308 */
309int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
310{
311 if (!(i->type & (ITER_BVEC|ITER_KVEC))) {
312 char __user *buf = i->iov->iov_base + i->iov_offset;
313 bytes = min(bytes, i->iov->iov_len - i->iov_offset);
314 return fault_in_pages_readable(buf, bytes);
315 }
316 return 0;
317}
318EXPORT_SYMBOL(iov_iter_fault_in_readable);
319
320void iov_iter_init(struct iov_iter *i, int direction,
321 const struct iovec *iov, unsigned long nr_segs,
322 size_t count)
323{
324 /* It will get better. Eventually... */
325 if (segment_eq(get_fs(), KERNEL_DS)) {
326 direction |= ITER_KVEC;
327 i->type = direction;
328 i->kvec = (struct kvec *)iov;
329 } else {
330 i->type = direction;
331 i->iov = iov;
332 }
333 i->nr_segs = nr_segs;
334 i->iov_offset = 0;
335 i->count = count;
336}
337EXPORT_SYMBOL(iov_iter_init);
338
339static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len)
340{
341 char *from = kmap_atomic(page);
342 memcpy(to, from + offset, len);
343 kunmap_atomic(from);
344}
345
346static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len)
347{
348 char *to = kmap_atomic(page);
349 memcpy(to + offset, from, len);
350 kunmap_atomic(to);
351}
352
353static void memzero_page(struct page *page, size_t offset, size_t len)
354{
355 char *addr = kmap_atomic(page);
356 memset(addr + offset, 0, len);
357 kunmap_atomic(addr);
358}
359
360size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
361{
362 char *from = addr;
363 if (unlikely(bytes > i->count))
364 bytes = i->count;
365
366 if (unlikely(!bytes))
367 return 0;
368
369 iterate_and_advance(i, bytes, v,
370 __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len,
371 v.iov_len),
372 memcpy_to_page(v.bv_page, v.bv_offset,
373 (from += v.bv_len) - v.bv_len, v.bv_len),
374 memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len)
375 )
376
377 return bytes;
378}
379EXPORT_SYMBOL(copy_to_iter);
380
381size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
382{
383 char *to = addr;
384 if (unlikely(bytes > i->count))
385 bytes = i->count;
386
387 if (unlikely(!bytes))
388 return 0;
389
390 iterate_and_advance(i, bytes, v,
391 __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base,
392 v.iov_len),
393 memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
394 v.bv_offset, v.bv_len),
395 memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
396 )
397
398 return bytes;
399}
400EXPORT_SYMBOL(copy_from_iter);
401
402size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
403{
404 char *to = addr;
405 if (unlikely(bytes > i->count))
406 bytes = i->count;
407
408 if (unlikely(!bytes))
409 return 0;
410
411 iterate_and_advance(i, bytes, v,
412 __copy_from_user_nocache((to += v.iov_len) - v.iov_len,
413 v.iov_base, v.iov_len),
414 memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page,
415 v.bv_offset, v.bv_len),
416 memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
417 )
418
419 return bytes;
420}
421EXPORT_SYMBOL(copy_from_iter_nocache);
422
423size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
424 struct iov_iter *i)
425{
426 if (i->type & (ITER_BVEC|ITER_KVEC)) {
427 void *kaddr = kmap_atomic(page);
428 size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
429 kunmap_atomic(kaddr);
430 return wanted;
431 } else
432 return copy_page_to_iter_iovec(page, offset, bytes, i);
433}
434EXPORT_SYMBOL(copy_page_to_iter);
435
436size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
437 struct iov_iter *i)
438{
439 if (i->type & (ITER_BVEC|ITER_KVEC)) {
440 void *kaddr = kmap_atomic(page);
441 size_t wanted = copy_from_iter(kaddr + offset, bytes, i);
442 kunmap_atomic(kaddr);
443 return wanted;
444 } else
445 return copy_page_from_iter_iovec(page, offset, bytes, i);
446}
447EXPORT_SYMBOL(copy_page_from_iter);
448
449size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
450{
451 if (unlikely(bytes > i->count))
452 bytes = i->count;
453
454 if (unlikely(!bytes))
455 return 0;
456
457 iterate_and_advance(i, bytes, v,
458 __clear_user(v.iov_base, v.iov_len),
459 memzero_page(v.bv_page, v.bv_offset, v.bv_len),
460 memset(v.iov_base, 0, v.iov_len)
461 )
462
463 return bytes;
464}
465EXPORT_SYMBOL(iov_iter_zero);
466
467size_t iov_iter_copy_from_user_atomic(struct page *page,
468 struct iov_iter *i, unsigned long offset, size_t bytes)
469{
470 char *kaddr = kmap_atomic(page), *p = kaddr + offset;
471 iterate_all_kinds(i, bytes, v,
472 __copy_from_user_inatomic((p += v.iov_len) - v.iov_len,
473 v.iov_base, v.iov_len),
474 memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page,
475 v.bv_offset, v.bv_len),
476 memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)
477 )
478 kunmap_atomic(kaddr);
479 return bytes;
480}
481EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
482
483void iov_iter_advance(struct iov_iter *i, size_t size)
484{
485 iterate_and_advance(i, size, v, 0, 0, 0)
486}
487EXPORT_SYMBOL(iov_iter_advance);
488
489/*
490 * Return the count of just the current iov_iter segment.
491 */
492size_t iov_iter_single_seg_count(const struct iov_iter *i)
493{
494 if (i->nr_segs == 1)
495 return i->count;
496 else if (i->type & ITER_BVEC)
497 return min(i->count, i->bvec->bv_len - i->iov_offset);
498 else
499 return min(i->count, i->iov->iov_len - i->iov_offset);
500}
501EXPORT_SYMBOL(iov_iter_single_seg_count);
502
503void iov_iter_kvec(struct iov_iter *i, int direction,
504 const struct kvec *kvec, unsigned long nr_segs,
505 size_t count)
506{
507 BUG_ON(!(direction & ITER_KVEC));
508 i->type = direction;
509 i->kvec = kvec;
510 i->nr_segs = nr_segs;
511 i->iov_offset = 0;
512 i->count = count;
513}
514EXPORT_SYMBOL(iov_iter_kvec);
515
516void iov_iter_bvec(struct iov_iter *i, int direction,
517 const struct bio_vec *bvec, unsigned long nr_segs,
518 size_t count)
519{
520 BUG_ON(!(direction & ITER_BVEC));
521 i->type = direction;
522 i->bvec = bvec;
523 i->nr_segs = nr_segs;
524 i->iov_offset = 0;
525 i->count = count;
526}
527EXPORT_SYMBOL(iov_iter_bvec);
528
529unsigned long iov_iter_alignment(const struct iov_iter *i)
530{
531 unsigned long res = 0;
532 size_t size = i->count;
533
534 if (!size)
535 return 0;
536
537 iterate_all_kinds(i, size, v,
538 (res |= (unsigned long)v.iov_base | v.iov_len, 0),
539 res |= v.bv_offset | v.bv_len,
540 res |= (unsigned long)v.iov_base | v.iov_len
541 )
542 return res;
543}
544EXPORT_SYMBOL(iov_iter_alignment);
545
546ssize_t iov_iter_get_pages(struct iov_iter *i,
547 struct page **pages, size_t maxsize, unsigned maxpages,
548 size_t *start)
549{
550 if (maxsize > i->count)
551 maxsize = i->count;
552
553 if (!maxsize)
554 return 0;
555
556 iterate_all_kinds(i, maxsize, v, ({
557 unsigned long addr = (unsigned long)v.iov_base;
558 size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
559 int n;
560 int res;
561
562 if (len > maxpages * PAGE_SIZE)
563 len = maxpages * PAGE_SIZE;
564 addr &= ~(PAGE_SIZE - 1);
565 n = DIV_ROUND_UP(len, PAGE_SIZE);
566 res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages);
567 if (unlikely(res < 0))
568 return res;
569 return (res == n ? len : res * PAGE_SIZE) - *start;
570 0;}),({
571 /* can't be more than PAGE_SIZE */
572 *start = v.bv_offset;
573 get_page(*pages = v.bv_page);
574 return v.bv_len;
575 }),({
576 return -EFAULT;
577 })
578 )
579 return 0;
580}
581EXPORT_SYMBOL(iov_iter_get_pages);
582
583static struct page **get_pages_array(size_t n)
584{
585 struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
586 if (!p)
587 p = vmalloc(n * sizeof(struct page *));
588 return p;
589}
590
591ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
592 struct page ***pages, size_t maxsize,
593 size_t *start)
594{
595 struct page **p;
596
597 if (maxsize > i->count)
598 maxsize = i->count;
599
600 if (!maxsize)
601 return 0;
602
603 iterate_all_kinds(i, maxsize, v, ({
604 unsigned long addr = (unsigned long)v.iov_base;
605 size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
606 int n;
607 int res;
608
609 addr &= ~(PAGE_SIZE - 1);
610 n = DIV_ROUND_UP(len, PAGE_SIZE);
611 p = get_pages_array(n);
612 if (!p)
613 return -ENOMEM;
614 res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p);
615 if (unlikely(res < 0)) {
616 kvfree(p);
617 return res;
618 }
619 *pages = p;
620 return (res == n ? len : res * PAGE_SIZE) - *start;
621 0;}),({
622 /* can't be more than PAGE_SIZE */
623 *start = v.bv_offset;
624 *pages = p = get_pages_array(1);
625 if (!p)
626 return -ENOMEM;
627 get_page(*p = v.bv_page);
628 return v.bv_len;
629 }),({
630 return -EFAULT;
631 })
632 )
633 return 0;
634}
635EXPORT_SYMBOL(iov_iter_get_pages_alloc);
636
637size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
638 struct iov_iter *i)
639{
640 char *to = addr;
641 __wsum sum, next;
642 size_t off = 0;
643 if (unlikely(bytes > i->count))
644 bytes = i->count;
645
646 if (unlikely(!bytes))
647 return 0;
648
649 sum = *csum;
650 iterate_and_advance(i, bytes, v, ({
651 int err = 0;
652 next = csum_and_copy_from_user(v.iov_base,
653 (to += v.iov_len) - v.iov_len,
654 v.iov_len, 0, &err);
655 if (!err) {
656 sum = csum_block_add(sum, next, off);
657 off += v.iov_len;
658 }
659 err ? v.iov_len : 0;
660 }), ({
661 char *p = kmap_atomic(v.bv_page);
662 next = csum_partial_copy_nocheck(p + v.bv_offset,
663 (to += v.bv_len) - v.bv_len,
664 v.bv_len, 0);
665 kunmap_atomic(p);
666 sum = csum_block_add(sum, next, off);
667 off += v.bv_len;
668 }),({
669 next = csum_partial_copy_nocheck(v.iov_base,
670 (to += v.iov_len) - v.iov_len,
671 v.iov_len, 0);
672 sum = csum_block_add(sum, next, off);
673 off += v.iov_len;
674 })
675 )
676 *csum = sum;
677 return bytes;
678}
679EXPORT_SYMBOL(csum_and_copy_from_iter);
680
681size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum,
682 struct iov_iter *i)
683{
684 char *from = addr;
685 __wsum sum, next;
686 size_t off = 0;
687 if (unlikely(bytes > i->count))
688 bytes = i->count;
689
690 if (unlikely(!bytes))
691 return 0;
692
693 sum = *csum;
694 iterate_and_advance(i, bytes, v, ({
695 int err = 0;
696 next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
697 v.iov_base,
698 v.iov_len, 0, &err);
699 if (!err) {
700 sum = csum_block_add(sum, next, off);
701 off += v.iov_len;
702 }
703 err ? v.iov_len : 0;
704 }), ({
705 char *p = kmap_atomic(v.bv_page);
706 next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len,
707 p + v.bv_offset,
708 v.bv_len, 0);
709 kunmap_atomic(p);
710 sum = csum_block_add(sum, next, off);
711 off += v.bv_len;
712 }),({
713 next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len,
714 v.iov_base,
715 v.iov_len, 0);
716 sum = csum_block_add(sum, next, off);
717 off += v.iov_len;
718 })
719 )
720 *csum = sum;
721 return bytes;
722}
723EXPORT_SYMBOL(csum_and_copy_to_iter);
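[note] Both checksum-copy routines above accumulate per-segment partial sums with csum_block_add(), passing the running byte offset so odd-offset segments fold in correctly. A hedged illustration of that folding pattern outside the iterator machinery (assumes <net/checksum.h>; not part of the patch):

	#include <net/checksum.h>

	/* Checksum two buffers as if they were one contiguous region. */
	static __wsum example_csum_two(const void *a, size_t alen,
				       const void *b, size_t blen)
	{
		__wsum sum = csum_partial(a, alen, 0);

		/* The second partial sum starts at byte offset alen of the whole. */
		return csum_block_add(sum, csum_partial(b, blen, 0), alen);
	}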
724
725int iov_iter_npages(const struct iov_iter *i, int maxpages)
726{
727 size_t size = i->count;
728 int npages = 0;
729
730 if (!size)
731 return 0;
732
733 iterate_all_kinds(i, size, v, ({
734 unsigned long p = (unsigned long)v.iov_base;
735 npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
736 - p / PAGE_SIZE;
737 if (npages >= maxpages)
738 return maxpages;
739 0;}),({
740 npages++;
741 if (npages >= maxpages)
742 return maxpages;
743 }),({
744 unsigned long p = (unsigned long)v.iov_base;
745 npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
746 - p / PAGE_SIZE;
747 if (npages >= maxpages)
748 return maxpages;
749 })
750 )
751 return npages;
752}
753EXPORT_SYMBOL(iov_iter_npages);
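[note] Taken together, the constructors above (iov_iter_kvec(), iov_iter_bvec()) plus iov_iter_npages() let a caller describe a buffer once and then ask how many pages it spans before copying through it. A hedged usage sketch with illustrative names (assumes <linux/uio.h>; direction flags as used elsewhere in this file):

	#include <linux/kernel.h>
	#include <linux/uio.h>

	static int example_describe_kbuf(void *buf, size_t len)
	{
		struct kvec kv = { .iov_base = buf, .iov_len = len };
		struct iov_iter iter;

		iov_iter_kvec(&iter, ITER_KVEC | READ, &kv, 1, len);

		/* Upper bound on the pages the buffer touches, capped at INT_MAX;
		 * &iter can then be handed to copy_to_iter()/copy_from_iter(). */
		return iov_iter_npages(&iter, INT_MAX);
	}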
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 78fee632a7ee..6c513a63ea84 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -29,6 +29,7 @@
29#include <linux/stacktrace.h> 29#include <linux/stacktrace.h>
30#include <linux/string.h> 30#include <linux/string.h>
31#include <linux/types.h> 31#include <linux/types.h>
32#include <linux/vmalloc.h>
32#include <linux/kasan.h> 33#include <linux/kasan.h>
33 34
34#include "kasan.h" 35#include "kasan.h"
@@ -388,6 +389,19 @@ void kasan_krealloc(const void *object, size_t size)
388 kasan_kmalloc(page->slab_cache, object, size); 389 kasan_kmalloc(page->slab_cache, object, size);
389} 390}
390 391
392void kasan_kfree(void *ptr)
393{
394 struct page *page;
395
396 page = virt_to_head_page(ptr);
397
398 if (unlikely(!PageSlab(page)))
399 kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
400 KASAN_FREE_PAGE);
401 else
402 kasan_slab_free(page->slab_cache, ptr);
403}
404
391void kasan_kfree_large(const void *ptr) 405void kasan_kfree_large(const void *ptr)
392{ 406{
393 struct page *page = virt_to_page(ptr); 407 struct page *page = virt_to_page(ptr);
@@ -414,12 +428,19 @@ int kasan_module_alloc(void *addr, size_t size)
414 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 428 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
415 PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, 429 PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
416 __builtin_return_address(0)); 430 __builtin_return_address(0));
417 return ret ? 0 : -ENOMEM; 431
432 if (ret) {
433 find_vm_area(addr)->flags |= VM_KASAN;
434 return 0;
435 }
436
437 return -ENOMEM;
418} 438}
419 439
420void kasan_module_free(void *addr) 440void kasan_free_shadow(const struct vm_struct *vm)
421{ 441{
422 vfree(kasan_mem_to_shadow(addr)); 442 if (vm->flags & VM_KASAN)
443 vfree(kasan_mem_to_shadow(vm->addr));
423} 444}
424 445
425static void register_global(struct kasan_global *global) 446static void register_global(struct kasan_global *global)
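[note] The kasan hunks above add kasan_kfree(), which dispatches between slab and page-allocator poisoning by inspecting the head page, and replace kasan_module_free() with kasan_free_shadow(), which only acts on vmalloc areas tagged VM_KASAN when their shadow was set up in kasan_module_alloc(). The exact call site is not part of this diff; a hedged teardown-side sketch:

	/* Hypothetical vmalloc teardown step: release the shadow first. */
	static void example_release_area(const struct vm_struct *area)
	{
		kasan_free_shadow(area);	/* no-op unless area->flags has VM_KASAN */
		/* ... then free the area's own pages and the vm_struct itself ... */
	}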
diff --git a/mm/ksm.c b/mm/ksm.c
index 4162dce2eb44..7ee101eaacdf 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -542,7 +542,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
542 expected_mapping = (void *)stable_node + 542 expected_mapping = (void *)stable_node +
543 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); 543 (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
544again: 544again:
545 kpfn = ACCESS_ONCE(stable_node->kpfn); 545 kpfn = READ_ONCE(stable_node->kpfn);
546 page = pfn_to_page(kpfn); 546 page = pfn_to_page(kpfn);
547 547
548 /* 548 /*
@@ -551,7 +551,7 @@ again:
551 * but on Alpha we need to be more careful. 551 * but on Alpha we need to be more careful.
552 */ 552 */
553 smp_read_barrier_depends(); 553 smp_read_barrier_depends();
554 if (ACCESS_ONCE(page->mapping) != expected_mapping) 554 if (READ_ONCE(page->mapping) != expected_mapping)
555 goto stale; 555 goto stale;
556 556
557 /* 557 /*
@@ -577,14 +577,14 @@ again:
577 cpu_relax(); 577 cpu_relax();
578 } 578 }
579 579
580 if (ACCESS_ONCE(page->mapping) != expected_mapping) { 580 if (READ_ONCE(page->mapping) != expected_mapping) {
581 put_page(page); 581 put_page(page);
582 goto stale; 582 goto stale;
583 } 583 }
584 584
585 if (lock_it) { 585 if (lock_it) {
586 lock_page(page); 586 lock_page(page);
587 if (ACCESS_ONCE(page->mapping) != expected_mapping) { 587 if (READ_ONCE(page->mapping) != expected_mapping) {
588 unlock_page(page); 588 unlock_page(page);
589 put_page(page); 589 put_page(page);
590 goto stale; 590 goto stale;
@@ -600,7 +600,7 @@ stale:
600 * before checking whether node->kpfn has been changed. 600 * before checking whether node->kpfn has been changed.
601 */ 601 */
602 smp_rmb(); 602 smp_rmb();
603 if (ACCESS_ONCE(stable_node->kpfn) != kpfn) 603 if (READ_ONCE(stable_node->kpfn) != kpfn)
604 goto again; 604 goto again;
605 remove_node_from_stable_tree(stable_node); 605 remove_node_from_stable_tree(stable_node);
606 return NULL; 606 return NULL;
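[note] The ksm.c changes are a mechanical ACCESS_ONCE() -> READ_ONCE() conversion: both force a single, non-reordered load of the annotated location, but READ_ONCE() also handles non-scalar types and pairs naturally with WRITE_ONCE(). The before/after shape, lifted straight from the hunks above:

	/* Before: relies on a volatile cast that only works for scalars. */
	kpfn = ACCESS_ONCE(stable_node->kpfn);

	/* After: preferred single-load annotation, same semantics here. */
	kpfn = READ_ONCE(stable_node->kpfn);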
diff --git a/mm/memblock.c b/mm/memblock.c
index 252b77bdf65e..9318b567ed79 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -580,10 +580,24 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
580 return memblock_add_range(&memblock.memory, base, size, nid, 0); 580 return memblock_add_range(&memblock.memory, base, size, nid, 0);
581} 581}
582 582
583static int __init_memblock memblock_add_region(phys_addr_t base,
584 phys_addr_t size,
585 int nid,
586 unsigned long flags)
587{
588 struct memblock_type *_rgn = &memblock.memory;
589
590 memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
591 (unsigned long long)base,
592 (unsigned long long)base + size - 1,
593 flags, (void *)_RET_IP_);
594
595 return memblock_add_range(_rgn, base, size, nid, flags);
596}
597
583int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) 598int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
584{ 599{
585 return memblock_add_range(&memblock.memory, base, size, 600 return memblock_add_region(base, size, MAX_NUMNODES, 0);
586 MAX_NUMNODES, 0);
587} 601}
588 602
589/** 603/**
@@ -699,14 +713,14 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base,
699 int nid, 713 int nid,
700 unsigned long flags) 714 unsigned long flags)
701{ 715{
702 struct memblock_type *_rgn = &memblock.reserved; 716 struct memblock_type *type = &memblock.reserved;
703 717
704 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", 718 memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
705 (unsigned long long)base, 719 (unsigned long long)base,
706 (unsigned long long)base + size - 1, 720 (unsigned long long)base + size - 1,
707 flags, (void *)_RET_IP_); 721 flags, (void *)_RET_IP_);
708 722
709 return memblock_add_range(_rgn, base, size, nid, flags); 723 return memblock_add_range(type, base, size, nid, flags);
710} 724}
711 725
712int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) 726int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
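[note] With the new memblock_add_region() wrapper, memblock_add() now emits the same memblock_dbg() trace that memblock_reserve() already did, so early-boot registrations can be audited symmetrically. A hedged arch-setup sketch (addresses and sizes are made up; assumes <linux/memblock.h> and <linux/sizes.h>):

	#include <linux/memblock.h>
	#include <linux/sizes.h>

	void __init example_arch_register_memory(void)
	{
		/* Both calls now log [base, end) plus flags under memblock=debug. */
		memblock_add(0x80000000, SZ_256M);	/* usable RAM */
		memblock_reserve(0x80000000, SZ_1M);	/* firmware/boot data */
	}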
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9fe07692eaad..14c2f2017e37 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -14,6 +14,12 @@
14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
15 * Authors: Glauber Costa and Suleiman Souhlal 15 * Authors: Glauber Costa and Suleiman Souhlal
16 * 16 *
17 * Native page reclaim
18 * Charge lifetime sanitation
19 * Lockless page tracking & accounting
20 * Unified hierarchy configuration model
21 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
22 *
17 * This program is free software; you can redistribute it and/or modify 23 * This program is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License as published by 24 * it under the terms of the GNU General Public License as published by
19 * the Free Software Foundation; either version 2 of the License, or 25 * the Free Software Foundation; either version 2 of the License, or
@@ -253,11 +259,6 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
253 * page cache and RSS per cgroup. We would eventually like to provide 259 * page cache and RSS per cgroup. We would eventually like to provide
254 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 260 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
255 * to help the administrator determine what knobs to tune. 261 * to help the administrator determine what knobs to tune.
256 *
257 * TODO: Add a water mark for the memory controller. Reclaim will begin when
258 * we hit the water mark. May be even add a low water mark, such that
259 * no reclaim occurs from a cgroup at it's low water mark, this is
260 * a feature that will be implemented much later in the future.
261 */ 262 */
262struct mem_cgroup { 263struct mem_cgroup {
263 struct cgroup_subsys_state css; 264 struct cgroup_subsys_state css;
@@ -454,6 +455,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
454 return memcg->css.id; 455 return memcg->css.id;
455} 456}
456 457
458/*
459 * A helper function to get mem_cgroup from ID. must be called under
460 * rcu_read_lock(). The caller is responsible for calling
461 * css_tryget_online() if the mem_cgroup is used for charging. (dropping
462 * refcnt from swap can be called against removed memcg.)
463 */
457static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 464static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
458{ 465{
459 struct cgroup_subsys_state *css; 466 struct cgroup_subsys_state *css;
@@ -667,7 +674,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
667static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 674static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
668{ 675{
669 unsigned long nr_pages = page_counter_read(&memcg->memory); 676 unsigned long nr_pages = page_counter_read(&memcg->memory);
670 unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); 677 unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
671 unsigned long excess = 0; 678 unsigned long excess = 0;
672 679
673 if (nr_pages > soft_limit) 680 if (nr_pages > soft_limit)
@@ -1035,7 +1042,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1035 goto out_unlock; 1042 goto out_unlock;
1036 1043
1037 do { 1044 do {
1038 pos = ACCESS_ONCE(iter->position); 1045 pos = READ_ONCE(iter->position);
1039 /* 1046 /*
1040 * A racing update may change the position and 1047 * A racing update may change the position and
1041 * put the last reference, hence css_tryget(), 1048 * put the last reference, hence css_tryget(),
@@ -1352,13 +1359,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1352 unsigned long limit; 1359 unsigned long limit;
1353 1360
1354 count = page_counter_read(&memcg->memory); 1361 count = page_counter_read(&memcg->memory);
1355 limit = ACCESS_ONCE(memcg->memory.limit); 1362 limit = READ_ONCE(memcg->memory.limit);
1356 if (count < limit) 1363 if (count < limit)
1357 margin = limit - count; 1364 margin = limit - count;
1358 1365
1359 if (do_swap_account) { 1366 if (do_swap_account) {
1360 count = page_counter_read(&memcg->memsw); 1367 count = page_counter_read(&memcg->memsw);
1361 limit = ACCESS_ONCE(memcg->memsw.limit); 1368 limit = READ_ONCE(memcg->memsw.limit);
1362 if (count <= limit) 1369 if (count <= limit)
1363 margin = min(margin, limit - count); 1370 margin = min(margin, limit - count);
1364 } 1371 }
@@ -1436,15 +1443,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1436 struct mem_cgroup *iter; 1443 struct mem_cgroup *iter;
1437 unsigned int i; 1444 unsigned int i;
1438 1445
1439 if (!p)
1440 return;
1441
1442 mutex_lock(&oom_info_lock); 1446 mutex_lock(&oom_info_lock);
1443 rcu_read_lock(); 1447 rcu_read_lock();
1444 1448
1445 pr_info("Task in "); 1449 if (p) {
1446 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1450 pr_info("Task in ");
1447 pr_cont(" killed as a result of limit of "); 1451 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1452 pr_cont(" killed as a result of limit of ");
1453 } else {
1454 pr_info("Memory limit reached of cgroup ");
1455 }
1456
1448 pr_cont_cgroup_path(memcg->css.cgroup); 1457 pr_cont_cgroup_path(memcg->css.cgroup);
1449 pr_cont("\n"); 1458 pr_cont("\n");
1450 1459
@@ -1531,7 +1540,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1531 return; 1540 return;
1532 } 1541 }
1533 1542
1534 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1543 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
1535 totalpages = mem_cgroup_get_limit(memcg) ? : 1; 1544 totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1536 for_each_mem_cgroup_tree(iter, memcg) { 1545 for_each_mem_cgroup_tree(iter, memcg) {
1537 struct css_task_iter it; 1546 struct css_task_iter it;
@@ -2341,20 +2350,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2341} 2350}
2342 2351
2343/* 2352/*
2344 * A helper function to get mem_cgroup from ID. must be called under
2345 * rcu_read_lock(). The caller is responsible for calling
2346 * css_tryget_online() if the mem_cgroup is used for charging. (dropping
2347 * refcnt from swap can be called against removed memcg.)
2348 */
2349static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2350{
2351 /* ID 0 is unused ID */
2352 if (!id)
2353 return NULL;
2354 return mem_cgroup_from_id(id);
2355}
2356
2357/*
2358 * try_get_mem_cgroup_from_page - look up page's memcg association 2353 * try_get_mem_cgroup_from_page - look up page's memcg association
2359 * @page: the page 2354 * @page: the page
2360 * 2355 *
@@ -2380,7 +2375,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2380 ent.val = page_private(page); 2375 ent.val = page_private(page);
2381 id = lookup_swap_cgroup_id(ent); 2376 id = lookup_swap_cgroup_id(ent);
2382 rcu_read_lock(); 2377 rcu_read_lock();
2383 memcg = mem_cgroup_lookup(id); 2378 memcg = mem_cgroup_from_id(id);
2384 if (memcg && !css_tryget_online(&memcg->css)) 2379 if (memcg && !css_tryget_online(&memcg->css))
2385 memcg = NULL; 2380 memcg = NULL;
2386 rcu_read_unlock(); 2381 rcu_read_unlock();
@@ -2642,7 +2637,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2642 return cachep; 2637 return cachep;
2643 2638
2644 memcg = get_mem_cgroup_from_mm(current->mm); 2639 memcg = get_mem_cgroup_from_mm(current->mm);
2645 kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); 2640 kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2646 if (kmemcg_id < 0) 2641 if (kmemcg_id < 0)
2647 goto out; 2642 goto out;
2648 2643
@@ -2779,92 +2774,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
2779} 2774}
2780#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 2775#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2781 2776
2782/**
2783 * mem_cgroup_move_account - move account of the page
2784 * @page: the page
2785 * @nr_pages: number of regular pages (>1 for huge pages)
2786 * @from: mem_cgroup which the page is moved from.
2787 * @to: mem_cgroup which the page is moved to. @from != @to.
2788 *
2789 * The caller must confirm following.
2790 * - page is not on LRU (isolate_page() is useful.)
2791 * - compound_lock is held when nr_pages > 1
2792 *
2793 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
2794 * from old cgroup.
2795 */
2796static int mem_cgroup_move_account(struct page *page,
2797 unsigned int nr_pages,
2798 struct mem_cgroup *from,
2799 struct mem_cgroup *to)
2800{
2801 unsigned long flags;
2802 int ret;
2803
2804 VM_BUG_ON(from == to);
2805 VM_BUG_ON_PAGE(PageLRU(page), page);
2806 /*
2807 * The page is isolated from LRU. So, collapse function
2808 * will not handle this page. But page splitting can happen.
2809 * Do this check under compound_page_lock(). The caller should
2810 * hold it.
2811 */
2812 ret = -EBUSY;
2813 if (nr_pages > 1 && !PageTransHuge(page))
2814 goto out;
2815
2816 /*
2817 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
2818 * of its source page while we change it: page migration takes
2819 * both pages off the LRU, but page cache replacement doesn't.
2820 */
2821 if (!trylock_page(page))
2822 goto out;
2823
2824 ret = -EINVAL;
2825 if (page->mem_cgroup != from)
2826 goto out_unlock;
2827
2828 spin_lock_irqsave(&from->move_lock, flags);
2829
2830 if (!PageAnon(page) && page_mapped(page)) {
2831 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
2832 nr_pages);
2833 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
2834 nr_pages);
2835 }
2836
2837 if (PageWriteback(page)) {
2838 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
2839 nr_pages);
2840 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
2841 nr_pages);
2842 }
2843
2844 /*
2845 * It is safe to change page->mem_cgroup here because the page
2846 * is referenced, charged, and isolated - we can't race with
2847 * uncharging, charging, migration, or LRU putback.
2848 */
2849
2850 /* caller should have done css_get */
2851 page->mem_cgroup = to;
2852 spin_unlock_irqrestore(&from->move_lock, flags);
2853
2854 ret = 0;
2855
2856 local_irq_disable();
2857 mem_cgroup_charge_statistics(to, page, nr_pages);
2858 memcg_check_events(to, page);
2859 mem_cgroup_charge_statistics(from, page, -nr_pages);
2860 memcg_check_events(from, page);
2861 local_irq_enable();
2862out_unlock:
2863 unlock_page(page);
2864out:
2865 return ret;
2866}
2867
2868#ifdef CONFIG_MEMCG_SWAP 2777#ifdef CONFIG_MEMCG_SWAP
2869static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 2778static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
2870 bool charge) 2779 bool charge)
@@ -4816,6 +4725,92 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4816 return page; 4725 return page;
4817} 4726}
4818 4727
4728/**
4729 * mem_cgroup_move_account - move account of the page
4730 * @page: the page
4731 * @nr_pages: number of regular pages (>1 for huge pages)
4732 * @from: mem_cgroup which the page is moved from.
4733 * @to: mem_cgroup which the page is moved to. @from != @to.
4734 *
4735 * The caller must confirm following.
4736 * - page is not on LRU (isolate_page() is useful.)
4737 * - compound_lock is held when nr_pages > 1
4738 *
4739 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
4740 * from old cgroup.
4741 */
4742static int mem_cgroup_move_account(struct page *page,
4743 unsigned int nr_pages,
4744 struct mem_cgroup *from,
4745 struct mem_cgroup *to)
4746{
4747 unsigned long flags;
4748 int ret;
4749
4750 VM_BUG_ON(from == to);
4751 VM_BUG_ON_PAGE(PageLRU(page), page);
4752 /*
4753 * The page is isolated from LRU. So, collapse function
4754 * will not handle this page. But page splitting can happen.
4755 * Do this check under compound_page_lock(). The caller should
4756 * hold it.
4757 */
4758 ret = -EBUSY;
4759 if (nr_pages > 1 && !PageTransHuge(page))
4760 goto out;
4761
4762 /*
4763 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
4764 * of its source page while we change it: page migration takes
4765 * both pages off the LRU, but page cache replacement doesn't.
4766 */
4767 if (!trylock_page(page))
4768 goto out;
4769
4770 ret = -EINVAL;
4771 if (page->mem_cgroup != from)
4772 goto out_unlock;
4773
4774 spin_lock_irqsave(&from->move_lock, flags);
4775
4776 if (!PageAnon(page) && page_mapped(page)) {
4777 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
4778 nr_pages);
4779 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
4780 nr_pages);
4781 }
4782
4783 if (PageWriteback(page)) {
4784 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
4785 nr_pages);
4786 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
4787 nr_pages);
4788 }
4789
4790 /*
4791 * It is safe to change page->mem_cgroup here because the page
4792 * is referenced, charged, and isolated - we can't race with
4793 * uncharging, charging, migration, or LRU putback.
4794 */
4795
4796 /* caller should have done css_get */
4797 page->mem_cgroup = to;
4798 spin_unlock_irqrestore(&from->move_lock, flags);
4799
4800 ret = 0;
4801
4802 local_irq_disable();
4803 mem_cgroup_charge_statistics(to, page, nr_pages);
4804 memcg_check_events(to, page);
4805 mem_cgroup_charge_statistics(from, page, -nr_pages);
4806 memcg_check_events(from, page);
4807 local_irq_enable();
4808out_unlock:
4809 unlock_page(page);
4810out:
4811 return ret;
4812}
4813
4819static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 4814static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4820 unsigned long addr, pte_t ptent, union mc_target *target) 4815 unsigned long addr, pte_t ptent, union mc_target *target)
4821{ 4816{
@@ -5012,7 +5007,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5012 * tunable will only affect upcoming migrations, not the current one. 5007 * tunable will only affect upcoming migrations, not the current one.
5013 * So we need to save it, and keep it going. 5008 * So we need to save it, and keep it going.
5014 */ 5009 */
5015 move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); 5010 move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5016 if (move_flags) { 5011 if (move_flags) {
5017 struct mm_struct *mm; 5012 struct mm_struct *mm;
5018 struct mem_cgroup *from = mem_cgroup_from_task(p); 5013 struct mem_cgroup *from = mem_cgroup_from_task(p);
@@ -5232,7 +5227,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5232 * on for the root memcg is enough. 5227 * on for the root memcg is enough.
5233 */ 5228 */
5234 if (cgroup_on_dfl(root_css->cgroup)) 5229 if (cgroup_on_dfl(root_css->cgroup))
5235 mem_cgroup_from_css(root_css)->use_hierarchy = true; 5230 root_mem_cgroup->use_hierarchy = true;
5231 else
5232 root_mem_cgroup->use_hierarchy = false;
5236} 5233}
5237 5234
5238static u64 memory_current_read(struct cgroup_subsys_state *css, 5235static u64 memory_current_read(struct cgroup_subsys_state *css,
@@ -5244,7 +5241,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
5244static int memory_low_show(struct seq_file *m, void *v) 5241static int memory_low_show(struct seq_file *m, void *v)
5245{ 5242{
5246 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5243 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5247 unsigned long low = ACCESS_ONCE(memcg->low); 5244 unsigned long low = READ_ONCE(memcg->low);
5248 5245
5249 if (low == PAGE_COUNTER_MAX) 5246 if (low == PAGE_COUNTER_MAX)
5250 seq_puts(m, "max\n"); 5247 seq_puts(m, "max\n");
@@ -5274,7 +5271,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
5274static int memory_high_show(struct seq_file *m, void *v) 5271static int memory_high_show(struct seq_file *m, void *v)
5275{ 5272{
5276 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5273 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5277 unsigned long high = ACCESS_ONCE(memcg->high); 5274 unsigned long high = READ_ONCE(memcg->high);
5278 5275
5279 if (high == PAGE_COUNTER_MAX) 5276 if (high == PAGE_COUNTER_MAX)
5280 seq_puts(m, "max\n"); 5277 seq_puts(m, "max\n");
@@ -5304,7 +5301,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
5304static int memory_max_show(struct seq_file *m, void *v) 5301static int memory_max_show(struct seq_file *m, void *v)
5305{ 5302{
5306 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5303 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5307 unsigned long max = ACCESS_ONCE(memcg->memory.limit); 5304 unsigned long max = READ_ONCE(memcg->memory.limit);
5308 5305
5309 if (max == PAGE_COUNTER_MAX) 5306 if (max == PAGE_COUNTER_MAX)
5310 seq_puts(m, "max\n"); 5307 seq_puts(m, "max\n");
@@ -5859,7 +5856,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
5859 5856
5860 id = swap_cgroup_record(entry, 0); 5857 id = swap_cgroup_record(entry, 0);
5861 rcu_read_lock(); 5858 rcu_read_lock();
5862 memcg = mem_cgroup_lookup(id); 5859 memcg = mem_cgroup_from_id(id);
5863 if (memcg) { 5860 if (memcg) {
5864 if (!mem_cgroup_is_root(memcg)) 5861 if (!mem_cgroup_is_root(memcg))
5865 page_counter_uncharge(&memcg->memsw, 1); 5862 page_counter_uncharge(&memcg->memsw, 1);
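[note] With mem_cgroup_lookup() folded into mem_cgroup_from_id(), every ID-based lookup in this file now follows the pattern documented in the comment added earlier in the diff: resolve the ID under rcu_read_lock() and take css_tryget_online() only if the memcg will actually be charged. A sketch of that pattern as it appears inside memcontrol.c (function name is illustrative):

	static struct mem_cgroup *example_get_memcg_by_id(unsigned short id)
	{
		struct mem_cgroup *memcg;

		rcu_read_lock();
		memcg = mem_cgroup_from_id(id);
		if (memcg && !css_tryget_online(&memcg->css))
			memcg = NULL;		/* cgroup went offline under us */
		rcu_read_unlock();

		return memcg;			/* caller drops the css ref when done */
	}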
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d487f8dc6d39..d9359b770cd9 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -521,6 +521,52 @@ static const char *action_name[] = {
521 [RECOVERED] = "Recovered", 521 [RECOVERED] = "Recovered",
522}; 522};
523 523
524enum action_page_type {
525 MSG_KERNEL,
526 MSG_KERNEL_HIGH_ORDER,
527 MSG_SLAB,
528 MSG_DIFFERENT_COMPOUND,
529 MSG_POISONED_HUGE,
530 MSG_HUGE,
531 MSG_FREE_HUGE,
532 MSG_UNMAP_FAILED,
533 MSG_DIRTY_SWAPCACHE,
534 MSG_CLEAN_SWAPCACHE,
535 MSG_DIRTY_MLOCKED_LRU,
536 MSG_CLEAN_MLOCKED_LRU,
537 MSG_DIRTY_UNEVICTABLE_LRU,
538 MSG_CLEAN_UNEVICTABLE_LRU,
539 MSG_DIRTY_LRU,
540 MSG_CLEAN_LRU,
541 MSG_TRUNCATED_LRU,
542 MSG_BUDDY,
543 MSG_BUDDY_2ND,
544 MSG_UNKNOWN,
545};
546
547static const char * const action_page_types[] = {
548 [MSG_KERNEL] = "reserved kernel page",
549 [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
550 [MSG_SLAB] = "kernel slab page",
551 [MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
552 [MSG_POISONED_HUGE] = "huge page already hardware poisoned",
553 [MSG_HUGE] = "huge page",
554 [MSG_FREE_HUGE] = "free huge page",
555 [MSG_UNMAP_FAILED] = "unmapping failed page",
556 [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page",
557 [MSG_CLEAN_SWAPCACHE] = "clean swapcache page",
558 [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page",
559 [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page",
560 [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page",
561 [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page",
562 [MSG_DIRTY_LRU] = "dirty LRU page",
563 [MSG_CLEAN_LRU] = "clean LRU page",
564 [MSG_TRUNCATED_LRU] = "already truncated LRU page",
565 [MSG_BUDDY] = "free buddy page",
566 [MSG_BUDDY_2ND] = "free buddy page (2nd try)",
567 [MSG_UNKNOWN] = "unknown page",
568};
569
524/* 570/*
525 * XXX: It is possible that a page is isolated from LRU cache, 571 * XXX: It is possible that a page is isolated from LRU cache,
526 * and then kept in swap cache or failed to remove from page cache. 572 * and then kept in swap cache or failed to remove from page cache.
@@ -777,10 +823,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
777static struct page_state { 823static struct page_state {
778 unsigned long mask; 824 unsigned long mask;
779 unsigned long res; 825 unsigned long res;
780 char *msg; 826 enum action_page_type type;
781 int (*action)(struct page *p, unsigned long pfn); 827 int (*action)(struct page *p, unsigned long pfn);
782} error_states[] = { 828} error_states[] = {
783 { reserved, reserved, "reserved kernel", me_kernel }, 829 { reserved, reserved, MSG_KERNEL, me_kernel },
784 /* 830 /*
785 * free pages are specially detected outside this table: 831 * free pages are specially detected outside this table:
786 * PG_buddy pages only make a small fraction of all free pages. 832 * PG_buddy pages only make a small fraction of all free pages.
@@ -791,31 +837,31 @@ static struct page_state {
791 * currently unused objects without touching them. But just 837 * currently unused objects without touching them. But just
792 * treat it as standard kernel for now. 838 * treat it as standard kernel for now.
793 */ 839 */
794 { slab, slab, "kernel slab", me_kernel }, 840 { slab, slab, MSG_SLAB, me_kernel },
795 841
796#ifdef CONFIG_PAGEFLAGS_EXTENDED 842#ifdef CONFIG_PAGEFLAGS_EXTENDED
797 { head, head, "huge", me_huge_page }, 843 { head, head, MSG_HUGE, me_huge_page },
798 { tail, tail, "huge", me_huge_page }, 844 { tail, tail, MSG_HUGE, me_huge_page },
799#else 845#else
800 { compound, compound, "huge", me_huge_page }, 846 { compound, compound, MSG_HUGE, me_huge_page },
801#endif 847#endif
802 848
803 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, 849 { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty },
804 { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, 850 { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean },
805 851
806 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, 852 { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty },
807 { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, 853 { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean },
808 854
809 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, 855 { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty },
810 { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, 856 { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean },
811 857
812 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, 858 { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty },
813 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 859 { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean },
814 860
815 /* 861 /*
816 * Catchall entry: must be at end. 862 * Catchall entry: must be at end.
817 */ 863 */
818 { 0, 0, "unknown page state", me_unknown }, 864 { 0, 0, MSG_UNKNOWN, me_unknown },
819}; 865};
820 866
821#undef dirty 867#undef dirty
@@ -835,10 +881,10 @@ static struct page_state {
835 * "Dirty/Clean" indication is not 100% accurate due to the possibility of 881 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
836 * setting PG_dirty outside page lock. See also comment above set_page_dirty(). 882 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
837 */ 883 */
838static void action_result(unsigned long pfn, char *msg, int result) 884static void action_result(unsigned long pfn, enum action_page_type type, int result)
839{ 885{
840 pr_err("MCE %#lx: %s page recovery: %s\n", 886 pr_err("MCE %#lx: recovery action for %s: %s\n",
841 pfn, msg, action_name[result]); 887 pfn, action_page_types[type], action_name[result]);
842} 888}
843 889
844static int page_action(struct page_state *ps, struct page *p, 890static int page_action(struct page_state *ps, struct page *p,
@@ -854,11 +900,11 @@ static int page_action(struct page_state *ps, struct page *p,
854 count--; 900 count--;
855 if (count != 0) { 901 if (count != 0) {
856 printk(KERN_ERR 902 printk(KERN_ERR
857 "MCE %#lx: %s page still referenced by %d users\n", 903 "MCE %#lx: %s still referenced by %d users\n",
858 pfn, ps->msg, count); 904 pfn, action_page_types[ps->type], count);
859 result = FAILED; 905 result = FAILED;
860 } 906 }
861 action_result(pfn, ps->msg, result); 907 action_result(pfn, ps->type, result);
862 908
863 /* Could do more checks here if page looks ok */ 909 /* Could do more checks here if page looks ok */
864 /* 910 /*
@@ -1106,7 +1152,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1106 if (!(flags & MF_COUNT_INCREASED) && 1152 if (!(flags & MF_COUNT_INCREASED) &&
1107 !get_page_unless_zero(hpage)) { 1153 !get_page_unless_zero(hpage)) {
1108 if (is_free_buddy_page(p)) { 1154 if (is_free_buddy_page(p)) {
1109 action_result(pfn, "free buddy", DELAYED); 1155 action_result(pfn, MSG_BUDDY, DELAYED);
1110 return 0; 1156 return 0;
1111 } else if (PageHuge(hpage)) { 1157 } else if (PageHuge(hpage)) {
1112 /* 1158 /*
@@ -1123,12 +1169,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1123 } 1169 }
1124 set_page_hwpoison_huge_page(hpage); 1170 set_page_hwpoison_huge_page(hpage);
1125 res = dequeue_hwpoisoned_huge_page(hpage); 1171 res = dequeue_hwpoisoned_huge_page(hpage);
1126 action_result(pfn, "free huge", 1172 action_result(pfn, MSG_FREE_HUGE,
1127 res ? IGNORED : DELAYED); 1173 res ? IGNORED : DELAYED);
1128 unlock_page(hpage); 1174 unlock_page(hpage);
1129 return res; 1175 return res;
1130 } else { 1176 } else {
1131 action_result(pfn, "high order kernel", IGNORED); 1177 action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED);
1132 return -EBUSY; 1178 return -EBUSY;
1133 } 1179 }
1134 } 1180 }
@@ -1150,9 +1196,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1150 */ 1196 */
1151 if (is_free_buddy_page(p)) { 1197 if (is_free_buddy_page(p)) {
1152 if (flags & MF_COUNT_INCREASED) 1198 if (flags & MF_COUNT_INCREASED)
1153 action_result(pfn, "free buddy", DELAYED); 1199 action_result(pfn, MSG_BUDDY, DELAYED);
1154 else 1200 else
1155 action_result(pfn, "free buddy, 2nd try", DELAYED); 1201 action_result(pfn, MSG_BUDDY_2ND,
1202 DELAYED);
1156 return 0; 1203 return 0;
1157 } 1204 }
1158 } 1205 }
@@ -1165,7 +1212,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1165 * If this happens just bail out. 1212 * If this happens just bail out.
1166 */ 1213 */
1167 if (compound_head(p) != hpage) { 1214 if (compound_head(p) != hpage) {
1168 action_result(pfn, "different compound page after locking", IGNORED); 1215 action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED);
1169 res = -EBUSY; 1216 res = -EBUSY;
1170 goto out; 1217 goto out;
1171 } 1218 }
@@ -1205,8 +1252,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1205 * on the head page to show that the hugepage is hwpoisoned 1252 * on the head page to show that the hugepage is hwpoisoned
1206 */ 1253 */
1207 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { 1254 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1208 action_result(pfn, "hugepage already hardware poisoned", 1255 action_result(pfn, MSG_POISONED_HUGE, IGNORED);
1209 IGNORED);
1210 unlock_page(hpage); 1256 unlock_page(hpage);
1211 put_page(hpage); 1257 put_page(hpage);
1212 return 0; 1258 return 0;
@@ -1235,7 +1281,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1235 */ 1281 */
1236 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) 1282 if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
1237 != SWAP_SUCCESS) { 1283 != SWAP_SUCCESS) {
1238 action_result(pfn, "unmapping failed", IGNORED); 1284 action_result(pfn, MSG_UNMAP_FAILED, IGNORED);
1239 res = -EBUSY; 1285 res = -EBUSY;
1240 goto out; 1286 goto out;
1241 } 1287 }
@@ -1244,7 +1290,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1244 * Torn down by someone else? 1290 * Torn down by someone else?
1245 */ 1291 */
1246 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { 1292 if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
1247 action_result(pfn, "already truncated LRU", IGNORED); 1293 action_result(pfn, MSG_TRUNCATED_LRU, IGNORED);
1248 res = -EBUSY; 1294 res = -EBUSY;
1249 goto out; 1295 goto out;
1250 } 1296 }
@@ -1540,8 +1586,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
1540 } 1586 }
1541 unlock_page(hpage); 1587 unlock_page(hpage);
1542 1588
1543 /* Keep page count to indicate a given hugepage is isolated. */ 1589 ret = isolate_huge_page(hpage, &pagelist);
1544 list_move(&hpage->lru, &pagelist); 1590 if (ret) {
1591 /*
1592 * get_any_page() and isolate_huge_page() takes a refcount each,
1593 * so need to drop one here.
1594 */
1595 put_page(hpage);
1596 } else {
1597 pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
1598 return -EBUSY;
1599 }
1600
1545 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, 1601 ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
1546 MIGRATE_SYNC, MR_MEMORY_FAILURE); 1602 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1547 if (ret) { 1603 if (ret) {
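[note] The enum action_page_type plus the action_page_types[] string table replace the ad-hoc message strings, so reporting becomes a matter of passing a symbolic type to action_result(). An illustrative call, matching the converted sites above (the surrounding handler is hypothetical):

	/* Prints: "MCE %#lx: recovery action for dirty LRU page: <result name>" */
	static void example_report_result(unsigned long pfn, int result)
	{
		action_result(pfn, MSG_DIRTY_LRU, result);
	}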
diff --git a/mm/memory.c b/mm/memory.c
index 8068893697bb..22e037e3364e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -690,12 +690,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
690 /* 690 /*
691 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y 691 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
692 */ 692 */
693 if (vma->vm_ops) 693 pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
694 printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", 694 vma->vm_file,
695 vma->vm_ops->fault); 695 vma->vm_ops ? vma->vm_ops->fault : NULL,
696 if (vma->vm_file) 696 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
697 printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", 697 mapping ? mapping->a_ops->readpage : NULL);
698 vma->vm_file->f_op->mmap);
699 dump_stack(); 698 dump_stack();
700 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 699 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
701} 700}
@@ -1983,167 +1982,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
1983} 1982}
1984 1983
1985/* 1984/*
1986 * This routine handles present pages, when users try to write 1985 * Handle write page faults for pages that can be reused in the current vma
1987 * to a shared page. It is done by copying the page to a new address
1988 * and decrementing the shared-page counter for the old page.
1989 * 1986 *
1990 * Note that this routine assumes that the protection checks have been 1987 * This can happen either due to the mapping being with the VM_SHARED flag,
1991 * done by the caller (the low-level page fault routine in most cases). 1988 * or due to us being the last reference standing to the page. In either
1992 * Thus we can safely just mark it writable once we've done any necessary 1989 * case, all we need to do here is to mark the page as writable and update
1993 * COW. 1990 * any related book-keeping.
1994 *
1995 * We also mark the page dirty at this point even though the page will
1996 * change only once the write actually happens. This avoids a few races,
1997 * and potentially makes it more efficient.
1998 *
1999 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2000 * but allow concurrent faults), with pte both mapped and locked.
2001 * We return with mmap_sem still held, but pte unmapped and unlocked.
2002 */ 1991 */
2003static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 1992static inline int wp_page_reuse(struct mm_struct *mm,
2004 unsigned long address, pte_t *page_table, pmd_t *pmd, 1993 struct vm_area_struct *vma, unsigned long address,
2005 spinlock_t *ptl, pte_t orig_pte) 1994 pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
1995 struct page *page, int page_mkwrite,
1996 int dirty_shared)
2006 __releases(ptl) 1997 __releases(ptl)
2007{ 1998{
2008 struct page *old_page, *new_page = NULL;
2009 pte_t entry; 1999 pte_t entry;
2010 int ret = 0;
2011 int page_mkwrite = 0;
2012 bool dirty_shared = false;
2013 unsigned long mmun_start = 0; /* For mmu_notifiers */
2014 unsigned long mmun_end = 0; /* For mmu_notifiers */
2015 struct mem_cgroup *memcg;
2016
2017 old_page = vm_normal_page(vma, address, orig_pte);
2018 if (!old_page) {
2019 /*
2020 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
2021 * VM_PFNMAP VMA.
2022 *
2023 * We should not cow pages in a shared writeable mapping.
2024 * Just mark the pages writable as we can't do any dirty
2025 * accounting on raw pfn maps.
2026 */
2027 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2028 (VM_WRITE|VM_SHARED))
2029 goto reuse;
2030 goto gotten;
2031 }
2032
2033 /* 2000 /*
2034 * Take out anonymous pages first, anonymous shared vmas are 2001 * Clear the pages cpupid information as the existing
2035 * not dirty accountable. 2002 * information potentially belongs to a now completely
2003 * unrelated process.
2036 */ 2004 */
2037 if (PageAnon(old_page) && !PageKsm(old_page)) { 2005 if (page)
2038 if (!trylock_page(old_page)) { 2006 page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1);
2039 page_cache_get(old_page);
2040 pte_unmap_unlock(page_table, ptl);
2041 lock_page(old_page);
2042 page_table = pte_offset_map_lock(mm, pmd, address,
2043 &ptl);
2044 if (!pte_same(*page_table, orig_pte)) {
2045 unlock_page(old_page);
2046 goto unlock;
2047 }
2048 page_cache_release(old_page);
2049 }
2050 if (reuse_swap_page(old_page)) {
2051 /*
2052 * The page is all ours. Move it to our anon_vma so
2053 * the rmap code will not search our parent or siblings.
2054 * Protected against the rmap code by the page lock.
2055 */
2056 page_move_anon_rmap(old_page, vma, address);
2057 unlock_page(old_page);
2058 goto reuse;
2059 }
2060 unlock_page(old_page);
2061 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2062 (VM_WRITE|VM_SHARED))) {
2063 page_cache_get(old_page);
2064 /*
2065 * Only catch write-faults on shared writable pages,
2066 * read-only shared pages can get COWed by
2067 * get_user_pages(.write=1, .force=1).
2068 */
2069 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2070 int tmp;
2071
2072 pte_unmap_unlock(page_table, ptl);
2073 tmp = do_page_mkwrite(vma, old_page, address);
2074 if (unlikely(!tmp || (tmp &
2075 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2076 page_cache_release(old_page);
2077 return tmp;
2078 }
2079 /*
2080 * Since we dropped the lock we need to revalidate
2081 * the PTE as someone else may have changed it. If
2082 * they did, we just return, as we can count on the
2083 * MMU to tell us if they didn't also make it writable.
2084 */
2085 page_table = pte_offset_map_lock(mm, pmd, address,
2086 &ptl);
2087 if (!pte_same(*page_table, orig_pte)) {
2088 unlock_page(old_page);
2089 goto unlock;
2090 }
2091 page_mkwrite = 1;
2092 }
2093
2094 dirty_shared = true;
2095
2096reuse:
2097 /*
2098 * Clear the pages cpupid information as the existing
2099 * information potentially belongs to a now completely
2100 * unrelated process.
2101 */
2102 if (old_page)
2103 page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
2104
2105 flush_cache_page(vma, address, pte_pfn(orig_pte));
2106 entry = pte_mkyoung(orig_pte);
2107 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2108 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2109 update_mmu_cache(vma, address, page_table);
2110 pte_unmap_unlock(page_table, ptl);
2111 ret |= VM_FAULT_WRITE;
2112 2007
2113 if (dirty_shared) { 2008 flush_cache_page(vma, address, pte_pfn(orig_pte));
2114 struct address_space *mapping; 2009 entry = pte_mkyoung(orig_pte);
2115 int dirtied; 2010 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2011 if (ptep_set_access_flags(vma, address, page_table, entry, 1))
2012 update_mmu_cache(vma, address, page_table);
2013 pte_unmap_unlock(page_table, ptl);
2116 2014
2117 if (!page_mkwrite) 2015 if (dirty_shared) {
2118 lock_page(old_page); 2016 struct address_space *mapping;
2017 int dirtied;
2119 2018
2120 dirtied = set_page_dirty(old_page); 2019 if (!page_mkwrite)
2121 VM_BUG_ON_PAGE(PageAnon(old_page), old_page); 2020 lock_page(page);
2122 mapping = old_page->mapping;
2123 unlock_page(old_page);
2124 page_cache_release(old_page);
2125 2021
2126 if ((dirtied || page_mkwrite) && mapping) { 2022 dirtied = set_page_dirty(page);
2127 /* 2023 VM_BUG_ON_PAGE(PageAnon(page), page);
2128 * Some device drivers do not set page.mapping 2024 mapping = page->mapping;
2129 * but still dirty their pages 2025 unlock_page(page);
2130 */ 2026 page_cache_release(page);
2131 balance_dirty_pages_ratelimited(mapping);
2132 }
2133 2027
2134 if (!page_mkwrite) 2028 if ((dirtied || page_mkwrite) && mapping) {
2135 file_update_time(vma->vm_file); 2029 /*
2030 * Some device drivers do not set page.mapping
2031 * but still dirty their pages
2032 */
2033 balance_dirty_pages_ratelimited(mapping);
2136 } 2034 }
2137 2035
2138 return ret; 2036 if (!page_mkwrite)
2037 file_update_time(vma->vm_file);
2139 } 2038 }
2140 2039
2141 /* 2040 return VM_FAULT_WRITE;
2142 * Ok, we need to copy. Oh, well.. 2041}
2143 */ 2042
2144 page_cache_get(old_page); 2043/*
2145gotten: 2044 * Handle the case of a page which we actually need to copy to a new page.
2146 pte_unmap_unlock(page_table, ptl); 2045 *
2046 * Called with mmap_sem locked and the old page referenced, but
2047 * without the ptl held.
2048 *
2049 * High level logic flow:
2050 *
2051 * - Allocate a page, copy the content of the old page to the new one.
2052 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
2053 * - Take the PTL. If the pte changed, bail out and release the allocated page
2054 * - If the pte is still the way we remember it, update the page table and all
2055 * relevant references. This includes dropping the reference the page-table
2056 * held to the old page, as well as updating the rmap.
2057 * - In any case, unlock the PTL and drop the reference we took to the old page.
2058 */
2059static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
2060 unsigned long address, pte_t *page_table, pmd_t *pmd,
2061 pte_t orig_pte, struct page *old_page)
2062{
2063 struct page *new_page = NULL;
2064 spinlock_t *ptl = NULL;
2065 pte_t entry;
2066 int page_copied = 0;
2067 const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */
2068 const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */
2069 struct mem_cgroup *memcg;
2147 2070
2148 if (unlikely(anon_vma_prepare(vma))) 2071 if (unlikely(anon_vma_prepare(vma)))
2149 goto oom; 2072 goto oom;
@@ -2163,8 +2086,6 @@ gotten:
2163 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) 2086 if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
2164 goto oom_free_new; 2087 goto oom_free_new;
2165 2088
2166 mmun_start = address & PAGE_MASK;
2167 mmun_end = mmun_start + PAGE_SIZE;
2168 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2089 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2169 2090
2170 /* 2091 /*
@@ -2177,8 +2098,9 @@ gotten:
2177 dec_mm_counter_fast(mm, MM_FILEPAGES); 2098 dec_mm_counter_fast(mm, MM_FILEPAGES);
2178 inc_mm_counter_fast(mm, MM_ANONPAGES); 2099 inc_mm_counter_fast(mm, MM_ANONPAGES);
2179 } 2100 }
2180 } else 2101 } else {
2181 inc_mm_counter_fast(mm, MM_ANONPAGES); 2102 inc_mm_counter_fast(mm, MM_ANONPAGES);
2103 }
2182 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2104 flush_cache_page(vma, address, pte_pfn(orig_pte));
2183 entry = mk_pte(new_page, vma->vm_page_prot); 2105 entry = mk_pte(new_page, vma->vm_page_prot);
2184 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2106 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2227,29 +2149,29 @@ gotten:
2227 2149
2228 /* Free the old page.. */ 2150 /* Free the old page.. */
2229 new_page = old_page; 2151 new_page = old_page;
2230 ret |= VM_FAULT_WRITE; 2152 page_copied = 1;
2231 } else 2153 } else {
2232 mem_cgroup_cancel_charge(new_page, memcg); 2154 mem_cgroup_cancel_charge(new_page, memcg);
2155 }
2233 2156
2234 if (new_page) 2157 if (new_page)
2235 page_cache_release(new_page); 2158 page_cache_release(new_page);
2236unlock: 2159
2237 pte_unmap_unlock(page_table, ptl); 2160 pte_unmap_unlock(page_table, ptl);
2238 if (mmun_end > mmun_start) 2161 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2239 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2240 if (old_page) { 2162 if (old_page) {
2241 /* 2163 /*
2242 * Don't let another task, with possibly unlocked vma, 2164 * Don't let another task, with possibly unlocked vma,
2243 * keep the mlocked page. 2165 * keep the mlocked page.
2244 */ 2166 */
2245 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { 2167 if (page_copied && (vma->vm_flags & VM_LOCKED)) {
2246 lock_page(old_page); /* LRU manipulation */ 2168 lock_page(old_page); /* LRU manipulation */
2247 munlock_vma_page(old_page); 2169 munlock_vma_page(old_page);
2248 unlock_page(old_page); 2170 unlock_page(old_page);
2249 } 2171 }
2250 page_cache_release(old_page); 2172 page_cache_release(old_page);
2251 } 2173 }
2252 return ret; 2174 return page_copied ? VM_FAULT_WRITE : 0;
2253oom_free_new: 2175oom_free_new:
2254 page_cache_release(new_page); 2176 page_cache_release(new_page);
2255oom: 2177oom:
@@ -2258,6 +2180,179 @@ oom:
2258 return VM_FAULT_OOM; 2180 return VM_FAULT_OOM;
2259} 2181}
2260 2182
2183/*
2184 * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED
2185 * mapping
2186 */
2187static int wp_pfn_shared(struct mm_struct *mm,
2188 struct vm_area_struct *vma, unsigned long address,
2189 pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
2190 pmd_t *pmd)
2191{
2192 if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
2193 struct vm_fault vmf = {
2194 .page = NULL,
2195 .pgoff = linear_page_index(vma, address),
2196 .virtual_address = (void __user *)(address & PAGE_MASK),
2197 .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE,
2198 };
2199 int ret;
2200
2201 pte_unmap_unlock(page_table, ptl);
2202 ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
2203 if (ret & VM_FAULT_ERROR)
2204 return ret;
2205 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2206 /*
2207 * We might have raced with another page fault while we
2208 * released the pte_offset_map_lock.
2209 */
2210 if (!pte_same(*page_table, orig_pte)) {
2211 pte_unmap_unlock(page_table, ptl);
2212 return 0;
2213 }
2214 }
2215 return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
2216 NULL, 0, 0);
2217}
2218
2219static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
2220 unsigned long address, pte_t *page_table,
2221 pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte,
2222 struct page *old_page)
2223 __releases(ptl)
2224{
2225 int page_mkwrite = 0;
2226
2227 page_cache_get(old_page);
2228
2229 /*
2230 * Only catch write-faults on shared writable pages,
2231 * read-only shared pages can get COWed by
2232 * get_user_pages(.write=1, .force=1).
2233 */
2234 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2235 int tmp;
2236
2237 pte_unmap_unlock(page_table, ptl);
2238 tmp = do_page_mkwrite(vma, old_page, address);
2239 if (unlikely(!tmp || (tmp &
2240 (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
2241 page_cache_release(old_page);
2242 return tmp;
2243 }
2244 /*
2245 * Since we dropped the lock we need to revalidate
2246 * the PTE as someone else may have changed it. If
2247 * they did, we just return, as we can count on the
2248 * MMU to tell us if they didn't also make it writable.
2249 */
2250 page_table = pte_offset_map_lock(mm, pmd, address,
2251 &ptl);
2252 if (!pte_same(*page_table, orig_pte)) {
2253 unlock_page(old_page);
2254 pte_unmap_unlock(page_table, ptl);
2255 page_cache_release(old_page);
2256 return 0;
2257 }
2258 page_mkwrite = 1;
2259 }
2260
2261 return wp_page_reuse(mm, vma, address, page_table, ptl,
2262 orig_pte, old_page, page_mkwrite, 1);
2263}
2264
2265/*
2266 * This routine handles present pages, when users try to write
2267 * to a shared page. It is done by copying the page to a new address
2268 * and decrementing the shared-page counter for the old page.
2269 *
2270 * Note that this routine assumes that the protection checks have been
2271 * done by the caller (the low-level page fault routine in most cases).
2272 * Thus we can safely just mark it writable once we've done any necessary
2273 * COW.
2274 *
2275 * We also mark the page dirty at this point even though the page will
2276 * change only once the write actually happens. This avoids a few races,
2277 * and potentially makes it more efficient.
2278 *
2279 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2280 * but allow concurrent faults), with pte both mapped and locked.
2281 * We return with mmap_sem still held, but pte unmapped and unlocked.
2282 */
2283static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2284 unsigned long address, pte_t *page_table, pmd_t *pmd,
2285 spinlock_t *ptl, pte_t orig_pte)
2286 __releases(ptl)
2287{
2288 struct page *old_page;
2289
2290 old_page = vm_normal_page(vma, address, orig_pte);
2291 if (!old_page) {
2292 /*
2293 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
2294 * VM_PFNMAP VMA.
2295 *
2296 * We should not cow pages in a shared writeable mapping.
2297 * Just mark the pages writable and/or call ops->pfn_mkwrite.
2298 */
2299 if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2300 (VM_WRITE|VM_SHARED))
2301 return wp_pfn_shared(mm, vma, address, page_table, ptl,
2302 orig_pte, pmd);
2303
2304 pte_unmap_unlock(page_table, ptl);
2305 return wp_page_copy(mm, vma, address, page_table, pmd,
2306 orig_pte, old_page);
2307 }
2308
2309 /*
2310 * Take out anonymous pages first, anonymous shared vmas are
2311 * not dirty accountable.
2312 */
2313 if (PageAnon(old_page) && !PageKsm(old_page)) {
2314 if (!trylock_page(old_page)) {
2315 page_cache_get(old_page);
2316 pte_unmap_unlock(page_table, ptl);
2317 lock_page(old_page);
2318 page_table = pte_offset_map_lock(mm, pmd, address,
2319 &ptl);
2320 if (!pte_same(*page_table, orig_pte)) {
2321 unlock_page(old_page);
2322 pte_unmap_unlock(page_table, ptl);
2323 page_cache_release(old_page);
2324 return 0;
2325 }
2326 page_cache_release(old_page);
2327 }
2328 if (reuse_swap_page(old_page)) {
2329 /*
2330 * The page is all ours. Move it to our anon_vma so
2331 * the rmap code will not search our parent or siblings.
2332 * Protected against the rmap code by the page lock.
2333 */
2334 page_move_anon_rmap(old_page, vma, address);
2335 unlock_page(old_page);
2336 return wp_page_reuse(mm, vma, address, page_table, ptl,
2337 orig_pte, old_page, 0, 0);
2338 }
2339 unlock_page(old_page);
2340 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2341 (VM_WRITE|VM_SHARED))) {
2342 return wp_page_shared(mm, vma, address, page_table, pmd,
2343 ptl, orig_pte, old_page);
2344 }
2345
2346 /*
2347 * Ok, we need to copy. Oh, well..
2348 */
2349 page_cache_get(old_page);
2350
2351 pte_unmap_unlock(page_table, ptl);
2352 return wp_page_copy(mm, vma, address, page_table, pmd,
2353 orig_pte, old_page);
2354}
2355
2261static void unmap_mapping_range_vma(struct vm_area_struct *vma, 2356static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2262 unsigned long start_addr, unsigned long end_addr, 2357 unsigned long start_addr, unsigned long end_addr,
2263 struct zap_details *details) 2358 struct zap_details *details)
@@ -2784,7 +2879,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
2784 struct vm_fault vmf; 2879 struct vm_fault vmf;
2785 int off; 2880 int off;
2786 2881
2787 nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; 2882 nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
2788 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; 2883 mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
2789 2884
2790 start_addr = max(address & mask, vma->vm_start); 2885 start_addr = max(address & mask, vma->vm_start);
@@ -3035,6 +3130,7 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3035 int last_cpupid; 3130 int last_cpupid;
3036 int target_nid; 3131 int target_nid;
3037 bool migrated = false; 3132 bool migrated = false;
3133 bool was_writable = pte_write(pte);
3038 int flags = 0; 3134 int flags = 0;
3039 3135
3040 /* A PROT_NONE fault should not end up here */ 3136 /* A PROT_NONE fault should not end up here */
@@ -3059,6 +3155,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3059 /* Make it present again */ 3155 /* Make it present again */
3060 pte = pte_modify(pte, vma->vm_page_prot); 3156 pte = pte_modify(pte, vma->vm_page_prot);
3061 pte = pte_mkyoung(pte); 3157 pte = pte_mkyoung(pte);
3158 if (was_writable)
3159 pte = pte_mkwrite(pte);
3062 set_pte_at(mm, addr, ptep, pte); 3160 set_pte_at(mm, addr, ptep, pte);
3063 update_mmu_cache(vma, addr, ptep); 3161 update_mmu_cache(vma, addr, ptep);
3064 3162
@@ -3069,11 +3167,14 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3069 } 3167 }
3070 3168
3071 /* 3169 /*
3072 * Avoid grouping on DSO/COW pages in specific and RO pages 3170 * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
3073 * in general, RO pages shouldn't hurt as much anyway since 3171 * much anyway since they can be in shared cache state. This misses
3074 * they can be in shared cache state. 3172 * the case where a mapping is writable but the process never writes
3173 * to it but pte_write gets cleared during protection updates and
3174 * pte_dirty has unpredictable behaviour between PTE scan updates,
3175 * background writeback, dirty balancing and application behaviour.
3075 */ 3176 */
3076 if (!pte_write(pte)) 3177 if (!(vma->vm_flags & VM_WRITE))
3077 flags |= TNF_NO_GROUP; 3178 flags |= TNF_NO_GROUP;
3078 3179
3079 /* 3180 /*
@@ -3097,7 +3198,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3097 if (migrated) { 3198 if (migrated) {
3098 page_nid = target_nid; 3199 page_nid = target_nid;
3099 flags |= TNF_MIGRATED; 3200 flags |= TNF_MIGRATED;
3100 } 3201 } else
3202 flags |= TNF_MIGRATE_FAIL;
3101 3203
3102out: 3204out:
3103 if (page_nid != -1) 3205 if (page_nid != -1)
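Editor's note: the do_wp_page() hunk above orders write-protect fault handling as follows: pfn-mapped pages in shared writable mappings go to wp_pfn_shared(), anonymous non-KSM pages that reuse_swap_page() reports as exclusively owned are reused in place, shared writable file-backed pages go to wp_page_shared(), and everything else is copied. The following is a minimal user-space sketch of that decision order only; the struct, enum and function names are invented for illustration and are not kernel API.

/*
 * Toy model of the decision order visible in do_wp_page() above.
 * The real kernel works on struct page / pte_t, not this struct.
 */
#include <stdbool.h>
#include <stdio.h>

enum wp_action { WP_PFN_SHARED, WP_REUSE, WP_SHARED, WP_COPY };

struct fault_model {
	bool has_normal_page;	/* vm_normal_page() returned a page     */
	bool shared_writable;	/* (VM_WRITE|VM_SHARED) both set on vma */
	bool anon;		/* PageAnon() && !PageKsm()             */
	bool sole_swap_user;	/* reuse_swap_page() would say yes      */
};

static enum wp_action wp_decide(const struct fault_model *f)
{
	if (!f->has_normal_page)	/* pfn-mapped, no struct page */
		return f->shared_writable ? WP_PFN_SHARED : WP_COPY;
	if (f->anon)			/* anonymous, non-KSM page */
		return f->sole_swap_user ? WP_REUSE : WP_COPY;
	if (f->shared_writable)		/* shared writable file page */
		return WP_SHARED;
	return WP_COPY;			/* private file page: COW */
}

int main(void)
{
	struct fault_model anon_private = { true, false, true, true };
	struct fault_model file_shared  = { true, true, false, false };

	printf("anonymous sole user -> %d (expect WP_REUSE=%d)\n",
	       wp_decide(&anon_private), WP_REUSE);
	printf("shared file page    -> %d (expect WP_SHARED=%d)\n",
	       wp_decide(&file_shared), WP_SHARED);
	return 0;
}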
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9fab10795bea..457bde530cbe 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -104,7 +104,7 @@ void put_online_mems(void)
104 104
105} 105}
106 106
107static void mem_hotplug_begin(void) 107void mem_hotplug_begin(void)
108{ 108{
109 mem_hotplug.active_writer = current; 109 mem_hotplug.active_writer = current;
110 110
@@ -119,7 +119,7 @@ static void mem_hotplug_begin(void)
119 } 119 }
120} 120}
121 121
122static void mem_hotplug_done(void) 122void mem_hotplug_done(void)
123{ 123{
124 mem_hotplug.active_writer = NULL; 124 mem_hotplug.active_writer = NULL;
125 mutex_unlock(&mem_hotplug.lock); 125 mutex_unlock(&mem_hotplug.lock);
@@ -502,7 +502,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
502 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 502 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
503 503
504 for (i = start_sec; i <= end_sec; i++) { 504 for (i = start_sec; i <= end_sec; i++) {
505 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); 505 err = __add_section(nid, zone, section_nr_to_pfn(i));
506 506
507 /* 507 /*
508 * EEXIST is finally dealt with by ioresource collision 508 * EEXIST is finally dealt with by ioresource collision
@@ -959,6 +959,7 @@ static void node_states_set_node(int node, struct memory_notify *arg)
959} 959}
960 960
961 961
962/* Must be protected by mem_hotplug_begin() */
962int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 963int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
963{ 964{
964 unsigned long flags; 965 unsigned long flags;
@@ -969,7 +970,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
969 int ret; 970 int ret;
970 struct memory_notify arg; 971 struct memory_notify arg;
971 972
972 mem_hotplug_begin();
973 /* 973 /*
974 * This doesn't need a lock to do pfn_to_page(). 974 * This doesn't need a lock to do pfn_to_page().
975 * The section can't be removed here because of the 975 * The section can't be removed here because of the
@@ -977,21 +977,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
977 */ 977 */
978 zone = page_zone(pfn_to_page(pfn)); 978 zone = page_zone(pfn_to_page(pfn));
979 979
980 ret = -EINVAL;
981 if ((zone_idx(zone) > ZONE_NORMAL || 980 if ((zone_idx(zone) > ZONE_NORMAL ||
982 online_type == MMOP_ONLINE_MOVABLE) && 981 online_type == MMOP_ONLINE_MOVABLE) &&
983 !can_online_high_movable(zone)) 982 !can_online_high_movable(zone))
984 goto out; 983 return -EINVAL;
985 984
986 if (online_type == MMOP_ONLINE_KERNEL && 985 if (online_type == MMOP_ONLINE_KERNEL &&
987 zone_idx(zone) == ZONE_MOVABLE) { 986 zone_idx(zone) == ZONE_MOVABLE) {
988 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) 987 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages))
989 goto out; 988 return -EINVAL;
990 } 989 }
991 if (online_type == MMOP_ONLINE_MOVABLE && 990 if (online_type == MMOP_ONLINE_MOVABLE &&
992 zone_idx(zone) == ZONE_MOVABLE - 1) { 991 zone_idx(zone) == ZONE_MOVABLE - 1) {
993 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) 992 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages))
994 goto out; 993 return -EINVAL;
995 } 994 }
996 995
997 /* Previous code may changed the zone of the pfn range */ 996 /* Previous code may changed the zone of the pfn range */
@@ -1007,7 +1006,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
1007 ret = notifier_to_errno(ret); 1006 ret = notifier_to_errno(ret);
1008 if (ret) { 1007 if (ret) {
1009 memory_notify(MEM_CANCEL_ONLINE, &arg); 1008 memory_notify(MEM_CANCEL_ONLINE, &arg);
1010 goto out; 1009 return ret;
1011 } 1010 }
1012 /* 1011 /*
1013 * If this zone is not populated, then it is not in zonelist. 1012 * If this zone is not populated, then it is not in zonelist.
@@ -1031,7 +1030,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
1031 (((unsigned long long) pfn + nr_pages) 1030 (((unsigned long long) pfn + nr_pages)
1032 << PAGE_SHIFT) - 1); 1031 << PAGE_SHIFT) - 1);
1033 memory_notify(MEM_CANCEL_ONLINE, &arg); 1032 memory_notify(MEM_CANCEL_ONLINE, &arg);
1034 goto out; 1033 return ret;
1035 } 1034 }
1036 1035
1037 zone->present_pages += onlined_pages; 1036 zone->present_pages += onlined_pages;
@@ -1061,9 +1060,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
1061 1060
1062 if (onlined_pages) 1061 if (onlined_pages)
1063 memory_notify(MEM_ONLINE, &arg); 1062 memory_notify(MEM_ONLINE, &arg);
1064out: 1063 return 0;
1065 mem_hotplug_done();
1066 return ret;
1067} 1064}
1068#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 1065#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
1069 1066
@@ -1092,6 +1089,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
1092 return NULL; 1089 return NULL;
1093 1090
1094 arch_refresh_nodedata(nid, pgdat); 1091 arch_refresh_nodedata(nid, pgdat);
1092 } else {
1093 /* Reset the nr_zones and classzone_idx to 0 before reuse */
1094 pgdat->nr_zones = 0;
1095 pgdat->classzone_idx = 0;
1095 } 1096 }
1096 1097
1097 /* we can use NODE_DATA(nid) from here */ 1098 /* we can use NODE_DATA(nid) from here */
@@ -1372,7 +1373,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1372 if (PageLRU(page)) 1373 if (PageLRU(page))
1373 return pfn; 1374 return pfn;
1374 if (PageHuge(page)) { 1375 if (PageHuge(page)) {
1375 if (is_hugepage_active(page)) 1376 if (page_huge_active(page))
1376 return pfn; 1377 return pfn;
1377 else 1378 else
1378 pfn = round_up(pfn + 1, 1379 pfn = round_up(pfn + 1,
@@ -1684,21 +1685,18 @@ static int __ref __offline_pages(unsigned long start_pfn,
1684 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 1685 if (!test_pages_in_a_zone(start_pfn, end_pfn))
1685 return -EINVAL; 1686 return -EINVAL;
1686 1687
1687 mem_hotplug_begin();
1688
1689 zone = page_zone(pfn_to_page(start_pfn)); 1688 zone = page_zone(pfn_to_page(start_pfn));
1690 node = zone_to_nid(zone); 1689 node = zone_to_nid(zone);
1691 nr_pages = end_pfn - start_pfn; 1690 nr_pages = end_pfn - start_pfn;
1692 1691
1693 ret = -EINVAL;
1694 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) 1692 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
1695 goto out; 1693 return -EINVAL;
1696 1694
1697 /* set above range as isolated */ 1695 /* set above range as isolated */
1698 ret = start_isolate_page_range(start_pfn, end_pfn, 1696 ret = start_isolate_page_range(start_pfn, end_pfn,
1699 MIGRATE_MOVABLE, true); 1697 MIGRATE_MOVABLE, true);
1700 if (ret) 1698 if (ret)
1701 goto out; 1699 return ret;
1702 1700
1703 arg.start_pfn = start_pfn; 1701 arg.start_pfn = start_pfn;
1704 arg.nr_pages = nr_pages; 1702 arg.nr_pages = nr_pages;
@@ -1791,7 +1789,6 @@ repeat:
1791 writeback_set_ratelimit(); 1789 writeback_set_ratelimit();
1792 1790
1793 memory_notify(MEM_OFFLINE, &arg); 1791 memory_notify(MEM_OFFLINE, &arg);
1794 mem_hotplug_done();
1795 return 0; 1792 return 0;
1796 1793
1797failed_removal: 1794failed_removal:
@@ -1801,12 +1798,10 @@ failed_removal:
1801 memory_notify(MEM_CANCEL_OFFLINE, &arg); 1798 memory_notify(MEM_CANCEL_OFFLINE, &arg);
1802 /* pushback to free area */ 1799 /* pushback to free area */
1803 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1800 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1804
1805out:
1806 mem_hotplug_done();
1807 return ret; 1801 return ret;
1808} 1802}
1809 1803
1804/* Must be protected by mem_hotplug_begin() */
1810int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1805int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1811{ 1806{
1812 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); 1807 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
@@ -1977,15 +1972,6 @@ void try_offline_node(int nid)
1977 if (is_vmalloc_addr(zone->wait_table)) 1972 if (is_vmalloc_addr(zone->wait_table))
1978 vfree(zone->wait_table); 1973 vfree(zone->wait_table);
1979 } 1974 }
1980
1981 /*
1982 * Since there is no way to guarentee the address of pgdat/zone is not
1983 * on stack of any kernel threads or used by other kernel objects
1984 * without reference counting or other symchronizing method, do not
1985 * reset node_data and free pgdat here. Just reset it to 0 and reuse
1986 * the memory when the node is online again.
1987 */
1988 memset(pgdat, 0, sizeof(*pgdat));
1989} 1975}
1990EXPORT_SYMBOL(try_offline_node); 1976EXPORT_SYMBOL(try_offline_node);
1991 1977
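Editor's note: the hunks above move mem_hotplug_begin()/mem_hotplug_done() out of online_pages() and __offline_pages() and into their callers, which is why the "goto out" paths collapse into direct returns and both entry points gain a "Must be protected by mem_hotplug_begin()" comment. Below is a minimal user-space sketch of that caller-holds-the-lock pattern, using a pthread mutex as a stand-in for the hotplug lock; all names other than the begin/done pair are invented.

/*
 * Caller-side locking model: the callee assumes the lock is held and can
 * simply return early on error; the caller always releases the lock.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

static void mem_hotplug_begin(void) { pthread_mutex_lock(&hotplug_lock); }
static void mem_hotplug_done(void)  { pthread_mutex_unlock(&hotplug_lock); }

/* Must be called with the hotplug lock held, mirroring the new comment. */
static int online_pages_model(unsigned long pfn, unsigned long nr_pages)
{
	if (nr_pages == 0)
		return -EINVAL;	/* early return, no unlock needed here */
	printf("onlining %lu pages at pfn %lu\n", nr_pages, pfn);
	return 0;
}

int main(void)
{
	int ret;

	mem_hotplug_begin();		/* caller takes the lock ... */
	ret = online_pages_model(4096, 512);
	mem_hotplug_done();		/* ... and always releases it */

	return ret ? 1 : 0;
}

Presumably the point of moving the lock is to let callers cover additional work under the same critical section; the diff itself only shows the mechanical change.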
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4721046a134a..ede26291d4aa 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -945,7 +945,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
945 return alloc_huge_page_node(page_hstate(compound_head(page)), 945 return alloc_huge_page_node(page_hstate(compound_head(page)),
946 node); 946 node);
947 else 947 else
948 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); 948 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
949 __GFP_THISNODE, 0);
949} 950}
950 951
951/* 952/*
@@ -1985,7 +1986,8 @@ retry_cpuset:
1985 nmask = policy_nodemask(gfp, pol); 1986 nmask = policy_nodemask(gfp, pol);
1986 if (!nmask || node_isset(node, *nmask)) { 1987 if (!nmask || node_isset(node, *nmask)) {
1987 mpol_cond_put(pol); 1988 mpol_cond_put(pol);
1988 page = alloc_pages_exact_node(node, gfp, order); 1989 page = alloc_pages_exact_node(node,
1990 gfp | __GFP_THISNODE, order);
1989 goto out; 1991 goto out;
1990 } 1992 }
1991 } 1993 }
diff --git a/mm/mempool.c b/mm/mempool.c
index e209c98c7203..2cc08de8b1db 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -6,26 +6,138 @@
6 * extreme VM load. 6 * extreme VM load.
7 * 7 *
8 * started by Ingo Molnar, Copyright (C) 2001 8 * started by Ingo Molnar, Copyright (C) 2001
9 * debugging by David Rientjes, Copyright (C) 2015
9 */ 10 */
10 11
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/highmem.h>
15#include <linux/kasan.h>
13#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
14#include <linux/export.h> 17#include <linux/export.h>
15#include <linux/mempool.h> 18#include <linux/mempool.h>
16#include <linux/blkdev.h> 19#include <linux/blkdev.h>
17#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include "slab.h"
22
23#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
24static void poison_error(mempool_t *pool, void *element, size_t size,
25 size_t byte)
26{
27 const int nr = pool->curr_nr;
28 const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0);
29 const int end = min_t(int, byte + (BITS_PER_LONG / 8), size);
30 int i;
31
32 pr_err("BUG: mempool element poison mismatch\n");
33 pr_err("Mempool %p size %zu\n", pool, size);
34 pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : "");
35 for (i = start; i < end; i++)
36 pr_cont("%x ", *(u8 *)(element + i));
37 pr_cont("%s\n", end < size ? "..." : "");
38 dump_stack();
39}
40
41static void __check_element(mempool_t *pool, void *element, size_t size)
42{
43 u8 *obj = element;
44 size_t i;
45
46 for (i = 0; i < size; i++) {
47 u8 exp = (i < size - 1) ? POISON_FREE : POISON_END;
48
49 if (obj[i] != exp) {
50 poison_error(pool, element, size, i);
51 return;
52 }
53 }
54 memset(obj, POISON_INUSE, size);
55}
56
57static void check_element(mempool_t *pool, void *element)
58{
59 /* Mempools backed by slab allocator */
60 if (pool->free == mempool_free_slab || pool->free == mempool_kfree)
61 __check_element(pool, element, ksize(element));
62
63 /* Mempools backed by page allocator */
64 if (pool->free == mempool_free_pages) {
65 int order = (int)(long)pool->pool_data;
66 void *addr = kmap_atomic((struct page *)element);
67
68 __check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
69 kunmap_atomic(addr);
70 }
71}
72
73static void __poison_element(void *element, size_t size)
74{
75 u8 *obj = element;
76
77 memset(obj, POISON_FREE, size - 1);
78 obj[size - 1] = POISON_END;
79}
80
81static void poison_element(mempool_t *pool, void *element)
82{
83 /* Mempools backed by slab allocator */
84 if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
85 __poison_element(element, ksize(element));
86
87 /* Mempools backed by page allocator */
88 if (pool->alloc == mempool_alloc_pages) {
89 int order = (int)(long)pool->pool_data;
90 void *addr = kmap_atomic((struct page *)element);
91
92 __poison_element(addr, 1UL << (PAGE_SHIFT + order));
93 kunmap_atomic(addr);
94 }
95}
96#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
97static inline void check_element(mempool_t *pool, void *element)
98{
99}
100static inline void poison_element(mempool_t *pool, void *element)
101{
102}
103#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
104
105static void kasan_poison_element(mempool_t *pool, void *element)
106{
107 if (pool->alloc == mempool_alloc_slab)
108 kasan_slab_free(pool->pool_data, element);
109 if (pool->alloc == mempool_kmalloc)
110 kasan_kfree(element);
111 if (pool->alloc == mempool_alloc_pages)
112 kasan_free_pages(element, (unsigned long)pool->pool_data);
113}
114
115static void kasan_unpoison_element(mempool_t *pool, void *element)
116{
117 if (pool->alloc == mempool_alloc_slab)
118 kasan_slab_alloc(pool->pool_data, element);
119 if (pool->alloc == mempool_kmalloc)
120 kasan_krealloc(element, (size_t)pool->pool_data);
121 if (pool->alloc == mempool_alloc_pages)
122 kasan_alloc_pages(element, (unsigned long)pool->pool_data);
123}
18 124
19static void add_element(mempool_t *pool, void *element) 125static void add_element(mempool_t *pool, void *element)
20{ 126{
21 BUG_ON(pool->curr_nr >= pool->min_nr); 127 BUG_ON(pool->curr_nr >= pool->min_nr);
128 poison_element(pool, element);
129 kasan_poison_element(pool, element);
22 pool->elements[pool->curr_nr++] = element; 130 pool->elements[pool->curr_nr++] = element;
23} 131}
24 132
25static void *remove_element(mempool_t *pool) 133static void *remove_element(mempool_t *pool)
26{ 134{
27 BUG_ON(pool->curr_nr <= 0); 135 void *element = pool->elements[--pool->curr_nr];
28 return pool->elements[--pool->curr_nr]; 136
137 BUG_ON(pool->curr_nr < 0);
138 check_element(pool, element);
139 kasan_unpoison_element(pool, element);
140 return element;
29} 141}
30 142
31/** 143/**
@@ -113,23 +225,24 @@ EXPORT_SYMBOL(mempool_create_node);
113 * mempool_create(). 225 * mempool_create().
114 * @new_min_nr: the new minimum number of elements guaranteed to be 226 * @new_min_nr: the new minimum number of elements guaranteed to be
115 * allocated for this pool. 227 * allocated for this pool.
116 * @gfp_mask: the usual allocation bitmask.
117 * 228 *
118 * This function shrinks/grows the pool. In the case of growing, 229 * This function shrinks/grows the pool. In the case of growing,
119 * it cannot be guaranteed that the pool will be grown to the new 230 * it cannot be guaranteed that the pool will be grown to the new
120 * size immediately, but new mempool_free() calls will refill it. 231 * size immediately, but new mempool_free() calls will refill it.
232 * This function may sleep.
121 * 233 *
122 * Note, the caller must guarantee that no mempool_destroy is called 234 * Note, the caller must guarantee that no mempool_destroy is called
123 * while this function is running. mempool_alloc() & mempool_free() 235 * while this function is running. mempool_alloc() & mempool_free()
124 * might be called (eg. from IRQ contexts) while this function executes. 236 * might be called (eg. from IRQ contexts) while this function executes.
125 */ 237 */
126int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) 238int mempool_resize(mempool_t *pool, int new_min_nr)
127{ 239{
128 void *element; 240 void *element;
129 void **new_elements; 241 void **new_elements;
130 unsigned long flags; 242 unsigned long flags;
131 243
132 BUG_ON(new_min_nr <= 0); 244 BUG_ON(new_min_nr <= 0);
245 might_sleep();
133 246
134 spin_lock_irqsave(&pool->lock, flags); 247 spin_lock_irqsave(&pool->lock, flags);
135 if (new_min_nr <= pool->min_nr) { 248 if (new_min_nr <= pool->min_nr) {
@@ -145,7 +258,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
145 spin_unlock_irqrestore(&pool->lock, flags); 258 spin_unlock_irqrestore(&pool->lock, flags);
146 259
147 /* Grow the pool */ 260 /* Grow the pool */
148 new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); 261 new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements),
262 GFP_KERNEL);
149 if (!new_elements) 263 if (!new_elements)
150 return -ENOMEM; 264 return -ENOMEM;
151 265
@@ -164,7 +278,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
164 278
165 while (pool->curr_nr < pool->min_nr) { 279 while (pool->curr_nr < pool->min_nr) {
166 spin_unlock_irqrestore(&pool->lock, flags); 280 spin_unlock_irqrestore(&pool->lock, flags);
167 element = pool->alloc(gfp_mask, pool->pool_data); 281 element = pool->alloc(GFP_KERNEL, pool->pool_data);
168 if (!element) 282 if (!element)
169 goto out; 283 goto out;
170 spin_lock_irqsave(&pool->lock, flags); 284 spin_lock_irqsave(&pool->lock, flags);
@@ -332,6 +446,7 @@ EXPORT_SYMBOL(mempool_free);
332void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) 446void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
333{ 447{
334 struct kmem_cache *mem = pool_data; 448 struct kmem_cache *mem = pool_data;
449 VM_BUG_ON(mem->ctor);
335 return kmem_cache_alloc(mem, gfp_mask); 450 return kmem_cache_alloc(mem, gfp_mask);
336} 451}
337EXPORT_SYMBOL(mempool_alloc_slab); 452EXPORT_SYMBOL(mempool_alloc_slab);
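Editor's note: the mempool hunks above poison elements while they sit idle in the pool (POISON_FREE bytes terminated by POISON_END) and verify the poison when an element is handed back out, catching writes to supposedly free elements. A small user-space sketch of that poison-on-free / check-on-alloc idea, using local stand-in byte values rather than the kernel's POISON_* constants:

/*
 * Poison an idle element, then verify the poison before reuse.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define POISON_FREE  0x6b	/* element parked in the pool */
#define POISON_END   0xa5	/* last byte marks the end of the range */
#define POISON_INUSE 0x5a	/* element handed back to the caller */

static void poison_element(unsigned char *obj, size_t size)
{
	memset(obj, POISON_FREE, size - 1);
	obj[size - 1] = POISON_END;
}

/* Returns the offset of the first corrupted byte, or -1 if intact. */
static long check_element(unsigned char *obj, size_t size)
{
	for (size_t i = 0; i < size; i++) {
		unsigned char exp = (i < size - 1) ? POISON_FREE : POISON_END;

		if (obj[i] != exp)
			return (long)i;
	}
	memset(obj, POISON_INUSE, size);
	return -1;
}

int main(void)
{
	size_t size = 32;
	unsigned char *elem = malloc(size);

	if (!elem)
		return 1;
	poison_element(elem, size);		/* "add_element" path */
	elem[7] = 0x00;				/* simulate a use-after-free write */
	long bad = check_element(elem, size);	/* "remove_element" path */
	if (bad >= 0)
		fprintf(stderr, "poison mismatch at byte %ld\n", bad);
	free(elem);
	return 0;
}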
diff --git a/mm/memtest.c b/mm/memtest.c
new file mode 100644
index 000000000000..1997d934b13b
--- /dev/null
+++ b/mm/memtest.c
@@ -0,0 +1,118 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/pfn.h>
9#include <linux/memblock.h>
10
11static u64 patterns[] __initdata = {
12 /* The first entry has to be 0 to leave memtest with zeroed memory */
13 0,
14 0xffffffffffffffffULL,
15 0x5555555555555555ULL,
16 0xaaaaaaaaaaaaaaaaULL,
17 0x1111111111111111ULL,
18 0x2222222222222222ULL,
19 0x4444444444444444ULL,
20 0x8888888888888888ULL,
21 0x3333333333333333ULL,
22 0x6666666666666666ULL,
23 0x9999999999999999ULL,
24 0xccccccccccccccccULL,
25 0x7777777777777777ULL,
26 0xbbbbbbbbbbbbbbbbULL,
27 0xddddddddddddddddULL,
28 0xeeeeeeeeeeeeeeeeULL,
29 0x7a6c7258554e494cULL, /* yeah ;-) */
30};
31
32static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad)
33{
34 printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n",
35 (unsigned long long) pattern,
36 (unsigned long long) start_bad,
37 (unsigned long long) end_bad);
38 memblock_reserve(start_bad, end_bad - start_bad);
39}
40
41static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size)
42{
43 u64 *p, *start, *end;
44 phys_addr_t start_bad, last_bad;
45 phys_addr_t start_phys_aligned;
46 const size_t incr = sizeof(pattern);
47
48 start_phys_aligned = ALIGN(start_phys, incr);
49 start = __va(start_phys_aligned);
50 end = start + (size - (start_phys_aligned - start_phys)) / incr;
51 start_bad = 0;
52 last_bad = 0;
53
54 for (p = start; p < end; p++)
55 *p = pattern;
56
57 for (p = start; p < end; p++, start_phys_aligned += incr) {
58 if (*p == pattern)
59 continue;
60 if (start_phys_aligned == last_bad + incr) {
61 last_bad += incr;
62 continue;
63 }
64 if (start_bad)
65 reserve_bad_mem(pattern, start_bad, last_bad + incr);
66 start_bad = last_bad = start_phys_aligned;
67 }
68 if (start_bad)
69 reserve_bad_mem(pattern, start_bad, last_bad + incr);
70}
71
72static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end)
73{
74 u64 i;
75 phys_addr_t this_start, this_end;
76
77 for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
78 this_start = clamp(this_start, start, end);
79 this_end = clamp(this_end, start, end);
80 if (this_start < this_end) {
81 printk(KERN_INFO " %010llx - %010llx pattern %016llx\n",
82 (unsigned long long)this_start,
83 (unsigned long long)this_end,
84 (unsigned long long)cpu_to_be64(pattern));
85 memtest(pattern, this_start, this_end - this_start);
86 }
87 }
88}
89
90/* default is disabled */
91static int memtest_pattern __initdata;
92
93static int __init parse_memtest(char *arg)
94{
95 if (arg)
96 memtest_pattern = simple_strtoul(arg, NULL, 0);
97 else
98 memtest_pattern = ARRAY_SIZE(patterns);
99
100 return 0;
101}
102
103early_param("memtest", parse_memtest);
104
105void __init early_memtest(phys_addr_t start, phys_addr_t end)
106{
107 unsigned int i;
108 unsigned int idx = 0;
109
110 if (!memtest_pattern)
111 return;
112
113 printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
114 for (i = memtest_pattern-1; i < UINT_MAX; --i) {
115 idx = i % ARRAY_SIZE(patterns);
116 do_one_pass(patterns[idx], start, end);
117 }
118}
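Editor's note: the new mm/memtest.c above writes a pattern over free memory, reads it back, and coalesces adjacent failing words into a single reserved range. The sketch below models the same write/verify/coalesce loop over an ordinary heap buffer, so a normal run only demonstrates the control flow; the helper names are invented.

/*
 * Write a pattern, verify it, and merge adjacent failures into one range.
 */
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

static void report_bad(uint64_t pattern, size_t start, size_t end)
{
	printf("  %016" PRIx64 " bad words [%zu, %zu)\n", pattern, start, end);
}

static void memtest_model(uint64_t pattern, uint64_t *buf, size_t nwords)
{
	size_t start_bad = 0, last_bad = 0;
	int in_bad = 0;

	for (size_t i = 0; i < nwords; i++)
		buf[i] = pattern;

	for (size_t i = 0; i < nwords; i++) {
		if (buf[i] == pattern)
			continue;
		if (in_bad && i == last_bad + 1) {	/* extend current range */
			last_bad = i;
			continue;
		}
		if (in_bad)				/* flush previous range */
			report_bad(pattern, start_bad, last_bad + 1);
		start_bad = last_bad = i;
		in_bad = 1;
	}
	if (in_bad)
		report_bad(pattern, start_bad, last_bad + 1);
}

int main(void)
{
	size_t nwords = 1024;
	uint64_t *buf = calloc(nwords, sizeof(*buf));
	const uint64_t patterns[] = { 0, ~0ULL, 0x5555555555555555ULL };

	if (!buf)
		return 1;
	for (size_t i = 0; i < sizeof(patterns) / sizeof(patterns[0]); i++)
		memtest_model(patterns[i], buf, nwords);
	free(buf);
	return 0;
}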
diff --git a/mm/migrate.c b/mm/migrate.c
index 85e042686031..f53838fe3dfe 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -537,7 +537,8 @@ void migrate_page_copy(struct page *newpage, struct page *page)
537 * Please do not reorder this without considering how mm/ksm.c's 537 * Please do not reorder this without considering how mm/ksm.c's
538 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). 538 * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
539 */ 539 */
540 ClearPageSwapCache(page); 540 if (PageSwapCache(page))
541 ClearPageSwapCache(page);
541 ClearPagePrivate(page); 542 ClearPagePrivate(page);
542 set_page_private(page, 0); 543 set_page_private(page, 0);
543 544
@@ -901,12 +902,23 @@ out:
901} 902}
902 903
903/* 904/*
 905 * gcc 4.7 and 4.8 on arm get an ICE when inlining unmap_and_move(). Work
906 * around it.
907 */
908#if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM)
909#define ICE_noinline noinline
910#else
911#define ICE_noinline
912#endif
913
914/*
904 * Obtain the lock on page, remove all ptes and migrate the page 915 * Obtain the lock on page, remove all ptes and migrate the page
905 * to the newly allocated page in newpage. 916 * to the newly allocated page in newpage.
906 */ 917 */
907static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, 918static ICE_noinline int unmap_and_move(new_page_t get_new_page,
908 unsigned long private, struct page *page, int force, 919 free_page_t put_new_page,
909 enum migrate_mode mode) 920 unsigned long private, struct page *page,
921 int force, enum migrate_mode mode)
910{ 922{
911 int rc = 0; 923 int rc = 0;
912 int *result = NULL; 924 int *result = NULL;
@@ -1554,30 +1566,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
1554 * page migration rate limiting control. 1566 * page migration rate limiting control.
1555 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs 1567 * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
1556 * window of time. Default here says do not migrate more than 1280M per second. 1568 * window of time. Default here says do not migrate more than 1280M per second.
1557 * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
1558 * as it is faults that reset the window, pte updates will happen unconditionally
1559 * if there has not been a fault since @pteupdate_interval_millisecs after the
1560 * throttle window closed.
1561 */ 1569 */
1562static unsigned int migrate_interval_millisecs __read_mostly = 100; 1570static unsigned int migrate_interval_millisecs __read_mostly = 100;
1563static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
1564static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); 1571static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
1565 1572
1566/* Returns true if NUMA migration is currently rate limited */
1567bool migrate_ratelimited(int node)
1568{
1569 pg_data_t *pgdat = NODE_DATA(node);
1570
1571 if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
1572 msecs_to_jiffies(pteupdate_interval_millisecs)))
1573 return false;
1574
1575 if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
1576 return false;
1577
1578 return true;
1579}
1580
1581/* Returns true if the node is migrate rate-limited after the update */ 1573/* Returns true if the node is migrate rate-limited after the update */
1582static bool numamigrate_update_ratelimit(pg_data_t *pgdat, 1574static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
1583 unsigned long nr_pages) 1575 unsigned long nr_pages)
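Editor's note: the ICE_noinline hunk above gates a noinline attribute on the compiler version to dodge a gcc 4.7/4.8 ARM internal compiler error. The same pattern in self-contained user-space C is sketched below; MY_GCC_VERSION is a local stand-in for the kernel's GCC_VERSION and the version window is only illustrative.

/*
 * Conditionally force noinline only on the compiler range that miscompiles.
 */
#include <stdio.h>

#if defined(__GNUC__) && !defined(__clang__)
#define MY_GCC_VERSION \
	(__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#else
#define MY_GCC_VERSION 0
#endif

#if (MY_GCC_VERSION >= 40700 && MY_GCC_VERSION < 40900)
#define ICE_noinline __attribute__((noinline))
#else
#define ICE_noinline
#endif

static ICE_noinline int add_one(int x)
{
	return x + 1;
}

int main(void)
{
	printf("gcc version code: %d, add_one(41) = %d\n",
	       MY_GCC_VERSION, add_one(41));
	return 0;
}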
diff --git a/mm/mlock.c b/mm/mlock.c
index 73cf0987088c..6fd2cf15e868 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -26,10 +26,10 @@
26 26
27int can_do_mlock(void) 27int can_do_mlock(void)
28{ 28{
29 if (capable(CAP_IPC_LOCK))
30 return 1;
31 if (rlimit(RLIMIT_MEMLOCK) != 0) 29 if (rlimit(RLIMIT_MEMLOCK) != 0)
32 return 1; 30 return 1;
31 if (capable(CAP_IPC_LOCK))
32 return 1;
33 return 0; 33 return 0;
34} 34}
35EXPORT_SYMBOL(can_do_mlock); 35EXPORT_SYMBOL(can_do_mlock);
@@ -205,62 +205,6 @@ out:
205 return nr_pages - 1; 205 return nr_pages - 1;
206} 206}
207 207
208/**
209 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
210 * @vma: target vma
211 * @start: start address
212 * @end: end address
213 * @nonblocking:
214 *
215 * This takes care of making the pages present too.
216 *
217 * return 0 on success, negative error code on error.
218 *
219 * vma->vm_mm->mmap_sem must be held.
220 *
221 * If @nonblocking is NULL, it may be held for read or write and will
222 * be unperturbed.
223 *
224 * If @nonblocking is non-NULL, it must held for read only and may be
225 * released. If it's released, *@nonblocking will be set to 0.
226 */
227long __mlock_vma_pages_range(struct vm_area_struct *vma,
228 unsigned long start, unsigned long end, int *nonblocking)
229{
230 struct mm_struct *mm = vma->vm_mm;
231 unsigned long nr_pages = (end - start) / PAGE_SIZE;
232 int gup_flags;
233
234 VM_BUG_ON(start & ~PAGE_MASK);
235 VM_BUG_ON(end & ~PAGE_MASK);
236 VM_BUG_ON_VMA(start < vma->vm_start, vma);
237 VM_BUG_ON_VMA(end > vma->vm_end, vma);
238 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
239
240 gup_flags = FOLL_TOUCH | FOLL_MLOCK;
241 /*
242 * We want to touch writable mappings with a write fault in order
243 * to break COW, except for shared mappings because these don't COW
244 * and we would not want to dirty them for nothing.
245 */
246 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
247 gup_flags |= FOLL_WRITE;
248
249 /*
250 * We want mlock to succeed for regions that have any permissions
251 * other than PROT_NONE.
252 */
253 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
254 gup_flags |= FOLL_FORCE;
255
256 /*
257 * We made sure addr is within a VMA, so the following will
258 * not result in a stack expansion that recurses back here.
259 */
260 return __get_user_pages(current, mm, start, nr_pages, gup_flags,
261 NULL, NULL, nonblocking);
262}
263
264/* 208/*
265 * convert get_user_pages() return value to posix mlock() error 209 * convert get_user_pages() return value to posix mlock() error
266 */ 210 */
@@ -596,7 +540,7 @@ success:
596 /* 540 /*
597 * vm_flags is protected by the mmap_sem held in write mode. 541 * vm_flags is protected by the mmap_sem held in write mode.
598 * It's okay if try_to_unmap_one unmaps a page just after we 542 * It's okay if try_to_unmap_one unmaps a page just after we
599 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 543 * set VM_LOCKED, populate_vma_page_range will bring it back.
600 */ 544 */
601 545
602 if (lock) 546 if (lock)
@@ -660,69 +604,6 @@ static int do_mlock(unsigned long start, size_t len, int on)
660 return error; 604 return error;
661} 605}
662 606
663/*
664 * __mm_populate - populate and/or mlock pages within a range of address space.
665 *
666 * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
667 * flags. VMAs must be already marked with the desired vm_flags, and
668 * mmap_sem must not be held.
669 */
670int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
671{
672 struct mm_struct *mm = current->mm;
673 unsigned long end, nstart, nend;
674 struct vm_area_struct *vma = NULL;
675 int locked = 0;
676 long ret = 0;
677
678 VM_BUG_ON(start & ~PAGE_MASK);
679 VM_BUG_ON(len != PAGE_ALIGN(len));
680 end = start + len;
681
682 for (nstart = start; nstart < end; nstart = nend) {
683 /*
684 * We want to fault in pages for [nstart; end) address range.
685 * Find first corresponding VMA.
686 */
687 if (!locked) {
688 locked = 1;
689 down_read(&mm->mmap_sem);
690 vma = find_vma(mm, nstart);
691 } else if (nstart >= vma->vm_end)
692 vma = vma->vm_next;
693 if (!vma || vma->vm_start >= end)
694 break;
695 /*
696 * Set [nstart; nend) to intersection of desired address
697 * range with the first VMA. Also, skip undesirable VMA types.
698 */
699 nend = min(end, vma->vm_end);
700 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
701 continue;
702 if (nstart < vma->vm_start)
703 nstart = vma->vm_start;
704 /*
705 * Now fault in a range of pages. __mlock_vma_pages_range()
706 * double checks the vma flags, so that it won't mlock pages
707 * if the vma was already munlocked.
708 */
709 ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
710 if (ret < 0) {
711 if (ignore_errors) {
712 ret = 0;
713 continue; /* continue at next VMA */
714 }
715 ret = __mlock_posix_error_return(ret);
716 break;
717 }
718 nend = nstart + ret * PAGE_SIZE;
719 ret = 0;
720 }
721 if (locked)
722 up_read(&mm->mmap_sem);
723 return ret; /* 0 or negative error code */
724}
725
726SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) 607SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
727{ 608{
728 unsigned long locked; 609 unsigned long locked;
@@ -750,9 +631,13 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
750 error = do_mlock(start, len, 1); 631 error = do_mlock(start, len, 1);
751 632
752 up_write(&current->mm->mmap_sem); 633 up_write(&current->mm->mmap_sem);
753 if (!error) 634 if (error)
754 error = __mm_populate(start, len, 0); 635 return error;
755 return error; 636
637 error = __mm_populate(start, len, 0);
638 if (error)
639 return __mlock_posix_error_return(error);
640 return 0;
756} 641}
757 642
758SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) 643SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
diff --git a/mm/mmap.c b/mm/mmap.c
index da9990acc08b..bb50cacc3ea5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -774,10 +774,8 @@ again: remove_next = 1 + (end > next->vm_end);
774 774
775 importer->anon_vma = exporter->anon_vma; 775 importer->anon_vma = exporter->anon_vma;
776 error = anon_vma_clone(importer, exporter); 776 error = anon_vma_clone(importer, exporter);
777 if (error) { 777 if (error)
778 importer->anon_vma = NULL;
779 return error; 778 return error;
780 }
781 } 779 }
782 } 780 }
783 781
@@ -1135,7 +1133,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
1135 * by another page fault trying to merge _that_. But that's ok: if it 1133 * by another page fault trying to merge _that_. But that's ok: if it
1136 * is being set up, that automatically means that it will be a singleton 1134 * is being set up, that automatically means that it will be a singleton
1137 * acceptable for merging, so we can do all of this optimistically. But 1135 * acceptable for merging, so we can do all of this optimistically. But
1138 * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. 1136 * we do that READ_ONCE() to make sure that we never re-load the pointer.
1139 * 1137 *
1140 * IOW: that the "list_is_singular()" test on the anon_vma_chain only 1138 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
1141 * matters for the 'stable anon_vma' case (ie the thing we want to avoid 1139 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
@@ -1149,7 +1147,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *
1149static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) 1147static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
1150{ 1148{
1151 if (anon_vma_compatible(a, b)) { 1149 if (anon_vma_compatible(a, b)) {
1152 struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); 1150 struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
1153 1151
1154 if (anon_vma && list_is_singular(&old->anon_vma_chain)) 1152 if (anon_vma && list_is_singular(&old->anon_vma_chain))
1155 return anon_vma; 1153 return anon_vma;
@@ -1553,11 +1551,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1553 1551
1554 /* Clear old maps */ 1552 /* Clear old maps */
1555 error = -ENOMEM; 1553 error = -ENOMEM;
1556munmap_back: 1554 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
1557 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 1555 &rb_parent)) {
1558 if (do_munmap(mm, addr, len)) 1556 if (do_munmap(mm, addr, len))
1559 return -ENOMEM; 1557 return -ENOMEM;
1560 goto munmap_back;
1561 } 1558 }
1562 1559
1563 /* 1560 /*
@@ -1573,7 +1570,8 @@ munmap_back:
1573 /* 1570 /*
1574 * Can we just expand an old mapping? 1571 * Can we just expand an old mapping?
1575 */ 1572 */
1576 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); 1573 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff,
1574 NULL);
1577 if (vma) 1575 if (vma)
1578 goto out; 1576 goto out;
1579 1577
@@ -2102,7 +2100,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
2102 actual_size = size; 2100 actual_size = size;
2103 if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) 2101 if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
2104 actual_size -= PAGE_SIZE; 2102 actual_size -= PAGE_SIZE;
2105 if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) 2103 if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur))
2106 return -ENOMEM; 2104 return -ENOMEM;
2107 2105
2108 /* mlock limit tests */ 2106 /* mlock limit tests */
@@ -2110,7 +2108,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
2110 unsigned long locked; 2108 unsigned long locked;
2111 unsigned long limit; 2109 unsigned long limit;
2112 locked = mm->locked_vm + grow; 2110 locked = mm->locked_vm + grow;
2113 limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); 2111 limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
2114 limit >>= PAGE_SHIFT; 2112 limit >>= PAGE_SHIFT;
2115 if (locked > limit && !capable(CAP_IPC_LOCK)) 2113 if (locked > limit && !capable(CAP_IPC_LOCK))
2116 return -ENOMEM; 2114 return -ENOMEM;
@@ -2318,7 +2316,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
2318 if (!prev || expand_stack(prev, addr)) 2316 if (!prev || expand_stack(prev, addr))
2319 return NULL; 2317 return NULL;
2320 if (prev->vm_flags & VM_LOCKED) 2318 if (prev->vm_flags & VM_LOCKED)
2321 __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); 2319 populate_vma_page_range(prev, addr, prev->vm_end, NULL);
2322 return prev; 2320 return prev;
2323} 2321}
2324#else 2322#else
@@ -2353,7 +2351,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
2353 if (expand_stack(vma, addr)) 2351 if (expand_stack(vma, addr))
2354 return NULL; 2352 return NULL;
2355 if (vma->vm_flags & VM_LOCKED) 2353 if (vma->vm_flags & VM_LOCKED)
2356 __mlock_vma_pages_range(vma, addr, start, NULL); 2354 populate_vma_page_range(vma, addr, start, NULL);
2357 return vma; 2355 return vma;
2358} 2356}
2359#endif 2357#endif
@@ -2741,11 +2739,10 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2741 /* 2739 /*
2742 * Clear old maps. this also does some error checking for us 2740 * Clear old maps. this also does some error checking for us
2743 */ 2741 */
2744 munmap_back: 2742 while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
2745 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { 2743 &rb_parent)) {
2746 if (do_munmap(mm, addr, len)) 2744 if (do_munmap(mm, addr, len))
2747 return -ENOMEM; 2745 return -ENOMEM;
2748 goto munmap_back;
2749 } 2746 }
2750 2747
2751 /* Check against address space limits *after* clearing old maps... */ 2748 /* Check against address space limits *after* clearing old maps... */
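Editor's note: the mmap.c hunks above replace the "munmap_back:" goto with a while loop: keep unmapping whatever find_vma_links() reports as overlapping until the lookup comes back clean. A toy model of that retry loop over a small array of ranges follows; nothing here is kernel API.

/*
 * Loop until no existing range overlaps the requested one, instead of
 * jumping back to a retry label.
 */
#include <stdio.h>

struct range { unsigned long start, end; int live; };

static struct range maps[] = {
	{ 0x1000, 0x3000, 1 },
	{ 0x3000, 0x5000, 1 },
	{ 0x9000, 0xa000, 1 },
};

/* Return the index of a live range overlapping [start, end), or -1. */
static int find_overlap(unsigned long start, unsigned long end)
{
	for (int i = 0; i < (int)(sizeof(maps) / sizeof(maps[0])); i++)
		if (maps[i].live && maps[i].start < end && start < maps[i].end)
			return i;
	return -1;
}

int main(void)
{
	unsigned long addr = 0x2000, len = 0x2000;
	int idx;

	/* Clear old maps: a plain loop instead of a goto-retry label. */
	while ((idx = find_overlap(addr, addr + len)) != -1) {
		printf("unmapping [%#lx, %#lx)\n", maps[idx].start, maps[idx].end);
		maps[idx].live = 0;
	}
	printf("range [%#lx, %#lx) is now free\n", addr, addr + len);
	return 0;
}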
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 44727811bf4c..88584838e704 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -75,6 +75,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
75 oldpte = *pte; 75 oldpte = *pte;
76 if (pte_present(oldpte)) { 76 if (pte_present(oldpte)) {
77 pte_t ptent; 77 pte_t ptent;
78 bool preserve_write = prot_numa && pte_write(oldpte);
78 79
79 /* 80 /*
80 * Avoid trapping faults against the zero or KSM 81 * Avoid trapping faults against the zero or KSM
@@ -94,6 +95,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
94 95
95 ptent = ptep_modify_prot_start(mm, addr, pte); 96 ptent = ptep_modify_prot_start(mm, addr, pte);
96 ptent = pte_modify(ptent, newprot); 97 ptent = pte_modify(ptent, newprot);
98 if (preserve_write)
99 ptent = pte_mkwrite(ptent);
97 100
98 /* Avoid taking write faults for known dirty pages */ 101 /* Avoid taking write faults for known dirty pages */
99 if (dirty_accountable && pte_dirty(ptent) && 102 if (dirty_accountable && pte_dirty(ptent) &&
diff --git a/mm/mremap.c b/mm/mremap.c
index 57dadc025c64..034e2d360652 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -286,8 +286,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
286 old_len = new_len; 286 old_len = new_len;
287 old_addr = new_addr; 287 old_addr = new_addr;
288 new_addr = -ENOMEM; 288 new_addr = -ENOMEM;
289 } else if (vma->vm_file && vma->vm_file->f_op->mremap) 289 } else if (vma->vm_file && vma->vm_file->f_op->mremap) {
290 vma->vm_file->f_op->mremap(vma->vm_file, new_vma); 290 err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
291 if (err < 0) {
292 move_page_tables(new_vma, new_addr, vma, old_addr,
293 moved_len, true);
294 return err;
295 }
296 }
291 297
292 /* Conceal VM_ACCOUNT so old reservation is not undone */ 298 /* Conceal VM_ACCOUNT so old reservation is not undone */
293 if (vm_flags & VM_ACCOUNT) { 299 if (vm_flags & VM_ACCOUNT) {
@@ -339,25 +345,25 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
339 struct vm_area_struct *vma = find_vma(mm, addr); 345 struct vm_area_struct *vma = find_vma(mm, addr);
340 346
341 if (!vma || vma->vm_start > addr) 347 if (!vma || vma->vm_start > addr)
342 goto Efault; 348 return ERR_PTR(-EFAULT);
343 349
344 if (is_vm_hugetlb_page(vma)) 350 if (is_vm_hugetlb_page(vma))
345 goto Einval; 351 return ERR_PTR(-EINVAL);
346 352
347 /* We can't remap across vm area boundaries */ 353 /* We can't remap across vm area boundaries */
348 if (old_len > vma->vm_end - addr) 354 if (old_len > vma->vm_end - addr)
349 goto Efault; 355 return ERR_PTR(-EFAULT);
350 356
351 /* Need to be careful about a growing mapping */ 357 /* Need to be careful about a growing mapping */
352 if (new_len > old_len) { 358 if (new_len > old_len) {
353 unsigned long pgoff; 359 unsigned long pgoff;
354 360
355 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) 361 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
356 goto Efault; 362 return ERR_PTR(-EFAULT);
357 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; 363 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
358 pgoff += vma->vm_pgoff; 364 pgoff += vma->vm_pgoff;
359 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) 365 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
360 goto Einval; 366 return ERR_PTR(-EINVAL);
361 } 367 }
362 368
363 if (vma->vm_flags & VM_LOCKED) { 369 if (vma->vm_flags & VM_LOCKED) {
@@ -366,29 +372,20 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
366 lock_limit = rlimit(RLIMIT_MEMLOCK); 372 lock_limit = rlimit(RLIMIT_MEMLOCK);
367 locked += new_len - old_len; 373 locked += new_len - old_len;
368 if (locked > lock_limit && !capable(CAP_IPC_LOCK)) 374 if (locked > lock_limit && !capable(CAP_IPC_LOCK))
369 goto Eagain; 375 return ERR_PTR(-EAGAIN);
370 } 376 }
371 377
372 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) 378 if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
373 goto Enomem; 379 return ERR_PTR(-ENOMEM);
374 380
375 if (vma->vm_flags & VM_ACCOUNT) { 381 if (vma->vm_flags & VM_ACCOUNT) {
376 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; 382 unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
377 if (security_vm_enough_memory_mm(mm, charged)) 383 if (security_vm_enough_memory_mm(mm, charged))
378 goto Efault; 384 return ERR_PTR(-ENOMEM);
379 *p = charged; 385 *p = charged;
380 } 386 }
381 387
382 return vma; 388 return vma;
383
384Efault: /* very odd choice for most of the cases, but... */
385 return ERR_PTR(-EFAULT);
386Einval:
387 return ERR_PTR(-EINVAL);
388Enomem:
389 return ERR_PTR(-ENOMEM);
390Eagain:
391 return ERR_PTR(-EAGAIN);
392} 389}
393 390
394static unsigned long mremap_to(unsigned long addr, unsigned long old_len, 391static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
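Editor's note: vma_to_resize() above drops its Efault/Einval/Enomem/Eagain labels in favour of returning ERR_PTR(-E...) directly at each failure site. The sketch below imitates the ERR_PTR/IS_ERR/PTR_ERR encoding in user space (simplified from the kernel idea) to show why the direct returns read cleanly; the vma_model struct and the lookup are invented.

/*
 * Encode a negative errno in a pointer return value and decode it in the
 * caller, so error paths can return directly without goto labels.
 */
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct vma_model { unsigned long start, end; };

static struct vma_model global_vma = { 0x1000, 0x9000 };

/* Return the vma covering addr..addr+old_len, or an encoded error. */
static struct vma_model *vma_to_resize_model(unsigned long addr,
					     unsigned long old_len)
{
	struct vma_model *vma = &global_vma;

	if (addr < vma->start || addr >= vma->end)
		return ERR_PTR(-EFAULT);	/* direct return, no label */
	if (old_len > vma->end - addr)
		return ERR_PTR(-EFAULT);
	return vma;
}

int main(void)
{
	struct vma_model *vma = vma_to_resize_model(0x20000, 0x1000);

	if (IS_ERR(vma))
		printf("lookup failed: errno %ld\n", -PTR_ERR(vma));
	else
		printf("vma [%#lx, %#lx)\n", vma->start, vma->end);
	return 0;
}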
diff --git a/mm/nommu.c b/mm/nommu.c
index 3e67e7538ecf..e544508e2a4b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -62,6 +62,7 @@ void *high_memory;
62EXPORT_SYMBOL(high_memory); 62EXPORT_SYMBOL(high_memory);
63struct page *mem_map; 63struct page *mem_map;
64unsigned long max_mapnr; 64unsigned long max_mapnr;
65EXPORT_SYMBOL(max_mapnr);
65unsigned long highest_memmap_pfn; 66unsigned long highest_memmap_pfn;
66struct percpu_counter vm_committed_as; 67struct percpu_counter vm_committed_as;
67int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 68int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
@@ -1015,7 +1016,7 @@ static int validate_mmap_request(struct file *file,
1015 * device */ 1016 * device */
1016 if (!file->f_op->get_unmapped_area) 1017 if (!file->f_op->get_unmapped_area)
1017 capabilities &= ~NOMMU_MAP_DIRECT; 1018 capabilities &= ~NOMMU_MAP_DIRECT;
1018 if (!file->f_op->read) 1019 if (!(file->f_mode & FMODE_CAN_READ))
1019 capabilities &= ~NOMMU_MAP_COPY; 1020 capabilities &= ~NOMMU_MAP_COPY;
1020 1021
1021 /* The file shall have been opened with read permission. */ 1022 /* The file shall have been opened with read permission. */
@@ -1239,7 +1240,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1239 1240
1240 old_fs = get_fs(); 1241 old_fs = get_fs();
1241 set_fs(KERNEL_DS); 1242 set_fs(KERNEL_DS);
1242 ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); 1243 ret = __vfs_read(vma->vm_file, base, len, &fpos);
1243 set_fs(old_fs); 1244 set_fs(old_fs);
1244 1245
1245 if (ret < 0) 1246 if (ret < 0)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 642f38cb175a..2b665da1b3c9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -408,7 +408,7 @@ bool oom_killer_disabled __read_mostly;
408static DECLARE_RWSEM(oom_sem); 408static DECLARE_RWSEM(oom_sem);
409 409
410/** 410/**
411 * mark_tsk_oom_victim - marks the given taks as OOM victim. 411 * mark_tsk_oom_victim - marks the given task as OOM victim.
412 * @tsk: task to mark 412 * @tsk: task to mark
413 * 413 *
414 * Has to be called with oom_sem taken for read and never after 414 * Has to be called with oom_sem taken for read and never after
@@ -612,7 +612,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
612 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 612 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
613 */ 613 */
614void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 614void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
615 int order, const nodemask_t *nodemask) 615 int order, const nodemask_t *nodemask,
616 struct mem_cgroup *memcg)
616{ 617{
617 if (likely(!sysctl_panic_on_oom)) 618 if (likely(!sysctl_panic_on_oom))
618 return; 619 return;
@@ -625,7 +626,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
625 if (constraint != CONSTRAINT_NONE) 626 if (constraint != CONSTRAINT_NONE)
626 return; 627 return;
627 } 628 }
628 dump_header(NULL, gfp_mask, order, NULL, nodemask); 629 dump_header(NULL, gfp_mask, order, memcg, nodemask);
629 panic("Out of memory: %s panic_on_oom is enabled\n", 630 panic("Out of memory: %s panic_on_oom is enabled\n",
630 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 631 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
631} 632}
@@ -740,7 +741,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
740 constraint = constrained_alloc(zonelist, gfp_mask, nodemask, 741 constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
741 &totalpages); 742 &totalpages);
742 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; 743 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
743 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); 744 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL);
744 745
745 if (sysctl_oom_kill_allocating_task && current->mm && 746 if (sysctl_oom_kill_allocating_task && current->mm &&
746 !oom_unkillable_task(current, NULL, nodemask) && 747 !oom_unkillable_task(current, NULL, nodemask) &&
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 45e187b2d971..5daf5568b9e1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -857,8 +857,11 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
857 * bw * elapsed + write_bandwidth * (period - elapsed) 857 * bw * elapsed + write_bandwidth * (period - elapsed)
858 * write_bandwidth = --------------------------------------------------- 858 * write_bandwidth = ---------------------------------------------------
859 * period 859 * period
860 *
861 * @written may have decreased due to account_page_redirty().
862 * Avoid underflowing @bw calculation.
860 */ 863 */
861 bw = written - bdi->written_stamp; 864 bw = written - min(written, bdi->written_stamp);
862 bw *= HZ; 865 bw *= HZ;
863 if (unlikely(elapsed > period)) { 866 if (unlikely(elapsed > period)) {
864 do_div(bw, elapsed); 867 do_div(bw, elapsed);
@@ -922,7 +925,7 @@ static void global_update_bandwidth(unsigned long thresh,
922 unsigned long now) 925 unsigned long now)
923{ 926{
924 static DEFINE_SPINLOCK(dirty_lock); 927 static DEFINE_SPINLOCK(dirty_lock);
925 static unsigned long update_time; 928 static unsigned long update_time = INITIAL_JIFFIES;
926 929
927 /* 930 /*
928 * check locklessly first to optimize away locking for the most time 931 * check locklessly first to optimize away locking for the most time
@@ -2108,6 +2111,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
2108EXPORT_SYMBOL(account_page_dirtied); 2111EXPORT_SYMBOL(account_page_dirtied);
2109 2112
2110/* 2113/*
 2114 * Helper function for deaccounting a dirty page without writeback.
2115 *
2116 * Doing this should *normally* only ever be done when a page
2117 * is truncated, and is not actually mapped anywhere at all. However,
2118 * fs/buffer.c does this when it notices that somebody has cleaned
2119 * out all the buffers on a page without actually doing it through
2120 * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
2121 */
2122void account_page_cleaned(struct page *page, struct address_space *mapping)
2123{
2124 if (mapping_cap_account_dirty(mapping)) {
2125 dec_zone_page_state(page, NR_FILE_DIRTY);
2126 dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
2127 task_io_account_cancelled_write(PAGE_CACHE_SIZE);
2128 }
2129}
2130EXPORT_SYMBOL(account_page_cleaned);
2131
2132/*
2111 * For address_spaces which do not use buffers. Just tag the page as dirty in 2133 * For address_spaces which do not use buffers. Just tag the page as dirty in
2112 * its radix tree. 2134 * its radix tree.
2113 * 2135 *
@@ -2206,7 +2228,8 @@ int set_page_dirty(struct page *page)
2206 * it will confuse readahead and make it restart the size rampup 2228 * it will confuse readahead and make it restart the size rampup
2207 * process. But it's a trivial problem. 2229 * process. But it's a trivial problem.
2208 */ 2230 */
2209 ClearPageReclaim(page); 2231 if (PageReclaim(page))
2232 ClearPageReclaim(page);
2210#ifdef CONFIG_BLOCK 2233#ifdef CONFIG_BLOCK
2211 if (!spd) 2234 if (!spd)
2212 spd = __set_page_dirty_buffers; 2235 spd = __set_page_dirty_buffers;
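Editor's note: the bandwidth hunk above subtracts min(written, bdi->written_stamp) because account_page_redirty() can make the written counter appear to move backwards, and a plain unsigned subtraction would wrap to a huge bogus bandwidth. A minimal demonstration of that guard, with stand-in values:

/*
 * Show why the guarded subtraction is needed for unsigned counters.
 */
#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long written = 1000;		/* pages written so far */
	unsigned long written_stamp = 1010;	/* stamp taken before a redirty */

	unsigned long wrapped = written - written_stamp;	/* underflows */
	unsigned long guarded = written - min_ul(written, written_stamp);

	printf("plain subtraction:   %lu (nonsense)\n", wrapped);
	printf("guarded subtraction: %lu\n", guarded);		/* 0 */
	return 0;
}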
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7abfa70cdc1a..ebffa0e4a9c0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1032,11 +1032,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1032static int fallbacks[MIGRATE_TYPES][4] = { 1032static int fallbacks[MIGRATE_TYPES][4] = {
1033 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 1033 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
1034 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 1034 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
1035 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
1035#ifdef CONFIG_CMA 1036#ifdef CONFIG_CMA
1036 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
1037 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ 1037 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
1038#else
1039 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
1040#endif 1038#endif
1041 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ 1039 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
1042#ifdef CONFIG_MEMORY_ISOLATION 1040#ifdef CONFIG_MEMORY_ISOLATION
@@ -1044,6 +1042,17 @@ static int fallbacks[MIGRATE_TYPES][4] = {
1044#endif 1042#endif
1045}; 1043};
1046 1044
1045#ifdef CONFIG_CMA
1046static struct page *__rmqueue_cma_fallback(struct zone *zone,
1047 unsigned int order)
1048{
1049 return __rmqueue_smallest(zone, order, MIGRATE_CMA);
1050}
1051#else
1052static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1053 unsigned int order) { return NULL; }
1054#endif
1055
1047/* 1056/*
1048 * Move the free pages in a range to the free lists of the requested type. 1057 * Move the free pages in a range to the free lists of the requested type.
1049 * Note that start_page and end_pages are not aligned on a pageblock 1058 * Note that start_page and end_pages are not aligned on a pageblock
@@ -1136,14 +1145,40 @@ static void change_pageblock_range(struct page *pageblock_page,
1136 * as fragmentation caused by those allocations polluting movable pageblocks 1145 * as fragmentation caused by those allocations polluting movable pageblocks
1137 * is worse than movable allocations stealing from unmovable and reclaimable 1146 * is worse than movable allocations stealing from unmovable and reclaimable
1138 * pageblocks. 1147 * pageblocks.
1139 *
1140 * If we claim more than half of the pageblock, change pageblock's migratetype
1141 * as well.
1142 */ 1148 */
1143static void try_to_steal_freepages(struct zone *zone, struct page *page, 1149static bool can_steal_fallback(unsigned int order, int start_mt)
1144 int start_type, int fallback_type) 1150{
1151 /*
 1153	 * Leaving this order check here is intentional, although there is a
 1154	 * more relaxed check below. The reason is that we can actually steal
 1155	 * the whole pageblock if this condition is met, but the check below
 1156	 * does not guarantee it; it is only a heuristic, so it could be
 1157	 * changed at any time.
1157 */
1158 if (order >= pageblock_order)
1159 return true;
1160
1161 if (order >= pageblock_order / 2 ||
1162 start_mt == MIGRATE_RECLAIMABLE ||
1163 start_mt == MIGRATE_UNMOVABLE ||
1164 page_group_by_mobility_disabled)
1165 return true;
1166
1167 return false;
1168}
1169
1170/*
 1171 * This function implements the actual steal behaviour. If the order is large
 1172 * enough, we can steal the whole pageblock. If not, we first move the free
 1173 * pages in this pageblock and check whether at least half of the pages were
 1174 * moved. If so, we can change the migratetype of the pageblock and permanently
 1175 * use its pages as the requested migratetype in the future.
1176 */
1177static void steal_suitable_fallback(struct zone *zone, struct page *page,
1178 int start_type)
1145{ 1179{
1146 int current_order = page_order(page); 1180 int current_order = page_order(page);
1181 int pages;
1147 1182
1148 /* Take ownership for orders >= pageblock_order */ 1183 /* Take ownership for orders >= pageblock_order */
1149 if (current_order >= pageblock_order) { 1184 if (current_order >= pageblock_order) {
@@ -1151,19 +1186,49 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page,
1151 return; 1186 return;
1152 } 1187 }
1153 1188
1154 if (current_order >= pageblock_order / 2 || 1189 pages = move_freepages_block(zone, page, start_type);
1155 start_type == MIGRATE_RECLAIMABLE || 1190
1156 start_type == MIGRATE_UNMOVABLE || 1191 /* Claim the whole block if over half of it is free */
1157 page_group_by_mobility_disabled) { 1192 if (pages >= (1 << (pageblock_order-1)) ||
1158 int pages; 1193 page_group_by_mobility_disabled)
1194 set_pageblock_migratetype(page, start_type);
1195}
1196
1197/*
 1198 * Check whether there is a suitable fallback freepage with the requested
 1199 * order. If only_stealable is true, this function returns fallback_mt only
 1200 * if we can steal the other freepages altogether. This helps to reduce
 1201 * fragmentation due to mixed migratetype pages in one pageblock.
1202 */
1203int find_suitable_fallback(struct free_area *area, unsigned int order,
1204 int migratetype, bool only_stealable, bool *can_steal)
1205{
1206 int i;
1207 int fallback_mt;
1208
1209 if (area->nr_free == 0)
1210 return -1;
1211
1212 *can_steal = false;
1213 for (i = 0;; i++) {
1214 fallback_mt = fallbacks[migratetype][i];
1215 if (fallback_mt == MIGRATE_RESERVE)
1216 break;
1217
1218 if (list_empty(&area->free_list[fallback_mt]))
1219 continue;
1159 1220
1160 pages = move_freepages_block(zone, page, start_type); 1221 if (can_steal_fallback(order, migratetype))
1222 *can_steal = true;
1161 1223
1162 /* Claim the whole block if over half of it is free */ 1224 if (!only_stealable)
1163 if (pages >= (1 << (pageblock_order-1)) || 1225 return fallback_mt;
1164 page_group_by_mobility_disabled) 1226
1165 set_pageblock_migratetype(page, start_type); 1227 if (*can_steal)
1228 return fallback_mt;
1166 } 1229 }
1230
1231 return -1;
1167} 1232}
1168 1233
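Editor's note: the refactor above splits fallback selection into find_suitable_fallback(), which scans the per-migratetype fallback table and skips empty free lists, and steal_suitable_fallback(), which claims the pageblock when more than half of it could be moved. Below is a user-space model of the selection half; the table is simplified (no CMA or isolation rows), the only_stealable parameter is omitted, and all scaffolding names are invented.

/*
 * Scan the fallback table for a migratetype with free pages and report
 * whether the whole pageblock may be stolen.
 */
#include <stdbool.h>
#include <stdio.h>

enum mt { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE, MT_TYPES };

/* Simplified fallback table, each row terminated by MT_RESERVE. */
static const enum mt fallbacks[MT_TYPES][3] = {
	[MT_UNMOVABLE]   = { MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE },
	[MT_RECLAIMABLE] = { MT_UNMOVABLE, MT_MOVABLE, MT_RESERVE },
	[MT_MOVABLE]     = { MT_RECLAIMABLE, MT_UNMOVABLE, MT_RESERVE },
	[MT_RESERVE]     = { MT_RESERVE },
};

static bool can_steal_fallback(unsigned int order, enum mt start_mt,
			       unsigned int pageblock_order)
{
	if (order >= pageblock_order)
		return true;
	return order >= pageblock_order / 2 ||
	       start_mt == MT_RECLAIMABLE || start_mt == MT_UNMOVABLE;
}

/* free_count[mt] stands in for list_empty(&area->free_list[mt]). */
static int find_suitable_fallback(const unsigned int *free_count,
				  unsigned int order, enum mt start_mt,
				  unsigned int pageblock_order, bool *can_steal)
{
	*can_steal = false;
	for (int i = 0; ; i++) {
		enum mt fb = fallbacks[start_mt][i];

		if (fb == MT_RESERVE)
			break;
		if (free_count[fb] == 0)
			continue;
		if (can_steal_fallback(order, start_mt, pageblock_order))
			*can_steal = true;
		return fb;
	}
	return -1;
}

int main(void)
{
	unsigned int free_count[MT_TYPES] = { 0, 4, 12, 0 };
	bool can_steal;
	int fb = find_suitable_fallback(free_count, 5, MT_UNMOVABLE, 9,
					&can_steal);

	printf("fallback type %d, can_steal=%d\n", fb, can_steal);
	return 0;
}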
1169/* Remove an element from the buddy allocator from the fallback list */ 1234/* Remove an element from the buddy allocator from the fallback list */
@@ -1173,64 +1238,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1173 struct free_area *area; 1238 struct free_area *area;
1174 unsigned int current_order; 1239 unsigned int current_order;
1175 struct page *page; 1240 struct page *page;
1241 int fallback_mt;
1242 bool can_steal;
1176 1243
1177 /* Find the largest possible block of pages in the other list */ 1244 /* Find the largest possible block of pages in the other list */
1178 for (current_order = MAX_ORDER-1; 1245 for (current_order = MAX_ORDER-1;
1179 current_order >= order && current_order <= MAX_ORDER-1; 1246 current_order >= order && current_order <= MAX_ORDER-1;
1180 --current_order) { 1247 --current_order) {
1181 int i; 1248 area = &(zone->free_area[current_order]);
1182 for (i = 0;; i++) { 1249 fallback_mt = find_suitable_fallback(area, current_order,
1183 int migratetype = fallbacks[start_migratetype][i]; 1250 start_migratetype, false, &can_steal);
1184 int buddy_type = start_migratetype; 1251 if (fallback_mt == -1)
1185 1252 continue;
1186 /* MIGRATE_RESERVE handled later if necessary */
1187 if (migratetype == MIGRATE_RESERVE)
1188 break;
1189
1190 area = &(zone->free_area[current_order]);
1191 if (list_empty(&area->free_list[migratetype]))
1192 continue;
1193
1194 page = list_entry(area->free_list[migratetype].next,
1195 struct page, lru);
1196 area->nr_free--;
1197
1198 if (!is_migrate_cma(migratetype)) {
1199 try_to_steal_freepages(zone, page,
1200 start_migratetype,
1201 migratetype);
1202 } else {
1203 /*
1204 * When borrowing from MIGRATE_CMA, we need to
1205 * release the excess buddy pages to CMA
1206 * itself, and we do not try to steal extra
1207 * free pages.
1208 */
1209 buddy_type = migratetype;
1210 }
1211 1253
1212 /* Remove the page from the freelists */ 1254 page = list_entry(area->free_list[fallback_mt].next,
1213 list_del(&page->lru); 1255 struct page, lru);
1214 rmv_page_order(page); 1256 if (can_steal)
1257 steal_suitable_fallback(zone, page, start_migratetype);
1215 1258
1216 expand(zone, page, order, current_order, area, 1259 /* Remove the page from the freelists */
1217 buddy_type); 1260 area->nr_free--;
1261 list_del(&page->lru);
1262 rmv_page_order(page);
1218 1263
1219 /* 1264 expand(zone, page, order, current_order, area,
1220 * The freepage_migratetype may differ from pageblock's 1265 start_migratetype);
1221 * migratetype depending on the decisions in 1266 /*
1222 * try_to_steal_freepages(). This is OK as long as it 1267 * The freepage_migratetype may differ from pageblock's
1223 * does not differ for MIGRATE_CMA pageblocks. For CMA 1268 * migratetype depending on the decisions in
1224 * we need to make sure unallocated pages flushed from 1269 * try_to_steal_freepages(). This is OK as long as it
1225 * pcp lists are returned to the correct freelist. 1270 * does not differ for MIGRATE_CMA pageblocks. For CMA
1226 */ 1271 * we need to make sure unallocated pages flushed from
1227 set_freepage_migratetype(page, buddy_type); 1272 * pcp lists are returned to the correct freelist.
1273 */
1274 set_freepage_migratetype(page, start_migratetype);
1228 1275
1229 trace_mm_page_alloc_extfrag(page, order, current_order, 1276 trace_mm_page_alloc_extfrag(page, order, current_order,
1230 start_migratetype, migratetype); 1277 start_migratetype, fallback_mt);
1231 1278
1232 return page; 1279 return page;
1233 }
1234 } 1280 }
1235 1281
1236 return NULL; 1282 return NULL;
@@ -1249,7 +1295,11 @@ retry_reserve:
1249 page = __rmqueue_smallest(zone, order, migratetype); 1295 page = __rmqueue_smallest(zone, order, migratetype);
1250 1296
1251 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { 1297 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
1252 page = __rmqueue_fallback(zone, order, migratetype); 1298 if (migratetype == MIGRATE_MOVABLE)
1299 page = __rmqueue_cma_fallback(zone, order);
1300
1301 if (!page)
1302 page = __rmqueue_fallback(zone, order, migratetype);
1253 1303
1254 /* 1304 /*
1255 * Use MIGRATE_RESERVE rather than fail an allocation. goto 1305 * Use MIGRATE_RESERVE rather than fail an allocation. goto
@@ -1321,7 +1371,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1321 int to_drain, batch; 1371 int to_drain, batch;
1322 1372
1323 local_irq_save(flags); 1373 local_irq_save(flags);
1324 batch = ACCESS_ONCE(pcp->batch); 1374 batch = READ_ONCE(pcp->batch);
1325 to_drain = min(pcp->count, batch); 1375 to_drain = min(pcp->count, batch);
1326 if (to_drain > 0) { 1376 if (to_drain > 0) {
1327 free_pcppages_bulk(zone, to_drain, pcp); 1377 free_pcppages_bulk(zone, to_drain, pcp);
@@ -1520,7 +1570,7 @@ void free_hot_cold_page(struct page *page, bool cold)
1520 list_add_tail(&page->lru, &pcp->lists[migratetype]); 1570 list_add_tail(&page->lru, &pcp->lists[migratetype]);
1521 pcp->count++; 1571 pcp->count++;
1522 if (pcp->count >= pcp->high) { 1572 if (pcp->count >= pcp->high) {
1523 unsigned long batch = ACCESS_ONCE(pcp->batch); 1573 unsigned long batch = READ_ONCE(pcp->batch);
1524 free_pcppages_bulk(zone, batch, pcp); 1574 free_pcppages_bulk(zone, batch, pcp);
1525 pcp->count -= batch; 1575 pcp->count -= batch;
1526 } 1576 }
@@ -2362,18 +2412,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2362 *did_some_progress = 1; 2412 *did_some_progress = 1;
2363 goto out; 2413 goto out;
2364 } 2414 }
2365 /* 2415 /* The OOM killer may not free memory on a specific node */
2366 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
2367 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
2368 * The caller should handle page allocation failure by itself if
2369 * it specifies __GFP_THISNODE.
2370 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
2371 */
2372 if (gfp_mask & __GFP_THISNODE) 2416 if (gfp_mask & __GFP_THISNODE)
2373 goto out; 2417 goto out;
2374 } 2418 }
2375 /* Exhausted what can be done so it's blamo time */ 2419 /* Exhausted what can be done so it's blamo time */
2376 if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) 2420 if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
2421 || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
2377 *did_some_progress = 1; 2422 *did_some_progress = 1;
2378out: 2423out:
2379 oom_zonelist_unlock(ac->zonelist, gfp_mask); 2424 oom_zonelist_unlock(ac->zonelist, gfp_mask);
@@ -2622,15 +2667,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2622 } 2667 }
2623 2668
2624 /* 2669 /*
2625 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 2670 * If this allocation cannot block and it is for a specific node, then
2626 * __GFP_NOWARN set) should not cause reclaim since the subsystem 2671 * fail early. There's no need to wakeup kswapd or retry for a
2627 * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim 2672 * speculative node-specific allocation.
2628 * using a larger set of nodes after it has established that the
2629 * allowed per node queues are empty and that nodes are
2630 * over allocated.
2631 */ 2673 */
2632 if (IS_ENABLED(CONFIG_NUMA) && 2674 if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
2633 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2634 goto nopage; 2675 goto nopage;
2635 2676
2636retry: 2677retry:
@@ -2823,7 +2864,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2823 /* 2864 /*
2824 * Check the zones suitable for the gfp_mask contain at least one 2865 * Check the zones suitable for the gfp_mask contain at least one
2825 * valid zone. It's possible to have an empty zonelist as a result 2866 * valid zone. It's possible to have an empty zonelist as a result
2826 * of GFP_THISNODE and a memoryless node 2867 * of __GFP_THISNODE and a memoryless node
2827 */ 2868 */
2828 if (unlikely(!zonelist->_zonerefs->zone)) 2869 if (unlikely(!zonelist->_zonerefs->zone))
2829 return NULL; 2870 return NULL;
@@ -3200,38 +3241,31 @@ static void show_migration_types(unsigned char type)
3200 * Show free area list (used inside shift_scroll-lock stuff) 3241 * Show free area list (used inside shift_scroll-lock stuff)
3201 * We also calculate the percentage fragmentation. We do this by counting the 3242 * We also calculate the percentage fragmentation. We do this by counting the
3202 * memory on each free list with the exception of the first item on the list. 3243 * memory on each free list with the exception of the first item on the list.
3203 * Suppresses nodes that are not allowed by current's cpuset if 3244 *
3204 * SHOW_MEM_FILTER_NODES is passed. 3245 * Bits in @filter:
3246 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
3247 * cpuset.
3205 */ 3248 */
3206void show_free_areas(unsigned int filter) 3249void show_free_areas(unsigned int filter)
3207{ 3250{
3251 unsigned long free_pcp = 0;
3208 int cpu; 3252 int cpu;
3209 struct zone *zone; 3253 struct zone *zone;
3210 3254
3211 for_each_populated_zone(zone) { 3255 for_each_populated_zone(zone) {
3212 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3256 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3213 continue; 3257 continue;
3214 show_node(zone);
3215 printk("%s per-cpu:\n", zone->name);
3216 3258
3217 for_each_online_cpu(cpu) { 3259 for_each_online_cpu(cpu)
3218 struct per_cpu_pageset *pageset; 3260 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
3219
3220 pageset = per_cpu_ptr(zone->pageset, cpu);
3221
3222 printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n",
3223 cpu, pageset->pcp.high,
3224 pageset->pcp.batch, pageset->pcp.count);
3225 }
3226 } 3261 }
3227 3262
3228 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" 3263 printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
3229 " active_file:%lu inactive_file:%lu isolated_file:%lu\n" 3264 " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
3230 " unevictable:%lu" 3265 " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
3231 " dirty:%lu writeback:%lu unstable:%lu\n" 3266 " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3232 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
3233 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" 3267 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
3234 " free_cma:%lu\n", 3268 " free:%lu free_pcp:%lu free_cma:%lu\n",
3235 global_page_state(NR_ACTIVE_ANON), 3269 global_page_state(NR_ACTIVE_ANON),
3236 global_page_state(NR_INACTIVE_ANON), 3270 global_page_state(NR_INACTIVE_ANON),
3237 global_page_state(NR_ISOLATED_ANON), 3271 global_page_state(NR_ISOLATED_ANON),
@@ -3242,13 +3276,14 @@ void show_free_areas(unsigned int filter)
3242 global_page_state(NR_FILE_DIRTY), 3276 global_page_state(NR_FILE_DIRTY),
3243 global_page_state(NR_WRITEBACK), 3277 global_page_state(NR_WRITEBACK),
3244 global_page_state(NR_UNSTABLE_NFS), 3278 global_page_state(NR_UNSTABLE_NFS),
3245 global_page_state(NR_FREE_PAGES),
3246 global_page_state(NR_SLAB_RECLAIMABLE), 3279 global_page_state(NR_SLAB_RECLAIMABLE),
3247 global_page_state(NR_SLAB_UNRECLAIMABLE), 3280 global_page_state(NR_SLAB_UNRECLAIMABLE),
3248 global_page_state(NR_FILE_MAPPED), 3281 global_page_state(NR_FILE_MAPPED),
3249 global_page_state(NR_SHMEM), 3282 global_page_state(NR_SHMEM),
3250 global_page_state(NR_PAGETABLE), 3283 global_page_state(NR_PAGETABLE),
3251 global_page_state(NR_BOUNCE), 3284 global_page_state(NR_BOUNCE),
3285 global_page_state(NR_FREE_PAGES),
3286 free_pcp,
3252 global_page_state(NR_FREE_CMA_PAGES)); 3287 global_page_state(NR_FREE_CMA_PAGES));
3253 3288
3254 for_each_populated_zone(zone) { 3289 for_each_populated_zone(zone) {
@@ -3256,6 +3291,11 @@ void show_free_areas(unsigned int filter)
3256 3291
3257 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3292 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3258 continue; 3293 continue;
3294
3295 free_pcp = 0;
3296 for_each_online_cpu(cpu)
3297 free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
3298
3259 show_node(zone); 3299 show_node(zone);
3260 printk("%s" 3300 printk("%s"
3261 " free:%lukB" 3301 " free:%lukB"
@@ -3282,6 +3322,8 @@ void show_free_areas(unsigned int filter)
3282 " pagetables:%lukB" 3322 " pagetables:%lukB"
3283 " unstable:%lukB" 3323 " unstable:%lukB"
3284 " bounce:%lukB" 3324 " bounce:%lukB"
3325 " free_pcp:%lukB"
3326 " local_pcp:%ukB"
3285 " free_cma:%lukB" 3327 " free_cma:%lukB"
3286 " writeback_tmp:%lukB" 3328 " writeback_tmp:%lukB"
3287 " pages_scanned:%lu" 3329 " pages_scanned:%lu"
@@ -3313,6 +3355,8 @@ void show_free_areas(unsigned int filter)
3313 K(zone_page_state(zone, NR_PAGETABLE)), 3355 K(zone_page_state(zone, NR_PAGETABLE)),
3314 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 3356 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
3315 K(zone_page_state(zone, NR_BOUNCE)), 3357 K(zone_page_state(zone, NR_BOUNCE)),
3358 K(free_pcp),
3359 K(this_cpu_read(zone->pageset->pcp.count)),
3316 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3360 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3317 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3361 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3318 K(zone_page_state(zone, NR_PAGES_SCANNED)), 3362 K(zone_page_state(zone, NR_PAGES_SCANNED)),
@@ -5716,7 +5760,7 @@ static void __setup_per_zone_wmarks(void)
5716 * value here. 5760 * value here.
5717 * 5761 *
5718 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) 5762 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
5719 * deltas controls asynch page reclaim, and so should 5763 * deltas control asynch page reclaim, and so should
5720 * not be capped for highmem. 5764 * not be capped for highmem.
5721 */ 5765 */
5722 unsigned long min_pages; 5766 unsigned long min_pages;
@@ -6163,7 +6207,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
6163 mask <<= (BITS_PER_LONG - bitidx - 1); 6207 mask <<= (BITS_PER_LONG - bitidx - 1);
6164 flags <<= (BITS_PER_LONG - bitidx - 1); 6208 flags <<= (BITS_PER_LONG - bitidx - 1);
6165 6209
6166 word = ACCESS_ONCE(bitmap[word_bitidx]); 6210 word = READ_ONCE(bitmap[word_bitidx]);
6167 for (;;) { 6211 for (;;) {
6168 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); 6212 old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
6169 if (word == old_word) 6213 if (word == old_word)
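
The page_alloc.c hunks above replace try_to_steal_freepages() with can_steal_fallback(), steal_suitable_fallback() and find_suitable_fallback(). A minimal userspace sketch of that decision logic follows; the constants, the per-migratetype free counts and main() are made up for illustration, and this is not the kernel implementation:

#include <stdbool.h>
#include <stdio.h>

enum { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE, MT_TYPES };

#define PAGEBLOCK_ORDER 9

static const int fallbacks[MT_TYPES][3] = {
        [MT_UNMOVABLE]   = { MT_RECLAIMABLE, MT_MOVABLE,   MT_RESERVE },
        [MT_RECLAIMABLE] = { MT_UNMOVABLE,   MT_MOVABLE,   MT_RESERVE },
        [MT_MOVABLE]     = { MT_RECLAIMABLE, MT_UNMOVABLE, MT_RESERVE },
        [MT_RESERVE]     = { MT_RESERVE,     MT_RESERVE,   MT_RESERVE },
};

/* Toy free area: one free count per migratetype instead of real free lists. */
struct free_area { int nr_free[MT_TYPES]; };

/* Same shape as can_steal_fallback() above: large requests, or requests from
 * the less movable types, may take over a foreign pageblock. */
static bool can_steal_fallback(unsigned int order, int start_mt)
{
        if (order >= PAGEBLOCK_ORDER)
                return true;
        return order >= PAGEBLOCK_ORDER / 2 ||
               start_mt == MT_RECLAIMABLE || start_mt == MT_UNMOVABLE;
}

/* Walk the fallback list until MT_RESERVE, like find_suitable_fallback(). */
static int find_suitable_fallback(const struct free_area *area,
                                  unsigned int order, int start_mt,
                                  bool only_stealable, bool *can_steal)
{
        *can_steal = false;
        for (int i = 0; ; i++) {
                int mt = fallbacks[start_mt][i];

                if (mt == MT_RESERVE)
                        break;
                if (!area->nr_free[mt])
                        continue;
                if (can_steal_fallback(order, start_mt))
                        *can_steal = true;
                if (!only_stealable || *can_steal)
                        return mt;
        }
        return -1;
}

int main(void)
{
        struct free_area area = { .nr_free = { [MT_MOVABLE] = 4 } };
        bool steal;
        int mt = find_suitable_fallback(&area, 3, MT_UNMOVABLE, false, &steal);

        printf("fallback mt=%d, can_steal=%d\n", mt, steal);
        return 0;
}

The "claim the whole block if over half of it is free" step corresponds to the pages >= (1 << (pageblock_order - 1)) test in steal_suitable_fallback() above.
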
diff --git a/mm/page_io.c b/mm/page_io.c
index e6045804c8d8..6424869e275e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -20,8 +20,8 @@
20#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/frontswap.h> 22#include <linux/frontswap.h>
23#include <linux/aio.h>
24#include <linux/blkdev.h> 23#include <linux/blkdev.h>
24#include <linux/uio.h>
25#include <asm/pgtable.h> 25#include <asm/pgtable.h>
26 26
27static struct bio *get_swap_bio(gfp_t gfp_flags, 27static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -274,13 +274,10 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
274 iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); 274 iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE);
275 init_sync_kiocb(&kiocb, swap_file); 275 init_sync_kiocb(&kiocb, swap_file);
276 kiocb.ki_pos = page_file_offset(page); 276 kiocb.ki_pos = page_file_offset(page);
277 kiocb.ki_nbytes = PAGE_SIZE;
278 277
279 set_page_writeback(page); 278 set_page_writeback(page);
280 unlock_page(page); 279 unlock_page(page);
281 ret = mapping->a_ops->direct_IO(ITER_BVEC | WRITE, 280 ret = mapping->a_ops->direct_IO(&kiocb, &from, kiocb.ki_pos);
282 &kiocb, &from,
283 kiocb.ki_pos);
284 if (ret == PAGE_SIZE) { 281 if (ret == PAGE_SIZE) {
285 count_vm_event(PSWPOUT); 282 count_vm_event(PSWPOUT);
286 ret = 0; 283 ret = 0;
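
__swap_writepage() above now describes the page with a single bio_vec-backed iov_iter and hands it to ->direct_IO() together with the kiocb, instead of passing the rw flag and ki_nbytes separately. A loose userspace analogue, assuming only ordinary glibc interfaces (open, pwritev), of describing one page-sized buffer with an iovec and letting a vectored writer consume it at a given offset:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

#define PAGE_SZ 4096

int main(void)
{
        char *page = malloc(PAGE_SZ);
        struct iovec bv;
        off_t pos = 3 * PAGE_SZ;        /* plays the role of kiocb.ki_pos */
        ssize_t ret;
        int fd;

        if (!page)
                return 1;
        memset(page, 'x', PAGE_SZ);

        fd = open("swapfile.img", O_CREAT | O_WRONLY, 0600);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* One iovec describes the whole page; the length travels with the
         * vector, so the writer needs no separate nbytes argument. */
        bv.iov_base = page;
        bv.iov_len = PAGE_SZ;

        ret = pwritev(fd, &bv, 1, pos);
        printf("wrote %zd bytes at offset %lld\n", ret, (long long)pos);

        close(fd);
        free(page);
        return ret == PAGE_SZ ? 0 : 1;
}
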
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 72f5ac381ab3..755a42c76eb4 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -103,6 +103,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype)
103 103
104 if (!is_migrate_isolate_page(buddy)) { 104 if (!is_migrate_isolate_page(buddy)) {
105 __isolate_free_page(page, order); 105 __isolate_free_page(page, order);
106 kernel_map_pages(page, (1 << order), 1);
106 set_page_refcounted(page); 107 set_page_refcounted(page);
107 isolated_page = page; 108 isolated_page = page;
108 } 109 }
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 75c1f2878519..29f2f8b853ae 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -265,8 +265,15 @@ int walk_page_range(unsigned long start, unsigned long end,
265 vma = vma->vm_next; 265 vma = vma->vm_next;
266 266
267 err = walk_page_test(start, next, walk); 267 err = walk_page_test(start, next, walk);
268 if (err > 0) 268 if (err > 0) {
269 /*
270 * positive return values are purely for
271 * controlling the pagewalk, so should never
272 * be passed to the callers.
273 */
274 err = 0;
269 continue; 275 continue;
276 }
270 if (err < 0) 277 if (err < 0)
271 break; 278 break;
272 } 279 }
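
The pagewalk.c hunk above clamps positive return values from walk_page_test() so they steer the walk (skip the VMA) without leaking to callers. A small userspace model of that convention, with a made-up region walker standing in for walk_page_range():

#include <stdio.h>

struct region { const char *name; int skip; };

/* Positive return: control-flow only, meaning "skip this region". */
static int test_region(const struct region *r)
{
        return r->skip ? 1 : 0;
}

static int walk_regions(const struct region *r, int n)
{
        int err = 0;

        for (int i = 0; i < n; i++) {
                err = test_region(&r[i]);
                if (err > 0) {
                        err = 0;        /* never pass control values upward */
                        continue;
                }
                if (err < 0)
                        break;
                printf("visited %s\n", r[i].name);
        }
        return err;
}

int main(void)
{
        struct region regions[] = {
                { "vma A", 0 }, { "vma B", 1 }, { "vma C", 0 },
        };
        return walk_regions(regions, 3);
}
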
diff --git a/mm/percpu.c b/mm/percpu.c
index 73c97a5f4495..dfd02484e8de 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1310,7 +1310,7 @@ bool is_kernel_percpu_address(unsigned long addr)
1310 * and, from the second one, the backing allocator (currently either vm or 1310 * and, from the second one, the backing allocator (currently either vm or
1311 * km) provides translation. 1311 * km) provides translation.
1312 * 1312 *
1313 * The addr can be tranlated simply without checking if it falls into the 1313 * The addr can be translated simply without checking if it falls into the
1314 * first chunk. But the current code reflects better how percpu allocator 1314 * first chunk. But the current code reflects better how percpu allocator
1315 * actually works, and the verification can discover both bugs in percpu 1315 * actually works, and the verification can discover both bugs in percpu
1316 * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current 1316 * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
@@ -1762,7 +1762,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
1762 * and other parameters considering needed percpu size, allocation 1762 * and other parameters considering needed percpu size, allocation
1763 * atom size and distances between CPUs. 1763 * atom size and distances between CPUs.
1764 * 1764 *
1765 * Groups are always mutliples of atom size and CPUs which are of 1765 * Groups are always multiples of atom size and CPUs which are of
1766 * LOCAL_DISTANCE both ways are grouped together and share space for 1766 * LOCAL_DISTANCE both ways are grouped together and share space for
1767 * units in the same group. The returned configuration is guaranteed 1767 * units in the same group. The returned configuration is guaranteed
1768 * to have CPUs on different nodes on different groups and >=75% usage 1768 * to have CPUs on different nodes on different groups and >=75% usage
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index b1597690530c..e88d071648c2 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -257,22 +257,18 @@ static ssize_t process_vm_rw(pid_t pid,
257 struct iovec *iov_r = iovstack_r; 257 struct iovec *iov_r = iovstack_r;
258 struct iov_iter iter; 258 struct iov_iter iter;
259 ssize_t rc; 259 ssize_t rc;
260 int dir = vm_write ? WRITE : READ;
260 261
261 if (flags != 0) 262 if (flags != 0)
262 return -EINVAL; 263 return -EINVAL;
263 264
264 /* Check iovecs */ 265 /* Check iovecs */
265 if (vm_write) 266 rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
266 rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, 267 if (rc < 0)
267 iovstack_l, &iov_l); 268 return rc;
268 else 269 if (!iov_iter_count(&iter))
269 rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
270 iovstack_l, &iov_l);
271 if (rc <= 0)
272 goto free_iovecs; 270 goto free_iovecs;
273 271
274 iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc);
275
276 rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, 272 rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
277 iovstack_r, &iov_r); 273 iovstack_r, &iov_r);
278 if (rc <= 0) 274 if (rc <= 0)
@@ -283,8 +279,7 @@ static ssize_t process_vm_rw(pid_t pid,
283free_iovecs: 279free_iovecs:
284 if (iov_r != iovstack_r) 280 if (iov_r != iovstack_r)
285 kfree(iov_r); 281 kfree(iov_r);
286 if (iov_l != iovstack_l) 282 kfree(iov_l);
287 kfree(iov_l);
288 283
289 return rc; 284 return rc;
290} 285}
@@ -320,21 +315,16 @@ compat_process_vm_rw(compat_pid_t pid,
320 struct iovec *iov_r = iovstack_r; 315 struct iovec *iov_r = iovstack_r;
321 struct iov_iter iter; 316 struct iov_iter iter;
322 ssize_t rc = -EFAULT; 317 ssize_t rc = -EFAULT;
318 int dir = vm_write ? WRITE : READ;
323 319
324 if (flags != 0) 320 if (flags != 0)
325 return -EINVAL; 321 return -EINVAL;
326 322
327 if (vm_write) 323 rc = compat_import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter);
328 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, 324 if (rc < 0)
329 UIO_FASTIOV, iovstack_l, 325 return rc;
330 &iov_l); 326 if (!iov_iter_count(&iter))
331 else
332 rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
333 UIO_FASTIOV, iovstack_l,
334 &iov_l);
335 if (rc <= 0)
336 goto free_iovecs; 327 goto free_iovecs;
337 iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc);
338 rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, 328 rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
339 UIO_FASTIOV, iovstack_r, 329 UIO_FASTIOV, iovstack_r,
340 &iov_r); 330 &iov_r);
@@ -346,8 +336,7 @@ compat_process_vm_rw(compat_pid_t pid,
346free_iovecs: 336free_iovecs:
347 if (iov_r != iovstack_r) 337 if (iov_r != iovstack_r)
348 kfree(iov_r); 338 kfree(iov_r);
349 if (iov_l != iovstack_l) 339 kfree(iov_l);
350 kfree(iov_l);
351 return rc; 340 return rc;
352} 341}
353 342
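
The process_vm_access.c hunks switch the local iovec setup to import_iovec()/compat_import_iovec(), whose contract lets the caller free the iovec unconditionally: the returned pointer is either NULL (the on-stack fast array was enough, or the call failed) or a heap allocation. A userspace sketch of that ownership convention, with a made-up import_segments() helper in place of import_iovec():

#include <stdio.h>
#include <stdlib.h>

#define FAST_SEGS 8

/* Either fill the caller's fast array (and leave *heap NULL) or allocate a
 * bigger array (and return it via *heap), so the caller can always call
 * free(*heap) without checking which case happened. */
static int import_segments(int nr, int fast[FAST_SEGS], int **heap)
{
        int *p = fast;

        *heap = NULL;
        if (nr > FAST_SEGS) {
                p = calloc(nr, sizeof(*p));
                if (!p)
                        return -1;
                *heap = p;
        }
        for (int i = 0; i < nr; i++)
                p[i] = i;               /* pretend to copy/validate segments */
        return 0;
}

int main(void)
{
        int fast[FAST_SEGS];
        int *heap;

        if (import_segments(20, fast, &heap) < 0)
                return 1;
        printf("using %s array\n", heap ? "heap" : "fast");
        free(heap);                     /* safe: free(NULL) is a no-op */
        return 0;
}

This is why the hunks above keep the "if (iov_r != iovstack_r)" check for the side still using rw_copy_check_uvector(), but drop it for iov_l.
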
diff --git a/mm/rmap.c b/mm/rmap.c
index 5e3e09081164..24dd3f9fee27 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -287,6 +287,13 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
287 return 0; 287 return 0;
288 288
289 enomem_failure: 289 enomem_failure:
290 /*
 291 * dst->anon_vma is dropped here; otherwise its degree could be incorrectly
292 * decremented in unlink_anon_vmas().
293 * We can safely do this because callers of anon_vma_clone() don't care
294 * about dst->anon_vma if anon_vma_clone() failed.
295 */
296 dst->anon_vma = NULL;
290 unlink_anon_vmas(dst); 297 unlink_anon_vmas(dst);
291 return -ENOMEM; 298 return -ENOMEM;
292} 299}
@@ -449,7 +456,7 @@ struct anon_vma *page_get_anon_vma(struct page *page)
449 unsigned long anon_mapping; 456 unsigned long anon_mapping;
450 457
451 rcu_read_lock(); 458 rcu_read_lock();
452 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); 459 anon_mapping = (unsigned long)READ_ONCE(page->mapping);
453 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 460 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
454 goto out; 461 goto out;
455 if (!page_mapped(page)) 462 if (!page_mapped(page))
@@ -493,14 +500,14 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page)
493 unsigned long anon_mapping; 500 unsigned long anon_mapping;
494 501
495 rcu_read_lock(); 502 rcu_read_lock();
496 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); 503 anon_mapping = (unsigned long)READ_ONCE(page->mapping);
497 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) 504 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
498 goto out; 505 goto out;
499 if (!page_mapped(page)) 506 if (!page_mapped(page))
500 goto out; 507 goto out;
501 508
502 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 509 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
503 root_anon_vma = ACCESS_ONCE(anon_vma->root); 510 root_anon_vma = READ_ONCE(anon_vma->root);
504 if (down_read_trylock(&root_anon_vma->rwsem)) { 511 if (down_read_trylock(&root_anon_vma->rwsem)) {
505 /* 512 /*
506 * If the page is still mapped, then this anon_vma is still 513 * If the page is still mapped, then this anon_vma is still
diff --git a/mm/shmem.c b/mm/shmem.c
index cf2d0ca010bc..de981370fbc5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -31,7 +31,7 @@
31#include <linux/mm.h> 31#include <linux/mm.h>
32#include <linux/export.h> 32#include <linux/export.h>
33#include <linux/swap.h> 33#include <linux/swap.h>
34#include <linux/aio.h> 34#include <linux/uio.h>
35 35
36static struct vfsmount *shm_mnt; 36static struct vfsmount *shm_mnt;
37 37
@@ -544,7 +544,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
544 544
545static int shmem_setattr(struct dentry *dentry, struct iattr *attr) 545static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
546{ 546{
547 struct inode *inode = dentry->d_inode; 547 struct inode *inode = d_inode(dentry);
548 struct shmem_inode_info *info = SHMEM_I(inode); 548 struct shmem_inode_info *info = SHMEM_I(inode);
549 int error; 549 int error;
550 550
@@ -2274,7 +2274,7 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
2274 */ 2274 */
2275static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) 2275static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
2276{ 2276{
2277 struct inode *inode = old_dentry->d_inode; 2277 struct inode *inode = d_inode(old_dentry);
2278 int ret; 2278 int ret;
2279 2279
2280 /* 2280 /*
@@ -2298,7 +2298,7 @@ out:
2298 2298
2299static int shmem_unlink(struct inode *dir, struct dentry *dentry) 2299static int shmem_unlink(struct inode *dir, struct dentry *dentry)
2300{ 2300{
2301 struct inode *inode = dentry->d_inode; 2301 struct inode *inode = d_inode(dentry);
2302 2302
2303 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) 2303 if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
2304 shmem_free_inode(inode->i_sb); 2304 shmem_free_inode(inode->i_sb);
@@ -2315,7 +2315,7 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
2315 if (!simple_empty(dentry)) 2315 if (!simple_empty(dentry))
2316 return -ENOTEMPTY; 2316 return -ENOTEMPTY;
2317 2317
2318 drop_nlink(dentry->d_inode); 2318 drop_nlink(d_inode(dentry));
2319 drop_nlink(dir); 2319 drop_nlink(dir);
2320 return shmem_unlink(dir, dentry); 2320 return shmem_unlink(dir, dentry);
2321} 2321}
@@ -2336,8 +2336,8 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru
2336 } 2336 }
2337 old_dir->i_ctime = old_dir->i_mtime = 2337 old_dir->i_ctime = old_dir->i_mtime =
2338 new_dir->i_ctime = new_dir->i_mtime = 2338 new_dir->i_ctime = new_dir->i_mtime =
2339 old_dentry->d_inode->i_ctime = 2339 d_inode(old_dentry)->i_ctime =
2340 new_dentry->d_inode->i_ctime = CURRENT_TIME; 2340 d_inode(new_dentry)->i_ctime = CURRENT_TIME;
2341 2341
2342 return 0; 2342 return 0;
2343} 2343}
@@ -2376,7 +2376,7 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
2376 */ 2376 */
2377static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) 2377static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
2378{ 2378{
2379 struct inode *inode = old_dentry->d_inode; 2379 struct inode *inode = d_inode(old_dentry);
2380 int they_are_dirs = S_ISDIR(inode->i_mode); 2380 int they_are_dirs = S_ISDIR(inode->i_mode);
2381 2381
2382 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) 2382 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
@@ -2396,10 +2396,10 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
2396 return error; 2396 return error;
2397 } 2397 }
2398 2398
2399 if (new_dentry->d_inode) { 2399 if (d_really_is_positive(new_dentry)) {
2400 (void) shmem_unlink(new_dir, new_dentry); 2400 (void) shmem_unlink(new_dir, new_dentry);
2401 if (they_are_dirs) { 2401 if (they_are_dirs) {
2402 drop_nlink(new_dentry->d_inode); 2402 drop_nlink(d_inode(new_dentry));
2403 drop_nlink(old_dir); 2403 drop_nlink(old_dir);
2404 } 2404 }
2405 } else if (they_are_dirs) { 2405 } else if (they_are_dirs) {
@@ -2476,14 +2476,14 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2476 2476
2477static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) 2477static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
2478{ 2478{
2479 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); 2479 nd_set_link(nd, SHMEM_I(d_inode(dentry))->symlink);
2480 return NULL; 2480 return NULL;
2481} 2481}
2482 2482
2483static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 2483static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
2484{ 2484{
2485 struct page *page = NULL; 2485 struct page *page = NULL;
2486 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 2486 int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL);
2487 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); 2487 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
2488 if (page) 2488 if (page)
2489 unlock_page(page); 2489 unlock_page(page);
@@ -2574,7 +2574,7 @@ static int shmem_xattr_validate(const char *name)
2574static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, 2574static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2575 void *buffer, size_t size) 2575 void *buffer, size_t size)
2576{ 2576{
2577 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2577 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
2578 int err; 2578 int err;
2579 2579
2580 /* 2580 /*
@@ -2595,7 +2595,7 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2595static int shmem_setxattr(struct dentry *dentry, const char *name, 2595static int shmem_setxattr(struct dentry *dentry, const char *name,
2596 const void *value, size_t size, int flags) 2596 const void *value, size_t size, int flags)
2597{ 2597{
2598 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2598 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
2599 int err; 2599 int err;
2600 2600
2601 /* 2601 /*
@@ -2615,7 +2615,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
2615 2615
2616static int shmem_removexattr(struct dentry *dentry, const char *name) 2616static int shmem_removexattr(struct dentry *dentry, const char *name)
2617{ 2617{
2618 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2618 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
2619 int err; 2619 int err;
2620 2620
2621 /* 2621 /*
@@ -2635,7 +2635,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
2635 2635
2636static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) 2636static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2637{ 2637{
2638 struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); 2638 struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
2639 return simple_xattr_list(&info->xattrs, buffer, size); 2639 return simple_xattr_list(&info->xattrs, buffer, size);
2640} 2640}
2641#endif /* CONFIG_TMPFS_XATTR */ 2641#endif /* CONFIG_TMPFS_XATTR */
@@ -3118,8 +3118,6 @@ static const struct file_operations shmem_file_operations = {
3118 .mmap = shmem_mmap, 3118 .mmap = shmem_mmap,
3119#ifdef CONFIG_TMPFS 3119#ifdef CONFIG_TMPFS
3120 .llseek = shmem_file_llseek, 3120 .llseek = shmem_file_llseek,
3121 .read = new_sync_read,
3122 .write = new_sync_write,
3123 .read_iter = shmem_file_read_iter, 3121 .read_iter = shmem_file_read_iter,
3124 .write_iter = generic_file_write_iter, 3122 .write_iter = generic_file_write_iter,
3125 .fsync = noop_fsync, 3123 .fsync = noop_fsync,
diff --git a/mm/slab.c b/mm/slab.c
index c4b89eaf4c96..7eb38dd1cefa 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -857,6 +857,11 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
857 return NULL; 857 return NULL;
858} 858}
859 859
860static inline gfp_t gfp_exact_node(gfp_t flags)
861{
862 return flags;
863}
864
860#else /* CONFIG_NUMA */ 865#else /* CONFIG_NUMA */
861 866
862static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); 867static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
@@ -1023,6 +1028,15 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1023 1028
1024 return __cache_free_alien(cachep, objp, node, page_node); 1029 return __cache_free_alien(cachep, objp, node, page_node);
1025} 1030}
1031
1032/*
1033 * Construct gfp mask to allocate from a specific node but do not invoke reclaim
1034 * or warn about failures.
1035 */
1036static inline gfp_t gfp_exact_node(gfp_t flags)
1037{
1038 return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT;
1039}
1026#endif 1040#endif
1027 1041
1028/* 1042/*
@@ -2825,7 +2839,7 @@ alloc_done:
2825 if (unlikely(!ac->avail)) { 2839 if (unlikely(!ac->avail)) {
2826 int x; 2840 int x;
2827force_grow: 2841force_grow:
2828 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 2842 x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
2829 2843
2830 /* cache_grow can reenable interrupts, then ac could change. */ 2844 /* cache_grow can reenable interrupts, then ac could change. */
2831 ac = cpu_cache_get(cachep); 2845 ac = cpu_cache_get(cachep);
@@ -3019,7 +3033,7 @@ retry:
3019 get_node(cache, nid) && 3033 get_node(cache, nid) &&
3020 get_node(cache, nid)->free_objects) { 3034 get_node(cache, nid)->free_objects) {
3021 obj = ____cache_alloc_node(cache, 3035 obj = ____cache_alloc_node(cache,
3022 flags | GFP_THISNODE, nid); 3036 gfp_exact_node(flags), nid);
3023 if (obj) 3037 if (obj)
3024 break; 3038 break;
3025 } 3039 }
@@ -3047,7 +3061,7 @@ retry:
3047 nid = page_to_nid(page); 3061 nid = page_to_nid(page);
3048 if (cache_grow(cache, flags, nid, page)) { 3062 if (cache_grow(cache, flags, nid, page)) {
3049 obj = ____cache_alloc_node(cache, 3063 obj = ____cache_alloc_node(cache,
3050 flags | GFP_THISNODE, nid); 3064 gfp_exact_node(flags), nid);
3051 if (!obj) 3065 if (!obj)
3052 /* 3066 /*
3053 * Another processor may allocate the 3067 * Another processor may allocate the
@@ -3118,7 +3132,7 @@ retry:
3118 3132
3119must_grow: 3133must_grow:
3120 spin_unlock(&n->list_lock); 3134 spin_unlock(&n->list_lock);
3121 x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); 3135 x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL);
3122 if (x) 3136 if (x)
3123 goto retry; 3137 goto retry;
3124 3138
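
gfp_exact_node() above composes the allocation flags so the request is pinned to one node, stays quiet on failure and never enters reclaim. A standalone sketch of the same flag arithmetic; the bit values here are invented for illustration and are not the real GFP flag values:

#include <stdio.h>

#define GFP_WAIT      0x01u
#define GFP_IO        0x02u
#define GFP_FS        0x04u
#define GFP_THISNODE  0x10u
#define GFP_NOWARN    0x20u

/* Pin to the node, suppress warnings, strip the "may sleep/reclaim" bit. */
static unsigned int gfp_exact_node(unsigned int flags)
{
        return (flags | GFP_THISNODE | GFP_NOWARN) & ~GFP_WAIT;
}

int main(void)
{
        unsigned int kernel = GFP_WAIT | GFP_IO | GFP_FS;  /* roughly GFP_KERNEL */

        printf("in  0x%02x\n", kernel);
        printf("out 0x%02x\n", gfp_exact_node(kernel));    /* IO|FS|THISNODE|NOWARN */
        return 0;
}
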
diff --git a/mm/slob.c b/mm/slob.c
index 94a7fede6d48..4765f65019c7 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -532,7 +532,7 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
532 return 0; 532 return 0;
533} 533}
534 534
535void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) 535static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
536{ 536{
537 void *b; 537 void *b;
538 538
@@ -558,7 +558,6 @@ void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
558 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); 558 kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
559 return b; 559 return b;
560} 560}
561EXPORT_SYMBOL(slob_alloc_node);
562 561
563void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 562void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
564{ 563{
diff --git a/mm/slub.c b/mm/slub.c
index 6832c4eab104..54c0876b43d5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -374,7 +374,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
374 if (cmpxchg_double(&page->freelist, &page->counters, 374 if (cmpxchg_double(&page->freelist, &page->counters,
375 freelist_old, counters_old, 375 freelist_old, counters_old,
376 freelist_new, counters_new)) 376 freelist_new, counters_new))
377 return 1; 377 return true;
378 } else 378 } else
379#endif 379#endif
380 { 380 {
@@ -384,7 +384,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
384 page->freelist = freelist_new; 384 page->freelist = freelist_new;
385 set_page_slub_counters(page, counters_new); 385 set_page_slub_counters(page, counters_new);
386 slab_unlock(page); 386 slab_unlock(page);
387 return 1; 387 return true;
388 } 388 }
389 slab_unlock(page); 389 slab_unlock(page);
390 } 390 }
@@ -396,7 +396,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
396 pr_info("%s %s: cmpxchg double redo ", n, s->name); 396 pr_info("%s %s: cmpxchg double redo ", n, s->name);
397#endif 397#endif
398 398
399 return 0; 399 return false;
400} 400}
401 401
402static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, 402static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
@@ -410,7 +410,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
410 if (cmpxchg_double(&page->freelist, &page->counters, 410 if (cmpxchg_double(&page->freelist, &page->counters,
411 freelist_old, counters_old, 411 freelist_old, counters_old,
412 freelist_new, counters_new)) 412 freelist_new, counters_new))
413 return 1; 413 return true;
414 } else 414 } else
415#endif 415#endif
416 { 416 {
@@ -424,7 +424,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
424 set_page_slub_counters(page, counters_new); 424 set_page_slub_counters(page, counters_new);
425 slab_unlock(page); 425 slab_unlock(page);
426 local_irq_restore(flags); 426 local_irq_restore(flags);
427 return 1; 427 return true;
428 } 428 }
429 slab_unlock(page); 429 slab_unlock(page);
430 local_irq_restore(flags); 430 local_irq_restore(flags);
@@ -437,7 +437,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
437 pr_info("%s %s: cmpxchg double redo ", n, s->name); 437 pr_info("%s %s: cmpxchg double redo ", n, s->name);
438#endif 438#endif
439 439
440 return 0; 440 return false;
441} 441}
442 442
443#ifdef CONFIG_SLUB_DEBUG 443#ifdef CONFIG_SLUB_DEBUG
@@ -1137,15 +1137,6 @@ static int __init setup_slub_debug(char *str)
1137 */ 1137 */
1138 goto check_slabs; 1138 goto check_slabs;
1139 1139
1140 if (tolower(*str) == 'o') {
1141 /*
1142 * Avoid enabling debugging on caches if its minimum order
1143 * would increase as a result.
1144 */
1145 disable_higher_order_debug = 1;
1146 goto out;
1147 }
1148
1149 slub_debug = 0; 1140 slub_debug = 0;
1150 if (*str == '-') 1141 if (*str == '-')
1151 /* 1142 /*
@@ -1176,6 +1167,13 @@ static int __init setup_slub_debug(char *str)
1176 case 'a': 1167 case 'a':
1177 slub_debug |= SLAB_FAILSLAB; 1168 slub_debug |= SLAB_FAILSLAB;
1178 break; 1169 break;
1170 case 'o':
1171 /*
1172 * Avoid enabling debugging on caches if its minimum
1173 * order would increase as a result.
1174 */
1175 disable_higher_order_debug = 1;
1176 break;
1179 default: 1177 default:
1180 pr_err("slub_debug option '%c' unknown. skipped\n", 1178 pr_err("slub_debug option '%c' unknown. skipped\n",
1181 *str); 1179 *str);
@@ -2449,7 +2447,8 @@ redo:
2449 do { 2447 do {
2450 tid = this_cpu_read(s->cpu_slab->tid); 2448 tid = this_cpu_read(s->cpu_slab->tid);
2451 c = raw_cpu_ptr(s->cpu_slab); 2449 c = raw_cpu_ptr(s->cpu_slab);
2452 } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); 2450 } while (IS_ENABLED(CONFIG_PREEMPT) &&
2451 unlikely(tid != READ_ONCE(c->tid)));
2453 2452
2454 /* 2453 /*
2455 * Irqless object alloc/free algorithm used here depends on sequence 2454 * Irqless object alloc/free algorithm used here depends on sequence
@@ -2718,7 +2717,8 @@ redo:
2718 do { 2717 do {
2719 tid = this_cpu_read(s->cpu_slab->tid); 2718 tid = this_cpu_read(s->cpu_slab->tid);
2720 c = raw_cpu_ptr(s->cpu_slab); 2719 c = raw_cpu_ptr(s->cpu_slab);
2721 } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); 2720 } while (IS_ENABLED(CONFIG_PREEMPT) &&
2721 unlikely(tid != READ_ONCE(c->tid)));
2722 2722
2723 /* Same with comment on barrier() in slab_alloc_node() */ 2723 /* Same with comment on barrier() in slab_alloc_node() */
2724 barrier(); 2724 barrier();
@@ -4277,7 +4277,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4277 int node; 4277 int node;
4278 struct page *page; 4278 struct page *page;
4279 4279
4280 page = ACCESS_ONCE(c->page); 4280 page = READ_ONCE(c->page);
4281 if (!page) 4281 if (!page)
4282 continue; 4282 continue;
4283 4283
@@ -4292,7 +4292,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4292 total += x; 4292 total += x;
4293 nodes[node] += x; 4293 nodes[node] += x;
4294 4294
4295 page = ACCESS_ONCE(c->partial); 4295 page = READ_ONCE(c->partial);
4296 if (page) { 4296 if (page) {
4297 node = page_to_nid(page); 4297 node = page_to_nid(page);
4298 if (flags & SO_TOTAL) 4298 if (flags & SO_TOTAL)
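
The slub.c hunks wrap the re-read of c->tid in READ_ONCE() so the cpu_slab/tid pair is re-fetched until it is consistent under preemption. A rough userspace model of that retry-until-consistent snapshot, using C11 atomics in place of READ_ONCE():

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long tid;       /* generation counter */
static _Atomic int cpu_slab_data;       /* the data it guards */

static int read_consistent_snapshot(void)
{
        unsigned long t;
        int data;

        do {
                t = atomic_load(&tid);
                data = atomic_load(&cpu_slab_data);
                /* If the generation changed while we were reading, a
                 * concurrent update may have swapped the data, so retry.
                 * The atomic load keeps the compiler from caching the
                 * re-read, which is what READ_ONCE() ensures above. */
        } while (t != atomic_load(&tid));

        return data;
}

int main(void)
{
        atomic_store(&cpu_slab_data, 42);
        atomic_store(&tid, 1);
        printf("snapshot: %d\n", read_consistent_snapshot());
        return 0;
}
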
diff --git a/mm/swap.c b/mm/swap.c
index cd3a5e64cea9..a7251a8ed532 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,6 +31,7 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/uio.h> 33#include <linux/uio.h>
34#include <linux/hugetlb.h>
34 35
35#include "internal.h" 36#include "internal.h"
36 37
@@ -42,7 +43,7 @@ int page_cluster;
42 43
43static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); 44static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
44static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 45static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
45static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); 46static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
46 47
47/* 48/*
48 * This path almost never happens for VM activity - pages are normally 49 * This path almost never happens for VM activity - pages are normally
@@ -75,7 +76,14 @@ static void __put_compound_page(struct page *page)
75{ 76{
76 compound_page_dtor *dtor; 77 compound_page_dtor *dtor;
77 78
78 __page_cache_release(page); 79 /*
80 * __page_cache_release() is supposed to be called for thp, not for
 81 * hugetlb. This is because a hugetlb page never has PageLRU set
 82 * (it is never put on any LRU list) and no memcg routines should
 83 * be called for hugetlb (it has a separate hugetlb_cgroup).
84 */
85 if (!PageHuge(page))
86 __page_cache_release(page);
79 dtor = get_compound_page_dtor(page); 87 dtor = get_compound_page_dtor(page);
80 (*dtor)(page); 88 (*dtor)(page);
81} 89}
@@ -743,7 +751,7 @@ void lru_cache_add_active_or_unevictable(struct page *page,
743 * be write it out by flusher threads as this is much more effective 751 * be write it out by flusher threads as this is much more effective
744 * than the single-page writeout from reclaim. 752 * than the single-page writeout from reclaim.
745 */ 753 */
746static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, 754static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
747 void *arg) 755 void *arg)
748{ 756{
749 int lru, file; 757 int lru, file;
@@ -811,36 +819,36 @@ void lru_add_drain_cpu(int cpu)
811 local_irq_restore(flags); 819 local_irq_restore(flags);
812 } 820 }
813 821
814 pvec = &per_cpu(lru_deactivate_pvecs, cpu); 822 pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
815 if (pagevec_count(pvec)) 823 if (pagevec_count(pvec))
816 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); 824 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
817 825
818 activate_page_drain(cpu); 826 activate_page_drain(cpu);
819} 827}
820 828
821/** 829/**
822 * deactivate_page - forcefully deactivate a page 830 * deactivate_file_page - forcefully deactivate a file page
823 * @page: page to deactivate 831 * @page: page to deactivate
824 * 832 *
825 * This function hints the VM that @page is a good reclaim candidate, 833 * This function hints the VM that @page is a good reclaim candidate,
826 * for example if its invalidation fails due to the page being dirty 834 * for example if its invalidation fails due to the page being dirty
827 * or under writeback. 835 * or under writeback.
828 */ 836 */
829void deactivate_page(struct page *page) 837void deactivate_file_page(struct page *page)
830{ 838{
831 /* 839 /*
832 * In a workload with many unevictable page such as mprotect, unevictable 840 * In a workload with many unevictable pages such as mprotect,
833 * page deactivation for accelerating reclaim is pointless. 841 * unevictable page deactivation for accelerating reclaim is pointless.
834 */ 842 */
835 if (PageUnevictable(page)) 843 if (PageUnevictable(page))
836 return; 844 return;
837 845
838 if (likely(get_page_unless_zero(page))) { 846 if (likely(get_page_unless_zero(page))) {
839 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); 847 struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
840 848
841 if (!pagevec_add(pvec, page)) 849 if (!pagevec_add(pvec, page))
842 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); 850 pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
843 put_cpu_var(lru_deactivate_pvecs); 851 put_cpu_var(lru_deactivate_file_pvecs);
844 } 852 }
845} 853}
846 854
@@ -872,7 +880,7 @@ void lru_add_drain_all(void)
872 880
873 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || 881 if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
874 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || 882 pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
875 pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || 883 pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
876 need_activate_page_drain(cpu)) { 884 need_activate_page_drain(cpu)) {
877 INIT_WORK(work, lru_add_drain_per_cpu); 885 INIT_WORK(work, lru_add_drain_per_cpu);
878 schedule_work_on(cpu, work); 886 schedule_work_on(cpu, work);
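
deactivate_file_page() above keeps the per-cpu pagevec pattern: pages are queued in a small per-cpu vector and only moved on the LRU in bulk once the vector fills or is drained. A simplified userspace model of that batching (a fixed-size vector of 14 entries, no per-cpu or locking concerns):

#include <stdio.h>

#define PAGEVEC_SIZE 14

struct pagevec {
        int nr;
        int pages[PAGEVEC_SIZE];
};

static void move_batch(struct pagevec *pvec)
{
        printf("moving %d pages in one LRU pass\n", pvec->nr);
        pvec->nr = 0;
}

/* Returns the space left after adding; zero tells the caller to flush. */
static int pagevec_add(struct pagevec *pvec, int page)
{
        pvec->pages[pvec->nr++] = page;
        return PAGEVEC_SIZE - pvec->nr;
}

static void deactivate_file_page(struct pagevec *pvec, int page)
{
        if (!pagevec_add(pvec, page))
                move_batch(pvec);
}

int main(void)
{
        struct pagevec pvec = { 0 };

        for (int page = 0; page < 40; page++)
                deactivate_file_page(&pvec, page);
        if (pvec.nr)
                move_batch(&pvec);      /* drain the remainder */
        return 0;
}
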
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 405923f77334..8bc8e66138da 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -390,7 +390,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
390 unsigned int pages, max_pages, last_ra; 390 unsigned int pages, max_pages, last_ra;
391 static atomic_t last_readahead_pages; 391 static atomic_t last_readahead_pages;
392 392
393 max_pages = 1 << ACCESS_ONCE(page_cluster); 393 max_pages = 1 << READ_ONCE(page_cluster);
394 if (max_pages <= 1) 394 if (max_pages <= 1)
395 return 1; 395 return 1;
396 396
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 63f55ccb9b26..a7e72103f23b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1312,7 +1312,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1312 else 1312 else
1313 continue; 1313 continue;
1314 } 1314 }
1315 count = ACCESS_ONCE(si->swap_map[i]); 1315 count = READ_ONCE(si->swap_map[i]);
1316 if (count && swap_count(count) != SWAP_MAP_BAD) 1316 if (count && swap_count(count) != SWAP_MAP_BAD)
1317 break; 1317 break;
1318 } 1318 }
diff --git a/mm/truncate.c b/mm/truncate.c
index ddec5a5966d7..66af9031fae8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -93,35 +93,6 @@ void do_invalidatepage(struct page *page, unsigned int offset,
93} 93}
94 94
95/* 95/*
96 * This cancels just the dirty bit on the kernel page itself, it
97 * does NOT actually remove dirty bits on any mmap's that may be
98 * around. It also leaves the page tagged dirty, so any sync
99 * activity will still find it on the dirty lists, and in particular,
100 * clear_page_dirty_for_io() will still look at the dirty bits in
101 * the VM.
102 *
103 * Doing this should *normally* only ever be done when a page
104 * is truncated, and is not actually mapped anywhere at all. However,
105 * fs/buffer.c does this when it notices that somebody has cleaned
106 * out all the buffers on a page without actually doing it through
107 * the VM. Can you say "ext3 is horribly ugly"? Tought you could.
108 */
109void cancel_dirty_page(struct page *page, unsigned int account_size)
110{
111 if (TestClearPageDirty(page)) {
112 struct address_space *mapping = page->mapping;
113 if (mapping && mapping_cap_account_dirty(mapping)) {
114 dec_zone_page_state(page, NR_FILE_DIRTY);
115 dec_bdi_stat(inode_to_bdi(mapping->host),
116 BDI_RECLAIMABLE);
117 if (account_size)
118 task_io_account_cancelled_write(account_size);
119 }
120 }
121}
122EXPORT_SYMBOL(cancel_dirty_page);
123
124/*
125 * If truncate cannot remove the fs-private metadata from the page, the page 96 * If truncate cannot remove the fs-private metadata from the page, the page
126 * becomes orphaned. It will be left on the LRU and may even be mapped into 97 * becomes orphaned. It will be left on the LRU and may even be mapped into
127 * user pagetables if we're racing with filemap_fault(). 98 * user pagetables if we're racing with filemap_fault().
@@ -140,7 +111,13 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
140 if (page_has_private(page)) 111 if (page_has_private(page))
141 do_invalidatepage(page, 0, PAGE_CACHE_SIZE); 112 do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
142 113
143 cancel_dirty_page(page, PAGE_CACHE_SIZE); 114 /*
115 * Some filesystems seem to re-dirty the page even after
116 * the VM has canceled the dirty bit (eg ext3 journaling).
 117 * Hence the dirty accounting check is placed after invalidation.
118 */
119 if (TestClearPageDirty(page))
120 account_page_cleaned(page, mapping);
144 121
145 ClearPageMappedToDisk(page); 122 ClearPageMappedToDisk(page);
146 delete_from_page_cache(page); 123 delete_from_page_cache(page);
@@ -513,7 +490,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
513 * of interest and try to speed up its reclaim. 490 * of interest and try to speed up its reclaim.
514 */ 491 */
515 if (!ret) 492 if (!ret)
516 deactivate_page(page); 493 deactivate_file_page(page);
517 count += ret; 494 count += ret;
518 } 495 }
519 pagevec_remove_exceptionals(&pvec); 496 pagevec_remove_exceptionals(&pvec);
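
truncate_complete_page() above replaces cancel_dirty_page() with an open-coded TestClearPageDirty()/account_page_cleaned() pair placed after invalidation. A small sketch of that test-and-clear accounting pattern, with a fake page flag word instead of the real page flags:

#include <stdbool.h>
#include <stdio.h>

#define PG_DIRTY 0x1u

struct fake_page { unsigned int flags; };

/* Clear the dirty flag and report whether this caller observed it set. */
static bool test_clear_page_dirty(struct fake_page *page)
{
        bool was_dirty = page->flags & PG_DIRTY;

        page->flags &= ~PG_DIRTY;
        return was_dirty;
}

static void account_page_cleaned(struct fake_page *page)
{
        (void)page;
        printf("dirty page accounting decremented\n");
}

static void truncate_complete_page(struct fake_page *page)
{
        /* Invalidate fs-private state first (do_invalidatepage() above),
         * then cancel dirty accounting, so a re-dirty during invalidation
         * is still seen here and accounted exactly once. */
        if (test_clear_page_dirty(page))
                account_page_cleaned(page);
}

int main(void)
{
        struct fake_page page = { .flags = PG_DIRTY };

        truncate_complete_page(&page);
        truncate_complete_page(&page);  /* second call: nothing to account */
        return 0;
}
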
diff --git a/mm/util.c b/mm/util.c
index 3981ae9d1b15..68ff8a5361e7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -325,9 +325,37 @@ void kvfree(const void *addr)
325} 325}
326EXPORT_SYMBOL(kvfree); 326EXPORT_SYMBOL(kvfree);
327 327
328static inline void *__page_rmapping(struct page *page)
329{
330 unsigned long mapping;
331
332 mapping = (unsigned long)page->mapping;
333 mapping &= ~PAGE_MAPPING_FLAGS;
334
335 return (void *)mapping;
336}
337
338/* Neutral page->mapping pointer to address_space or anon_vma or other */
339void *page_rmapping(struct page *page)
340{
341 page = compound_head(page);
342 return __page_rmapping(page);
343}
344
345struct anon_vma *page_anon_vma(struct page *page)
346{
347 unsigned long mapping;
348
349 page = compound_head(page);
350 mapping = (unsigned long)page->mapping;
351 if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
352 return NULL;
353 return __page_rmapping(page);
354}
355
328struct address_space *page_mapping(struct page *page) 356struct address_space *page_mapping(struct page *page)
329{ 357{
330 struct address_space *mapping = page->mapping; 358 unsigned long mapping;
331 359
332 /* This happens if someone calls flush_dcache_page on slab page */ 360 /* This happens if someone calls flush_dcache_page on slab page */
333 if (unlikely(PageSlab(page))) 361 if (unlikely(PageSlab(page)))
@@ -337,10 +365,13 @@ struct address_space *page_mapping(struct page *page)
337 swp_entry_t entry; 365 swp_entry_t entry;
338 366
339 entry.val = page_private(page); 367 entry.val = page_private(page);
340 mapping = swap_address_space(entry); 368 return swap_address_space(entry);
341 } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) 369 }
342 mapping = NULL; 370
343 return mapping; 371 mapping = (unsigned long)page->mapping;
372 if (mapping & PAGE_MAPPING_FLAGS)
373 return NULL;
374 return page->mapping;
344} 375}
345 376
346int overcommit_ratio_handler(struct ctl_table *table, int write, 377int overcommit_ratio_handler(struct ctl_table *table, int write,
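
page_rmapping(), page_anon_vma() and page_mapping() above all rely on the low bits of page->mapping acting as type tags that must be masked off before the pointer is used. A userspace model of that tagged-pointer scheme; the flag values only mirror PAGE_MAPPING_ANON/PAGE_MAPPING_FLAGS in spirit:

#include <stdint.h>
#include <stdio.h>

#define MAPPING_ANON    0x1u
#define MAPPING_FLAGS   0x3u

struct anon_vma { const char *name; };

/* Mask off the tag bits to recover the raw pointer, like __page_rmapping(). */
static void *untag(uintptr_t mapping)
{
        return (void *)(mapping & ~(uintptr_t)MAPPING_FLAGS);
}

static struct anon_vma *mapping_anon_vma(uintptr_t mapping)
{
        if ((mapping & MAPPING_FLAGS) != MAPPING_ANON)
                return NULL;            /* file-backed or special: no anon_vma */
        return untag(mapping);
}

int main(void)
{
        static struct anon_vma av = { "stack vma" };
        uintptr_t mapping = (uintptr_t)&av | MAPPING_ANON;
        struct anon_vma *got = mapping_anon_vma(mapping);

        printf("%s\n", got ? got->name : "not anon");
        return 0;
}
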
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 35b25e1340ca..2faaa2976447 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -29,6 +29,7 @@
29#include <linux/atomic.h> 29#include <linux/atomic.h>
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31#include <linux/llist.h> 31#include <linux/llist.h>
32#include <linux/bitops.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
@@ -74,6 +75,8 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
74 pmd = pmd_offset(pud, addr); 75 pmd = pmd_offset(pud, addr);
75 do { 76 do {
76 next = pmd_addr_end(addr, end); 77 next = pmd_addr_end(addr, end);
78 if (pmd_clear_huge(pmd))
79 continue;
77 if (pmd_none_or_clear_bad(pmd)) 80 if (pmd_none_or_clear_bad(pmd))
78 continue; 81 continue;
79 vunmap_pte_range(pmd, addr, next); 82 vunmap_pte_range(pmd, addr, next);
@@ -88,6 +91,8 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
88 pud = pud_offset(pgd, addr); 91 pud = pud_offset(pgd, addr);
89 do { 92 do {
90 next = pud_addr_end(addr, end); 93 next = pud_addr_end(addr, end);
94 if (pud_clear_huge(pud))
95 continue;
91 if (pud_none_or_clear_bad(pud)) 96 if (pud_none_or_clear_bad(pud))
92 continue; 97 continue;
93 vunmap_pmd_range(pud, addr, next); 98 vunmap_pmd_range(pud, addr, next);
@@ -760,7 +765,7 @@ struct vmap_block {
760 spinlock_t lock; 765 spinlock_t lock;
761 struct vmap_area *va; 766 struct vmap_area *va;
762 unsigned long free, dirty; 767 unsigned long free, dirty;
763 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 768 unsigned long dirty_min, dirty_max; /*< dirty range */
764 struct list_head free_list; 769 struct list_head free_list;
765 struct rcu_head rcu_head; 770 struct rcu_head rcu_head;
766 struct list_head purge; 771 struct list_head purge;
@@ -791,13 +796,31 @@ static unsigned long addr_to_vb_idx(unsigned long addr)
791 return addr; 796 return addr;
792} 797}
793 798
794static struct vmap_block *new_vmap_block(gfp_t gfp_mask) 799static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
800{
801 unsigned long addr;
802
803 addr = va_start + (pages_off << PAGE_SHIFT);
804 BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
805 return (void *)addr;
806}
807
808/**
 809 * new_vmap_block - allocate a new vmap_block and occupy 2^order pages in this
 810 * block. Of course the number of pages can't exceed VMAP_BBMAP_BITS.
 811 * @order: 2^order pages are occupied in the newly allocated block
812 * @gfp_mask: flags for the page level allocator
813 *
814 * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
815 */
816static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
795{ 817{
796 struct vmap_block_queue *vbq; 818 struct vmap_block_queue *vbq;
797 struct vmap_block *vb; 819 struct vmap_block *vb;
798 struct vmap_area *va; 820 struct vmap_area *va;
799 unsigned long vb_idx; 821 unsigned long vb_idx;
800 int node, err; 822 int node, err;
823 void *vaddr;
801 824
802 node = numa_node_id(); 825 node = numa_node_id();
803 826
@@ -821,11 +844,15 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
821 return ERR_PTR(err); 844 return ERR_PTR(err);
822 } 845 }
823 846
847 vaddr = vmap_block_vaddr(va->va_start, 0);
824 spin_lock_init(&vb->lock); 848 spin_lock_init(&vb->lock);
825 vb->va = va; 849 vb->va = va;
826 vb->free = VMAP_BBMAP_BITS; 850 /* At least something should be left free */
851 BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
852 vb->free = VMAP_BBMAP_BITS - (1UL << order);
827 vb->dirty = 0; 853 vb->dirty = 0;
828 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); 854 vb->dirty_min = VMAP_BBMAP_BITS;
855 vb->dirty_max = 0;
829 INIT_LIST_HEAD(&vb->free_list); 856 INIT_LIST_HEAD(&vb->free_list);
830 857
831 vb_idx = addr_to_vb_idx(va->va_start); 858 vb_idx = addr_to_vb_idx(va->va_start);
@@ -837,11 +864,11 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
837 864
838 vbq = &get_cpu_var(vmap_block_queue); 865 vbq = &get_cpu_var(vmap_block_queue);
839 spin_lock(&vbq->lock); 866 spin_lock(&vbq->lock);
840 list_add_rcu(&vb->free_list, &vbq->free); 867 list_add_tail_rcu(&vb->free_list, &vbq->free);
841 spin_unlock(&vbq->lock); 868 spin_unlock(&vbq->lock);
842 put_cpu_var(vmap_block_queue); 869 put_cpu_var(vmap_block_queue);
843 870
844 return vb; 871 return vaddr;
845} 872}
846 873
847static void free_vmap_block(struct vmap_block *vb) 874static void free_vmap_block(struct vmap_block *vb)
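
new_vmap_block() now takes the allocation order and returns a usable virtual address, with the first 2^order pages already charged to the block, and vb_alloc() below hands out further space as a simple offset computed from the remaining free count. A userspace sketch of that bump-style block allocator; new_block(), block_alloc() and the page size and capacity are illustrative, not the kernel code:

#include <stdio.h>

#define BLOCK_PAGES 1024        /* stands in for VMAP_BBMAP_BITS */

struct block {
        unsigned long base;     /* stands in for va->va_start */
        unsigned long free;     /* pages still available */
};

static unsigned long block_vaddr(const struct block *b, unsigned long pages_off)
{
        return b->base + pages_off * 4096UL;
}

/* Bump allocation: the offset is simply capacity minus remaining free pages. */
static unsigned long block_alloc(struct block *b, unsigned int order)
{
        unsigned long npages = 1UL << order;
        unsigned long off;

        if (b->free < npages)
                return 0;       /* caller would fall back to a new block */
        off = BLOCK_PAGES - b->free;
        b->free -= npages;
        return block_vaddr(b, off);
}

/* Like new_vmap_block(): created already minus the first request. */
static struct block new_block(unsigned int order, unsigned long base)
{
        struct block b = { .base = base, .free = BLOCK_PAGES - (1UL << order) };
        return b;
}

int main(void)
{
        struct block b = new_block(2, 0x100000UL);      /* first 4 pages pre-charged */

        printf("first extra alloc at %#lx\n", block_alloc(&b, 1));
        printf("free pages left: %lu\n", b.free);
        return 0;
}
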
@@ -876,7 +903,8 @@ static void purge_fragmented_blocks(int cpu)
876 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { 903 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
877 vb->free = 0; /* prevent further allocs after releasing lock */ 904 vb->free = 0; /* prevent further allocs after releasing lock */
878 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ 905 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
879 bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); 906 vb->dirty_min = 0;
907 vb->dirty_max = VMAP_BBMAP_BITS;
880 spin_lock(&vbq->lock); 908 spin_lock(&vbq->lock);
881 list_del_rcu(&vb->free_list); 909 list_del_rcu(&vb->free_list);
882 spin_unlock(&vbq->lock); 910 spin_unlock(&vbq->lock);
@@ -905,7 +933,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
905{ 933{
906 struct vmap_block_queue *vbq; 934 struct vmap_block_queue *vbq;
907 struct vmap_block *vb; 935 struct vmap_block *vb;
908 unsigned long addr = 0; 936 void *vaddr = NULL;
909 unsigned int order; 937 unsigned int order;
910 938
911 BUG_ON(size & ~PAGE_MASK); 939 BUG_ON(size & ~PAGE_MASK);
@@ -920,43 +948,38 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
920 } 948 }
921 order = get_order(size); 949 order = get_order(size);
922 950
923again:
924 rcu_read_lock(); 951 rcu_read_lock();
925 vbq = &get_cpu_var(vmap_block_queue); 952 vbq = &get_cpu_var(vmap_block_queue);
926 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 953 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
927 int i; 954 unsigned long pages_off;
928 955
929 spin_lock(&vb->lock); 956 spin_lock(&vb->lock);
930 if (vb->free < 1UL << order) 957 if (vb->free < (1UL << order)) {
931 goto next; 958 spin_unlock(&vb->lock);
959 continue;
960 }
932 961
933 i = VMAP_BBMAP_BITS - vb->free; 962 pages_off = VMAP_BBMAP_BITS - vb->free;
934 addr = vb->va->va_start + (i << PAGE_SHIFT); 963 vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
935 BUG_ON(addr_to_vb_idx(addr) !=
936 addr_to_vb_idx(vb->va->va_start));
937 vb->free -= 1UL << order; 964 vb->free -= 1UL << order;
938 if (vb->free == 0) { 965 if (vb->free == 0) {
939 spin_lock(&vbq->lock); 966 spin_lock(&vbq->lock);
940 list_del_rcu(&vb->free_list); 967 list_del_rcu(&vb->free_list);
941 spin_unlock(&vbq->lock); 968 spin_unlock(&vbq->lock);
942 } 969 }
970
943 spin_unlock(&vb->lock); 971 spin_unlock(&vb->lock);
944 break; 972 break;
945next:
946 spin_unlock(&vb->lock);
947 } 973 }
948 974
949 put_cpu_var(vmap_block_queue); 975 put_cpu_var(vmap_block_queue);
950 rcu_read_unlock(); 976 rcu_read_unlock();
951 977
952 if (!addr) { 978 /* Allocate new block if nothing was found */
953 vb = new_vmap_block(gfp_mask); 979 if (!vaddr)
954 if (IS_ERR(vb)) 980 vaddr = new_vmap_block(order, gfp_mask);
955 return vb;
956 goto again;
957 }
958 981
959 return (void *)addr; 982 return vaddr;
960} 983}
961 984
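
With the retry loop gone, vb_alloc() relies on a simple invariant: pages are only handed out from the front of a block, so the next free page offset is always VMAP_BBMAP_BITS - vb->free, and freed pages are never reused until the whole block is purged. A standalone toy model of that invariant (not kernel code; the struct and helper names below are made up for illustration):

    #include <stdio.h>

    #define VMAP_BBMAP_BITS 1024UL                  /* pages per block (example) */

    struct toy_block {
            unsigned long free;                     /* pages still available */
    };

    /* returns the page offset inside the block, or -1 if it does not fit */
    static long toy_alloc(struct toy_block *vb, unsigned int order)
    {
            unsigned long pages = 1UL << order;
            long pages_off;

            if (vb->free < pages)
                    return -1;

            pages_off = VMAP_BBMAP_BITS - vb->free; /* allocate from the front */
            vb->free -= pages;
            return pages_off;
    }

    int main(void)
    {
            struct toy_block vb = { .free = VMAP_BBMAP_BITS };

            printf("%ld\n", toy_alloc(&vb, 0));     /* 0 */
            printf("%ld\n", toy_alloc(&vb, 2));     /* 1 */
            printf("%ld\n", toy_alloc(&vb, 0));     /* 5 */
            return 0;
    }
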
962static void vb_free(const void *addr, unsigned long size) 985static void vb_free(const void *addr, unsigned long size)
@@ -974,6 +997,7 @@ static void vb_free(const void *addr, unsigned long size)
974 order = get_order(size); 997 order = get_order(size);
975 998
976 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); 999 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
1000 offset >>= PAGE_SHIFT;
977 1001
978 vb_idx = addr_to_vb_idx((unsigned long)addr); 1002 vb_idx = addr_to_vb_idx((unsigned long)addr);
979 rcu_read_lock(); 1003 rcu_read_lock();
@@ -984,7 +1008,10 @@ static void vb_free(const void *addr, unsigned long size)
984 vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); 1008 vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
985 1009
986 spin_lock(&vb->lock); 1010 spin_lock(&vb->lock);
987 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); 1011
1012 /* Expand dirty range */
1013 vb->dirty_min = min(vb->dirty_min, offset);
1014 vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
988 1015
989 vb->dirty += 1UL << order; 1016 vb->dirty += 1UL << order;
990 if (vb->dirty == VMAP_BBMAP_BITS) { 1017 if (vb->dirty == VMAP_BBMAP_BITS) {
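
Because freed pages are never re-allocated from the same block (see vb_alloc() above), tracking a single [dirty_min, dirty_max) window is enough to know what must be flushed later; the range may over-approximate, trading some extra flushing for not maintaining a bitmap. A standalone sketch of the bookkeeping, with made-up helper names:

    #include <stdio.h>

    #define VMAP_BBMAP_BITS 1024UL

    struct toy_dirty {
            unsigned long dirty_min;        /* first dirty page offset */
            unsigned long dirty_max;        /* one past the last dirty page */
    };

    static void toy_dirty_init(struct toy_dirty *d)
    {
            d->dirty_min = VMAP_BBMAP_BITS; /* "empty" range: min > max */
            d->dirty_max = 0;
    }

    static void toy_mark_dirty(struct toy_dirty *d, unsigned long offset,
                               unsigned int order)
    {
            unsigned long end = offset + (1UL << order);

            d->dirty_min = d->dirty_min < offset ? d->dirty_min : offset;
            d->dirty_max = d->dirty_max > end ? d->dirty_max : end;
    }

    int main(void)
    {
            struct toy_dirty d;

            toy_dirty_init(&d);
            toy_mark_dirty(&d, 8, 1);       /* frees pages 8..9 */
            toy_mark_dirty(&d, 2, 0);       /* frees page 2     */
            printf("[%lu, %lu)\n", d.dirty_min, d.dirty_max);   /* [2, 10) */
            return 0;
    }
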
@@ -1023,25 +1050,18 @@ void vm_unmap_aliases(void)
1023 1050
1024 rcu_read_lock(); 1051 rcu_read_lock();
1025 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 1052 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1026 int i, j;
1027
1028 spin_lock(&vb->lock); 1053 spin_lock(&vb->lock);
1029 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); 1054 if (vb->dirty) {
1030 if (i < VMAP_BBMAP_BITS) { 1055 unsigned long va_start = vb->va->va_start;
1031 unsigned long s, e; 1056 unsigned long s, e;
1032 1057
1033 j = find_last_bit(vb->dirty_map, 1058 s = va_start + (vb->dirty_min << PAGE_SHIFT);
1034 VMAP_BBMAP_BITS); 1059 e = va_start + (vb->dirty_max << PAGE_SHIFT);
1035 j = j + 1; /* need exclusive index */
1036 1060
1037 s = vb->va->va_start + (i << PAGE_SHIFT); 1061 start = min(s, start);
1038 e = vb->va->va_start + (j << PAGE_SHIFT); 1062 end = max(e, end);
1039 flush = 1;
1040 1063
1041 if (s < start) 1064 flush = 1;
1042 start = s;
1043 if (e > end)
1044 end = e;
1045 } 1065 }
1046 spin_unlock(&vb->lock); 1066 spin_unlock(&vb->lock);
1047 } 1067 }
@@ -1314,7 +1334,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1314 1334
1315 BUG_ON(in_interrupt()); 1335 BUG_ON(in_interrupt());
1316 if (flags & VM_IOREMAP) 1336 if (flags & VM_IOREMAP)
1317 align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); 1337 align = 1ul << clamp_t(int, fls_long(size),
1338 PAGE_SHIFT, IOREMAP_MAX_ORDER);
1318 1339
1319 size = PAGE_ALIGN(size); 1340 size = PAGE_ALIGN(size);
1320 if (unlikely(!size)) 1341 if (unlikely(!size))
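
The fls_long()/clamp_t() variant matters on 64-bit kernels: plain fls() only examines an int, so a very large VM_IOREMAP size would be truncated and yield a much smaller alignment than intended, and clamp_t() forces a consistent comparison type for the bounds. A worked userspace example of the resulting alignment, assuming a 64-bit build, PAGE_SHIFT = 12 and the default IOREMAP_MAX_ORDER of (7 + PAGE_SHIFT); both values are configuration dependent, and fls_long()/clamp below are local re-implementations for illustration only:

    #include <stdio.h>

    #define PAGE_SHIFT        12
    #define IOREMAP_MAX_ORDER (7 + PAGE_SHIFT)

    static int fls_long(unsigned long x)    /* 1-based index of highest set bit */
    {
            int r = 0;

            while (x) {
                    x >>= 1;
                    r++;
            }
            return r;
    }

    static int clamp_int(int val, int lo, int hi)
    {
            return val < lo ? lo : (val > hi ? hi : val);
    }

    int main(void)
    {
            unsigned long sizes[] = { 4096, 3UL << 20, 8UL << 30 };

            for (int i = 0; i < 3; i++) {
                    unsigned long align = 1UL << clamp_int(fls_long(sizes[i]),
                                                           PAGE_SHIFT,
                                                           IOREMAP_MAX_ORDER);
                    /* prints 8192, 524288, 524288 */
                    printf("size %lu -> align %lu\n", sizes[i], align);
            }
            return 0;
    }
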
@@ -1418,6 +1439,7 @@ struct vm_struct *remove_vm_area(const void *addr)
1418 spin_unlock(&vmap_area_lock); 1439 spin_unlock(&vmap_area_lock);
1419 1440
1420 vmap_debug_free_range(va->va_start, va->va_end); 1441 vmap_debug_free_range(va->va_start, va->va_end);
1442 kasan_free_shadow(vm);
1421 free_unmap_vmap_area(va); 1443 free_unmap_vmap_area(va);
1422 vm->size -= PAGE_SIZE; 1444 vm->size -= PAGE_SIZE;
1423 1445
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 0dec1fa5f656..08bd7a3d464a 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -12,35 +12,6 @@
12 */ 12 */
13 13
14/* 14/*
15 * This allocator is designed for use with zram. Thus, the allocator is
16 * supposed to work well under low memory conditions. In particular, it
17 * never attempts higher order page allocation which is very likely to
18 * fail under memory pressure. On the other hand, if we just use single
19 * (0-order) pages, it would suffer from very high fragmentation --
20 * any object of size PAGE_SIZE/2 or larger would occupy an entire page.
21 * This was one of the major issues with its predecessor (xvmalloc).
22 *
23 * To overcome these issues, zsmalloc allocates a bunch of 0-order pages
24 * and links them together using various 'struct page' fields. These linked
25 * pages act as a single higher-order page i.e. an object can span 0-order
26 * page boundaries. The code refers to these linked pages as a single entity
27 * called zspage.
28 *
29 * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE
30 * since this satisfies the requirements of all its current users (in the
31 * worst case, page is incompressible and is thus stored "as-is" i.e. in
32 * uncompressed form). For allocation requests larger than this size, failure
33 * is returned (see zs_malloc).
34 *
35 * Additionally, zs_malloc() does not return a dereferenceable pointer.
36 * Instead, it returns an opaque handle (unsigned long) which encodes actual
37 * location of the allocated object. The reason for this indirection is that
38 * zsmalloc does not keep zspages permanently mapped since that would cause
39 * issues on 32-bit systems where the VA region for kernel space mappings
40 * is very small. So, before using the allocating memory, the object has to
41 * be mapped using zs_map_object() to get a usable pointer and subsequently
42 * unmapped using zs_unmap_object().
43 *
44 * Following is how we use various fields and flags of underlying 15 * Following is how we use various fields and flags of underlying
45 * struct page(s) to form a zspage. 16 * struct page(s) to form a zspage.
46 * 17 *
@@ -57,6 +28,8 @@
57 * 28 *
58 * page->private (union with page->first_page): refers to the 29 * page->private (union with page->first_page): refers to the
59 * component page after the first page 30 * component page after the first page
 31 * If the page is the first_page of a huge object, it stores the handle
 32 * (see size_class->huge).
60 * page->freelist: points to the first free object in zspage. 33 * page->freelist: points to the first free object in zspage.
61 * Free objects are linked together using in-place 34 * Free objects are linked together using in-place
62 * metadata. 35 * metadata.
@@ -78,6 +51,7 @@
78 51
79#include <linux/module.h> 52#include <linux/module.h>
80#include <linux/kernel.h> 53#include <linux/kernel.h>
54#include <linux/sched.h>
81#include <linux/bitops.h> 55#include <linux/bitops.h>
82#include <linux/errno.h> 56#include <linux/errno.h>
83#include <linux/highmem.h> 57#include <linux/highmem.h>
@@ -110,6 +84,8 @@
110#define ZS_MAX_ZSPAGE_ORDER 2 84#define ZS_MAX_ZSPAGE_ORDER 2
111#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) 85#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)
112 86
87#define ZS_HANDLE_SIZE (sizeof(unsigned long))
88
113/* 89/*
114 * Object location (<PFN>, <obj_idx>) is encoded as 90 * Object location (<PFN>, <obj_idx>) is encoded as
115 * as single (unsigned long) handle value. 91 * as single (unsigned long) handle value.
@@ -133,13 +109,33 @@
133#endif 109#endif
134#endif 110#endif
135#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) 111#define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT)
136#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) 112
113/*
 114 * The memory allocated for a handle keeps the object position by
 115 * encoding <page, obj_idx>, and the encoded value leaves room in its
 116 * least significant bit (see obj_to_location).
 117 * We use that bit to synchronize object access between the user
 118 * and migration.
119 */
120#define HANDLE_PIN_BIT 0
121
122/*
 123 * The head of an allocated object carries OBJ_ALLOCATED_TAG so we can
 124 * tell whether the object is allocated.
 125 * It's okay to keep this status in the least significant bit because
 126 * the header holds a handle, which is a 4-byte-aligned address, so at
 127 * least two low bits are available.
128 */
129#define OBJ_ALLOCATED_TAG 1
130#define OBJ_TAG_BITS 1
131#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
137#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) 132#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
138 133
139#define MAX(a, b) ((a) >= (b) ? (a) : (b)) 134#define MAX(a, b) ((a) >= (b) ? (a) : (b))
140/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ 135/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
141#define ZS_MIN_ALLOC_SIZE \ 136#define ZS_MIN_ALLOC_SIZE \
142 MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) 137 MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
138/* each chunk includes extra space to keep handle */
143#define ZS_MAX_ALLOC_SIZE PAGE_SIZE 139#define ZS_MAX_ALLOC_SIZE PAGE_SIZE
144 140
145/* 141/*
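
For a concrete sense of the new layout: on a 64-bit build with MAX_PHYSMEM_BITS = 46 and PAGE_SHIFT = 12 (both architecture and config dependent), _PFN_BITS = 34 and OBJ_INDEX_BITS = 64 - 34 - 1 = 29, so an encoded obj value looks like [ PFN:34 | obj_idx:29 | tag:1 ]. Note that the two tag-style bits live in different words: HANDLE_PIN_BIT is bit 0 of the obj value stored in the handle slot, while OBJ_ALLOCATED_TAG is bit 0 of the pointer-aligned handle stored in the object's header.
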
@@ -172,6 +168,8 @@ enum fullness_group {
172enum zs_stat_type { 168enum zs_stat_type {
173 OBJ_ALLOCATED, 169 OBJ_ALLOCATED,
174 OBJ_USED, 170 OBJ_USED,
171 CLASS_ALMOST_FULL,
172 CLASS_ALMOST_EMPTY,
175 NR_ZS_STAT_TYPE, 173 NR_ZS_STAT_TYPE,
176}; 174};
177 175
@@ -216,6 +214,8 @@ struct size_class {
216 214
217 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ 215 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
218 int pages_per_zspage; 216 int pages_per_zspage;
217 /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
218 bool huge;
219 219
220#ifdef CONFIG_ZSMALLOC_STAT 220#ifdef CONFIG_ZSMALLOC_STAT
221 struct zs_size_stat stats; 221 struct zs_size_stat stats;
@@ -233,14 +233,24 @@ struct size_class {
233 * This must be power of 2 and less than or equal to ZS_ALIGN 233 * This must be power of 2 and less than or equal to ZS_ALIGN
234 */ 234 */
235struct link_free { 235struct link_free {
236 /* Handle of next free chunk (encodes <PFN, obj_idx>) */ 236 union {
237 void *next; 237 /*
238 * Position of next free chunk (encodes <PFN, obj_idx>)
 239 * It is valid only for a free (non-allocated) object.
240 */
241 void *next;
242 /*
243 * Handle of allocated object.
244 */
245 unsigned long handle;
246 };
238}; 247};
239 248
240struct zs_pool { 249struct zs_pool {
241 char *name; 250 char *name;
242 251
243 struct size_class **size_class; 252 struct size_class **size_class;
253 struct kmem_cache *handle_cachep;
244 254
245 gfp_t flags; /* allocation flags used when growing pool */ 255 gfp_t flags; /* allocation flags used when growing pool */
246 atomic_long_t pages_allocated; 256 atomic_long_t pages_allocated;
@@ -267,8 +277,37 @@ struct mapping_area {
267#endif 277#endif
268 char *vm_addr; /* address of kmap_atomic()'ed pages */ 278 char *vm_addr; /* address of kmap_atomic()'ed pages */
269 enum zs_mapmode vm_mm; /* mapping mode */ 279 enum zs_mapmode vm_mm; /* mapping mode */
280 bool huge;
270}; 281};
271 282
283static int create_handle_cache(struct zs_pool *pool)
284{
285 pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
286 0, 0, NULL);
287 return pool->handle_cachep ? 0 : 1;
288}
289
290static void destroy_handle_cache(struct zs_pool *pool)
291{
292 kmem_cache_destroy(pool->handle_cachep);
293}
294
295static unsigned long alloc_handle(struct zs_pool *pool)
296{
297 return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
298 pool->flags & ~__GFP_HIGHMEM);
299}
300
301static void free_handle(struct zs_pool *pool, unsigned long handle)
302{
303 kmem_cache_free(pool->handle_cachep, (void *)handle);
304}
305
306static void record_obj(unsigned long handle, unsigned long obj)
307{
308 *(unsigned long *)handle = obj;
309}
310
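
The handle is now one level of indirection away from the object: it is just the address of a small slot (from the new zs_handle cache) holding the current obj location, so compaction can move an object and update the slot through record_obj() without the caller ever seeing its handle change. A userspace model of that indirection, with malloc() standing in for the kmem_cache and error handling omitted:

    #include <stdio.h>
    #include <stdlib.h>

    static unsigned long alloc_handle(void)
    {
            return (unsigned long)calloc(1, sizeof(unsigned long));
    }

    static void free_handle(unsigned long handle)
    {
            free((void *)handle);
    }

    static void record_obj(unsigned long handle, unsigned long obj)
    {
            *(unsigned long *)handle = obj;         /* point the handle at obj */
    }

    static unsigned long handle_to_obj(unsigned long handle)
    {
            return *(unsigned long *)handle;
    }

    int main(void)
    {
            unsigned long handle = alloc_handle();

            record_obj(handle, 0xABC0UL);                   /* initial location */
            record_obj(handle, 0xDEF0UL);                   /* object migrated  */
            printf("obj = %#lx\n", handle_to_obj(handle));  /* 0xdef0 */
            free_handle(handle);
            return 0;
    }
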
272/* zpool driver */ 311/* zpool driver */
273 312
274#ifdef CONFIG_ZPOOL 313#ifdef CONFIG_ZPOOL
@@ -346,6 +385,11 @@ static struct zpool_driver zs_zpool_driver = {
346MODULE_ALIAS("zpool-zsmalloc"); 385MODULE_ALIAS("zpool-zsmalloc");
347#endif /* CONFIG_ZPOOL */ 386#endif /* CONFIG_ZPOOL */
348 387
388static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
389{
390 return pages_per_zspage * PAGE_SIZE / size;
391}
392
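
As a worked example, assuming PAGE_SIZE = 4096: a class of size 1536 (the 3/8 * PAGE_SIZE case discussed further down) with pages_per_zspage = 3 gives get_maxobj_per_zspage() = 3 * 4096 / 1536 = 8 objects, with wastage Zp % size = 12288 % 1536 = 0. A class with pages_per_zspage = 1 and a size just over PAGE_SIZE / 2 yields only one object per zspage, which is exactly the size_class->huge case introduced above.
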
349/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ 393/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
350static DEFINE_PER_CPU(struct mapping_area, zs_map_area); 394static DEFINE_PER_CPU(struct mapping_area, zs_map_area);
351 395
@@ -396,9 +440,182 @@ static int get_size_class_index(int size)
396 idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, 440 idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
397 ZS_SIZE_CLASS_DELTA); 441 ZS_SIZE_CLASS_DELTA);
398 442
399 return idx; 443 return min(zs_size_classes - 1, idx);
444}
445
446#ifdef CONFIG_ZSMALLOC_STAT
447
448static inline void zs_stat_inc(struct size_class *class,
449 enum zs_stat_type type, unsigned long cnt)
450{
451 class->stats.objs[type] += cnt;
452}
453
454static inline void zs_stat_dec(struct size_class *class,
455 enum zs_stat_type type, unsigned long cnt)
456{
457 class->stats.objs[type] -= cnt;
458}
459
460static inline unsigned long zs_stat_get(struct size_class *class,
461 enum zs_stat_type type)
462{
463 return class->stats.objs[type];
464}
465
466static int __init zs_stat_init(void)
467{
468 if (!debugfs_initialized())
469 return -ENODEV;
470
471 zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
472 if (!zs_stat_root)
473 return -ENOMEM;
474
475 return 0;
476}
477
478static void __exit zs_stat_exit(void)
479{
480 debugfs_remove_recursive(zs_stat_root);
481}
482
483static int zs_stats_size_show(struct seq_file *s, void *v)
484{
485 int i;
486 struct zs_pool *pool = s->private;
487 struct size_class *class;
488 int objs_per_zspage;
489 unsigned long class_almost_full, class_almost_empty;
490 unsigned long obj_allocated, obj_used, pages_used;
491 unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
492 unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
493
494 seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
495 "class", "size", "almost_full", "almost_empty",
496 "obj_allocated", "obj_used", "pages_used",
497 "pages_per_zspage");
498
499 for (i = 0; i < zs_size_classes; i++) {
500 class = pool->size_class[i];
501
502 if (class->index != i)
503 continue;
504
505 spin_lock(&class->lock);
506 class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
507 class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
508 obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
509 obj_used = zs_stat_get(class, OBJ_USED);
510 spin_unlock(&class->lock);
511
512 objs_per_zspage = get_maxobj_per_zspage(class->size,
513 class->pages_per_zspage);
514 pages_used = obj_allocated / objs_per_zspage *
515 class->pages_per_zspage;
516
517 seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
518 i, class->size, class_almost_full, class_almost_empty,
519 obj_allocated, obj_used, pages_used,
520 class->pages_per_zspage);
521
522 total_class_almost_full += class_almost_full;
523 total_class_almost_empty += class_almost_empty;
524 total_objs += obj_allocated;
525 total_used_objs += obj_used;
526 total_pages += pages_used;
527 }
528
529 seq_puts(s, "\n");
530 seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
531 "Total", "", total_class_almost_full,
532 total_class_almost_empty, total_objs,
533 total_used_objs, total_pages);
534
535 return 0;
536}
537
538static int zs_stats_size_open(struct inode *inode, struct file *file)
539{
540 return single_open(file, zs_stats_size_show, inode->i_private);
541}
542
543static const struct file_operations zs_stat_size_ops = {
544 .open = zs_stats_size_open,
545 .read = seq_read,
546 .llseek = seq_lseek,
547 .release = single_release,
548};
549
550static int zs_pool_stat_create(char *name, struct zs_pool *pool)
551{
552 struct dentry *entry;
553
554 if (!zs_stat_root)
555 return -ENODEV;
556
557 entry = debugfs_create_dir(name, zs_stat_root);
558 if (!entry) {
559 pr_warn("debugfs dir <%s> creation failed\n", name);
560 return -ENOMEM;
561 }
562 pool->stat_dentry = entry;
563
564 entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
565 pool->stat_dentry, pool, &zs_stat_size_ops);
566 if (!entry) {
567 pr_warn("%s: debugfs file entry <%s> creation failed\n",
568 name, "classes");
569 return -ENOMEM;
570 }
571
572 return 0;
573}
574
575static void zs_pool_stat_destroy(struct zs_pool *pool)
576{
577 debugfs_remove_recursive(pool->stat_dentry);
578}
579
580#else /* CONFIG_ZSMALLOC_STAT */
581
582static inline void zs_stat_inc(struct size_class *class,
583 enum zs_stat_type type, unsigned long cnt)
584{
585}
586
587static inline void zs_stat_dec(struct size_class *class,
588 enum zs_stat_type type, unsigned long cnt)
589{
590}
591
592static inline unsigned long zs_stat_get(struct size_class *class,
593 enum zs_stat_type type)
594{
595 return 0;
596}
597
598static int __init zs_stat_init(void)
599{
600 return 0;
601}
602
603static void __exit zs_stat_exit(void)
604{
605}
606
607static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
608{
609 return 0;
610}
611
612static inline void zs_pool_stat_destroy(struct zs_pool *pool)
613{
400} 614}
401 615
616#endif
617
618
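
With CONFIG_ZSMALLOC_STAT enabled, each named pool gets a per-pool directory under debugfs (typically mounted at /sys/kernel/debug), and reading zsmalloc/<pool name>/classes prints one row per size class with the columns class, size, almost_full, almost_empty, obj_allocated, obj_used, pages_used and pages_per_zspage, followed by a Total row; the pool directory name is whatever the caller passed to zs_create_pool().
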
402/* 619/*
403 * For each size class, zspages are divided into different groups 620 * For each size class, zspages are divided into different groups
404 * depending on how "full" they are. This was done so that we could 621 * depending on how "full" they are. This was done so that we could
@@ -419,7 +636,7 @@ static enum fullness_group get_fullness_group(struct page *page)
419 fg = ZS_EMPTY; 636 fg = ZS_EMPTY;
420 else if (inuse == max_objects) 637 else if (inuse == max_objects)
421 fg = ZS_FULL; 638 fg = ZS_FULL;
422 else if (inuse <= max_objects / fullness_threshold_frac) 639 else if (inuse <= 3 * max_objects / fullness_threshold_frac)
423 fg = ZS_ALMOST_EMPTY; 640 fg = ZS_ALMOST_EMPTY;
424 else 641 else
425 fg = ZS_ALMOST_FULL; 642 fg = ZS_ALMOST_FULL;
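
This widens the ZS_ALMOST_EMPTY band considerably: assuming fullness_threshold_frac is 4 (its value in mainline zsmalloc), a zspage with max_objects = 8 used to count as almost-empty only with inuse <= 2, whereas it now does with inuse <= 6. Since the compaction path below isolates its source pages from the ZS_ALMOST_EMPTY list, more partially used zspages become candidates for migration.
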
@@ -448,6 +665,8 @@ static void insert_zspage(struct page *page, struct size_class *class,
448 list_add_tail(&page->lru, &(*head)->lru); 665 list_add_tail(&page->lru, &(*head)->lru);
449 666
450 *head = page; 667 *head = page;
668 zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
669 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
451} 670}
452 671
453/* 672/*
@@ -473,6 +692,8 @@ static void remove_zspage(struct page *page, struct size_class *class,
473 struct page, lru); 692 struct page, lru);
474 693
475 list_del_init(&page->lru); 694 list_del_init(&page->lru);
695 zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
696 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
476} 697}
477 698
478/* 699/*
@@ -484,11 +705,10 @@ static void remove_zspage(struct page *page, struct size_class *class,
484 * page from the freelist of the old fullness group to that of the new 705 * page from the freelist of the old fullness group to that of the new
485 * fullness group. 706 * fullness group.
486 */ 707 */
487static enum fullness_group fix_fullness_group(struct zs_pool *pool, 708static enum fullness_group fix_fullness_group(struct size_class *class,
488 struct page *page) 709 struct page *page)
489{ 710{
490 int class_idx; 711 int class_idx;
491 struct size_class *class;
492 enum fullness_group currfg, newfg; 712 enum fullness_group currfg, newfg;
493 713
494 BUG_ON(!is_first_page(page)); 714 BUG_ON(!is_first_page(page));
@@ -498,7 +718,6 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool,
498 if (newfg == currfg) 718 if (newfg == currfg)
499 goto out; 719 goto out;
500 720
501 class = pool->size_class[class_idx];
502 remove_zspage(page, class, currfg); 721 remove_zspage(page, class, currfg);
503 insert_zspage(page, class, newfg); 722 insert_zspage(page, class, newfg);
504 set_zspage_mapping(page, class_idx, newfg); 723 set_zspage_mapping(page, class_idx, newfg);
@@ -512,7 +731,8 @@ out:
512 * to form a zspage for each size class. This is important 731 * to form a zspage for each size class. This is important
513 * to reduce wastage due to unusable space left at end of 732 * to reduce wastage due to unusable space left at end of
514 * each zspage which is given as: 733 * each zspage which is given as:
515 * wastage = Zp - Zp % size_class 734 * wastage = Zp % class_size
735 * usage = Zp - wastage
516 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... 736 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
517 * 737 *
518 * For example, for size class of 3/8 * PAGE_SIZE, we should 738 * For example, for size class of 3/8 * PAGE_SIZE, we should
@@ -571,35 +791,50 @@ static struct page *get_next_page(struct page *page)
571 791
572/* 792/*
573 * Encode <page, obj_idx> as a single handle value. 793 * Encode <page, obj_idx> as a single handle value.
574 * On hardware platforms with physical memory starting at 0x0 the pfn 794 * We use the least bit of handle for tagging.
575 * could be 0 so we ensure that the handle will never be 0 by adjusting the
576 * encoded obj_idx value before encoding.
577 */ 795 */
578static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) 796static void *location_to_obj(struct page *page, unsigned long obj_idx)
579{ 797{
580 unsigned long handle; 798 unsigned long obj;
581 799
582 if (!page) { 800 if (!page) {
583 BUG_ON(obj_idx); 801 BUG_ON(obj_idx);
584 return NULL; 802 return NULL;
585 } 803 }
586 804
587 handle = page_to_pfn(page) << OBJ_INDEX_BITS; 805 obj = page_to_pfn(page) << OBJ_INDEX_BITS;
588 handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); 806 obj |= ((obj_idx) & OBJ_INDEX_MASK);
807 obj <<= OBJ_TAG_BITS;
589 808
590 return (void *)handle; 809 return (void *)obj;
591} 810}
592 811
593/* 812/*
594 * Decode <page, obj_idx> pair from the given object handle. We adjust the 813 * Decode <page, obj_idx> pair from the given object handle. We adjust the
595 * decoded obj_idx back to its original value since it was adjusted in 814 * decoded obj_idx back to its original value since it was adjusted in
596 * obj_location_to_handle(). 815 * location_to_obj().
597 */ 816 */
598static void obj_handle_to_location(unsigned long handle, struct page **page, 817static void obj_to_location(unsigned long obj, struct page **page,
599 unsigned long *obj_idx) 818 unsigned long *obj_idx)
600{ 819{
601 *page = pfn_to_page(handle >> OBJ_INDEX_BITS); 820 obj >>= OBJ_TAG_BITS;
602 *obj_idx = (handle & OBJ_INDEX_MASK) - 1; 821 *page = pfn_to_page(obj >> OBJ_INDEX_BITS);
822 *obj_idx = (obj & OBJ_INDEX_MASK);
823}
824
825static unsigned long handle_to_obj(unsigned long handle)
826{
827 return *(unsigned long *)handle;
828}
829
830static unsigned long obj_to_head(struct size_class *class, struct page *page,
831 void *obj)
832{
833 if (class->huge) {
834 VM_BUG_ON(!is_first_page(page));
835 return *(unsigned long *)page_private(page);
836 } else
837 return *(unsigned long *)obj;
603} 838}
604 839
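
A standalone round-trip of the encode/decode pair above; the tag shift is the only structural change from the old scheme, freeing bit 0 of the obj word so it can later serve as HANDLE_PIN_BIT. The bit widths assume a 64-bit build with MAX_PHYSMEM_BITS = 46 and PAGE_SHIFT = 12 (configuration dependent):

    #include <assert.h>
    #include <stdio.h>

    #define OBJ_TAG_BITS   1
    #define OBJ_INDEX_BITS (64 - (46 - 12) - OBJ_TAG_BITS)      /* 29 */
    #define OBJ_INDEX_MASK ((1UL << OBJ_INDEX_BITS) - 1)

    static unsigned long location_to_obj(unsigned long pfn, unsigned long obj_idx)
    {
            unsigned long obj = (pfn << OBJ_INDEX_BITS) | (obj_idx & OBJ_INDEX_MASK);

            return obj << OBJ_TAG_BITS;         /* bit 0 left clear for the tag */
    }

    static void obj_to_location(unsigned long obj, unsigned long *pfn,
                                unsigned long *obj_idx)
    {
            obj >>= OBJ_TAG_BITS;
            *pfn = obj >> OBJ_INDEX_BITS;
            *obj_idx = obj & OBJ_INDEX_MASK;
    }

    int main(void)
    {
            unsigned long pfn, idx;

            obj_to_location(location_to_obj(0x12345, 7), &pfn, &idx);
            assert(pfn == 0x12345 && idx == 7);
            printf("pfn=%#lx idx=%lu\n", pfn, idx);
            return 0;
    }
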
605static unsigned long obj_idx_to_offset(struct page *page, 840static unsigned long obj_idx_to_offset(struct page *page,
@@ -613,6 +848,25 @@ static unsigned long obj_idx_to_offset(struct page *page,
613 return off + obj_idx * class_size; 848 return off + obj_idx * class_size;
614} 849}
615 850
851static inline int trypin_tag(unsigned long handle)
852{
853 unsigned long *ptr = (unsigned long *)handle;
854
855 return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
856}
857
858static void pin_tag(unsigned long handle)
859{
860 while (!trypin_tag(handle));
861}
862
863static void unpin_tag(unsigned long handle)
864{
865 unsigned long *ptr = (unsigned long *)handle;
866
867 clear_bit_unlock(HANDLE_PIN_BIT, ptr);
868}
869
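
These three helpers form a tiny bit lock on the value stored in the handle slot: zs_map_object() and zs_free() pin the handle for the duration of their access, while compaction only uses trypin_tag() (see find_alloced_obj() further down) and so simply skips objects that are currently mapped or being freed; migration updates the slot with record_obj() before unpinning.
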
616static void reset_page(struct page *page) 870static void reset_page(struct page *page)
617{ 871{
618 clear_bit(PG_private, &page->flags); 872 clear_bit(PG_private, &page->flags);
@@ -674,7 +928,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
674 link = (struct link_free *)vaddr + off / sizeof(*link); 928 link = (struct link_free *)vaddr + off / sizeof(*link);
675 929
676 while ((off += class->size) < PAGE_SIZE) { 930 while ((off += class->size) < PAGE_SIZE) {
677 link->next = obj_location_to_handle(page, i++); 931 link->next = location_to_obj(page, i++);
678 link += class->size / sizeof(*link); 932 link += class->size / sizeof(*link);
679 } 933 }
680 934
@@ -684,7 +938,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
684 * page (if present) 938 * page (if present)
685 */ 939 */
686 next_page = get_next_page(page); 940 next_page = get_next_page(page);
687 link->next = obj_location_to_handle(next_page, 0); 941 link->next = location_to_obj(next_page, 0);
688 kunmap_atomic(vaddr); 942 kunmap_atomic(vaddr);
689 page = next_page; 943 page = next_page;
690 off %= PAGE_SIZE; 944 off %= PAGE_SIZE;
@@ -738,7 +992,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
738 992
739 init_zspage(first_page, class); 993 init_zspage(first_page, class);
740 994
741 first_page->freelist = obj_location_to_handle(first_page, 0); 995 first_page->freelist = location_to_obj(first_page, 0);
742 /* Maximum number of objects we can store in this zspage */ 996 /* Maximum number of objects we can store in this zspage */
743 first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; 997 first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;
744 998
@@ -860,12 +1114,19 @@ static void __zs_unmap_object(struct mapping_area *area,
860{ 1114{
861 int sizes[2]; 1115 int sizes[2];
862 void *addr; 1116 void *addr;
863 char *buf = area->vm_buf; 1117 char *buf;
864 1118
865 /* no write fastpath */ 1119 /* no write fastpath */
866 if (area->vm_mm == ZS_MM_RO) 1120 if (area->vm_mm == ZS_MM_RO)
867 goto out; 1121 goto out;
868 1122
1123 buf = area->vm_buf;
1124 if (!area->huge) {
1125 buf = buf + ZS_HANDLE_SIZE;
1126 size -= ZS_HANDLE_SIZE;
1127 off += ZS_HANDLE_SIZE;
1128 }
1129
869 sizes[0] = PAGE_SIZE - off; 1130 sizes[0] = PAGE_SIZE - off;
870 sizes[1] = size - sizes[0]; 1131 sizes[1] = size - sizes[0];
871 1132
@@ -952,11 +1213,6 @@ static void init_zs_size_classes(void)
952 zs_size_classes = nr; 1213 zs_size_classes = nr;
953} 1214}
954 1215
955static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
956{
957 return pages_per_zspage * PAGE_SIZE / size;
958}
959
960static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) 1216static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
961{ 1217{
962 if (prev->pages_per_zspage != pages_per_zspage) 1218 if (prev->pages_per_zspage != pages_per_zspage)
@@ -969,166 +1225,13 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
969 return true; 1225 return true;
970} 1226}
971 1227
972#ifdef CONFIG_ZSMALLOC_STAT 1228static bool zspage_full(struct page *page)
973
974static inline void zs_stat_inc(struct size_class *class,
975 enum zs_stat_type type, unsigned long cnt)
976{
977 class->stats.objs[type] += cnt;
978}
979
980static inline void zs_stat_dec(struct size_class *class,
981 enum zs_stat_type type, unsigned long cnt)
982{
983 class->stats.objs[type] -= cnt;
984}
985
986static inline unsigned long zs_stat_get(struct size_class *class,
987 enum zs_stat_type type)
988{
989 return class->stats.objs[type];
990}
991
992static int __init zs_stat_init(void)
993{
994 if (!debugfs_initialized())
995 return -ENODEV;
996
997 zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
998 if (!zs_stat_root)
999 return -ENOMEM;
1000
1001 return 0;
1002}
1003
1004static void __exit zs_stat_exit(void)
1005{
1006 debugfs_remove_recursive(zs_stat_root);
1007}
1008
1009static int zs_stats_size_show(struct seq_file *s, void *v)
1010{ 1229{
1011 int i; 1230 BUG_ON(!is_first_page(page));
1012 struct zs_pool *pool = s->private;
1013 struct size_class *class;
1014 int objs_per_zspage;
1015 unsigned long obj_allocated, obj_used, pages_used;
1016 unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
1017
1018 seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
1019 "obj_allocated", "obj_used", "pages_used");
1020
1021 for (i = 0; i < zs_size_classes; i++) {
1022 class = pool->size_class[i];
1023
1024 if (class->index != i)
1025 continue;
1026
1027 spin_lock(&class->lock);
1028 obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
1029 obj_used = zs_stat_get(class, OBJ_USED);
1030 spin_unlock(&class->lock);
1031
1032 objs_per_zspage = get_maxobj_per_zspage(class->size,
1033 class->pages_per_zspage);
1034 pages_used = obj_allocated / objs_per_zspage *
1035 class->pages_per_zspage;
1036
1037 seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i,
1038 class->size, obj_allocated, obj_used, pages_used);
1039
1040 total_objs += obj_allocated;
1041 total_used_objs += obj_used;
1042 total_pages += pages_used;
1043 }
1044
1045 seq_puts(s, "\n");
1046 seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "",
1047 total_objs, total_used_objs, total_pages);
1048
1049 return 0;
1050}
1051
1052static int zs_stats_size_open(struct inode *inode, struct file *file)
1053{
1054 return single_open(file, zs_stats_size_show, inode->i_private);
1055}
1056
1057static const struct file_operations zs_stat_size_ops = {
1058 .open = zs_stats_size_open,
1059 .read = seq_read,
1060 .llseek = seq_lseek,
1061 .release = single_release,
1062};
1063
1064static int zs_pool_stat_create(char *name, struct zs_pool *pool)
1065{
1066 struct dentry *entry;
1067
1068 if (!zs_stat_root)
1069 return -ENODEV;
1070
1071 entry = debugfs_create_dir(name, zs_stat_root);
1072 if (!entry) {
1073 pr_warn("debugfs dir <%s> creation failed\n", name);
1074 return -ENOMEM;
1075 }
1076 pool->stat_dentry = entry;
1077
1078 entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
1079 pool->stat_dentry, pool, &zs_stat_size_ops);
1080 if (!entry) {
1081 pr_warn("%s: debugfs file entry <%s> creation failed\n",
1082 name, "obj_in_classes");
1083 return -ENOMEM;
1084 }
1085
1086 return 0;
1087}
1088
1089static void zs_pool_stat_destroy(struct zs_pool *pool)
1090{
1091 debugfs_remove_recursive(pool->stat_dentry);
1092}
1093
1094#else /* CONFIG_ZSMALLOC_STAT */
1095
1096static inline void zs_stat_inc(struct size_class *class,
1097 enum zs_stat_type type, unsigned long cnt)
1098{
1099}
1100
1101static inline void zs_stat_dec(struct size_class *class,
1102 enum zs_stat_type type, unsigned long cnt)
1103{
1104}
1105
1106static inline unsigned long zs_stat_get(struct size_class *class,
1107 enum zs_stat_type type)
1108{
1109 return 0;
1110}
1111
1112static int __init zs_stat_init(void)
1113{
1114 return 0;
1115}
1116
1117static void __exit zs_stat_exit(void)
1118{
1119}
1120
1121static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
1122{
1123 return 0;
1124}
1125 1231
1126static inline void zs_pool_stat_destroy(struct zs_pool *pool) 1232 return page->inuse == page->objects;
1127{
1128} 1233}
1129 1234
1130#endif
1131
1132unsigned long zs_get_total_pages(struct zs_pool *pool) 1235unsigned long zs_get_total_pages(struct zs_pool *pool)
1133{ 1236{
1134 return atomic_long_read(&pool->pages_allocated); 1237 return atomic_long_read(&pool->pages_allocated);
@@ -1153,13 +1256,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1153 enum zs_mapmode mm) 1256 enum zs_mapmode mm)
1154{ 1257{
1155 struct page *page; 1258 struct page *page;
1156 unsigned long obj_idx, off; 1259 unsigned long obj, obj_idx, off;
1157 1260
1158 unsigned int class_idx; 1261 unsigned int class_idx;
1159 enum fullness_group fg; 1262 enum fullness_group fg;
1160 struct size_class *class; 1263 struct size_class *class;
1161 struct mapping_area *area; 1264 struct mapping_area *area;
1162 struct page *pages[2]; 1265 struct page *pages[2];
1266 void *ret;
1163 1267
1164 BUG_ON(!handle); 1268 BUG_ON(!handle);
1165 1269
@@ -1170,7 +1274,11 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1170 */ 1274 */
1171 BUG_ON(in_interrupt()); 1275 BUG_ON(in_interrupt());
1172 1276
1173 obj_handle_to_location(handle, &page, &obj_idx); 1277 /* From now on, migration cannot move the object */
1278 pin_tag(handle);
1279
1280 obj = handle_to_obj(handle);
1281 obj_to_location(obj, &page, &obj_idx);
1174 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1282 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1175 class = pool->size_class[class_idx]; 1283 class = pool->size_class[class_idx];
1176 off = obj_idx_to_offset(page, obj_idx, class->size); 1284 off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1180,7 +1288,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1180 if (off + class->size <= PAGE_SIZE) { 1288 if (off + class->size <= PAGE_SIZE) {
1181 /* this object is contained entirely within a page */ 1289 /* this object is contained entirely within a page */
1182 area->vm_addr = kmap_atomic(page); 1290 area->vm_addr = kmap_atomic(page);
1183 return area->vm_addr + off; 1291 ret = area->vm_addr + off;
1292 goto out;
1184 } 1293 }
1185 1294
1186 /* this object spans two pages */ 1295 /* this object spans two pages */
@@ -1188,14 +1297,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
1188 pages[1] = get_next_page(page); 1297 pages[1] = get_next_page(page);
1189 BUG_ON(!pages[1]); 1298 BUG_ON(!pages[1]);
1190 1299
1191 return __zs_map_object(area, pages, off, class->size); 1300 ret = __zs_map_object(area, pages, off, class->size);
1301out:
1302 if (!class->huge)
1303 ret += ZS_HANDLE_SIZE;
1304
1305 return ret;
1192} 1306}
1193EXPORT_SYMBOL_GPL(zs_map_object); 1307EXPORT_SYMBOL_GPL(zs_map_object);
1194 1308
1195void zs_unmap_object(struct zs_pool *pool, unsigned long handle) 1309void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1196{ 1310{
1197 struct page *page; 1311 struct page *page;
1198 unsigned long obj_idx, off; 1312 unsigned long obj, obj_idx, off;
1199 1313
1200 unsigned int class_idx; 1314 unsigned int class_idx;
1201 enum fullness_group fg; 1315 enum fullness_group fg;
@@ -1204,7 +1318,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1204 1318
1205 BUG_ON(!handle); 1319 BUG_ON(!handle);
1206 1320
1207 obj_handle_to_location(handle, &page, &obj_idx); 1321 obj = handle_to_obj(handle);
1322 obj_to_location(obj, &page, &obj_idx);
1208 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1323 get_zspage_mapping(get_first_page(page), &class_idx, &fg);
1209 class = pool->size_class[class_idx]; 1324 class = pool->size_class[class_idx];
1210 off = obj_idx_to_offset(page, obj_idx, class->size); 1325 off = obj_idx_to_offset(page, obj_idx, class->size);
@@ -1222,9 +1337,42 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
1222 __zs_unmap_object(area, pages, off, class->size); 1337 __zs_unmap_object(area, pages, off, class->size);
1223 } 1338 }
1224 put_cpu_var(zs_map_area); 1339 put_cpu_var(zs_map_area);
1340 unpin_tag(handle);
1225} 1341}
1226EXPORT_SYMBOL_GPL(zs_unmap_object); 1342EXPORT_SYMBOL_GPL(zs_unmap_object);
1227 1343
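
Taken together, zs_malloc()/zs_map_object()/zs_unmap_object()/zs_free() now work purely in terms of the opaque handle, and the pointer returned by zs_map_object() already points past the embedded handle for non-huge classes, so callers only ever see their payload. A rough kernel-side sketch of how a zram-style user drives this API (the function name store_buf and the trimmed error handling are illustrative only, not code from this patch):

    #include <linux/errno.h>
    #include <linux/string.h>
    #include <linux/zsmalloc.h>

    static int store_buf(struct zs_pool *pool, const void *src, size_t len)
    {
            unsigned long handle;
            void *dst;

            handle = zs_malloc(pool, len);
            if (!handle)
                    return -ENOMEM;

            /* Pins the object: compaction cannot move it while it is mapped. */
            dst = zs_map_object(pool, handle, ZS_MM_WO);
            memcpy(dst, src, len);
            zs_unmap_object(pool, handle);

            /* ... keep @handle around; later: zs_free(pool, handle); ... */
            return 0;
    }
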
1344static unsigned long obj_malloc(struct page *first_page,
1345 struct size_class *class, unsigned long handle)
1346{
1347 unsigned long obj;
1348 struct link_free *link;
1349
1350 struct page *m_page;
1351 unsigned long m_objidx, m_offset;
1352 void *vaddr;
1353
1354 handle |= OBJ_ALLOCATED_TAG;
1355 obj = (unsigned long)first_page->freelist;
1356 obj_to_location(obj, &m_page, &m_objidx);
1357 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
1358
1359 vaddr = kmap_atomic(m_page);
1360 link = (struct link_free *)vaddr + m_offset / sizeof(*link);
1361 first_page->freelist = link->next;
1362 if (!class->huge)
1363 /* record handle in the header of allocated chunk */
1364 link->handle = handle;
1365 else
1366 /* record handle in first_page->private */
1367 set_page_private(first_page, handle);
1368 kunmap_atomic(vaddr);
1369 first_page->inuse++;
1370 zs_stat_inc(class, OBJ_USED, 1);
1371
1372 return obj;
1373}
1374
1375
1228/** 1376/**
1229 * zs_malloc - Allocate block of given size from pool. 1377 * zs_malloc - Allocate block of given size from pool.
1230 * @pool: pool to allocate from 1378 * @pool: pool to allocate from
@@ -1236,17 +1384,19 @@ EXPORT_SYMBOL_GPL(zs_unmap_object);
1236 */ 1384 */
1237unsigned long zs_malloc(struct zs_pool *pool, size_t size) 1385unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1238{ 1386{
1239 unsigned long obj; 1387 unsigned long handle, obj;
1240 struct link_free *link;
1241 struct size_class *class; 1388 struct size_class *class;
1242 void *vaddr; 1389 struct page *first_page;
1243
1244 struct page *first_page, *m_page;
1245 unsigned long m_objidx, m_offset;
1246 1390
1247 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) 1391 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
1248 return 0; 1392 return 0;
1249 1393
1394 handle = alloc_handle(pool);
1395 if (!handle)
1396 return 0;
1397
1398 /* extra space in chunk to keep the handle */
1399 size += ZS_HANDLE_SIZE;
1250 class = pool->size_class[get_size_class_index(size)]; 1400 class = pool->size_class[get_size_class_index(size)];
1251 1401
1252 spin_lock(&class->lock); 1402 spin_lock(&class->lock);
@@ -1255,8 +1405,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1255 if (!first_page) { 1405 if (!first_page) {
1256 spin_unlock(&class->lock); 1406 spin_unlock(&class->lock);
1257 first_page = alloc_zspage(class, pool->flags); 1407 first_page = alloc_zspage(class, pool->flags);
1258 if (unlikely(!first_page)) 1408 if (unlikely(!first_page)) {
1409 free_handle(pool, handle);
1259 return 0; 1410 return 0;
1411 }
1260 1412
1261 set_zspage_mapping(first_page, class->index, ZS_EMPTY); 1413 set_zspage_mapping(first_page, class->index, ZS_EMPTY);
1262 atomic_long_add(class->pages_per_zspage, 1414 atomic_long_add(class->pages_per_zspage,
@@ -1267,73 +1419,360 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1267 class->size, class->pages_per_zspage)); 1419 class->size, class->pages_per_zspage));
1268 } 1420 }
1269 1421
1270 obj = (unsigned long)first_page->freelist; 1422 obj = obj_malloc(first_page, class, handle);
1271 obj_handle_to_location(obj, &m_page, &m_objidx);
1272 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);
1273
1274 vaddr = kmap_atomic(m_page);
1275 link = (struct link_free *)vaddr + m_offset / sizeof(*link);
1276 first_page->freelist = link->next;
1277 memset(link, POISON_INUSE, sizeof(*link));
1278 kunmap_atomic(vaddr);
1279
1280 first_page->inuse++;
1281 zs_stat_inc(class, OBJ_USED, 1);
1282 /* Now move the zspage to another fullness group, if required */ 1423 /* Now move the zspage to another fullness group, if required */
1283 fix_fullness_group(pool, first_page); 1424 fix_fullness_group(class, first_page);
1425 record_obj(handle, obj);
1284 spin_unlock(&class->lock); 1426 spin_unlock(&class->lock);
1285 1427
1286 return obj; 1428 return handle;
1287} 1429}
1288EXPORT_SYMBOL_GPL(zs_malloc); 1430EXPORT_SYMBOL_GPL(zs_malloc);
1289 1431
1290void zs_free(struct zs_pool *pool, unsigned long obj) 1432static void obj_free(struct zs_pool *pool, struct size_class *class,
1433 unsigned long obj)
1291{ 1434{
1292 struct link_free *link; 1435 struct link_free *link;
1293 struct page *first_page, *f_page; 1436 struct page *first_page, *f_page;
1294 unsigned long f_objidx, f_offset; 1437 unsigned long f_objidx, f_offset;
1295 void *vaddr; 1438 void *vaddr;
1296
1297 int class_idx; 1439 int class_idx;
1298 struct size_class *class;
1299 enum fullness_group fullness; 1440 enum fullness_group fullness;
1300 1441
1301 if (unlikely(!obj)) 1442 BUG_ON(!obj);
1302 return;
1303 1443
1304 obj_handle_to_location(obj, &f_page, &f_objidx); 1444 obj &= ~OBJ_ALLOCATED_TAG;
1445 obj_to_location(obj, &f_page, &f_objidx);
1305 first_page = get_first_page(f_page); 1446 first_page = get_first_page(f_page);
1306 1447
1307 get_zspage_mapping(first_page, &class_idx, &fullness); 1448 get_zspage_mapping(first_page, &class_idx, &fullness);
1308 class = pool->size_class[class_idx];
1309 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); 1449 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);
1310 1450
1311 spin_lock(&class->lock); 1451 vaddr = kmap_atomic(f_page);
1312 1452
1313 /* Insert this object in containing zspage's freelist */ 1453 /* Insert this object in containing zspage's freelist */
1314 vaddr = kmap_atomic(f_page);
1315 link = (struct link_free *)(vaddr + f_offset); 1454 link = (struct link_free *)(vaddr + f_offset);
1316 link->next = first_page->freelist; 1455 link->next = first_page->freelist;
1456 if (class->huge)
1457 set_page_private(first_page, 0);
1317 kunmap_atomic(vaddr); 1458 kunmap_atomic(vaddr);
1318 first_page->freelist = (void *)obj; 1459 first_page->freelist = (void *)obj;
1319
1320 first_page->inuse--; 1460 first_page->inuse--;
1321 fullness = fix_fullness_group(pool, first_page);
1322
1323 zs_stat_dec(class, OBJ_USED, 1); 1461 zs_stat_dec(class, OBJ_USED, 1);
1324 if (fullness == ZS_EMPTY) 1462}
1463
1464void zs_free(struct zs_pool *pool, unsigned long handle)
1465{
1466 struct page *first_page, *f_page;
1467 unsigned long obj, f_objidx;
1468 int class_idx;
1469 struct size_class *class;
1470 enum fullness_group fullness;
1471
1472 if (unlikely(!handle))
1473 return;
1474
1475 pin_tag(handle);
1476 obj = handle_to_obj(handle);
1477 obj_to_location(obj, &f_page, &f_objidx);
1478 first_page = get_first_page(f_page);
1479
1480 get_zspage_mapping(first_page, &class_idx, &fullness);
1481 class = pool->size_class[class_idx];
1482
1483 spin_lock(&class->lock);
1484 obj_free(pool, class, obj);
1485 fullness = fix_fullness_group(class, first_page);
1486 if (fullness == ZS_EMPTY) {
1325 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1487 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1326 class->size, class->pages_per_zspage)); 1488 class->size, class->pages_per_zspage));
1327 1489 atomic_long_sub(class->pages_per_zspage,
1490 &pool->pages_allocated);
1491 free_zspage(first_page);
1492 }
1328 spin_unlock(&class->lock); 1493 spin_unlock(&class->lock);
1494 unpin_tag(handle);
1495
1496 free_handle(pool, handle);
1497}
1498EXPORT_SYMBOL_GPL(zs_free);
1499
1500static void zs_object_copy(unsigned long src, unsigned long dst,
1501 struct size_class *class)
1502{
1503 struct page *s_page, *d_page;
1504 unsigned long s_objidx, d_objidx;
1505 unsigned long s_off, d_off;
1506 void *s_addr, *d_addr;
1507 int s_size, d_size, size;
1508 int written = 0;
1509
1510 s_size = d_size = class->size;
1511
1512 obj_to_location(src, &s_page, &s_objidx);
1513 obj_to_location(dst, &d_page, &d_objidx);
1514
1515 s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
1516 d_off = obj_idx_to_offset(d_page, d_objidx, class->size);
1517
1518 if (s_off + class->size > PAGE_SIZE)
1519 s_size = PAGE_SIZE - s_off;
1520
1521 if (d_off + class->size > PAGE_SIZE)
1522 d_size = PAGE_SIZE - d_off;
1523
1524 s_addr = kmap_atomic(s_page);
1525 d_addr = kmap_atomic(d_page);
1526
1527 while (1) {
1528 size = min(s_size, d_size);
1529 memcpy(d_addr + d_off, s_addr + s_off, size);
1530 written += size;
1531
1532 if (written == class->size)
1533 break;
1534
1535 s_off += size;
1536 s_size -= size;
1537 d_off += size;
1538 d_size -= size;
1539
1540 if (s_off >= PAGE_SIZE) {
1541 kunmap_atomic(d_addr);
1542 kunmap_atomic(s_addr);
1543 s_page = get_next_page(s_page);
1544 BUG_ON(!s_page);
1545 s_addr = kmap_atomic(s_page);
1546 d_addr = kmap_atomic(d_page);
1547 s_size = class->size - written;
1548 s_off = 0;
1549 }
1550
1551 if (d_off >= PAGE_SIZE) {
1552 kunmap_atomic(d_addr);
1553 d_page = get_next_page(d_page);
1554 BUG_ON(!d_page);
1555 d_addr = kmap_atomic(d_page);
1556 d_size = class->size - written;
1557 d_off = 0;
1558 }
1559 }
1560
1561 kunmap_atomic(d_addr);
1562 kunmap_atomic(s_addr);
1563}
1564
1565/*
1566 * Find an allocated object in the zspage, starting the search at the
1567 * given object index, and return its handle.
1568 */
1569static unsigned long find_alloced_obj(struct page *page, int index,
1570 struct size_class *class)
1571{
1572 unsigned long head;
1573 int offset = 0;
1574 unsigned long handle = 0;
1575 void *addr = kmap_atomic(page);
1576
1577 if (!is_first_page(page))
1578 offset = page->index;
1579 offset += class->size * index;
1580
1581 while (offset < PAGE_SIZE) {
1582 head = obj_to_head(class, page, addr + offset);
1583 if (head & OBJ_ALLOCATED_TAG) {
1584 handle = head & ~OBJ_ALLOCATED_TAG;
1585 if (trypin_tag(handle))
1586 break;
1587 handle = 0;
1588 }
1589
1590 offset += class->size;
1591 index++;
1592 }
1593
1594 kunmap_atomic(addr);
1595 return handle;
1596}
1597
1598struct zs_compact_control {
1599 /* Source page for migration which could be a subpage of zspage. */
1600 struct page *s_page;
1601 /* Destination page for migration, which must be the first page
1602 * of a zspage. */
1603 struct page *d_page;
1604 /* Starting object index within @s_page from which to search for
1605 * live objects in the subpage. */
1606 int index;
1607 /* how many of objects are migrated */
1608 int nr_migrated;
1609};
1610
1611static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
1612 struct zs_compact_control *cc)
1613{
1614 unsigned long used_obj, free_obj;
1615 unsigned long handle;
1616 struct page *s_page = cc->s_page;
1617 struct page *d_page = cc->d_page;
1618 unsigned long index = cc->index;
1619 int nr_migrated = 0;
1620 int ret = 0;
1621
1622 while (1) {
1623 handle = find_alloced_obj(s_page, index, class);
1624 if (!handle) {
1625 s_page = get_next_page(s_page);
1626 if (!s_page)
1627 break;
1628 index = 0;
1629 continue;
1630 }
1631
1632 /* Stop if there is no more space */
1633 if (zspage_full(d_page)) {
1634 unpin_tag(handle);
1635 ret = -ENOMEM;
1636 break;
1637 }
1638
1639 used_obj = handle_to_obj(handle);
1640 free_obj = obj_malloc(d_page, class, handle);
1641 zs_object_copy(used_obj, free_obj, class);
1642 index++;
1643 record_obj(handle, free_obj);
1644 unpin_tag(handle);
1645 obj_free(pool, class, used_obj);
1646 nr_migrated++;
1647 }
1648
1649 /* Remember last position in this iteration */
1650 cc->s_page = s_page;
1651 cc->index = index;
1652 cc->nr_migrated = nr_migrated;
1653
1654 return ret;
1655}
1656
1657static struct page *alloc_target_page(struct size_class *class)
1658{
1659 int i;
1660 struct page *page;
1661
1662 for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
1663 page = class->fullness_list[i];
1664 if (page) {
1665 remove_zspage(page, class, i);
1666 break;
1667 }
1668 }
1669
1670 return page;
1671}
1672
1673static void putback_zspage(struct zs_pool *pool, struct size_class *class,
1674 struct page *first_page)
1675{
1676 enum fullness_group fullness;
1677
1678 BUG_ON(!is_first_page(first_page));
1679
1680 fullness = get_fullness_group(first_page);
1681 insert_zspage(first_page, class, fullness);
1682 set_zspage_mapping(first_page, class->index, fullness);
1329 1683
1330 if (fullness == ZS_EMPTY) { 1684 if (fullness == ZS_EMPTY) {
1685 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1686 class->size, class->pages_per_zspage));
1331 atomic_long_sub(class->pages_per_zspage, 1687 atomic_long_sub(class->pages_per_zspage,
1332 &pool->pages_allocated); 1688 &pool->pages_allocated);
1689
1333 free_zspage(first_page); 1690 free_zspage(first_page);
1334 } 1691 }
1335} 1692}
1336EXPORT_SYMBOL_GPL(zs_free); 1693
1694static struct page *isolate_source_page(struct size_class *class)
1695{
1696 struct page *page;
1697
1698 page = class->fullness_list[ZS_ALMOST_EMPTY];
1699 if (page)
1700 remove_zspage(page, class, ZS_ALMOST_EMPTY);
1701
1702 return page;
1703}
1704
1705static unsigned long __zs_compact(struct zs_pool *pool,
1706 struct size_class *class)
1707{
1708 int nr_to_migrate;
1709 struct zs_compact_control cc;
1710 struct page *src_page;
1711 struct page *dst_page = NULL;
1712 unsigned long nr_total_migrated = 0;
1713
1714 spin_lock(&class->lock);
1715 while ((src_page = isolate_source_page(class))) {
1716
1717 BUG_ON(!is_first_page(src_page));
1718
1719 /* The goal is to migrate all live objects in source page */
1720 nr_to_migrate = src_page->inuse;
1721 cc.index = 0;
1722 cc.s_page = src_page;
1723
1724 while ((dst_page = alloc_target_page(class))) {
1725 cc.d_page = dst_page;
1726 /*
1727 * If there is no more space in dst_page, try to
1728 * allocate another zspage.
1729 */
1730 if (!migrate_zspage(pool, class, &cc))
1731 break;
1732
1733 putback_zspage(pool, class, dst_page);
1734 nr_total_migrated += cc.nr_migrated;
1735 nr_to_migrate -= cc.nr_migrated;
1736 }
1737
1738 /* Stop if we couldn't find slot */
1739 if (dst_page == NULL)
1740 break;
1741
1742 putback_zspage(pool, class, dst_page);
1743 putback_zspage(pool, class, src_page);
1744 spin_unlock(&class->lock);
1745 nr_total_migrated += cc.nr_migrated;
1746 cond_resched();
1747 spin_lock(&class->lock);
1748 }
1749
1750 if (src_page)
1751 putback_zspage(pool, class, src_page);
1752
1753 spin_unlock(&class->lock);
1754
1755 return nr_total_migrated;
1756}
1757
1758unsigned long zs_compact(struct zs_pool *pool)
1759{
1760 int i;
1761 unsigned long nr_migrated = 0;
1762 struct size_class *class;
1763
1764 for (i = zs_size_classes - 1; i >= 0; i--) {
1765 class = pool->size_class[i];
1766 if (!class)
1767 continue;
1768 if (class->index != i)
1769 continue;
1770 nr_migrated += __zs_compact(pool, class);
1771 }
1772
1773 return nr_migrated;
1774}
1775EXPORT_SYMBOL_GPL(zs_compact);
1337 1776
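
zs_compact() is the new entry point that ties all of this together: it walks the size classes from largest to smallest, repeatedly isolating an almost-empty source zspage and migrating its live objects into pages drawn from the other fullness lists, freeing any source zspage that ends up empty. The return value is the number of objects migrated, so a caller simply invokes zs_compact(pool) and can report the returned count.
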
1338/** 1777/**
1339 * zs_create_pool - Creates an allocation pool to work from. 1778 * zs_create_pool - Creates an allocation pool to work from.
@@ -1355,20 +1794,20 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
1355 if (!pool) 1794 if (!pool)
1356 return NULL; 1795 return NULL;
1357 1796
1358 pool->name = kstrdup(name, GFP_KERNEL);
1359 if (!pool->name) {
1360 kfree(pool);
1361 return NULL;
1362 }
1363
1364 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), 1797 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
1365 GFP_KERNEL); 1798 GFP_KERNEL);
1366 if (!pool->size_class) { 1799 if (!pool->size_class) {
1367 kfree(pool->name);
1368 kfree(pool); 1800 kfree(pool);
1369 return NULL; 1801 return NULL;
1370 } 1802 }
1371 1803
1804 pool->name = kstrdup(name, GFP_KERNEL);
1805 if (!pool->name)
1806 goto err;
1807
1808 if (create_handle_cache(pool))
1809 goto err;
1810
1372 /* 1811 /*
1373 * Iterate reversly, because, size of size_class that we want to use 1812 * Iterate reversly, because, size of size_class that we want to use
1374 * for merging should be larger or equal to current size. 1813 * for merging should be larger or equal to current size.
@@ -1406,6 +1845,9 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags)
1406 class->size = size; 1845 class->size = size;
1407 class->index = i; 1846 class->index = i;
1408 class->pages_per_zspage = pages_per_zspage; 1847 class->pages_per_zspage = pages_per_zspage;
1848 if (pages_per_zspage == 1 &&
1849 get_maxobj_per_zspage(size, pages_per_zspage) == 1)
1850 class->huge = true;
1409 spin_lock_init(&class->lock); 1851 spin_lock_init(&class->lock);
1410 pool->size_class[i] = class; 1852 pool->size_class[i] = class;
1411 1853
@@ -1450,6 +1892,7 @@ void zs_destroy_pool(struct zs_pool *pool)
1450 kfree(class); 1892 kfree(class);
1451 } 1893 }
1452 1894
1895 destroy_handle_cache(pool);
1453 kfree(pool->size_class); 1896 kfree(pool->size_class);
1454 kfree(pool->name); 1897 kfree(pool->name);
1455 kfree(pool); 1898 kfree(pool);