Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 11
-rw-r--r--  mm/Kconfig.debug | 9
-rw-r--r--  mm/Makefile | 6
-rw-r--r--  mm/backing-dev.c | 107
-rw-r--r--  mm/cleancache.c | 2
-rw-r--r--  mm/cma.c | 2
-rw-r--r--  mm/compaction.c | 181
-rw-r--r--  mm/debug.c | 4
-rw-r--r--  mm/fadvise.c | 4
-rw-r--r--  mm/filemap.c | 5
-rw-r--r--  mm/filemap_xip.c | 4
-rw-r--r--  mm/fremap.c | 283
-rw-r--r--  mm/gup.c | 244
-rw-r--r--  mm/huge_memory.c | 156
-rw-r--r--  mm/hugetlb.c | 160
-rw-r--r--  mm/hugetlb_cgroup.c | 2
-rw-r--r--  mm/internal.h | 28
-rw-r--r--  mm/interval_tree.c | 34
-rw-r--r--  mm/kasan/Makefile | 8
-rw-r--r--  mm/kasan/kasan.c | 516
-rw-r--r--  mm/kasan/kasan.h | 75
-rw-r--r--  mm/kasan/report.c | 269
-rw-r--r--  mm/kmemleak.c | 6
-rw-r--r--  mm/ksm.c | 4
-rw-r--r--  mm/list_lru.c | 467
-rw-r--r--  mm/madvise.c | 30
-rw-r--r--  mm/memcontrol.c | 1096
-rw-r--r--  mm/memory-failure.c | 13
-rw-r--r--  mm/memory.c | 330
-rw-r--r--  mm/mempolicy.c | 286
-rw-r--r--  mm/migrate.c | 45
-rw-r--r--  mm/mincore.c | 175
-rw-r--r--  mm/mm_init.c | 4
-rw-r--r--  mm/mmap.c | 113
-rw-r--r--  mm/mmzone.c | 4
-rw-r--r--  mm/mprotect.c | 50
-rw-r--r--  mm/mremap.c | 2
-rw-r--r--  mm/msync.c | 5
-rw-r--r--  mm/nommu.c | 115
-rw-r--r--  mm/oom_kill.c | 169
-rw-r--r--  mm/page-writeback.c | 89
-rw-r--r--  mm/page_alloc.c | 526
-rw-r--r--  mm/page_counter.c | 7
-rw-r--r--  mm/page_owner.c | 26
-rw-r--r--  mm/pagewalk.c | 235
-rw-r--r--  mm/percpu.c | 6
-rw-r--r--  mm/pgtable-generic.c | 2
-rw-r--r--  mm/process_vm_access.c | 7
-rw-r--r--  mm/readahead.c | 4
-rw-r--r--  mm/rmap.c | 279
-rw-r--r--  mm/shmem.c | 29
-rw-r--r--  mm/slab.c | 17
-rw-r--r--  mm/slab.h | 67
-rw-r--r--  mm/slab_common.c | 323
-rw-r--r--  mm/slob.c | 2
-rw-r--r--  mm/slub.c | 232
-rw-r--r--  mm/swap.c | 6
-rw-r--r--  mm/swap_state.c | 6
-rw-r--r--  mm/truncate.c | 2
-rw-r--r--  mm/util.c | 48
-rw-r--r--  mm/vmalloc.c | 16
-rw-r--r--  mm/vmscan.c | 147
-rw-r--r--  mm/vmstat.c | 130
-rw-r--r--  mm/workingset.c | 9
-rw-r--r--  mm/zbud.c | 3
-rw-r--r--  mm/zpool.c | 6
-rw-r--r--  mm/zsmalloc.c | 239
-rw-r--r--  mm/zswap.c | 5
68 files changed, 4573 insertions, 2919 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 1d1ae6b078fd..de5239c152f9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -325,6 +325,7 @@ config VIRT_TO_BUS
325 325
326config MMU_NOTIFIER 326config MMU_NOTIFIER
327 bool 327 bool
328 select SRCU
328 329
329config KSM 330config KSM
330 bool "Enable KSM for page merging" 331 bool "Enable KSM for page merging"
@@ -601,6 +602,16 @@ config PGTABLE_MAPPING
601 You can check speed with zsmalloc benchmark: 602 You can check speed with zsmalloc benchmark:
602 https://github.com/spartacus06/zsmapbench 603 https://github.com/spartacus06/zsmapbench
603 604
605config ZSMALLOC_STAT
606 bool "Export zsmalloc statistics"
607 depends on ZSMALLOC
608 select DEBUG_FS
609 help
610 This option enables code in the zsmalloc to collect various
611 statistics about whats happening in zsmalloc and exports that
612 information to userspace via debugfs.
613 If unsure, say N.
614
604config GENERIC_EARLY_IOREMAP 615config GENERIC_EARLY_IOREMAP
605 bool 616 bool
606 617
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 56badfc4810a..957d3da53ddd 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -14,7 +14,6 @@ config DEBUG_PAGEALLOC
14 depends on !KMEMCHECK 14 depends on !KMEMCHECK
15 select PAGE_EXTENSION 15 select PAGE_EXTENSION
16 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC 16 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
17 select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
18 ---help--- 17 ---help---
19 Unmap pages from the kernel linear mapping after free_pages(). 18 Unmap pages from the kernel linear mapping after free_pages().
20 This results in a large slowdown, but helps to find certain types 19 This results in a large slowdown, but helps to find certain types
@@ -27,13 +26,5 @@ config DEBUG_PAGEALLOC
27 that would result in incorrect warnings of memory corruption after 26 that would result in incorrect warnings of memory corruption after
28 a resume because free pages are not saved to the suspend image. 27 a resume because free pages are not saved to the suspend image.
29 28
30config WANT_PAGE_DEBUG_FLAGS
31 bool
32
33config PAGE_POISONING 29config PAGE_POISONING
34 bool 30 bool
35 select WANT_PAGE_DEBUG_FLAGS
36
37config PAGE_GUARD
38 bool
39 select WANT_PAGE_DEBUG_FLAGS
diff --git a/mm/Makefile b/mm/Makefile
index 4bf586e66378..088c68e9ec35 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -2,8 +2,11 @@
2# Makefile for the linux memory manager. 2# Makefile for the linux memory manager.
3# 3#
4 4
5KASAN_SANITIZE_slab_common.o := n
6KASAN_SANITIZE_slub.o := n
7
5mmu-y := nommu.o 8mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ 9mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 10 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o pgtable-generic.o 11 vmalloc.o pagewalk.o pgtable-generic.o
9 12
@@ -49,6 +52,7 @@ obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
49obj-$(CONFIG_SLAB) += slab.o 52obj-$(CONFIG_SLAB) += slab.o
50obj-$(CONFIG_SLUB) += slub.o 53obj-$(CONFIG_SLUB) += slub.o
51obj-$(CONFIG_KMEMCHECK) += kmemcheck.o 54obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
55obj-$(CONFIG_KASAN) += kasan/
52obj-$(CONFIG_FAILSLAB) += failslab.o 56obj-$(CONFIG_FAILSLAB) += failslab.o
53obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 57obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
54obj-$(CONFIG_FS_XIP) += filemap_xip.o 58obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0ae0df55000b..7690ec77c722 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,19 +14,10 @@
14 14
15static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 15static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
16 16
17struct backing_dev_info default_backing_dev_info = {
18 .name = "default",
19 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
20 .state = 0,
21 .capabilities = BDI_CAP_MAP_COPY,
22};
23EXPORT_SYMBOL_GPL(default_backing_dev_info);
24
25struct backing_dev_info noop_backing_dev_info = { 17struct backing_dev_info noop_backing_dev_info = {
26 .name = "noop", 18 .name = "noop",
27 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 19 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
28}; 20};
29EXPORT_SYMBOL_GPL(noop_backing_dev_info);
30 21
31static struct class *bdi_class; 22static struct class *bdi_class;
32 23
@@ -40,17 +31,6 @@ LIST_HEAD(bdi_list);
40/* bdi_wq serves all asynchronous writeback tasks */ 31/* bdi_wq serves all asynchronous writeback tasks */
41struct workqueue_struct *bdi_wq; 32struct workqueue_struct *bdi_wq;
42 33
43static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
44{
45 if (wb1 < wb2) {
46 spin_lock(&wb1->list_lock);
47 spin_lock_nested(&wb2->list_lock, 1);
48 } else {
49 spin_lock(&wb2->list_lock);
50 spin_lock_nested(&wb1->list_lock, 1);
51 }
52}
53
54#ifdef CONFIG_DEBUG_FS 34#ifdef CONFIG_DEBUG_FS
55#include <linux/debugfs.h> 35#include <linux/debugfs.h>
56#include <linux/seq_file.h> 36#include <linux/seq_file.h>
@@ -264,9 +244,6 @@ static int __init default_bdi_init(void)
264 if (!bdi_wq) 244 if (!bdi_wq)
265 return -ENOMEM; 245 return -ENOMEM;
266 246
267 err = bdi_init(&default_backing_dev_info);
268 if (!err)
269 bdi_register(&default_backing_dev_info, NULL, "default");
270 err = bdi_init(&noop_backing_dev_info); 247 err = bdi_init(&noop_backing_dev_info);
271 248
272 return err; 249 return err;
@@ -355,19 +332,19 @@ EXPORT_SYMBOL(bdi_register_dev);
355 */ 332 */
356static void bdi_wb_shutdown(struct backing_dev_info *bdi) 333static void bdi_wb_shutdown(struct backing_dev_info *bdi)
357{ 334{
358 if (!bdi_cap_writeback_dirty(bdi)) 335 /* Make sure nobody queues further work */
336 spin_lock_bh(&bdi->wb_lock);
337 if (!test_and_clear_bit(BDI_registered, &bdi->state)) {
338 spin_unlock_bh(&bdi->wb_lock);
359 return; 339 return;
340 }
341 spin_unlock_bh(&bdi->wb_lock);
360 342
361 /* 343 /*
362 * Make sure nobody finds us on the bdi_list anymore 344 * Make sure nobody finds us on the bdi_list anymore
363 */ 345 */
364 bdi_remove_from_list(bdi); 346 bdi_remove_from_list(bdi);
365 347
366 /* Make sure nobody queues further work */
367 spin_lock_bh(&bdi->wb_lock);
368 clear_bit(BDI_registered, &bdi->state);
369 spin_unlock_bh(&bdi->wb_lock);
370
371 /* 348 /*
372 * Drain work list and shutdown the delayed_work. At this point, 349 * Drain work list and shutdown the delayed_work. At this point,
373 * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi 350 * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi
@@ -375,37 +352,22 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
375 */ 352 */
376 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); 353 mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
377 flush_delayed_work(&bdi->wb.dwork); 354 flush_delayed_work(&bdi->wb.dwork);
378 WARN_ON(!list_empty(&bdi->work_list));
379 WARN_ON(delayed_work_pending(&bdi->wb.dwork));
380} 355}
381 356
382/* 357/*
383 * This bdi is going away now, make sure that no super_blocks point to it 358 * Called when the device behind @bdi has been removed or ejected.
359 *
360 * We can't really do much here except for reducing the dirty ratio at
361 * the moment. In the future we should be able to set a flag so that
362 * the filesystem can handle errors at mark_inode_dirty time instead
363 * of only at writeback time.
384 */ 364 */
385static void bdi_prune_sb(struct backing_dev_info *bdi)
386{
387 struct super_block *sb;
388
389 spin_lock(&sb_lock);
390 list_for_each_entry(sb, &super_blocks, s_list) {
391 if (sb->s_bdi == bdi)
392 sb->s_bdi = &default_backing_dev_info;
393 }
394 spin_unlock(&sb_lock);
395}
396
397void bdi_unregister(struct backing_dev_info *bdi) 365void bdi_unregister(struct backing_dev_info *bdi)
398{ 366{
399 if (bdi->dev) { 367 if (WARN_ON_ONCE(!bdi->dev))
400 bdi_set_min_ratio(bdi, 0); 368 return;
401 trace_writeback_bdi_unregister(bdi);
402 bdi_prune_sb(bdi);
403 369
404 bdi_wb_shutdown(bdi); 370 bdi_set_min_ratio(bdi, 0);
405 bdi_debug_unregister(bdi);
406 device_unregister(bdi->dev);
407 bdi->dev = NULL;
408 }
409} 371}
410EXPORT_SYMBOL(bdi_unregister); 372EXPORT_SYMBOL(bdi_unregister);
411 373
@@ -474,37 +436,19 @@ void bdi_destroy(struct backing_dev_info *bdi)
474{ 436{
475 int i; 437 int i;
476 438
477 /* 439 bdi_wb_shutdown(bdi);
478 * Splice our entries to the default_backing_dev_info. This
479 * condition shouldn't happen. @wb must be empty at this point and
480 * dirty inodes on it might cause other issues. This workaround is
481 * added by ce5f8e779519 ("writeback: splice dirty inode entries to
482 * default bdi on bdi_destroy()") without root-causing the issue.
483 *
484 * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com
485 * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350
486 *
487 * We should probably add WARN_ON() to find out whether it still
488 * happens and track it down if so.
489 */
490 if (bdi_has_dirty_io(bdi)) {
491 struct bdi_writeback *dst = &default_backing_dev_info.wb;
492
493 bdi_lock_two(&bdi->wb, dst);
494 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
495 list_splice(&bdi->wb.b_io, &dst->b_io);
496 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
497 spin_unlock(&bdi->wb.list_lock);
498 spin_unlock(&dst->list_lock);
499 }
500
501 bdi_unregister(bdi);
502 440
441 WARN_ON(!list_empty(&bdi->work_list));
503 WARN_ON(delayed_work_pending(&bdi->wb.dwork)); 442 WARN_ON(delayed_work_pending(&bdi->wb.dwork));
504 443
444 if (bdi->dev) {
445 bdi_debug_unregister(bdi);
446 device_unregister(bdi->dev);
447 bdi->dev = NULL;
448 }
449
505 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 450 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
506 percpu_counter_destroy(&bdi->bdi_stat[i]); 451 percpu_counter_destroy(&bdi->bdi_stat[i]);
507
508 fprop_local_destroy_percpu(&bdi->completions); 452 fprop_local_destroy_percpu(&bdi->completions);
509} 453}
510EXPORT_SYMBOL(bdi_destroy); 454EXPORT_SYMBOL(bdi_destroy);
@@ -513,13 +457,12 @@ EXPORT_SYMBOL(bdi_destroy);
513 * For use from filesystems to quickly init and register a bdi associated 457 * For use from filesystems to quickly init and register a bdi associated
514 * with dirty writeback 458 * with dirty writeback
515 */ 459 */
516int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, 460int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
517 unsigned int cap)
518{ 461{
519 int err; 462 int err;
520 463
521 bdi->name = name; 464 bdi->name = name;
522 bdi->capabilities = cap; 465 bdi->capabilities = 0;
523 err = bdi_init(bdi); 466 err = bdi_init(bdi);
524 if (err) 467 if (err)
525 return err; 468 return err;
diff --git a/mm/cleancache.c b/mm/cleancache.c
index d0eac4350403..053bcd8f12fb 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -25,7 +25,7 @@
25static struct cleancache_ops *cleancache_ops __read_mostly; 25static struct cleancache_ops *cleancache_ops __read_mostly;
26 26
27/* 27/*
28 * Counters available via /sys/kernel/debug/frontswap (if debugfs is 28 * Counters available via /sys/kernel/debug/cleancache (if debugfs is
29 * properly configured. These are for information only so are not protected 29 * properly configured. These are for information only so are not protected
30 * against increment races. 30 * against increment races.
31 */ 31 */
diff --git a/mm/cma.c b/mm/cma.c
index a85ae28709a3..75016fd1de90 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -199,6 +199,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
199 cma->order_per_bit = order_per_bit; 199 cma->order_per_bit = order_per_bit;
200 *res_cma = cma; 200 *res_cma = cma;
201 cma_area_count++; 201 cma_area_count++;
202 totalcma_pages += (size / PAGE_SIZE);
202 203
203 return 0; 204 return 0;
204} 205}
@@ -337,7 +338,6 @@ int __init cma_declare_contiguous(phys_addr_t base,
337 if (ret) 338 if (ret)
338 goto err; 339 goto err;
339 340
340 totalcma_pages += (size / PAGE_SIZE);
341 pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, 341 pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
342 &base); 342 &base);
343 return 0; 343 return 0;
diff --git a/mm/compaction.c b/mm/compaction.c
index 546e571e9d60..8c0d9459b54a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,7 @@
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include <linux/balloon_compaction.h> 17#include <linux/balloon_compaction.h>
18#include <linux/page-isolation.h> 18#include <linux/page-isolation.h>
19#include <linux/kasan.h>
19#include "internal.h" 20#include "internal.h"
20 21
21#ifdef CONFIG_COMPACTION 22#ifdef CONFIG_COMPACTION
@@ -34,6 +35,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
34#endif 35#endif
35 36
36#if defined CONFIG_COMPACTION || defined CONFIG_CMA 37#if defined CONFIG_COMPACTION || defined CONFIG_CMA
38#ifdef CONFIG_TRACEPOINTS
39static const char *const compaction_status_string[] = {
40 "deferred",
41 "skipped",
42 "continue",
43 "partial",
44 "complete",
45 "no_suitable_page",
46 "not_suitable_zone",
47};
48#endif
37 49
38#define CREATE_TRACE_POINTS 50#define CREATE_TRACE_POINTS
39#include <trace/events/compaction.h> 51#include <trace/events/compaction.h>
@@ -61,6 +73,7 @@ static void map_pages(struct list_head *list)
61 list_for_each_entry(page, list, lru) { 73 list_for_each_entry(page, list, lru) {
62 arch_alloc_page(page, 0); 74 arch_alloc_page(page, 0);
63 kernel_map_pages(page, 1, 1); 75 kernel_map_pages(page, 1, 1);
76 kasan_alloc_pages(page, 0);
64 } 77 }
65} 78}
66 79
@@ -113,6 +126,77 @@ static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
113} 126}
114 127
115#ifdef CONFIG_COMPACTION 128#ifdef CONFIG_COMPACTION
129
130/* Do not skip compaction more than 64 times */
131#define COMPACT_MAX_DEFER_SHIFT 6
132
133/*
134 * Compaction is deferred when compaction fails to result in a page
135 * allocation success. 1 << compact_defer_limit compactions are skipped up
136 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
137 */
138void defer_compaction(struct zone *zone, int order)
139{
140 zone->compact_considered = 0;
141 zone->compact_defer_shift++;
142
143 if (order < zone->compact_order_failed)
144 zone->compact_order_failed = order;
145
146 if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
147 zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
148
149 trace_mm_compaction_defer_compaction(zone, order);
150}
151
152/* Returns true if compaction should be skipped this time */
153bool compaction_deferred(struct zone *zone, int order)
154{
155 unsigned long defer_limit = 1UL << zone->compact_defer_shift;
156
157 if (order < zone->compact_order_failed)
158 return false;
159
160 /* Avoid possible overflow */
161 if (++zone->compact_considered > defer_limit)
162 zone->compact_considered = defer_limit;
163
164 if (zone->compact_considered >= defer_limit)
165 return false;
166
167 trace_mm_compaction_deferred(zone, order);
168
169 return true;
170}
171
172/*
173 * Update defer tracking counters after successful compaction of given order,
174 * which means an allocation either succeeded (alloc_success == true) or is
175 * expected to succeed.
176 */
177void compaction_defer_reset(struct zone *zone, int order,
178 bool alloc_success)
179{
180 if (alloc_success) {
181 zone->compact_considered = 0;
182 zone->compact_defer_shift = 0;
183 }
184 if (order >= zone->compact_order_failed)
185 zone->compact_order_failed = order + 1;
186
187 trace_mm_compaction_defer_reset(zone, order);
188}
189
190/* Returns true if restarting compaction after many failures */
191bool compaction_restarting(struct zone *zone, int order)
192{
193 if (order < zone->compact_order_failed)
194 return false;
195
196 return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
197 zone->compact_considered >= 1UL << zone->compact_defer_shift;
198}
199
116/* Returns true if the pageblock should be scanned for pages to isolate. */ 200/* Returns true if the pageblock should be scanned for pages to isolate. */
117static inline bool isolation_suitable(struct compact_control *cc, 201static inline bool isolation_suitable(struct compact_control *cc,
118 struct page *page) 202 struct page *page)
@@ -408,6 +492,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
408 492
409 /* If a page was split, advance to the end of it */ 493 /* If a page was split, advance to the end of it */
410 if (isolated) { 494 if (isolated) {
495 cc->nr_freepages += isolated;
496 if (!strict &&
497 cc->nr_migratepages <= cc->nr_freepages) {
498 blockpfn += isolated;
499 break;
500 }
501
411 blockpfn += isolated - 1; 502 blockpfn += isolated - 1;
412 cursor += isolated - 1; 503 cursor += isolated - 1;
413 continue; 504 continue;
@@ -421,11 +512,12 @@ isolate_fail:
421 512
422 } 513 }
423 514
515 trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
516 nr_scanned, total_isolated);
517
424 /* Record how far we have got within the block */ 518 /* Record how far we have got within the block */
425 *start_pfn = blockpfn; 519 *start_pfn = blockpfn;
426 520
427 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
428
429 /* 521 /*
430 * If strict isolation is requested by CMA then check that all the 522 * If strict isolation is requested by CMA then check that all the
431 * pages requested were isolated. If there were any failures, 0 is 523 * pages requested were isolated. If there were any failures, 0 is
@@ -581,6 +673,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
581 unsigned long flags = 0; 673 unsigned long flags = 0;
582 bool locked = false; 674 bool locked = false;
583 struct page *page = NULL, *valid_page = NULL; 675 struct page *page = NULL, *valid_page = NULL;
676 unsigned long start_pfn = low_pfn;
584 677
585 /* 678 /*
586 * Ensure that there are not too many pages isolated from the LRU 679 * Ensure that there are not too many pages isolated from the LRU
@@ -741,7 +834,8 @@ isolate_success:
741 if (low_pfn == end_pfn) 834 if (low_pfn == end_pfn)
742 update_pageblock_skip(cc, valid_page, nr_isolated, true); 835 update_pageblock_skip(cc, valid_page, nr_isolated, true);
743 836
744 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 837 trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
838 nr_scanned, nr_isolated);
745 839
746 count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); 840 count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
747 if (nr_isolated) 841 if (nr_isolated)
@@ -814,7 +908,6 @@ static void isolate_freepages(struct compact_control *cc)
814 unsigned long isolate_start_pfn; /* exact pfn we start at */ 908 unsigned long isolate_start_pfn; /* exact pfn we start at */
815 unsigned long block_end_pfn; /* end of current pageblock */ 909 unsigned long block_end_pfn; /* end of current pageblock */
816 unsigned long low_pfn; /* lowest pfn scanner is able to scan */ 910 unsigned long low_pfn; /* lowest pfn scanner is able to scan */
817 int nr_freepages = cc->nr_freepages;
818 struct list_head *freelist = &cc->freepages; 911 struct list_head *freelist = &cc->freepages;
819 912
820 /* 913 /*
@@ -839,11 +932,11 @@ static void isolate_freepages(struct compact_control *cc)
839 * pages on cc->migratepages. We stop searching if the migrate 932 * pages on cc->migratepages. We stop searching if the migrate
840 * and free page scanners meet or enough free pages are isolated. 933 * and free page scanners meet or enough free pages are isolated.
841 */ 934 */
842 for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; 935 for (; block_start_pfn >= low_pfn &&
936 cc->nr_migratepages > cc->nr_freepages;
843 block_end_pfn = block_start_pfn, 937 block_end_pfn = block_start_pfn,
844 block_start_pfn -= pageblock_nr_pages, 938 block_start_pfn -= pageblock_nr_pages,
845 isolate_start_pfn = block_start_pfn) { 939 isolate_start_pfn = block_start_pfn) {
846 unsigned long isolated;
847 940
848 /* 941 /*
849 * This can iterate a massively long zone without finding any 942 * This can iterate a massively long zone without finding any
@@ -868,9 +961,8 @@ static void isolate_freepages(struct compact_control *cc)
868 continue; 961 continue;
869 962
870 /* Found a block suitable for isolating free pages from. */ 963 /* Found a block suitable for isolating free pages from. */
871 isolated = isolate_freepages_block(cc, &isolate_start_pfn, 964 isolate_freepages_block(cc, &isolate_start_pfn,
872 block_end_pfn, freelist, false); 965 block_end_pfn, freelist, false);
873 nr_freepages += isolated;
874 966
875 /* 967 /*
876 * Remember where the free scanner should restart next time, 968 * Remember where the free scanner should restart next time,
@@ -902,8 +994,6 @@ static void isolate_freepages(struct compact_control *cc)
902 */ 994 */
903 if (block_start_pfn < low_pfn) 995 if (block_start_pfn < low_pfn)
904 cc->free_pfn = cc->migrate_pfn; 996 cc->free_pfn = cc->migrate_pfn;
905
906 cc->nr_freepages = nr_freepages;
907} 997}
908 998
909/* 999/*
@@ -1015,8 +1105,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1015 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, 1105 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
1016 isolate_mode); 1106 isolate_mode);
1017 1107
1018 if (!low_pfn || cc->contended) 1108 if (!low_pfn || cc->contended) {
1109 acct_isolated(zone, cc);
1019 return ISOLATE_ABORT; 1110 return ISOLATE_ABORT;
1111 }
1020 1112
1021 /* 1113 /*
1022 * Either we isolated something and proceed with migration. Or 1114 * Either we isolated something and proceed with migration. Or
@@ -1037,7 +1129,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1037 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; 1129 return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
1038} 1130}
1039 1131
1040static int compact_finished(struct zone *zone, struct compact_control *cc, 1132static int __compact_finished(struct zone *zone, struct compact_control *cc,
1041 const int migratetype) 1133 const int migratetype)
1042{ 1134{
1043 unsigned int order; 1135 unsigned int order;
@@ -1088,11 +1180,24 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
1088 return COMPACT_PARTIAL; 1180 return COMPACT_PARTIAL;
1089 1181
1090 /* Job done if allocation would set block type */ 1182 /* Job done if allocation would set block type */
1091 if (cc->order >= pageblock_order && area->nr_free) 1183 if (order >= pageblock_order && area->nr_free)
1092 return COMPACT_PARTIAL; 1184 return COMPACT_PARTIAL;
1093 } 1185 }
1094 1186
1095 return COMPACT_CONTINUE; 1187 return COMPACT_NO_SUITABLE_PAGE;
1188}
1189
1190static int compact_finished(struct zone *zone, struct compact_control *cc,
1191 const int migratetype)
1192{
1193 int ret;
1194
1195 ret = __compact_finished(zone, cc, migratetype);
1196 trace_mm_compaction_finished(zone, cc->order, ret);
1197 if (ret == COMPACT_NO_SUITABLE_PAGE)
1198 ret = COMPACT_CONTINUE;
1199
1200 return ret;
1096} 1201}
1097 1202
1098/* 1203/*
@@ -1102,7 +1207,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
1102 * COMPACT_PARTIAL - If the allocation would succeed without compaction 1207 * COMPACT_PARTIAL - If the allocation would succeed without compaction
1103 * COMPACT_CONTINUE - If compaction should run now 1208 * COMPACT_CONTINUE - If compaction should run now
1104 */ 1209 */
1105unsigned long compaction_suitable(struct zone *zone, int order, 1210static unsigned long __compaction_suitable(struct zone *zone, int order,
1106 int alloc_flags, int classzone_idx) 1211 int alloc_flags, int classzone_idx)
1107{ 1212{
1108 int fragindex; 1213 int fragindex;
@@ -1146,11 +1251,24 @@ unsigned long compaction_suitable(struct zone *zone, int order,
1146 */ 1251 */
1147 fragindex = fragmentation_index(zone, order); 1252 fragindex = fragmentation_index(zone, order);
1148 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) 1253 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
1149 return COMPACT_SKIPPED; 1254 return COMPACT_NOT_SUITABLE_ZONE;
1150 1255
1151 return COMPACT_CONTINUE; 1256 return COMPACT_CONTINUE;
1152} 1257}
1153 1258
1259unsigned long compaction_suitable(struct zone *zone, int order,
1260 int alloc_flags, int classzone_idx)
1261{
1262 unsigned long ret;
1263
1264 ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
1265 trace_mm_compaction_suitable(zone, order, ret);
1266 if (ret == COMPACT_NOT_SUITABLE_ZONE)
1267 ret = COMPACT_SKIPPED;
1268
1269 return ret;
1270}
1271
1154static int compact_zone(struct zone *zone, struct compact_control *cc) 1272static int compact_zone(struct zone *zone, struct compact_control *cc)
1155{ 1273{
1156 int ret; 1274 int ret;
@@ -1197,7 +1315,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1197 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; 1315 zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
1198 } 1316 }
1199 1317
1200 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); 1318 trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
1319 cc->free_pfn, end_pfn, sync);
1201 1320
1202 migrate_prep_local(); 1321 migrate_prep_local();
1203 1322
@@ -1299,7 +1418,8 @@ out:
1299 zone->compact_cached_free_pfn = free_pfn; 1418 zone->compact_cached_free_pfn = free_pfn;
1300 } 1419 }
1301 1420
1302 trace_mm_compaction_end(ret); 1421 trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
1422 cc->free_pfn, end_pfn, sync, ret);
1303 1423
1304 return ret; 1424 return ret;
1305} 1425}
@@ -1335,22 +1455,20 @@ int sysctl_extfrag_threshold = 500;
1335 1455
1336/** 1456/**
1337 * try_to_compact_pages - Direct compact to satisfy a high-order allocation 1457 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
1338 * @zonelist: The zonelist used for the current allocation
1339 * @order: The order of the current allocation
1340 * @gfp_mask: The GFP mask of the current allocation 1458 * @gfp_mask: The GFP mask of the current allocation
1341 * @nodemask: The allowed nodes to allocate from 1459 * @order: The order of the current allocation
1460 * @alloc_flags: The allocation flags of the current allocation
1461 * @ac: The context of current allocation
1342 * @mode: The migration mode for async, sync light, or sync migration 1462 * @mode: The migration mode for async, sync light, or sync migration
1343 * @contended: Return value that determines if compaction was aborted due to 1463 * @contended: Return value that determines if compaction was aborted due to
1344 * need_resched() or lock contention 1464 * need_resched() or lock contention
1345 * 1465 *
1346 * This is the main entry point for direct page compaction. 1466 * This is the main entry point for direct page compaction.
1347 */ 1467 */
1348unsigned long try_to_compact_pages(struct zonelist *zonelist, 1468unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
1349 int order, gfp_t gfp_mask, nodemask_t *nodemask, 1469 int alloc_flags, const struct alloc_context *ac,
1350 enum migrate_mode mode, int *contended, 1470 enum migrate_mode mode, int *contended)
1351 int alloc_flags, int classzone_idx)
1352{ 1471{
1353 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1354 int may_enter_fs = gfp_mask & __GFP_FS; 1472 int may_enter_fs = gfp_mask & __GFP_FS;
1355 int may_perform_io = gfp_mask & __GFP_IO; 1473 int may_perform_io = gfp_mask & __GFP_IO;
1356 struct zoneref *z; 1474 struct zoneref *z;
@@ -1364,9 +1482,11 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1364 if (!order || !may_enter_fs || !may_perform_io) 1482 if (!order || !may_enter_fs || !may_perform_io)
1365 return COMPACT_SKIPPED; 1483 return COMPACT_SKIPPED;
1366 1484
1485 trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);
1486
1367 /* Compact each zone in the list */ 1487 /* Compact each zone in the list */
1368 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1488 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
1369 nodemask) { 1489 ac->nodemask) {
1370 int status; 1490 int status;
1371 int zone_contended; 1491 int zone_contended;
1372 1492
@@ -1374,7 +1494,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1374 continue; 1494 continue;
1375 1495
1376 status = compact_zone_order(zone, order, gfp_mask, mode, 1496 status = compact_zone_order(zone, order, gfp_mask, mode,
1377 &zone_contended, alloc_flags, classzone_idx); 1497 &zone_contended, alloc_flags,
1498 ac->classzone_idx);
1378 rc = max(status, rc); 1499 rc = max(status, rc);
1379 /* 1500 /*
1380 * It takes at least one zone that wasn't lock contended 1501 * It takes at least one zone that wasn't lock contended
@@ -1384,7 +1505,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
1384 1505
1385 /* If a normal allocation would succeed, stop compacting */ 1506 /* If a normal allocation would succeed, stop compacting */
1386 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 1507 if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
1387 classzone_idx, alloc_flags)) { 1508 ac->classzone_idx, alloc_flags)) {
1388 /* 1509 /*
1389 * We think the allocation will succeed in this zone, 1510 * We think the allocation will succeed in this zone,
1390 * but it is not certain, hence the false. The caller 1511 * but it is not certain, hence the false. The caller
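
The defer_compaction()/compaction_deferred() helpers added to compaction.c above implement an exponential backoff: each allocation failure after compaction doubles the number of subsequent compaction attempts that get skipped, capped at 1 << COMPACT_MAX_DEFER_SHIFT. Below is a minimal standalone C sketch of just that backoff logic, not kernel code: struct zone_model and the main() driver are invented for illustration, and the per-order bookkeeping (compact_order_failed) is dropped for brevity.

#include <stdbool.h>
#include <stdio.h>

#define COMPACT_MAX_DEFER_SHIFT 6

struct zone_model {
	unsigned int compact_considered;
	unsigned int compact_defer_shift;
};

/* Record a compaction failure: restart the counter, widen the skip window. */
static void defer_compaction(struct zone_model *z)
{
	z->compact_considered = 0;
	if (++z->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		z->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

/* Returns true while the current attempt should be skipped. */
static bool compaction_deferred(struct zone_model *z)
{
	unsigned long defer_limit = 1UL << z->compact_defer_shift;

	if (++z->compact_considered > defer_limit)
		z->compact_considered = defer_limit;
	return z->compact_considered < defer_limit;
}

int main(void)
{
	struct zone_model z = { 0, 0 };

	for (int failure = 1; failure <= 3; failure++) {
		int skipped = 0;

		defer_compaction(&z);           /* an allocation failed after compaction */
		while (compaction_deferred(&z)) /* later attempts are skipped for a while */
			skipped++;
		printf("after failure %d: %d attempts skipped before retrying\n",
		       failure, skipped);
	}
	return 0;
}

Compiled as plain C, this prints how many attempts are skipped after each simulated failure (1, then 3, then 7), which is the doubling-with-cap behaviour the comments above describe.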
diff --git a/mm/debug.c b/mm/debug.c
index 0e58f3211f89..3eb3ac2fcee7 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -130,7 +130,6 @@ static const struct trace_print_flags vmaflags_names[] = {
130 {VM_ACCOUNT, "account" }, 130 {VM_ACCOUNT, "account" },
131 {VM_NORESERVE, "noreserve" }, 131 {VM_NORESERVE, "noreserve" },
132 {VM_HUGETLB, "hugetlb" }, 132 {VM_HUGETLB, "hugetlb" },
133 {VM_NONLINEAR, "nonlinear" },
134#if defined(CONFIG_X86) 133#if defined(CONFIG_X86)
135 {VM_PAT, "pat" }, 134 {VM_PAT, "pat" },
136#elif defined(CONFIG_PPC) 135#elif defined(CONFIG_PPC)
@@ -174,7 +173,7 @@ void dump_mm(const struct mm_struct *mm)
174 "get_unmapped_area %p\n" 173 "get_unmapped_area %p\n"
175#endif 174#endif
176 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" 175 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
177 "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" 176 "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
178 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" 177 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
179 "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" 178 "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
180 "start_code %lx end_code %lx start_data %lx end_data %lx\n" 179 "start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -207,6 +206,7 @@ void dump_mm(const struct mm_struct *mm)
207 mm->pgd, atomic_read(&mm->mm_users), 206 mm->pgd, atomic_read(&mm->mm_users),
208 atomic_read(&mm->mm_count), 207 atomic_read(&mm->mm_count),
209 atomic_long_read((atomic_long_t *)&mm->nr_ptes), 208 atomic_long_read((atomic_long_t *)&mm->nr_ptes),
209 mm_nr_pmds((struct mm_struct *)mm),
210 mm->map_count, 210 mm->map_count,
211 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, 211 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
212 mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, 212 mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 2ad7adf4f0a4..fac23ecf8d72 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -73,7 +73,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
73 else 73 else
74 endbyte--; /* inclusive */ 74 endbyte--; /* inclusive */
75 75
76 bdi = mapping->backing_dev_info; 76 bdi = inode_to_bdi(mapping->host);
77 77
78 switch (advice) { 78 switch (advice) {
79 case POSIX_FADV_NORMAL: 79 case POSIX_FADV_NORMAL:
@@ -113,7 +113,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
113 case POSIX_FADV_NOREUSE: 113 case POSIX_FADV_NOREUSE:
114 break; 114 break;
115 case POSIX_FADV_DONTNEED: 115 case POSIX_FADV_DONTNEED:
116 if (!bdi_write_congested(mapping->backing_dev_info)) 116 if (!bdi_write_congested(bdi))
117 __filemap_fdatawrite_range(mapping, offset, endbyte, 117 __filemap_fdatawrite_range(mapping, offset, endbyte,
118 WB_SYNC_NONE); 118 WB_SYNC_NONE);
119 119
diff --git a/mm/filemap.c b/mm/filemap.c
index 673e4581a2e5..d9f5336552d7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -211,7 +211,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
211 */ 211 */
212 if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { 212 if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
213 dec_zone_page_state(page, NR_FILE_DIRTY); 213 dec_zone_page_state(page, NR_FILE_DIRTY);
214 dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 214 dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
215 } 215 }
216} 216}
217 217
@@ -2087,7 +2087,6 @@ const struct vm_operations_struct generic_file_vm_ops = {
2087 .fault = filemap_fault, 2087 .fault = filemap_fault,
2088 .map_pages = filemap_map_pages, 2088 .map_pages = filemap_map_pages,
2089 .page_mkwrite = filemap_page_mkwrite, 2089 .page_mkwrite = filemap_page_mkwrite,
2090 .remap_pages = generic_file_remap_pages,
2091}; 2090};
2092 2091
2093/* This is used for a general mmap of a disk file */ 2092/* This is used for a general mmap of a disk file */
@@ -2565,7 +2564,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
2565 size_t count = iov_iter_count(from); 2564 size_t count = iov_iter_count(from);
2566 2565
2567 /* We can write back this queue in page reclaim */ 2566 /* We can write back this queue in page reclaim */
2568 current->backing_dev_info = mapping->backing_dev_info; 2567 current->backing_dev_info = inode_to_bdi(inode);
2569 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 2568 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
2570 if (err) 2569 if (err)
2571 goto out; 2570 goto out;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 0d105aeff82f..c175f9f25210 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -9,6 +9,7 @@
9 */ 9 */
10 10
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/backing-dev.h>
12#include <linux/pagemap.h> 13#include <linux/pagemap.h>
13#include <linux/export.h> 14#include <linux/export.h>
14#include <linux/uio.h> 15#include <linux/uio.h>
@@ -301,7 +302,6 @@ out:
301static const struct vm_operations_struct xip_file_vm_ops = { 302static const struct vm_operations_struct xip_file_vm_ops = {
302 .fault = xip_file_fault, 303 .fault = xip_file_fault,
303 .page_mkwrite = filemap_page_mkwrite, 304 .page_mkwrite = filemap_page_mkwrite,
304 .remap_pages = generic_file_remap_pages,
305}; 305};
306 306
307int xip_file_mmap(struct file * file, struct vm_area_struct * vma) 307int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -410,7 +410,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
410 count = len; 410 count = len;
411 411
412 /* We can write back this queue in page reclaim */ 412 /* We can write back this queue in page reclaim */
413 current->backing_dev_info = mapping->backing_dev_info; 413 current->backing_dev_info = inode_to_bdi(inode);
414 414
415 ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); 415 ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
416 if (ret) 416 if (ret)
diff --git a/mm/fremap.c b/mm/fremap.c
deleted file mode 100644
index 2805d71cf476..000000000000
--- a/mm/fremap.c
+++ /dev/null
@@ -1,283 +0,0 @@
1/*
2 * linux/mm/fremap.c
3 *
4 * Explicit pagetable population and nonlinear (random) mappings support.
5 *
6 * started by Ingo Molnar, Copyright (C) 2002, 2003
7 */
8#include <linux/export.h>
9#include <linux/backing-dev.h>
10#include <linux/mm.h>
11#include <linux/swap.h>
12#include <linux/file.h>
13#include <linux/mman.h>
14#include <linux/pagemap.h>
15#include <linux/swapops.h>
16#include <linux/rmap.h>
17#include <linux/syscalls.h>
18#include <linux/mmu_notifier.h>
19
20#include <asm/mmu_context.h>
21#include <asm/cacheflush.h>
22#include <asm/tlbflush.h>
23
24#include "internal.h"
25
26static int mm_counter(struct page *page)
27{
28 return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;
29}
30
31static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
32 unsigned long addr, pte_t *ptep)
33{
34 pte_t pte = *ptep;
35 struct page *page;
36 swp_entry_t entry;
37
38 if (pte_present(pte)) {
39 flush_cache_page(vma, addr, pte_pfn(pte));
40 pte = ptep_clear_flush_notify(vma, addr, ptep);
41 page = vm_normal_page(vma, addr, pte);
42 if (page) {
43 if (pte_dirty(pte))
44 set_page_dirty(page);
45 update_hiwater_rss(mm);
46 dec_mm_counter(mm, mm_counter(page));
47 page_remove_rmap(page);
48 page_cache_release(page);
49 }
50 } else { /* zap_pte() is not called when pte_none() */
51 if (!pte_file(pte)) {
52 update_hiwater_rss(mm);
53 entry = pte_to_swp_entry(pte);
54 if (non_swap_entry(entry)) {
55 if (is_migration_entry(entry)) {
56 page = migration_entry_to_page(entry);
57 dec_mm_counter(mm, mm_counter(page));
58 }
59 } else {
60 free_swap_and_cache(entry);
61 dec_mm_counter(mm, MM_SWAPENTS);
62 }
63 }
64 pte_clear_not_present_full(mm, addr, ptep, 0);
65 }
66}
67
68/*
69 * Install a file pte to a given virtual memory address, release any
70 * previously existing mapping.
71 */
72static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
73 unsigned long addr, unsigned long pgoff, pgprot_t prot)
74{
75 int err = -ENOMEM;
76 pte_t *pte, ptfile;
77 spinlock_t *ptl;
78
79 pte = get_locked_pte(mm, addr, &ptl);
80 if (!pte)
81 goto out;
82
83 ptfile = pgoff_to_pte(pgoff);
84
85 if (!pte_none(*pte))
86 zap_pte(mm, vma, addr, pte);
87
88 set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile));
89 /*
90 * We don't need to run update_mmu_cache() here because the "file pte"
91 * being installed by install_file_pte() is not a real pte - it's a
92 * non-present entry (like a swap entry), noting what file offset should
93 * be mapped there when there's a fault (in a non-linear vma where
94 * that's not obvious).
95 */
96 pte_unmap_unlock(pte, ptl);
97 err = 0;
98out:
99 return err;
100}
101
102int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
103 unsigned long size, pgoff_t pgoff)
104{
105 struct mm_struct *mm = vma->vm_mm;
106 int err;
107
108 do {
109 err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot);
110 if (err)
111 return err;
112
113 size -= PAGE_SIZE;
114 addr += PAGE_SIZE;
115 pgoff++;
116 } while (size);
117
118 return 0;
119}
120EXPORT_SYMBOL(generic_file_remap_pages);
121
122/**
123 * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
124 * @start: start of the remapped virtual memory range
125 * @size: size of the remapped virtual memory range
126 * @prot: new protection bits of the range (see NOTE)
127 * @pgoff: to-be-mapped page of the backing store file
128 * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
129 *
130 * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma
131 * (shared backing store file).
132 *
133 * This syscall works purely via pagetables, so it's the most efficient
134 * way to map the same (large) file into a given virtual window. Unlike
135 * mmap()/mremap() it does not create any new vmas. The new mappings are
136 * also safe across swapout.
137 *
138 * NOTE: the @prot parameter right now is ignored (but must be zero),
139 * and the vma's default protection is used. Arbitrary protections
140 * might be implemented in the future.
141 */
142SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
143 unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
144{
145 struct mm_struct *mm = current->mm;
146 struct address_space *mapping;
147 struct vm_area_struct *vma;
148 int err = -EINVAL;
149 int has_write_lock = 0;
150 vm_flags_t vm_flags = 0;
151
152 pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
153 "See Documentation/vm/remap_file_pages.txt.\n",
154 current->comm, current->pid);
155
156 if (prot)
157 return err;
158 /*
159 * Sanitize the syscall parameters:
160 */
161 start = start & PAGE_MASK;
162 size = size & PAGE_MASK;
163
164 /* Does the address range wrap, or is the span zero-sized? */
165 if (start + size <= start)
166 return err;
167
168 /* Does pgoff wrap? */
169 if (pgoff + (size >> PAGE_SHIFT) < pgoff)
170 return err;
171
172 /* Can we represent this offset inside this architecture's pte's? */
173#if PTE_FILE_MAX_BITS < BITS_PER_LONG
174 if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
175 return err;
176#endif
177
178 /* We need down_write() to change vma->vm_flags. */
179 down_read(&mm->mmap_sem);
180 retry:
181 vma = find_vma(mm, start);
182
183 /*
184 * Make sure the vma is shared, that it supports prefaulting,
185 * and that the remapped range is valid and fully within
186 * the single existing vma.
187 */
188 if (!vma || !(vma->vm_flags & VM_SHARED))
189 goto out;
190
191 if (!vma->vm_ops || !vma->vm_ops->remap_pages)
192 goto out;
193
194 if (start < vma->vm_start || start + size > vma->vm_end)
195 goto out;
196
197 /* Must set VM_NONLINEAR before any pages are populated. */
198 if (!(vma->vm_flags & VM_NONLINEAR)) {
199 /*
200 * vm_private_data is used as a swapout cursor
201 * in a VM_NONLINEAR vma.
202 */
203 if (vma->vm_private_data)
204 goto out;
205
206 /* Don't need a nonlinear mapping, exit success */
207 if (pgoff == linear_page_index(vma, start)) {
208 err = 0;
209 goto out;
210 }
211
212 if (!has_write_lock) {
213get_write_lock:
214 up_read(&mm->mmap_sem);
215 down_write(&mm->mmap_sem);
216 has_write_lock = 1;
217 goto retry;
218 }
219 mapping = vma->vm_file->f_mapping;
220 /*
221 * page_mkclean doesn't work on nonlinear vmas, so if
222 * dirty pages need to be accounted, emulate with linear
223 * vmas.
224 */
225 if (mapping_cap_account_dirty(mapping)) {
226 unsigned long addr;
227 struct file *file = get_file(vma->vm_file);
228 /* mmap_region may free vma; grab the info now */
229 vm_flags = vma->vm_flags;
230
231 addr = mmap_region(file, start, size, vm_flags, pgoff);
232 fput(file);
233 if (IS_ERR_VALUE(addr)) {
234 err = addr;
235 } else {
236 BUG_ON(addr != start);
237 err = 0;
238 }
239 goto out_freed;
240 }
241 i_mmap_lock_write(mapping);
242 flush_dcache_mmap_lock(mapping);
243 vma->vm_flags |= VM_NONLINEAR;
244 vma_interval_tree_remove(vma, &mapping->i_mmap);
245 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
246 flush_dcache_mmap_unlock(mapping);
247 i_mmap_unlock_write(mapping);
248 }
249
250 if (vma->vm_flags & VM_LOCKED) {
251 /*
252 * drop PG_Mlocked flag for over-mapped range
253 */
254 if (!has_write_lock)
255 goto get_write_lock;
256 vm_flags = vma->vm_flags;
257 munlock_vma_pages_range(vma, start, start + size);
258 vma->vm_flags = vm_flags;
259 }
260
261 mmu_notifier_invalidate_range_start(mm, start, start + size);
262 err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
263 mmu_notifier_invalidate_range_end(mm, start, start + size);
264
265 /*
266 * We can't clear VM_NONLINEAR because we'd have to do
267 * it after ->populate completes, and that would prevent
268 * downgrading the lock. (Locks can't be upgraded).
269 */
270
271out:
272 if (vma)
273 vm_flags = vma->vm_flags;
274out_freed:
275 if (likely(!has_write_lock))
276 up_read(&mm->mmap_sem);
277 else
278 up_write(&mm->mmap_sem);
279 if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
280 mm_populate(start, size);
281
282 return err;
283}
diff --git a/mm/gup.c b/mm/gup.c
index bed30efad77c..a6e24e246f86 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -55,7 +55,7 @@ retry:
55 */ 55 */
56 if (likely(!(flags & FOLL_MIGRATION))) 56 if (likely(!(flags & FOLL_MIGRATION)))
57 goto no_page; 57 goto no_page;
58 if (pte_none(pte) || pte_file(pte)) 58 if (pte_none(pte))
59 goto no_page; 59 goto no_page;
60 entry = pte_to_swp_entry(pte); 60 entry = pte_to_swp_entry(pte);
61 if (!is_migration_entry(entry)) 61 if (!is_migration_entry(entry))
@@ -64,7 +64,7 @@ retry:
64 migration_entry_wait(mm, pmd, address); 64 migration_entry_wait(mm, pmd, address);
65 goto retry; 65 goto retry;
66 } 66 }
67 if ((flags & FOLL_NUMA) && pte_numa(pte)) 67 if ((flags & FOLL_NUMA) && pte_protnone(pte))
68 goto no_page; 68 goto no_page;
69 if ((flags & FOLL_WRITE) && !pte_write(pte)) { 69 if ((flags & FOLL_WRITE) && !pte_write(pte)) {
70 pte_unmap_unlock(ptep, ptl); 70 pte_unmap_unlock(ptep, ptl);
@@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
167 if (pud_none(*pud)) 167 if (pud_none(*pud))
168 return no_page_table(vma, flags); 168 return no_page_table(vma, flags);
169 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { 169 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
170 if (flags & FOLL_GET) 170 page = follow_huge_pud(mm, address, pud, flags);
171 return NULL; 171 if (page)
172 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 172 return page;
173 return page; 173 return no_page_table(vma, flags);
174 } 174 }
175 if (unlikely(pud_bad(*pud))) 175 if (unlikely(pud_bad(*pud)))
176 return no_page_table(vma, flags); 176 return no_page_table(vma, flags);
@@ -179,21 +179,12 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
179 if (pmd_none(*pmd)) 179 if (pmd_none(*pmd))
180 return no_page_table(vma, flags); 180 return no_page_table(vma, flags);
181 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { 181 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
182 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 182 page = follow_huge_pmd(mm, address, pmd, flags);
183 if (flags & FOLL_GET) { 183 if (page)
184 /* 184 return page;
185 * Refcount on tail pages are not well-defined and 185 return no_page_table(vma, flags);
186 * shouldn't be taken. The caller should handle a NULL
187 * return when trying to follow tail pages.
188 */
189 if (PageHead(page))
190 get_page(page);
191 else
192 page = NULL;
193 }
194 return page;
195 } 186 }
196 if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 187 if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
197 return no_page_table(vma, flags); 188 return no_page_table(vma, flags);
198 if (pmd_trans_huge(*pmd)) { 189 if (pmd_trans_huge(*pmd)) {
199 if (flags & FOLL_SPLIT) { 190 if (flags & FOLL_SPLIT) {
@@ -296,7 +287,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
296 return -ENOMEM; 287 return -ENOMEM;
297 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) 288 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
298 return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT; 289 return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
299 if (ret & VM_FAULT_SIGBUS) 290 if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
300 return -EFAULT; 291 return -EFAULT;
301 BUG(); 292 BUG();
302 } 293 }
@@ -571,7 +562,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
571 return -ENOMEM; 562 return -ENOMEM;
572 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) 563 if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
573 return -EHWPOISON; 564 return -EHWPOISON;
574 if (ret & VM_FAULT_SIGBUS) 565 if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
575 return -EFAULT; 566 return -EFAULT;
576 BUG(); 567 BUG();
577 } 568 }
@@ -584,6 +575,185 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
584 return 0; 575 return 0;
585} 576}
586 577
578static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
579 struct mm_struct *mm,
580 unsigned long start,
581 unsigned long nr_pages,
582 int write, int force,
583 struct page **pages,
584 struct vm_area_struct **vmas,
585 int *locked, bool notify_drop,
586 unsigned int flags)
587{
588 long ret, pages_done;
589 bool lock_dropped;
590
591 if (locked) {
592 /* if VM_FAULT_RETRY can be returned, vmas become invalid */
593 BUG_ON(vmas);
594 /* check caller initialized locked */
595 BUG_ON(*locked != 1);
596 }
597
598 if (pages)
599 flags |= FOLL_GET;
600 if (write)
601 flags |= FOLL_WRITE;
602 if (force)
603 flags |= FOLL_FORCE;
604
605 pages_done = 0;
606 lock_dropped = false;
607 for (;;) {
608 ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
609 vmas, locked);
610 if (!locked)
611 /* VM_FAULT_RETRY couldn't trigger, bypass */
612 return ret;
613
614 /* VM_FAULT_RETRY cannot return errors */
615 if (!*locked) {
616 BUG_ON(ret < 0);
617 BUG_ON(ret >= nr_pages);
618 }
619
620 if (!pages)
621 /* If it's a prefault don't insist harder */
622 return ret;
623
624 if (ret > 0) {
625 nr_pages -= ret;
626 pages_done += ret;
627 if (!nr_pages)
628 break;
629 }
630 if (*locked) {
631 /* VM_FAULT_RETRY didn't trigger */
632 if (!pages_done)
633 pages_done = ret;
634 break;
635 }
636 /* VM_FAULT_RETRY triggered, so seek to the faulting offset */
637 pages += ret;
638 start += ret << PAGE_SHIFT;
639
640 /*
641 * Repeat on the address that fired VM_FAULT_RETRY
642 * without FAULT_FLAG_ALLOW_RETRY but with
643 * FAULT_FLAG_TRIED.
644 */
645 *locked = 1;
646 lock_dropped = true;
647 down_read(&mm->mmap_sem);
648 ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
649 pages, NULL, NULL);
650 if (ret != 1) {
651 BUG_ON(ret > 1);
652 if (!pages_done)
653 pages_done = ret;
654 break;
655 }
656 nr_pages--;
657 pages_done++;
658 if (!nr_pages)
659 break;
660 pages++;
661 start += PAGE_SIZE;
662 }
663 if (notify_drop && lock_dropped && *locked) {
664 /*
665 * We must let the caller know we temporarily dropped the lock
666 * and so the critical section protected by it was lost.
667 */
668 up_read(&mm->mmap_sem);
669 *locked = 0;
670 }
671 return pages_done;
672}
673
674/*
675 * We can leverage the VM_FAULT_RETRY functionality in the page fault
676 * paths better by using either get_user_pages_locked() or
677 * get_user_pages_unlocked().
678 *
679 * get_user_pages_locked() is suitable to replace the form:
680 *
681 * down_read(&mm->mmap_sem);
682 * do_something()
683 * get_user_pages(tsk, mm, ..., pages, NULL);
684 * up_read(&mm->mmap_sem);
685 *
686 * to:
687 *
688 * int locked = 1;
689 * down_read(&mm->mmap_sem);
690 * do_something()
691 * get_user_pages_locked(tsk, mm, ..., pages, &locked);
692 * if (locked)
693 * up_read(&mm->mmap_sem);
694 */
695long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
696 unsigned long start, unsigned long nr_pages,
697 int write, int force, struct page **pages,
698 int *locked)
699{
700 return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
701 pages, NULL, locked, true, FOLL_TOUCH);
702}
703EXPORT_SYMBOL(get_user_pages_locked);
704
705/*
706 * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
707 * pass additional gup_flags as last parameter (like FOLL_HWPOISON).
708 *
709 * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
710 * caller if required (just like with __get_user_pages). "FOLL_GET",
711 * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed
712 * according to the parameters "pages", "write", "force"
713 * respectively.
714 */
715__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
716 unsigned long start, unsigned long nr_pages,
717 int write, int force, struct page **pages,
718 unsigned int gup_flags)
719{
720 long ret;
721 int locked = 1;
722 down_read(&mm->mmap_sem);
723 ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
724 pages, NULL, &locked, false, gup_flags);
725 if (locked)
726 up_read(&mm->mmap_sem);
727 return ret;
728}
729EXPORT_SYMBOL(__get_user_pages_unlocked);
730
731/*
732 * get_user_pages_unlocked() is suitable to replace the form:
733 *
734 * down_read(&mm->mmap_sem);
735 * get_user_pages(tsk, mm, ..., pages, NULL);
736 * up_read(&mm->mmap_sem);
737 *
738 * with:
739 *
740 * get_user_pages_unlocked(tsk, mm, ..., pages);
741 *
742 * It is functionally equivalent to get_user_pages_fast so
743 * get_user_pages_fast should be used instead, if the two parameters
744 * "tsk" and "mm" are respectively equal to current and current->mm,
745 * or if "force" shall be set to 1 (get_user_pages_fast misses the
746 * "force" parameter).
747 */
748long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
749 unsigned long start, unsigned long nr_pages,
750 int write, int force, struct page **pages)
751{
752 return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
753 force, pages, FOLL_TOUCH);
754}
755EXPORT_SYMBOL(get_user_pages_unlocked);
756
587/* 757/*
588 * get_user_pages() - pin user pages in memory 758 * get_user_pages() - pin user pages in memory
589 * @tsk: the task_struct to use for page fault accounting, or 759 * @tsk: the task_struct to use for page fault accounting, or
@@ -633,22 +803,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
633 * use the correct cache flushing APIs. 803 * use the correct cache flushing APIs.
634 * 804 *
635 * See also get_user_pages_fast, for performance critical applications. 805 * See also get_user_pages_fast, for performance critical applications.
806 *
807 * get_user_pages should be phased out in favor of
808 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
809 * should use get_user_pages because it cannot pass
810 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
636 */ 811 */
637long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 812long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
638 unsigned long start, unsigned long nr_pages, int write, 813 unsigned long start, unsigned long nr_pages, int write,
639 int force, struct page **pages, struct vm_area_struct **vmas) 814 int force, struct page **pages, struct vm_area_struct **vmas)
640{ 815{
641 int flags = FOLL_TOUCH; 816 return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
642 817 pages, vmas, NULL, false, FOLL_TOUCH);
643 if (pages)
644 flags |= FOLL_GET;
645 if (write)
646 flags |= FOLL_WRITE;
647 if (force)
648 flags |= FOLL_FORCE;
649
650 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
651 NULL);
652} 818}
653EXPORT_SYMBOL(get_user_pages); 819EXPORT_SYMBOL(get_user_pages);
654 820
@@ -740,10 +906,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
740 906
741 /* 907 /*
742 * Similar to the PMD case below, NUMA hinting must take slow 908 * Similar to the PMD case below, NUMA hinting must take slow
743 * path 909 * path using the pte_protnone check.
744 */ 910 */
745 if (!pte_present(pte) || pte_special(pte) || 911 if (!pte_present(pte) || pte_special(pte) ||
746 pte_numa(pte) || (write && !pte_write(pte))) 912 pte_protnone(pte) || (write && !pte_write(pte)))
747 goto pte_unmap; 913 goto pte_unmap;
748 914
749 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 915 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -938,7 +1104,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
938 * slowpath for accounting purposes and so that they 1104 * slowpath for accounting purposes and so that they
939 * can be serialised against THP migration. 1105 * can be serialised against THP migration.
940 */ 1106 */
941 if (pmd_numa(pmd)) 1107 if (pmd_protnone(pmd))
942 return 0; 1108 return 0;
943 1109
944 if (!gup_huge_pmd(pmd, pmdp, addr, next, write, 1110 if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
@@ -1077,10 +1243,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
1077 start += nr << PAGE_SHIFT; 1243 start += nr << PAGE_SHIFT;
1078 pages += nr; 1244 pages += nr;
1079 1245
1080 down_read(&mm->mmap_sem); 1246 ret = get_user_pages_unlocked(current, mm, start,
1081 ret = get_user_pages(current, mm, start, 1247 nr_pages - nr, write, 0, pages);
1082 nr_pages - nr, write, 0, pages, NULL);
1083 up_read(&mm->mmap_sem);
1084 1248
1085 /* Have to be a bit careful with return values */ 1249 /* Have to be a bit careful with return values */
1086 if (nr > 0) { 1250 if (nr > 0) {
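
A minimal sketch of the caller-side simplification the gup.c changes above enable, assuming a caller that pins pages in the current process; the helper name pin_user_buffer is hypothetical and not part of the patch (the signatures of get_user_pages() and get_user_pages_unlocked() are taken from the hunks above):

static long pin_user_buffer(unsigned long start, unsigned long nr_pages,
                            int write, struct page **pages)
{
        /*
         * Old form, with open-coded mmap_sem handling and no
         * FAULT_FLAG_ALLOW_RETRY:
         *
         *      down_read(&current->mm->mmap_sem);
         *      ret = get_user_pages(current, current->mm, start, nr_pages,
         *                           write, 0, pages, NULL);
         *      up_read(&current->mm->mmap_sem);
         */
        return get_user_pages_unlocked(current, current->mm, start, nr_pages,
                                       write, 0, pages);
}

The unlocked variant takes and releases mmap_sem internally, which is what allows the fault path to use FAULT_FLAG_ALLOW_RETRY and drop the semaphore while waiting for I/O.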
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 817a875f2b8c..fc00c8cb5a82 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -171,12 +171,7 @@ static int start_khugepaged(void)
171} 171}
172 172
173static atomic_t huge_zero_refcount; 173static atomic_t huge_zero_refcount;
174static struct page *huge_zero_page __read_mostly; 174struct page *huge_zero_page __read_mostly;
175
176static inline bool is_huge_zero_page(struct page *page)
177{
178 return ACCESS_ONCE(huge_zero_page) == page;
179}
180 175
181static inline bool is_huge_zero_pmd(pmd_t pmd) 176static inline bool is_huge_zero_pmd(pmd_t pmd)
182{ 177{
@@ -766,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
766 return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; 761 return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
767} 762}
768 763
769static inline struct page *alloc_hugepage_vma(int defrag,
770 struct vm_area_struct *vma,
771 unsigned long haddr, int nd,
772 gfp_t extra_gfp)
773{
774 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
775 HPAGE_PMD_ORDER, vma, haddr, nd);
776}
777
778/* Caller must hold page table lock. */ 764/* Caller must hold page table lock. */
779static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, 765static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
780 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, 766 struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
@@ -795,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
795 unsigned long address, pmd_t *pmd, 781 unsigned long address, pmd_t *pmd,
796 unsigned int flags) 782 unsigned int flags)
797{ 783{
784 gfp_t gfp;
798 struct page *page; 785 struct page *page;
799 unsigned long haddr = address & HPAGE_PMD_MASK; 786 unsigned long haddr = address & HPAGE_PMD_MASK;
800 787
@@ -829,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
829 } 816 }
830 return 0; 817 return 0;
831 } 818 }
832 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 819 gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
833 vma, haddr, numa_node_id(), 0); 820 page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
834 if (unlikely(!page)) { 821 if (unlikely(!page)) {
835 count_vm_event(THP_FAULT_FALLBACK); 822 count_vm_event(THP_FAULT_FALLBACK);
836 return VM_FAULT_FALLBACK; 823 return VM_FAULT_FALLBACK;
@@ -1118,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1118 spin_unlock(ptl); 1105 spin_unlock(ptl);
1119alloc: 1106alloc:
1120 if (transparent_hugepage_enabled(vma) && 1107 if (transparent_hugepage_enabled(vma) &&
1121 !transparent_hugepage_debug_cow()) 1108 !transparent_hugepage_debug_cow()) {
1122 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), 1109 gfp_t gfp;
1123 vma, haddr, numa_node_id(), 0); 1110
1124 else 1111 gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
1112 new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
1113 } else
1125 new_page = NULL; 1114 new_page = NULL;
1126 1115
1127 if (unlikely(!new_page)) { 1116 if (unlikely(!new_page)) {
@@ -1222,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1222 return ERR_PTR(-EFAULT); 1211 return ERR_PTR(-EFAULT);
1223 1212
1224 /* Full NUMA hinting faults to serialise migration in fault paths */ 1213 /* Full NUMA hinting faults to serialise migration in fault paths */
1225 if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 1214 if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
1226 goto out; 1215 goto out;
1227 1216
1228 page = pmd_page(*pmd); 1217 page = pmd_page(*pmd);
@@ -1273,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1273 bool migrated = false; 1262 bool migrated = false;
1274 int flags = 0; 1263 int flags = 0;
1275 1264
1265 /* A PROT_NONE fault should not end up here */
1266 BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
1267
1276 ptl = pmd_lock(mm, pmdp); 1268 ptl = pmd_lock(mm, pmdp);
1277 if (unlikely(!pmd_same(pmd, *pmdp))) 1269 if (unlikely(!pmd_same(pmd, *pmdp)))
1278 goto out_unlock; 1270 goto out_unlock;
@@ -1283,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1283 * check_same as the page may no longer be mapped. 1275 * check_same as the page may no longer be mapped.
1284 */ 1276 */
1285 if (unlikely(pmd_trans_migrating(*pmdp))) { 1277 if (unlikely(pmd_trans_migrating(*pmdp))) {
1278 page = pmd_page(*pmdp);
1286 spin_unlock(ptl); 1279 spin_unlock(ptl);
1287 wait_migrate_huge_page(vma->anon_vma, pmdp); 1280 wait_on_page_locked(page);
1288 goto out; 1281 goto out;
1289 } 1282 }
1290 1283
@@ -1352,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1352 1345
1353 /* 1346 /*
1354 * Migrate the THP to the requested node, returns with page unlocked 1347 * Migrate the THP to the requested node, returns with page unlocked
1355 * and pmd_numa cleared. 1348 * and access rights restored.
1356 */ 1349 */
1357 spin_unlock(ptl); 1350 spin_unlock(ptl);
1358 migrated = migrate_misplaced_transhuge_page(mm, vma, 1351 migrated = migrate_misplaced_transhuge_page(mm, vma,
@@ -1365,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1365 goto out; 1358 goto out;
1366clear_pmdnuma: 1359clear_pmdnuma:
1367 BUG_ON(!PageLocked(page)); 1360 BUG_ON(!PageLocked(page));
1368 pmd = pmd_mknonnuma(pmd); 1361 pmd = pmd_modify(pmd, vma->vm_page_prot);
1369 set_pmd_at(mm, haddr, pmdp, pmd); 1362 set_pmd_at(mm, haddr, pmdp, pmd);
1370 VM_BUG_ON(pmd_numa(*pmdp));
1371 update_mmu_cache_pmd(vma, addr, pmdp); 1363 update_mmu_cache_pmd(vma, addr, pmdp);
1372 unlock_page(page); 1364 unlock_page(page);
1373out_unlock: 1365out_unlock:
@@ -1423,26 +1415,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1423 return ret; 1415 return ret;
1424} 1416}
1425 1417
1426int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1427 unsigned long addr, unsigned long end,
1428 unsigned char *vec)
1429{
1430 spinlock_t *ptl;
1431 int ret = 0;
1432
1433 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1434 /*
1435 * All logical pages in the range are present
1436 * if backed by a huge page.
1437 */
1438 spin_unlock(ptl);
1439 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1440 ret = 1;
1441 }
1442
1443 return ret;
1444}
1445
1446int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, 1418int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1447 unsigned long old_addr, 1419 unsigned long old_addr,
1448 unsigned long new_addr, unsigned long old_end, 1420 unsigned long new_addr, unsigned long old_end,
@@ -1510,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1510 1482
1511 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1483 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1512 pmd_t entry; 1484 pmd_t entry;
1513 ret = 1; 1485
1514 if (!prot_numa) { 1486 /*
1487 * Avoid trapping faults against the zero page. The read-only
1488 * data is likely to be read-cached on the local CPU and
1489 * local/remote hits to the zero page are not interesting.
1490 */
1491 if (prot_numa && is_huge_zero_pmd(*pmd)) {
1492 spin_unlock(ptl);
1493 return 0;
1494 }
1495
1496 if (!prot_numa || !pmd_protnone(*pmd)) {
1497 ret = 1;
1515 entry = pmdp_get_and_clear_notify(mm, addr, pmd); 1498 entry = pmdp_get_and_clear_notify(mm, addr, pmd);
1516 if (pmd_numa(entry))
1517 entry = pmd_mknonnuma(entry);
1518 entry = pmd_modify(entry, newprot); 1499 entry = pmd_modify(entry, newprot);
1519 ret = HPAGE_PMD_NR; 1500 ret = HPAGE_PMD_NR;
1520 set_pmd_at(mm, addr, pmd, entry); 1501 set_pmd_at(mm, addr, pmd, entry);
1521 BUG_ON(pmd_write(entry)); 1502 BUG_ON(pmd_write(entry));
1522 } else {
1523 struct page *page = pmd_page(*pmd);
1524
1525 /*
1526 * Do not trap faults against the zero page. The
1527 * read-only data is likely to be read-cached on the
1528 * local CPU cache and it is less useful to know about
1529 * local vs remote hits on the zero page.
1530 */
1531 if (!is_huge_zero_page(page) &&
1532 !pmd_numa(*pmd)) {
1533 pmdp_set_numa(mm, addr, pmd);
1534 ret = HPAGE_PMD_NR;
1535 }
1536 } 1503 }
1537 spin_unlock(ptl); 1504 spin_unlock(ptl);
1538 } 1505 }
@@ -1797,9 +1764,9 @@ static int __split_huge_page_map(struct page *page,
1797 pte_t *pte, entry; 1764 pte_t *pte, entry;
1798 BUG_ON(PageCompound(page+i)); 1765 BUG_ON(PageCompound(page+i));
1799 /* 1766 /*
1800 * Note that pmd_numa is not transferred deliberately 1767 * Note that NUMA hinting access restrictions are not
1801 * to avoid any possibility that pte_numa leaks to 1768 * transferred to avoid any possibility of altering
1802 * a PROT_NONE VMA by accident. 1769 * permissions across VMAs.
1803 */ 1770 */
1804 entry = mk_pte(page + i, vma->vm_page_prot); 1771 entry = mk_pte(page + i, vma->vm_page_prot);
1805 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1772 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2148,7 +2115,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2148{ 2115{
2149 struct page *page; 2116 struct page *page;
2150 pte_t *_pte; 2117 pte_t *_pte;
2151 int referenced = 0, none = 0; 2118 int none = 0;
2119 bool referenced = false, writable = false;
2152 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 2120 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
2153 _pte++, address += PAGE_SIZE) { 2121 _pte++, address += PAGE_SIZE) {
2154 pte_t pteval = *_pte; 2122 pte_t pteval = *_pte;
@@ -2158,7 +2126,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2158 else 2126 else
2159 goto out; 2127 goto out;
2160 } 2128 }
2161 if (!pte_present(pteval) || !pte_write(pteval)) 2129 if (!pte_present(pteval))
2162 goto out; 2130 goto out;
2163 page = vm_normal_page(vma, address, pteval); 2131 page = vm_normal_page(vma, address, pteval);
2164 if (unlikely(!page)) 2132 if (unlikely(!page))
@@ -2168,9 +2136,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2168 VM_BUG_ON_PAGE(!PageAnon(page), page); 2136 VM_BUG_ON_PAGE(!PageAnon(page), page);
2169 VM_BUG_ON_PAGE(!PageSwapBacked(page), page); 2137 VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
2170 2138
2171 /* cannot use mapcount: can't collapse if there's a gup pin */
2172 if (page_count(page) != 1)
2173 goto out;
2174 /* 2139 /*
2175 * We can do it before isolate_lru_page because the 2140 * We can do it before isolate_lru_page because the
2176 * page can't be freed from under us. NOTE: PG_lock 2141 * page can't be freed from under us. NOTE: PG_lock
@@ -2179,6 +2144,29 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2179 */ 2144 */
2180 if (!trylock_page(page)) 2145 if (!trylock_page(page))
2181 goto out; 2146 goto out;
2147
2148 /*
2149 * cannot use mapcount: can't collapse if there's a gup pin.
2150 * The page must only be referenced by the scanned process
2151 * and page swap cache.
2152 */
2153 if (page_count(page) != 1 + !!PageSwapCache(page)) {
2154 unlock_page(page);
2155 goto out;
2156 }
2157 if (pte_write(pteval)) {
2158 writable = true;
2159 } else {
2160 if (PageSwapCache(page) && !reuse_swap_page(page)) {
2161 unlock_page(page);
2162 goto out;
2163 }
2164 /*
2165 * Page is not in the swap cache. It can be collapsed
2166 * into a THP.
2167 */
2168 }
2169
2182 /* 2170 /*
2183 * Isolate the page to avoid collapsing an hugepage 2171 * Isolate the page to avoid collapsing an hugepage
2184 * currently in use by the VM. 2172 * currently in use by the VM.
@@ -2195,9 +2183,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
2195 /* If there is no mapped pte young don't collapse the page */ 2183 /* If there is no mapped pte young don't collapse the page */
2196 if (pte_young(pteval) || PageReferenced(page) || 2184 if (pte_young(pteval) || PageReferenced(page) ||
2197 mmu_notifier_test_young(vma->vm_mm, address)) 2185 mmu_notifier_test_young(vma->vm_mm, address))
2198 referenced = 1; 2186 referenced = true;
2199 } 2187 }
2200 if (likely(referenced)) 2188 if (likely(referenced && writable))
2201 return 1; 2189 return 1;
2202out: 2190out:
2203 release_pte_pages(pte, _pte); 2191 release_pte_pages(pte, _pte);
@@ -2550,11 +2538,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2550{ 2538{
2551 pmd_t *pmd; 2539 pmd_t *pmd;
2552 pte_t *pte, *_pte; 2540 pte_t *pte, *_pte;
2553 int ret = 0, referenced = 0, none = 0; 2541 int ret = 0, none = 0;
2554 struct page *page; 2542 struct page *page;
2555 unsigned long _address; 2543 unsigned long _address;
2556 spinlock_t *ptl; 2544 spinlock_t *ptl;
2557 int node = NUMA_NO_NODE; 2545 int node = NUMA_NO_NODE;
2546 bool writable = false, referenced = false;
2558 2547
2559 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2548 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2560 2549
@@ -2573,8 +2562,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2573 else 2562 else
2574 goto out_unmap; 2563 goto out_unmap;
2575 } 2564 }
2576 if (!pte_present(pteval) || !pte_write(pteval)) 2565 if (!pte_present(pteval))
2577 goto out_unmap; 2566 goto out_unmap;
2567 if (pte_write(pteval))
2568 writable = true;
2569
2578 page = vm_normal_page(vma, _address, pteval); 2570 page = vm_normal_page(vma, _address, pteval);
2579 if (unlikely(!page)) 2571 if (unlikely(!page))
2580 goto out_unmap; 2572 goto out_unmap;
@@ -2591,14 +2583,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2591 VM_BUG_ON_PAGE(PageCompound(page), page); 2583 VM_BUG_ON_PAGE(PageCompound(page), page);
2592 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) 2584 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
2593 goto out_unmap; 2585 goto out_unmap;
2594 /* cannot use mapcount: can't collapse if there's a gup pin */ 2586 /*
2595 if (page_count(page) != 1) 2587 * cannot use mapcount: can't collapse if there's a gup pin.
2588 * The page must only be referenced by the scanned process
2589 * and page swap cache.
2590 */
2591 if (page_count(page) != 1 + !!PageSwapCache(page))
2596 goto out_unmap; 2592 goto out_unmap;
2597 if (pte_young(pteval) || PageReferenced(page) || 2593 if (pte_young(pteval) || PageReferenced(page) ||
2598 mmu_notifier_test_young(vma->vm_mm, address)) 2594 mmu_notifier_test_young(vma->vm_mm, address))
2599 referenced = 1; 2595 referenced = true;
2600 } 2596 }
2601 if (referenced) 2597 if (referenced && writable)
2602 ret = 1; 2598 ret = 1;
2603out_unmap: 2599out_unmap:
2604 pte_unmap_unlock(pte, ptl); 2600 pte_unmap_unlock(pte, ptl);
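
The khugepaged checks above compare page_count() against 1 + !!PageSwapCache(page). A hedged restatement of that accounting, with a hypothetical helper name, purely to make the arithmetic explicit:

/*
 * The only references the collapse path tolerates on a candidate page are
 * the single PTE mapping (count of 1) plus, when the page also sits in the
 * swap cache, the swap cache's own reference (one more). Anything beyond
 * that, e.g. a gup pin, means the page must not be collapsed.
 */
static inline bool thp_collapse_refcount_ok(struct page *page)
{
        int expected = 1 + !!PageSwapCache(page);

        return page_count(page) == expected;
}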
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 85032de5e20f..0a9ac6c26832 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -35,7 +35,7 @@
35#include <linux/node.h> 35#include <linux/node.h>
36#include "internal.h" 36#include "internal.h"
37 37
38unsigned long hugepages_treat_as_movable; 38int hugepages_treat_as_movable;
39 39
40int hugetlb_max_hstate __read_mostly; 40int hugetlb_max_hstate __read_mostly;
41unsigned int default_hstate_idx; 41unsigned int default_hstate_idx;
@@ -2657,9 +2657,10 @@ again:
2657 goto unlock; 2657 goto unlock;
2658 2658
2659 /* 2659 /*
2660 * HWPoisoned hugepage is already unmapped and dropped reference 2660 * Migrating hugepage or HWPoisoned hugepage is already
2661 * unmapped and its refcount is dropped, so just clear pte here.
2661 */ 2662 */
2662 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { 2663 if (unlikely(!pte_present(pte))) {
2663 huge_pte_clear(mm, address, ptep); 2664 huge_pte_clear(mm, address, ptep);
2664 goto unlock; 2665 goto unlock;
2665 } 2666 }
@@ -3134,6 +3135,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3134 struct page *pagecache_page = NULL; 3135 struct page *pagecache_page = NULL;
3135 struct hstate *h = hstate_vma(vma); 3136 struct hstate *h = hstate_vma(vma);
3136 struct address_space *mapping; 3137 struct address_space *mapping;
3138 int need_wait_lock = 0;
3137 3139
3138 address &= huge_page_mask(h); 3140 address &= huge_page_mask(h);
3139 3141
@@ -3172,6 +3174,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3172 ret = 0; 3174 ret = 0;
3173 3175
3174 /* 3176 /*
3177 * entry could be a migration/hwpoison entry at this point, so this
3178 * check prevents the kernel from going below assuming that we have
3179 * an active hugepage in pagecache. This goto expects the 2nd page fault,

3180 * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
3181 * handle it.
3182 */
3183 if (!pte_present(entry))
3184 goto out_mutex;
3185
3186 /*
3175 * If we are going to COW the mapping later, we examine the pending 3187 * If we are going to COW the mapping later, we examine the pending
3176 * reservations for this page now. This will ensure that any 3188 * reservations for this page now. This will ensure that any
3177 * allocations necessary to record that reservation occur outside the 3189 * allocations necessary to record that reservation occur outside the
@@ -3190,30 +3202,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3190 vma, address); 3202 vma, address);
3191 } 3203 }
3192 3204
3205 ptl = huge_pte_lock(h, mm, ptep);
3206
3207 /* Check for a racing update before calling hugetlb_cow */
3208 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
3209 goto out_ptl;
3210
3193 /* 3211 /*
3194 * hugetlb_cow() requires page locks of pte_page(entry) and 3212 * hugetlb_cow() requires page locks of pte_page(entry) and
3195 * pagecache_page, so here we need take the former one 3213 * pagecache_page, so here we need take the former one
3196 * when page != pagecache_page or !pagecache_page. 3214 * when page != pagecache_page or !pagecache_page.
3197 * Note that locking order is always pagecache_page -> page,
3198 * so no worry about deadlock.
3199 */ 3215 */
3200 page = pte_page(entry); 3216 page = pte_page(entry);
3201 get_page(page);
3202 if (page != pagecache_page) 3217 if (page != pagecache_page)
3203 lock_page(page); 3218 if (!trylock_page(page)) {
3204 3219 need_wait_lock = 1;
3205 ptl = huge_pte_lockptr(h, mm, ptep); 3220 goto out_ptl;
3206 spin_lock(ptl); 3221 }
3207 /* Check for a racing update before calling hugetlb_cow */
3208 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
3209 goto out_ptl;
3210 3222
3223 get_page(page);
3211 3224
3212 if (flags & FAULT_FLAG_WRITE) { 3225 if (flags & FAULT_FLAG_WRITE) {
3213 if (!huge_pte_write(entry)) { 3226 if (!huge_pte_write(entry)) {
3214 ret = hugetlb_cow(mm, vma, address, ptep, entry, 3227 ret = hugetlb_cow(mm, vma, address, ptep, entry,
3215 pagecache_page, ptl); 3228 pagecache_page, ptl);
3216 goto out_ptl; 3229 goto out_put_page;
3217 } 3230 }
3218 entry = huge_pte_mkdirty(entry); 3231 entry = huge_pte_mkdirty(entry);
3219 } 3232 }
@@ -3221,7 +3234,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3221 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 3234 if (huge_ptep_set_access_flags(vma, address, ptep, entry,
3222 flags & FAULT_FLAG_WRITE)) 3235 flags & FAULT_FLAG_WRITE))
3223 update_mmu_cache(vma, address, ptep); 3236 update_mmu_cache(vma, address, ptep);
3224 3237out_put_page:
3238 if (page != pagecache_page)
3239 unlock_page(page);
3240 put_page(page);
3225out_ptl: 3241out_ptl:
3226 spin_unlock(ptl); 3242 spin_unlock(ptl);
3227 3243
@@ -3229,12 +3245,17 @@ out_ptl:
3229 unlock_page(pagecache_page); 3245 unlock_page(pagecache_page);
3230 put_page(pagecache_page); 3246 put_page(pagecache_page);
3231 } 3247 }
3232 if (page != pagecache_page)
3233 unlock_page(page);
3234 put_page(page);
3235
3236out_mutex: 3248out_mutex:
3237 mutex_unlock(&htlb_fault_mutex_table[hash]); 3249 mutex_unlock(&htlb_fault_mutex_table[hash]);
3250 /*
3251 * Generally it's safe to hold refcount during waiting page lock. But
3252 * here we just wait to defer the next page fault to avoid busy loop and
3253 * the page is not used after unlocked before returning from the current
3254 * page fault. So we are safe from accessing freed page, even if we wait
3255 * here without taking refcount.
3256 */
3257 if (need_wait_lock)
3258 wait_on_page_locked(page);
3238 return ret; 3259 return ret;
3239} 3260}
3240 3261
@@ -3364,7 +3385,26 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
3364 spin_unlock(ptl); 3385 spin_unlock(ptl);
3365 continue; 3386 continue;
3366 } 3387 }
3367 if (!huge_pte_none(huge_ptep_get(ptep))) { 3388 pte = huge_ptep_get(ptep);
3389 if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
3390 spin_unlock(ptl);
3391 continue;
3392 }
3393 if (unlikely(is_hugetlb_entry_migration(pte))) {
3394 swp_entry_t entry = pte_to_swp_entry(pte);
3395
3396 if (is_write_migration_entry(entry)) {
3397 pte_t newpte;
3398
3399 make_migration_entry_read(&entry);
3400 newpte = swp_entry_to_pte(entry);
3401 set_huge_pte_at(mm, address, ptep, newpte);
3402 pages++;
3403 }
3404 spin_unlock(ptl);
3405 continue;
3406 }
3407 if (!huge_pte_none(pte)) {
3368 pte = huge_ptep_get_and_clear(mm, address, ptep); 3408 pte = huge_ptep_get_and_clear(mm, address, ptep);
3369 pte = pte_mkhuge(huge_pte_modify(pte, newprot)); 3409 pte = pte_mkhuge(huge_pte_modify(pte, newprot));
3370 pte = arch_make_huge_pte(pte, vma, NULL, 0); 3410 pte = arch_make_huge_pte(pte, vma, NULL, 0);
@@ -3558,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3558 if (saddr) { 3598 if (saddr) {
3559 spte = huge_pte_offset(svma->vm_mm, saddr); 3599 spte = huge_pte_offset(svma->vm_mm, saddr);
3560 if (spte) { 3600 if (spte) {
3601 mm_inc_nr_pmds(mm);
3561 get_page(virt_to_page(spte)); 3602 get_page(virt_to_page(spte));
3562 break; 3603 break;
3563 } 3604 }
@@ -3569,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
3569 3610
3570 ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); 3611 ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
3571 spin_lock(ptl); 3612 spin_lock(ptl);
3572 if (pud_none(*pud)) 3613 if (pud_none(*pud)) {
3573 pud_populate(mm, pud, 3614 pud_populate(mm, pud,
3574 (pmd_t *)((unsigned long)spte & PAGE_MASK)); 3615 (pmd_t *)((unsigned long)spte & PAGE_MASK));
3575 else 3616 } else {
3576 put_page(virt_to_page(spte)); 3617 put_page(virt_to_page(spte));
3618 mm_inc_nr_pmds(mm);
3619 }
3577 spin_unlock(ptl); 3620 spin_unlock(ptl);
3578out: 3621out:
3579 pte = (pte_t *)pmd_alloc(mm, pud, addr); 3622 pte = (pte_t *)pmd_alloc(mm, pud, addr);
@@ -3604,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
3604 3647
3605 pud_clear(pud); 3648 pud_clear(pud);
3606 put_page(virt_to_page(ptep)); 3649 put_page(virt_to_page(ptep));
3650 mm_dec_nr_pmds(mm);
3607 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; 3651 *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
3608 return 1; 3652 return 1;
3609} 3653}
@@ -3660,42 +3704,64 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
3660 return (pte_t *) pmd; 3704 return (pte_t *) pmd;
3661} 3705}
3662 3706
3663struct page * 3707#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
3664follow_huge_pmd(struct mm_struct *mm, unsigned long address,
3665 pmd_t *pmd, int write)
3666{
3667 struct page *page;
3668 3708
3669 page = pte_page(*(pte_t *)pmd); 3709/*
3670 if (page) 3710 * These functions are overwritable if your architecture needs its own
3671 page += ((address & ~PMD_MASK) >> PAGE_SHIFT); 3711 * behavior.
3672 return page; 3712 */
3713struct page * __weak
3714follow_huge_addr(struct mm_struct *mm, unsigned long address,
3715 int write)
3716{
3717 return ERR_PTR(-EINVAL);
3673} 3718}
3674 3719
3675struct page * 3720struct page * __weak
3676follow_huge_pud(struct mm_struct *mm, unsigned long address, 3721follow_huge_pmd(struct mm_struct *mm, unsigned long address,
3677 pud_t *pud, int write) 3722 pmd_t *pmd, int flags)
3678{ 3723{
3679 struct page *page; 3724 struct page *page = NULL;
3680 3725 spinlock_t *ptl;
3681 page = pte_page(*(pte_t *)pud); 3726retry:
3682 if (page) 3727 ptl = pmd_lockptr(mm, pmd);
3683 page += ((address & ~PUD_MASK) >> PAGE_SHIFT); 3728 spin_lock(ptl);
3729 /*
3730 * make sure that the address range covered by this pmd is not
3731 * unmapped from other threads.
3732 */
3733 if (!pmd_huge(*pmd))
3734 goto out;
3735 if (pmd_present(*pmd)) {
3736 page = pte_page(*(pte_t *)pmd) +
3737 ((address & ~PMD_MASK) >> PAGE_SHIFT);
3738 if (flags & FOLL_GET)
3739 get_page(page);
3740 } else {
3741 if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
3742 spin_unlock(ptl);
3743 __migration_entry_wait(mm, (pte_t *)pmd, ptl);
3744 goto retry;
3745 }
3746 /*
3747 * hwpoisoned entry is treated as no_page_table in
3748 * follow_page_mask().
3749 */
3750 }
3751out:
3752 spin_unlock(ptl);
3684 return page; 3753 return page;
3685} 3754}
3686 3755
3687#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
3688
3689/* Can be overriden by architectures */
3690struct page * __weak 3756struct page * __weak
3691follow_huge_pud(struct mm_struct *mm, unsigned long address, 3757follow_huge_pud(struct mm_struct *mm, unsigned long address,
3692 pud_t *pud, int write) 3758 pud_t *pud, int flags)
3693{ 3759{
3694 BUG(); 3760 if (flags & FOLL_GET)
3695 return NULL; 3761 return NULL;
3696}
3697 3762
3698#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ 3763 return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
3764}
3699 3765
3700#ifdef CONFIG_MEMORY_FAILURE 3766#ifdef CONFIG_MEMORY_FAILURE
3701 3767
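
The follow_huge_pmd()/follow_huge_pud() rework above relies on __weak linkage so that an architecture can supply its own implementation. A generic illustration of that mechanism, using a made-up symbol name (arch_hugetlb_quirk) rather than anything from this patch:

/* generic code: weak default, used when no arch definition exists */
int __weak arch_hugetlb_quirk(void)
{
        return 0;
}

/* arch code (separate object file): a strong definition wins at link time */
int arch_hugetlb_quirk(void)
{
        return 1;
}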
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 037e1c00a5b7..6e0057439a46 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -279,7 +279,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
279 return -EINVAL; 279 return -EINVAL;
280 280
281 buf = strstrip(buf); 281 buf = strstrip(buf);
282 ret = page_counter_memparse(buf, &nr_pages); 282 ret = page_counter_memparse(buf, "-1", &nr_pages);
283 if (ret) 283 if (ret)
284 return ret; 284 return ret;
285 285
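
page_counter_memparse() now takes the string that should mean "unlimited" as its second argument. A small sketch of how a caller might use it, assuming the usual behaviour that the sentinel string parses to PAGE_COUNTER_MAX; the wrapper name is illustrative only:

static int parse_hugetlb_limit(char *buf, unsigned long *nr_pages)
{
        /* "-1" (the sentinel passed above) maps to PAGE_COUNTER_MAX */
        return page_counter_memparse(strstrip(buf), "-1", nr_pages);
}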
diff --git a/mm/internal.h b/mm/internal.h
index efad241f7014..a96da5b0029d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -110,6 +110,28 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
110 */ 110 */
111 111
112/* 112/*
113 * Structure for holding the mostly immutable allocation parameters passed
114 * between functions involved in allocations, including the alloc_pages*
115 * family of functions.
116 *
117 * nodemask, migratetype and high_zoneidx are initialized only once in
118 * __alloc_pages_nodemask() and then never change.
119 *
120 * zonelist, preferred_zone and classzone_idx are set first in
121 * __alloc_pages_nodemask() for the fast path, and might be later changed
122 * in __alloc_pages_slowpath(). All other functions pass the whole structure
123 * by a const pointer.
124 */
125struct alloc_context {
126 struct zonelist *zonelist;
127 nodemask_t *nodemask;
128 struct zone *preferred_zone;
129 int classzone_idx;
130 int migratetype;
131 enum zone_type high_zoneidx;
132};
133
134/*
113 * Locate the struct page for both the matching buddy in our 135 * Locate the struct page for both the matching buddy in our
114 * pair (buddy1) and the combined O(n+1) page they form (page). 136 * pair (buddy1) and the combined O(n+1) page they form (page).
115 * 137 *
@@ -329,8 +351,10 @@ extern int mminit_loglevel;
329#define mminit_dprintk(level, prefix, fmt, arg...) \ 351#define mminit_dprintk(level, prefix, fmt, arg...) \
330do { \ 352do { \
331 if (level < mminit_loglevel) { \ 353 if (level < mminit_loglevel) { \
332 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ 354 if (level <= MMINIT_WARNING) \
333 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ 355 printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \
356 else \
357 printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
334 } \ 358 } \
335} while (0) 359} while (0)
336 360
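
A hedged sketch of how the new struct alloc_context is meant to be populated, based on the comment above; the function name is hypothetical and the real initialization happens in __alloc_pages_nodemask():

static void example_fill_alloc_context(struct alloc_context *ac,
                                       struct zonelist *zonelist,
                                       nodemask_t *nodemask,
                                       gfp_t gfp_mask, int migratetype)
{
        /* set once and never changed afterwards */
        ac->high_zoneidx = gfp_zone(gfp_mask);
        ac->zonelist = zonelist;
        ac->nodemask = nodemask;
        ac->migratetype = migratetype;
        /*
         * preferred_zone and classzone_idx are left for the fast path to
         * fill in and may be refreshed by __alloc_pages_slowpath(); all
         * later helpers receive the struct by const pointer.
         */
}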
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 8da581fa9060..f2c2492681bf 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
21 return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; 21 return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
22} 22}
23 23
24INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, 24INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
25 unsigned long, shared.linear.rb_subtree_last, 25 unsigned long, shared.rb_subtree_last,
26 vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) 26 vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
27 27
28/* Insert node immediately after prev in the interval tree */ 28/* Insert node immediately after prev in the interval tree */
@@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
36 36
37 VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); 37 VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);
38 38
39 if (!prev->shared.linear.rb.rb_right) { 39 if (!prev->shared.rb.rb_right) {
40 parent = prev; 40 parent = prev;
41 link = &prev->shared.linear.rb.rb_right; 41 link = &prev->shared.rb.rb_right;
42 } else { 42 } else {
43 parent = rb_entry(prev->shared.linear.rb.rb_right, 43 parent = rb_entry(prev->shared.rb.rb_right,
44 struct vm_area_struct, shared.linear.rb); 44 struct vm_area_struct, shared.rb);
45 if (parent->shared.linear.rb_subtree_last < last) 45 if (parent->shared.rb_subtree_last < last)
46 parent->shared.linear.rb_subtree_last = last; 46 parent->shared.rb_subtree_last = last;
47 while (parent->shared.linear.rb.rb_left) { 47 while (parent->shared.rb.rb_left) {
48 parent = rb_entry(parent->shared.linear.rb.rb_left, 48 parent = rb_entry(parent->shared.rb.rb_left,
49 struct vm_area_struct, shared.linear.rb); 49 struct vm_area_struct, shared.rb);
50 if (parent->shared.linear.rb_subtree_last < last) 50 if (parent->shared.rb_subtree_last < last)
51 parent->shared.linear.rb_subtree_last = last; 51 parent->shared.rb_subtree_last = last;
52 } 52 }
53 link = &parent->shared.linear.rb.rb_left; 53 link = &parent->shared.rb.rb_left;
54 } 54 }
55 55
56 node->shared.linear.rb_subtree_last = last; 56 node->shared.rb_subtree_last = last;
57 rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); 57 rb_link_node(&node->shared.rb, &parent->shared.rb, link);
58 rb_insert_augmented(&node->shared.linear.rb, root, 58 rb_insert_augmented(&node->shared.rb, root,
59 &vma_interval_tree_augment); 59 &vma_interval_tree_augment);
60} 60}
61 61
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
new file mode 100644
index 000000000000..bd837b8c2f41
--- /dev/null
+++ b/mm/kasan/Makefile
@@ -0,0 +1,8 @@
1KASAN_SANITIZE := n
2
3CFLAGS_REMOVE_kasan.o = -pg
4# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
5# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
6CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
7
8obj-y := kasan.o report.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
new file mode 100644
index 000000000000..78fee632a7ee
--- /dev/null
+++ b/mm/kasan/kasan.c
@@ -0,0 +1,516 @@
1/*
2 * This file contains shadow memory manipulation code.
3 *
4 * Copyright (c) 2014 Samsung Electronics Co., Ltd.
5 * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
6 *
7 * Some of code borrowed from https://github.com/xairy/linux by
8 * Andrey Konovalov <adech.fo@gmail.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 *
14 */
15
16#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
17#define DISABLE_BRANCH_PROFILING
18
19#include <linux/export.h>
20#include <linux/init.h>
21#include <linux/kernel.h>
22#include <linux/memblock.h>
23#include <linux/memory.h>
24#include <linux/mm.h>
25#include <linux/module.h>
26#include <linux/printk.h>
27#include <linux/sched.h>
28#include <linux/slab.h>
29#include <linux/stacktrace.h>
30#include <linux/string.h>
31#include <linux/types.h>
32#include <linux/kasan.h>
33
34#include "kasan.h"
35#include "../slab.h"
36
37/*
38 * Poisons the shadow memory for 'size' bytes starting from 'addr'.
39 * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
40 */
41static void kasan_poison_shadow(const void *address, size_t size, u8 value)
42{
43 void *shadow_start, *shadow_end;
44
45 shadow_start = kasan_mem_to_shadow(address);
46 shadow_end = kasan_mem_to_shadow(address + size);
47
48 memset(shadow_start, value, shadow_end - shadow_start);
49}
50
51void kasan_unpoison_shadow(const void *address, size_t size)
52{
53 kasan_poison_shadow(address, size, 0);
54
55 if (size & KASAN_SHADOW_MASK) {
56 u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
57 *shadow = size & KASAN_SHADOW_MASK;
58 }
59}
60
61
62/*
63 * All functions below are always inlined so the compiler can
64 * perform better optimizations in each of __asan_loadX/__asan_storeX
65 * depending on the memory access size X.
66 */
67
68static __always_inline bool memory_is_poisoned_1(unsigned long addr)
69{
70 s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
71
72 if (unlikely(shadow_value)) {
73 s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
74 return unlikely(last_accessible_byte >= shadow_value);
75 }
76
77 return false;
78}
79
80static __always_inline bool memory_is_poisoned_2(unsigned long addr)
81{
82 u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
83
84 if (unlikely(*shadow_addr)) {
85 if (memory_is_poisoned_1(addr + 1))
86 return true;
87
88 if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
89 return false;
90
91 return unlikely(*(u8 *)shadow_addr);
92 }
93
94 return false;
95}
96
97static __always_inline bool memory_is_poisoned_4(unsigned long addr)
98{
99 u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
100
101 if (unlikely(*shadow_addr)) {
102 if (memory_is_poisoned_1(addr + 3))
103 return true;
104
105 if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
106 return false;
107
108 return unlikely(*(u8 *)shadow_addr);
109 }
110
111 return false;
112}
113
114static __always_inline bool memory_is_poisoned_8(unsigned long addr)
115{
116 u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
117
118 if (unlikely(*shadow_addr)) {
119 if (memory_is_poisoned_1(addr + 7))
120 return true;
121
122 if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7))
123 return false;
124
125 return unlikely(*(u8 *)shadow_addr);
126 }
127
128 return false;
129}
130
131static __always_inline bool memory_is_poisoned_16(unsigned long addr)
132{
133 u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr);
134
135 if (unlikely(*shadow_addr)) {
136 u16 shadow_first_bytes = *(u16 *)shadow_addr;
137 s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK;
138
139 if (unlikely(shadow_first_bytes))
140 return true;
141
142 if (likely(!last_byte))
143 return false;
144
145 return memory_is_poisoned_1(addr + 15);
146 }
147
148 return false;
149}
150
151static __always_inline unsigned long bytes_is_zero(const u8 *start,
152 size_t size)
153{
154 while (size) {
155 if (unlikely(*start))
156 return (unsigned long)start;
157 start++;
158 size--;
159 }
160
161 return 0;
162}
163
164static __always_inline unsigned long memory_is_zero(const void *start,
165 const void *end)
166{
167 unsigned int words;
168 unsigned long ret;
169 unsigned int prefix = (unsigned long)start % 8;
170
171 if (end - start <= 16)
172 return bytes_is_zero(start, end - start);
173
174 if (prefix) {
175 prefix = 8 - prefix;
176 ret = bytes_is_zero(start, prefix);
177 if (unlikely(ret))
178 return ret;
179 start += prefix;
180 }
181
182 words = (end - start) / 8;
183 while (words) {
184 if (unlikely(*(u64 *)start))
185 return bytes_is_zero(start, 8);
186 start += 8;
187 words--;
188 }
189
190 return bytes_is_zero(start, (end - start) % 8);
191}
192
193static __always_inline bool memory_is_poisoned_n(unsigned long addr,
194 size_t size)
195{
196 unsigned long ret;
197
198 ret = memory_is_zero(kasan_mem_to_shadow((void *)addr),
199 kasan_mem_to_shadow((void *)addr + size - 1) + 1);
200
201 if (unlikely(ret)) {
202 unsigned long last_byte = addr + size - 1;
203 s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
204
205 if (unlikely(ret != (unsigned long)last_shadow ||
206 ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
207 return true;
208 }
209 return false;
210}
211
212static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
213{
214 if (__builtin_constant_p(size)) {
215 switch (size) {
216 case 1:
217 return memory_is_poisoned_1(addr);
218 case 2:
219 return memory_is_poisoned_2(addr);
220 case 4:
221 return memory_is_poisoned_4(addr);
222 case 8:
223 return memory_is_poisoned_8(addr);
224 case 16:
225 return memory_is_poisoned_16(addr);
226 default:
227 BUILD_BUG();
228 }
229 }
230
231 return memory_is_poisoned_n(addr, size);
232}
233
234
235static __always_inline void check_memory_region(unsigned long addr,
236 size_t size, bool write)
237{
238 struct kasan_access_info info;
239
240 if (unlikely(size == 0))
241 return;
242
243 if (unlikely((void *)addr <
244 kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
245 info.access_addr = (void *)addr;
246 info.access_size = size;
247 info.is_write = write;
248 info.ip = _RET_IP_;
249 kasan_report_user_access(&info);
250 return;
251 }
252
253 if (likely(!memory_is_poisoned(addr, size)))
254 return;
255
256 kasan_report(addr, size, write, _RET_IP_);
257}
258
259void __asan_loadN(unsigned long addr, size_t size);
260void __asan_storeN(unsigned long addr, size_t size);
261
262#undef memset
263void *memset(void *addr, int c, size_t len)
264{
265 __asan_storeN((unsigned long)addr, len);
266
267 return __memset(addr, c, len);
268}
269
270#undef memmove
271void *memmove(void *dest, const void *src, size_t len)
272{
273 __asan_loadN((unsigned long)src, len);
274 __asan_storeN((unsigned long)dest, len);
275
276 return __memmove(dest, src, len);
277}
278
279#undef memcpy
280void *memcpy(void *dest, const void *src, size_t len)
281{
282 __asan_loadN((unsigned long)src, len);
283 __asan_storeN((unsigned long)dest, len);
284
285 return __memcpy(dest, src, len);
286}
287
288void kasan_alloc_pages(struct page *page, unsigned int order)
289{
290 if (likely(!PageHighMem(page)))
291 kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
292}
293
294void kasan_free_pages(struct page *page, unsigned int order)
295{
296 if (likely(!PageHighMem(page)))
297 kasan_poison_shadow(page_address(page),
298 PAGE_SIZE << order,
299 KASAN_FREE_PAGE);
300}
301
302void kasan_poison_slab(struct page *page)
303{
304 kasan_poison_shadow(page_address(page),
305 PAGE_SIZE << compound_order(page),
306 KASAN_KMALLOC_REDZONE);
307}
308
309void kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
310{
311 kasan_unpoison_shadow(object, cache->object_size);
312}
313
314void kasan_poison_object_data(struct kmem_cache *cache, void *object)
315{
316 kasan_poison_shadow(object,
317 round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
318 KASAN_KMALLOC_REDZONE);
319}
320
321void kasan_slab_alloc(struct kmem_cache *cache, void *object)
322{
323 kasan_kmalloc(cache, object, cache->object_size);
324}
325
326void kasan_slab_free(struct kmem_cache *cache, void *object)
327{
328 unsigned long size = cache->object_size;
329 unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
330
331 /* RCU slabs could be legally used after free within the RCU period */
332 if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
333 return;
334
335 kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
336}
337
338void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size)
339{
340 unsigned long redzone_start;
341 unsigned long redzone_end;
342
343 if (unlikely(object == NULL))
344 return;
345
346 redzone_start = round_up((unsigned long)(object + size),
347 KASAN_SHADOW_SCALE_SIZE);
348 redzone_end = round_up((unsigned long)object + cache->object_size,
349 KASAN_SHADOW_SCALE_SIZE);
350
351 kasan_unpoison_shadow(object, size);
352 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
353 KASAN_KMALLOC_REDZONE);
354}
355EXPORT_SYMBOL(kasan_kmalloc);
356
357void kasan_kmalloc_large(const void *ptr, size_t size)
358{
359 struct page *page;
360 unsigned long redzone_start;
361 unsigned long redzone_end;
362
363 if (unlikely(ptr == NULL))
364 return;
365
366 page = virt_to_page(ptr);
367 redzone_start = round_up((unsigned long)(ptr + size),
368 KASAN_SHADOW_SCALE_SIZE);
369 redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page));
370
371 kasan_unpoison_shadow(ptr, size);
372 kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
373 KASAN_PAGE_REDZONE);
374}
375
376void kasan_krealloc(const void *object, size_t size)
377{
378 struct page *page;
379
380 if (unlikely(object == ZERO_SIZE_PTR))
381 return;
382
383 page = virt_to_head_page(object);
384
385 if (unlikely(!PageSlab(page)))
386 kasan_kmalloc_large(object, size);
387 else
388 kasan_kmalloc(page->slab_cache, object, size);
389}
390
391void kasan_kfree_large(const void *ptr)
392{
393 struct page *page = virt_to_page(ptr);
394
395 kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
396 KASAN_FREE_PAGE);
397}
398
399int kasan_module_alloc(void *addr, size_t size)
400{
401 void *ret;
402 size_t shadow_size;
403 unsigned long shadow_start;
404
405 shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
406 shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT,
407 PAGE_SIZE);
408
409 if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
410 return -EINVAL;
411
412 ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
413 shadow_start + shadow_size,
414 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
415 PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
416 __builtin_return_address(0));
417 return ret ? 0 : -ENOMEM;
418}
419
420void kasan_module_free(void *addr)
421{
422 vfree(kasan_mem_to_shadow(addr));
423}
424
425static void register_global(struct kasan_global *global)
426{
427 size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
428
429 kasan_unpoison_shadow(global->beg, global->size);
430
431 kasan_poison_shadow(global->beg + aligned_size,
432 global->size_with_redzone - aligned_size,
433 KASAN_GLOBAL_REDZONE);
434}
435
436void __asan_register_globals(struct kasan_global *globals, size_t size)
437{
438 int i;
439
440 for (i = 0; i < size; i++)
441 register_global(&globals[i]);
442}
443EXPORT_SYMBOL(__asan_register_globals);
444
445void __asan_unregister_globals(struct kasan_global *globals, size_t size)
446{
447}
448EXPORT_SYMBOL(__asan_unregister_globals);
449
450#define DEFINE_ASAN_LOAD_STORE(size) \
451 void __asan_load##size(unsigned long addr) \
452 { \
453 check_memory_region(addr, size, false); \
454 } \
455 EXPORT_SYMBOL(__asan_load##size); \
456 __alias(__asan_load##size) \
457 void __asan_load##size##_noabort(unsigned long); \
458 EXPORT_SYMBOL(__asan_load##size##_noabort); \
459 void __asan_store##size(unsigned long addr) \
460 { \
461 check_memory_region(addr, size, true); \
462 } \
463 EXPORT_SYMBOL(__asan_store##size); \
464 __alias(__asan_store##size) \
465 void __asan_store##size##_noabort(unsigned long); \
466 EXPORT_SYMBOL(__asan_store##size##_noabort)
467
468DEFINE_ASAN_LOAD_STORE(1);
469DEFINE_ASAN_LOAD_STORE(2);
470DEFINE_ASAN_LOAD_STORE(4);
471DEFINE_ASAN_LOAD_STORE(8);
472DEFINE_ASAN_LOAD_STORE(16);
473
474void __asan_loadN(unsigned long addr, size_t size)
475{
476 check_memory_region(addr, size, false);
477}
478EXPORT_SYMBOL(__asan_loadN);
479
480__alias(__asan_loadN)
481void __asan_loadN_noabort(unsigned long, size_t);
482EXPORT_SYMBOL(__asan_loadN_noabort);
483
484void __asan_storeN(unsigned long addr, size_t size)
485{
486 check_memory_region(addr, size, true);
487}
488EXPORT_SYMBOL(__asan_storeN);
489
490__alias(__asan_storeN)
491void __asan_storeN_noabort(unsigned long, size_t);
492EXPORT_SYMBOL(__asan_storeN_noabort);
493
494/* to shut up compiler complaints */
495void __asan_handle_no_return(void) {}
496EXPORT_SYMBOL(__asan_handle_no_return);
497
498#ifdef CONFIG_MEMORY_HOTPLUG
499static int kasan_mem_notifier(struct notifier_block *nb,
500 unsigned long action, void *data)
501{
502 return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK;
503}
504
505static int __init kasan_memhotplug_init(void)
506{
507 pr_err("WARNING: KASan doesn't support memory hot-add\n");
508 pr_err("Memory hot-add will be disabled\n");
509
510 hotplug_memory_notifier(kasan_mem_notifier, 0);
511
512 return 0;
513}
514
515module_init(kasan_memhotplug_init);
516#endif
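
For reference, a hedged restatement of the shadow-byte encoding that memory_is_poisoned_1() and friends decode above: each shadow byte covers KASAN_SHADOW_SCALE_SIZE (8) bytes of memory. The helper name below is illustrative and not part of the file:

static bool granule_byte_is_poisoned(unsigned long addr, s8 shadow_value)
{
        s8 offset = addr & KASAN_SHADOW_MASK;   /* position within the 8-byte granule */

        if (shadow_value == 0)
                return false;           /* whole granule addressable */
        if (shadow_value < 0)
                return true;            /* poison marker such as KASAN_FREE_PAGE */
        return offset >= shadow_value;  /* only the first shadow_value bytes are valid */
}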
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
new file mode 100644
index 000000000000..4986b0acab21
--- /dev/null
+++ b/mm/kasan/kasan.h
@@ -0,0 +1,75 @@
1#ifndef __MM_KASAN_KASAN_H
2#define __MM_KASAN_KASAN_H
3
4#include <linux/kasan.h>
5
6#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
7#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
8
9#define KASAN_FREE_PAGE 0xFF /* page was freed */
11#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
12#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
13#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
14#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */
15
16/*
17 * Stack redzone shadow values
18 * (Those are compiler's ABI, don't change them)
19 */
20#define KASAN_STACK_LEFT 0xF1
21#define KASAN_STACK_MID 0xF2
22#define KASAN_STACK_RIGHT 0xF3
23#define KASAN_STACK_PARTIAL 0xF4
24
25/* Don't break randconfig/all*config builds */
26#ifndef KASAN_ABI_VERSION
27#define KASAN_ABI_VERSION 1
28#endif
29
30struct kasan_access_info {
31 const void *access_addr;
32 const void *first_bad_addr;
33 size_t access_size;
34 bool is_write;
35 unsigned long ip;
36};
37
38/* The layout of struct dictated by compiler */
39struct kasan_source_location {
40 const char *filename;
41 int line_no;
42 int column_no;
43};
44
45/* The layout of struct dictated by compiler */
46struct kasan_global {
47 const void *beg; /* Address of the beginning of the global variable. */
48 size_t size; /* Size of the global variable. */
49 size_t size_with_redzone; /* Size of the variable + size of the red zone. 32 bytes aligned */
50 const void *name;
51 const void *module_name; /* Name of the module where the global variable is declared. */
52 unsigned long has_dynamic_init; /* This is needed for C++ */
53#if KASAN_ABI_VERSION >= 4
54 struct kasan_source_location *location;
55#endif
56};
57
58void kasan_report_error(struct kasan_access_info *info);
59void kasan_report_user_access(struct kasan_access_info *info);
60
61static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
62{
63 return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
64 << KASAN_SHADOW_SCALE_SHIFT);
65}
66
67static inline bool kasan_enabled(void)
68{
69 return !current->kasan_depth;
70}
71
72void kasan_report(unsigned long addr, size_t size,
73 bool is_write, unsigned long ip);
74
75#endif
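
kasan_shadow_to_mem() above is the inverse of the mem-to-shadow mapping. A sketch of the forward direction, assuming the definition in <linux/kasan.h> has this shape (shown only to make the pairing explicit):

static inline void *example_mem_to_shadow(const void *addr)
{
        /* one shadow byte per 2^KASAN_SHADOW_SCALE_SHIFT (8) bytes of memory */
        return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
                + KASAN_SHADOW_OFFSET;
}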
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
new file mode 100644
index 000000000000..680ceedf810a
--- /dev/null
+++ b/mm/kasan/report.c
@@ -0,0 +1,269 @@
1/*
2 * This file contains error reporting code.
3 *
4 * Copyright (c) 2014 Samsung Electronics Co., Ltd.
5 * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
6 *
7 * Some of code borrowed from https://github.com/xairy/linux by
8 * Andrey Konovalov <adech.fo@gmail.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 *
14 */
15
16#include <linux/kernel.h>
17#include <linux/mm.h>
18#include <linux/printk.h>
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/stacktrace.h>
22#include <linux/string.h>
23#include <linux/types.h>
24#include <linux/kasan.h>
25
26#include <asm/sections.h>
27
28#include "kasan.h"
29#include "../slab.h"
30
31/* Shadow layout customization. */
32#define SHADOW_BYTES_PER_BLOCK 1
33#define SHADOW_BLOCKS_PER_ROW 16
34#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
35#define SHADOW_ROWS_AROUND_ADDR 2
36
37static const void *find_first_bad_addr(const void *addr, size_t size)
38{
39 u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr);
40 const void *first_bad_addr = addr;
41
42 while (!shadow_val && first_bad_addr < addr + size) {
43 first_bad_addr += KASAN_SHADOW_SCALE_SIZE;
44 shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr);
45 }
46 return first_bad_addr;
47}
48
49static void print_error_description(struct kasan_access_info *info)
50{
51 const char *bug_type = "unknown crash";
52 u8 shadow_val;
53
54 info->first_bad_addr = find_first_bad_addr(info->access_addr,
55 info->access_size);
56
57 shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr);
58
59 switch (shadow_val) {
60 case KASAN_FREE_PAGE:
61 case KASAN_KMALLOC_FREE:
62 bug_type = "use after free";
63 break;
64 case KASAN_PAGE_REDZONE:
65 case KASAN_KMALLOC_REDZONE:
66 case KASAN_GLOBAL_REDZONE:
67 case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
68 bug_type = "out of bounds access";
69 break;
70 case KASAN_STACK_LEFT:
71 case KASAN_STACK_MID:
72 case KASAN_STACK_RIGHT:
73 case KASAN_STACK_PARTIAL:
74 bug_type = "out of bounds on stack";
75 break;
76 }
77
78 pr_err("BUG: KASan: %s in %pS at addr %p\n",
79 bug_type, (void *)info->ip,
80 info->access_addr);
81 pr_err("%s of size %zu by task %s/%d\n",
82 info->is_write ? "Write" : "Read",
83 info->access_size, current->comm, task_pid_nr(current));
84}
85
86static inline bool kernel_or_module_addr(const void *addr)
87{
88 return (addr >= (void *)_stext && addr < (void *)_end)
89 || (addr >= (void *)MODULES_VADDR
90 && addr < (void *)MODULES_END);
91}
92
93static inline bool init_task_stack_addr(const void *addr)
94{
95 return addr >= (void *)&init_thread_union.stack &&
96 (addr <= (void *)&init_thread_union.stack +
97 sizeof(init_thread_union.stack));
98}
99
100static void print_address_description(struct kasan_access_info *info)
101{
102 const void *addr = info->access_addr;
103
104 if ((addr >= (void *)PAGE_OFFSET) &&
105 (addr < high_memory)) {
106 struct page *page = virt_to_head_page(addr);
107
108 if (PageSlab(page)) {
109 void *object;
110 struct kmem_cache *cache = page->slab_cache;
111 void *last_object;
112
113 object = virt_to_obj(cache, page_address(page), addr);
114 last_object = page_address(page) +
115 page->objects * cache->size;
116
117 if (unlikely(object > last_object))
118 object = last_object; /* we hit into padding */
119
120 object_err(cache, page, object,
121 "kasan: bad access detected");
122 return;
123 }
124 dump_page(page, "kasan: bad access detected");
125 }
126
127 if (kernel_or_module_addr(addr)) {
128 if (!init_task_stack_addr(addr))
129 pr_err("Address belongs to variable %pS\n", addr);
130 }
131
132 dump_stack();
133}
134
135static bool row_is_guilty(const void *row, const void *guilty)
136{
137 return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW);
138}
139
140static int shadow_pointer_offset(const void *row, const void *shadow)
141{
142 /* The length of ">ff00ff00ff00ff00: " is
143 * 3 + (BITS_PER_LONG/8)*2 chars.
144 */
145 return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 +
146 (shadow - row) / SHADOW_BYTES_PER_BLOCK + 1;
147}
148
149static void print_shadow_for_address(const void *addr)
150{
151 int i;
152 const void *shadow = kasan_mem_to_shadow(addr);
153 const void *shadow_row;
154
155 shadow_row = (void *)round_down((unsigned long)shadow,
156 SHADOW_BYTES_PER_ROW)
157 - SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW;
158
159 pr_err("Memory state around the buggy address:\n");
160
161 for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
162 const void *kaddr = kasan_shadow_to_mem(shadow_row);
163 char buffer[4 + (BITS_PER_LONG/8)*2];
164
165 snprintf(buffer, sizeof(buffer),
166 (i == 0) ? ">%p: " : " %p: ", kaddr);
167
168 kasan_disable_current();
169 print_hex_dump(KERN_ERR, buffer,
170 DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
171 shadow_row, SHADOW_BYTES_PER_ROW, 0);
172 kasan_enable_current();
173
174 if (row_is_guilty(shadow_row, shadow))
175 pr_err("%*c\n",
176 shadow_pointer_offset(shadow_row, shadow),
177 '^');
178
179 shadow_row += SHADOW_BYTES_PER_ROW;
180 }
181}
182
183static DEFINE_SPINLOCK(report_lock);
184
185void kasan_report_error(struct kasan_access_info *info)
186{
187 unsigned long flags;
188
189 spin_lock_irqsave(&report_lock, flags);
190 pr_err("================================="
191 "=================================\n");
192 print_error_description(info);
193 print_address_description(info);
194 print_shadow_for_address(info->first_bad_addr);
195 pr_err("================================="
196 "=================================\n");
197 spin_unlock_irqrestore(&report_lock, flags);
198}
199
200void kasan_report_user_access(struct kasan_access_info *info)
201{
202 unsigned long flags;
203
204 spin_lock_irqsave(&report_lock, flags);
205 pr_err("================================="
206 "=================================\n");
207 pr_err("BUG: KASan: user-memory-access on address %p\n",
208 info->access_addr);
209 pr_err("%s of size %zu by task %s/%d\n",
210 info->is_write ? "Write" : "Read",
211 info->access_size, current->comm, task_pid_nr(current));
212 dump_stack();
213 pr_err("================================="
214 "=================================\n");
215 spin_unlock_irqrestore(&report_lock, flags);
216}
217
218void kasan_report(unsigned long addr, size_t size,
219 bool is_write, unsigned long ip)
220{
221 struct kasan_access_info info;
222
223 if (likely(!kasan_enabled()))
224 return;
225
226 info.access_addr = (void *)addr;
227 info.access_size = size;
228 info.is_write = is_write;
229 info.ip = ip;
230 kasan_report_error(&info);
231}
232
233
234#define DEFINE_ASAN_REPORT_LOAD(size) \
235void __asan_report_load##size##_noabort(unsigned long addr) \
236{ \
237 kasan_report(addr, size, false, _RET_IP_); \
238} \
239EXPORT_SYMBOL(__asan_report_load##size##_noabort)
240
241#define DEFINE_ASAN_REPORT_STORE(size) \
242void __asan_report_store##size##_noabort(unsigned long addr) \
243{ \
244 kasan_report(addr, size, true, _RET_IP_); \
245} \
246EXPORT_SYMBOL(__asan_report_store##size##_noabort)
247
248DEFINE_ASAN_REPORT_LOAD(1);
249DEFINE_ASAN_REPORT_LOAD(2);
250DEFINE_ASAN_REPORT_LOAD(4);
251DEFINE_ASAN_REPORT_LOAD(8);
252DEFINE_ASAN_REPORT_LOAD(16);
253DEFINE_ASAN_REPORT_STORE(1);
254DEFINE_ASAN_REPORT_STORE(2);
255DEFINE_ASAN_REPORT_STORE(4);
256DEFINE_ASAN_REPORT_STORE(8);
257DEFINE_ASAN_REPORT_STORE(16);
258
259void __asan_report_load_n_noabort(unsigned long addr, size_t size)
260{
261 kasan_report(addr, size, false, _RET_IP_);
262}
263EXPORT_SYMBOL(__asan_report_load_n_noabort);
264
265void __asan_report_store_n_noabort(unsigned long addr, size_t size)
266{
267 kasan_report(addr, size, true, _RET_IP_);
268}
269EXPORT_SYMBOL(__asan_report_store_n_noabort);
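
A hedged example, in the spirit of the KASan self-tests rather than anything in report.c itself, of an access that the reporting code above would flag as an "out of bounds access" (needs <linux/slab.h>; the function name is made up):

static noinline void kmalloc_oob_example(void)
{
        size_t size = 123;
        char *ptr = kmalloc(size, GFP_KERNEL);

        if (!ptr)
                return;

        ptr[size] = 'x';        /* one byte past the object; KASan reports it */
        kfree(ptr);
}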
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 3cda50c1e394..5405aff5a590 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -98,6 +98,7 @@
98#include <asm/processor.h> 98#include <asm/processor.h>
99#include <linux/atomic.h> 99#include <linux/atomic.h>
100 100
101#include <linux/kasan.h>
101#include <linux/kmemcheck.h> 102#include <linux/kmemcheck.h>
102#include <linux/kmemleak.h> 103#include <linux/kmemleak.h>
103#include <linux/memory_hotplug.h> 104#include <linux/memory_hotplug.h>
@@ -1113,7 +1114,10 @@ static bool update_checksum(struct kmemleak_object *object)
1113 if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) 1114 if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
1114 return false; 1115 return false;
1115 1116
1117 kasan_disable_current();
1116 object->checksum = crc32(0, (void *)object->pointer, object->size); 1118 object->checksum = crc32(0, (void *)object->pointer, object->size);
1119 kasan_enable_current();
1120
1117 return object->checksum != old_csum; 1121 return object->checksum != old_csum;
1118} 1122}
1119 1123
@@ -1164,7 +1168,9 @@ static void scan_block(void *_start, void *_end,
1164 BYTES_PER_POINTER)) 1168 BYTES_PER_POINTER))
1165 continue; 1169 continue;
1166 1170
1171 kasan_disable_current();
1167 pointer = *ptr; 1172 pointer = *ptr;
1173 kasan_enable_current();
1168 1174
1169 object = find_and_get_object(pointer, 1); 1175 object = find_and_get_object(pointer, 1);
1170 if (!object) 1176 if (!object)
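The kmemleak hunks above bracket reads of scanned memory with kasan_disable_current()/kasan_enable_current(), since a scan legitimately reads words that KASAN may consider poisoned. A minimal sketch of that bracketing pattern; the helper name read_word_unchecked is hypothetical and not part of the patch, only the two kasan calls are:

#include <linux/kasan.h>

/*
 * Hypothetical helper showing the pattern used in the hunks above:
 * suppress KASAN checks for the current task around a read that may
 * touch poisoned memory, then re-arm checking immediately after.
 */
static unsigned long read_word_unchecked(const unsigned long *ptr)
{
        unsigned long val;

        kasan_disable_current();
        val = *ptr;
        kasan_enable_current();

        return val;
}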
diff --git a/mm/ksm.c b/mm/ksm.c
index d247efab5073..4162dce2eb44 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -376,7 +376,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
376 else 376 else
377 ret = VM_FAULT_WRITE; 377 ret = VM_FAULT_WRITE;
378 put_page(page); 378 put_page(page);
379 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); 379 } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
380 /* 380 /*
381 * We must loop because handle_mm_fault() may back out if there's 381 * We must loop because handle_mm_fault() may back out if there's
382 * any difficulty e.g. if pte accessed bit gets updated concurrently. 382 * any difficulty e.g. if pte accessed bit gets updated concurrently.
@@ -1748,7 +1748,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1748 */ 1748 */
1749 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | 1749 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1750 VM_PFNMAP | VM_IO | VM_DONTEXPAND | 1750 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1751 VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) 1751 VM_HUGETLB | VM_MIXEDMAP))
1752 return 0; /* just ignore the advice */ 1752 return 0; /* just ignore the advice */
1753 1753
1754#ifdef VM_SAO 1754#ifdef VM_SAO
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f1a0db194173..909eca2c820e 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -9,18 +9,100 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/list_lru.h> 10#include <linux/list_lru.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/mutex.h>
13#include <linux/memcontrol.h>
14
15#ifdef CONFIG_MEMCG_KMEM
16static LIST_HEAD(list_lrus);
17static DEFINE_MUTEX(list_lrus_mutex);
18
19static void list_lru_register(struct list_lru *lru)
20{
21 mutex_lock(&list_lrus_mutex);
22 list_add(&lru->list, &list_lrus);
23 mutex_unlock(&list_lrus_mutex);
24}
25
26static void list_lru_unregister(struct list_lru *lru)
27{
28 mutex_lock(&list_lrus_mutex);
29 list_del(&lru->list);
30 mutex_unlock(&list_lrus_mutex);
31}
32#else
33static void list_lru_register(struct list_lru *lru)
34{
35}
36
37static void list_lru_unregister(struct list_lru *lru)
38{
39}
40#endif /* CONFIG_MEMCG_KMEM */
41
42#ifdef CONFIG_MEMCG_KMEM
43static inline bool list_lru_memcg_aware(struct list_lru *lru)
44{
45 return !!lru->node[0].memcg_lrus;
46}
47
48static inline struct list_lru_one *
49list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
50{
51 /*
52 * The lock protects the array of per cgroup lists from relocation
53 * (see memcg_update_list_lru_node).
54 */
55 lockdep_assert_held(&nlru->lock);
56 if (nlru->memcg_lrus && idx >= 0)
57 return nlru->memcg_lrus->lru[idx];
58
59 return &nlru->lru;
60}
61
62static inline struct list_lru_one *
63list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
64{
65 struct mem_cgroup *memcg;
66
67 if (!nlru->memcg_lrus)
68 return &nlru->lru;
69
70 memcg = mem_cgroup_from_kmem(ptr);
71 if (!memcg)
72 return &nlru->lru;
73
74 return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
75}
76#else
77static inline bool list_lru_memcg_aware(struct list_lru *lru)
78{
79 return false;
80}
81
82static inline struct list_lru_one *
83list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
84{
85 return &nlru->lru;
86}
87
88static inline struct list_lru_one *
89list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
90{
91 return &nlru->lru;
92}
93#endif /* CONFIG_MEMCG_KMEM */
12 94
13bool list_lru_add(struct list_lru *lru, struct list_head *item) 95bool list_lru_add(struct list_lru *lru, struct list_head *item)
14{ 96{
15 int nid = page_to_nid(virt_to_page(item)); 97 int nid = page_to_nid(virt_to_page(item));
16 struct list_lru_node *nlru = &lru->node[nid]; 98 struct list_lru_node *nlru = &lru->node[nid];
99 struct list_lru_one *l;
17 100
18 spin_lock(&nlru->lock); 101 spin_lock(&nlru->lock);
19 WARN_ON_ONCE(nlru->nr_items < 0); 102 l = list_lru_from_kmem(nlru, item);
20 if (list_empty(item)) { 103 if (list_empty(item)) {
21 list_add_tail(item, &nlru->list); 104 list_add_tail(item, &l->list);
22 if (nlru->nr_items++ == 0) 105 l->nr_items++;
23 node_set(nid, lru->active_nodes);
24 spin_unlock(&nlru->lock); 106 spin_unlock(&nlru->lock);
25 return true; 107 return true;
26 } 108 }
@@ -33,13 +115,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
33{ 115{
34 int nid = page_to_nid(virt_to_page(item)); 116 int nid = page_to_nid(virt_to_page(item));
35 struct list_lru_node *nlru = &lru->node[nid]; 117 struct list_lru_node *nlru = &lru->node[nid];
118 struct list_lru_one *l;
36 119
37 spin_lock(&nlru->lock); 120 spin_lock(&nlru->lock);
121 l = list_lru_from_kmem(nlru, item);
38 if (!list_empty(item)) { 122 if (!list_empty(item)) {
39 list_del_init(item); 123 list_del_init(item);
40 if (--nlru->nr_items == 0) 124 l->nr_items--;
41 node_clear(nid, lru->active_nodes);
42 WARN_ON_ONCE(nlru->nr_items < 0);
43 spin_unlock(&nlru->lock); 125 spin_unlock(&nlru->lock);
44 return true; 126 return true;
45 } 127 }
@@ -48,33 +130,72 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
48} 130}
49EXPORT_SYMBOL_GPL(list_lru_del); 131EXPORT_SYMBOL_GPL(list_lru_del);
50 132
51unsigned long 133void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
52list_lru_count_node(struct list_lru *lru, int nid) 134{
135 list_del_init(item);
136 list->nr_items--;
137}
138EXPORT_SYMBOL_GPL(list_lru_isolate);
139
140void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
141 struct list_head *head)
142{
143 list_move(item, head);
144 list->nr_items--;
145}
146EXPORT_SYMBOL_GPL(list_lru_isolate_move);
147
148static unsigned long __list_lru_count_one(struct list_lru *lru,
149 int nid, int memcg_idx)
53{ 150{
54 unsigned long count = 0;
55 struct list_lru_node *nlru = &lru->node[nid]; 151 struct list_lru_node *nlru = &lru->node[nid];
152 struct list_lru_one *l;
153 unsigned long count;
56 154
57 spin_lock(&nlru->lock); 155 spin_lock(&nlru->lock);
58 WARN_ON_ONCE(nlru->nr_items < 0); 156 l = list_lru_from_memcg_idx(nlru, memcg_idx);
59 count += nlru->nr_items; 157 count = l->nr_items;
60 spin_unlock(&nlru->lock); 158 spin_unlock(&nlru->lock);
61 159
62 return count; 160 return count;
63} 161}
162
163unsigned long list_lru_count_one(struct list_lru *lru,
164 int nid, struct mem_cgroup *memcg)
165{
166 return __list_lru_count_one(lru, nid, memcg_cache_id(memcg));
167}
168EXPORT_SYMBOL_GPL(list_lru_count_one);
169
170unsigned long list_lru_count_node(struct list_lru *lru, int nid)
171{
172 long count = 0;
173 int memcg_idx;
174
175 count += __list_lru_count_one(lru, nid, -1);
176 if (list_lru_memcg_aware(lru)) {
177 for_each_memcg_cache_index(memcg_idx)
178 count += __list_lru_count_one(lru, nid, memcg_idx);
179 }
180 return count;
181}
64EXPORT_SYMBOL_GPL(list_lru_count_node); 182EXPORT_SYMBOL_GPL(list_lru_count_node);
65 183
66unsigned long 184static unsigned long
67list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, 185__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
68 void *cb_arg, unsigned long *nr_to_walk) 186 list_lru_walk_cb isolate, void *cb_arg,
187 unsigned long *nr_to_walk)
69{ 188{
70 189
71 struct list_lru_node *nlru = &lru->node[nid]; 190 struct list_lru_node *nlru = &lru->node[nid];
191 struct list_lru_one *l;
72 struct list_head *item, *n; 192 struct list_head *item, *n;
73 unsigned long isolated = 0; 193 unsigned long isolated = 0;
74 194
75 spin_lock(&nlru->lock); 195 spin_lock(&nlru->lock);
196 l = list_lru_from_memcg_idx(nlru, memcg_idx);
76restart: 197restart:
77 list_for_each_safe(item, n, &nlru->list) { 198 list_for_each_safe(item, n, &l->list) {
78 enum lru_status ret; 199 enum lru_status ret;
79 200
80 /* 201 /*
@@ -85,14 +206,11 @@ restart:
85 break; 206 break;
86 --*nr_to_walk; 207 --*nr_to_walk;
87 208
88 ret = isolate(item, &nlru->lock, cb_arg); 209 ret = isolate(item, l, &nlru->lock, cb_arg);
89 switch (ret) { 210 switch (ret) {
90 case LRU_REMOVED_RETRY: 211 case LRU_REMOVED_RETRY:
91 assert_spin_locked(&nlru->lock); 212 assert_spin_locked(&nlru->lock);
92 case LRU_REMOVED: 213 case LRU_REMOVED:
93 if (--nlru->nr_items == 0)
94 node_clear(nid, lru->active_nodes);
95 WARN_ON_ONCE(nlru->nr_items < 0);
96 isolated++; 214 isolated++;
97 /* 215 /*
98 * If the lru lock has been dropped, our list 216 * If the lru lock has been dropped, our list
@@ -103,7 +221,7 @@ restart:
103 goto restart; 221 goto restart;
104 break; 222 break;
105 case LRU_ROTATE: 223 case LRU_ROTATE:
106 list_move_tail(item, &nlru->list); 224 list_move_tail(item, &l->list);
107 break; 225 break;
108 case LRU_SKIP: 226 case LRU_SKIP:
109 break; 227 break;
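With the isolate callback now receiving the struct list_lru_one the item sits on (note the isolate(item, l, &nlru->lock, cb_arg) call above), callers are expected to use list_lru_isolate()/list_lru_isolate_move() rather than touching counters themselves. A hedged sketch of such a callback; struct my_object and its refcount test are hypothetical, only the list_lru calls and lru_status values come from this patch:

#include <linux/list_lru.h>

struct my_object {
        struct list_head lru_link;
        int refcount;                     /* hypothetical payload */
};

static enum lru_status my_isolate(struct list_head *item,
                                  struct list_lru_one *lru,
                                  spinlock_t *lru_lock, void *cb_arg)
{
        struct my_object *obj = container_of(item, struct my_object, lru_link);
        struct list_head *dispose = cb_arg;

        if (obj->refcount)                /* still in use: rotate to tail */
                return LRU_ROTATE;

        /* Unlinks the item and decrements lru->nr_items for us. */
        list_lru_isolate_move(lru, item, dispose);
        return LRU_REMOVED;
}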
@@ -122,31 +240,322 @@ restart:
122 spin_unlock(&nlru->lock); 240 spin_unlock(&nlru->lock);
123 return isolated; 241 return isolated;
124} 242}
243
244unsigned long
245list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
246 list_lru_walk_cb isolate, void *cb_arg,
247 unsigned long *nr_to_walk)
248{
249 return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg),
250 isolate, cb_arg, nr_to_walk);
251}
252EXPORT_SYMBOL_GPL(list_lru_walk_one);
253
254unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
255 list_lru_walk_cb isolate, void *cb_arg,
256 unsigned long *nr_to_walk)
257{
258 long isolated = 0;
259 int memcg_idx;
260
261 isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg,
262 nr_to_walk);
263 if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
264 for_each_memcg_cache_index(memcg_idx) {
265 isolated += __list_lru_walk_one(lru, nid, memcg_idx,
266 isolate, cb_arg, nr_to_walk);
267 if (*nr_to_walk <= 0)
268 break;
269 }
270 }
271 return isolated;
272}
125EXPORT_SYMBOL_GPL(list_lru_walk_node); 273EXPORT_SYMBOL_GPL(list_lru_walk_node);
126 274
127int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) 275static void init_one_lru(struct list_lru_one *l)
276{
277 INIT_LIST_HEAD(&l->list);
278 l->nr_items = 0;
279}
280
281#ifdef CONFIG_MEMCG_KMEM
282static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
283 int begin, int end)
284{
285 int i;
286
287 for (i = begin; i < end; i++)
288 kfree(memcg_lrus->lru[i]);
289}
290
291static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
292 int begin, int end)
293{
294 int i;
295
296 for (i = begin; i < end; i++) {
297 struct list_lru_one *l;
298
299 l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
300 if (!l)
301 goto fail;
302
303 init_one_lru(l);
304 memcg_lrus->lru[i] = l;
305 }
306 return 0;
307fail:
308 __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1);
309 return -ENOMEM;
310}
311
312static int memcg_init_list_lru_node(struct list_lru_node *nlru)
313{
314 int size = memcg_nr_cache_ids;
315
316 nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
317 if (!nlru->memcg_lrus)
318 return -ENOMEM;
319
320 if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
321 kfree(nlru->memcg_lrus);
322 return -ENOMEM;
323 }
324
325 return 0;
326}
327
328static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
329{
330 __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
331 kfree(nlru->memcg_lrus);
332}
333
334static int memcg_update_list_lru_node(struct list_lru_node *nlru,
335 int old_size, int new_size)
336{
337 struct list_lru_memcg *old, *new;
338
339 BUG_ON(old_size > new_size);
340
341 old = nlru->memcg_lrus;
342 new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
343 if (!new)
344 return -ENOMEM;
345
346 if (__memcg_init_list_lru_node(new, old_size, new_size)) {
347 kfree(new);
348 return -ENOMEM;
349 }
350
351 memcpy(new, old, old_size * sizeof(void *));
352
353 /*
354 * The lock guarantees that we won't race with a reader
355 * (see list_lru_from_memcg_idx).
356 *
357 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
358 * we have to use IRQ-safe primitives here to avoid deadlock.
359 */
360 spin_lock_irq(&nlru->lock);
361 nlru->memcg_lrus = new;
362 spin_unlock_irq(&nlru->lock);
363
364 kfree(old);
365 return 0;
366}
367
368static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
369 int old_size, int new_size)
370{
371 /* do not bother shrinking the array back to the old size, because we
372 * cannot handle allocation failures here */
373 __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size);
374}
375
376static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
377{
378 int i;
379
380 for (i = 0; i < nr_node_ids; i++) {
381 if (!memcg_aware)
382 lru->node[i].memcg_lrus = NULL;
383 else if (memcg_init_list_lru_node(&lru->node[i]))
384 goto fail;
385 }
386 return 0;
387fail:
388 for (i = i - 1; i >= 0; i--)
389 memcg_destroy_list_lru_node(&lru->node[i]);
390 return -ENOMEM;
391}
392
393static void memcg_destroy_list_lru(struct list_lru *lru)
394{
395 int i;
396
397 if (!list_lru_memcg_aware(lru))
398 return;
399
400 for (i = 0; i < nr_node_ids; i++)
401 memcg_destroy_list_lru_node(&lru->node[i]);
402}
403
404static int memcg_update_list_lru(struct list_lru *lru,
405 int old_size, int new_size)
406{
407 int i;
408
409 if (!list_lru_memcg_aware(lru))
410 return 0;
411
412 for (i = 0; i < nr_node_ids; i++) {
413 if (memcg_update_list_lru_node(&lru->node[i],
414 old_size, new_size))
415 goto fail;
416 }
417 return 0;
418fail:
419 for (i = i - 1; i >= 0; i--)
420 memcg_cancel_update_list_lru_node(&lru->node[i],
421 old_size, new_size);
422 return -ENOMEM;
423}
424
425static void memcg_cancel_update_list_lru(struct list_lru *lru,
426 int old_size, int new_size)
427{
428 int i;
429
430 if (!list_lru_memcg_aware(lru))
431 return;
432
433 for (i = 0; i < nr_node_ids; i++)
434 memcg_cancel_update_list_lru_node(&lru->node[i],
435 old_size, new_size);
436}
437
438int memcg_update_all_list_lrus(int new_size)
439{
440 int ret = 0;
441 struct list_lru *lru;
442 int old_size = memcg_nr_cache_ids;
443
444 mutex_lock(&list_lrus_mutex);
445 list_for_each_entry(lru, &list_lrus, list) {
446 ret = memcg_update_list_lru(lru, old_size, new_size);
447 if (ret)
448 goto fail;
449 }
450out:
451 mutex_unlock(&list_lrus_mutex);
452 return ret;
453fail:
454 list_for_each_entry_continue_reverse(lru, &list_lrus, list)
455 memcg_cancel_update_list_lru(lru, old_size, new_size);
456 goto out;
457}
458
459static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
460 int src_idx, int dst_idx)
461{
462 struct list_lru_one *src, *dst;
463
464 /*
465 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
466 * we have to use IRQ-safe primitives here to avoid deadlock.
467 */
468 spin_lock_irq(&nlru->lock);
469
470 src = list_lru_from_memcg_idx(nlru, src_idx);
471 dst = list_lru_from_memcg_idx(nlru, dst_idx);
472
473 list_splice_init(&src->list, &dst->list);
474 dst->nr_items += src->nr_items;
475 src->nr_items = 0;
476
477 spin_unlock_irq(&nlru->lock);
478}
479
480static void memcg_drain_list_lru(struct list_lru *lru,
481 int src_idx, int dst_idx)
482{
483 int i;
484
485 if (!list_lru_memcg_aware(lru))
486 return;
487
488 for (i = 0; i < nr_node_ids; i++)
489 memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
490}
491
492void memcg_drain_all_list_lrus(int src_idx, int dst_idx)
493{
494 struct list_lru *lru;
495
496 mutex_lock(&list_lrus_mutex);
497 list_for_each_entry(lru, &list_lrus, list)
498 memcg_drain_list_lru(lru, src_idx, dst_idx);
499 mutex_unlock(&list_lrus_mutex);
500}
501#else
502static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
503{
504 return 0;
505}
506
507static void memcg_destroy_list_lru(struct list_lru *lru)
508{
509}
510#endif /* CONFIG_MEMCG_KMEM */
511
512int __list_lru_init(struct list_lru *lru, bool memcg_aware,
513 struct lock_class_key *key)
128{ 514{
129 int i; 515 int i;
130 size_t size = sizeof(*lru->node) * nr_node_ids; 516 size_t size = sizeof(*lru->node) * nr_node_ids;
517 int err = -ENOMEM;
518
519 memcg_get_cache_ids();
131 520
132 lru->node = kzalloc(size, GFP_KERNEL); 521 lru->node = kzalloc(size, GFP_KERNEL);
133 if (!lru->node) 522 if (!lru->node)
134 return -ENOMEM; 523 goto out;
135 524
136 nodes_clear(lru->active_nodes);
137 for (i = 0; i < nr_node_ids; i++) { 525 for (i = 0; i < nr_node_ids; i++) {
138 spin_lock_init(&lru->node[i].lock); 526 spin_lock_init(&lru->node[i].lock);
139 if (key) 527 if (key)
140 lockdep_set_class(&lru->node[i].lock, key); 528 lockdep_set_class(&lru->node[i].lock, key);
141 INIT_LIST_HEAD(&lru->node[i].list); 529 init_one_lru(&lru->node[i].lru);
142 lru->node[i].nr_items = 0;
143 } 530 }
144 return 0; 531
532 err = memcg_init_list_lru(lru, memcg_aware);
533 if (err) {
534 kfree(lru->node);
535 goto out;
536 }
537
538 list_lru_register(lru);
539out:
540 memcg_put_cache_ids();
541 return err;
145} 542}
146EXPORT_SYMBOL_GPL(list_lru_init_key); 543EXPORT_SYMBOL_GPL(__list_lru_init);
147 544
148void list_lru_destroy(struct list_lru *lru) 545void list_lru_destroy(struct list_lru *lru)
149{ 546{
547 /* Already destroyed or not yet initialized? */
548 if (!lru->node)
549 return;
550
551 memcg_get_cache_ids();
552
553 list_lru_unregister(lru);
554
555 memcg_destroy_list_lru(lru);
150 kfree(lru->node); 556 kfree(lru->node);
557 lru->node = NULL;
558
559 memcg_put_cache_ids();
151} 560}
152EXPORT_SYMBOL_GPL(list_lru_destroy); 561EXPORT_SYMBOL_GPL(list_lru_destroy);
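Putting the file together: __list_lru_init() now takes a memcg_aware flag, registration hooks the lru into list_lrus so memcg_update_all_list_lrus() can resize it, and the count/walk entry points gained per-memcg variants. A hedged usage sketch of those entry points, reusing the my_isolate callback sketched after the walk hunk above; my_lru and the caller names are hypothetical, and header-side convenience wrappers (e.g. list_lru_init_memcg()) are not part of this hunk, so __list_lru_init() is called directly:

#include <linux/list_lru.h>
#include <linux/memcontrol.h>

static struct list_lru my_lru;

/* Sketch: set up a memcg-aware lru and push one object onto it. */
static int my_cache_setup_and_add(struct list_head *item)
{
        int err;

        /* memcg_aware=true allocates the per-memcg list_lru_one arrays. */
        err = __list_lru_init(&my_lru, true, NULL);
        if (err)
                return err;

        /* Routed to the owning memcg's list by list_lru_from_kmem(). */
        list_lru_add(&my_lru, item);
        return 0;
}

/* Sketch: shrinker-style count and walk for one node and one memcg. */
static unsigned long my_cache_scan(int nid, struct mem_cgroup *memcg,
                                   unsigned long nr_to_walk)
{
        LIST_HEAD(dispose);
        unsigned long freed;

        if (!list_lru_count_one(&my_lru, nid, memcg))
                return 0;

        freed = list_lru_walk_one(&my_lru, nid, memcg, my_isolate,
                                  &dispose, &nr_to_walk);
        /* ... dispose of the isolated objects outside the lru lock ... */
        return freed;
}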
diff --git a/mm/madvise.c b/mm/madvise.c
index a271adc93289..1077cbdc8b52 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -155,7 +155,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
155 pte = *(orig_pte + ((index - start) / PAGE_SIZE)); 155 pte = *(orig_pte + ((index - start) / PAGE_SIZE));
156 pte_unmap_unlock(orig_pte, ptl); 156 pte_unmap_unlock(orig_pte, ptl);
157 157
158 if (pte_present(pte) || pte_none(pte) || pte_file(pte)) 158 if (pte_present(pte) || pte_none(pte))
159 continue; 159 continue;
160 entry = pte_to_swp_entry(pte); 160 entry = pte_to_swp_entry(pte);
161 if (unlikely(non_swap_entry(entry))) 161 if (unlikely(non_swap_entry(entry)))
@@ -222,19 +222,22 @@ static long madvise_willneed(struct vm_area_struct *vma,
222 struct file *file = vma->vm_file; 222 struct file *file = vma->vm_file;
223 223
224#ifdef CONFIG_SWAP 224#ifdef CONFIG_SWAP
225 if (!file || mapping_cap_swap_backed(file->f_mapping)) { 225 if (!file) {
226 *prev = vma; 226 *prev = vma;
227 if (!file) 227 force_swapin_readahead(vma, start, end);
228 force_swapin_readahead(vma, start, end);
229 else
230 force_shm_swapin_readahead(vma, start, end,
231 file->f_mapping);
232 return 0; 228 return 0;
233 } 229 }
234#endif
235 230
231 if (shmem_mapping(file->f_mapping)) {
232 *prev = vma;
233 force_shm_swapin_readahead(vma, start, end,
234 file->f_mapping);
235 return 0;
236 }
237#else
236 if (!file) 238 if (!file)
237 return -EBADF; 239 return -EBADF;
240#endif
238 241
239 if (file->f_mapping->a_ops->get_xip_mem) { 242 if (file->f_mapping->a_ops->get_xip_mem) {
240 /* no bad return value, but ignore advice */ 243 /* no bad return value, but ignore advice */
@@ -278,14 +281,7 @@ static long madvise_dontneed(struct vm_area_struct *vma,
278 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) 281 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
279 return -EINVAL; 282 return -EINVAL;
280 283
281 if (unlikely(vma->vm_flags & VM_NONLINEAR)) { 284 zap_page_range(vma, start, end - start, NULL);
282 struct zap_details details = {
283 .nonlinear_vma = vma,
284 .last_index = ULONG_MAX,
285 };
286 zap_page_range(vma, start, end - start, &details);
287 } else
288 zap_page_range(vma, start, end - start, NULL);
289 return 0; 285 return 0;
290} 286}
291 287
@@ -303,7 +299,7 @@ static long madvise_remove(struct vm_area_struct *vma,
303 299
304 *prev = NULL; /* tell sys_madvise we drop mmap_sem */ 300 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
305 301
306 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) 302 if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
307 return -EINVAL; 303 return -EINVAL;
308 304
309 f = vma->vm_file; 305 f = vma->vm_file;
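The madvise hunks only change how the advice is serviced inside the kernel (shmem mappings are now detected via shmem_mapping(), and MADV_DONTNEED always becomes a plain zap_page_range()); the userspace interface is unchanged. For context, a minimal userspace program that exercises both code paths:

/* Illustrative userspace use of the advice handled by the hunks above. */
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
        size_t len = 16 * 4096;
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED)
                return 1;

        memset(buf, 0xaa, len);

        /* madvise_willneed(): ask for readahead/swap-in of the range. */
        madvise(buf, len, MADV_WILLNEED);

        /* madvise_dontneed(): serviced by a plain zap_page_range(). */
        madvise(buf, len, MADV_DONTNEED);

        munmap(buf, len);
        return 0;
}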
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ef91e856c7e4..d18d3a6e7337 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -72,22 +72,13 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
72#define MEM_CGROUP_RECLAIM_RETRIES 5 72#define MEM_CGROUP_RECLAIM_RETRIES 5
73static struct mem_cgroup *root_mem_cgroup __read_mostly; 73static struct mem_cgroup *root_mem_cgroup __read_mostly;
74 74
75/* Whether the swap controller is active */
75#ifdef CONFIG_MEMCG_SWAP 76#ifdef CONFIG_MEMCG_SWAP
76/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
77int do_swap_account __read_mostly; 77int do_swap_account __read_mostly;
78
79/* for remember boot option*/
80#ifdef CONFIG_MEMCG_SWAP_ENABLED
81static int really_do_swap_account __initdata = 1;
82#else
83static int really_do_swap_account __initdata;
84#endif
85
86#else 78#else
87#define do_swap_account 0 79#define do_swap_account 0
88#endif 80#endif
89 81
90
91static const char * const mem_cgroup_stat_names[] = { 82static const char * const mem_cgroup_stat_names[] = {
92 "cache", 83 "cache",
93 "rss", 84 "rss",
@@ -97,14 +88,6 @@ static const char * const mem_cgroup_stat_names[] = {
97 "swap", 88 "swap",
98}; 89};
99 90
100enum mem_cgroup_events_index {
101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
105 MEM_CGROUP_EVENTS_NSTATS,
106};
107
108static const char * const mem_cgroup_events_names[] = { 91static const char * const mem_cgroup_events_names[] = {
109 "pgpgin", 92 "pgpgin",
110 "pgpgout", 93 "pgpgout",
@@ -138,7 +121,7 @@ enum mem_cgroup_events_target {
138 121
139struct mem_cgroup_stat_cpu { 122struct mem_cgroup_stat_cpu {
140 long count[MEM_CGROUP_STAT_NSTATS]; 123 long count[MEM_CGROUP_STAT_NSTATS];
141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 124 unsigned long events[MEMCG_NR_EVENTS];
142 unsigned long nr_page_events; 125 unsigned long nr_page_events;
143 unsigned long targets[MEM_CGROUP_NTARGETS]; 126 unsigned long targets[MEM_CGROUP_NTARGETS];
144}; 127};
@@ -284,6 +267,10 @@ struct mem_cgroup {
284 struct page_counter memsw; 267 struct page_counter memsw;
285 struct page_counter kmem; 268 struct page_counter kmem;
286 269
270 /* Normal memory consumption range */
271 unsigned long low;
272 unsigned long high;
273
287 unsigned long soft_limit; 274 unsigned long soft_limit;
288 275
289 /* vmpressure notifications */ 276 /* vmpressure notifications */
@@ -325,9 +312,11 @@ struct mem_cgroup {
325 /* 312 /*
326 * set > 0 if pages under this cgroup are moving to other cgroup. 313 * set > 0 if pages under this cgroup are moving to other cgroup.
327 */ 314 */
328 atomic_t moving_account; 315 atomic_t moving_account;
329 /* taken only while moving_account > 0 */ 316 /* taken only while moving_account > 0 */
330 spinlock_t move_lock; 317 spinlock_t move_lock;
318 struct task_struct *move_lock_task;
319 unsigned long move_lock_flags;
331 /* 320 /*
332 * percpu counter. 321 * percpu counter.
333 */ 322 */
@@ -343,11 +332,10 @@ struct mem_cgroup {
343 struct cg_proto tcp_mem; 332 struct cg_proto tcp_mem;
344#endif 333#endif
345#if defined(CONFIG_MEMCG_KMEM) 334#if defined(CONFIG_MEMCG_KMEM)
346 /* analogous to slab_common's slab_caches list, but per-memcg; 335 /* Index in the kmem_cache->memcg_params.memcg_caches array */
347 * protected by memcg_slab_mutex */
348 struct list_head memcg_slab_caches;
349 /* Index in the kmem_cache->memcg_params->memcg_caches array */
350 int kmemcg_id; 336 int kmemcg_id;
337 bool kmem_acct_activated;
338 bool kmem_acct_active;
351#endif 339#endif
352 340
353 int last_scanned_node; 341 int last_scanned_node;
@@ -366,29 +354,26 @@ struct mem_cgroup {
366}; 354};
367 355
368#ifdef CONFIG_MEMCG_KMEM 356#ifdef CONFIG_MEMCG_KMEM
369static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 357bool memcg_kmem_is_active(struct mem_cgroup *memcg)
370{ 358{
371 return memcg->kmemcg_id >= 0; 359 return memcg->kmem_acct_active;
372} 360}
373#endif 361#endif
374 362
375/* Stuffs for move charges at task migration. */ 363/* Stuffs for move charges at task migration. */
376/* 364/*
377 * Types of charges to be moved. "move_charge_at_immitgrate" and 365 * Types of charges to be moved.
378 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
379 */ 366 */
380enum move_type { 367#define MOVE_ANON 0x1U
381 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 368#define MOVE_FILE 0x2U
382 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 369#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
383 NR_MOVE_TYPE,
384};
385 370
386/* "mc" and its members are protected by cgroup_mutex */ 371/* "mc" and its members are protected by cgroup_mutex */
387static struct move_charge_struct { 372static struct move_charge_struct {
388 spinlock_t lock; /* for from, to */ 373 spinlock_t lock; /* for from, to */
389 struct mem_cgroup *from; 374 struct mem_cgroup *from;
390 struct mem_cgroup *to; 375 struct mem_cgroup *to;
391 unsigned long immigrate_flags; 376 unsigned long flags;
392 unsigned long precharge; 377 unsigned long precharge;
393 unsigned long moved_charge; 378 unsigned long moved_charge;
394 unsigned long moved_swap; 379 unsigned long moved_swap;
@@ -399,16 +384,6 @@ static struct move_charge_struct {
399 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 384 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
400}; 385};
401 386
402static bool move_anon(void)
403{
404 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
405}
406
407static bool move_file(void)
408{
409 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
410}
411
412/* 387/*
413 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 388 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
414 * limit reclaim to prevent infinite loops, if they ever occur. 389 * limit reclaim to prevent infinite loops, if they ever occur.
@@ -544,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
544} 519}
545EXPORT_SYMBOL(tcp_proto_cgroup); 520EXPORT_SYMBOL(tcp_proto_cgroup);
546 521
547static void disarm_sock_keys(struct mem_cgroup *memcg)
548{
549 if (!memcg_proto_activated(&memcg->tcp_mem))
550 return;
551 static_key_slow_dec(&memcg_socket_limit_enabled);
552}
553#else
554static void disarm_sock_keys(struct mem_cgroup *memcg)
555{
556}
557#endif 522#endif
558 523
559#ifdef CONFIG_MEMCG_KMEM 524#ifdef CONFIG_MEMCG_KMEM
560/* 525/*
561 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 526 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
562 * The main reason for not using cgroup id for this: 527 * The main reason for not using cgroup id for this:
563 * this works better in sparse environments, where we have a lot of memcgs, 528 * this works better in sparse environments, where we have a lot of memcgs,
564 * but only a few kmem-limited. Or also, if we have, for instance, 200 529 * but only a few kmem-limited. Or also, if we have, for instance, 200
565 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 530 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
566 * 200 entry array for that. 531 * 200 entry array for that.
567 * 532 *
568 * The current size of the caches array is stored in 533 * The current size of the caches array is stored in memcg_nr_cache_ids. It
569 * memcg_limited_groups_array_size. It will double each time we have to 534 * will double each time we have to increase it.
570 * increase it.
571 */ 535 */
572static DEFINE_IDA(kmem_limited_groups); 536static DEFINE_IDA(memcg_cache_ida);
573int memcg_limited_groups_array_size; 537int memcg_nr_cache_ids;
538
539/* Protects memcg_nr_cache_ids */
540static DECLARE_RWSEM(memcg_cache_ids_sem);
541
542void memcg_get_cache_ids(void)
543{
544 down_read(&memcg_cache_ids_sem);
545}
546
547void memcg_put_cache_ids(void)
548{
549 up_read(&memcg_cache_ids_sem);
550}
574 551
575/* 552/*
576 * MIN_SIZE is different than 1, because we would like to avoid going through 553 * MIN_SIZE is different than 1, because we would like to avoid going through
@@ -596,32 +573,8 @@ int memcg_limited_groups_array_size;
596struct static_key memcg_kmem_enabled_key; 573struct static_key memcg_kmem_enabled_key;
597EXPORT_SYMBOL(memcg_kmem_enabled_key); 574EXPORT_SYMBOL(memcg_kmem_enabled_key);
598 575
599static void memcg_free_cache_id(int id);
600
601static void disarm_kmem_keys(struct mem_cgroup *memcg)
602{
603 if (memcg_kmem_is_active(memcg)) {
604 static_key_slow_dec(&memcg_kmem_enabled_key);
605 memcg_free_cache_id(memcg->kmemcg_id);
606 }
607 /*
608 * This check can't live in kmem destruction function,
609 * since the charges will outlive the cgroup
610 */
611 WARN_ON(page_counter_read(&memcg->kmem));
612}
613#else
614static void disarm_kmem_keys(struct mem_cgroup *memcg)
615{
616}
617#endif /* CONFIG_MEMCG_KMEM */ 576#endif /* CONFIG_MEMCG_KMEM */
618 577
619static void disarm_static_keys(struct mem_cgroup *memcg)
620{
621 disarm_sock_keys(memcg);
622 disarm_kmem_keys(memcg);
623}
624
625static struct mem_cgroup_per_zone * 578static struct mem_cgroup_per_zone *
626mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 579mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
627{ 580{
@@ -1368,6 +1321,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1368 return inactive * inactive_ratio < active; 1321 return inactive * inactive_ratio < active;
1369} 1322}
1370 1323
1324bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
1325{
1326 struct mem_cgroup_per_zone *mz;
1327 struct mem_cgroup *memcg;
1328
1329 if (mem_cgroup_disabled())
1330 return true;
1331
1332 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1333 memcg = mz->memcg;
1334
1335 return !!(memcg->css.flags & CSS_ONLINE);
1336}
1337
1371#define mem_cgroup_from_counter(counter, member) \ 1338#define mem_cgroup_from_counter(counter, member) \
1372 container_of(counter, struct mem_cgroup, member) 1339 container_of(counter, struct mem_cgroup, member)
1373 1340
@@ -1477,9 +1444,9 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1477 1444
1478 pr_info("Task in "); 1445 pr_info("Task in ");
1479 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1446 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1480 pr_info(" killed as a result of limit of "); 1447 pr_cont(" killed as a result of limit of ");
1481 pr_cont_cgroup_path(memcg->css.cgroup); 1448 pr_cont_cgroup_path(memcg->css.cgroup);
1482 pr_info("\n"); 1449 pr_cont("\n");
1483 1450
1484 rcu_read_unlock(); 1451 rcu_read_unlock();
1485 1452
@@ -1560,7 +1527,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1560 * quickly exit and free its memory. 1527 * quickly exit and free its memory.
1561 */ 1528 */
1562 if (fatal_signal_pending(current) || task_will_free_mem(current)) { 1529 if (fatal_signal_pending(current) || task_will_free_mem(current)) {
1563 set_thread_flag(TIF_MEMDIE); 1530 mark_tsk_oom_victim(current);
1564 return; 1531 return;
1565 } 1532 }
1566 1533
@@ -1934,7 +1901,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
1934 if (!memcg) 1901 if (!memcg)
1935 return false; 1902 return false;
1936 1903
1937 if (!handle) 1904 if (!handle || oom_killer_disabled)
1938 goto cleanup; 1905 goto cleanup;
1939 1906
1940 owait.memcg = memcg; 1907 owait.memcg = memcg;
@@ -1980,34 +1947,33 @@ cleanup:
1980/** 1947/**
1981 * mem_cgroup_begin_page_stat - begin a page state statistics transaction 1948 * mem_cgroup_begin_page_stat - begin a page state statistics transaction
1982 * @page: page that is going to change accounted state 1949 * @page: page that is going to change accounted state
1983 * @locked: &memcg->move_lock slowpath was taken
1984 * @flags: IRQ-state flags for &memcg->move_lock
1985 * 1950 *
1986 * This function must mark the beginning of an accounted page state 1951 * This function must mark the beginning of an accounted page state
1987 * change to prevent double accounting when the page is concurrently 1952 * change to prevent double accounting when the page is concurrently
1988 * being moved to another memcg: 1953 * being moved to another memcg:
1989 * 1954 *
1990 * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); 1955 * memcg = mem_cgroup_begin_page_stat(page);
1991 * if (TestClearPageState(page)) 1956 * if (TestClearPageState(page))
1992 * mem_cgroup_update_page_stat(memcg, state, -1); 1957 * mem_cgroup_update_page_stat(memcg, state, -1);
1993 * mem_cgroup_end_page_stat(memcg, locked, flags); 1958 * mem_cgroup_end_page_stat(memcg);
1994 *
1995 * The RCU lock is held throughout the transaction. The fast path can
1996 * get away without acquiring the memcg->move_lock (@locked is false)
1997 * because page moving starts with an RCU grace period.
1998 *
1999 * The RCU lock also protects the memcg from being freed when the page
2000 * state that is going to change is the only thing preventing the page
2001 * from being uncharged. E.g. end-writeback clearing PageWriteback(),
2002 * which allows migration to go ahead and uncharge the page before the
2003 * account transaction might be complete.
2004 */ 1959 */
2005struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, 1960struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
2006 bool *locked,
2007 unsigned long *flags)
2008{ 1961{
2009 struct mem_cgroup *memcg; 1962 struct mem_cgroup *memcg;
1963 unsigned long flags;
2010 1964
1965 /*
1966 * The RCU lock is held throughout the transaction. The fast
1967 * path can get away without acquiring the memcg->move_lock
1968 * because page moving starts with an RCU grace period.
1969 *
1970 * The RCU lock also protects the memcg from being freed when
1971 * the page state that is going to change is the only thing
1972 * preventing the page from being uncharged.
1973 * E.g. end-writeback clearing PageWriteback(), which allows
1974 * migration to go ahead and uncharge the page before the
1975 * account transaction might be complete.
1976 */
2011 rcu_read_lock(); 1977 rcu_read_lock();
2012 1978
2013 if (mem_cgroup_disabled()) 1979 if (mem_cgroup_disabled())
@@ -2017,16 +1983,22 @@ again:
2017 if (unlikely(!memcg)) 1983 if (unlikely(!memcg))
2018 return NULL; 1984 return NULL;
2019 1985
2020 *locked = false;
2021 if (atomic_read(&memcg->moving_account) <= 0) 1986 if (atomic_read(&memcg->moving_account) <= 0)
2022 return memcg; 1987 return memcg;
2023 1988
2024 spin_lock_irqsave(&memcg->move_lock, *flags); 1989 spin_lock_irqsave(&memcg->move_lock, flags);
2025 if (memcg != page->mem_cgroup) { 1990 if (memcg != page->mem_cgroup) {
2026 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1991 spin_unlock_irqrestore(&memcg->move_lock, flags);
2027 goto again; 1992 goto again;
2028 } 1993 }
2029 *locked = true; 1994
1995 /*
1996 * When charge migration first begins, we can have locked and
1997 * unlocked page stat updates happening concurrently. Track
1998 * the task who has the lock for mem_cgroup_end_page_stat().
1999 */
2000 memcg->move_lock_task = current;
2001 memcg->move_lock_flags = flags;
2030 2002
2031 return memcg; 2003 return memcg;
2032} 2004}
@@ -2034,14 +2006,17 @@ again:
2034/** 2006/**
2035 * mem_cgroup_end_page_stat - finish a page state statistics transaction 2007 * mem_cgroup_end_page_stat - finish a page state statistics transaction
2036 * @memcg: the memcg that was accounted against 2008 * @memcg: the memcg that was accounted against
2037 * @locked: value received from mem_cgroup_begin_page_stat()
2038 * @flags: value received from mem_cgroup_begin_page_stat()
2039 */ 2009 */
2040void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, 2010void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
2041 unsigned long *flags)
2042{ 2011{
2043 if (memcg && *locked) 2012 if (memcg && memcg->move_lock_task == current) {
2044 spin_unlock_irqrestore(&memcg->move_lock, *flags); 2013 unsigned long flags = memcg->move_lock_flags;
2014
2015 memcg->move_lock_task = NULL;
2016 memcg->move_lock_flags = 0;
2017
2018 spin_unlock_irqrestore(&memcg->move_lock, flags);
2019 }
2045 2020
2046 rcu_read_unlock(); 2021 rcu_read_unlock();
2047} 2022}
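With move_lock ownership now recorded in the memcg itself (move_lock_task/move_lock_flags), callers no longer thread locked/flags through the transaction. A sketch of the resulting caller-side pattern, following the updated kerneldoc above; the writeback flag and stat index are one concrete instance of its generic PageState example, and the function name is hypothetical:

#include <linux/memcontrol.h>
#include <linux/page-flags.h>

static void my_account_end_writeback(struct page *page)
{
        struct mem_cgroup *memcg;

        memcg = mem_cgroup_begin_page_stat(page);   /* rcu, maybe move_lock */
        if (TestClearPageWriteback(page))
                mem_cgroup_update_page_stat(memcg,
                                            MEM_CGROUP_STAT_WRITEBACK, -1);
        mem_cgroup_end_page_stat(memcg);            /* unlocks if this task took it */
}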
@@ -2134,17 +2109,6 @@ static void drain_local_stock(struct work_struct *dummy)
2134 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2109 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2135} 2110}
2136 2111
2137static void __init memcg_stock_init(void)
2138{
2139 int cpu;
2140
2141 for_each_possible_cpu(cpu) {
2142 struct memcg_stock_pcp *stock =
2143 &per_cpu(memcg_stock, cpu);
2144 INIT_WORK(&stock->work, drain_local_stock);
2145 }
2146}
2147
2148/* 2112/*
2149 * Cache charges(val) to local per_cpu area. 2113 * Cache charges(val) to local per_cpu area.
2150 * This will be consumed by consume_stock() function, later. 2114 * This will be consumed by consume_stock() function, later.
@@ -2294,6 +2258,8 @@ retry:
2294 if (!(gfp_mask & __GFP_WAIT)) 2258 if (!(gfp_mask & __GFP_WAIT))
2295 goto nomem; 2259 goto nomem;
2296 2260
2261 mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
2262
2297 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2263 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2298 gfp_mask, may_swap); 2264 gfp_mask, may_swap);
2299 2265
@@ -2335,6 +2301,8 @@ retry:
2335 if (fatal_signal_pending(current)) 2301 if (fatal_signal_pending(current))
2336 goto bypass; 2302 goto bypass;
2337 2303
2304 mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
2305
2338 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); 2306 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
2339nomem: 2307nomem:
2340 if (!(gfp_mask & __GFP_NOFAIL)) 2308 if (!(gfp_mask & __GFP_NOFAIL))
@@ -2346,6 +2314,16 @@ done_restock:
2346 css_get_many(&memcg->css, batch); 2314 css_get_many(&memcg->css, batch);
2347 if (batch > nr_pages) 2315 if (batch > nr_pages)
2348 refill_stock(memcg, batch - nr_pages); 2316 refill_stock(memcg, batch - nr_pages);
2317 /*
2318 * If the hierarchy is above the normal consumption range,
2319 * make the charging task trim their excess contribution.
2320 */
2321 do {
2322 if (page_counter_read(&memcg->memory) <= memcg->high)
2323 continue;
2324 mem_cgroup_events(memcg, MEMCG_HIGH, 1);
2325 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2326 } while ((memcg = parent_mem_cgroup(memcg)));
2349done: 2327done:
2350 return ret; 2328 return ret;
2351} 2329}
@@ -2476,27 +2454,8 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2476} 2454}
2477 2455
2478#ifdef CONFIG_MEMCG_KMEM 2456#ifdef CONFIG_MEMCG_KMEM
2479/* 2457int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2480 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or 2458 unsigned long nr_pages)
2481 * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
2482 */
2483static DEFINE_MUTEX(memcg_slab_mutex);
2484
2485/*
2486 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2487 * in the memcg_cache_params struct.
2488 */
2489static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2490{
2491 struct kmem_cache *cachep;
2492
2493 VM_BUG_ON(p->is_root_cache);
2494 cachep = p->root_cache;
2495 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
2496}
2497
2498static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2499 unsigned long nr_pages)
2500{ 2459{
2501 struct page_counter *counter; 2460 struct page_counter *counter;
2502 int ret = 0; 2461 int ret = 0;
@@ -2533,8 +2492,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2533 return ret; 2492 return ret;
2534} 2493}
2535 2494
2536static void memcg_uncharge_kmem(struct mem_cgroup *memcg, 2495void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
2537 unsigned long nr_pages)
2538{ 2496{
2539 page_counter_uncharge(&memcg->memory, nr_pages); 2497 page_counter_uncharge(&memcg->memory, nr_pages);
2540 if (do_swap_account) 2498 if (do_swap_account)
@@ -2560,18 +2518,19 @@ static int memcg_alloc_cache_id(void)
2560 int id, size; 2518 int id, size;
2561 int err; 2519 int err;
2562 2520
2563 id = ida_simple_get(&kmem_limited_groups, 2521 id = ida_simple_get(&memcg_cache_ida,
2564 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2522 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2565 if (id < 0) 2523 if (id < 0)
2566 return id; 2524 return id;
2567 2525
2568 if (id < memcg_limited_groups_array_size) 2526 if (id < memcg_nr_cache_ids)
2569 return id; 2527 return id;
2570 2528
2571 /* 2529 /*
2572 * There's no space for the new id in memcg_caches arrays, 2530 * There's no space for the new id in memcg_caches arrays,
2573 * so we have to grow them. 2531 * so we have to grow them.
2574 */ 2532 */
2533 down_write(&memcg_cache_ids_sem);
2575 2534
2576 size = 2 * (id + 1); 2535 size = 2 * (id + 1);
2577 if (size < MEMCG_CACHES_MIN_SIZE) 2536 if (size < MEMCG_CACHES_MIN_SIZE)
@@ -2579,12 +2538,16 @@ static int memcg_alloc_cache_id(void)
2579 else if (size > MEMCG_CACHES_MAX_SIZE) 2538 else if (size > MEMCG_CACHES_MAX_SIZE)
2580 size = MEMCG_CACHES_MAX_SIZE; 2539 size = MEMCG_CACHES_MAX_SIZE;
2581 2540
2582 mutex_lock(&memcg_slab_mutex);
2583 err = memcg_update_all_caches(size); 2541 err = memcg_update_all_caches(size);
2584 mutex_unlock(&memcg_slab_mutex); 2542 if (!err)
2543 err = memcg_update_all_list_lrus(size);
2544 if (!err)
2545 memcg_nr_cache_ids = size;
2546
2547 up_write(&memcg_cache_ids_sem);
2585 2548
2586 if (err) { 2549 if (err) {
2587 ida_simple_remove(&kmem_limited_groups, id); 2550 ida_simple_remove(&memcg_cache_ida, id);
2588 return err; 2551 return err;
2589 } 2552 }
2590 return id; 2553 return id;
@@ -2592,136 +2555,23 @@ static int memcg_alloc_cache_id(void)
2592 2555
2593static void memcg_free_cache_id(int id) 2556static void memcg_free_cache_id(int id)
2594{ 2557{
2595 ida_simple_remove(&kmem_limited_groups, id); 2558 ida_simple_remove(&memcg_cache_ida, id);
2596}
2597
2598/*
2599 * We should update the current array size iff all caches updates succeed. This
2600 * can only be done from the slab side. The slab mutex needs to be held when
2601 * calling this.
2602 */
2603void memcg_update_array_size(int num)
2604{
2605 memcg_limited_groups_array_size = num;
2606} 2559}
2607 2560
2608static void memcg_register_cache(struct mem_cgroup *memcg, 2561struct memcg_kmem_cache_create_work {
2609 struct kmem_cache *root_cache)
2610{
2611 static char memcg_name_buf[NAME_MAX + 1]; /* protected by
2612 memcg_slab_mutex */
2613 struct kmem_cache *cachep;
2614 int id;
2615
2616 lockdep_assert_held(&memcg_slab_mutex);
2617
2618 id = memcg_cache_id(memcg);
2619
2620 /*
2621 * Since per-memcg caches are created asynchronously on first
2622 * allocation (see memcg_kmem_get_cache()), several threads can try to
2623 * create the same cache, but only one of them may succeed.
2624 */
2625 if (cache_from_memcg_idx(root_cache, id))
2626 return;
2627
2628 cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
2629 cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
2630 /*
2631 * If we could not create a memcg cache, do not complain, because
2632 * that's not critical at all as we can always proceed with the root
2633 * cache.
2634 */
2635 if (!cachep)
2636 return;
2637
2638 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2639
2640 /*
2641 * Since readers won't lock (see cache_from_memcg_idx()), we need a
2642 * barrier here to ensure nobody will see the kmem_cache partially
2643 * initialized.
2644 */
2645 smp_wmb();
2646
2647 BUG_ON(root_cache->memcg_params->memcg_caches[id]);
2648 root_cache->memcg_params->memcg_caches[id] = cachep;
2649}
2650
2651static void memcg_unregister_cache(struct kmem_cache *cachep)
2652{
2653 struct kmem_cache *root_cache;
2654 struct mem_cgroup *memcg;
2655 int id;
2656
2657 lockdep_assert_held(&memcg_slab_mutex);
2658
2659 BUG_ON(is_root_cache(cachep));
2660
2661 root_cache = cachep->memcg_params->root_cache;
2662 memcg = cachep->memcg_params->memcg;
2663 id = memcg_cache_id(memcg);
2664
2665 BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
2666 root_cache->memcg_params->memcg_caches[id] = NULL;
2667
2668 list_del(&cachep->memcg_params->list);
2669
2670 kmem_cache_destroy(cachep);
2671}
2672
2673int __memcg_cleanup_cache_params(struct kmem_cache *s)
2674{
2675 struct kmem_cache *c;
2676 int i, failed = 0;
2677
2678 mutex_lock(&memcg_slab_mutex);
2679 for_each_memcg_cache_index(i) {
2680 c = cache_from_memcg_idx(s, i);
2681 if (!c)
2682 continue;
2683
2684 memcg_unregister_cache(c);
2685
2686 if (cache_from_memcg_idx(s, i))
2687 failed++;
2688 }
2689 mutex_unlock(&memcg_slab_mutex);
2690 return failed;
2691}
2692
2693static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
2694{
2695 struct kmem_cache *cachep;
2696 struct memcg_cache_params *params, *tmp;
2697
2698 if (!memcg_kmem_is_active(memcg))
2699 return;
2700
2701 mutex_lock(&memcg_slab_mutex);
2702 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
2703 cachep = memcg_params_to_cache(params);
2704 memcg_unregister_cache(cachep);
2705 }
2706 mutex_unlock(&memcg_slab_mutex);
2707}
2708
2709struct memcg_register_cache_work {
2710 struct mem_cgroup *memcg; 2562 struct mem_cgroup *memcg;
2711 struct kmem_cache *cachep; 2563 struct kmem_cache *cachep;
2712 struct work_struct work; 2564 struct work_struct work;
2713}; 2565};
2714 2566
2715static void memcg_register_cache_func(struct work_struct *w) 2567static void memcg_kmem_cache_create_func(struct work_struct *w)
2716{ 2568{
2717 struct memcg_register_cache_work *cw = 2569 struct memcg_kmem_cache_create_work *cw =
2718 container_of(w, struct memcg_register_cache_work, work); 2570 container_of(w, struct memcg_kmem_cache_create_work, work);
2719 struct mem_cgroup *memcg = cw->memcg; 2571 struct mem_cgroup *memcg = cw->memcg;
2720 struct kmem_cache *cachep = cw->cachep; 2572 struct kmem_cache *cachep = cw->cachep;
2721 2573
2722 mutex_lock(&memcg_slab_mutex); 2574 memcg_create_kmem_cache(memcg, cachep);
2723 memcg_register_cache(memcg, cachep);
2724 mutex_unlock(&memcg_slab_mutex);
2725 2575
2726 css_put(&memcg->css); 2576 css_put(&memcg->css);
2727 kfree(cw); 2577 kfree(cw);
@@ -2730,10 +2580,10 @@ static void memcg_register_cache_func(struct work_struct *w)
2730/* 2580/*
2731 * Enqueue the creation of a per-memcg kmem_cache. 2581 * Enqueue the creation of a per-memcg kmem_cache.
2732 */ 2582 */
2733static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, 2583static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2734 struct kmem_cache *cachep) 2584 struct kmem_cache *cachep)
2735{ 2585{
2736 struct memcg_register_cache_work *cw; 2586 struct memcg_kmem_cache_create_work *cw;
2737 2587
2738 cw = kmalloc(sizeof(*cw), GFP_NOWAIT); 2588 cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
2739 if (!cw) 2589 if (!cw)
@@ -2743,18 +2593,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
2743 2593
2744 cw->memcg = memcg; 2594 cw->memcg = memcg;
2745 cw->cachep = cachep; 2595 cw->cachep = cachep;
2596 INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2746 2597
2747 INIT_WORK(&cw->work, memcg_register_cache_func);
2748 schedule_work(&cw->work); 2598 schedule_work(&cw->work);
2749} 2599}
2750 2600
2751static void memcg_schedule_register_cache(struct mem_cgroup *memcg, 2601static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2752 struct kmem_cache *cachep) 2602 struct kmem_cache *cachep)
2753{ 2603{
2754 /* 2604 /*
2755 * We need to stop accounting when we kmalloc, because if the 2605 * We need to stop accounting when we kmalloc, because if the
2756 * corresponding kmalloc cache is not yet created, the first allocation 2606 * corresponding kmalloc cache is not yet created, the first allocation
2757 * in __memcg_schedule_register_cache will recurse. 2607 * in __memcg_schedule_kmem_cache_create will recurse.
2758 * 2608 *
2759 * However, it is better to enclose the whole function. Depending on 2609 * However, it is better to enclose the whole function. Depending on
2760 * the debugging options enabled, INIT_WORK(), for instance, can 2610 * the debugging options enabled, INIT_WORK(), for instance, can
@@ -2763,24 +2613,10 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
2763 * the safest choice is to do it like this, wrapping the whole function. 2613 * the safest choice is to do it like this, wrapping the whole function.
2764 */ 2614 */
2765 current->memcg_kmem_skip_account = 1; 2615 current->memcg_kmem_skip_account = 1;
2766 __memcg_schedule_register_cache(memcg, cachep); 2616 __memcg_schedule_kmem_cache_create(memcg, cachep);
2767 current->memcg_kmem_skip_account = 0; 2617 current->memcg_kmem_skip_account = 0;
2768} 2618}
2769 2619
2770int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
2771{
2772 unsigned int nr_pages = 1 << order;
2773
2774 return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
2775}
2776
2777void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
2778{
2779 unsigned int nr_pages = 1 << order;
2780
2781 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
2782}
2783
2784/* 2620/*
2785 * Return the kmem_cache we're supposed to use for a slab allocation. 2621 * Return the kmem_cache we're supposed to use for a slab allocation.
2786 * We try to use the current memcg's version of the cache. 2622 * We try to use the current memcg's version of the cache.
@@ -2798,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2798{ 2634{
2799 struct mem_cgroup *memcg; 2635 struct mem_cgroup *memcg;
2800 struct kmem_cache *memcg_cachep; 2636 struct kmem_cache *memcg_cachep;
2637 int kmemcg_id;
2801 2638
2802 VM_BUG_ON(!cachep->memcg_params); 2639 VM_BUG_ON(!is_root_cache(cachep));
2803 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
2804 2640
2805 if (current->memcg_kmem_skip_account) 2641 if (current->memcg_kmem_skip_account)
2806 return cachep; 2642 return cachep;
2807 2643
2808 memcg = get_mem_cgroup_from_mm(current->mm); 2644 memcg = get_mem_cgroup_from_mm(current->mm);
2809 if (!memcg_kmem_is_active(memcg)) 2645 kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
2646 if (kmemcg_id < 0)
2810 goto out; 2647 goto out;
2811 2648
2812 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 2649 memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2813 if (likely(memcg_cachep)) 2650 if (likely(memcg_cachep))
2814 return memcg_cachep; 2651 return memcg_cachep;
2815 2652
@@ -2825,7 +2662,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2825 * could happen with the slab_mutex held. So it's better to 2662 * could happen with the slab_mutex held. So it's better to
2826 * defer everything. 2663 * defer everything.
2827 */ 2664 */
2828 memcg_schedule_register_cache(memcg, cachep); 2665 memcg_schedule_kmem_cache_create(memcg, cachep);
2829out: 2666out:
2830 css_put(&memcg->css); 2667 css_put(&memcg->css);
2831 return cachep; 2668 return cachep;
@@ -2834,7 +2671,7 @@ out:
2834void __memcg_kmem_put_cache(struct kmem_cache *cachep) 2671void __memcg_kmem_put_cache(struct kmem_cache *cachep)
2835{ 2672{
2836 if (!is_root_cache(cachep)) 2673 if (!is_root_cache(cachep))
2837 css_put(&cachep->memcg_params->memcg->css); 2674 css_put(&cachep->memcg_params.memcg->css);
2838} 2675}
2839 2676
2840/* 2677/*
@@ -2899,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
2899 memcg_uncharge_kmem(memcg, 1 << order); 2736 memcg_uncharge_kmem(memcg, 1 << order);
2900 page->mem_cgroup = NULL; 2737 page->mem_cgroup = NULL;
2901} 2738}
2739
2740struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
2741{
2742 struct mem_cgroup *memcg = NULL;
2743 struct kmem_cache *cachep;
2744 struct page *page;
2745
2746 page = virt_to_head_page(ptr);
2747 if (PageSlab(page)) {
2748 cachep = page->slab_cache;
2749 if (!is_root_cache(cachep))
2750 memcg = cachep->memcg_params.memcg;
2751 } else
2752 /* page allocated by alloc_kmem_pages */
2753 memcg = page->mem_cgroup;
2754
2755 return memcg;
2756}
2902#endif /* CONFIG_MEMCG_KMEM */ 2757#endif /* CONFIG_MEMCG_KMEM */
2903 2758
2904#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2759#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3043,18 +2898,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
3043 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 2898 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3044 mem_cgroup_swap_statistics(from, false); 2899 mem_cgroup_swap_statistics(from, false);
3045 mem_cgroup_swap_statistics(to, true); 2900 mem_cgroup_swap_statistics(to, true);
3046 /*
3047 * This function is only called from task migration context now.
3048 * It postpones page_counter and refcount handling till the end
3049 * of task migration(mem_cgroup_clear_mc()) for performance
3050 * improvement. But we cannot postpone css_get(to) because if
3051 * the process that has been moved to @to does swap-in, the
3052 * refcount of @to might be decreased to 0.
3053 *
3054 * We are in attach() phase, so the cgroup is guaranteed to be
3055 * alive, so we can just call css_get().
3056 */
3057 css_get(&to->css);
3058 return 0; 2901 return 0;
3059 } 2902 }
3060 return -EINVAL; 2903 return -EINVAL;
@@ -3445,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
3445 int err = 0; 3288 int err = 0;
3446 int memcg_id; 3289 int memcg_id;
3447 3290
3448 if (memcg_kmem_is_active(memcg)) 3291 BUG_ON(memcg->kmemcg_id >= 0);
3449 return 0; 3292 BUG_ON(memcg->kmem_acct_activated);
3293 BUG_ON(memcg->kmem_acct_active);
3450 3294
3451 /* 3295 /*
3452 * For simplicity, we won't allow this to be disabled. It also can't 3296 * For simplicity, we won't allow this to be disabled. It also can't
@@ -3489,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
3489 * patched. 3333 * patched.
3490 */ 3334 */
3491 memcg->kmemcg_id = memcg_id; 3335 memcg->kmemcg_id = memcg_id;
3336 memcg->kmem_acct_activated = true;
3337 memcg->kmem_acct_active = true;
3492out: 3338out:
3493 return err; 3339 return err;
3494} 3340}
@@ -3545,7 +3391,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3545 int ret; 3391 int ret;
3546 3392
3547 buf = strstrip(buf); 3393 buf = strstrip(buf);
3548 ret = page_counter_memparse(buf, &nr_pages); 3394 ret = page_counter_memparse(buf, "-1", &nr_pages);
3549 if (ret) 3395 if (ret)
3550 return ret; 3396 return ret;
3551 3397
@@ -3621,7 +3467,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3621{ 3467{
3622 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3468 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3623 3469
3624 if (val >= (1 << NR_MOVE_TYPE)) 3470 if (val & ~MOVE_MASK)
3625 return -EINVAL; 3471 return -EINVAL;
3626 3472
3627 /* 3473 /*
@@ -3699,6 +3545,10 @@ static int memcg_stat_show(struct seq_file *m, void *v)
3699 struct mem_cgroup *mi; 3545 struct mem_cgroup *mi;
3700 unsigned int i; 3546 unsigned int i;
3701 3547
3548 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
3549 MEM_CGROUP_STAT_NSTATS);
3550 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
3551 MEM_CGROUP_EVENTS_NSTATS);
3702 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 3552 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3703 3553
3704 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3554 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
@@ -3913,7 +3763,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3913 unsigned long usage; 3763 unsigned long usage;
3914 int i, size, ret; 3764 int i, size, ret;
3915 3765
3916 ret = page_counter_memparse(args, &threshold); 3766 ret = page_counter_memparse(args, "-1", &threshold);
3917 if (ret) 3767 if (ret)
3918 return ret; 3768 return ret;
3919 3769
@@ -4164,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4164 return mem_cgroup_sockets_init(memcg, ss); 4014 return mem_cgroup_sockets_init(memcg, ss);
4165} 4015}
4166 4016
4017static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
4018{
4019 struct cgroup_subsys_state *css;
4020 struct mem_cgroup *parent, *child;
4021 int kmemcg_id;
4022
4023 if (!memcg->kmem_acct_active)
4024 return;
4025
4026 /*
4027 * Clear the 'active' flag before clearing entries in the memcg_caches arrays.
4028 * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
4029 * guarantees no cache will be created for this cgroup after we are
4030 * done (see memcg_create_kmem_cache()).
4031 */
4032 memcg->kmem_acct_active = false;
4033
4034 memcg_deactivate_kmem_caches(memcg);
4035
4036 kmemcg_id = memcg->kmemcg_id;
4037 BUG_ON(kmemcg_id < 0);
4038
4039 parent = parent_mem_cgroup(memcg);
4040 if (!parent)
4041 parent = root_mem_cgroup;
4042
4043 /*
4044 * Change kmemcg_id of this cgroup and all its descendants to the
4045 * parent's id, and then move all entries from this cgroup's list_lrus
4046 * to ones of the parent. After we have finished, all list_lrus
4047 * corresponding to this cgroup are guaranteed to remain empty. The
4048 * ordering is imposed by list_lru_node->lock taken by
4049 * memcg_drain_all_list_lrus().
4050 */
4051 css_for_each_descendant_pre(css, &memcg->css) {
4052 child = mem_cgroup_from_css(css);
4053 BUG_ON(child->kmemcg_id != kmemcg_id);
4054 child->kmemcg_id = parent->kmemcg_id;
4055 if (!memcg->use_hierarchy)
4056 break;
4057 }
4058 memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
4059
4060 memcg_free_cache_id(kmemcg_id);
4061}
4062
4167static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4063static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4168{ 4064{
4169 memcg_unregister_all_caches(memcg); 4065 if (memcg->kmem_acct_activated) {
4066 memcg_destroy_kmem_caches(memcg);
4067 static_key_slow_dec(&memcg_kmem_enabled_key);
4068 WARN_ON(page_counter_read(&memcg->kmem));
4069 }
4170 mem_cgroup_sockets_destroy(memcg); 4070 mem_cgroup_sockets_destroy(memcg);
4171} 4071}
4172#else 4072#else
@@ -4175,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4175 return 0; 4075 return 0;
4176} 4076}
4177 4077
4078static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
4079{
4080}
4081
4178static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4082static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4179{ 4083{
4180} 4084}
@@ -4403,7 +4307,7 @@ out_kfree:
4403 return ret; 4307 return ret;
4404} 4308}
4405 4309
4406static struct cftype mem_cgroup_files[] = { 4310static struct cftype mem_cgroup_legacy_files[] = {
4407 { 4311 {
4408 .name = "usage_in_bytes", 4312 .name = "usage_in_bytes",
4409 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4313 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
@@ -4514,34 +4418,6 @@ static struct cftype mem_cgroup_files[] = {
4514 { }, /* terminate */ 4418 { }, /* terminate */
4515}; 4419};
4516 4420
4517#ifdef CONFIG_MEMCG_SWAP
4518static struct cftype memsw_cgroup_files[] = {
4519 {
4520 .name = "memsw.usage_in_bytes",
4521 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4522 .read_u64 = mem_cgroup_read_u64,
4523 },
4524 {
4525 .name = "memsw.max_usage_in_bytes",
4526 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4527 .write = mem_cgroup_reset,
4528 .read_u64 = mem_cgroup_read_u64,
4529 },
4530 {
4531 .name = "memsw.limit_in_bytes",
4532 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4533 .write = mem_cgroup_write,
4534 .read_u64 = mem_cgroup_read_u64,
4535 },
4536 {
4537 .name = "memsw.failcnt",
4538 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4539 .write = mem_cgroup_reset,
4540 .read_u64 = mem_cgroup_read_u64,
4541 },
4542 { }, /* terminate */
4543};
4544#endif
4545static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4421static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4546{ 4422{
4547 struct mem_cgroup_per_node *pn; 4423 struct mem_cgroup_per_node *pn;
@@ -4621,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4621 free_mem_cgroup_per_zone_info(memcg, node); 4497 free_mem_cgroup_per_zone_info(memcg, node);
4622 4498
4623 free_percpu(memcg->stat); 4499 free_percpu(memcg->stat);
4624
4625 disarm_static_keys(memcg);
4626 kfree(memcg); 4500 kfree(memcg);
4627} 4501}
4628 4502
@@ -4637,29 +4511,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4637} 4511}
4638EXPORT_SYMBOL(parent_mem_cgroup); 4512EXPORT_SYMBOL(parent_mem_cgroup);
4639 4513
4640static void __init mem_cgroup_soft_limit_tree_init(void)
4641{
4642 struct mem_cgroup_tree_per_node *rtpn;
4643 struct mem_cgroup_tree_per_zone *rtpz;
4644 int tmp, node, zone;
4645
4646 for_each_node(node) {
4647 tmp = node;
4648 if (!node_state(node, N_NORMAL_MEMORY))
4649 tmp = -1;
4650 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4651 BUG_ON(!rtpn);
4652
4653 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4654
4655 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4656 rtpz = &rtpn->rb_tree_per_zone[zone];
4657 rtpz->rb_root = RB_ROOT;
4658 spin_lock_init(&rtpz->lock);
4659 }
4660 }
4661}
4662
4663static struct cgroup_subsys_state * __ref 4514static struct cgroup_subsys_state * __ref
4664mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 4515mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4665{ 4516{
@@ -4679,6 +4530,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4679 if (parent_css == NULL) { 4530 if (parent_css == NULL) {
4680 root_mem_cgroup = memcg; 4531 root_mem_cgroup = memcg;
4681 page_counter_init(&memcg->memory, NULL); 4532 page_counter_init(&memcg->memory, NULL);
4533 memcg->high = PAGE_COUNTER_MAX;
4534 memcg->soft_limit = PAGE_COUNTER_MAX;
4682 page_counter_init(&memcg->memsw, NULL); 4535 page_counter_init(&memcg->memsw, NULL);
4683 page_counter_init(&memcg->kmem, NULL); 4536 page_counter_init(&memcg->kmem, NULL);
4684 } 4537 }
@@ -4693,7 +4546,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4693 spin_lock_init(&memcg->event_list_lock); 4546 spin_lock_init(&memcg->event_list_lock);
4694#ifdef CONFIG_MEMCG_KMEM 4547#ifdef CONFIG_MEMCG_KMEM
4695 memcg->kmemcg_id = -1; 4548 memcg->kmemcg_id = -1;
4696 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
4697#endif 4549#endif
4698 4550
4699 return &memcg->css; 4551 return &memcg->css;
@@ -4724,6 +4576,8 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
4724 4576
4725 if (parent->use_hierarchy) { 4577 if (parent->use_hierarchy) {
4726 page_counter_init(&memcg->memory, &parent->memory); 4578 page_counter_init(&memcg->memory, &parent->memory);
4579 memcg->high = PAGE_COUNTER_MAX;
4580 memcg->soft_limit = PAGE_COUNTER_MAX;
4727 page_counter_init(&memcg->memsw, &parent->memsw); 4581 page_counter_init(&memcg->memsw, &parent->memsw);
4728 page_counter_init(&memcg->kmem, &parent->kmem); 4582 page_counter_init(&memcg->kmem, &parent->kmem);
4729 4583
@@ -4733,6 +4587,8 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
4733 */ 4587 */
4734 } else { 4588 } else {
4735 page_counter_init(&memcg->memory, NULL); 4589 page_counter_init(&memcg->memory, NULL);
4590 memcg->high = PAGE_COUNTER_MAX;
4591 memcg->soft_limit = PAGE_COUNTER_MAX;
4736 page_counter_init(&memcg->memsw, NULL); 4592 page_counter_init(&memcg->memsw, NULL);
4737 page_counter_init(&memcg->kmem, NULL); 4593 page_counter_init(&memcg->kmem, NULL);
4738 /* 4594 /*
@@ -4777,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4777 spin_unlock(&memcg->event_list_lock); 4633 spin_unlock(&memcg->event_list_lock);
4778 4634
4779 vmpressure_cleanup(&memcg->vmpressure); 4635 vmpressure_cleanup(&memcg->vmpressure);
4636
4637 memcg_deactivate_kmem(memcg);
4780} 4638}
4781 4639
4782static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 4640static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4807,7 +4665,9 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4807 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); 4665 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
4808 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); 4666 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
4809 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); 4667 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
4810 memcg->soft_limit = 0; 4668 memcg->low = 0;
4669 memcg->high = PAGE_COUNTER_MAX;
4670 memcg->soft_limit = PAGE_COUNTER_MAX;
4811} 4671}
4812 4672
4813#ifdef CONFIG_MMU 4673#ifdef CONFIG_MMU
@@ -4883,12 +4743,12 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4883 if (!page || !page_mapped(page)) 4743 if (!page || !page_mapped(page))
4884 return NULL; 4744 return NULL;
4885 if (PageAnon(page)) { 4745 if (PageAnon(page)) {
4886 /* we don't move shared anon */ 4746 if (!(mc.flags & MOVE_ANON))
4887 if (!move_anon())
4888 return NULL; 4747 return NULL;
4889 } else if (!move_file()) 4748 } else {
4890 /* we ignore mapcount for file pages */ 4749 if (!(mc.flags & MOVE_FILE))
4891 return NULL; 4750 return NULL;
4751 }
4892 if (!get_page_unless_zero(page)) 4752 if (!get_page_unless_zero(page))
4893 return NULL; 4753 return NULL;
4894 4754
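
The move_anon()/move_file() wrappers are replaced by direct tests of mc.flags. The flag bits themselves are defined outside the hunks shown, presumably along these lines:

	/* Assumed bit layout of mem_cgroup->move_charge_at_immigrate. */
	enum {
		MOVE_ANON	= 0x1U,
		MOVE_FILE	= 0x2U,
		MOVE_MASK	= MOVE_ANON | MOVE_FILE,
	};

This is also what the earlier val & ~MOVE_MASK range check in mem_cgroup_move_charge_write() relies on.
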
@@ -4902,7 +4762,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4902 struct page *page = NULL; 4762 struct page *page = NULL;
4903 swp_entry_t ent = pte_to_swp_entry(ptent); 4763 swp_entry_t ent = pte_to_swp_entry(ptent);
4904 4764
4905 if (!move_anon() || non_swap_entry(ent)) 4765 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
4906 return NULL; 4766 return NULL;
4907 /* 4767 /*
4908 * Because lookup_swap_cache() updates some statistics counter, 4768 * Because lookup_swap_cache() updates some statistics counter,
@@ -4931,14 +4791,11 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4931 4791
4932 if (!vma->vm_file) /* anonymous vma */ 4792 if (!vma->vm_file) /* anonymous vma */
4933 return NULL; 4793 return NULL;
4934 if (!move_file()) 4794 if (!(mc.flags & MOVE_FILE))
4935 return NULL; 4795 return NULL;
4936 4796
4937 mapping = vma->vm_file->f_mapping; 4797 mapping = vma->vm_file->f_mapping;
4938 if (pte_none(ptent)) 4798 pgoff = linear_page_index(vma, addr);
4939 pgoff = linear_page_index(vma, addr);
4940 else /* pte_file(ptent) is true */
4941 pgoff = pte_to_pgoff(ptent);
4942 4799
4943 /* page is moved even if it's not RSS of this task(page-faulted). */ 4800 /* page is moved even if it's not RSS of this task(page-faulted). */
4944#ifdef CONFIG_SWAP 4801#ifdef CONFIG_SWAP
@@ -4970,7 +4827,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4970 page = mc_handle_present_pte(vma, addr, ptent); 4827 page = mc_handle_present_pte(vma, addr, ptent);
4971 else if (is_swap_pte(ptent)) 4828 else if (is_swap_pte(ptent))
4972 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 4829 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
4973 else if (pte_none(ptent) || pte_file(ptent)) 4830 else if (pte_none(ptent))
4974 page = mc_handle_file_pte(vma, addr, ptent, &ent); 4831 page = mc_handle_file_pte(vma, addr, ptent, &ent);
4975 4832
4976 if (!page && !ent.val) 4833 if (!page && !ent.val)
@@ -5013,7 +4870,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5013 4870
5014 page = pmd_page(pmd); 4871 page = pmd_page(pmd);
5015 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 4872 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5016 if (!move_anon()) 4873 if (!(mc.flags & MOVE_ANON))
5017 return ret; 4874 return ret;
5018 if (page->mem_cgroup == mc.from) { 4875 if (page->mem_cgroup == mc.from) {
5019 ret = MC_TARGET_PAGE; 4876 ret = MC_TARGET_PAGE;
@@ -5036,7 +4893,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5036 unsigned long addr, unsigned long end, 4893 unsigned long addr, unsigned long end,
5037 struct mm_walk *walk) 4894 struct mm_walk *walk)
5038{ 4895{
5039 struct vm_area_struct *vma = walk->private; 4896 struct vm_area_struct *vma = walk->vma;
5040 pte_t *pte; 4897 pte_t *pte;
5041 spinlock_t *ptl; 4898 spinlock_t *ptl;
5042 4899
@@ -5062,20 +4919,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5062static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 4919static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5063{ 4920{
5064 unsigned long precharge; 4921 unsigned long precharge;
5065 struct vm_area_struct *vma;
5066 4922
4923 struct mm_walk mem_cgroup_count_precharge_walk = {
4924 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4925 .mm = mm,
4926 };
5067 down_read(&mm->mmap_sem); 4927 down_read(&mm->mmap_sem);
5068 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4928 walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
5069 struct mm_walk mem_cgroup_count_precharge_walk = {
5070 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5071 .mm = mm,
5072 .private = vma,
5073 };
5074 if (is_vm_hugetlb_page(vma))
5075 continue;
5076 walk_page_range(vma->vm_start, vma->vm_end,
5077 &mem_cgroup_count_precharge_walk);
5078 }
5079 up_read(&mm->mmap_sem); 4929 up_read(&mm->mmap_sem);
5080 4930
5081 precharge = mc.precharge; 4931 precharge = mc.precharge;
@@ -5155,15 +5005,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5155 struct task_struct *p = cgroup_taskset_first(tset); 5005 struct task_struct *p = cgroup_taskset_first(tset);
5156 int ret = 0; 5006 int ret = 0;
5157 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5007 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5158 unsigned long move_charge_at_immigrate; 5008 unsigned long move_flags;
5159 5009
5160 /* 5010 /*
5161 * We are now committed to this value whatever it is. Changes in this 5011 * We are now committed to this value whatever it is. Changes in this
5162 * tunable will only affect upcoming migrations, not the current one. 5012 * tunable will only affect upcoming migrations, not the current one.
5163 * So we need to save it, and keep it going. 5013 * So we need to save it, and keep it going.
5164 */ 5014 */
5165 move_charge_at_immigrate = memcg->move_charge_at_immigrate; 5015 move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate);
5166 if (move_charge_at_immigrate) { 5016 if (move_flags) {
5167 struct mm_struct *mm; 5017 struct mm_struct *mm;
5168 struct mem_cgroup *from = mem_cgroup_from_task(p); 5018 struct mem_cgroup *from = mem_cgroup_from_task(p);
5169 5019
@@ -5183,7 +5033,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5183 spin_lock(&mc.lock); 5033 spin_lock(&mc.lock);
5184 mc.from = from; 5034 mc.from = from;
5185 mc.to = memcg; 5035 mc.to = memcg;
5186 mc.immigrate_flags = move_charge_at_immigrate; 5036 mc.flags = move_flags;
5187 spin_unlock(&mc.lock); 5037 spin_unlock(&mc.lock);
5188 /* We set mc.moving_task later */ 5038 /* We set mc.moving_task later */
5189 5039
@@ -5208,7 +5058,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5208 struct mm_walk *walk) 5058 struct mm_walk *walk)
5209{ 5059{
5210 int ret = 0; 5060 int ret = 0;
5211 struct vm_area_struct *vma = walk->private; 5061 struct vm_area_struct *vma = walk->vma;
5212 pte_t *pte; 5062 pte_t *pte;
5213 spinlock_t *ptl; 5063 spinlock_t *ptl;
5214 enum mc_target_type target_type; 5064 enum mc_target_type target_type;
@@ -5304,7 +5154,10 @@ put: /* get_mctgt_type() gets the page */
5304 5154
5305static void mem_cgroup_move_charge(struct mm_struct *mm) 5155static void mem_cgroup_move_charge(struct mm_struct *mm)
5306{ 5156{
5307 struct vm_area_struct *vma; 5157 struct mm_walk mem_cgroup_move_charge_walk = {
5158 .pmd_entry = mem_cgroup_move_charge_pte_range,
5159 .mm = mm,
5160 };
5308 5161
5309 lru_add_drain_all(); 5162 lru_add_drain_all();
5310 /* 5163 /*
@@ -5327,24 +5180,11 @@ retry:
5327 cond_resched(); 5180 cond_resched();
5328 goto retry; 5181 goto retry;
5329 } 5182 }
5330 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5183 /*
5331 int ret; 5184 * When we have consumed all precharges and failed in doing
5332 struct mm_walk mem_cgroup_move_charge_walk = { 5185 * additional charge, the page walk just aborts.
5333 .pmd_entry = mem_cgroup_move_charge_pte_range, 5186 */
5334 .mm = mm, 5187 walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
5335 .private = vma,
5336 };
5337 if (is_vm_hugetlb_page(vma))
5338 continue;
5339 ret = walk_page_range(vma->vm_start, vma->vm_end,
5340 &mem_cgroup_move_charge_walk);
5341 if (ret)
5342 /*
5343 * means we have consumed all precharges and failed in
5344 * doing additional charge. Just abandon here.
5345 */
5346 break;
5347 }
5348 up_read(&mm->mmap_sem); 5188 up_read(&mm->mmap_sem);
5349 atomic_dec(&mc.from->moving_account); 5189 atomic_dec(&mc.from->moving_account);
5350} 5190}
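
Both the precharge count and the charge move now hand the entire address range to walk_page_range(), which iterates the VMAs itself and exposes the current one via walk->vma; a non-zero return from a callback aborts the walk, which is how the out-of-precharge case above stops early. A self-contained sketch of the same pattern (counting present PTEs; all names are illustrative):

	#include <linux/mm.h>

	/* Sketch only; THP pmds are simply skipped for brevity. */
	static int count_present_pte(pmd_t *pmd, unsigned long addr,
				     unsigned long end, struct mm_walk *walk)
	{
		struct vm_area_struct *vma = walk->vma;	/* set by the core walker */
		unsigned long *count = walk->private;
		spinlock_t *ptl;
		pte_t *pte;

		if (pmd_trans_unstable(pmd))
			return 0;
		pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
		for (; addr != end; pte++, addr += PAGE_SIZE)
			if (pte_present(*pte))
				(*count)++;
		pte_unmap_unlock(pte - 1, ptl);
		return 0;			/* non-zero would abort the whole walk */
	}

	static unsigned long count_present(struct mm_struct *mm)
	{
		unsigned long count = 0;
		struct mm_walk walk = {
			.pmd_entry = count_present_pte,
			.mm = mm,
			.private = &count,
		};

		down_read(&mm->mmap_sem);
		walk_page_range(0, ~0UL, &walk);
		up_read(&mm->mmap_sem);
		return count;
	}
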
@@ -5395,118 +5235,211 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5395 mem_cgroup_from_css(root_css)->use_hierarchy = true; 5235 mem_cgroup_from_css(root_css)->use_hierarchy = true;
5396} 5236}
5397 5237
5398struct cgroup_subsys memory_cgrp_subsys = { 5238static u64 memory_current_read(struct cgroup_subsys_state *css,
5399 .css_alloc = mem_cgroup_css_alloc, 5239 struct cftype *cft)
5400 .css_online = mem_cgroup_css_online, 5240{
5401 .css_offline = mem_cgroup_css_offline, 5241 return mem_cgroup_usage(mem_cgroup_from_css(css), false);
5402 .css_free = mem_cgroup_css_free, 5242}
5403 .css_reset = mem_cgroup_css_reset,
5404 .can_attach = mem_cgroup_can_attach,
5405 .cancel_attach = mem_cgroup_cancel_attach,
5406 .attach = mem_cgroup_move_task,
5407 .bind = mem_cgroup_bind,
5408 .legacy_cftypes = mem_cgroup_files,
5409 .early_init = 0,
5410};
5411 5243
5412#ifdef CONFIG_MEMCG_SWAP 5244static int memory_low_show(struct seq_file *m, void *v)
5413static int __init enable_swap_account(char *s)
5414{ 5245{
5415 if (!strcmp(s, "1")) 5246 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5416 really_do_swap_account = 1; 5247 unsigned long low = ACCESS_ONCE(memcg->low);
5417 else if (!strcmp(s, "0")) 5248
5418 really_do_swap_account = 0; 5249 if (low == PAGE_COUNTER_MAX)
5419 return 1; 5250 seq_puts(m, "infinity\n");
5251 else
5252 seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5253
5254 return 0;
5420} 5255}
5421__setup("swapaccount=", enable_swap_account);
5422 5256
5423static void __init memsw_file_init(void) 5257static ssize_t memory_low_write(struct kernfs_open_file *of,
5258 char *buf, size_t nbytes, loff_t off)
5424{ 5259{
5425 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, 5260 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5426 memsw_cgroup_files)); 5261 unsigned long low;
5262 int err;
5263
5264 buf = strstrip(buf);
5265 err = page_counter_memparse(buf, "infinity", &low);
5266 if (err)
5267 return err;
5268
5269 memcg->low = low;
5270
5271 return nbytes;
5427} 5272}
5428 5273
5429static void __init enable_swap_cgroup(void) 5274static int memory_high_show(struct seq_file *m, void *v)
5430{ 5275{
5431 if (!mem_cgroup_disabled() && really_do_swap_account) { 5276 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5432 do_swap_account = 1; 5277 unsigned long high = ACCESS_ONCE(memcg->high);
5433 memsw_file_init(); 5278
5434 } 5279 if (high == PAGE_COUNTER_MAX)
5280 seq_puts(m, "infinity\n");
5281 else
5282 seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5283
5284 return 0;
5435} 5285}
5436 5286
5437#else 5287static ssize_t memory_high_write(struct kernfs_open_file *of,
5438static void __init enable_swap_cgroup(void) 5288 char *buf, size_t nbytes, loff_t off)
5439{ 5289{
5290 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5291 unsigned long high;
5292 int err;
5293
5294 buf = strstrip(buf);
5295 err = page_counter_memparse(buf, "infinity", &high);
5296 if (err)
5297 return err;
5298
5299 memcg->high = high;
5300
5301 return nbytes;
5440} 5302}
5441#endif
5442 5303
5443#ifdef CONFIG_MEMCG_SWAP 5304static int memory_max_show(struct seq_file *m, void *v)
5444/**
5445 * mem_cgroup_swapout - transfer a memsw charge to swap
5446 * @page: page whose memsw charge to transfer
5447 * @entry: swap entry to move the charge to
5448 *
5449 * Transfer the memsw charge of @page to @entry.
5450 */
5451void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5452{ 5305{
5453 struct mem_cgroup *memcg; 5306 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5454 unsigned short oldid; 5307 unsigned long max = ACCESS_ONCE(memcg->memory.limit);
5455 5308
5456 VM_BUG_ON_PAGE(PageLRU(page), page); 5309 if (max == PAGE_COUNTER_MAX)
5457 VM_BUG_ON_PAGE(page_count(page), page); 5310 seq_puts(m, "infinity\n");
5311 else
5312 seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5458 5313
5459 if (!do_swap_account) 5314 return 0;
5460 return; 5315}
5461 5316
5462 memcg = page->mem_cgroup; 5317static ssize_t memory_max_write(struct kernfs_open_file *of,
5318 char *buf, size_t nbytes, loff_t off)
5319{
5320 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5321 unsigned long max;
5322 int err;
5463 5323
5464 /* Readahead page, never charged */ 5324 buf = strstrip(buf);
5465 if (!memcg) 5325 err = page_counter_memparse(buf, "infinity", &max);
5466 return; 5326 if (err)
5327 return err;
5467 5328
5468 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); 5329 err = mem_cgroup_resize_limit(memcg, max);
5469 VM_BUG_ON_PAGE(oldid, page); 5330 if (err)
5470 mem_cgroup_swap_statistics(memcg, true); 5331 return err;
5471 5332
5472 page->mem_cgroup = NULL; 5333 return nbytes;
5334}
5473 5335
5474 if (!mem_cgroup_is_root(memcg)) 5336static int memory_events_show(struct seq_file *m, void *v)
5475 page_counter_uncharge(&memcg->memory, 1); 5337{
5338 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5476 5339
5477 /* XXX: caller holds IRQ-safe mapping->tree_lock */ 5340 seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
5478 VM_BUG_ON(!irqs_disabled()); 5341 seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
5342 seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
5343 seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
5479 5344
5480 mem_cgroup_charge_statistics(memcg, page, -1); 5345 return 0;
5481 memcg_check_events(memcg, page);
5482} 5346}
5483 5347
5348static struct cftype memory_files[] = {
5349 {
5350 .name = "current",
5351 .read_u64 = memory_current_read,
5352 },
5353 {
5354 .name = "low",
5355 .flags = CFTYPE_NOT_ON_ROOT,
5356 .seq_show = memory_low_show,
5357 .write = memory_low_write,
5358 },
5359 {
5360 .name = "high",
5361 .flags = CFTYPE_NOT_ON_ROOT,
5362 .seq_show = memory_high_show,
5363 .write = memory_high_write,
5364 },
5365 {
5366 .name = "max",
5367 .flags = CFTYPE_NOT_ON_ROOT,
5368 .seq_show = memory_max_show,
5369 .write = memory_max_write,
5370 },
5371 {
5372 .name = "events",
5373 .flags = CFTYPE_NOT_ON_ROOT,
5374 .seq_show = memory_events_show,
5375 },
5376 { } /* terminate */
5377};
5378
5379struct cgroup_subsys memory_cgrp_subsys = {
5380 .css_alloc = mem_cgroup_css_alloc,
5381 .css_online = mem_cgroup_css_online,
5382 .css_offline = mem_cgroup_css_offline,
5383 .css_free = mem_cgroup_css_free,
5384 .css_reset = mem_cgroup_css_reset,
5385 .can_attach = mem_cgroup_can_attach,
5386 .cancel_attach = mem_cgroup_cancel_attach,
5387 .attach = mem_cgroup_move_task,
5388 .bind = mem_cgroup_bind,
5389 .dfl_cftypes = memory_files,
5390 .legacy_cftypes = mem_cgroup_legacy_files,
5391 .early_init = 0,
5392};
5393
5484/** 5394/**
5485 * mem_cgroup_uncharge_swap - uncharge a swap entry 5395 * mem_cgroup_events - count memory events against a cgroup
5486 * @entry: swap entry to uncharge 5396 * @memcg: the memory cgroup
5397 * @idx: the event index
5398 * @nr: the number of events to account for
5399 */
5400void mem_cgroup_events(struct mem_cgroup *memcg,
5401 enum mem_cgroup_events_index idx,
5402 unsigned int nr)
5403{
5404 this_cpu_add(memcg->stat->events[idx], nr);
5405}
5406
5407/**
5408 * mem_cgroup_low - check if memory consumption is below the normal range
5409 * @root: the highest ancestor to consider
5410 * @memcg: the memory cgroup to check
5487 * 5411 *
5488 * Drop the memsw charge associated with @entry. 5412 * Returns %true if memory consumption of @memcg, and that of all
5413 * configurable ancestors up to @root, is below the normal range.
5489 */ 5414 */
5490void mem_cgroup_uncharge_swap(swp_entry_t entry) 5415bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
5491{ 5416{
5492 struct mem_cgroup *memcg; 5417 if (mem_cgroup_disabled())
5493 unsigned short id; 5418 return false;
5494 5419
5495 if (!do_swap_account) 5420 /*
5496 return; 5421 * The toplevel group doesn't have a configurable range, so
5422 * it's never low when looked at directly, and it is not
5423 * considered an ancestor when assessing the hierarchy.
5424 */
5497 5425
5498 id = swap_cgroup_record(entry, 0); 5426 if (memcg == root_mem_cgroup)
5499 rcu_read_lock(); 5427 return false;
5500 memcg = mem_cgroup_lookup(id); 5428
5501 if (memcg) { 5429 if (page_counter_read(&memcg->memory) > memcg->low)
5502 if (!mem_cgroup_is_root(memcg)) 5430 return false;
5503 page_counter_uncharge(&memcg->memsw, 1); 5431
5504 mem_cgroup_swap_statistics(memcg, false); 5432 while (memcg != root) {
5505 css_put(&memcg->css); 5433 memcg = parent_mem_cgroup(memcg);
5434
5435 if (memcg == root_mem_cgroup)
5436 break;
5437
5438 if (page_counter_read(&memcg->memory) > memcg->low)
5439 return false;
5506 } 5440 }
5507 rcu_read_unlock(); 5441 return true;
5508} 5442}
5509#endif
5510 5443
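
mem_cgroup_events() and mem_cgroup_low() are the backend for the new memory.events and memory.low files. The reclaim side of this series (mm/vmscan.c, not shown in this part of the diff) is expected to consult them roughly as in this sketch; the may_thrash flag and the function name are assumptions, not taken from the diff:

	static void reclaim_hierarchy_sketch(struct mem_cgroup *root, bool may_thrash)
	{
		struct mem_cgroup *memcg;

		memcg = mem_cgroup_iter(root, NULL, NULL);
		do {
			if (mem_cgroup_low(root, memcg)) {
				if (!may_thrash)
					continue;	/* still protected: skip this group */
				/* protection has to be breached: record the event */
				mem_cgroup_events(memcg, MEMCG_LOW, 1);
			}
			/* ... shrink this memcg's LRU lists here ... */
		} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
	}
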
5511/** 5444/**
5512 * mem_cgroup_try_charge - try charging a page 5445 * mem_cgroup_try_charge - try charging a page
@@ -5782,7 +5715,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
5782 * mem_cgroup_migrate - migrate a charge to another page 5715 * mem_cgroup_migrate - migrate a charge to another page
5783 * @oldpage: currently charged page 5716 * @oldpage: currently charged page
5784 * @newpage: page to transfer the charge to 5717 * @newpage: page to transfer the charge to
5785 * @lrucare: both pages might be on the LRU already 5718 * @lrucare: either or both pages might be on the LRU already
5786 * 5719 *
5787 * Migrate the charge from @oldpage to @newpage. 5720 * Migrate the charge from @oldpage to @newpage.
5788 * 5721 *
@@ -5840,10 +5773,155 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
5840 */ 5773 */
5841static int __init mem_cgroup_init(void) 5774static int __init mem_cgroup_init(void)
5842{ 5775{
5776 int cpu, node;
5777
5843 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 5778 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5844 enable_swap_cgroup(); 5779
5845 mem_cgroup_soft_limit_tree_init(); 5780 for_each_possible_cpu(cpu)
5846 memcg_stock_init(); 5781 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
5782 drain_local_stock);
5783
5784 for_each_node(node) {
5785 struct mem_cgroup_tree_per_node *rtpn;
5786 int zone;
5787
5788 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
5789 node_online(node) ? node : NUMA_NO_NODE);
5790
5791 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5792 struct mem_cgroup_tree_per_zone *rtpz;
5793
5794 rtpz = &rtpn->rb_tree_per_zone[zone];
5795 rtpz->rb_root = RB_ROOT;
5796 spin_lock_init(&rtpz->lock);
5797 }
5798 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5799 }
5800
5847 return 0; 5801 return 0;
5848} 5802}
5849subsys_initcall(mem_cgroup_init); 5803subsys_initcall(mem_cgroup_init);
5804
5805#ifdef CONFIG_MEMCG_SWAP
5806/**
5807 * mem_cgroup_swapout - transfer a memsw charge to swap
5808 * @page: page whose memsw charge to transfer
5809 * @entry: swap entry to move the charge to
5810 *
5811 * Transfer the memsw charge of @page to @entry.
5812 */
5813void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5814{
5815 struct mem_cgroup *memcg;
5816 unsigned short oldid;
5817
5818 VM_BUG_ON_PAGE(PageLRU(page), page);
5819 VM_BUG_ON_PAGE(page_count(page), page);
5820
5821 if (!do_swap_account)
5822 return;
5823
5824 memcg = page->mem_cgroup;
5825
5826 /* Readahead page, never charged */
5827 if (!memcg)
5828 return;
5829
5830 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
5831 VM_BUG_ON_PAGE(oldid, page);
5832 mem_cgroup_swap_statistics(memcg, true);
5833
5834 page->mem_cgroup = NULL;
5835
5836 if (!mem_cgroup_is_root(memcg))
5837 page_counter_uncharge(&memcg->memory, 1);
5838
5839 /* XXX: caller holds IRQ-safe mapping->tree_lock */
5840 VM_BUG_ON(!irqs_disabled());
5841
5842 mem_cgroup_charge_statistics(memcg, page, -1);
5843 memcg_check_events(memcg, page);
5844}
5845
5846/**
5847 * mem_cgroup_uncharge_swap - uncharge a swap entry
5848 * @entry: swap entry to uncharge
5849 *
5850 * Drop the memsw charge associated with @entry.
5851 */
5852void mem_cgroup_uncharge_swap(swp_entry_t entry)
5853{
5854 struct mem_cgroup *memcg;
5855 unsigned short id;
5856
5857 if (!do_swap_account)
5858 return;
5859
5860 id = swap_cgroup_record(entry, 0);
5861 rcu_read_lock();
5862 memcg = mem_cgroup_lookup(id);
5863 if (memcg) {
5864 if (!mem_cgroup_is_root(memcg))
5865 page_counter_uncharge(&memcg->memsw, 1);
5866 mem_cgroup_swap_statistics(memcg, false);
5867 css_put(&memcg->css);
5868 }
5869 rcu_read_unlock();
5870}
5871
5872/* to remember the boot option */
5873#ifdef CONFIG_MEMCG_SWAP_ENABLED
5874static int really_do_swap_account __initdata = 1;
5875#else
5876static int really_do_swap_account __initdata;
5877#endif
5878
5879static int __init enable_swap_account(char *s)
5880{
5881 if (!strcmp(s, "1"))
5882 really_do_swap_account = 1;
5883 else if (!strcmp(s, "0"))
5884 really_do_swap_account = 0;
5885 return 1;
5886}
5887__setup("swapaccount=", enable_swap_account);
5888
5889static struct cftype memsw_cgroup_files[] = {
5890 {
5891 .name = "memsw.usage_in_bytes",
5892 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5893 .read_u64 = mem_cgroup_read_u64,
5894 },
5895 {
5896 .name = "memsw.max_usage_in_bytes",
5897 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5898 .write = mem_cgroup_reset,
5899 .read_u64 = mem_cgroup_read_u64,
5900 },
5901 {
5902 .name = "memsw.limit_in_bytes",
5903 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5904 .write = mem_cgroup_write,
5905 .read_u64 = mem_cgroup_read_u64,
5906 },
5907 {
5908 .name = "memsw.failcnt",
5909 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5910 .write = mem_cgroup_reset,
5911 .read_u64 = mem_cgroup_read_u64,
5912 },
5913 { }, /* terminate */
5914};
5915
5916static int __init mem_cgroup_swap_init(void)
5917{
5918 if (!mem_cgroup_disabled() && really_do_swap_account) {
5919 do_swap_account = 1;
5920 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
5921 memsw_cgroup_files));
5922 }
5923 return 0;
5924}
5925subsys_initcall(mem_cgroup_swap_init);
5926
5927#endif /* CONFIG_MEMCG_SWAP */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index feb803bf3443..d487f8dc6d39 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -242,15 +242,8 @@ void shake_page(struct page *p, int access)
242 * Only call shrink_node_slabs here (which would also shrink 242 * Only call shrink_node_slabs here (which would also shrink
243 * other caches) if access is not potentially fatal. 243 * other caches) if access is not potentially fatal.
244 */ 244 */
245 if (access) { 245 if (access)
246 int nr; 246 drop_slab_node(page_to_nid(p));
247 int nid = page_to_nid(p);
248 do {
249 nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
250 if (page_count(p) == 1)
251 break;
252 } while (nr > 10);
253 }
254} 247}
255EXPORT_SYMBOL_GPL(shake_page); 248EXPORT_SYMBOL_GPL(shake_page);
256 249
@@ -1654,8 +1647,6 @@ static int __soft_offline_page(struct page *page, int flags)
1654 * setting PG_hwpoison. 1647 * setting PG_hwpoison.
1655 */ 1648 */
1656 if (!is_free_buddy_page(page)) 1649 if (!is_free_buddy_page(page))
1657 lru_add_drain_all();
1658 if (!is_free_buddy_page(page))
1659 drain_all_pages(page_zone(page)); 1650 drain_all_pages(page_zone(page));
1660 SetPageHWPoison(page); 1651 SetPageHWPoison(page);
1661 if (!is_free_buddy_page(page)) 1652 if (!is_free_buddy_page(page))
diff --git a/mm/memory.c b/mm/memory.c
index ca920d1fd314..99275325f303 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -235,6 +235,9 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
235 235
236static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) 236static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
237{ 237{
238 if (!tlb->end)
239 return;
240
238 tlb_flush(tlb); 241 tlb_flush(tlb);
239 mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end); 242 mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
240#ifdef CONFIG_HAVE_RCU_TABLE_FREE 243#ifdef CONFIG_HAVE_RCU_TABLE_FREE
@@ -247,7 +250,7 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
247{ 250{
248 struct mmu_gather_batch *batch; 251 struct mmu_gather_batch *batch;
249 252
250 for (batch = &tlb->local; batch; batch = batch->next) { 253 for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
251 free_pages_and_swap_cache(batch->pages, batch->nr); 254 free_pages_and_swap_cache(batch->pages, batch->nr);
252 batch->nr = 0; 255 batch->nr = 0;
253 } 256 }
@@ -256,9 +259,6 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
256 259
257void tlb_flush_mmu(struct mmu_gather *tlb) 260void tlb_flush_mmu(struct mmu_gather *tlb)
258{ 261{
259 if (!tlb->end)
260 return;
261
262 tlb_flush_mmu_tlbonly(tlb); 262 tlb_flush_mmu_tlbonly(tlb);
263 tlb_flush_mmu_free(tlb); 263 tlb_flush_mmu_free(tlb);
264} 264}
@@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
428 pmd = pmd_offset(pud, start); 428 pmd = pmd_offset(pud, start);
429 pud_clear(pud); 429 pud_clear(pud);
430 pmd_free_tlb(tlb, pmd, start); 430 pmd_free_tlb(tlb, pmd, start);
431 mm_dec_nr_pmds(tlb->mm);
431} 432}
432 433
433static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 434static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -754,6 +755,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
754 if (HAVE_PTE_SPECIAL) { 755 if (HAVE_PTE_SPECIAL) {
755 if (likely(!pte_special(pte))) 756 if (likely(!pte_special(pte)))
756 goto check_pfn; 757 goto check_pfn;
758 if (vma->vm_ops && vma->vm_ops->find_special_page)
759 return vma->vm_ops->find_special_page(vma, addr);
757 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) 760 if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
758 return NULL; 761 return NULL;
759 if (!is_zero_pfn(pfn)) 762 if (!is_zero_pfn(pfn))
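
The new find_special_page hook lets a driver that installs special PTEs hand the backing struct page to vm_normal_page(). A hedged sketch of a driver wiring it up; every mydrv_* name is illustrative:

	struct mydrv_mapping {
		struct page **pages;	/* one page per pfn in the VMA */
	};

	static struct page *mydrv_find_special_page(struct vm_area_struct *vma,
						    unsigned long addr)
	{
		struct mydrv_mapping *m = vma->vm_private_data;

		return m->pages[(addr - vma->vm_start) >> PAGE_SHIFT];
	}

	static const struct vm_operations_struct mydrv_vm_ops = {
		.find_special_page = mydrv_find_special_page,
	};
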
@@ -811,42 +814,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
811 814
812 /* pte contains position in swap or file, so copy. */ 815 /* pte contains position in swap or file, so copy. */
813 if (unlikely(!pte_present(pte))) { 816 if (unlikely(!pte_present(pte))) {
814 if (!pte_file(pte)) { 817 swp_entry_t entry = pte_to_swp_entry(pte);
815 swp_entry_t entry = pte_to_swp_entry(pte); 818
816 819 if (likely(!non_swap_entry(entry))) {
817 if (likely(!non_swap_entry(entry))) { 820 if (swap_duplicate(entry) < 0)
818 if (swap_duplicate(entry) < 0) 821 return entry.val;
819 return entry.val; 822
820 823 /* make sure dst_mm is on swapoff's mmlist. */
821 /* make sure dst_mm is on swapoff's mmlist. */ 824 if (unlikely(list_empty(&dst_mm->mmlist))) {
822 if (unlikely(list_empty(&dst_mm->mmlist))) { 825 spin_lock(&mmlist_lock);
823 spin_lock(&mmlist_lock); 826 if (list_empty(&dst_mm->mmlist))
824 if (list_empty(&dst_mm->mmlist)) 827 list_add(&dst_mm->mmlist,
825 list_add(&dst_mm->mmlist, 828 &src_mm->mmlist);
826 &src_mm->mmlist); 829 spin_unlock(&mmlist_lock);
827 spin_unlock(&mmlist_lock); 830 }
828 } 831 rss[MM_SWAPENTS]++;
829 rss[MM_SWAPENTS]++; 832 } else if (is_migration_entry(entry)) {
830 } else if (is_migration_entry(entry)) { 833 page = migration_entry_to_page(entry);
831 page = migration_entry_to_page(entry); 834
832 835 if (PageAnon(page))
833 if (PageAnon(page)) 836 rss[MM_ANONPAGES]++;
834 rss[MM_ANONPAGES]++; 837 else
835 else 838 rss[MM_FILEPAGES]++;
836 rss[MM_FILEPAGES]++; 839
837 840 if (is_write_migration_entry(entry) &&
838 if (is_write_migration_entry(entry) && 841 is_cow_mapping(vm_flags)) {
839 is_cow_mapping(vm_flags)) { 842 /*
840 /* 843 * COW mappings require pages in both
841 * COW mappings require pages in both 844 * parent and child to be set to read.
842 * parent and child to be set to read. 845 */
843 */ 846 make_migration_entry_read(&entry);
844 make_migration_entry_read(&entry); 847 pte = swp_entry_to_pte(entry);
845 pte = swp_entry_to_pte(entry); 848 if (pte_swp_soft_dirty(*src_pte))
846 if (pte_swp_soft_dirty(*src_pte)) 849 pte = pte_swp_mksoft_dirty(pte);
847 pte = pte_swp_mksoft_dirty(pte); 850 set_pte_at(src_mm, addr, src_pte, pte);
848 set_pte_at(src_mm, addr, src_pte, pte);
849 }
850 } 851 }
851 } 852 }
852 goto out_set_pte; 853 goto out_set_pte;
@@ -1020,11 +1021,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1020 * readonly mappings. The tradeoff is that copy_page_range is more 1021 * readonly mappings. The tradeoff is that copy_page_range is more
1021 * efficient than faulting. 1022 * efficient than faulting.
1022 */ 1023 */
1023 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | 1024 if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
1024 VM_PFNMAP | VM_MIXEDMAP))) { 1025 !vma->anon_vma)
1025 if (!vma->anon_vma) 1026 return 0;
1026 return 0;
1027 }
1028 1027
1029 if (is_vm_hugetlb_page(vma)) 1028 if (is_vm_hugetlb_page(vma))
1030 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 1029 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
@@ -1082,6 +1081,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
1082 spinlock_t *ptl; 1081 spinlock_t *ptl;
1083 pte_t *start_pte; 1082 pte_t *start_pte;
1084 pte_t *pte; 1083 pte_t *pte;
1084 swp_entry_t entry;
1085 1085
1086again: 1086again:
1087 init_rss_vec(rss); 1087 init_rss_vec(rss);
@@ -1107,28 +1107,12 @@ again:
1107 if (details->check_mapping && 1107 if (details->check_mapping &&
1108 details->check_mapping != page->mapping) 1108 details->check_mapping != page->mapping)
1109 continue; 1109 continue;
1110 /*
1111 * Each page->index must be checked when
1112 * invalidating or truncating nonlinear.
1113 */
1114 if (details->nonlinear_vma &&
1115 (page->index < details->first_index ||
1116 page->index > details->last_index))
1117 continue;
1118 } 1110 }
1119 ptent = ptep_get_and_clear_full(mm, addr, pte, 1111 ptent = ptep_get_and_clear_full(mm, addr, pte,
1120 tlb->fullmm); 1112 tlb->fullmm);
1121 tlb_remove_tlb_entry(tlb, pte, addr); 1113 tlb_remove_tlb_entry(tlb, pte, addr);
1122 if (unlikely(!page)) 1114 if (unlikely(!page))
1123 continue; 1115 continue;
1124 if (unlikely(details) && details->nonlinear_vma
1125 && linear_page_index(details->nonlinear_vma,
1126 addr) != page->index) {
1127 pte_t ptfile = pgoff_to_pte(page->index);
1128 if (pte_soft_dirty(ptent))
1129 ptfile = pte_file_mksoft_dirty(ptfile);
1130 set_pte_at(mm, addr, pte, ptfile);
1131 }
1132 if (PageAnon(page)) 1116 if (PageAnon(page))
1133 rss[MM_ANONPAGES]--; 1117 rss[MM_ANONPAGES]--;
1134 else { 1118 else {
@@ -1151,33 +1135,25 @@ again:
1151 } 1135 }
1152 continue; 1136 continue;
1153 } 1137 }
1154 /* 1138 /* If details->check_mapping, we leave swap entries. */
1155 * If details->check_mapping, we leave swap entries;
1156 * if details->nonlinear_vma, we leave file entries.
1157 */
1158 if (unlikely(details)) 1139 if (unlikely(details))
1159 continue; 1140 continue;
1160 if (pte_file(ptent)) {
1161 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
1162 print_bad_pte(vma, addr, ptent, NULL);
1163 } else {
1164 swp_entry_t entry = pte_to_swp_entry(ptent);
1165 1141
1166 if (!non_swap_entry(entry)) 1142 entry = pte_to_swp_entry(ptent);
1167 rss[MM_SWAPENTS]--; 1143 if (!non_swap_entry(entry))
1168 else if (is_migration_entry(entry)) { 1144 rss[MM_SWAPENTS]--;
1169 struct page *page; 1145 else if (is_migration_entry(entry)) {
1146 struct page *page;
1170 1147
1171 page = migration_entry_to_page(entry); 1148 page = migration_entry_to_page(entry);
1172 1149
1173 if (PageAnon(page)) 1150 if (PageAnon(page))
1174 rss[MM_ANONPAGES]--; 1151 rss[MM_ANONPAGES]--;
1175 else 1152 else
1176 rss[MM_FILEPAGES]--; 1153 rss[MM_FILEPAGES]--;
1177 }
1178 if (unlikely(!free_swap_and_cache(entry)))
1179 print_bad_pte(vma, addr, ptent, NULL);
1180 } 1154 }
1155 if (unlikely(!free_swap_and_cache(entry)))
1156 print_bad_pte(vma, addr, ptent, NULL);
1181 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 1157 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
1182 } while (pte++, addr += PAGE_SIZE, addr != end); 1158 } while (pte++, addr += PAGE_SIZE, addr != end);
1183 1159
@@ -1277,7 +1253,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
1277 pgd_t *pgd; 1253 pgd_t *pgd;
1278 unsigned long next; 1254 unsigned long next;
1279 1255
1280 if (details && !details->check_mapping && !details->nonlinear_vma) 1256 if (details && !details->check_mapping)
1281 details = NULL; 1257 details = NULL;
1282 1258
1283 BUG_ON(addr >= end); 1259 BUG_ON(addr >= end);
@@ -1371,7 +1347,7 @@ void unmap_vmas(struct mmu_gather *tlb,
1371 * @vma: vm_area_struct holding the applicable pages 1347 * @vma: vm_area_struct holding the applicable pages
1372 * @start: starting address of pages to zap 1348 * @start: starting address of pages to zap
1373 * @size: number of bytes to zap 1349 * @size: number of bytes to zap
1374 * @details: details of nonlinear truncation or shared cache invalidation 1350 * @details: details of shared cache invalidation
1375 * 1351 *
1376 * Caller must protect the VMA list 1352 * Caller must protect the VMA list
1377 */ 1353 */
@@ -1397,7 +1373,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1397 * @vma: vm_area_struct holding the applicable pages 1373 * @vma: vm_area_struct holding the applicable pages
1398 * @address: starting address of pages to zap 1374 * @address: starting address of pages to zap
1399 * @size: number of bytes to zap 1375 * @size: number of bytes to zap
1400 * @details: details of nonlinear truncation or shared cache invalidation 1376 * @details: details of shared cache invalidation
1401 * 1377 *
1402 * The range must fit into one VMA. 1378 * The range must fit into one VMA.
1403 */ 1379 */
@@ -1922,12 +1898,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1922EXPORT_SYMBOL_GPL(apply_to_page_range); 1898EXPORT_SYMBOL_GPL(apply_to_page_range);
1923 1899
1924/* 1900/*
1925 * handle_pte_fault chooses page fault handler according to an entry 1901 * handle_pte_fault chooses page fault handler according to an entry which was
1926 * which was read non-atomically. Before making any commitment, on 1902 * read non-atomically. Before making any commitment, on those architectures
1927 * those architectures or configurations (e.g. i386 with PAE) which 1903 * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
1928 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault 1904 * parts, do_swap_page must check under lock before unmapping the pte and
1929 * must check under lock before unmapping the pte and proceeding 1905 * proceeding (but do_wp_page is only called after already making such a check;
1930 * (but do_wp_page is only called after already making such a check;
1931 * and do_anonymous_page can safely check later on). 1906 * and do_anonymous_page can safely check later on).
1932 */ 1907 */
1933static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, 1908static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
@@ -2033,7 +2008,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2033 pte_t entry; 2008 pte_t entry;
2034 int ret = 0; 2009 int ret = 0;
2035 int page_mkwrite = 0; 2010 int page_mkwrite = 0;
2036 struct page *dirty_page = NULL; 2011 bool dirty_shared = false;
2037 unsigned long mmun_start = 0; /* For mmu_notifiers */ 2012 unsigned long mmun_start = 0; /* For mmu_notifiers */
2038 unsigned long mmun_end = 0; /* For mmu_notifiers */ 2013 unsigned long mmun_end = 0; /* For mmu_notifiers */
2039 struct mem_cgroup *memcg; 2014 struct mem_cgroup *memcg;
@@ -2084,6 +2059,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2084 unlock_page(old_page); 2059 unlock_page(old_page);
2085 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2060 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2086 (VM_WRITE|VM_SHARED))) { 2061 (VM_WRITE|VM_SHARED))) {
2062 page_cache_get(old_page);
2087 /* 2063 /*
2088 * Only catch write-faults on shared writable pages, 2064 * Only catch write-faults on shared writable pages,
2089 * read-only shared pages can get COWed by 2065 * read-only shared pages can get COWed by
@@ -2091,7 +2067,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2091 */ 2067 */
2092 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 2068 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
2093 int tmp; 2069 int tmp;
2094 page_cache_get(old_page); 2070
2095 pte_unmap_unlock(page_table, ptl); 2071 pte_unmap_unlock(page_table, ptl);
2096 tmp = do_page_mkwrite(vma, old_page, address); 2072 tmp = do_page_mkwrite(vma, old_page, address);
2097 if (unlikely(!tmp || (tmp & 2073 if (unlikely(!tmp || (tmp &
@@ -2111,11 +2087,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2111 unlock_page(old_page); 2087 unlock_page(old_page);
2112 goto unlock; 2088 goto unlock;
2113 } 2089 }
2114
2115 page_mkwrite = 1; 2090 page_mkwrite = 1;
2116 } 2091 }
2117 dirty_page = old_page; 2092
2118 get_page(dirty_page); 2093 dirty_shared = true;
2119 2094
2120reuse: 2095reuse:
2121 /* 2096 /*
@@ -2134,38 +2109,29 @@ reuse:
2134 pte_unmap_unlock(page_table, ptl); 2109 pte_unmap_unlock(page_table, ptl);
2135 ret |= VM_FAULT_WRITE; 2110 ret |= VM_FAULT_WRITE;
2136 2111
2137 if (!dirty_page) 2112 if (dirty_shared) {
2138 return ret; 2113 struct address_space *mapping;
2114 int dirtied;
2139 2115
2140 /* 2116 if (!page_mkwrite)
2141 * Yes, Virginia, this is actually required to prevent a race 2117 lock_page(old_page);
2142 * with clear_page_dirty_for_io() from clearing the page dirty 2118
2143 * bit after it clear all dirty ptes, but before a racing 2119 dirtied = set_page_dirty(old_page);
2144 * do_wp_page installs a dirty pte. 2120 VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
2145 * 2121 mapping = old_page->mapping;
2146 * do_shared_fault is protected similarly. 2122 unlock_page(old_page);
2147 */ 2123 page_cache_release(old_page);
2148 if (!page_mkwrite) { 2124
2149 wait_on_page_locked(dirty_page); 2125 if ((dirtied || page_mkwrite) && mapping) {
2150 set_page_dirty_balance(dirty_page);
2151 /* file_update_time outside page_lock */
2152 if (vma->vm_file)
2153 file_update_time(vma->vm_file);
2154 }
2155 put_page(dirty_page);
2156 if (page_mkwrite) {
2157 struct address_space *mapping = dirty_page->mapping;
2158
2159 set_page_dirty(dirty_page);
2160 unlock_page(dirty_page);
2161 page_cache_release(dirty_page);
2162 if (mapping) {
2163 /* 2126 /*
2164 * Some device drivers do not set page.mapping 2127 * Some device drivers do not set page.mapping
2165 * but still dirty their pages 2128 * but still dirty their pages
2166 */ 2129 */
2167 balance_dirty_pages_ratelimited(mapping); 2130 balance_dirty_pages_ratelimited(mapping);
2168 } 2131 }
2132
2133 if (!page_mkwrite)
2134 file_update_time(vma->vm_file);
2169 } 2135 }
2170 2136
2171 return ret; 2137 return ret;
@@ -2324,25 +2290,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
2324 } 2290 }
2325} 2291}
2326 2292
2327static inline void unmap_mapping_range_list(struct list_head *head,
2328 struct zap_details *details)
2329{
2330 struct vm_area_struct *vma;
2331
2332 /*
2333 * In nonlinear VMAs there is no correspondence between virtual address
2334 * offset and file offset. So we must perform an exhaustive search
2335 * across *all* the pages in each nonlinear VMA, not just the pages
2336 * whose virtual address lies outside the file truncation point.
2337 */
2338 list_for_each_entry(vma, head, shared.nonlinear) {
2339 details->nonlinear_vma = vma;
2340 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2341 }
2342}
2343
2344/** 2293/**
2345 * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. 2294 * unmap_mapping_range - unmap the portion of all mmaps in the specified
2295 * address_space corresponding to the specified page range in the underlying
2296 * file.
2297 *
2346 * @mapping: the address space containing mmaps to be unmapped. 2298 * @mapping: the address space containing mmaps to be unmapped.
2347 * @holebegin: byte in first page to unmap, relative to the start of 2299 * @holebegin: byte in first page to unmap, relative to the start of
2348 * the underlying file. This will be rounded down to a PAGE_SIZE 2300 * the underlying file. This will be rounded down to a PAGE_SIZE
@@ -2371,7 +2323,6 @@ void unmap_mapping_range(struct address_space *mapping,
2371 } 2323 }
2372 2324
2373 details.check_mapping = even_cows? NULL: mapping; 2325 details.check_mapping = even_cows? NULL: mapping;
2374 details.nonlinear_vma = NULL;
2375 details.first_index = hba; 2326 details.first_index = hba;
2376 details.last_index = hba + hlen - 1; 2327 details.last_index = hba + hlen - 1;
2377 if (details.last_index < details.first_index) 2328 if (details.last_index < details.first_index)
@@ -2381,8 +2332,6 @@ void unmap_mapping_range(struct address_space *mapping,
2381 i_mmap_lock_write(mapping); 2332 i_mmap_lock_write(mapping);
2382 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) 2333 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2383 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2334 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2384 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2385 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2386 i_mmap_unlock_write(mapping); 2335 i_mmap_unlock_write(mapping);
2387} 2336}
2388EXPORT_SYMBOL(unmap_mapping_range); 2337EXPORT_SYMBOL(unmap_mapping_range);
@@ -2593,7 +2542,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
2593 if (prev && prev->vm_end == address) 2542 if (prev && prev->vm_end == address)
2594 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; 2543 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
2595 2544
2596 expand_downwards(vma, address - PAGE_SIZE); 2545 return expand_downwards(vma, address - PAGE_SIZE);
2597 } 2546 }
2598 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { 2547 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
2599 struct vm_area_struct *next = vma->vm_next; 2548 struct vm_area_struct *next = vma->vm_next;
@@ -2602,7 +2551,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
2602 if (next && next->vm_start == address + PAGE_SIZE) 2551 if (next && next->vm_start == address + PAGE_SIZE)
2603 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; 2552 return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
2604 2553
2605 expand_upwards(vma, address + PAGE_SIZE); 2554 return expand_upwards(vma, address + PAGE_SIZE);
2606 } 2555 }
2607 return 0; 2556 return 0;
2608} 2557}
@@ -2625,7 +2574,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2625 2574
2626 /* Check if we need to add a guard page to the stack */ 2575 /* Check if we need to add a guard page to the stack */
2627 if (check_stack_guard_page(vma, address) < 0) 2576 if (check_stack_guard_page(vma, address) < 0)
2628 return VM_FAULT_SIGBUS; 2577 return VM_FAULT_SIGSEGV;
2629 2578
2630 /* Use the zero-page for reads */ 2579 /* Use the zero-page for reads */
2631 if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) { 2580 if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
@@ -2743,8 +2692,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
2743 entry = mk_pte(page, vma->vm_page_prot); 2692 entry = mk_pte(page, vma->vm_page_prot);
2744 if (write) 2693 if (write)
2745 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2694 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2746 else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
2747 entry = pte_mksoft_dirty(entry);
2748 if (anon) { 2695 if (anon) {
2749 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); 2696 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
2750 page_add_new_anon_rmap(page, vma, address); 2697 page_add_new_anon_rmap(page, vma, address);
@@ -2879,8 +2826,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2879 * if page by the offset is not ready to be mapped (cold cache or 2826 * if page by the offset is not ready to be mapped (cold cache or
2880 * something). 2827 * something).
2881 */ 2828 */
2882 if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && 2829 if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
2883 fault_around_bytes >> PAGE_SHIFT > 1) {
2884 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2830 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
2885 do_fault_around(vma, address, pte, pgoff, flags); 2831 do_fault_around(vma, address, pte, pgoff, flags);
2886 if (!pte_same(*pte, orig_pte)) 2832 if (!pte_same(*pte, orig_pte))
@@ -3012,8 +2958,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3012 balance_dirty_pages_ratelimited(mapping); 2958 balance_dirty_pages_ratelimited(mapping);
3013 } 2959 }
3014 2960
3015 /* file_update_time outside page_lock */ 2961 if (!vma->vm_ops->page_mkwrite)
3016 if (vma->vm_file && !vma->vm_ops->page_mkwrite)
3017 file_update_time(vma->vm_file); 2962 file_update_time(vma->vm_file);
3018 2963
3019 return ret; 2964 return ret;
@@ -3025,7 +2970,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3025 * The mmap_sem may have been released depending on flags and our 2970 * The mmap_sem may have been released depending on flags and our
3026 * return value. See filemap_fault() and __lock_page_or_retry(). 2971 * return value. See filemap_fault() and __lock_page_or_retry().
3027 */ 2972 */
3028static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, 2973static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3029 unsigned long address, pte_t *page_table, pmd_t *pmd, 2974 unsigned long address, pte_t *page_table, pmd_t *pmd,
3030 unsigned int flags, pte_t orig_pte) 2975 unsigned int flags, pte_t orig_pte)
3031{ 2976{
@@ -3042,46 +2987,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3042 return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2987 return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3043} 2988}
3044 2989
3045/*
3046 * Fault of a previously existing named mapping. Repopulate the pte
3047 * from the encoded file_pte if possible. This enables swappable
3048 * nonlinear vmas.
3049 *
3050 * We enter with non-exclusive mmap_sem (to exclude vma changes,
3051 * but allow concurrent faults), and pte mapped but not yet locked.
3052 * We return with pte unmapped and unlocked.
3053 * The mmap_sem may have been released depending on flags and our
3054 * return value. See filemap_fault() and __lock_page_or_retry().
3055 */
3056static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3057 unsigned long address, pte_t *page_table, pmd_t *pmd,
3058 unsigned int flags, pte_t orig_pte)
3059{
3060 pgoff_t pgoff;
3061
3062 flags |= FAULT_FLAG_NONLINEAR;
3063
3064 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
3065 return 0;
3066
3067 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
3068 /*
3069 * Page table corrupted: show pte and kill process.
3070 */
3071 print_bad_pte(vma, address, orig_pte, NULL);
3072 return VM_FAULT_SIGBUS;
3073 }
3074
3075 pgoff = pte_to_pgoff(orig_pte);
3076 if (!(flags & FAULT_FLAG_WRITE))
3077 return do_read_fault(mm, vma, address, pmd, pgoff, flags,
3078 orig_pte);
3079 if (!(vma->vm_flags & VM_SHARED))
3080 return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
3081 orig_pte);
3082 return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3083}
3084
3085static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, 2990static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
3086 unsigned long addr, int page_nid, 2991 unsigned long addr, int page_nid,
3087 int *flags) 2992 int *flags)
@@ -3108,14 +3013,17 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3108 bool migrated = false; 3013 bool migrated = false;
3109 int flags = 0; 3014 int flags = 0;
3110 3015
3016 /* A PROT_NONE fault should not end up here */
3017 BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
3018
3111 /* 3019 /*
3112 * The "pte" at this point cannot be used safely without 3020 * The "pte" at this point cannot be used safely without
3113 * validation through pte_unmap_same(). It's of NUMA type but 3021 * validation through pte_unmap_same(). It's of NUMA type but
3114 * the pfn may be screwed if the read is non atomic. 3022 * the pfn may be screwed if the read is non atomic.
3115 * 3023 *
3116 * ptep_modify_prot_start is not called as this is clearing 3024 * We can safely just do a "set_pte_at()", because the old
3117 * the _PAGE_NUMA bit and it is not really expected that there 3025 * page table entry is not accessible, so there would be no
3118 * would be concurrent hardware modifications to the PTE. 3026 * concurrent hardware modifications to the PTE.
3119 */ 3027 */
3120 ptl = pte_lockptr(mm, pmd); 3028 ptl = pte_lockptr(mm, pmd);
3121 spin_lock(ptl); 3029 spin_lock(ptl);
@@ -3124,7 +3032,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3124 goto out; 3032 goto out;
3125 } 3033 }
3126 3034
3127 pte = pte_mknonnuma(pte); 3035 /* Make it present again */
3036 pte = pte_modify(pte, vma->vm_page_prot);
3037 pte = pte_mkyoung(pte);
3128 set_pte_at(mm, addr, ptep, pte); 3038 set_pte_at(mm, addr, ptep, pte);
3129 update_mmu_cache(vma, addr, ptep); 3039 update_mmu_cache(vma, addr, ptep);
3130 3040
@@ -3133,7 +3043,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3133 pte_unmap_unlock(ptep, ptl); 3043 pte_unmap_unlock(ptep, ptl);
3134 return 0; 3044 return 0;
3135 } 3045 }
3136 BUG_ON(is_zero_pfn(page_to_pfn(page)));
3137 3046
3138 /* 3047 /*
3139 * Avoid grouping on DSO/COW pages in specific and RO pages 3048 * Avoid grouping on DSO/COW pages in specific and RO pages
@@ -3209,20 +3118,17 @@ static int handle_pte_fault(struct mm_struct *mm,
3209 if (pte_none(entry)) { 3118 if (pte_none(entry)) {
3210 if (vma->vm_ops) { 3119 if (vma->vm_ops) {
3211 if (likely(vma->vm_ops->fault)) 3120 if (likely(vma->vm_ops->fault))
3212 return do_linear_fault(mm, vma, address, 3121 return do_fault(mm, vma, address, pte,
3213 pte, pmd, flags, entry); 3122 pmd, flags, entry);
3214 } 3123 }
3215 return do_anonymous_page(mm, vma, address, 3124 return do_anonymous_page(mm, vma, address,
3216 pte, pmd, flags); 3125 pte, pmd, flags);
3217 } 3126 }
3218 if (pte_file(entry))
3219 return do_nonlinear_fault(mm, vma, address,
3220 pte, pmd, flags, entry);
3221 return do_swap_page(mm, vma, address, 3127 return do_swap_page(mm, vma, address,
3222 pte, pmd, flags, entry); 3128 pte, pmd, flags, entry);
3223 } 3129 }
3224 3130
3225 if (pte_numa(entry)) 3131 if (pte_protnone(entry))
3226 return do_numa_page(mm, vma, address, entry, pte, pmd); 3132 return do_numa_page(mm, vma, address, entry, pte, pmd);
3227 3133
3228 ptl = pte_lockptr(mm, pmd); 3134 ptl = pte_lockptr(mm, pmd);
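
With the nonlinear path removed, a not-present pte in handle_pte_fault() can only be empty (anonymous or ->fault backed) or a swap entry, and NUMA hinting is now detected with pte_protnone() instead of pte_numa(). The stand-alone C model below only restates that dispatch order with stub types and strings; it is an illustration of the control flow in the hunk above, not kernel code:

    #include <stdio.h>
    #include <stdbool.h>

    /* Toy stand-ins for the kernel's pte state; purely illustrative. */
    enum pte_state { PTE_NONE, PTE_SWAP, PTE_PROTNONE, PTE_PRESENT };

    struct toy_vma { bool has_fault_op; };  /* models vma->vm_ops && vm_ops->fault */

    /* Each "handler" just reports which path the real code would take. */
    static const char *dispatch(enum pte_state entry, const struct toy_vma *vma)
    {
        if (entry == PTE_NONE)
            return vma->has_fault_op ? "do_fault (file-backed)"
                                     : "do_anonymous_page";
        if (entry == PTE_SWAP)
            return "do_swap_page";      /* the pte_file() branch no longer exists */
        if (entry == PTE_PROTNONE)
            return "do_numa_page";      /* was pte_numa() before this series */
        return "write/ptep path";
    }

    int main(void)
    {
        struct toy_vma file_vma = { .has_fault_op = true };
        struct toy_vma anon_vma = { .has_fault_op = false };

        printf("none + file vma  -> %s\n", dispatch(PTE_NONE, &file_vma));
        printf("none + anon vma  -> %s\n", dispatch(PTE_NONE, &anon_vma));
        printf("swap entry       -> %s\n", dispatch(PTE_SWAP, &anon_vma));
        printf("prot-none (numa) -> %s\n", dispatch(PTE_PROTNONE, &file_vma));
        return 0;
    }
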
@@ -3300,7 +3206,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3300 if (pmd_trans_splitting(orig_pmd)) 3206 if (pmd_trans_splitting(orig_pmd))
3301 return 0; 3207 return 0;
3302 3208
3303 if (pmd_numa(orig_pmd)) 3209 if (pmd_protnone(orig_pmd))
3304 return do_huge_pmd_numa_page(mm, vma, address, 3210 return do_huge_pmd_numa_page(mm, vma, address,
3305 orig_pmd, pmd); 3211 orig_pmd, pmd);
3306 3212
@@ -3421,15 +3327,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
3421 3327
3422 spin_lock(&mm->page_table_lock); 3328 spin_lock(&mm->page_table_lock);
3423#ifndef __ARCH_HAS_4LEVEL_HACK 3329#ifndef __ARCH_HAS_4LEVEL_HACK
3424 if (pud_present(*pud)) /* Another has populated it */ 3330 if (!pud_present(*pud)) {
3425 pmd_free(mm, new); 3331 mm_inc_nr_pmds(mm);
3426 else
3427 pud_populate(mm, pud, new); 3332 pud_populate(mm, pud, new);
3428#else 3333 } else /* Another has populated it */
3429 if (pgd_present(*pud)) /* Another has populated it */
3430 pmd_free(mm, new); 3334 pmd_free(mm, new);
3431 else 3335#else
3336 if (!pgd_present(*pud)) {
3337 mm_inc_nr_pmds(mm);
3432 pgd_populate(mm, pud, new); 3338 pgd_populate(mm, pud, new);
3339 } else /* Another has populated it */
3340 pmd_free(mm, new);
3433#endif /* __ARCH_HAS_4LEVEL_HACK */ 3341#endif /* __ARCH_HAS_4LEVEL_HACK */
3434 spin_unlock(&mm->page_table_lock); 3342 spin_unlock(&mm->page_table_lock);
3435 return 0; 3343 return 0;
@@ -3554,7 +3462,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3554 if (follow_phys(vma, addr, write, &prot, &phys_addr)) 3462 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3555 return -EINVAL; 3463 return -EINVAL;
3556 3464
3557 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); 3465 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
3558 if (write) 3466 if (write)
3559 memcpy_toio(maddr + offset, buf, len); 3467 memcpy_toio(maddr + offset, buf, len);
3560 else 3468 else
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e0961b8c39c..4721046a134a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -471,24 +471,34 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
471static void migrate_page_add(struct page *page, struct list_head *pagelist, 471static void migrate_page_add(struct page *page, struct list_head *pagelist,
472 unsigned long flags); 472 unsigned long flags);
473 473
474struct queue_pages {
475 struct list_head *pagelist;
476 unsigned long flags;
477 nodemask_t *nmask;
478 struct vm_area_struct *prev;
479};
480
474/* 481/*
475 * Scan through pages checking if pages follow certain conditions, 482 * Scan through pages checking if pages follow certain conditions,
476 * and move them to the pagelist if they do. 483 * and move them to the pagelist if they do.
477 */ 484 */
478static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 485static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
479 unsigned long addr, unsigned long end, 486 unsigned long end, struct mm_walk *walk)
480 const nodemask_t *nodes, unsigned long flags,
481 void *private)
482{ 487{
483 pte_t *orig_pte; 488 struct vm_area_struct *vma = walk->vma;
489 struct page *page;
490 struct queue_pages *qp = walk->private;
491 unsigned long flags = qp->flags;
492 int nid;
484 pte_t *pte; 493 pte_t *pte;
485 spinlock_t *ptl; 494 spinlock_t *ptl;
486 495
487 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 496 split_huge_page_pmd(vma, addr, pmd);
488 do { 497 if (pmd_trans_unstable(pmd))
489 struct page *page; 498 return 0;
490 int nid;
491 499
500 pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
501 for (; addr != end; pte++, addr += PAGE_SIZE) {
492 if (!pte_present(*pte)) 502 if (!pte_present(*pte))
493 continue; 503 continue;
494 page = vm_normal_page(vma, addr, *pte); 504 page = vm_normal_page(vma, addr, *pte);
@@ -501,114 +511,46 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
501 if (PageReserved(page)) 511 if (PageReserved(page))
502 continue; 512 continue;
503 nid = page_to_nid(page); 513 nid = page_to_nid(page);
504 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 514 if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
505 continue; 515 continue;
506 516
507 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 517 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
508 migrate_page_add(page, private, flags); 518 migrate_page_add(page, qp->pagelist, flags);
509 else 519 }
510 break; 520 pte_unmap_unlock(pte - 1, ptl);
511 } while (pte++, addr += PAGE_SIZE, addr != end); 521 cond_resched();
512 pte_unmap_unlock(orig_pte, ptl); 522 return 0;
513 return addr != end;
514} 523}
515 524
516static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, 525static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
517 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, 526 unsigned long addr, unsigned long end,
518 void *private) 527 struct mm_walk *walk)
519{ 528{
520#ifdef CONFIG_HUGETLB_PAGE 529#ifdef CONFIG_HUGETLB_PAGE
530 struct queue_pages *qp = walk->private;
531 unsigned long flags = qp->flags;
521 int nid; 532 int nid;
522 struct page *page; 533 struct page *page;
523 spinlock_t *ptl; 534 spinlock_t *ptl;
524 pte_t entry; 535 pte_t entry;
525 536
526 ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); 537 ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
527 entry = huge_ptep_get((pte_t *)pmd); 538 entry = huge_ptep_get(pte);
528 if (!pte_present(entry)) 539 if (!pte_present(entry))
529 goto unlock; 540 goto unlock;
530 page = pte_page(entry); 541 page = pte_page(entry);
531 nid = page_to_nid(page); 542 nid = page_to_nid(page);
532 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 543 if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
533 goto unlock; 544 goto unlock;
534 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ 545 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
535 if (flags & (MPOL_MF_MOVE_ALL) || 546 if (flags & (MPOL_MF_MOVE_ALL) ||
536 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) 547 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
537 isolate_huge_page(page, private); 548 isolate_huge_page(page, qp->pagelist);
538unlock: 549unlock:
539 spin_unlock(ptl); 550 spin_unlock(ptl);
540#else 551#else
541 BUG(); 552 BUG();
542#endif 553#endif
543}
544
545static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
546 unsigned long addr, unsigned long end,
547 const nodemask_t *nodes, unsigned long flags,
548 void *private)
549{
550 pmd_t *pmd;
551 unsigned long next;
552
553 pmd = pmd_offset(pud, addr);
554 do {
555 next = pmd_addr_end(addr, end);
556 if (!pmd_present(*pmd))
557 continue;
558 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
559 queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
560 flags, private);
561 continue;
562 }
563 split_huge_page_pmd(vma, addr, pmd);
564 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
565 continue;
566 if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
567 flags, private))
568 return -EIO;
569 } while (pmd++, addr = next, addr != end);
570 return 0;
571}
572
573static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
574 unsigned long addr, unsigned long end,
575 const nodemask_t *nodes, unsigned long flags,
576 void *private)
577{
578 pud_t *pud;
579 unsigned long next;
580
581 pud = pud_offset(pgd, addr);
582 do {
583 next = pud_addr_end(addr, end);
584 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
585 continue;
586 if (pud_none_or_clear_bad(pud))
587 continue;
588 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
589 flags, private))
590 return -EIO;
591 } while (pud++, addr = next, addr != end);
592 return 0;
593}
594
595static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
596 unsigned long addr, unsigned long end,
597 const nodemask_t *nodes, unsigned long flags,
598 void *private)
599{
600 pgd_t *pgd;
601 unsigned long next;
602
603 pgd = pgd_offset(vma->vm_mm, addr);
604 do {
605 next = pgd_addr_end(addr, end);
606 if (pgd_none_or_clear_bad(pgd))
607 continue;
608 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
609 flags, private))
610 return -EIO;
611 } while (pgd++, addr = next, addr != end);
612 return 0; 554 return 0;
613} 555}
614 556
@@ -627,7 +569,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
627{ 569{
628 int nr_updated; 570 int nr_updated;
629 571
630 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); 572 nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
631 if (nr_updated) 573 if (nr_updated)
632 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); 574 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
633 575
@@ -641,6 +583,49 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
641} 583}
642#endif /* CONFIG_NUMA_BALANCING */ 584#endif /* CONFIG_NUMA_BALANCING */
643 585
586static int queue_pages_test_walk(unsigned long start, unsigned long end,
587 struct mm_walk *walk)
588{
589 struct vm_area_struct *vma = walk->vma;
590 struct queue_pages *qp = walk->private;
591 unsigned long endvma = vma->vm_end;
592 unsigned long flags = qp->flags;
593
594 if (vma->vm_flags & VM_PFNMAP)
595 return 1;
596
597 if (endvma > end)
598 endvma = end;
599 if (vma->vm_start > start)
600 start = vma->vm_start;
601
602 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
603 if (!vma->vm_next && vma->vm_end < end)
604 return -EFAULT;
605 if (qp->prev && qp->prev->vm_end < vma->vm_start)
606 return -EFAULT;
607 }
608
609 qp->prev = vma;
610
611 if (vma->vm_flags & VM_PFNMAP)
612 return 1;
613
614 if (flags & MPOL_MF_LAZY) {
615 /* Similar to task_numa_work, skip inaccessible VMAs */
616 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
617 change_prot_numa(vma, start, endvma);
618 return 1;
619 }
620
621 if ((flags & MPOL_MF_STRICT) ||
622 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
623 vma_migratable(vma)))
624 /* queue pages from current vma */
625 return 0;
626 return 1;
627}
628
644/* 629/*
645 * Walk through page tables and collect pages to be migrated. 630 * Walk through page tables and collect pages to be migrated.
646 * 631 *
@@ -650,50 +635,24 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
650 */ 635 */
651static int 636static int
652queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, 637queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
653 const nodemask_t *nodes, unsigned long flags, void *private) 638 nodemask_t *nodes, unsigned long flags,
654{ 639 struct list_head *pagelist)
655 int err = 0; 640{
656 struct vm_area_struct *vma, *prev; 641 struct queue_pages qp = {
657 642 .pagelist = pagelist,
658 vma = find_vma(mm, start); 643 .flags = flags,
659 if (!vma) 644 .nmask = nodes,
660 return -EFAULT; 645 .prev = NULL,
661 prev = NULL; 646 };
662 for (; vma && vma->vm_start < end; vma = vma->vm_next) { 647 struct mm_walk queue_pages_walk = {
663 unsigned long endvma = vma->vm_end; 648 .hugetlb_entry = queue_pages_hugetlb,
664 649 .pmd_entry = queue_pages_pte_range,
665 if (endvma > end) 650 .test_walk = queue_pages_test_walk,
666 endvma = end; 651 .mm = mm,
667 if (vma->vm_start > start) 652 .private = &qp,
668 start = vma->vm_start; 653 };
669 654
670 if (!(flags & MPOL_MF_DISCONTIG_OK)) { 655 return walk_page_range(start, end, &queue_pages_walk);
671 if (!vma->vm_next && vma->vm_end < end)
672 return -EFAULT;
673 if (prev && prev->vm_end < vma->vm_start)
674 return -EFAULT;
675 }
676
677 if (flags & MPOL_MF_LAZY) {
678 /* Similar to task_numa_work, skip inaccessible VMAs */
679 if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
680 change_prot_numa(vma, start, endvma);
681 goto next;
682 }
683
684 if ((flags & MPOL_MF_STRICT) ||
685 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
686 vma_migratable(vma))) {
687
688 err = queue_pages_pgd_range(vma, start, endvma, nodes,
689 flags, private);
690 if (err)
691 break;
692 }
693next:
694 prev = vma;
695 }
696 return err;
697} 656}
698 657
699/* 658/*
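
queue_pages_range() is what mbind(2) with the MPOL_MF_MOVE flags ends up calling to collect pages for migration, so the walk_page_range() conversion above can be exercised from userspace with something like the sketch below. It uses the raw syscall to avoid a libnuma dependency; the MPOL_* values are hand-copied assumptions that should match <linux/mempolicy.h>, and the call only does real work on a kernel built with NUMA support:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Values assumed from the uapi headers; verify against <linux/mempolicy.h>. */
    #define MPOL_BIND      2
    #define MPOL_MF_STRICT (1 << 0)
    #define MPOL_MF_MOVE   (1 << 1)

    int main(void)
    {
        size_t len = 4 * 1024 * 1024;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }

        memset(p, 0xab, len);                /* fault the pages in first */

        unsigned long nodemask = 1UL << 0;   /* node 0 only */
        long ret = syscall(SYS_mbind, p, len, MPOL_BIND,
                           &nodemask, 8 * sizeof(nodemask),
                           MPOL_MF_MOVE | MPOL_MF_STRICT);
        if (ret)
            perror("mbind");                 /* e.g. ENOSYS/EINVAL without NUMA */
        else
            printf("pages queued for migration to node 0\n");

        munmap(p, len);
        return 0;
    }
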
@@ -1988,43 +1947,63 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1988 * @order:Order of the GFP allocation. 1947 * @order:Order of the GFP allocation.
1989 * @vma: Pointer to VMA or NULL if not available. 1948 * @vma: Pointer to VMA or NULL if not available.
1990 * @addr: Virtual Address of the allocation. Must be inside the VMA. 1949 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1950 * @node: Which node to prefer for allocation (modulo policy).
1951 * @hugepage: for hugepages try only the preferred node if possible
1991 * 1952 *
1992 * This function allocates a page from the kernel page pool and applies 1953 * This function allocates a page from the kernel page pool and applies
1993 * a NUMA policy associated with the VMA or the current process. 1954 * a NUMA policy associated with the VMA or the current process.
1994 * When VMA is not NULL caller must hold down_read on the mmap_sem of the 1955 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
1995 * mm_struct of the VMA to prevent it from going away. Should be used for 1956 * mm_struct of the VMA to prevent it from going away. Should be used for
1996 * all allocations for pages that will be mapped into 1957 * all allocations for pages that will be mapped into user space. Returns
1997 * user space. Returns NULL when no page can be allocated. 1958 * NULL when no page can be allocated.
1998 *
1999 * Should be called with the mm_sem of the vma hold.
2000 */ 1959 */
2001struct page * 1960struct page *
2002alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 1961alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
2003 unsigned long addr, int node) 1962 unsigned long addr, int node, bool hugepage)
2004{ 1963{
2005 struct mempolicy *pol; 1964 struct mempolicy *pol;
2006 struct page *page; 1965 struct page *page;
2007 unsigned int cpuset_mems_cookie; 1966 unsigned int cpuset_mems_cookie;
1967 struct zonelist *zl;
1968 nodemask_t *nmask;
2008 1969
2009retry_cpuset: 1970retry_cpuset:
2010 pol = get_vma_policy(vma, addr); 1971 pol = get_vma_policy(vma, addr);
2011 cpuset_mems_cookie = read_mems_allowed_begin(); 1972 cpuset_mems_cookie = read_mems_allowed_begin();
2012 1973
2013 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1974 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage &&
1975 pol->mode != MPOL_INTERLEAVE)) {
1976 /*
1977 * For hugepage allocation and non-interleave policy which
1978 * allows the current node, we only try to allocate from the
1979 * current node and don't fall back to other nodes, as the
1980 * cost of remote accesses would likely offset THP benefits.
1981 *
1982 * If the policy is interleave, or does not allow the current
1983 * node in its nodemask, we allocate the standard way.
1984 */
1985 nmask = policy_nodemask(gfp, pol);
1986 if (!nmask || node_isset(node, *nmask)) {
1987 mpol_cond_put(pol);
1988 page = alloc_pages_exact_node(node, gfp, order);
1989 goto out;
1990 }
1991 }
1992
1993 if (pol->mode == MPOL_INTERLEAVE) {
2014 unsigned nid; 1994 unsigned nid;
2015 1995
2016 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 1996 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
2017 mpol_cond_put(pol); 1997 mpol_cond_put(pol);
2018 page = alloc_page_interleave(gfp, order, nid); 1998 page = alloc_page_interleave(gfp, order, nid);
2019 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 1999 goto out;
2020 goto retry_cpuset;
2021
2022 return page;
2023 } 2000 }
2024 page = __alloc_pages_nodemask(gfp, order, 2001
2025 policy_zonelist(gfp, pol, node), 2002 nmask = policy_nodemask(gfp, pol);
2026 policy_nodemask(gfp, pol)); 2003 zl = policy_zonelist(gfp, pol, node);
2027 mpol_cond_put(pol); 2004 mpol_cond_put(pol);
2005 page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2006out:
2028 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) 2007 if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2029 goto retry_cpuset; 2008 goto retry_cpuset;
2030 return page; 2009 return page;
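
The new hugepage argument makes THP faults prefer the local node unless the policy is interleave or excludes that node. From userspace the path is reached via MADV_HUGEPAGE on an anonymous mapping; a minimal way to poke at it, assuming a THP-enabled kernel and a libc that defines MADV_HUGEPAGE, is:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 8UL << 20;              /* 8 MiB, room for a few 2 MiB THPs */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }

        if (madvise(p, len, MADV_HUGEPAGE))  /* ask for THP backing */
            perror("madvise(MADV_HUGEPAGE)");

        memset(p, 0, len);                   /* fault it in; THPs come from the local node */

        /* Inspect AnonHugePages for this range in /proc/self/smaps to see
         * whether transparent huge pages were actually used. */
        printf("mapping at %p, check /proc/self/smaps\n", (void *)p);
        return 0;
    }
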
@@ -2838,8 +2817,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2838 p += snprintf(p, buffer + maxlen - p, "relative"); 2817 p += snprintf(p, buffer + maxlen - p, "relative");
2839 } 2818 }
2840 2819
2841 if (!nodes_empty(nodes)) { 2820 if (!nodes_empty(nodes))
2842 p += snprintf(p, buffer + maxlen - p, ":"); 2821 p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2843 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); 2822 nodemask_pr_args(&nodes));
2844 }
2845} 2823}
diff --git a/mm/migrate.c b/mm/migrate.c
index 344cdf692fc8..85e042686031 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -179,37 +179,6 @@ out:
179} 179}
180 180
181/* 181/*
182 * Congratulations to trinity for discovering this bug.
183 * mm/fremap.c's remap_file_pages() accepts any range within a single vma to
184 * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then
185 * replace the specified range by file ptes throughout (maybe populated after).
186 * If page migration finds a page within that range, while it's still located
187 * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem:
188 * zap_pte() clears the temporary migration entry before mmap_sem is dropped.
189 * But if the migrating page is in a part of the vma outside the range to be
190 * remapped, then it will not be cleared, and remove_migration_ptes() needs to
191 * deal with it. Fortunately, this part of the vma is of course still linear,
192 * so we just need to use linear location on the nonlinear list.
193 */
194static int remove_linear_migration_ptes_from_nonlinear(struct page *page,
195 struct address_space *mapping, void *arg)
196{
197 struct vm_area_struct *vma;
198 /* hugetlbfs does not support remap_pages, so no huge pgoff worries */
199 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
200 unsigned long addr;
201
202 list_for_each_entry(vma,
203 &mapping->i_mmap_nonlinear, shared.nonlinear) {
204
205 addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
206 if (addr >= vma->vm_start && addr < vma->vm_end)
207 remove_migration_pte(page, vma, addr, arg);
208 }
209 return SWAP_AGAIN;
210}
211
212/*
213 * Get rid of all migration entries and replace them by 182 * Get rid of all migration entries and replace them by
214 * references to the indicated page. 183 * references to the indicated page.
215 */ 184 */
@@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new)
218 struct rmap_walk_control rwc = { 187 struct rmap_walk_control rwc = {
219 .rmap_one = remove_migration_pte, 188 .rmap_one = remove_migration_pte,
220 .arg = old, 189 .arg = old,
221 .file_nonlinear = remove_linear_migration_ptes_from_nonlinear,
222 }; 190 };
223 191
224 rmap_walk(new, &rwc); 192 rmap_walk(new, &rwc);
@@ -229,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new)
229 * get to the page and wait until migration is finished. 197 * get to the page and wait until migration is finished.
230 * When we return from this function the fault will be retried. 198 * When we return from this function the fault will be retried.
231 */ 199 */
232static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, 200void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
233 spinlock_t *ptl) 201 spinlock_t *ptl)
234{ 202{
235 pte_t pte; 203 pte_t pte;
@@ -1268,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1268 goto put_and_set; 1236 goto put_and_set;
1269 1237
1270 if (PageHuge(page)) { 1238 if (PageHuge(page)) {
1271 isolate_huge_page(page, &pagelist); 1239 if (PageHead(page))
1240 isolate_huge_page(page, &pagelist);
1272 goto put_and_set; 1241 goto put_and_set;
1273 } 1242 }
1274 1243
@@ -1685,12 +1654,6 @@ bool pmd_trans_migrating(pmd_t pmd)
1685 return PageLocked(page); 1654 return PageLocked(page);
1686} 1655}
1687 1656
1688void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
1689{
1690 struct page *page = pmd_page(*pmd);
1691 wait_on_page_locked(page);
1692}
1693
1694/* 1657/*
1695 * Attempt to migrate a misplaced page to the specified destination 1658 * Attempt to migrate a misplaced page to the specified destination
1696 * node. Caller is expected to have an elevated reference count on 1659 * node. Caller is expected to have an elevated reference count on
@@ -1884,7 +1847,7 @@ out_fail:
1884out_dropref: 1847out_dropref:
1885 ptl = pmd_lock(mm, pmd); 1848 ptl = pmd_lock(mm, pmd);
1886 if (pmd_same(*pmd, entry)) { 1849 if (pmd_same(*pmd, entry)) {
1887 entry = pmd_mknonnuma(entry); 1850 entry = pmd_modify(entry, vma->vm_page_prot);
1888 set_pmd_at(mm, mmun_start, pmd, entry); 1851 set_pmd_at(mm, mmun_start, pmd, entry);
1889 update_mmu_cache_pmd(vma, address, &entry); 1852 update_mmu_cache_pmd(vma, address, &entry);
1890 } 1853 }
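
The PageHead() check above only affects hugetlb tail pages passed to move_pages(2); for ordinary pages the syscall behaves as in this sketch, which asks the kernel to migrate one of the caller's own pages to node 0 and prints the per-page status (raw syscall again, with MPOL_MF_MOVE assumed from the uapi header):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #define MPOL_MF_MOVE (1 << 1)   /* assumed to match <linux/mempolicy.h> */

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }
        p[0] = 1;                            /* make sure the page exists */

        void *pages[1] = { p };
        int nodes[1]   = { 0 };              /* destination node */
        int status[1]  = { -1 };

        long ret = syscall(SYS_move_pages, 0 /* self */, 1UL,
                           pages, nodes, status, MPOL_MF_MOVE);
        if (ret < 0)
            perror("move_pages");
        else
            printf("page now on node %d (negative = errno)\n", status[0]);
        return 0;
    }
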
diff --git a/mm/mincore.c b/mm/mincore.c
index c8c528b36641..be25efde64a4 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -19,38 +19,25 @@
19#include <asm/uaccess.h> 19#include <asm/uaccess.h>
20#include <asm/pgtable.h> 20#include <asm/pgtable.h>
21 21
22static void mincore_hugetlb_page_range(struct vm_area_struct *vma, 22static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
23 unsigned long addr, unsigned long end, 23 unsigned long end, struct mm_walk *walk)
24 unsigned char *vec)
25{ 24{
26#ifdef CONFIG_HUGETLB_PAGE 25#ifdef CONFIG_HUGETLB_PAGE
27 struct hstate *h; 26 unsigned char present;
27 unsigned char *vec = walk->private;
28 28
29 h = hstate_vma(vma); 29 /*
30 while (1) { 30 * Hugepages under user process are always in RAM and never
31 unsigned char present; 31 * swapped out, but theoretically it needs to be checked.
32 pte_t *ptep; 32 */
33 /* 33 present = pte && !huge_pte_none(huge_ptep_get(pte));
34 * Huge pages are always in RAM for now, but 34 for (; addr != end; vec++, addr += PAGE_SIZE)
35 * theoretically it needs to be checked. 35 *vec = present;
36 */ 36 walk->private = vec;
37 ptep = huge_pte_offset(current->mm,
38 addr & huge_page_mask(h));
39 present = ptep && !huge_pte_none(huge_ptep_get(ptep));
40 while (1) {
41 *vec = present;
42 vec++;
43 addr += PAGE_SIZE;
44 if (addr == end)
45 return;
46 /* check hugepage border */
47 if (!(addr & ~huge_page_mask(h)))
48 break;
49 }
50 }
51#else 37#else
52 BUG(); 38 BUG();
53#endif 39#endif
40 return 0;
54} 41}
55 42
56/* 43/*
@@ -94,9 +81,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
94 return present; 81 return present;
95} 82}
96 83
97static void mincore_unmapped_range(struct vm_area_struct *vma, 84static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
98 unsigned long addr, unsigned long end, 85 struct vm_area_struct *vma, unsigned char *vec)
99 unsigned char *vec)
100{ 86{
101 unsigned long nr = (end - addr) >> PAGE_SHIFT; 87 unsigned long nr = (end - addr) >> PAGE_SHIFT;
102 int i; 88 int i;
@@ -111,30 +97,47 @@ static void mincore_unmapped_range(struct vm_area_struct *vma,
111 for (i = 0; i < nr; i++) 97 for (i = 0; i < nr; i++)
112 vec[i] = 0; 98 vec[i] = 0;
113 } 99 }
100 return nr;
101}
102
103static int mincore_unmapped_range(unsigned long addr, unsigned long end,
104 struct mm_walk *walk)
105{
106 walk->private += __mincore_unmapped_range(addr, end,
107 walk->vma, walk->private);
108 return 0;
114} 109}
115 110
116static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 111static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
117 unsigned long addr, unsigned long end, 112 struct mm_walk *walk)
118 unsigned char *vec)
119{ 113{
120 unsigned long next;
121 spinlock_t *ptl; 114 spinlock_t *ptl;
115 struct vm_area_struct *vma = walk->vma;
122 pte_t *ptep; 116 pte_t *ptep;
117 unsigned char *vec = walk->private;
118 int nr = (end - addr) >> PAGE_SHIFT;
119
120 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
121 memset(vec, 1, nr);
122 spin_unlock(ptl);
123 goto out;
124 }
125
126 if (pmd_trans_unstable(pmd)) {
127 __mincore_unmapped_range(addr, end, vma, vec);
128 goto out;
129 }
123 130
124 ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 131 ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
125 do { 132 for (; addr != end; ptep++, addr += PAGE_SIZE) {
126 pte_t pte = *ptep; 133 pte_t pte = *ptep;
127 pgoff_t pgoff;
128 134
129 next = addr + PAGE_SIZE;
130 if (pte_none(pte)) 135 if (pte_none(pte))
131 mincore_unmapped_range(vma, addr, next, vec); 136 __mincore_unmapped_range(addr, addr + PAGE_SIZE,
137 vma, vec);
132 else if (pte_present(pte)) 138 else if (pte_present(pte))
133 *vec = 1; 139 *vec = 1;
134 else if (pte_file(pte)) { 140 else { /* pte is a swap entry */
135 pgoff = pte_to_pgoff(pte);
136 *vec = mincore_page(vma->vm_file->f_mapping, pgoff);
137 } else { /* pte is a swap entry */
138 swp_entry_t entry = pte_to_swp_entry(pte); 141 swp_entry_t entry = pte_to_swp_entry(pte);
139 142
140 if (non_swap_entry(entry)) { 143 if (non_swap_entry(entry)) {
@@ -145,9 +148,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
145 *vec = 1; 148 *vec = 1;
146 } else { 149 } else {
147#ifdef CONFIG_SWAP 150#ifdef CONFIG_SWAP
148 pgoff = entry.val;
149 *vec = mincore_page(swap_address_space(entry), 151 *vec = mincore_page(swap_address_space(entry),
150 pgoff); 152 entry.val);
151#else 153#else
152 WARN_ON(1); 154 WARN_ON(1);
153 *vec = 1; 155 *vec = 1;
@@ -155,69 +157,12 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
155 } 157 }
156 } 158 }
157 vec++; 159 vec++;
158 } while (ptep++, addr = next, addr != end); 160 }
159 pte_unmap_unlock(ptep - 1, ptl); 161 pte_unmap_unlock(ptep - 1, ptl);
160} 162out:
161 163 walk->private += nr;
162static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, 164 cond_resched();
163 unsigned long addr, unsigned long end, 165 return 0;
164 unsigned char *vec)
165{
166 unsigned long next;
167 pmd_t *pmd;
168
169 pmd = pmd_offset(pud, addr);
170 do {
171 next = pmd_addr_end(addr, end);
172 if (pmd_trans_huge(*pmd)) {
173 if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
174 vec += (next - addr) >> PAGE_SHIFT;
175 continue;
176 }
177 /* fall through */
178 }
179 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
180 mincore_unmapped_range(vma, addr, next, vec);
181 else
182 mincore_pte_range(vma, pmd, addr, next, vec);
183 vec += (next - addr) >> PAGE_SHIFT;
184 } while (pmd++, addr = next, addr != end);
185}
186
187static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
188 unsigned long addr, unsigned long end,
189 unsigned char *vec)
190{
191 unsigned long next;
192 pud_t *pud;
193
194 pud = pud_offset(pgd, addr);
195 do {
196 next = pud_addr_end(addr, end);
197 if (pud_none_or_clear_bad(pud))
198 mincore_unmapped_range(vma, addr, next, vec);
199 else
200 mincore_pmd_range(vma, pud, addr, next, vec);
201 vec += (next - addr) >> PAGE_SHIFT;
202 } while (pud++, addr = next, addr != end);
203}
204
205static void mincore_page_range(struct vm_area_struct *vma,
206 unsigned long addr, unsigned long end,
207 unsigned char *vec)
208{
209 unsigned long next;
210 pgd_t *pgd;
211
212 pgd = pgd_offset(vma->vm_mm, addr);
213 do {
214 next = pgd_addr_end(addr, end);
215 if (pgd_none_or_clear_bad(pgd))
216 mincore_unmapped_range(vma, addr, next, vec);
217 else
218 mincore_pud_range(vma, pgd, addr, next, vec);
219 vec += (next - addr) >> PAGE_SHIFT;
220 } while (pgd++, addr = next, addr != end);
221} 166}
222 167
223/* 168/*
@@ -229,18 +174,22 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
229{ 174{
230 struct vm_area_struct *vma; 175 struct vm_area_struct *vma;
231 unsigned long end; 176 unsigned long end;
177 int err;
178 struct mm_walk mincore_walk = {
179 .pmd_entry = mincore_pte_range,
180 .pte_hole = mincore_unmapped_range,
181 .hugetlb_entry = mincore_hugetlb,
182 .private = vec,
183 };
232 184
233 vma = find_vma(current->mm, addr); 185 vma = find_vma(current->mm, addr);
234 if (!vma || addr < vma->vm_start) 186 if (!vma || addr < vma->vm_start)
235 return -ENOMEM; 187 return -ENOMEM;
236 188 mincore_walk.mm = vma->vm_mm;
237 end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); 189 end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
238 190 err = walk_page_range(addr, end, &mincore_walk);
239 if (is_vm_hugetlb_page(vma)) 191 if (err < 0)
240 mincore_hugetlb_page_range(vma, addr, end, vec); 192 return err;
241 else
242 mincore_page_range(vma, addr, end, vec);
243
244 return (end - addr) >> PAGE_SHIFT; 193 return (end - addr) >> PAGE_SHIFT;
245} 194}
246 195
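
After this rewrite do_mincore() drives everything through a single walk_page_range() call, with mincore_pte_range(), mincore_unmapped_range() and mincore_hugetlb() filling the vector. The user-visible behaviour of mincore(2) is meant to be unchanged and can be checked with a small program like this one, which faults in half of a mapping and prints which pages are resident:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        size_t npages = 16, len = npages * page;

        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }

        memset(p, 1, len / 2);               /* touch only the first half */

        unsigned char *vec = malloc(npages);
        if (mincore(p, len, vec)) { perror("mincore"); return 1; }

        for (size_t i = 0; i < npages; i++)
            printf("page %2zu: %s\n", i, (vec[i] & 1) ? "resident" : "not resident");

        free(vec);
        munmap(p, len);
        return 0;
    }
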
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4074caf9936b..5f420f7fafa1 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -14,14 +14,14 @@
14#include "internal.h" 14#include "internal.h"
15 15
16#ifdef CONFIG_DEBUG_MEMORY_INIT 16#ifdef CONFIG_DEBUG_MEMORY_INIT
17int mminit_loglevel; 17int __meminitdata mminit_loglevel;
18 18
19#ifndef SECTIONS_SHIFT 19#ifndef SECTIONS_SHIFT
20#define SECTIONS_SHIFT 0 20#define SECTIONS_SHIFT 0
21#endif 21#endif
22 22
23/* The zonelists are simply reported, validation is manual. */ 23/* The zonelists are simply reported, validation is manual. */
24void mminit_verify_zonelist(void) 24void __init mminit_verify_zonelist(void)
25{ 25{
26 int nid; 26 int nid;
27 27
diff --git a/mm/mmap.c b/mm/mmap.c
index 7b36aa7cc89a..da9990acc08b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -152,7 +152,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
152 */ 152 */
153int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 153int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
154{ 154{
155 unsigned long free, allowed, reserve; 155 long free, allowed, reserve;
156 156
157 VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < 157 VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
158 -(s64)vm_committed_as_batch * num_online_cpus(), 158 -(s64)vm_committed_as_batch * num_online_cpus(),
@@ -220,7 +220,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
220 */ 220 */
221 if (mm) { 221 if (mm) {
222 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); 222 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
223 allowed -= min(mm->total_vm / 32, reserve); 223 allowed -= min_t(long, mm->total_vm / 32, reserve);
224 } 224 }
225 225
226 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 226 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
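
Making free, allowed and reserve signed matters because __vm_enough_memory() subtracts several reserves from allowed; with unsigned arithmetic a small allowed can wrap to a huge value and effectively disable the overcommit limit. A plain C illustration of the wraparound:

    #include <stdio.h>

    int main(void)
    {
        unsigned long allowed_u = 100;
        long          allowed_s = 100;
        unsigned long reserve   = 4096;      /* e.g. user/admin reserve in pages */

        allowed_u -= reserve;                /* wraps to an enormous positive value */
        allowed_s -= (long)reserve;          /* goes negative, as intended */

        printf("unsigned: %lu\n", allowed_u);
        printf("signed  : %ld\n", allowed_s);
        return 0;
    }
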
@@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
243 mapping_unmap_writable(mapping); 243 mapping_unmap_writable(mapping);
244 244
245 flush_dcache_mmap_lock(mapping); 245 flush_dcache_mmap_lock(mapping);
246 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 246 vma_interval_tree_remove(vma, &mapping->i_mmap);
247 list_del_init(&vma->shared.nonlinear);
248 else
249 vma_interval_tree_remove(vma, &mapping->i_mmap);
250 flush_dcache_mmap_unlock(mapping); 247 flush_dcache_mmap_unlock(mapping);
251} 248}
252 249
@@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
649 atomic_inc(&mapping->i_mmap_writable); 646 atomic_inc(&mapping->i_mmap_writable);
650 647
651 flush_dcache_mmap_lock(mapping); 648 flush_dcache_mmap_lock(mapping);
652 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 649 vma_interval_tree_insert(vma, &mapping->i_mmap);
653 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
654 else
655 vma_interval_tree_insert(vma, &mapping->i_mmap);
656 flush_dcache_mmap_unlock(mapping); 650 flush_dcache_mmap_unlock(mapping);
657 } 651 }
658} 652}
@@ -778,23 +772,22 @@ again: remove_next = 1 + (end > next->vm_end);
778 if (exporter && exporter->anon_vma && !importer->anon_vma) { 772 if (exporter && exporter->anon_vma && !importer->anon_vma) {
779 int error; 773 int error;
780 774
775 importer->anon_vma = exporter->anon_vma;
781 error = anon_vma_clone(importer, exporter); 776 error = anon_vma_clone(importer, exporter);
782 if (error) 777 if (error) {
778 importer->anon_vma = NULL;
783 return error; 779 return error;
784 importer->anon_vma = exporter->anon_vma; 780 }
785 } 781 }
786 } 782 }
787 783
788 if (file) { 784 if (file) {
789 mapping = file->f_mapping; 785 mapping = file->f_mapping;
790 if (!(vma->vm_flags & VM_NONLINEAR)) { 786 root = &mapping->i_mmap;
791 root = &mapping->i_mmap; 787 uprobe_munmap(vma, vma->vm_start, vma->vm_end);
792 uprobe_munmap(vma, vma->vm_start, vma->vm_end);
793 788
794 if (adjust_next) 789 if (adjust_next)
795 uprobe_munmap(next, next->vm_start, 790 uprobe_munmap(next, next->vm_start, next->vm_end);
796 next->vm_end);
797 }
798 791
799 i_mmap_lock_write(mapping); 792 i_mmap_lock_write(mapping);
800 if (insert) { 793 if (insert) {
@@ -2099,14 +2092,17 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
2099{ 2092{
2100 struct mm_struct *mm = vma->vm_mm; 2093 struct mm_struct *mm = vma->vm_mm;
2101 struct rlimit *rlim = current->signal->rlim; 2094 struct rlimit *rlim = current->signal->rlim;
2102 unsigned long new_start; 2095 unsigned long new_start, actual_size;
2103 2096
2104 /* address space limit tests */ 2097 /* address space limit tests */
2105 if (!may_expand_vm(mm, grow)) 2098 if (!may_expand_vm(mm, grow))
2106 return -ENOMEM; 2099 return -ENOMEM;
2107 2100
2108 /* Stack limit test */ 2101 /* Stack limit test */
2109 if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) 2102 actual_size = size;
2103 if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
2104 actual_size -= PAGE_SIZE;
2105 if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
2110 return -ENOMEM; 2106 return -ENOMEM;
2111 2107
2112 /* mlock limit tests */ 2108 /* mlock limit tests */
@@ -2629,6 +2625,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2629 return vm_munmap(addr, len); 2625 return vm_munmap(addr, len);
2630} 2626}
2631 2627
2628
2629/*
2630 * Emulation of deprecated remap_file_pages() syscall.
2631 */
2632SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
2633 unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
2634{
2635
2636 struct mm_struct *mm = current->mm;
2637 struct vm_area_struct *vma;
2638 unsigned long populate = 0;
2639 unsigned long ret = -EINVAL;
2640 struct file *file;
2641
2642 pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
2643 "See Documentation/vm/remap_file_pages.txt.\n",
2644 current->comm, current->pid);
2645
2646 if (prot)
2647 return ret;
2648 start = start & PAGE_MASK;
2649 size = size & PAGE_MASK;
2650
2651 if (start + size <= start)
2652 return ret;
2653
2654 /* Does pgoff wrap? */
2655 if (pgoff + (size >> PAGE_SHIFT) < pgoff)
2656 return ret;
2657
2658 down_write(&mm->mmap_sem);
2659 vma = find_vma(mm, start);
2660
2661 if (!vma || !(vma->vm_flags & VM_SHARED))
2662 goto out;
2663
2664 if (start < vma->vm_start || start + size > vma->vm_end)
2665 goto out;
2666
2667 if (pgoff == linear_page_index(vma, start)) {
2668 ret = 0;
2669 goto out;
2670 }
2671
2672 prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
2673 prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
2674 prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
2675
2676 flags &= MAP_NONBLOCK;
2677 flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
2678 if (vma->vm_flags & VM_LOCKED) {
2679 flags |= MAP_LOCKED;
2680 /* drop PG_Mlocked flag for over-mapped range */
2681 munlock_vma_pages_range(vma, start, start + size);
2682 }
2683
2684 file = get_file(vma->vm_file);
2685 ret = do_mmap_pgoff(vma->vm_file, start, size,
2686 prot, flags, pgoff, &populate);
2687 fput(file);
2688out:
2689 up_write(&mm->mmap_sem);
2690 if (populate)
2691 mm_populate(ret, populate);
2692 if (!IS_ERR_VALUE(ret))
2693 ret = 0;
2694 return ret;
2695}
2696
2632static inline void verify_mm_writelocked(struct mm_struct *mm) 2697static inline void verify_mm_writelocked(struct mm_struct *mm)
2633{ 2698{
2634#ifdef CONFIG_DEBUG_VM 2699#ifdef CONFIG_DEBUG_VM
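
The emulation keeps remap_file_pages(2) working for the common case by redoing the range as an ordinary MAP_SHARED|MAP_FIXED|MAP_POPULATE mapping at the requested page offset (prot must be 0, and a deprecation warning is logged once). The deprecated call can still be exercised like this:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        int fd = open("/tmp/rfp-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
        if (fd < 0 || ftruncate(fd, 2 * page)) { perror("setup"); return 1; }

        char *p = mmap(NULL, 2 * page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }
        strcpy(p, "page0");
        strcpy(p + page, "page1");

        /* Re-point the first virtual page at file page 1; prot must be 0. */
        if (remap_file_pages(p, page, 0, 1 /* pgoff */, 0)) {
            perror("remap_file_pages");
            return 1;
        }
        printf("first page now reads: %s\n", p);   /* prints "page1" */

        munmap(p, 2 * page);
        close(fd);
        return 0;
    }
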
@@ -2786,9 +2851,6 @@ void exit_mmap(struct mm_struct *mm)
2786 vma = remove_vma(vma); 2851 vma = remove_vma(vma);
2787 } 2852 }
2788 vm_unacct_memory(nr_accounted); 2853 vm_unacct_memory(nr_accounted);
2789
2790 WARN_ON(atomic_long_read(&mm->nr_ptes) >
2791 (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2792} 2854}
2793 2855
2794/* Insert vm structure into process list sorted by address 2856/* Insert vm structure into process list sorted by address
@@ -3103,8 +3165,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
3103 * 3165 *
3104 * mmap_sem in write mode is required in order to block all operations 3166 * mmap_sem in write mode is required in order to block all operations
3105 * that could modify pagetables and free pages without need of 3167 * that could modify pagetables and free pages without need of
3106 * altering the vma layout (for example populate_range() with 3168 * altering the vma layout. It's also needed in write mode to avoid new
3107 * nonlinear vmas). It's also needed in write mode to avoid new
3108 * anon_vmas to be associated with existing vmas. 3169 * anon_vmas to be associated with existing vmas.
3109 * 3170 *
3110 * A single task can't take more than one mm_take_all_locks() in a row 3171 * A single task can't take more than one mm_take_all_locks() in a row
diff --git a/mm/mmzone.c b/mm/mmzone.c
index bf34fb8556db..7d87ebb0d632 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -54,8 +54,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
54/* Returns the next zone at or below highest_zoneidx in a zonelist */ 54/* Returns the next zone at or below highest_zoneidx in a zonelist */
55struct zoneref *next_zones_zonelist(struct zoneref *z, 55struct zoneref *next_zones_zonelist(struct zoneref *z,
56 enum zone_type highest_zoneidx, 56 enum zone_type highest_zoneidx,
57 nodemask_t *nodes, 57 nodemask_t *nodes)
58 struct zone **zone)
59{ 58{
60 /* 59 /*
61 * Find the next suitable zone to use for the allocation. 60 * Find the next suitable zone to use for the allocation.
@@ -69,7 +68,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
69 (z->zone && !zref_in_nodemask(z, nodes))) 68 (z->zone && !zref_in_nodemask(z, nodes)))
70 z++; 69 z++;
71 70
72 *zone = zonelist_zone(z);
73 return z; 71 return z;
74} 72}
75 73
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ace93454ce8e..44727811bf4c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -75,37 +75,35 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
75 oldpte = *pte; 75 oldpte = *pte;
76 if (pte_present(oldpte)) { 76 if (pte_present(oldpte)) {
77 pte_t ptent; 77 pte_t ptent;
78 bool updated = false;
79 78
80 if (!prot_numa) { 79 /*
81 ptent = ptep_modify_prot_start(mm, addr, pte); 80 * Avoid trapping faults against the zero or KSM
82 if (pte_numa(ptent)) 81 * pages. See similar comment in change_huge_pmd.
83 ptent = pte_mknonnuma(ptent); 82 */
84 ptent = pte_modify(ptent, newprot); 83 if (prot_numa) {
85 /*
86 * Avoid taking write faults for pages we
87 * know to be dirty.
88 */
89 if (dirty_accountable && pte_dirty(ptent) &&
90 (pte_soft_dirty(ptent) ||
91 !(vma->vm_flags & VM_SOFTDIRTY)))
92 ptent = pte_mkwrite(ptent);
93 ptep_modify_prot_commit(mm, addr, pte, ptent);
94 updated = true;
95 } else {
96 struct page *page; 84 struct page *page;
97 85
98 page = vm_normal_page(vma, addr, oldpte); 86 page = vm_normal_page(vma, addr, oldpte);
99 if (page && !PageKsm(page)) { 87 if (!page || PageKsm(page))
100 if (!pte_numa(oldpte)) { 88 continue;
101 ptep_set_numa(mm, addr, pte); 89
102 updated = true; 90 /* Avoid TLB flush if possible */
103 } 91 if (pte_protnone(oldpte))
104 } 92 continue;
105 } 93 }
106 if (updated) 94
107 pages++; 95 ptent = ptep_modify_prot_start(mm, addr, pte);
108 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { 96 ptent = pte_modify(ptent, newprot);
97
98 /* Avoid taking write faults for known dirty pages */
99 if (dirty_accountable && pte_dirty(ptent) &&
100 (pte_soft_dirty(ptent) ||
101 !(vma->vm_flags & VM_SOFTDIRTY))) {
102 ptent = pte_mkwrite(ptent);
103 }
104 ptep_modify_prot_commit(mm, addr, pte, ptent);
105 pages++;
106 } else if (IS_ENABLED(CONFIG_MIGRATION)) {
109 swp_entry_t entry = pte_to_swp_entry(oldpte); 107 swp_entry_t entry = pte_to_swp_entry(oldpte);
110 108
111 if (is_write_migration_entry(entry)) { 109 if (is_write_migration_entry(entry)) {
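
The prot_numa branch reflects the larger theme of this series: NUMA hinting faults now reuse the ordinary inaccessible protection (pte_protnone) rather than a dedicated _PAGE_NUMA bit, so a hinting fault is just a protection fault that the kernel resolves transparently. A rough userspace analogy, illustrative only (and note the in-handler mprotect() is not formally async-signal-safe):

    #define _GNU_SOURCE
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static long page;

    /* On fault, make the page accessible again and let the access retry. */
    static void on_segv(int sig, siginfo_t *si, void *uc)
    {
        (void)sig; (void)uc;
        void *start = (void *)((unsigned long)si->si_addr & ~(page - 1));
        mprotect(start, page, PROT_READ | PROT_WRITE);
    }

    int main(void)
    {
        page = sysconf(_SC_PAGESIZE);

        struct sigaction sa;
        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = on_segv;
        sa.sa_flags = SA_SIGINFO;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGSEGV, &sa, NULL);

        char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }
        p[0] = 1;                             /* populate */
        mprotect(p, page, PROT_NONE);         /* like a prot-none/NUMA-hinting pte */

        p[0] = 2;                             /* faults, handler restores access */
        printf("recovered after prot-none fault, p[0] = %d\n", p[0]);
        return 0;
    }
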
diff --git a/mm/mremap.c b/mm/mremap.c
index 17fa018f5f39..57dadc025c64 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -81,8 +81,6 @@ static pte_t move_soft_dirty_pte(pte_t pte)
81 pte = pte_mksoft_dirty(pte); 81 pte = pte_mksoft_dirty(pte);
82 else if (is_swap_pte(pte)) 82 else if (is_swap_pte(pte))
83 pte = pte_swp_mksoft_dirty(pte); 83 pte = pte_swp_mksoft_dirty(pte);
84 else if (pte_file(pte))
85 pte = pte_file_mksoft_dirty(pte);
86#endif 84#endif
87 return pte; 85 return pte;
88} 86}
diff --git a/mm/msync.c b/mm/msync.c
index 992a1673d488..bb04d53ae852 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -86,10 +86,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
86 (vma->vm_flags & VM_SHARED)) { 86 (vma->vm_flags & VM_SHARED)) {
87 get_file(file); 87 get_file(file);
88 up_read(&mm->mmap_sem); 88 up_read(&mm->mmap_sem);
89 if (vma->vm_flags & VM_NONLINEAR) 89 error = vfs_fsync_range(file, fstart, fend, 1);
90 error = vfs_fsync(file, 1);
91 else
92 error = vfs_fsync_range(file, fstart, fend, 1);
93 fput(file); 90 fput(file);
94 if (error || start >= end) 91 if (error || start >= end)
95 goto out; 92 goto out;
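
With VM_NONLINEAR gone, msync() always flushes via the range-based vfs_fsync_range() instead of occasionally falling back to a whole-file fsync. Typical use of the syscall is unchanged:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);
        int fd = open("/tmp/msync-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
        if (fd < 0 || ftruncate(fd, page)) { perror("setup"); return 1; }

        char *p = mmap(NULL, page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }

        strcpy(p, "dirty data");
        if (msync(p, page, MS_SYNC))         /* flush just this range to disk */
            perror("msync");

        munmap(p, page);
        close(fd);
        return 0;
    }
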
diff --git a/mm/nommu.c b/mm/nommu.c
index b51eadf6d952..7296360fc057 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -59,6 +59,7 @@
59#endif 59#endif
60 60
61void *high_memory; 61void *high_memory;
62EXPORT_SYMBOL(high_memory);
62struct page *mem_map; 63struct page *mem_map;
63unsigned long max_mapnr; 64unsigned long max_mapnr;
64unsigned long highest_memmap_pfn; 65unsigned long highest_memmap_pfn;
@@ -213,6 +214,39 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
213} 214}
214EXPORT_SYMBOL(get_user_pages); 215EXPORT_SYMBOL(get_user_pages);
215 216
217long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
218 unsigned long start, unsigned long nr_pages,
219 int write, int force, struct page **pages,
220 int *locked)
221{
222 return get_user_pages(tsk, mm, start, nr_pages, write, force,
223 pages, NULL);
224}
225EXPORT_SYMBOL(get_user_pages_locked);
226
227long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
228 unsigned long start, unsigned long nr_pages,
229 int write, int force, struct page **pages,
230 unsigned int gup_flags)
231{
232 long ret;
233 down_read(&mm->mmap_sem);
234 ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
235 pages, NULL);
236 up_read(&mm->mmap_sem);
237 return ret;
238}
239EXPORT_SYMBOL(__get_user_pages_unlocked);
240
241long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
242 unsigned long start, unsigned long nr_pages,
243 int write, int force, struct page **pages)
244{
245 return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
246 force, pages, 0);
247}
248EXPORT_SYMBOL(get_user_pages_unlocked);
249
216/** 250/**
217 * follow_pfn - look up PFN at a user virtual address 251 * follow_pfn - look up PFN at a user virtual address
218 * @vma: memory mapping 252 * @vma: memory mapping
@@ -946,9 +980,6 @@ static int validate_mmap_request(struct file *file,
946 return -EOVERFLOW; 980 return -EOVERFLOW;
947 981
948 if (file) { 982 if (file) {
949 /* validate file mapping requests */
950 struct address_space *mapping;
951
952 /* files must support mmap */ 983 /* files must support mmap */
953 if (!file->f_op->mmap) 984 if (!file->f_op->mmap)
954 return -ENODEV; 985 return -ENODEV;
@@ -957,28 +988,22 @@ static int validate_mmap_request(struct file *file,
957 * - we support chardevs that provide their own "memory" 988 * - we support chardevs that provide their own "memory"
958 * - we support files/blockdevs that are memory backed 989 * - we support files/blockdevs that are memory backed
959 */ 990 */
960 mapping = file->f_mapping; 991 if (file->f_op->mmap_capabilities) {
961 if (!mapping) 992 capabilities = file->f_op->mmap_capabilities(file);
962 mapping = file_inode(file)->i_mapping; 993 } else {
963
964 capabilities = 0;
965 if (mapping && mapping->backing_dev_info)
966 capabilities = mapping->backing_dev_info->capabilities;
967
968 if (!capabilities) {
969 /* no explicit capabilities set, so assume some 994 /* no explicit capabilities set, so assume some
970 * defaults */ 995 * defaults */
971 switch (file_inode(file)->i_mode & S_IFMT) { 996 switch (file_inode(file)->i_mode & S_IFMT) {
972 case S_IFREG: 997 case S_IFREG:
973 case S_IFBLK: 998 case S_IFBLK:
974 capabilities = BDI_CAP_MAP_COPY; 999 capabilities = NOMMU_MAP_COPY;
975 break; 1000 break;
976 1001
977 case S_IFCHR: 1002 case S_IFCHR:
978 capabilities = 1003 capabilities =
979 BDI_CAP_MAP_DIRECT | 1004 NOMMU_MAP_DIRECT |
980 BDI_CAP_READ_MAP | 1005 NOMMU_MAP_READ |
981 BDI_CAP_WRITE_MAP; 1006 NOMMU_MAP_WRITE;
982 break; 1007 break;
983 1008
984 default: 1009 default:
@@ -989,9 +1014,9 @@ static int validate_mmap_request(struct file *file,
989 /* eliminate any capabilities that we can't support on this 1014 /* eliminate any capabilities that we can't support on this
990 * device */ 1015 * device */
991 if (!file->f_op->get_unmapped_area) 1016 if (!file->f_op->get_unmapped_area)
992 capabilities &= ~BDI_CAP_MAP_DIRECT; 1017 capabilities &= ~NOMMU_MAP_DIRECT;
993 if (!file->f_op->read) 1018 if (!file->f_op->read)
994 capabilities &= ~BDI_CAP_MAP_COPY; 1019 capabilities &= ~NOMMU_MAP_COPY;
995 1020
996 /* The file shall have been opened with read permission. */ 1021 /* The file shall have been opened with read permission. */
997 if (!(file->f_mode & FMODE_READ)) 1022 if (!(file->f_mode & FMODE_READ))
@@ -1010,29 +1035,29 @@ static int validate_mmap_request(struct file *file,
1010 if (locks_verify_locked(file)) 1035 if (locks_verify_locked(file))
1011 return -EAGAIN; 1036 return -EAGAIN;
1012 1037
1013 if (!(capabilities & BDI_CAP_MAP_DIRECT)) 1038 if (!(capabilities & NOMMU_MAP_DIRECT))
1014 return -ENODEV; 1039 return -ENODEV;
1015 1040
1016 /* we mustn't privatise shared mappings */ 1041 /* we mustn't privatise shared mappings */
1017 capabilities &= ~BDI_CAP_MAP_COPY; 1042 capabilities &= ~NOMMU_MAP_COPY;
1018 } else { 1043 } else {
1019 /* we're going to read the file into private memory we 1044 /* we're going to read the file into private memory we
1020 * allocate */ 1045 * allocate */
1021 if (!(capabilities & BDI_CAP_MAP_COPY)) 1046 if (!(capabilities & NOMMU_MAP_COPY))
1022 return -ENODEV; 1047 return -ENODEV;
1023 1048
1024 /* we don't permit a private writable mapping to be 1049 /* we don't permit a private writable mapping to be
1025 * shared with the backing device */ 1050 * shared with the backing device */
1026 if (prot & PROT_WRITE) 1051 if (prot & PROT_WRITE)
1027 capabilities &= ~BDI_CAP_MAP_DIRECT; 1052 capabilities &= ~NOMMU_MAP_DIRECT;
1028 } 1053 }
1029 1054
1030 if (capabilities & BDI_CAP_MAP_DIRECT) { 1055 if (capabilities & NOMMU_MAP_DIRECT) {
1031 if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || 1056 if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) ||
1032 ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || 1057 ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) ||
1033 ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) 1058 ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC))
1034 ) { 1059 ) {
1035 capabilities &= ~BDI_CAP_MAP_DIRECT; 1060 capabilities &= ~NOMMU_MAP_DIRECT;
1036 if (flags & MAP_SHARED) { 1061 if (flags & MAP_SHARED) {
1037 printk(KERN_WARNING 1062 printk(KERN_WARNING
1038 "MAP_SHARED not completely supported on !MMU\n"); 1063 "MAP_SHARED not completely supported on !MMU\n");
@@ -1049,21 +1074,21 @@ static int validate_mmap_request(struct file *file,
1049 } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { 1074 } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
1050 /* handle implication of PROT_EXEC by PROT_READ */ 1075 /* handle implication of PROT_EXEC by PROT_READ */
1051 if (current->personality & READ_IMPLIES_EXEC) { 1076 if (current->personality & READ_IMPLIES_EXEC) {
1052 if (capabilities & BDI_CAP_EXEC_MAP) 1077 if (capabilities & NOMMU_MAP_EXEC)
1053 prot |= PROT_EXEC; 1078 prot |= PROT_EXEC;
1054 } 1079 }
1055 } else if ((prot & PROT_READ) && 1080 } else if ((prot & PROT_READ) &&
1056 (prot & PROT_EXEC) && 1081 (prot & PROT_EXEC) &&
1057 !(capabilities & BDI_CAP_EXEC_MAP) 1082 !(capabilities & NOMMU_MAP_EXEC)
1058 ) { 1083 ) {
1059 /* backing file is not executable, try to copy */ 1084 /* backing file is not executable, try to copy */
1060 capabilities &= ~BDI_CAP_MAP_DIRECT; 1085 capabilities &= ~NOMMU_MAP_DIRECT;
1061 } 1086 }
1062 } else { 1087 } else {
1063 /* anonymous mappings are always memory backed and can be 1088 /* anonymous mappings are always memory backed and can be
1064 * privately mapped 1089 * privately mapped
1065 */ 1090 */
1066 capabilities = BDI_CAP_MAP_COPY; 1091 capabilities = NOMMU_MAP_COPY;
1067 1092
1068 /* handle PROT_EXEC implication by PROT_READ */ 1093 /* handle PROT_EXEC implication by PROT_READ */
1069 if ((prot & PROT_READ) && 1094 if ((prot & PROT_READ) &&
@@ -1095,7 +1120,7 @@ static unsigned long determine_vm_flags(struct file *file,
1095 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); 1120 vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
1096 /* vm_flags |= mm->def_flags; */ 1121 /* vm_flags |= mm->def_flags; */
1097 1122
1098 if (!(capabilities & BDI_CAP_MAP_DIRECT)) { 1123 if (!(capabilities & NOMMU_MAP_DIRECT)) {
1099 /* attempt to share read-only copies of mapped file chunks */ 1124 /* attempt to share read-only copies of mapped file chunks */
1100 vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; 1125 vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
1101 if (file && !(prot & PROT_WRITE)) 1126 if (file && !(prot & PROT_WRITE))
@@ -1104,7 +1129,7 @@ static unsigned long determine_vm_flags(struct file *file,
1104 /* overlay a shareable mapping on the backing device or inode 1129 /* overlay a shareable mapping on the backing device or inode
1105 * if possible - used for chardevs, ramfs/tmpfs/shmfs and 1130 * if possible - used for chardevs, ramfs/tmpfs/shmfs and
1106 * romfs/cramfs */ 1131 * romfs/cramfs */
1107 vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS); 1132 vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS);
1108 if (flags & MAP_SHARED) 1133 if (flags & MAP_SHARED)
1109 vm_flags |= VM_SHARED; 1134 vm_flags |= VM_SHARED;
1110 } 1135 }
@@ -1157,7 +1182,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1157 * shared mappings on devices or memory 1182 * shared mappings on devices or memory
1158 * - VM_MAYSHARE will be set if it may attempt to share 1183 * - VM_MAYSHARE will be set if it may attempt to share
1159 */ 1184 */
1160 if (capabilities & BDI_CAP_MAP_DIRECT) { 1185 if (capabilities & NOMMU_MAP_DIRECT) {
1161 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1186 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1162 if (ret == 0) { 1187 if (ret == 0) {
1163 /* shouldn't return success if we're not sharing */ 1188 /* shouldn't return success if we're not sharing */
@@ -1346,7 +1371,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1346 if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && 1371 if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
1347 !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { 1372 !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
1348 /* new mapping is not a subset of the region */ 1373 /* new mapping is not a subset of the region */
1349 if (!(capabilities & BDI_CAP_MAP_DIRECT)) 1374 if (!(capabilities & NOMMU_MAP_DIRECT))
1350 goto sharing_violation; 1375 goto sharing_violation;
1351 continue; 1376 continue;
1352 } 1377 }
@@ -1385,7 +1410,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1385 * - this is the hook for quasi-memory character devices to 1410 * - this is the hook for quasi-memory character devices to
1386 * tell us the location of a shared mapping 1411 * tell us the location of a shared mapping
1387 */ 1412 */
1388 if (capabilities & BDI_CAP_MAP_DIRECT) { 1413 if (capabilities & NOMMU_MAP_DIRECT) {
1389 addr = file->f_op->get_unmapped_area(file, addr, len, 1414 addr = file->f_op->get_unmapped_area(file, addr, len,
1390 pgoff, flags); 1415 pgoff, flags);
1391 if (IS_ERR_VALUE(addr)) { 1416 if (IS_ERR_VALUE(addr)) {
@@ -1397,10 +1422,10 @@ unsigned long do_mmap_pgoff(struct file *file,
1397 * the mapping so we'll have to attempt to copy 1422 * the mapping so we'll have to attempt to copy
1398 * it */ 1423 * it */
1399 ret = -ENODEV; 1424 ret = -ENODEV;
1400 if (!(capabilities & BDI_CAP_MAP_COPY)) 1425 if (!(capabilities & NOMMU_MAP_COPY))
1401 goto error_just_free; 1426 goto error_just_free;
1402 1427
1403 capabilities &= ~BDI_CAP_MAP_DIRECT; 1428 capabilities &= ~NOMMU_MAP_DIRECT;
1404 } else { 1429 } else {
1405 vma->vm_start = region->vm_start = addr; 1430 vma->vm_start = region->vm_start = addr;
1406 vma->vm_end = region->vm_end = addr + len; 1431 vma->vm_end = region->vm_end = addr + len;
@@ -1411,7 +1436,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1411 vma->vm_region = region; 1436 vma->vm_region = region;
1412 1437
1413 /* set up the mapping 1438 /* set up the mapping
1414 * - the region is filled in if BDI_CAP_MAP_DIRECT is still set 1439 * - the region is filled in if NOMMU_MAP_DIRECT is still set
1415 */ 1440 */
1416 if (file && vma->vm_flags & VM_SHARED) 1441 if (file && vma->vm_flags & VM_SHARED)
1417 ret = do_mmap_shared_file(vma); 1442 ret = do_mmap_shared_file(vma);
@@ -1894,7 +1919,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
1894 */ 1919 */
1895int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) 1920int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1896{ 1921{
1897 unsigned long free, allowed, reserve; 1922 long free, allowed, reserve;
1898 1923
1899 vm_acct_memory(pages); 1924 vm_acct_memory(pages);
1900 1925
@@ -1958,7 +1983,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1958 */ 1983 */
1959 if (mm) { 1984 if (mm) {
1960 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); 1985 reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
1961 allowed -= min(mm->total_vm / 32, reserve); 1986 allowed -= min_t(long, mm->total_vm / 32, reserve);
1962 } 1987 }
1963 1988
1964 if (percpu_counter_read_positive(&vm_committed_as) < allowed) 1989 if (percpu_counter_read_positive(&vm_committed_as) < allowed)
@@ -1983,14 +2008,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
1983} 2008}
1984EXPORT_SYMBOL(filemap_map_pages); 2009EXPORT_SYMBOL(filemap_map_pages);
1985 2010
1986int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
1987 unsigned long size, pgoff_t pgoff)
1988{
1989 BUG();
1990 return 0;
1991}
1992EXPORT_SYMBOL(generic_file_remap_pages);
1993
1994static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, 2011static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
1995 unsigned long addr, void *buf, int len, int write) 2012 unsigned long addr, void *buf, int len, int write)
1996{ 2013{
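
Note on the __vm_enough_memory() hunks above: free/allowed/reserve become signed long and min() becomes min_t(long, ...) because the reserve subtractions can legitimately go negative; with unsigned arithmetic they would wrap to a huge value and defeat the final comparison against the committed counter. A minimal user-space sketch of that failure mode (illustrative values only, assuming 64-bit unsigned long):

#include <stdio.h>

int main(void)
{
	/* budget smaller than the reserve being subtracted */
	unsigned long u_allowed = 100;
	long s_allowed = 100;
	unsigned long reserve = 1000;

	u_allowed -= reserve;		/* wraps to a huge value: "always enough" */
	s_allowed -= (long)reserve;	/* stays negative, comparisons still work */

	printf("unsigned: %lu\n", u_allowed);
	printf("signed:   %ld\n", s_allowed);
	return 0;
}
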
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d503e9ce1c7b..642f38cb175a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
169 * The baseline for the badness score is the proportion of RAM that each 169 * The baseline for the badness score is the proportion of RAM that each
170 * task's rss, pagetable and swap space use. 170 * task's rss, pagetable and swap space use.
171 */ 171 */
172 points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + 172 points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
173 get_mm_counter(p->mm, MM_SWAPENTS); 173 atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
174 task_unlock(p); 174 task_unlock(p);
175 175
176 /* 176 /*
@@ -266,8 +266,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
266 * Don't allow any other task to have access to the reserves. 266 * Don't allow any other task to have access to the reserves.
267 */ 267 */
268 if (test_tsk_thread_flag(task, TIF_MEMDIE)) { 268 if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
269 if (unlikely(frozen(task)))
270 __thaw_task(task);
271 if (!force_kill) 269 if (!force_kill)
272 return OOM_SCAN_ABORT; 270 return OOM_SCAN_ABORT;
273 } 271 }
@@ -353,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
353 struct task_struct *p; 351 struct task_struct *p;
354 struct task_struct *task; 352 struct task_struct *task;
355 353
356 pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); 354 pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n");
357 rcu_read_lock(); 355 rcu_read_lock();
358 for_each_process(p) { 356 for_each_process(p) {
359 if (oom_unkillable_task(p, memcg, nodemask)) 357 if (oom_unkillable_task(p, memcg, nodemask))
@@ -369,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
369 continue; 367 continue;
370 } 368 }
371 369
372 pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", 370 pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
373 task->pid, from_kuid(&init_user_ns, task_uid(task)), 371 task->pid, from_kuid(&init_user_ns, task_uid(task)),
374 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 372 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
375 atomic_long_read(&task->mm->nr_ptes), 373 atomic_long_read(&task->mm->nr_ptes),
374 mm_nr_pmds(task->mm),
376 get_mm_counter(task->mm, MM_SWAPENTS), 375 get_mm_counter(task->mm, MM_SWAPENTS),
377 task->signal->oom_score_adj, task->comm); 376 task->signal->oom_score_adj, task->comm);
378 task_unlock(task); 377 task_unlock(task);
@@ -400,20 +399,98 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
400} 399}
401 400
402/* 401/*
403 * Number of OOM killer invocations (including memcg OOM killer). 402 * Number of OOM victims in flight
404 * Primarily used by PM freezer to check for potential races with
405 * OOM killed frozen task.
406 */ 403 */
407static atomic_t oom_kills = ATOMIC_INIT(0); 404static atomic_t oom_victims = ATOMIC_INIT(0);
405static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
408 406
409int oom_kills_count(void) 407bool oom_killer_disabled __read_mostly;
408static DECLARE_RWSEM(oom_sem);
409
410/**
 411 * mark_tsk_oom_victim - marks the given task as OOM victim.
412 * @tsk: task to mark
413 *
414 * Has to be called with oom_sem taken for read and never after
415 * oom has been disabled already.
416 */
417void mark_tsk_oom_victim(struct task_struct *tsk)
410{ 418{
411 return atomic_read(&oom_kills); 419 WARN_ON(oom_killer_disabled);
420 /* OOM killer might race with memcg OOM */
421 if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
422 return;
423 /*
424 * Make sure that the task is woken up from uninterruptible sleep
 425 * if it is frozen, because otherwise the OOM killer could not free
 426 * any memory and would livelock. freezing_slow_path will tell the freezer
427 * that TIF_MEMDIE tasks should be ignored.
428 */
429 __thaw_task(tsk);
430 atomic_inc(&oom_victims);
431}
432
433/**
434 * unmark_oom_victim - unmarks the current task as OOM victim.
435 *
436 * Wakes up all waiters in oom_killer_disable()
437 */
438void unmark_oom_victim(void)
439{
440 if (!test_and_clear_thread_flag(TIF_MEMDIE))
441 return;
442
443 down_read(&oom_sem);
444 /*
 445 * There is no need to signal the last oom_victim if there
446 * is nobody who cares.
447 */
448 if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
449 wake_up_all(&oom_victims_wait);
450 up_read(&oom_sem);
451}
452
453/**
454 * oom_killer_disable - disable OOM killer
455 *
456 * Forces all page allocations to fail rather than trigger OOM killer.
457 * Will block and wait until all OOM victims are killed.
458 *
459 * The function cannot be called when there are runnable user tasks because
 460 * userspace would see unexpected allocation failures as a result. Any
 461 * new use of this function should be discussed with the MM people.
462 *
463 * Returns true if successful and false if the OOM killer cannot be
464 * disabled.
465 */
466bool oom_killer_disable(void)
467{
468 /*
469 * Make sure to not race with an ongoing OOM killer
470 * and that the current is not the victim.
471 */
472 down_write(&oom_sem);
473 if (test_thread_flag(TIF_MEMDIE)) {
474 up_write(&oom_sem);
475 return false;
476 }
477
478 oom_killer_disabled = true;
479 up_write(&oom_sem);
480
481 wait_event(oom_victims_wait, !atomic_read(&oom_victims));
482
483 return true;
412} 484}
413 485
414void note_oom_kill(void) 486/**
487 * oom_killer_enable - enable OOM killer
488 */
489void oom_killer_enable(void)
415{ 490{
416 atomic_inc(&oom_kills); 491 down_write(&oom_sem);
492 oom_killer_disabled = false;
493 up_write(&oom_sem);
417} 494}
418 495
419#define K(x) ((x) << (PAGE_SHIFT-10)) 496#define K(x) ((x) << (PAGE_SHIFT-10))
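
The oom_killer_disable()/oom_killer_enable() pair added in the hunk above is meant to bracket a window in which no user tasks are runnable (the comment cites the PM freezer as the intended caller). A hedged sketch of the calling pattern; the wrapper name and error handling are ours, only the two calls come from the patch:

static int example_enter_quiescent_state(void)
{
	/*
	 * Fails if the OOM killer cannot be disabled, e.g. because the
	 * current task is itself an OOM victim (TIF_MEMDIE set).
	 */
	if (!oom_killer_disable())
		return -EBUSY;

	/* ... allocations now fail instead of invoking the OOM killer ... */

	oom_killer_enable();
	return 0;
}
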
@@ -438,11 +515,14 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
438 * If the task is already exiting, don't alarm the sysadmin or kill 515 * If the task is already exiting, don't alarm the sysadmin or kill
439 * its children or threads, just set TIF_MEMDIE so it can die quickly 516 * its children or threads, just set TIF_MEMDIE so it can die quickly
440 */ 517 */
441 if (task_will_free_mem(p)) { 518 task_lock(p);
442 set_tsk_thread_flag(p, TIF_MEMDIE); 519 if (p->mm && task_will_free_mem(p)) {
520 mark_tsk_oom_victim(p);
521 task_unlock(p);
443 put_task_struct(p); 522 put_task_struct(p);
444 return; 523 return;
445 } 524 }
525 task_unlock(p);
446 526
447 if (__ratelimit(&oom_rs)) 527 if (__ratelimit(&oom_rs))
448 dump_header(p, gfp_mask, order, memcg, nodemask); 528 dump_header(p, gfp_mask, order, memcg, nodemask);
@@ -492,6 +572,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
492 572
493 /* mm cannot safely be dereferenced after task_unlock(victim) */ 573 /* mm cannot safely be dereferenced after task_unlock(victim) */
494 mm = victim->mm; 574 mm = victim->mm;
575 mark_tsk_oom_victim(victim);
495 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 576 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
496 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), 577 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
497 K(get_mm_counter(victim->mm, MM_ANONPAGES)), 578 K(get_mm_counter(victim->mm, MM_ANONPAGES)),
@@ -522,7 +603,6 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
522 } 603 }
523 rcu_read_unlock(); 604 rcu_read_unlock();
524 605
525 set_tsk_thread_flag(victim, TIF_MEMDIE);
526 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); 606 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
527 put_task_struct(victim); 607 put_task_struct(victim);
528} 608}
@@ -611,7 +691,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
611} 691}
612 692
613/** 693/**
614 * out_of_memory - kill the "best" process when we run out of memory 694 * __out_of_memory - kill the "best" process when we run out of memory
615 * @zonelist: zonelist pointer 695 * @zonelist: zonelist pointer
616 * @gfp_mask: memory allocation flags 696 * @gfp_mask: memory allocation flags
617 * @order: amount of memory being requested as a power of 2 697 * @order: amount of memory being requested as a power of 2
@@ -623,7 +703,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
623 * OR try to be smart about which process to kill. Note that we 703 * OR try to be smart about which process to kill. Note that we
624 * don't have to be perfect here, we just have to be good. 704 * don't have to be perfect here, we just have to be good.
625 */ 705 */
626void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 706static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
627 int order, nodemask_t *nodemask, bool force_kill) 707 int order, nodemask_t *nodemask, bool force_kill)
628{ 708{
629 const nodemask_t *mpol_mask; 709 const nodemask_t *mpol_mask;
@@ -643,9 +723,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
643 * If current has a pending SIGKILL or is exiting, then automatically 723 * If current has a pending SIGKILL or is exiting, then automatically
644 * select it. The goal is to allow it to allocate so that it may 724 * select it. The goal is to allow it to allocate so that it may
645 * quickly exit and free its memory. 725 * quickly exit and free its memory.
726 *
727 * But don't select if current has already released its mm and cleared
728 * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
646 */ 729 */
647 if (fatal_signal_pending(current) || task_will_free_mem(current)) { 730 if (current->mm &&
648 set_thread_flag(TIF_MEMDIE); 731 (fatal_signal_pending(current) || task_will_free_mem(current))) {
732 mark_tsk_oom_victim(current);
649 return; 733 return;
650 } 734 }
651 735
@@ -688,6 +772,32 @@ out:
688 schedule_timeout_killable(1); 772 schedule_timeout_killable(1);
689} 773}
690 774
775/**
776 * out_of_memory - tries to invoke OOM killer.
777 * @zonelist: zonelist pointer
778 * @gfp_mask: memory allocation flags
779 * @order: amount of memory being requested as a power of 2
780 * @nodemask: nodemask passed to page allocator
781 * @force_kill: true if a task must be killed, even if others are exiting
782 *
 783 * Invokes __out_of_memory() and returns true unless the OOM killer has been
 784 * disabled via oom_killer_disable(), in which case it returns false.
785 */
786bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
787 int order, nodemask_t *nodemask, bool force_kill)
788{
789 bool ret = false;
790
791 down_read(&oom_sem);
792 if (!oom_killer_disabled) {
793 __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
794 ret = true;
795 }
796 up_read(&oom_sem);
797
798 return ret;
799}
800
691/* 801/*
692 * The pagefault handler calls here because it is out of memory, so kill a 802 * The pagefault handler calls here because it is out of memory, so kill a
693 * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a 803 * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
@@ -697,12 +807,25 @@ void pagefault_out_of_memory(void)
697{ 807{
698 struct zonelist *zonelist; 808 struct zonelist *zonelist;
699 809
810 down_read(&oom_sem);
700 if (mem_cgroup_oom_synchronize(true)) 811 if (mem_cgroup_oom_synchronize(true))
701 return; 812 goto unlock;
702 813
703 zonelist = node_zonelist(first_memory_node, GFP_KERNEL); 814 zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
704 if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { 815 if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
705 out_of_memory(NULL, 0, 0, NULL, false); 816 if (!oom_killer_disabled)
817 __out_of_memory(NULL, 0, 0, NULL, false);
818 else
819 /*
 820 * There shouldn't be any user tasks runnable while the
 821 * OOM killer is disabled, so the current task has to
 822 * be a racing OOM victim that oom_killer_disable()
 823 * is waiting for.
824 */
825 WARN_ON(test_thread_flag(TIF_MEMDIE));
826
706 oom_zonelist_unlock(zonelist, GFP_KERNEL); 827 oom_zonelist_unlock(zonelist, GFP_KERNEL);
707 } 828 }
829unlock:
830 up_read(&oom_sem);
708} 831}
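
With out_of_memory() now returning bool, callers can tell "a victim was selected" apart from "the killer is disabled". The page allocator hunk further down in this diff translates that into did_some_progress; a condensed, hedged sketch of the same pattern (the wrapper name is ours):

static unsigned long example_oom_progress(struct zonelist *zonelist,
					  gfp_t gfp_mask, int order,
					  nodemask_t *nodemask)
{
	/* true: killer ran, worth retrying the allocation; false: disabled */
	return out_of_memory(zonelist, gfp_mask, order, nodemask, false) ? 1 : 0;
}
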
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d5d81f5384d1..45e187b2d971 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1351,7 +1351,7 @@ static void balance_dirty_pages(struct address_space *mapping,
1351 unsigned long task_ratelimit; 1351 unsigned long task_ratelimit;
1352 unsigned long dirty_ratelimit; 1352 unsigned long dirty_ratelimit;
1353 unsigned long pos_ratio; 1353 unsigned long pos_ratio;
1354 struct backing_dev_info *bdi = mapping->backing_dev_info; 1354 struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
1355 bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; 1355 bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
1356 unsigned long start_time = jiffies; 1356 unsigned long start_time = jiffies;
1357 1357
@@ -1541,16 +1541,6 @@ pause:
1541 bdi_start_background_writeback(bdi); 1541 bdi_start_background_writeback(bdi);
1542} 1542}
1543 1543
1544void set_page_dirty_balance(struct page *page)
1545{
1546 if (set_page_dirty(page)) {
1547 struct address_space *mapping = page_mapping(page);
1548
1549 if (mapping)
1550 balance_dirty_pages_ratelimited(mapping);
1551 }
1552}
1553
1554static DEFINE_PER_CPU(int, bdp_ratelimits); 1544static DEFINE_PER_CPU(int, bdp_ratelimits);
1555 1545
1556/* 1546/*
@@ -1584,7 +1574,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1584 */ 1574 */
1585void balance_dirty_pages_ratelimited(struct address_space *mapping) 1575void balance_dirty_pages_ratelimited(struct address_space *mapping)
1586{ 1576{
1587 struct backing_dev_info *bdi = mapping->backing_dev_info; 1577 struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
1588 int ratelimit; 1578 int ratelimit;
1589 int *p; 1579 int *p;
1590 1580
@@ -1939,7 +1929,7 @@ continue_unlock:
1939 if (!clear_page_dirty_for_io(page)) 1929 if (!clear_page_dirty_for_io(page))
1940 goto continue_unlock; 1930 goto continue_unlock;
1941 1931
1942 trace_wbc_writepage(wbc, mapping->backing_dev_info); 1932 trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
1943 ret = (*writepage)(page, wbc, data); 1933 ret = (*writepage)(page, wbc, data);
1944 if (unlikely(ret)) { 1934 if (unlikely(ret)) {
1945 if (ret == AOP_WRITEPAGE_ACTIVATE) { 1935 if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@ -2104,10 +2094,12 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
2104 trace_writeback_dirty_page(page, mapping); 2094 trace_writeback_dirty_page(page, mapping);
2105 2095
2106 if (mapping_cap_account_dirty(mapping)) { 2096 if (mapping_cap_account_dirty(mapping)) {
2097 struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
2098
2107 __inc_zone_page_state(page, NR_FILE_DIRTY); 2099 __inc_zone_page_state(page, NR_FILE_DIRTY);
2108 __inc_zone_page_state(page, NR_DIRTIED); 2100 __inc_zone_page_state(page, NR_DIRTIED);
2109 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 2101 __inc_bdi_stat(bdi, BDI_RECLAIMABLE);
2110 __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); 2102 __inc_bdi_stat(bdi, BDI_DIRTIED);
2111 task_io_account_write(PAGE_CACHE_SIZE); 2103 task_io_account_write(PAGE_CACHE_SIZE);
2112 current->nr_dirtied++; 2104 current->nr_dirtied++;
2113 this_cpu_inc(bdp_ratelimits); 2105 this_cpu_inc(bdp_ratelimits);
@@ -2123,32 +2115,25 @@ EXPORT_SYMBOL(account_page_dirtied);
2123 * page dirty in that case, but not all the buffers. This is a "bottom-up" 2115 * page dirty in that case, but not all the buffers. This is a "bottom-up"
2124 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. 2116 * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
2125 * 2117 *
2126 * Most callers have locked the page, which pins the address_space in memory. 2118 * The caller must ensure this doesn't race with truncation. Most will simply
2127 * But zap_pte_range() does not lock the page, however in that case the 2119 * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
2128 * mapping is pinned by the vma's ->vm_file reference. 2120 * the pte lock held, which also locks out truncation.
2129 *
2130 * We take care to handle the case where the page was truncated from the
2131 * mapping by re-checking page_mapping() inside tree_lock.
2132 */ 2121 */
2133int __set_page_dirty_nobuffers(struct page *page) 2122int __set_page_dirty_nobuffers(struct page *page)
2134{ 2123{
2135 if (!TestSetPageDirty(page)) { 2124 if (!TestSetPageDirty(page)) {
2136 struct address_space *mapping = page_mapping(page); 2125 struct address_space *mapping = page_mapping(page);
2137 struct address_space *mapping2;
2138 unsigned long flags; 2126 unsigned long flags;
2139 2127
2140 if (!mapping) 2128 if (!mapping)
2141 return 1; 2129 return 1;
2142 2130
2143 spin_lock_irqsave(&mapping->tree_lock, flags); 2131 spin_lock_irqsave(&mapping->tree_lock, flags);
2144 mapping2 = page_mapping(page); 2132 BUG_ON(page_mapping(page) != mapping);
2145 if (mapping2) { /* Race with truncate? */ 2133 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
2146 BUG_ON(mapping2 != mapping); 2134 account_page_dirtied(page, mapping);
2147 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 2135 radix_tree_tag_set(&mapping->page_tree, page_index(page),
2148 account_page_dirtied(page, mapping); 2136 PAGECACHE_TAG_DIRTY);
2149 radix_tree_tag_set(&mapping->page_tree,
2150 page_index(page), PAGECACHE_TAG_DIRTY);
2151 }
2152 spin_unlock_irqrestore(&mapping->tree_lock, flags); 2137 spin_unlock_irqrestore(&mapping->tree_lock, flags);
2153 if (mapping->host) { 2138 if (mapping->host) {
2154 /* !PageAnon && !swapper_space */ 2139 /* !PageAnon && !swapper_space */
@@ -2173,7 +2158,7 @@ void account_page_redirty(struct page *page)
2173 if (mapping && mapping_cap_account_dirty(mapping)) { 2158 if (mapping && mapping_cap_account_dirty(mapping)) {
2174 current->nr_dirtied--; 2159 current->nr_dirtied--;
2175 dec_zone_page_state(page, NR_DIRTIED); 2160 dec_zone_page_state(page, NR_DIRTIED);
2176 dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); 2161 dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED);
2177 } 2162 }
2178} 2163}
2179EXPORT_SYMBOL(account_page_redirty); 2164EXPORT_SYMBOL(account_page_redirty);
@@ -2185,9 +2170,12 @@ EXPORT_SYMBOL(account_page_redirty);
2185 */ 2170 */
2186int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) 2171int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
2187{ 2172{
2173 int ret;
2174
2188 wbc->pages_skipped++; 2175 wbc->pages_skipped++;
2176 ret = __set_page_dirty_nobuffers(page);
2189 account_page_redirty(page); 2177 account_page_redirty(page);
2190 return __set_page_dirty_nobuffers(page); 2178 return ret;
2191} 2179}
2192EXPORT_SYMBOL(redirty_page_for_writepage); 2180EXPORT_SYMBOL(redirty_page_for_writepage);
2193 2181
@@ -2305,16 +2293,14 @@ int clear_page_dirty_for_io(struct page *page)
2305 /* 2293 /*
2306 * We carefully synchronise fault handlers against 2294 * We carefully synchronise fault handlers against
2307 * installing a dirty pte and marking the page dirty 2295 * installing a dirty pte and marking the page dirty
2308 * at this point. We do this by having them hold the 2296 * at this point. We do this by having them hold the
2309 * page lock at some point after installing their 2297 * page lock while dirtying the page, and pages are
2310 * pte, but before marking the page dirty. 2298 * always locked coming in here, so we get the desired
2311 * Pages are always locked coming in here, so we get 2299 * exclusion.
2312 * the desired exclusion. See mm/memory.c:do_wp_page()
2313 * for more comments.
2314 */ 2300 */
2315 if (TestClearPageDirty(page)) { 2301 if (TestClearPageDirty(page)) {
2316 dec_zone_page_state(page, NR_FILE_DIRTY); 2302 dec_zone_page_state(page, NR_FILE_DIRTY);
2317 dec_bdi_stat(mapping->backing_dev_info, 2303 dec_bdi_stat(inode_to_bdi(mapping->host),
2318 BDI_RECLAIMABLE); 2304 BDI_RECLAIMABLE);
2319 return 1; 2305 return 1;
2320 } 2306 }
@@ -2327,14 +2313,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
2327int test_clear_page_writeback(struct page *page) 2313int test_clear_page_writeback(struct page *page)
2328{ 2314{
2329 struct address_space *mapping = page_mapping(page); 2315 struct address_space *mapping = page_mapping(page);
2330 unsigned long memcg_flags;
2331 struct mem_cgroup *memcg; 2316 struct mem_cgroup *memcg;
2332 bool locked;
2333 int ret; 2317 int ret;
2334 2318
2335 memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); 2319 memcg = mem_cgroup_begin_page_stat(page);
2336 if (mapping) { 2320 if (mapping) {
2337 struct backing_dev_info *bdi = mapping->backing_dev_info; 2321 struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
2338 unsigned long flags; 2322 unsigned long flags;
2339 2323
2340 spin_lock_irqsave(&mapping->tree_lock, flags); 2324 spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2357,21 +2341,19 @@ int test_clear_page_writeback(struct page *page)
2357 dec_zone_page_state(page, NR_WRITEBACK); 2341 dec_zone_page_state(page, NR_WRITEBACK);
2358 inc_zone_page_state(page, NR_WRITTEN); 2342 inc_zone_page_state(page, NR_WRITTEN);
2359 } 2343 }
2360 mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); 2344 mem_cgroup_end_page_stat(memcg);
2361 return ret; 2345 return ret;
2362} 2346}
2363 2347
2364int __test_set_page_writeback(struct page *page, bool keep_write) 2348int __test_set_page_writeback(struct page *page, bool keep_write)
2365{ 2349{
2366 struct address_space *mapping = page_mapping(page); 2350 struct address_space *mapping = page_mapping(page);
2367 unsigned long memcg_flags;
2368 struct mem_cgroup *memcg; 2351 struct mem_cgroup *memcg;
2369 bool locked;
2370 int ret; 2352 int ret;
2371 2353
2372 memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); 2354 memcg = mem_cgroup_begin_page_stat(page);
2373 if (mapping) { 2355 if (mapping) {
2374 struct backing_dev_info *bdi = mapping->backing_dev_info; 2356 struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
2375 unsigned long flags; 2357 unsigned long flags;
2376 2358
2377 spin_lock_irqsave(&mapping->tree_lock, flags); 2359 spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2399,7 +2381,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
2399 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); 2381 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
2400 inc_zone_page_state(page, NR_WRITEBACK); 2382 inc_zone_page_state(page, NR_WRITEBACK);
2401 } 2383 }
2402 mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); 2384 mem_cgroup_end_page_stat(memcg);
2403 return ret; 2385 return ret;
2404 2386
2405} 2387}
@@ -2425,12 +2407,7 @@ EXPORT_SYMBOL(mapping_tagged);
2425 */ 2407 */
2426void wait_for_stable_page(struct page *page) 2408void wait_for_stable_page(struct page *page)
2427{ 2409{
2428 struct address_space *mapping = page_mapping(page); 2410 if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
2429 struct backing_dev_info *bdi = mapping->backing_dev_info; 2411 wait_on_page_writeback(page);
2430
2431 if (!bdi_cap_stable_pages_required(bdi))
2432 return;
2433
2434 wait_on_page_writeback(page);
2435} 2412}
2436EXPORT_SYMBOL_GPL(wait_for_stable_page); 2413EXPORT_SYMBOL_GPL(wait_for_stable_page);
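
Every mapping->backing_dev_info access in this file is replaced by inode_to_bdi(mapping->host), i.e. the bdi is now derived from the inode rather than read from a field cached on the address_space. A hedged one-line helper capturing the pattern (the helper itself is ours, not part of the patch):

static inline struct backing_dev_info *
example_mapping_bdi(struct address_space *mapping)
{
	return inode_to_bdi(mapping->host);
}
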
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7633c503a116..a47f0b229a1a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -25,6 +25,7 @@
25#include <linux/compiler.h> 25#include <linux/compiler.h>
26#include <linux/kernel.h> 26#include <linux/kernel.h>
27#include <linux/kmemcheck.h> 27#include <linux/kmemcheck.h>
28#include <linux/kasan.h>
28#include <linux/module.h> 29#include <linux/module.h>
29#include <linux/suspend.h> 30#include <linux/suspend.h>
30#include <linux/pagevec.h> 31#include <linux/pagevec.h>
@@ -172,7 +173,7 @@ static void __free_pages_ok(struct page *page, unsigned int order);
172 * 1G machine -> (16M dma, 784M normal, 224M high) 173 * 1G machine -> (16M dma, 784M normal, 224M high)
173 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 174 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
174 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 175 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
175 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 176 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
176 * 177 *
177 * TBD: should special case ZONE_DMA32 machines here - in those we normally 178 * TBD: should special case ZONE_DMA32 machines here - in those we normally
178 * don't need any ZONE_NORMAL reservation 179 * don't need any ZONE_NORMAL reservation
@@ -244,8 +245,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
244 PB_migrate, PB_migrate_end); 245 PB_migrate, PB_migrate_end);
245} 246}
246 247
247bool oom_killer_disabled __read_mostly;
248
249#ifdef CONFIG_DEBUG_VM 248#ifdef CONFIG_DEBUG_VM
250static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 249static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
251{ 250{
@@ -381,36 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order)
381 } 380 }
382} 381}
383 382
384/* update __split_huge_page_refcount if you change this function */
385static int destroy_compound_page(struct page *page, unsigned long order)
386{
387 int i;
388 int nr_pages = 1 << order;
389 int bad = 0;
390
391 if (unlikely(compound_order(page) != order)) {
392 bad_page(page, "wrong compound order", 0);
393 bad++;
394 }
395
396 __ClearPageHead(page);
397
398 for (i = 1; i < nr_pages; i++) {
399 struct page *p = page + i;
400
401 if (unlikely(!PageTail(p))) {
402 bad_page(page, "PageTail not set", 0);
403 bad++;
404 } else if (unlikely(p->first_page != page)) {
405 bad_page(page, "first_page not consistent", 0);
406 bad++;
407 }
408 __ClearPageTail(p);
409 }
410
411 return bad;
412}
413
414static inline void prep_zero_page(struct page *page, unsigned int order, 383static inline void prep_zero_page(struct page *page, unsigned int order,
415 gfp_t gfp_flags) 384 gfp_t gfp_flags)
416{ 385{
@@ -552,17 +521,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
552 return 0; 521 return 0;
553 522
554 if (page_is_guard(buddy) && page_order(buddy) == order) { 523 if (page_is_guard(buddy) && page_order(buddy) == order) {
555 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
556
557 if (page_zone_id(page) != page_zone_id(buddy)) 524 if (page_zone_id(page) != page_zone_id(buddy))
558 return 0; 525 return 0;
559 526
527 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
528
560 return 1; 529 return 1;
561 } 530 }
562 531
563 if (PageBuddy(buddy) && page_order(buddy) == order) { 532 if (PageBuddy(buddy) && page_order(buddy) == order) {
564 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
565
566 /* 533 /*
567 * zone check is done late to avoid uselessly 534 * zone check is done late to avoid uselessly
568 * calculating zone/node ids for pages that could 535 * calculating zone/node ids for pages that could
@@ -571,6 +538,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
571 if (page_zone_id(page) != page_zone_id(buddy)) 538 if (page_zone_id(page) != page_zone_id(buddy))
572 return 0; 539 return 0;
573 540
541 VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
542
574 return 1; 543 return 1;
575 } 544 }
576 return 0; 545 return 0;
@@ -613,10 +582,7 @@ static inline void __free_one_page(struct page *page,
613 int max_order = MAX_ORDER; 582 int max_order = MAX_ORDER;
614 583
615 VM_BUG_ON(!zone_is_initialized(zone)); 584 VM_BUG_ON(!zone_is_initialized(zone));
616 585 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
617 if (unlikely(PageCompound(page)))
618 if (unlikely(destroy_compound_page(page, order)))
619 return;
620 586
621 VM_BUG_ON(migratetype == -1); 587 VM_BUG_ON(migratetype == -1);
622 if (is_migrate_isolate(migratetype)) { 588 if (is_migrate_isolate(migratetype)) {
@@ -797,21 +763,41 @@ static void free_one_page(struct zone *zone,
797 spin_unlock(&zone->lock); 763 spin_unlock(&zone->lock);
798} 764}
799 765
766static int free_tail_pages_check(struct page *head_page, struct page *page)
767{
768 if (!IS_ENABLED(CONFIG_DEBUG_VM))
769 return 0;
770 if (unlikely(!PageTail(page))) {
771 bad_page(page, "PageTail not set", 0);
772 return 1;
773 }
774 if (unlikely(page->first_page != head_page)) {
775 bad_page(page, "first_page not consistent", 0);
776 return 1;
777 }
778 return 0;
779}
780
800static bool free_pages_prepare(struct page *page, unsigned int order) 781static bool free_pages_prepare(struct page *page, unsigned int order)
801{ 782{
802 int i; 783 bool compound = PageCompound(page);
803 int bad = 0; 784 int i, bad = 0;
804 785
805 VM_BUG_ON_PAGE(PageTail(page), page); 786 VM_BUG_ON_PAGE(PageTail(page), page);
806 VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); 787 VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
807 788
808 trace_mm_page_free(page, order); 789 trace_mm_page_free(page, order);
809 kmemcheck_free_shadow(page, order); 790 kmemcheck_free_shadow(page, order);
791 kasan_free_pages(page, order);
810 792
811 if (PageAnon(page)) 793 if (PageAnon(page))
812 page->mapping = NULL; 794 page->mapping = NULL;
813 for (i = 0; i < (1 << order); i++) 795 bad += free_pages_check(page);
796 for (i = 1; i < (1 << order); i++) {
797 if (compound)
798 bad += free_tail_pages_check(page, page + i);
814 bad += free_pages_check(page + i); 799 bad += free_pages_check(page + i);
800 }
815 if (bad) 801 if (bad)
816 return false; 802 return false;
817 803
@@ -970,7 +956,8 @@ static inline int check_new_page(struct page *page)
970 return 0; 956 return 0;
971} 957}
972 958
973static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) 959static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
960 int alloc_flags)
974{ 961{
975 int i; 962 int i;
976 963
@@ -985,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
985 972
986 arch_alloc_page(page, order); 973 arch_alloc_page(page, order);
987 kernel_map_pages(page, 1 << order, 1); 974 kernel_map_pages(page, 1 << order, 1);
975 kasan_alloc_pages(page, order);
988 976
989 if (gfp_flags & __GFP_ZERO) 977 if (gfp_flags & __GFP_ZERO)
990 prep_zero_page(page, order, gfp_flags); 978 prep_zero_page(page, order, gfp_flags);
@@ -994,6 +982,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
994 982
995 set_page_owner(page, order, gfp_flags); 983 set_page_owner(page, order, gfp_flags);
996 984
985 /*
986 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
987 * allocate the page. The expectation is that the caller is taking
988 * steps that will free more memory. The caller should avoid the page
989 * being used for !PFMEMALLOC purposes.
990 */
991 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
992
997 return 0; 993 return 0;
998} 994}
999 995
@@ -1130,39 +1126,34 @@ static void change_pageblock_range(struct page *pageblock_page,
1130} 1126}
1131 1127
1132/* 1128/*
1133 * If breaking a large block of pages, move all free pages to the preferred 1129 * When we are falling back to another migratetype during allocation, try to
1134 * allocation list. If falling back for a reclaimable kernel allocation, be 1130 * steal extra free pages from the same pageblocks to satisfy further
1135 * more aggressive about taking ownership of free pages. 1131 * allocations, instead of polluting multiple pageblocks.
1136 * 1132 *
1137 * On the other hand, never change migration type of MIGRATE_CMA pageblocks 1133 * If we are stealing a relatively large buddy page, it is likely there will
1138 * nor move CMA pages to different free lists. We don't want unmovable pages 1134 * be more free pages in the pageblock, so try to steal them all. For
1139 * to be allocated from MIGRATE_CMA areas. 1135 * reclaimable and unmovable allocations, we steal regardless of page size,
1136 * as fragmentation caused by those allocations polluting movable pageblocks
1137 * is worse than movable allocations stealing from unmovable and reclaimable
1138 * pageblocks.
1140 * 1139 *
1141 * Returns the new migratetype of the pageblock (or the same old migratetype 1140 * If we claim more than half of the pageblock, change pageblock's migratetype
1142 * if it was unchanged). 1141 * as well.
1143 */ 1142 */
1144static int try_to_steal_freepages(struct zone *zone, struct page *page, 1143static void try_to_steal_freepages(struct zone *zone, struct page *page,
1145 int start_type, int fallback_type) 1144 int start_type, int fallback_type)
1146{ 1145{
1147 int current_order = page_order(page); 1146 int current_order = page_order(page);
1148 1147
1149 /*
1150 * When borrowing from MIGRATE_CMA, we need to release the excess
1151 * buddy pages to CMA itself. We also ensure the freepage_migratetype
1152 * is set to CMA so it is returned to the correct freelist in case
1153 * the page ends up being not actually allocated from the pcp lists.
1154 */
1155 if (is_migrate_cma(fallback_type))
1156 return fallback_type;
1157
1158 /* Take ownership for orders >= pageblock_order */ 1148 /* Take ownership for orders >= pageblock_order */
1159 if (current_order >= pageblock_order) { 1149 if (current_order >= pageblock_order) {
1160 change_pageblock_range(page, current_order, start_type); 1150 change_pageblock_range(page, current_order, start_type);
1161 return start_type; 1151 return;
1162 } 1152 }
1163 1153
1164 if (current_order >= pageblock_order / 2 || 1154 if (current_order >= pageblock_order / 2 ||
1165 start_type == MIGRATE_RECLAIMABLE || 1155 start_type == MIGRATE_RECLAIMABLE ||
1156 start_type == MIGRATE_UNMOVABLE ||
1166 page_group_by_mobility_disabled) { 1157 page_group_by_mobility_disabled) {
1167 int pages; 1158 int pages;
1168 1159
@@ -1170,15 +1161,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
1170 1161
1171 /* Claim the whole block if over half of it is free */ 1162 /* Claim the whole block if over half of it is free */
1172 if (pages >= (1 << (pageblock_order-1)) || 1163 if (pages >= (1 << (pageblock_order-1)) ||
1173 page_group_by_mobility_disabled) { 1164 page_group_by_mobility_disabled)
1174
1175 set_pageblock_migratetype(page, start_type); 1165 set_pageblock_migratetype(page, start_type);
1176 return start_type;
1177 }
1178
1179 } 1166 }
1180
1181 return fallback_type;
1182} 1167}
1183 1168
1184/* Remove an element from the buddy allocator from the fallback list */ 1169/* Remove an element from the buddy allocator from the fallback list */
@@ -1188,14 +1173,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1188 struct free_area *area; 1173 struct free_area *area;
1189 unsigned int current_order; 1174 unsigned int current_order;
1190 struct page *page; 1175 struct page *page;
1191 int migratetype, new_type, i;
1192 1176
1193 /* Find the largest possible block of pages in the other list */ 1177 /* Find the largest possible block of pages in the other list */
1194 for (current_order = MAX_ORDER-1; 1178 for (current_order = MAX_ORDER-1;
1195 current_order >= order && current_order <= MAX_ORDER-1; 1179 current_order >= order && current_order <= MAX_ORDER-1;
1196 --current_order) { 1180 --current_order) {
1181 int i;
1197 for (i = 0;; i++) { 1182 for (i = 0;; i++) {
1198 migratetype = fallbacks[start_migratetype][i]; 1183 int migratetype = fallbacks[start_migratetype][i];
1184 int buddy_type = start_migratetype;
1199 1185
1200 /* MIGRATE_RESERVE handled later if necessary */ 1186 /* MIGRATE_RESERVE handled later if necessary */
1201 if (migratetype == MIGRATE_RESERVE) 1187 if (migratetype == MIGRATE_RESERVE)
@@ -1209,25 +1195,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
1209 struct page, lru); 1195 struct page, lru);
1210 area->nr_free--; 1196 area->nr_free--;
1211 1197
1212 new_type = try_to_steal_freepages(zone, page, 1198 if (!is_migrate_cma(migratetype)) {
1213 start_migratetype, 1199 try_to_steal_freepages(zone, page,
1214 migratetype); 1200 start_migratetype,
1201 migratetype);
1202 } else {
1203 /*
1204 * When borrowing from MIGRATE_CMA, we need to
1205 * release the excess buddy pages to CMA
1206 * itself, and we do not try to steal extra
1207 * free pages.
1208 */
1209 buddy_type = migratetype;
1210 }
1215 1211
1216 /* Remove the page from the freelists */ 1212 /* Remove the page from the freelists */
1217 list_del(&page->lru); 1213 list_del(&page->lru);
1218 rmv_page_order(page); 1214 rmv_page_order(page);
1219 1215
1220 expand(zone, page, order, current_order, area, 1216 expand(zone, page, order, current_order, area,
1221 new_type); 1217 buddy_type);
1222 /* The freepage_migratetype may differ from pageblock's 1218
1219 /*
1220 * The freepage_migratetype may differ from pageblock's
1223 * migratetype depending on the decisions in 1221 * migratetype depending on the decisions in
1224 * try_to_steal_freepages. This is OK as long as it does 1222 * try_to_steal_freepages(). This is OK as long as it
1225 * not differ for MIGRATE_CMA type. 1223 * does not differ for MIGRATE_CMA pageblocks. For CMA
1224 * we need to make sure unallocated pages flushed from
1225 * pcp lists are returned to the correct freelist.
1226 */ 1226 */
1227 set_freepage_migratetype(page, new_type); 1227 set_freepage_migratetype(page, buddy_type);
1228 1228
1229 trace_mm_page_alloc_extfrag(page, order, current_order, 1229 trace_mm_page_alloc_extfrag(page, order, current_order,
1230 start_migratetype, migratetype, new_type); 1230 start_migratetype, migratetype);
1231 1231
1232 return page; 1232 return page;
1233 } 1233 }
@@ -1642,9 +1642,7 @@ int split_free_page(struct page *page)
1642} 1642}
1643 1643
1644/* 1644/*
1645 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 1645 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
1646 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1647 * or two.
1648 */ 1646 */
1649static inline 1647static inline
1650struct page *buffered_rmqueue(struct zone *preferred_zone, 1648struct page *buffered_rmqueue(struct zone *preferred_zone,
@@ -1655,7 +1653,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
1655 struct page *page; 1653 struct page *page;
1656 bool cold = ((gfp_flags & __GFP_COLD) != 0); 1654 bool cold = ((gfp_flags & __GFP_COLD) != 0);
1657 1655
1658again:
1659 if (likely(order == 0)) { 1656 if (likely(order == 0)) {
1660 struct per_cpu_pages *pcp; 1657 struct per_cpu_pages *pcp;
1661 struct list_head *list; 1658 struct list_head *list;
@@ -1711,8 +1708,6 @@ again:
1711 local_irq_restore(flags); 1708 local_irq_restore(flags);
1712 1709
1713 VM_BUG_ON_PAGE(bad_range(zone, page), page); 1710 VM_BUG_ON_PAGE(bad_range(zone, page), page);
1714 if (prep_new_page(page, order, gfp_flags))
1715 goto again;
1716 return page; 1711 return page;
1717 1712
1718failed: 1713failed:
@@ -2033,10 +2028,10 @@ static void reset_alloc_batches(struct zone *preferred_zone)
2033 * a page. 2028 * a page.
2034 */ 2029 */
2035static struct page * 2030static struct page *
2036get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 2031get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
2037 struct zonelist *zonelist, int high_zoneidx, int alloc_flags, 2032 const struct alloc_context *ac)
2038 struct zone *preferred_zone, int classzone_idx, int migratetype)
2039{ 2033{
2034 struct zonelist *zonelist = ac->zonelist;
2040 struct zoneref *z; 2035 struct zoneref *z;
2041 struct page *page = NULL; 2036 struct page *page = NULL;
2042 struct zone *zone; 2037 struct zone *zone;
@@ -2055,8 +2050,8 @@ zonelist_scan:
2055 * Scan zonelist, looking for a zone with enough free. 2050 * Scan zonelist, looking for a zone with enough free.
2056 * See also __cpuset_node_allowed() comment in kernel/cpuset.c. 2051 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
2057 */ 2052 */
2058 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2053 for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2059 high_zoneidx, nodemask) { 2054 ac->nodemask) {
2060 unsigned long mark; 2055 unsigned long mark;
2061 2056
2062 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 2057 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
@@ -2073,7 +2068,7 @@ zonelist_scan:
2073 * time the page has in memory before being reclaimed. 2068 * time the page has in memory before being reclaimed.
2074 */ 2069 */
2075 if (alloc_flags & ALLOC_FAIR) { 2070 if (alloc_flags & ALLOC_FAIR) {
2076 if (!zone_local(preferred_zone, zone)) 2071 if (!zone_local(ac->preferred_zone, zone))
2077 break; 2072 break;
2078 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { 2073 if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
2079 nr_fair_skipped++; 2074 nr_fair_skipped++;
@@ -2111,7 +2106,7 @@ zonelist_scan:
2111 2106
2112 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; 2107 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
2113 if (!zone_watermark_ok(zone, order, mark, 2108 if (!zone_watermark_ok(zone, order, mark,
2114 classzone_idx, alloc_flags)) { 2109 ac->classzone_idx, alloc_flags)) {
2115 int ret; 2110 int ret;
2116 2111
2117 /* Checked here to keep the fast path fast */ 2112 /* Checked here to keep the fast path fast */
@@ -2132,7 +2127,7 @@ zonelist_scan:
2132 } 2127 }
2133 2128
2134 if (zone_reclaim_mode == 0 || 2129 if (zone_reclaim_mode == 0 ||
2135 !zone_allows_reclaim(preferred_zone, zone)) 2130 !zone_allows_reclaim(ac->preferred_zone, zone))
2136 goto this_zone_full; 2131 goto this_zone_full;
2137 2132
2138 /* 2133 /*
@@ -2154,7 +2149,7 @@ zonelist_scan:
2154 default: 2149 default:
2155 /* did we reclaim enough */ 2150 /* did we reclaim enough */
2156 if (zone_watermark_ok(zone, order, mark, 2151 if (zone_watermark_ok(zone, order, mark,
2157 classzone_idx, alloc_flags)) 2152 ac->classzone_idx, alloc_flags))
2158 goto try_this_zone; 2153 goto try_this_zone;
2159 2154
2160 /* 2155 /*
@@ -2175,27 +2170,18 @@ zonelist_scan:
2175 } 2170 }
2176 2171
2177try_this_zone: 2172try_this_zone:
2178 page = buffered_rmqueue(preferred_zone, zone, order, 2173 page = buffered_rmqueue(ac->preferred_zone, zone, order,
2179 gfp_mask, migratetype); 2174 gfp_mask, ac->migratetype);
2180 if (page) 2175 if (page) {
2181 break; 2176 if (prep_new_page(page, order, gfp_mask, alloc_flags))
2177 goto try_this_zone;
2178 return page;
2179 }
2182this_zone_full: 2180this_zone_full:
2183 if (IS_ENABLED(CONFIG_NUMA) && zlc_active) 2181 if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
2184 zlc_mark_zone_full(zonelist, z); 2182 zlc_mark_zone_full(zonelist, z);
2185 } 2183 }
2186 2184
2187 if (page) {
2188 /*
2189 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
2190 * necessary to allocate the page. The expectation is
2191 * that the caller is taking steps that will free more
2192 * memory. The caller should avoid the page being used
2193 * for !PFMEMALLOC purposes.
2194 */
2195 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
2196 return page;
2197 }
2198
2199 /* 2185 /*
2200 * The first pass makes sure allocations are spread fairly within the 2186 * The first pass makes sure allocations are spread fairly within the
2201 * local node. However, the local node might have free pages left 2187 * local node. However, the local node might have free pages left
@@ -2208,7 +2194,7 @@ this_zone_full:
2208 alloc_flags &= ~ALLOC_FAIR; 2194 alloc_flags &= ~ALLOC_FAIR;
2209 if (nr_fair_skipped) { 2195 if (nr_fair_skipped) {
2210 zonelist_rescan = true; 2196 zonelist_rescan = true;
2211 reset_alloc_batches(preferred_zone); 2197 reset_alloc_batches(ac->preferred_zone);
2212 } 2198 }
2213 if (nr_online_nodes > 1) 2199 if (nr_online_nodes > 1)
2214 zonelist_rescan = true; 2200 zonelist_rescan = true;
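
The long parameter lists of the slow-path helpers are collapsed into a single struct alloc_context pointer in the hunks above. Its definition is not shown in this section; the following is a hedged reconstruction inferred purely from the ac-> accesses and the replaced parameters (member order, and any fields we cannot see, are guesses):

struct alloc_context {
	struct zonelist *zonelist;
	nodemask_t *nodemask;
	struct zone *preferred_zone;
	int classzone_idx;
	int migratetype;
	enum zone_type high_zoneidx;
};
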
@@ -2330,44 +2316,44 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
2330 2316
2331static inline struct page * 2317static inline struct page *
2332__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, 2318__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2333 struct zonelist *zonelist, enum zone_type high_zoneidx, 2319 const struct alloc_context *ac, unsigned long *did_some_progress)
2334 nodemask_t *nodemask, struct zone *preferred_zone,
2335 int classzone_idx, int migratetype)
2336{ 2320{
2337 struct page *page; 2321 struct page *page;
2338 2322
2339 /* Acquire the per-zone oom lock for each zone */ 2323 *did_some_progress = 0;
2340 if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
2341 schedule_timeout_uninterruptible(1);
2342 return NULL;
2343 }
2344 2324
2345 /* 2325 /*
2346 * PM-freezer should be notified that there might be an OOM killer on 2326 * Acquire the per-zone oom lock for each zone. If that
2347 * its way to kill and wake somebody up. This is too early and we might 2327 * fails, somebody else is making progress for us.
2348 * end up not killing anything but false positives are acceptable.
2349 * See freeze_processes.
2350 */ 2328 */
2351 note_oom_kill(); 2329 if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
2330 *did_some_progress = 1;
2331 schedule_timeout_uninterruptible(1);
2332 return NULL;
2333 }
2352 2334
2353 /* 2335 /*
2354 * Go through the zonelist yet one more time, keep very high watermark 2336 * Go through the zonelist yet one more time, keep very high watermark
2355 * here, this is only to catch a parallel oom killing, we must fail if 2337 * here, this is only to catch a parallel oom killing, we must fail if
2356 * we're still under heavy pressure. 2338 * we're still under heavy pressure.
2357 */ 2339 */
2358 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, 2340 page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
2359 order, zonelist, high_zoneidx, 2341 ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
2360 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
2361 preferred_zone, classzone_idx, migratetype);
2362 if (page) 2342 if (page)
2363 goto out; 2343 goto out;
2364 2344
2365 if (!(gfp_mask & __GFP_NOFAIL)) { 2345 if (!(gfp_mask & __GFP_NOFAIL)) {
2346 /* Coredumps can quickly deplete all memory reserves */
2347 if (current->flags & PF_DUMPCORE)
2348 goto out;
2366 /* The OOM killer will not help higher order allocs */ 2349 /* The OOM killer will not help higher order allocs */
2367 if (order > PAGE_ALLOC_COSTLY_ORDER) 2350 if (order > PAGE_ALLOC_COSTLY_ORDER)
2368 goto out; 2351 goto out;
2369 /* The OOM killer does not needlessly kill tasks for lowmem */ 2352 /* The OOM killer does not needlessly kill tasks for lowmem */
2370 if (high_zoneidx < ZONE_NORMAL) 2353 if (ac->high_zoneidx < ZONE_NORMAL)
2354 goto out;
2355 /* The OOM killer does not compensate for light reclaim */
2356 if (!(gfp_mask & __GFP_FS))
2371 goto out; 2357 goto out;
2372 /* 2358 /*
2373 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 2359 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
@@ -2380,10 +2366,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
2380 goto out; 2366 goto out;
2381 } 2367 }
2382 /* Exhausted what can be done so it's blamo time */ 2368 /* Exhausted what can be done so it's blamo time */
2383 out_of_memory(zonelist, gfp_mask, order, nodemask, false); 2369 if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
2384 2370 *did_some_progress = 1;
2385out: 2371out:
2386 oom_zonelist_unlock(zonelist, gfp_mask); 2372 oom_zonelist_unlock(ac->zonelist, gfp_mask);
2387 return page; 2373 return page;
2388} 2374}
2389 2375
@@ -2391,10 +2377,9 @@ out:
2391/* Try memory compaction for high-order allocations before reclaim */ 2377/* Try memory compaction for high-order allocations before reclaim */
2392static struct page * 2378static struct page *
2393__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2379__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2394 struct zonelist *zonelist, enum zone_type high_zoneidx, 2380 int alloc_flags, const struct alloc_context *ac,
2395 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2381 enum migrate_mode mode, int *contended_compaction,
2396 int classzone_idx, int migratetype, enum migrate_mode mode, 2382 bool *deferred_compaction)
2397 int *contended_compaction, bool *deferred_compaction)
2398{ 2383{
2399 unsigned long compact_result; 2384 unsigned long compact_result;
2400 struct page *page; 2385 struct page *page;
@@ -2403,10 +2388,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2403 return NULL; 2388 return NULL;
2404 2389
2405 current->flags |= PF_MEMALLOC; 2390 current->flags |= PF_MEMALLOC;
2406 compact_result = try_to_compact_pages(zonelist, order, gfp_mask, 2391 compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
2407 nodemask, mode, 2392 mode, contended_compaction);
2408 contended_compaction,
2409 alloc_flags, classzone_idx);
2410 current->flags &= ~PF_MEMALLOC; 2393 current->flags &= ~PF_MEMALLOC;
2411 2394
2412 switch (compact_result) { 2395 switch (compact_result) {
@@ -2425,10 +2408,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2425 */ 2408 */
2426 count_vm_event(COMPACTSTALL); 2409 count_vm_event(COMPACTSTALL);
2427 2410
2428 page = get_page_from_freelist(gfp_mask, nodemask, 2411 page = get_page_from_freelist(gfp_mask, order,
2429 order, zonelist, high_zoneidx, 2412 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2430 alloc_flags & ~ALLOC_NO_WATERMARKS,
2431 preferred_zone, classzone_idx, migratetype);
2432 2413
2433 if (page) { 2414 if (page) {
2434 struct zone *zone = page_zone(page); 2415 struct zone *zone = page_zone(page);
@@ -2452,10 +2433,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2452#else 2433#else
2453static inline struct page * 2434static inline struct page *
2454__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 2435__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2455 struct zonelist *zonelist, enum zone_type high_zoneidx, 2436 int alloc_flags, const struct alloc_context *ac,
2456 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2437 enum migrate_mode mode, int *contended_compaction,
2457 int classzone_idx, int migratetype, enum migrate_mode mode, 2438 bool *deferred_compaction)
2458 int *contended_compaction, bool *deferred_compaction)
2459{ 2439{
2460 return NULL; 2440 return NULL;
2461} 2441}
@@ -2463,8 +2443,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2463 2443
2464/* Perform direct synchronous page reclaim */ 2444/* Perform direct synchronous page reclaim */
2465static int 2445static int
2466__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, 2446__perform_reclaim(gfp_t gfp_mask, unsigned int order,
2467 nodemask_t *nodemask) 2447 const struct alloc_context *ac)
2468{ 2448{
2469 struct reclaim_state reclaim_state; 2449 struct reclaim_state reclaim_state;
2470 int progress; 2450 int progress;
@@ -2478,7 +2458,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2478 reclaim_state.reclaimed_slab = 0; 2458 reclaim_state.reclaimed_slab = 0;
2479 current->reclaim_state = &reclaim_state; 2459 current->reclaim_state = &reclaim_state;
2480 2460
2481 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2461 progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
2462 ac->nodemask);
2482 2463
2483 current->reclaim_state = NULL; 2464 current->reclaim_state = NULL;
2484 lockdep_clear_current_reclaim_state(); 2465 lockdep_clear_current_reclaim_state();
@@ -2492,28 +2473,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2492/* The really slow allocator path where we enter direct reclaim */ 2473/* The really slow allocator path where we enter direct reclaim */
2493static inline struct page * 2474static inline struct page *
2494__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2475__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2495 struct zonelist *zonelist, enum zone_type high_zoneidx, 2476 int alloc_flags, const struct alloc_context *ac,
2496 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2477 unsigned long *did_some_progress)
2497 int classzone_idx, int migratetype, unsigned long *did_some_progress)
2498{ 2478{
2499 struct page *page = NULL; 2479 struct page *page = NULL;
2500 bool drained = false; 2480 bool drained = false;
2501 2481
2502 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, 2482 *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
2503 nodemask);
2504 if (unlikely(!(*did_some_progress))) 2483 if (unlikely(!(*did_some_progress)))
2505 return NULL; 2484 return NULL;
2506 2485
2507 /* After successful reclaim, reconsider all zones for allocation */ 2486 /* After successful reclaim, reconsider all zones for allocation */
2508 if (IS_ENABLED(CONFIG_NUMA)) 2487 if (IS_ENABLED(CONFIG_NUMA))
2509 zlc_clear_zones_full(zonelist); 2488 zlc_clear_zones_full(ac->zonelist);
2510 2489
2511retry: 2490retry:
2512 page = get_page_from_freelist(gfp_mask, nodemask, order, 2491 page = get_page_from_freelist(gfp_mask, order,
2513 zonelist, high_zoneidx, 2492 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2514 alloc_flags & ~ALLOC_NO_WATERMARKS,
2515 preferred_zone, classzone_idx,
2516 migratetype);
2517 2493
2518 /* 2494 /*
2519 * If an allocation failed after direct reclaim, it could be because 2495 * If an allocation failed after direct reclaim, it could be because
@@ -2534,36 +2510,30 @@ retry:
2534 */ 2510 */
2535static inline struct page * 2511static inline struct page *
2536__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, 2512__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2537 struct zonelist *zonelist, enum zone_type high_zoneidx, 2513 const struct alloc_context *ac)
2538 nodemask_t *nodemask, struct zone *preferred_zone,
2539 int classzone_idx, int migratetype)
2540{ 2514{
2541 struct page *page; 2515 struct page *page;
2542 2516
2543 do { 2517 do {
2544 page = get_page_from_freelist(gfp_mask, nodemask, order, 2518 page = get_page_from_freelist(gfp_mask, order,
2545 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, 2519 ALLOC_NO_WATERMARKS, ac);
2546 preferred_zone, classzone_idx, migratetype);
2547 2520
2548 if (!page && gfp_mask & __GFP_NOFAIL) 2521 if (!page && gfp_mask & __GFP_NOFAIL)
2549 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2522 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
2523 HZ/50);
2550 } while (!page && (gfp_mask & __GFP_NOFAIL)); 2524 } while (!page && (gfp_mask & __GFP_NOFAIL));
2551 2525
2552 return page; 2526 return page;
2553} 2527}
2554 2528
2555static void wake_all_kswapds(unsigned int order, 2529static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
2556 struct zonelist *zonelist,
2557 enum zone_type high_zoneidx,
2558 struct zone *preferred_zone,
2559 nodemask_t *nodemask)
2560{ 2530{
2561 struct zoneref *z; 2531 struct zoneref *z;
2562 struct zone *zone; 2532 struct zone *zone;
2563 2533
2564 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2534 for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
2565 high_zoneidx, nodemask) 2535 ac->high_zoneidx, ac->nodemask)
2566 wakeup_kswapd(zone, order, zone_idx(preferred_zone)); 2536 wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
2567} 2537}
2568 2538
2569static inline int 2539static inline int
@@ -2622,9 +2592,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2622 2592
2623static inline struct page * 2593static inline struct page *
2624__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2594__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2625 struct zonelist *zonelist, enum zone_type high_zoneidx, 2595 struct alloc_context *ac)
2626 nodemask_t *nodemask, struct zone *preferred_zone,
2627 int classzone_idx, int migratetype)
2628{ 2596{
2629 const gfp_t wait = gfp_mask & __GFP_WAIT; 2597 const gfp_t wait = gfp_mask & __GFP_WAIT;
2630 struct page *page = NULL; 2598 struct page *page = NULL;
@@ -2658,10 +2626,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2658 (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2626 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2659 goto nopage; 2627 goto nopage;
2660 2628
2661restart: 2629retry:
2662 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2630 if (!(gfp_mask & __GFP_NO_KSWAPD))
2663 wake_all_kswapds(order, zonelist, high_zoneidx, 2631 wake_all_kswapds(order, ac);
2664 preferred_zone, nodemask);
2665 2632
2666 /* 2633 /*
2667 * OK, we're below the kswapd watermark and have kicked background 2634 * OK, we're below the kswapd watermark and have kicked background
@@ -2674,18 +2641,16 @@ restart:
2674 * Find the true preferred zone if the allocation is unconstrained by 2641 * Find the true preferred zone if the allocation is unconstrained by
2675 * cpusets. 2642 * cpusets.
2676 */ 2643 */
2677 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { 2644 if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
2678 struct zoneref *preferred_zoneref; 2645 struct zoneref *preferred_zoneref;
2679 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2646 preferred_zoneref = first_zones_zonelist(ac->zonelist,
2680 NULL, &preferred_zone); 2647 ac->high_zoneidx, NULL, &ac->preferred_zone);
2681 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2648 ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
2682 } 2649 }
2683 2650
2684rebalance:
2685 /* This is the last chance, in general, before the goto nopage. */ 2651 /* This is the last chance, in general, before the goto nopage. */
2686 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2652 page = get_page_from_freelist(gfp_mask, order,
2687 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2653 alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
2688 preferred_zone, classzone_idx, migratetype);
2689 if (page) 2654 if (page)
2690 goto got_pg; 2655 goto got_pg;
2691 2656
@@ -2696,11 +2661,10 @@ rebalance:
2696 * the allocation is high priority and these type of 2661 * the allocation is high priority and these type of
2697 * allocations are system rather than user orientated 2662 * allocations are system rather than user orientated
2698 */ 2663 */
2699 zonelist = node_zonelist(numa_node_id(), gfp_mask); 2664 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
2665
2666 page = __alloc_pages_high_priority(gfp_mask, order, ac);
2700 2667
2701 page = __alloc_pages_high_priority(gfp_mask, order,
2702 zonelist, high_zoneidx, nodemask,
2703 preferred_zone, classzone_idx, migratetype);
2704 if (page) { 2668 if (page) {
2705 goto got_pg; 2669 goto got_pg;
2706 } 2670 }
@@ -2729,11 +2693,9 @@ rebalance:
2729 * Try direct compaction. The first pass is asynchronous. Subsequent 2693 * Try direct compaction. The first pass is asynchronous. Subsequent
2730 * attempts after direct reclaim are synchronous 2694 * attempts after direct reclaim are synchronous
2731 */ 2695 */
2732 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2696 page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
2733 high_zoneidx, nodemask, alloc_flags, 2697 migration_mode,
2734 preferred_zone, 2698 &contended_compaction,
2735 classzone_idx, migratetype,
2736 migration_mode, &contended_compaction,
2737 &deferred_compaction); 2699 &deferred_compaction);
2738 if (page) 2700 if (page)
2739 goto got_pg; 2701 goto got_pg;
@@ -2779,74 +2741,40 @@ rebalance:
2779 migration_mode = MIGRATE_SYNC_LIGHT; 2741 migration_mode = MIGRATE_SYNC_LIGHT;
2780 2742
2781 /* Try direct reclaim and then allocating */ 2743 /* Try direct reclaim and then allocating */
2782 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2744 page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
2783 zonelist, high_zoneidx, 2745 &did_some_progress);
2784 nodemask,
2785 alloc_flags, preferred_zone,
2786 classzone_idx, migratetype,
2787 &did_some_progress);
2788 if (page) 2746 if (page)
2789 goto got_pg; 2747 goto got_pg;
2790 2748
2791 /*
2792 * If we failed to make any progress reclaiming, then we are
2793 * running out of options and have to consider going OOM
2794 */
2795 if (!did_some_progress) {
2796 if (oom_gfp_allowed(gfp_mask)) {
2797 if (oom_killer_disabled)
2798 goto nopage;
2799 /* Coredumps can quickly deplete all memory reserves */
2800 if ((current->flags & PF_DUMPCORE) &&
2801 !(gfp_mask & __GFP_NOFAIL))
2802 goto nopage;
2803 page = __alloc_pages_may_oom(gfp_mask, order,
2804 zonelist, high_zoneidx,
2805 nodemask, preferred_zone,
2806 classzone_idx, migratetype);
2807 if (page)
2808 goto got_pg;
2809
2810 if (!(gfp_mask & __GFP_NOFAIL)) {
2811 /*
2812 * The oom killer is not called for high-order
2813 * allocations that may fail, so if no progress
2814 * is being made, there are no other options and
2815 * retrying is unlikely to help.
2816 */
2817 if (order > PAGE_ALLOC_COSTLY_ORDER)
2818 goto nopage;
2819 /*
2820 * The oom killer is not called for lowmem
2821 * allocations to prevent needlessly killing
2822 * innocent tasks.
2823 */
2824 if (high_zoneidx < ZONE_NORMAL)
2825 goto nopage;
2826 }
2827
2828 goto restart;
2829 }
2830 }
2831
2832 /* Check if we should retry the allocation */ 2749 /* Check if we should retry the allocation */
2833 pages_reclaimed += did_some_progress; 2750 pages_reclaimed += did_some_progress;
2834 if (should_alloc_retry(gfp_mask, order, did_some_progress, 2751 if (should_alloc_retry(gfp_mask, order, did_some_progress,
2835 pages_reclaimed)) { 2752 pages_reclaimed)) {
2753 /*
2754 * If we fail to make progress by freeing individual
2755 * pages, but the allocation wants us to keep going,
2756 * start OOM killing tasks.
2757 */
2758 if (!did_some_progress) {
2759 page = __alloc_pages_may_oom(gfp_mask, order, ac,
2760 &did_some_progress);
2761 if (page)
2762 goto got_pg;
2763 if (!did_some_progress)
2764 goto nopage;
2765 }
2836 /* Wait for some write requests to complete then retry */ 2766 /* Wait for some write requests to complete then retry */
2837 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2767 wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
2838 goto rebalance; 2768 goto retry;
2839 } else { 2769 } else {
2840 /* 2770 /*
2841 * High-order allocations do not necessarily loop after 2771 * High-order allocations do not necessarily loop after
2842 * direct reclaim and reclaim/compaction depends on compaction 2772 * direct reclaim and reclaim/compaction depends on compaction
2843 * being called after reclaim so call directly if necessary 2773 * being called after reclaim so call directly if necessary
2844 */ 2774 */
2845 page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, 2775 page = __alloc_pages_direct_compact(gfp_mask, order,
2846 high_zoneidx, nodemask, alloc_flags, 2776 alloc_flags, ac, migration_mode,
2847 preferred_zone, 2777 &contended_compaction,
2848 classzone_idx, migratetype,
2849 migration_mode, &contended_compaction,
2850 &deferred_compaction); 2778 &deferred_compaction);
2851 if (page) 2779 if (page)
2852 goto got_pg; 2780 goto got_pg;
@@ -2854,11 +2782,7 @@ rebalance:
2854 2782
2855nopage: 2783nopage:
2856 warn_alloc_failed(gfp_mask, order, NULL); 2784 warn_alloc_failed(gfp_mask, order, NULL);
2857 return page;
2858got_pg: 2785got_pg:
2859 if (kmemcheck_enabled)
2860 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2861
2862 return page; 2786 return page;
2863} 2787}
2864 2788
@@ -2869,14 +2793,16 @@ struct page *
2869__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, 2793__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2870 struct zonelist *zonelist, nodemask_t *nodemask) 2794 struct zonelist *zonelist, nodemask_t *nodemask)
2871{ 2795{
2872 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2873 struct zone *preferred_zone;
2874 struct zoneref *preferred_zoneref; 2796 struct zoneref *preferred_zoneref;
2875 struct page *page = NULL; 2797 struct page *page = NULL;
2876 int migratetype = gfpflags_to_migratetype(gfp_mask);
2877 unsigned int cpuset_mems_cookie; 2798 unsigned int cpuset_mems_cookie;
2878 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; 2799 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
2879 int classzone_idx; 2800 gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
2801 struct alloc_context ac = {
2802 .high_zoneidx = gfp_zone(gfp_mask),
2803 .nodemask = nodemask,
2804 .migratetype = gfpflags_to_migratetype(gfp_mask),
2805 };
2880 2806
2881 gfp_mask &= gfp_allowed_mask; 2807 gfp_mask &= gfp_allowed_mask;
2882 2808
@@ -2895,37 +2821,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2895 if (unlikely(!zonelist->_zonerefs->zone)) 2821 if (unlikely(!zonelist->_zonerefs->zone))
2896 return NULL; 2822 return NULL;
2897 2823
2898 if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) 2824 if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
2899 alloc_flags |= ALLOC_CMA; 2825 alloc_flags |= ALLOC_CMA;
2900 2826
2901retry_cpuset: 2827retry_cpuset:
2902 cpuset_mems_cookie = read_mems_allowed_begin(); 2828 cpuset_mems_cookie = read_mems_allowed_begin();
2903 2829
2830 /* We set it here, as __alloc_pages_slowpath might have changed it */
2831 ac.zonelist = zonelist;
2904 /* The preferred zone is used for statistics later */ 2832 /* The preferred zone is used for statistics later */
2905 preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, 2833 preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
2906 nodemask ? : &cpuset_current_mems_allowed, 2834 ac.nodemask ? : &cpuset_current_mems_allowed,
2907 &preferred_zone); 2835 &ac.preferred_zone);
2908 if (!preferred_zone) 2836 if (!ac.preferred_zone)
2909 goto out; 2837 goto out;
2910 classzone_idx = zonelist_zone_idx(preferred_zoneref); 2838 ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
2911 2839
2912 /* First allocation attempt */ 2840 /* First allocation attempt */
2913 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2841 alloc_mask = gfp_mask|__GFP_HARDWALL;
2914 zonelist, high_zoneidx, alloc_flags, 2842 page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
2915 preferred_zone, classzone_idx, migratetype);
2916 if (unlikely(!page)) { 2843 if (unlikely(!page)) {
2917 /* 2844 /*
2918 * Runtime PM, block IO and its error handling path 2845 * Runtime PM, block IO and its error handling path
2919 * can deadlock because I/O on the device might not 2846 * can deadlock because I/O on the device might not
2920 * complete. 2847 * complete.
2921 */ 2848 */
2922 gfp_mask = memalloc_noio_flags(gfp_mask); 2849 alloc_mask = memalloc_noio_flags(gfp_mask);
2923 page = __alloc_pages_slowpath(gfp_mask, order, 2850
2924 zonelist, high_zoneidx, nodemask, 2851 page = __alloc_pages_slowpath(alloc_mask, order, &ac);
2925 preferred_zone, classzone_idx, migratetype);
2926 } 2852 }
2927 2853
2928 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2854 if (kmemcheck_enabled && page)
2855 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2856
2857 trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
2929 2858
2930out: 2859out:
2931 /* 2860 /*
@@ -3945,18 +3874,29 @@ static int __build_all_zonelists(void *data)
3945 return 0; 3874 return 0;
3946} 3875}
3947 3876
3877static noinline void __init
3878build_all_zonelists_init(void)
3879{
3880 __build_all_zonelists(NULL);
3881 mminit_verify_zonelist();
3882 cpuset_init_current_mems_allowed();
3883}
3884
3948/* 3885/*
3949 * Called with zonelists_mutex held always 3886 * Called with zonelists_mutex held always
3950 * unless system_state == SYSTEM_BOOTING. 3887 * unless system_state == SYSTEM_BOOTING.
3888 *
3889 * __ref due to (1) call of __meminit annotated setup_zone_pageset
3890 * [we're only called with non-NULL zone through __meminit paths] and
3891 * (2) call of __init annotated helper build_all_zonelists_init
3892 * [protected by SYSTEM_BOOTING].
3951 */ 3893 */
3952void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3894void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3953{ 3895{
3954 set_zonelist_order(); 3896 set_zonelist_order();
3955 3897
3956 if (system_state == SYSTEM_BOOTING) { 3898 if (system_state == SYSTEM_BOOTING) {
3957 __build_all_zonelists(NULL); 3899 build_all_zonelists_init();
3958 mminit_verify_zonelist();
3959 cpuset_init_current_mems_allowed();
3960 } else { 3900 } else {
3961#ifdef CONFIG_MEMORY_HOTPLUG 3901#ifdef CONFIG_MEMORY_HOTPLUG
3962 if (zone) 3902 if (zone)
@@ -5059,8 +4999,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
5059 pgdat->node_start_pfn = node_start_pfn; 4999 pgdat->node_start_pfn = node_start_pfn;
5060#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP 5000#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5061 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); 5001 get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
5062 printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, 5002 pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
5063 (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); 5003 (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
5064#endif 5004#endif
5065 calculate_node_totalpages(pgdat, start_pfn, end_pfn, 5005 calculate_node_totalpages(pgdat, start_pfn, end_pfn,
5066 zones_size, zholes_size); 5006 zones_size, zholes_size);
@@ -5432,9 +5372,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5432 arch_zone_highest_possible_pfn[i]) 5372 arch_zone_highest_possible_pfn[i])
5433 pr_cont("empty\n"); 5373 pr_cont("empty\n");
5434 else 5374 else
5435 pr_cont("[mem %0#10lx-%0#10lx]\n", 5375 pr_cont("[mem %#018Lx-%#018Lx]\n",
5436 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, 5376 (u64)arch_zone_lowest_possible_pfn[i]
5437 (arch_zone_highest_possible_pfn[i] 5377 << PAGE_SHIFT,
5378 ((u64)arch_zone_highest_possible_pfn[i]
5438 << PAGE_SHIFT) - 1); 5379 << PAGE_SHIFT) - 1);
5439 } 5380 }
5440 5381
@@ -5442,15 +5383,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
5442 pr_info("Movable zone start for each node\n"); 5383 pr_info("Movable zone start for each node\n");
5443 for (i = 0; i < MAX_NUMNODES; i++) { 5384 for (i = 0; i < MAX_NUMNODES; i++) {
5444 if (zone_movable_pfn[i]) 5385 if (zone_movable_pfn[i])
5445 pr_info(" Node %d: %#010lx\n", i, 5386 pr_info(" Node %d: %#018Lx\n", i,
5446 zone_movable_pfn[i] << PAGE_SHIFT); 5387 (u64)zone_movable_pfn[i] << PAGE_SHIFT);
5447 } 5388 }
5448 5389
5449 /* Print out the early node map */ 5390 /* Print out the early node map */
5450 pr_info("Early memory node ranges\n"); 5391 pr_info("Early memory node ranges\n");
5451 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 5392 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
5452 pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, 5393 pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
5453 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); 5394 (u64)start_pfn << PAGE_SHIFT,
5395 ((u64)end_pfn << PAGE_SHIFT) - 1);
5454 5396
5455 /* Initialise every node */ 5397 /* Initialise every node */
5456 mminit_verify_pageflags_layout(); 5398 mminit_verify_pageflags_layout();
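The page_alloc.c hunks above collapse the allocator's long per-call parameter lists into a single struct alloc_context that is filled once in __alloc_pages_nodemask() and handed down the slow path. The structure itself is defined outside this file (mm/internal.h in this series) and is not visible in the diff; the sketch below only infers its fields from the ac.*/ac-> accesses shown above and is not authoritative.

/* Inferred layout, for orientation only -- see mm/internal.h for the real one. */
struct alloc_context {
        struct zonelist *zonelist;      /* may be rewritten by the slow path */
        nodemask_t *nodemask;
        struct zone *preferred_zone;    /* used for statistics and congestion waits */
        int classzone_idx;
        int migratetype;
        enum zone_type high_zoneidx;
};

Passing one pointer instead of six scalars is what lets get_page_from_freelist(), __alloc_pages_direct_reclaim() and the other helpers shed most of their arguments in the hunks above.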
diff --git a/mm/page_counter.c b/mm/page_counter.c
index a009574fbba9..11b4beda14ba 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -166,18 +166,19 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit)
166/** 166/**
167 * page_counter_memparse - memparse() for page counter limits 167 * page_counter_memparse - memparse() for page counter limits
168 * @buf: string to parse 168 * @buf: string to parse
169 * @max: string meaning maximum possible value
169 * @nr_pages: returns the result in number of pages 170 * @nr_pages: returns the result in number of pages
170 * 171 *
171 * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be 172 * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
172 * limited to %PAGE_COUNTER_MAX. 173 * limited to %PAGE_COUNTER_MAX.
173 */ 174 */
174int page_counter_memparse(const char *buf, unsigned long *nr_pages) 175int page_counter_memparse(const char *buf, const char *max,
176 unsigned long *nr_pages)
175{ 177{
176 char unlimited[] = "-1";
177 char *end; 178 char *end;
178 u64 bytes; 179 u64 bytes;
179 180
180 if (!strncmp(buf, unlimited, sizeof(unlimited))) { 181 if (!strcmp(buf, max)) {
181 *nr_pages = PAGE_COUNTER_MAX; 182 *nr_pages = PAGE_COUNTER_MAX;
182 return 0; 183 return 0;
183 } 184 }
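page_counter_memparse() now takes the "unlimited" token from the caller instead of hard-coding "-1". A minimal caller-side sketch, assuming a hypothetical limit-setting helper; page_counter_limit() is the function named in the hunk header above.

#include <linux/page_counter.h>

/* Hypothetical write handler for a limit file. */
static int set_limit(struct page_counter *counter, const char *buf)
{
        unsigned long nr_pages;
        int err;

        /* legacy files keep "-1" as the unlimited token, unified files would pass "max" */
        err = page_counter_memparse(buf, "-1", &nr_pages);
        if (err)
                return err;

        return page_counter_limit(counter, nr_pages);
}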
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 9ab4a9b5bc09..0993f5f36b01 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -59,20 +59,19 @@ void __reset_page_owner(struct page *page, unsigned int order)
59 59
60void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) 60void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
61{ 61{
62 struct page_ext *page_ext; 62 struct page_ext *page_ext = lookup_page_ext(page);
63 struct stack_trace *trace; 63 struct stack_trace trace = {
64 64 .nr_entries = 0,
65 page_ext = lookup_page_ext(page); 65 .max_entries = ARRAY_SIZE(page_ext->trace_entries),
66 .entries = &page_ext->trace_entries[0],
67 .skip = 3,
68 };
66 69
67 trace = &page_ext->trace; 70 save_stack_trace(&trace);
68 trace->nr_entries = 0;
69 trace->max_entries = ARRAY_SIZE(page_ext->trace_entries);
70 trace->entries = &page_ext->trace_entries[0];
71 trace->skip = 3;
72 save_stack_trace(&page_ext->trace);
73 71
74 page_ext->order = order; 72 page_ext->order = order;
75 page_ext->gfp_mask = gfp_mask; 73 page_ext->gfp_mask = gfp_mask;
74 page_ext->nr_entries = trace.nr_entries;
76 75
77 __set_bit(PAGE_EXT_OWNER, &page_ext->flags); 76 __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
78} 77}
@@ -84,6 +83,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
84 int ret; 83 int ret;
85 int pageblock_mt, page_mt; 84 int pageblock_mt, page_mt;
86 char *kbuf; 85 char *kbuf;
86 struct stack_trace trace = {
87 .nr_entries = page_ext->nr_entries,
88 .entries = &page_ext->trace_entries[0],
89 };
87 90
88 kbuf = kmalloc(count, GFP_KERNEL); 91 kbuf = kmalloc(count, GFP_KERNEL);
89 if (!kbuf) 92 if (!kbuf)
@@ -121,8 +124,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
121 if (ret >= count) 124 if (ret >= count)
122 goto err; 125 goto err;
123 126
124 ret += snprint_stack_trace(kbuf + ret, count - ret, 127 ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
125 &page_ext->trace, 0);
126 if (ret >= count) 128 if (ret >= count)
127 goto err; 129 goto err;
128 130
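The page_owner conversion above stops embedding a whole struct stack_trace in struct page_ext and keeps only the raw entries plus a count, rebuilding a temporary stack_trace on the stack both when saving and when printing. The page_ext definition lives in include/linux/page_ext.h and is not part of this diff; the fragment below is only an inference from the page_ext->trace_entries and page_ext->nr_entries accesses above, with an assumed array depth.

/* Inferred page_ext fields relied on above (illustrative, not the real header). */
struct page_ext {
        unsigned long flags;
        unsigned int order;
        gfp_t gfp_mask;
        unsigned int nr_entries;                /* copied back from trace.nr_entries */
        unsigned long trace_entries[8];         /* depth assumed; sized via ARRAY_SIZE() above */
};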
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index ad83195521f2..75c1f2878519 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -35,7 +35,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
35 do { 35 do {
36again: 36again:
37 next = pmd_addr_end(addr, end); 37 next = pmd_addr_end(addr, end);
38 if (pmd_none(*pmd)) { 38 if (pmd_none(*pmd) || !walk->vma) {
39 if (walk->pte_hole) 39 if (walk->pte_hole)
40 err = walk->pte_hole(addr, next, walk); 40 err = walk->pte_hole(addr, next, walk);
41 if (err) 41 if (err)
@@ -59,7 +59,7 @@ again:
59 continue; 59 continue;
60 60
61 split_huge_page_pmd_mm(walk->mm, addr, pmd); 61 split_huge_page_pmd_mm(walk->mm, addr, pmd);
62 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 62 if (pmd_trans_unstable(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
65 if (err) 65 if (err)
@@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
86 break; 86 break;
87 continue; 87 continue;
88 } 88 }
89 if (walk->pud_entry) 89 if (walk->pmd_entry || walk->pte_entry)
90 err = walk->pud_entry(pud, addr, next, walk);
91 if (!err && (walk->pmd_entry || walk->pte_entry))
92 err = walk_pmd_range(pud, addr, next, walk); 90 err = walk_pmd_range(pud, addr, next, walk);
93 if (err) 91 if (err)
94 break; 92 break;
@@ -97,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
97 return err; 95 return err;
98} 96}
99 97
98static int walk_pgd_range(unsigned long addr, unsigned long end,
99 struct mm_walk *walk)
100{
101 pgd_t *pgd;
102 unsigned long next;
103 int err = 0;
104
105 pgd = pgd_offset(walk->mm, addr);
106 do {
107 next = pgd_addr_end(addr, end);
108 if (pgd_none_or_clear_bad(pgd)) {
109 if (walk->pte_hole)
110 err = walk->pte_hole(addr, next, walk);
111 if (err)
112 break;
113 continue;
114 }
115 if (walk->pmd_entry || walk->pte_entry)
116 err = walk_pud_range(pgd, addr, next, walk);
117 if (err)
118 break;
119 } while (pgd++, addr = next, addr != end);
120
121 return err;
122}
123
100#ifdef CONFIG_HUGETLB_PAGE 124#ifdef CONFIG_HUGETLB_PAGE
101static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, 125static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
102 unsigned long end) 126 unsigned long end)
@@ -105,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
105 return boundary < end ? boundary : end; 129 return boundary < end ? boundary : end;
106} 130}
107 131
108static int walk_hugetlb_range(struct vm_area_struct *vma, 132static int walk_hugetlb_range(unsigned long addr, unsigned long end,
109 unsigned long addr, unsigned long end,
110 struct mm_walk *walk) 133 struct mm_walk *walk)
111{ 134{
135 struct vm_area_struct *vma = walk->vma;
112 struct hstate *h = hstate_vma(vma); 136 struct hstate *h = hstate_vma(vma);
113 unsigned long next; 137 unsigned long next;
114 unsigned long hmask = huge_page_mask(h); 138 unsigned long hmask = huge_page_mask(h);
@@ -121,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
121 if (pte && walk->hugetlb_entry) 145 if (pte && walk->hugetlb_entry)
122 err = walk->hugetlb_entry(pte, hmask, addr, next, walk); 146 err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
123 if (err) 147 if (err)
124 return err; 148 break;
125 } while (addr = next, addr != end); 149 } while (addr = next, addr != end);
126 150
127 return 0; 151 return err;
128} 152}
129 153
130#else /* CONFIG_HUGETLB_PAGE */ 154#else /* CONFIG_HUGETLB_PAGE */
131static int walk_hugetlb_range(struct vm_area_struct *vma, 155static int walk_hugetlb_range(unsigned long addr, unsigned long end,
132 unsigned long addr, unsigned long end,
133 struct mm_walk *walk) 156 struct mm_walk *walk)
134{ 157{
135 return 0; 158 return 0;
@@ -137,112 +160,138 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
137 160
138#endif /* CONFIG_HUGETLB_PAGE */ 161#endif /* CONFIG_HUGETLB_PAGE */
139 162
163/*
164 * Decide whether we really walk over the current vma on [@start, @end)
165 * or skip it via the returned value. Return 0 if we do walk over the
166 * current vma, and return 1 if we skip the vma. A negative return
167 * value means an error, and the current walk is aborted.
168 */
169static int walk_page_test(unsigned long start, unsigned long end,
170 struct mm_walk *walk)
171{
172 struct vm_area_struct *vma = walk->vma;
173
174 if (walk->test_walk)
175 return walk->test_walk(start, end, walk);
176
177 /*
178 * vma(VM_PFNMAP) doesn't have any valid struct pages behind its
179 * range, so we don't walk over it as we do for normal vmas. However,
180 * some callers are interested in handling holes and don't want to
181 * just ignore any single address range. Such users certainly define
182 * their ->pte_hole() callbacks, so let's delegate handling of
183 * vma(VM_PFNMAP) to them.
184 */
185 if (vma->vm_flags & VM_PFNMAP) {
186 int err = 1;
187 if (walk->pte_hole)
188 err = walk->pte_hole(start, end, walk);
189 return err ? err : 1;
190 }
191 return 0;
192}
193
194static int __walk_page_range(unsigned long start, unsigned long end,
195 struct mm_walk *walk)
196{
197 int err = 0;
198 struct vm_area_struct *vma = walk->vma;
199
200 if (vma && is_vm_hugetlb_page(vma)) {
201 if (walk->hugetlb_entry)
202 err = walk_hugetlb_range(start, end, walk);
203 } else
204 err = walk_pgd_range(start, end, walk);
140 205
206 return err;
207}
141 208
142/** 209/**
143 * walk_page_range - walk a memory map's page tables with a callback 210 * walk_page_range - walk page table with caller specific callbacks
144 * @addr: starting address
145 * @end: ending address
146 * @walk: set of callbacks to invoke for each level of the tree
147 * 211 *
148 * Recursively walk the page table for the memory area in a VMA, 212 * Recursively walk the page table tree of the process represented by @walk->mm
149 * calling supplied callbacks. Callbacks are called in-order (first 213 * within the virtual address range [@start, @end). During walking, we can do
150 * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, 214 * some caller-specific works for each entry, by setting up pmd_entry(),
151 * etc.). If lower-level callbacks are omitted, walking depth is reduced. 215 * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
216 * callbacks, the associated entries/pages are just ignored.
217 * The return values of these callbacks are commonly defined like below:
218 * - 0 : succeeded to handle the current entry, and if you don't reach the
219 * end address yet, continue to walk.
220 * - >0 : succeeded to handle the current entry, and return to the caller
221 * with caller specific value.
222 * - <0 : failed to handle the current entry, and return to the caller
223 * with error code.
152 * 224 *
153 * Each callback receives an entry pointer and the start and end of the 225 * Before starting to walk page table, some callers want to check whether
154 * associated range, and a copy of the original mm_walk for access to 226 * they really want to walk over the current vma, typically by checking
155 * the ->private or ->mm fields. 227 * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
228 * purpose.
156 * 229 *
157 * Usually no locks are taken, but splitting transparent huge page may 230 * struct mm_walk keeps current values of some common data like vma and pmd,
158 * take page table lock. And the bottom level iterator will map PTE 231 * which are useful for the access from callbacks. If you want to pass some
159 * directories from highmem if necessary. 232 * caller-specific data to callbacks, @walk->private should be helpful.
160 * 233 *
161 * If any callback returns a non-zero value, the walk is aborted and 234 * Locking:
162 * the return value is propagated back to the caller. Otherwise 0 is returned. 235 * Callers of walk_page_range() and walk_page_vma() should hold
163 * 236 * @walk->mm->mmap_sem, because these function traverse vma list and/or
164 * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry 237 * access to vma's data.
165 * is !NULL.
166 */ 238 */
167int walk_page_range(unsigned long addr, unsigned long end, 239int walk_page_range(unsigned long start, unsigned long end,
168 struct mm_walk *walk) 240 struct mm_walk *walk)
169{ 241{
170 pgd_t *pgd;
171 unsigned long next;
172 int err = 0; 242 int err = 0;
243 unsigned long next;
244 struct vm_area_struct *vma;
173 245
174 if (addr >= end) 246 if (start >= end)
175 return err; 247 return -EINVAL;
176 248
177 if (!walk->mm) 249 if (!walk->mm)
178 return -EINVAL; 250 return -EINVAL;
179 251
180 VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); 252 VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
181 253
182 pgd = pgd_offset(walk->mm, addr); 254 vma = find_vma(walk->mm, start);
183 do { 255 do {
184 struct vm_area_struct *vma = NULL; 256 if (!vma) { /* after the last vma */
185 257 walk->vma = NULL;
186 next = pgd_addr_end(addr, end); 258 next = end;
259 } else if (start < vma->vm_start) { /* outside vma */
260 walk->vma = NULL;
261 next = min(end, vma->vm_start);
262 } else { /* inside vma */
263 walk->vma = vma;
264 next = min(end, vma->vm_end);
265 vma = vma->vm_next;
187 266
188 /* 267 err = walk_page_test(start, next, walk);
189 * This function was not intended to be vma based. 268 if (err > 0)
190 * But there are vma special cases to be handled:
191 * - hugetlb vma's
192 * - VM_PFNMAP vma's
193 */
194 vma = find_vma(walk->mm, addr);
195 if (vma) {
196 /*
197 * There are no page structures backing a VM_PFNMAP
198 * range, so do not allow split_huge_page_pmd().
199 */
200 if ((vma->vm_start <= addr) &&
201 (vma->vm_flags & VM_PFNMAP)) {
202 next = vma->vm_end;
203 pgd = pgd_offset(walk->mm, next);
204 continue;
205 }
206 /*
207 * Handle hugetlb vma individually because pagetable
208 * walk for the hugetlb page is dependent on the
209 * architecture and we can't handled it in the same
210 * manner as non-huge pages.
211 */
212 if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
213 is_vm_hugetlb_page(vma)) {
214 if (vma->vm_end < next)
215 next = vma->vm_end;
216 /*
217 * Hugepage is very tightly coupled with vma,
218 * so walk through hugetlb entries within a
219 * given vma.
220 */
221 err = walk_hugetlb_range(vma, addr, next, walk);
222 if (err)
223 break;
224 pgd = pgd_offset(walk->mm, next);
225 continue; 269 continue;
226 } 270 if (err < 0)
227 }
228
229 if (pgd_none_or_clear_bad(pgd)) {
230 if (walk->pte_hole)
231 err = walk->pte_hole(addr, next, walk);
232 if (err)
233 break; 271 break;
234 pgd++;
235 continue;
236 } 272 }
237 if (walk->pgd_entry) 273 if (walk->vma || walk->pte_hole)
238 err = walk->pgd_entry(pgd, addr, next, walk); 274 err = __walk_page_range(start, next, walk);
239 if (!err &&
240 (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
241 err = walk_pud_range(pgd, addr, next, walk);
242 if (err) 275 if (err)
243 break; 276 break;
244 pgd++; 277 } while (start = next, start < end);
245 } while (addr = next, addr < end);
246
247 return err; 278 return err;
248} 279}
280
281int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
282{
283 int err;
284
285 if (!walk->mm)
286 return -EINVAL;
287
288 VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
289 VM_BUG_ON(!vma);
290 walk->vma = vma;
291 err = walk_page_test(vma->vm_start, vma->vm_end, walk);
292 if (err > 0)
293 return 0;
294 if (err < 0)
295 return err;
296 return __walk_page_range(vma->vm_start, vma->vm_end, walk);
297}
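With the rework above, walks are driven per-vma and callers can veto individual vmas through ->test_walk(). Below is a minimal caller sketch; count_pmd(), skip_locked() and pmds_in_vma() are hypothetical names, the callback signatures follow the existing struct mm_walk, and mmap_sem must already be held as the updated comment block requires.

#include <linux/mm.h>

static int count_pmd(pmd_t *pmd, unsigned long addr, unsigned long next,
                     struct mm_walk *walk)
{
        long *count = walk->private;

        if (!pmd_none(*pmd))
                (*count)++;
        return 0;                       /* 0: keep walking */
}

static int skip_locked(unsigned long start, unsigned long end,
                       struct mm_walk *walk)
{
        return (walk->vma->vm_flags & VM_LOCKED) ? 1 : 0;      /* 1: skip this vma */
}

static long pmds_in_vma(struct vm_area_struct *vma)
{
        long count = 0;
        struct mm_walk walk = {
                .pmd_entry      = count_pmd,
                .test_walk      = skip_locked,
                .mm             = vma->vm_mm,
                .private        = &count,
        };

        /* caller holds mmap_sem for read, per the locking comment above */
        walk_page_vma(vma, &walk);
        return count;
}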
diff --git a/mm/percpu.c b/mm/percpu.c
index d39e2f4e335c..73c97a5f4495 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1528,7 +1528,6 @@ static void pcpu_dump_alloc_info(const char *lvl,
1528int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, 1528int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1529 void *base_addr) 1529 void *base_addr)
1530{ 1530{
1531 static char cpus_buf[4096] __initdata;
1532 static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; 1531 static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
1533 static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; 1532 static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
1534 size_t dyn_size = ai->dyn_size; 1533 size_t dyn_size = ai->dyn_size;
@@ -1541,12 +1540,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1541 int *unit_map; 1540 int *unit_map;
1542 int group, unit, i; 1541 int group, unit, i;
1543 1542
1544 cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
1545
1546#define PCPU_SETUP_BUG_ON(cond) do { \ 1543#define PCPU_SETUP_BUG_ON(cond) do { \
1547 if (unlikely(cond)) { \ 1544 if (unlikely(cond)) { \
1548 pr_emerg("PERCPU: failed to initialize, %s", #cond); \ 1545 pr_emerg("PERCPU: failed to initialize, %s", #cond); \
1549 pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ 1546 pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \
1547 cpumask_pr_args(cpu_possible_mask)); \
1550 pcpu_dump_alloc_info(KERN_EMERG, ai); \ 1548 pcpu_dump_alloc_info(KERN_EMERG, ai); \
1551 BUG(); \ 1549 BUG(); \
1552 } \ 1550 } \
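The percpu hunk drops the static 4KB scratch buffer because the printk core can now format a cpumask directly via the %*pb specifier. The same pattern in isolation, as a purely illustrative helper:

#include <linux/cpumask.h>
#include <linux/printk.h>

static void report_possible_cpus(void)
{
        /* cpumask_pr_args() expands to the field width (nr_cpu_ids) plus the bitmap pointer */
        pr_info("possible cpus: %*pb\n", cpumask_pr_args(cpu_possible_mask));
}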
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index dfb79e028ecb..c25f94b33811 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
193 pmd_t *pmdp) 193 pmd_t *pmdp)
194{ 194{
195 pmd_t entry = *pmdp; 195 pmd_t entry = *pmdp;
196 if (pmd_numa(entry))
197 entry = pmd_mknonnuma(entry);
198 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); 196 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
199 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 197 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
200} 198}
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 5077afcd9e11..b1597690530c 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -99,11 +99,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
99 size_t bytes; 99 size_t bytes;
100 100
101 /* Get the pages we're interested in */ 101 /* Get the pages we're interested in */
102 down_read(&mm->mmap_sem); 102 pages = get_user_pages_unlocked(task, mm, pa, pages,
103 pages = get_user_pages(task, mm, pa, pages, 103 vm_write, 0, process_pages);
104 vm_write, 0, process_pages, NULL);
105 up_read(&mm->mmap_sem);
106
107 if (pages <= 0) 104 if (pages <= 0)
108 return -EFAULT; 105 return -EFAULT;
109 106
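The process_vm_access change replaces the open-coded mmap_sem/get_user_pages() sequence with get_user_pages_unlocked(), which takes and drops the semaphore itself and may release it while faulting pages in. A sketch of the same conversion in a hypothetical helper:

#include <linux/mm.h>

static long pin_user_range(struct task_struct *task, struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           int write, struct page **pages)
{
        /*
         * Old pattern:
         *      down_read(&mm->mmap_sem);
         *      ret = get_user_pages(task, mm, start, nr_pages, write, 0, pages, NULL);
         *      up_read(&mm->mmap_sem);
         */
        return get_user_pages_unlocked(task, mm, start, nr_pages, write, 0, pages);
}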
diff --git a/mm/readahead.c b/mm/readahead.c
index 17b9172ec37f..935675844b2e 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -27,7 +27,7 @@
27void 27void
28file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) 28file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
29{ 29{
30 ra->ra_pages = mapping->backing_dev_info->ra_pages; 30 ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
31 ra->prev_pos = -1; 31 ra->prev_pos = -1;
32} 32}
33EXPORT_SYMBOL_GPL(file_ra_state_init); 33EXPORT_SYMBOL_GPL(file_ra_state_init);
@@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping,
541 /* 541 /*
542 * Defer asynchronous read-ahead on IO congestion. 542 * Defer asynchronous read-ahead on IO congestion.
543 */ 543 */
544 if (bdi_read_congested(mapping->backing_dev_info)) 544 if (bdi_read_congested(inode_to_bdi(mapping->host)))
545 return; 545 return;
546 546
547 /* do read-ahead */ 547 /* do read-ahead */
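Both readahead hunks are instances of the same substitution running through this series: the backing_dev_info is no longer cached in the address_space but derived from the host inode on demand. Shown once in isolation, as a hypothetical wrapper:

#include <linux/backing-dev.h>

static inline struct backing_dev_info *mapping_bdi(struct address_space *mapping)
{
        return inode_to_bdi(mapping->host);     /* was mapping->backing_dev_info */
}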
diff --git a/mm/rmap.c b/mm/rmap.c
index c5bc241127b2..5e3e09081164 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -72,6 +72,8 @@ static inline struct anon_vma *anon_vma_alloc(void)
72 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 72 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
73 if (anon_vma) { 73 if (anon_vma) {
74 atomic_set(&anon_vma->refcount, 1); 74 atomic_set(&anon_vma->refcount, 1);
75 anon_vma->degree = 1; /* Reference for first vma */
76 anon_vma->parent = anon_vma;
75 /* 77 /*
76 * Initialise the anon_vma root to point to itself. If called 78 * Initialise the anon_vma root to point to itself. If called
77 * from fork, the root will be reset to the parents anon_vma. 79 * from fork, the root will be reset to the parents anon_vma.
@@ -188,6 +190,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
188 if (likely(!vma->anon_vma)) { 190 if (likely(!vma->anon_vma)) {
189 vma->anon_vma = anon_vma; 191 vma->anon_vma = anon_vma;
190 anon_vma_chain_link(vma, avc, anon_vma); 192 anon_vma_chain_link(vma, avc, anon_vma);
193 /* vma reference or self-parent link for new root */
194 anon_vma->degree++;
191 allocated = NULL; 195 allocated = NULL;
192 avc = NULL; 196 avc = NULL;
193 } 197 }
@@ -236,6 +240,14 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
236/* 240/*
237 * Attach the anon_vmas from src to dst. 241 * Attach the anon_vmas from src to dst.
238 * Returns 0 on success, -ENOMEM on failure. 242 * Returns 0 on success, -ENOMEM on failure.
243 *
244 * If dst->anon_vma is NULL, this function tries to find and reuse an
245 * existing anon_vma which has no vmas and only one child anon_vma. This
246 * prevents the anon_vma hierarchy from degrading into an endless linear
247 * chain when a task forks constantly. On the other hand, an anon_vma with
248 * more than one child isn't reused even if no live vma remains, so the rmap
249 * walker has a good chance of avoiding a scan of the whole hierarchy when
250 * it searches where a page is mapped.
239 */ 251 */
240int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) 252int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
241{ 253{
@@ -256,7 +268,21 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
256 anon_vma = pavc->anon_vma; 268 anon_vma = pavc->anon_vma;
257 root = lock_anon_vma_root(root, anon_vma); 269 root = lock_anon_vma_root(root, anon_vma);
258 anon_vma_chain_link(dst, avc, anon_vma); 270 anon_vma_chain_link(dst, avc, anon_vma);
271
272 /*
273 * Reuse an existing anon_vma if its degree is lower than two,
274 * which means it has no vma and only one anon_vma child.
275 *
276 * Do not choose the parent anon_vma, otherwise the first child
277 * will always reuse it. Root anon_vma is never reused:
278 * it has self-parent reference and at least one child.
279 */
280 if (!dst->anon_vma && anon_vma != src->anon_vma &&
281 anon_vma->degree < 2)
282 dst->anon_vma = anon_vma;
259 } 283 }
284 if (dst->anon_vma)
285 dst->anon_vma->degree++;
260 unlock_anon_vma_root(root); 286 unlock_anon_vma_root(root);
261 return 0; 287 return 0;
262 288
@@ -280,6 +306,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
280 if (!pvma->anon_vma) 306 if (!pvma->anon_vma)
281 return 0; 307 return 0;
282 308
309 /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
310 vma->anon_vma = NULL;
311
283 /* 312 /*
284 * First, attach the new VMA to the parent VMA's anon_vmas, 313 * First, attach the new VMA to the parent VMA's anon_vmas,
285 * so rmap can find non-COWed pages in child processes. 314 * so rmap can find non-COWed pages in child processes.
@@ -288,6 +317,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
288 if (error) 317 if (error)
289 return error; 318 return error;
290 319
320 /* An existing anon_vma has been reused, all done then. */
321 if (vma->anon_vma)
322 return 0;
323
291 /* Then add our own anon_vma. */ 324 /* Then add our own anon_vma. */
292 anon_vma = anon_vma_alloc(); 325 anon_vma = anon_vma_alloc();
293 if (!anon_vma) 326 if (!anon_vma)
@@ -301,6 +334,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
301 * lock any of the anon_vmas in this anon_vma tree. 334 * lock any of the anon_vmas in this anon_vma tree.
302 */ 335 */
303 anon_vma->root = pvma->anon_vma->root; 336 anon_vma->root = pvma->anon_vma->root;
337 anon_vma->parent = pvma->anon_vma;
304 /* 338 /*
305 * With refcounts, an anon_vma can stay around longer than the 339 * With refcounts, an anon_vma can stay around longer than the
306 * process it belongs to. The root anon_vma needs to be pinned until 340 * process it belongs to. The root anon_vma needs to be pinned until
@@ -311,6 +345,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
311 vma->anon_vma = anon_vma; 345 vma->anon_vma = anon_vma;
312 anon_vma_lock_write(anon_vma); 346 anon_vma_lock_write(anon_vma);
313 anon_vma_chain_link(vma, avc, anon_vma); 347 anon_vma_chain_link(vma, avc, anon_vma);
348 anon_vma->parent->degree++;
314 anon_vma_unlock_write(anon_vma); 349 anon_vma_unlock_write(anon_vma);
315 350
316 return 0; 351 return 0;
@@ -341,12 +376,16 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
341 * Leave empty anon_vmas on the list - we'll need 376 * Leave empty anon_vmas on the list - we'll need
342 * to free them outside the lock. 377 * to free them outside the lock.
343 */ 378 */
344 if (RB_EMPTY_ROOT(&anon_vma->rb_root)) 379 if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
380 anon_vma->parent->degree--;
345 continue; 381 continue;
382 }
346 383
347 list_del(&avc->same_vma); 384 list_del(&avc->same_vma);
348 anon_vma_chain_free(avc); 385 anon_vma_chain_free(avc);
349 } 386 }
387 if (vma->anon_vma)
388 vma->anon_vma->degree--;
350 unlock_anon_vma_root(root); 389 unlock_anon_vma_root(root);
351 390
352 /* 391 /*
@@ -357,6 +396,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
357 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 396 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
358 struct anon_vma *anon_vma = avc->anon_vma; 397 struct anon_vma *anon_vma = avc->anon_vma;
359 398
399 BUG_ON(anon_vma->degree);
360 put_anon_vma(anon_vma); 400 put_anon_vma(anon_vma);
361 401
362 list_del(&avc->same_vma); 402 list_del(&avc->same_vma);
@@ -550,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
550 if (!vma->anon_vma || !page__anon_vma || 590 if (!vma->anon_vma || !page__anon_vma ||
551 vma->anon_vma->root != page__anon_vma->root) 591 vma->anon_vma->root != page__anon_vma->root)
552 return -EFAULT; 592 return -EFAULT;
553 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 593 } else if (page->mapping) {
554 if (!vma->vm_file || 594 if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
555 vma->vm_file->f_mapping != page->mapping)
556 return -EFAULT; 595 return -EFAULT;
557 } else 596 } else
558 return -EFAULT; 597 return -EFAULT;
@@ -1046,24 +1085,20 @@ void page_add_new_anon_rmap(struct page *page,
1046void page_add_file_rmap(struct page *page) 1085void page_add_file_rmap(struct page *page)
1047{ 1086{
1048 struct mem_cgroup *memcg; 1087 struct mem_cgroup *memcg;
1049 unsigned long flags;
1050 bool locked;
1051 1088
1052 memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); 1089 memcg = mem_cgroup_begin_page_stat(page);
1053 if (atomic_inc_and_test(&page->_mapcount)) { 1090 if (atomic_inc_and_test(&page->_mapcount)) {
1054 __inc_zone_page_state(page, NR_FILE_MAPPED); 1091 __inc_zone_page_state(page, NR_FILE_MAPPED);
1055 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); 1092 mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
1056 } 1093 }
1057 mem_cgroup_end_page_stat(memcg, &locked, &flags); 1094 mem_cgroup_end_page_stat(memcg);
1058} 1095}
1059 1096
1060static void page_remove_file_rmap(struct page *page) 1097static void page_remove_file_rmap(struct page *page)
1061{ 1098{
1062 struct mem_cgroup *memcg; 1099 struct mem_cgroup *memcg;
1063 unsigned long flags;
1064 bool locked;
1065 1100
1066 memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); 1101 memcg = mem_cgroup_begin_page_stat(page);
1067 1102
1068 /* page still mapped by someone else? */ 1103 /* page still mapped by someone else? */
1069 if (!atomic_add_negative(-1, &page->_mapcount)) 1104 if (!atomic_add_negative(-1, &page->_mapcount))
@@ -1084,7 +1119,7 @@ static void page_remove_file_rmap(struct page *page)
1084 if (unlikely(PageMlocked(page))) 1119 if (unlikely(PageMlocked(page)))
1085 clear_page_mlock(page); 1120 clear_page_mlock(page);
1086out: 1121out:
1087 mem_cgroup_end_page_stat(memcg, &locked, &flags); 1122 mem_cgroup_end_page_stat(memcg);
1088} 1123}
1089 1124
1090/** 1125/**
@@ -1234,7 +1269,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1234 if (pte_soft_dirty(pteval)) 1269 if (pte_soft_dirty(pteval))
1235 swp_pte = pte_swp_mksoft_dirty(swp_pte); 1270 swp_pte = pte_swp_mksoft_dirty(swp_pte);
1236 set_pte_at(mm, address, pte, swp_pte); 1271 set_pte_at(mm, address, pte, swp_pte);
1237 BUG_ON(pte_file(*pte));
1238 } else if (IS_ENABLED(CONFIG_MIGRATION) && 1272 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
1239 (flags & TTU_MIGRATION)) { 1273 (flags & TTU_MIGRATION)) {
1240 /* Establish migration entry for a file page */ 1274 /* Establish migration entry for a file page */
@@ -1276,211 +1310,6 @@ out_mlock:
1276 return ret; 1310 return ret;
1277} 1311}
1278 1312
1279/*
1280 * objrmap doesn't work for nonlinear VMAs because the assumption that
1281 * offset-into-file correlates with offset-into-virtual-addresses does not hold.
1282 * Consequently, given a particular page and its ->index, we cannot locate the
1283 * ptes which are mapping that page without an exhaustive linear search.
1284 *
1285 * So what this code does is a mini "virtual scan" of each nonlinear VMA which
1286 * maps the file to which the target page belongs. The ->vm_private_data field
1287 * holds the current cursor into that scan. Successive searches will circulate
1288 * around the vma's virtual address space.
1289 *
1290 * So as more replacement pressure is applied to the pages in a nonlinear VMA,
1291 * more scanning pressure is placed against them as well. Eventually pages
1292 * will become fully unmapped and are eligible for eviction.
1293 *
1294 * For very sparsely populated VMAs this is a little inefficient - chances are
1295 * there there won't be many ptes located within the scan cluster. In this case
1296 * maybe we could scan further - to the end of the pte page, perhaps.
1297 *
1298 * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
1299 * acquire it without blocking. If vma locked, mlock the pages in the cluster,
1300 * rather than unmapping them. If we encounter the "check_page" that vmscan is
1301 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
1302 */
1303#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
1304#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
1305
1306static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1307 struct vm_area_struct *vma, struct page *check_page)
1308{
1309 struct mm_struct *mm = vma->vm_mm;
1310 pmd_t *pmd;
1311 pte_t *pte;
1312 pte_t pteval;
1313 spinlock_t *ptl;
1314 struct page *page;
1315 unsigned long address;
1316 unsigned long mmun_start; /* For mmu_notifiers */
1317 unsigned long mmun_end; /* For mmu_notifiers */
1318 unsigned long end;
1319 int ret = SWAP_AGAIN;
1320 int locked_vma = 0;
1321
1322 address = (vma->vm_start + cursor) & CLUSTER_MASK;
1323 end = address + CLUSTER_SIZE;
1324 if (address < vma->vm_start)
1325 address = vma->vm_start;
1326 if (end > vma->vm_end)
1327 end = vma->vm_end;
1328
1329 pmd = mm_find_pmd(mm, address);
1330 if (!pmd)
1331 return ret;
1332
1333 mmun_start = address;
1334 mmun_end = end;
1335 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1336
1337 /*
1338 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
1339 * keep the sem while scanning the cluster for mlocking pages.
1340 */
1341 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
1342 locked_vma = (vma->vm_flags & VM_LOCKED);
1343 if (!locked_vma)
1344 up_read(&vma->vm_mm->mmap_sem); /* don't need it */
1345 }
1346
1347 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1348
1349 /* Update high watermark before we lower rss */
1350 update_hiwater_rss(mm);
1351
1352 for (; address < end; pte++, address += PAGE_SIZE) {
1353 if (!pte_present(*pte))
1354 continue;
1355 page = vm_normal_page(vma, address, *pte);
1356 BUG_ON(!page || PageAnon(page));
1357
1358 if (locked_vma) {
1359 if (page == check_page) {
1360 /* we know we have check_page locked */
1361 mlock_vma_page(page);
1362 ret = SWAP_MLOCK;
1363 } else if (trylock_page(page)) {
1364 /*
1365 * If we can lock the page, perform mlock.
1366 * Otherwise leave the page alone, it will be
1367 * eventually encountered again later.
1368 */
1369 mlock_vma_page(page);
1370 unlock_page(page);
1371 }
1372 continue; /* don't unmap */
1373 }
1374
1375 /*
1376 * No need for _notify because we're within an
1377 * mmu_notifier_invalidate_range_ {start|end} scope.
1378 */
1379 if (ptep_clear_flush_young(vma, address, pte))
1380 continue;
1381
1382 /* Nuke the page table entry. */
1383 flush_cache_page(vma, address, pte_pfn(*pte));
1384 pteval = ptep_clear_flush_notify(vma, address, pte);
1385
1386 /* If nonlinear, store the file page offset in the pte. */
1387 if (page->index != linear_page_index(vma, address)) {
1388 pte_t ptfile = pgoff_to_pte(page->index);
1389 if (pte_soft_dirty(pteval))
1390 ptfile = pte_file_mksoft_dirty(ptfile);
1391 set_pte_at(mm, address, pte, ptfile);
1392 }
1393
1394 /* Move the dirty bit to the physical page now the pte is gone. */
1395 if (pte_dirty(pteval))
1396 set_page_dirty(page);
1397
1398 page_remove_rmap(page);
1399 page_cache_release(page);
1400 dec_mm_counter(mm, MM_FILEPAGES);
1401 (*mapcount)--;
1402 }
1403 pte_unmap_unlock(pte - 1, ptl);
1404 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1405 if (locked_vma)
1406 up_read(&vma->vm_mm->mmap_sem);
1407 return ret;
1408}
1409
1410static int try_to_unmap_nonlinear(struct page *page,
1411 struct address_space *mapping, void *arg)
1412{
1413 struct vm_area_struct *vma;
1414 int ret = SWAP_AGAIN;
1415 unsigned long cursor;
1416 unsigned long max_nl_cursor = 0;
1417 unsigned long max_nl_size = 0;
1418 unsigned int mapcount;
1419
1420 list_for_each_entry(vma,
1421 &mapping->i_mmap_nonlinear, shared.nonlinear) {
1422
1423 cursor = (unsigned long) vma->vm_private_data;
1424 if (cursor > max_nl_cursor)
1425 max_nl_cursor = cursor;
1426 cursor = vma->vm_end - vma->vm_start;
1427 if (cursor > max_nl_size)
1428 max_nl_size = cursor;
1429 }
1430
1431 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
1432 return SWAP_FAIL;
1433 }
1434
1435 /*
1436 * We don't try to search for this page in the nonlinear vmas,
1437 * and page_referenced wouldn't have found it anyway. Instead
1438 * just walk the nonlinear vmas trying to age and unmap some.
1439 * The mapcount of the page we came in with is irrelevant,
1440 * but even so use it as a guide to how hard we should try?
1441 */
1442 mapcount = page_mapcount(page);
1443 if (!mapcount)
1444 return ret;
1445
1446 cond_resched();
1447
1448 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
1449 if (max_nl_cursor == 0)
1450 max_nl_cursor = CLUSTER_SIZE;
1451
1452 do {
1453 list_for_each_entry(vma,
1454 &mapping->i_mmap_nonlinear, shared.nonlinear) {
1455
1456 cursor = (unsigned long) vma->vm_private_data;
1457 while (cursor < max_nl_cursor &&
1458 cursor < vma->vm_end - vma->vm_start) {
1459 if (try_to_unmap_cluster(cursor, &mapcount,
1460 vma, page) == SWAP_MLOCK)
1461 ret = SWAP_MLOCK;
1462 cursor += CLUSTER_SIZE;
1463 vma->vm_private_data = (void *) cursor;
1464 if ((int)mapcount <= 0)
1465 return ret;
1466 }
1467 vma->vm_private_data = (void *) max_nl_cursor;
1468 }
1469 cond_resched();
1470 max_nl_cursor += CLUSTER_SIZE;
1471 } while (max_nl_cursor <= max_nl_size);
1472
1473 /*
1474 * Don't loop forever (perhaps all the remaining pages are
1475 * in locked vmas). Reset cursor on all unreserved nonlinear
1476 * vmas, now forgetting on which ones it had fallen behind.
1477 */
1478 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
1479 vma->vm_private_data = NULL;
1480
1481 return ret;
1482}
1483
1484bool is_vma_temporary_stack(struct vm_area_struct *vma) 1313bool is_vma_temporary_stack(struct vm_area_struct *vma)
1485{ 1314{
1486 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1315 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -1526,7 +1355,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1526 .rmap_one = try_to_unmap_one, 1355 .rmap_one = try_to_unmap_one,
1527 .arg = (void *)flags, 1356 .arg = (void *)flags,
1528 .done = page_not_mapped, 1357 .done = page_not_mapped,
1529 .file_nonlinear = try_to_unmap_nonlinear,
1530 .anon_lock = page_lock_anon_vma_read, 1358 .anon_lock = page_lock_anon_vma_read,
1531 }; 1359 };
1532 1360
@@ -1572,12 +1400,6 @@ int try_to_munlock(struct page *page)
1572 .rmap_one = try_to_unmap_one, 1400 .rmap_one = try_to_unmap_one,
1573 .arg = (void *)TTU_MUNLOCK, 1401 .arg = (void *)TTU_MUNLOCK,
1574 .done = page_not_mapped, 1402 .done = page_not_mapped,
1575 /*
1576 * We don't bother to try to find the munlocked page in
1577 * nonlinears. It's costly. Instead, later, page reclaim logic
1578 * may call try_to_unmap() and recover PG_mlocked lazily.
1579 */
1580 .file_nonlinear = NULL,
1581 .anon_lock = page_lock_anon_vma_read, 1403 .anon_lock = page_lock_anon_vma_read,
1582 1404
1583 }; 1405 };
@@ -1708,13 +1530,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
1708 goto done; 1530 goto done;
1709 } 1531 }
1710 1532
1711 if (!rwc->file_nonlinear)
1712 goto done;
1713
1714 if (list_empty(&mapping->i_mmap_nonlinear))
1715 goto done;
1716
1717 ret = rwc->file_nonlinear(page, mapping, rwc->arg);
1718done: 1533done:
1719 i_mmap_unlock_read(mapping); 1534 i_mmap_unlock_read(mapping);
1720 return ret; 1535 return ret;
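The rmap.c changes above do two things: they delete the whole nonlinear (remap_file_pages) unmap path, and they add degree/parent bookkeeping so anon_vma_clone() can reuse an almost-empty anon_vma instead of growing an endless chain under constant forking. The anon_vma declaration itself is in include/linux/rmap.h and is not shown here; the fragment below is only an inferred illustration of the two new members.

/* Inferred additions to struct anon_vma (illustrative, not the real header). */
struct anon_vma {
        struct anon_vma *root;          /* existing: root of the anon_vma tree */
        struct anon_vma *parent;        /* new: immediate parent, for degree accounting */
        unsigned degree;                /* new: child anon_vmas plus attached vmas */
        /* ... existing members: rwsem, refcount, rb_root ... */
};

With that counter, anon_vma_clone() can hand dst an existing anon_vma whenever its degree is below two, which is exactly the check added above.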
diff --git a/mm/shmem.c b/mm/shmem.c
index 73ba1df7c8ba..a63031fa3e0c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -191,11 +191,6 @@ static const struct inode_operations shmem_dir_inode_operations;
191static const struct inode_operations shmem_special_inode_operations; 191static const struct inode_operations shmem_special_inode_operations;
192static const struct vm_operations_struct shmem_vm_ops; 192static const struct vm_operations_struct shmem_vm_ops;
193 193
194static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
195 .ra_pages = 0, /* No readahead */
196 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
197};
198
199static LIST_HEAD(shmem_swaplist); 194static LIST_HEAD(shmem_swaplist);
200static DEFINE_MUTEX(shmem_swaplist_mutex); 195static DEFINE_MUTEX(shmem_swaplist_mutex);
201 196
@@ -765,11 +760,11 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
765 goto redirty; 760 goto redirty;
766 761
767 /* 762 /*
768 * shmem_backing_dev_info's capabilities prevent regular writeback or 763 * Our capabilities prevent regular writeback or sync from ever calling
769 * sync from ever calling shmem_writepage; but a stacking filesystem 764 * shmem_writepage; but a stacking filesystem might use ->writepage of
770 * might use ->writepage of its underlying filesystem, in which case 765 * its underlying filesystem, in which case tmpfs should write out to
771 * tmpfs should write out to swap only in response to memory pressure, 766 * swap only in response to memory pressure, and not for the writeback
772 * and not for the writeback threads or sync. 767 * threads or sync.
773 */ 768 */
774 if (!wbc->for_reclaim) { 769 if (!wbc->for_reclaim) {
775 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ 770 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
@@ -1013,7 +1008,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1013 */ 1008 */
1014 oldpage = newpage; 1009 oldpage = newpage;
1015 } else { 1010 } else {
1016 mem_cgroup_migrate(oldpage, newpage, false); 1011 mem_cgroup_migrate(oldpage, newpage, true);
1017 lru_cache_add_anon(newpage); 1012 lru_cache_add_anon(newpage);
1018 *pagep = newpage; 1013 *pagep = newpage;
1019 } 1014 }
@@ -1131,7 +1126,7 @@ repeat:
1131 * truncated or holepunched since swap was confirmed. 1126 * truncated or holepunched since swap was confirmed.
1132 * shmem_undo_range() will have done some of the 1127 * shmem_undo_range() will have done some of the
1133 * unaccounting, now delete_from_swap_cache() will do 1128 * unaccounting, now delete_from_swap_cache() will do
1134 * the rest (including mem_cgroup_uncharge_swapcache). 1129 * the rest.
1135 * Reset swap.val? No, leave it so "failed" goes back to 1130 * Reset swap.val? No, leave it so "failed" goes back to
1136 * "repeat": reading a hole and writing should succeed. 1131 * "repeat": reading a hole and writing should succeed.
1137 */ 1132 */
@@ -1415,7 +1410,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1415 inode->i_ino = get_next_ino(); 1410 inode->i_ino = get_next_ino();
1416 inode_init_owner(inode, dir, mode); 1411 inode_init_owner(inode, dir, mode);
1417 inode->i_blocks = 0; 1412 inode->i_blocks = 0;
1418 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1419 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1413 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1420 inode->i_generation = get_seconds(); 1414 inode->i_generation = get_seconds();
1421 info = SHMEM_I(inode); 1415 info = SHMEM_I(inode);
@@ -1461,7 +1455,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1461 1455
1462bool shmem_mapping(struct address_space *mapping) 1456bool shmem_mapping(struct address_space *mapping)
1463{ 1457{
1464 return mapping->backing_dev_info == &shmem_backing_dev_info; 1458 return mapping->host->i_sb->s_op == &shmem_ops;
1465} 1459}
1466 1460
1467#ifdef CONFIG_TMPFS 1461#ifdef CONFIG_TMPFS
@@ -3201,7 +3195,6 @@ static const struct vm_operations_struct shmem_vm_ops = {
3201 .set_policy = shmem_set_policy, 3195 .set_policy = shmem_set_policy,
3202 .get_policy = shmem_get_policy, 3196 .get_policy = shmem_get_policy,
3203#endif 3197#endif
3204 .remap_pages = generic_file_remap_pages,
3205}; 3198};
3206 3199
3207static struct dentry *shmem_mount(struct file_system_type *fs_type, 3200static struct dentry *shmem_mount(struct file_system_type *fs_type,
@@ -3226,10 +3219,6 @@ int __init shmem_init(void)
3226 if (shmem_inode_cachep) 3219 if (shmem_inode_cachep)
3227 return 0; 3220 return 0;
3228 3221
3229 error = bdi_init(&shmem_backing_dev_info);
3230 if (error)
3231 goto out4;
3232
3233 error = shmem_init_inodecache(); 3222 error = shmem_init_inodecache();
3234 if (error) 3223 if (error)
3235 goto out3; 3224 goto out3;
@@ -3253,8 +3242,6 @@ out1:
3253out2: 3242out2:
3254 shmem_destroy_inodecache(); 3243 shmem_destroy_inodecache();
3255out3: 3244out3:
3256 bdi_destroy(&shmem_backing_dev_info);
3257out4:
3258 shm_mnt = ERR_PTR(error); 3245 shm_mnt = ERR_PTR(error);
3259 return error; 3246 return error;
3260} 3247}
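Note: with shmem's private backing_dev_info removed, shmem_mapping() above identifies a tmpfs mapping purely by its superblock operations instead of comparing BDI pointers. A minimal usage sketch, assuming only the in-tree helpers page_mapping() and shmem_mapping(); the wrapper function itself is hypothetical:

#include <linux/mm.h>           /* page_mapping() */
#include <linux/shmem_fs.h>     /* shmem_mapping() */

/* Hypothetical helper: true if @page belongs to a tmpfs/shmem mapping. */
static bool page_is_shmem(struct page *page)
{
        struct address_space *mapping = page_mapping(page);

        return mapping && shmem_mapping(mapping);
}

Callers no longer need a dedicated BDI pointer just to tell tmpfs pages apart from regular page-cache pages.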
diff --git a/mm/slab.c b/mm/slab.c
index 65b5dcb6f671..c4b89eaf4c96 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2382,7 +2382,7 @@ out:
2382 return nr_freed; 2382 return nr_freed;
2383} 2383}
2384 2384
2385int __kmem_cache_shrink(struct kmem_cache *cachep) 2385int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
2386{ 2386{
2387 int ret = 0; 2387 int ret = 0;
2388 int node; 2388 int node;
@@ -2404,7 +2404,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
2404{ 2404{
2405 int i; 2405 int i;
2406 struct kmem_cache_node *n; 2406 struct kmem_cache_node *n;
2407 int rc = __kmem_cache_shrink(cachep); 2407 int rc = __kmem_cache_shrink(cachep, false);
2408 2408
2409 if (rc) 2409 if (rc)
2410 return rc; 2410 return rc;
@@ -3708,8 +3708,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3708 int batchcount, int shared, gfp_t gfp) 3708 int batchcount, int shared, gfp_t gfp)
3709{ 3709{
3710 int ret; 3710 int ret;
3711 struct kmem_cache *c = NULL; 3711 struct kmem_cache *c;
3712 int i = 0;
3713 3712
3714 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); 3713 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
3715 3714
@@ -3719,12 +3718,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3719 if ((ret < 0) || !is_root_cache(cachep)) 3718 if ((ret < 0) || !is_root_cache(cachep))
3720 return ret; 3719 return ret;
3721 3720
3722 VM_BUG_ON(!mutex_is_locked(&slab_mutex)); 3721 lockdep_assert_held(&slab_mutex);
3723 for_each_memcg_cache_index(i) { 3722 for_each_memcg_cache(c, cachep) {
3724 c = cache_from_memcg_idx(cachep, i); 3723 /* return value determined by the root cache only */
3725 if (c) 3724 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
3726 /* return value determined by the parent cache only */
3727 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
3728 } 3725 }
3729 3726
3730 return ret; 3727 return ret;
diff --git a/mm/slab.h b/mm/slab.h
index 1cf4005482dd..4c3ac12dd644 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -86,8 +86,6 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
86extern void create_boot_cache(struct kmem_cache *, const char *name, 86extern void create_boot_cache(struct kmem_cache *, const char *name,
87 size_t size, unsigned long flags); 87 size_t size, unsigned long flags);
88 88
89struct mem_cgroup;
90
91int slab_unmergeable(struct kmem_cache *s); 89int slab_unmergeable(struct kmem_cache *s);
92struct kmem_cache *find_mergeable(size_t size, size_t align, 90struct kmem_cache *find_mergeable(size_t size, size_t align,
93 unsigned long flags, const char *name, void (*ctor)(void *)); 91 unsigned long flags, const char *name, void (*ctor)(void *));
@@ -140,7 +138,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
140#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) 138#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
141 139
142int __kmem_cache_shutdown(struct kmem_cache *); 140int __kmem_cache_shutdown(struct kmem_cache *);
143int __kmem_cache_shrink(struct kmem_cache *); 141int __kmem_cache_shrink(struct kmem_cache *, bool);
144void slab_kmem_cache_release(struct kmem_cache *); 142void slab_kmem_cache_release(struct kmem_cache *);
145 143
146struct seq_file; 144struct seq_file;
@@ -165,16 +163,27 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
165 size_t count, loff_t *ppos); 163 size_t count, loff_t *ppos);
166 164
167#ifdef CONFIG_MEMCG_KMEM 165#ifdef CONFIG_MEMCG_KMEM
166/*
167 * Iterate over all memcg caches of the given root cache. The caller must hold
168 * slab_mutex.
169 */
170#define for_each_memcg_cache(iter, root) \
171 list_for_each_entry(iter, &(root)->memcg_params.list, \
172 memcg_params.list)
173
174#define for_each_memcg_cache_safe(iter, tmp, root) \
175 list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \
176 memcg_params.list)
177
168static inline bool is_root_cache(struct kmem_cache *s) 178static inline bool is_root_cache(struct kmem_cache *s)
169{ 179{
170 return !s->memcg_params || s->memcg_params->is_root_cache; 180 return s->memcg_params.is_root_cache;
171} 181}
172 182
173static inline bool slab_equal_or_root(struct kmem_cache *s, 183static inline bool slab_equal_or_root(struct kmem_cache *s,
174 struct kmem_cache *p) 184 struct kmem_cache *p)
175{ 185{
176 return (p == s) || 186 return p == s || p == s->memcg_params.root_cache;
177 (s->memcg_params && (p == s->memcg_params->root_cache));
178} 187}
179 188
180/* 189/*
@@ -185,37 +194,30 @@ static inline bool slab_equal_or_root(struct kmem_cache *s,
185static inline const char *cache_name(struct kmem_cache *s) 194static inline const char *cache_name(struct kmem_cache *s)
186{ 195{
187 if (!is_root_cache(s)) 196 if (!is_root_cache(s))
188 return s->memcg_params->root_cache->name; 197 s = s->memcg_params.root_cache;
189 return s->name; 198 return s->name;
190} 199}
191 200
192/* 201/*
193 * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. 202 * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
194 * That said the caller must assure the memcg's cache won't go away. Since once 203 * That said the caller must assure the memcg's cache won't go away by either
195 * created a memcg's cache is destroyed only along with the root cache, it is 204 * taking a css reference to the owner cgroup, or holding the slab_mutex.
196 * true if we are going to allocate from the cache or hold a reference to the
197 * root cache by other means. Otherwise, we should hold either the slab_mutex
198 * or the memcg's slab_caches_mutex while calling this function and accessing
199 * the returned value.
200 */ 205 */
201static inline struct kmem_cache * 206static inline struct kmem_cache *
202cache_from_memcg_idx(struct kmem_cache *s, int idx) 207cache_from_memcg_idx(struct kmem_cache *s, int idx)
203{ 208{
204 struct kmem_cache *cachep; 209 struct kmem_cache *cachep;
205 struct memcg_cache_params *params; 210 struct memcg_cache_array *arr;
206
207 if (!s->memcg_params)
208 return NULL;
209 211
210 rcu_read_lock(); 212 rcu_read_lock();
211 params = rcu_dereference(s->memcg_params); 213 arr = rcu_dereference(s->memcg_params.memcg_caches);
212 214
213 /* 215 /*
214 * Make sure we will access the up-to-date value. The code updating 216 * Make sure we will access the up-to-date value. The code updating
215 * memcg_caches issues a write barrier to match this (see 217 * memcg_caches issues a write barrier to match this (see
216 * memcg_register_cache()). 218 * memcg_create_kmem_cache()).
217 */ 219 */
218 cachep = lockless_dereference(params->memcg_caches[idx]); 220 cachep = lockless_dereference(arr->entries[idx]);
219 rcu_read_unlock(); 221 rcu_read_unlock();
220 222
221 return cachep; 223 return cachep;
@@ -225,7 +227,7 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
225{ 227{
226 if (is_root_cache(s)) 228 if (is_root_cache(s))
227 return s; 229 return s;
228 return s->memcg_params->root_cache; 230 return s->memcg_params.root_cache;
229} 231}
230 232
231static __always_inline int memcg_charge_slab(struct kmem_cache *s, 233static __always_inline int memcg_charge_slab(struct kmem_cache *s,
@@ -235,7 +237,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s,
235 return 0; 237 return 0;
236 if (is_root_cache(s)) 238 if (is_root_cache(s))
237 return 0; 239 return 0;
238 return __memcg_charge_slab(s, gfp, order); 240 return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order);
239} 241}
240 242
241static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) 243static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
@@ -244,9 +246,18 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
244 return; 246 return;
245 if (is_root_cache(s)) 247 if (is_root_cache(s))
246 return; 248 return;
247 __memcg_uncharge_slab(s, order); 249 memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order);
248} 250}
249#else 251
252extern void slab_init_memcg_params(struct kmem_cache *);
253
254#else /* !CONFIG_MEMCG_KMEM */
255
256#define for_each_memcg_cache(iter, root) \
257 for ((void)(iter), (void)(root); 0; )
258#define for_each_memcg_cache_safe(iter, tmp, root) \
259 for ((void)(iter), (void)(tmp), (void)(root); 0; )
260
250static inline bool is_root_cache(struct kmem_cache *s) 261static inline bool is_root_cache(struct kmem_cache *s)
251{ 262{
252 return true; 263 return true;
@@ -282,7 +293,11 @@ static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
282static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) 293static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
283{ 294{
284} 295}
285#endif 296
297static inline void slab_init_memcg_params(struct kmem_cache *s)
298{
299}
300#endif /* CONFIG_MEMCG_KMEM */
286 301
287static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) 302static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
288{ 303{
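Note: for_each_memcg_cache()/for_each_memcg_cache_safe() above replace the index-based for_each_memcg_cache_index() + cache_from_memcg_idx() walks used by the slab allocators. A minimal sketch of the intended calling pattern, assuming slab_mutex is held as the iterator's comment requires; the walker function itself is hypothetical:

/* Hypothetical walker: visit every per-memcg child of a root cache.
 * Must be called with slab_mutex held, per the iterator's contract. */
static void walk_memcg_children(struct kmem_cache *root)
{
        struct kmem_cache *c;

        lockdep_assert_held(&slab_mutex);
        for_each_memcg_cache(c, root)
                pr_info("child of %s: %s\n", root->name, cache_name(c));
}

The mm/slab.c and mm/slub.c hunks in this patch (do_tune_cpucache(), __kmem_cache_alias(), slab_attr_store()) follow the same pattern.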
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e03dd6f2a272..999bb3424d44 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -106,62 +106,67 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
106#endif 106#endif
107 107
108#ifdef CONFIG_MEMCG_KMEM 108#ifdef CONFIG_MEMCG_KMEM
109static int memcg_alloc_cache_params(struct mem_cgroup *memcg, 109void slab_init_memcg_params(struct kmem_cache *s)
110 struct kmem_cache *s, struct kmem_cache *root_cache)
111{ 110{
112 size_t size; 111 s->memcg_params.is_root_cache = true;
112 INIT_LIST_HEAD(&s->memcg_params.list);
113 RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
114}
115
116static int init_memcg_params(struct kmem_cache *s,
117 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
118{
119 struct memcg_cache_array *arr;
113 120
114 if (!memcg_kmem_enabled()) 121 if (memcg) {
122 s->memcg_params.is_root_cache = false;
123 s->memcg_params.memcg = memcg;
124 s->memcg_params.root_cache = root_cache;
115 return 0; 125 return 0;
126 }
116 127
117 if (!memcg) { 128 slab_init_memcg_params(s);
118 size = offsetof(struct memcg_cache_params, memcg_caches);
119 size += memcg_limited_groups_array_size * sizeof(void *);
120 } else
121 size = sizeof(struct memcg_cache_params);
122 129
123 s->memcg_params = kzalloc(size, GFP_KERNEL); 130 if (!memcg_nr_cache_ids)
124 if (!s->memcg_params) 131 return 0;
125 return -ENOMEM;
126 132
127 if (memcg) { 133 arr = kzalloc(sizeof(struct memcg_cache_array) +
128 s->memcg_params->memcg = memcg; 134 memcg_nr_cache_ids * sizeof(void *),
129 s->memcg_params->root_cache = root_cache; 135 GFP_KERNEL);
130 } else 136 if (!arr)
131 s->memcg_params->is_root_cache = true; 137 return -ENOMEM;
132 138
139 RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
133 return 0; 140 return 0;
134} 141}
135 142
136static void memcg_free_cache_params(struct kmem_cache *s) 143static void destroy_memcg_params(struct kmem_cache *s)
137{ 144{
138 kfree(s->memcg_params); 145 if (is_root_cache(s))
146 kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
139} 147}
140 148
141static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) 149static int update_memcg_params(struct kmem_cache *s, int new_array_size)
142{ 150{
143 int size; 151 struct memcg_cache_array *old, *new;
144 struct memcg_cache_params *new_params, *cur_params;
145
146 BUG_ON(!is_root_cache(s));
147 152
148 size = offsetof(struct memcg_cache_params, memcg_caches); 153 if (!is_root_cache(s))
149 size += num_memcgs * sizeof(void *); 154 return 0;
150 155
151 new_params = kzalloc(size, GFP_KERNEL); 156 new = kzalloc(sizeof(struct memcg_cache_array) +
152 if (!new_params) 157 new_array_size * sizeof(void *), GFP_KERNEL);
158 if (!new)
153 return -ENOMEM; 159 return -ENOMEM;
154 160
155 cur_params = s->memcg_params; 161 old = rcu_dereference_protected(s->memcg_params.memcg_caches,
156 memcpy(new_params->memcg_caches, cur_params->memcg_caches, 162 lockdep_is_held(&slab_mutex));
157 memcg_limited_groups_array_size * sizeof(void *)); 163 if (old)
158 164 memcpy(new->entries, old->entries,
159 new_params->is_root_cache = true; 165 memcg_nr_cache_ids * sizeof(void *));
160
161 rcu_assign_pointer(s->memcg_params, new_params);
162 if (cur_params)
163 kfree_rcu(cur_params, rcu_head);
164 166
167 rcu_assign_pointer(s->memcg_params.memcg_caches, new);
168 if (old)
169 kfree_rcu(old, rcu);
165 return 0; 170 return 0;
166} 171}
167 172
@@ -169,34 +174,28 @@ int memcg_update_all_caches(int num_memcgs)
169{ 174{
170 struct kmem_cache *s; 175 struct kmem_cache *s;
171 int ret = 0; 176 int ret = 0;
172 mutex_lock(&slab_mutex);
173 177
178 mutex_lock(&slab_mutex);
174 list_for_each_entry(s, &slab_caches, list) { 179 list_for_each_entry(s, &slab_caches, list) {
175 if (!is_root_cache(s)) 180 ret = update_memcg_params(s, num_memcgs);
176 continue;
177
178 ret = memcg_update_cache_params(s, num_memcgs);
179 /* 181 /*
180 * Instead of freeing the memory, we'll just leave the caches 182 * Instead of freeing the memory, we'll just leave the caches
181 * up to this point in an updated state. 183 * up to this point in an updated state.
182 */ 184 */
183 if (ret) 185 if (ret)
184 goto out; 186 break;
185 } 187 }
186
187 memcg_update_array_size(num_memcgs);
188out:
189 mutex_unlock(&slab_mutex); 188 mutex_unlock(&slab_mutex);
190 return ret; 189 return ret;
191} 190}
192#else 191#else
193static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, 192static inline int init_memcg_params(struct kmem_cache *s,
194 struct kmem_cache *s, struct kmem_cache *root_cache) 193 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
195{ 194{
196 return 0; 195 return 0;
197} 196}
198 197
199static inline void memcg_free_cache_params(struct kmem_cache *s) 198static inline void destroy_memcg_params(struct kmem_cache *s)
200{ 199{
201} 200}
202#endif /* CONFIG_MEMCG_KMEM */ 201#endif /* CONFIG_MEMCG_KMEM */
@@ -296,8 +295,8 @@ unsigned long calculate_alignment(unsigned long flags,
296} 295}
297 296
298static struct kmem_cache * 297static struct kmem_cache *
299do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, 298do_kmem_cache_create(const char *name, size_t object_size, size_t size,
300 unsigned long flags, void (*ctor)(void *), 299 size_t align, unsigned long flags, void (*ctor)(void *),
301 struct mem_cgroup *memcg, struct kmem_cache *root_cache) 300 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
302{ 301{
303 struct kmem_cache *s; 302 struct kmem_cache *s;
@@ -314,7 +313,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
314 s->align = align; 313 s->align = align;
315 s->ctor = ctor; 314 s->ctor = ctor;
316 315
317 err = memcg_alloc_cache_params(memcg, s, root_cache); 316 err = init_memcg_params(s, memcg, root_cache);
318 if (err) 317 if (err)
319 goto out_free_cache; 318 goto out_free_cache;
320 319
@@ -330,8 +329,8 @@ out:
330 return s; 329 return s;
331 330
332out_free_cache: 331out_free_cache:
333 memcg_free_cache_params(s); 332 destroy_memcg_params(s);
334 kfree(s); 333 kmem_cache_free(kmem_cache, s);
335 goto out; 334 goto out;
336} 335}
337 336
@@ -364,11 +363,12 @@ kmem_cache_create(const char *name, size_t size, size_t align,
364 unsigned long flags, void (*ctor)(void *)) 363 unsigned long flags, void (*ctor)(void *))
365{ 364{
366 struct kmem_cache *s; 365 struct kmem_cache *s;
367 char *cache_name; 366 const char *cache_name;
368 int err; 367 int err;
369 368
370 get_online_cpus(); 369 get_online_cpus();
371 get_online_mems(); 370 get_online_mems();
371 memcg_get_cache_ids();
372 372
373 mutex_lock(&slab_mutex); 373 mutex_lock(&slab_mutex);
374 374
@@ -390,7 +390,7 @@ kmem_cache_create(const char *name, size_t size, size_t align,
390 if (s) 390 if (s)
391 goto out_unlock; 391 goto out_unlock;
392 392
393 cache_name = kstrdup(name, GFP_KERNEL); 393 cache_name = kstrdup_const(name, GFP_KERNEL);
394 if (!cache_name) { 394 if (!cache_name) {
395 err = -ENOMEM; 395 err = -ENOMEM;
396 goto out_unlock; 396 goto out_unlock;
@@ -401,12 +401,13 @@ kmem_cache_create(const char *name, size_t size, size_t align,
401 flags, ctor, NULL, NULL); 401 flags, ctor, NULL, NULL);
402 if (IS_ERR(s)) { 402 if (IS_ERR(s)) {
403 err = PTR_ERR(s); 403 err = PTR_ERR(s);
404 kfree(cache_name); 404 kfree_const(cache_name);
405 } 405 }
406 406
407out_unlock: 407out_unlock:
408 mutex_unlock(&slab_mutex); 408 mutex_unlock(&slab_mutex);
409 409
410 memcg_put_cache_ids();
410 put_online_mems(); 411 put_online_mems();
411 put_online_cpus(); 412 put_online_cpus();
412 413
@@ -425,31 +426,91 @@ out_unlock:
425} 426}
426EXPORT_SYMBOL(kmem_cache_create); 427EXPORT_SYMBOL(kmem_cache_create);
427 428
429static int do_kmem_cache_shutdown(struct kmem_cache *s,
430 struct list_head *release, bool *need_rcu_barrier)
431{
432 if (__kmem_cache_shutdown(s) != 0) {
433 printk(KERN_ERR "kmem_cache_destroy %s: "
434 "Slab cache still has objects\n", s->name);
435 dump_stack();
436 return -EBUSY;
437 }
438
439 if (s->flags & SLAB_DESTROY_BY_RCU)
440 *need_rcu_barrier = true;
441
442#ifdef CONFIG_MEMCG_KMEM
443 if (!is_root_cache(s))
444 list_del(&s->memcg_params.list);
445#endif
446 list_move(&s->list, release);
447 return 0;
448}
449
450static void do_kmem_cache_release(struct list_head *release,
451 bool need_rcu_barrier)
452{
453 struct kmem_cache *s, *s2;
454
455 if (need_rcu_barrier)
456 rcu_barrier();
457
458 list_for_each_entry_safe(s, s2, release, list) {
459#ifdef SLAB_SUPPORTS_SYSFS
460 sysfs_slab_remove(s);
461#else
462 slab_kmem_cache_release(s);
463#endif
464 }
465}
466
428#ifdef CONFIG_MEMCG_KMEM 467#ifdef CONFIG_MEMCG_KMEM
429/* 468/*
430 * memcg_create_kmem_cache - Create a cache for a memory cgroup. 469 * memcg_create_kmem_cache - Create a cache for a memory cgroup.
431 * @memcg: The memory cgroup the new cache is for. 470 * @memcg: The memory cgroup the new cache is for.
432 * @root_cache: The parent of the new cache. 471 * @root_cache: The parent of the new cache.
433 * @memcg_name: The name of the memory cgroup (used for naming the new cache).
434 * 472 *
435 * This function attempts to create a kmem cache that will serve allocation 473 * This function attempts to create a kmem cache that will serve allocation
436 * requests going from @memcg to @root_cache. The new cache inherits properties 474 * requests going from @memcg to @root_cache. The new cache inherits properties
437 * from its parent. 475 * from its parent.
438 */ 476 */
439struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, 477void memcg_create_kmem_cache(struct mem_cgroup *memcg,
440 struct kmem_cache *root_cache, 478 struct kmem_cache *root_cache)
441 const char *memcg_name)
442{ 479{
480 static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
481 struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
482 struct memcg_cache_array *arr;
443 struct kmem_cache *s = NULL; 483 struct kmem_cache *s = NULL;
444 char *cache_name; 484 char *cache_name;
485 int idx;
445 486
446 get_online_cpus(); 487 get_online_cpus();
447 get_online_mems(); 488 get_online_mems();
448 489
449 mutex_lock(&slab_mutex); 490 mutex_lock(&slab_mutex);
450 491
492 /*
493 * The memory cgroup could have been deactivated while the cache
494 * creation work was pending.
495 */
496 if (!memcg_kmem_is_active(memcg))
497 goto out_unlock;
498
499 idx = memcg_cache_id(memcg);
500 arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
501 lockdep_is_held(&slab_mutex));
502
503 /*
504 * Since per-memcg caches are created asynchronously on first
505 * allocation (see memcg_kmem_get_cache()), several threads can try to
506 * create the same cache, but only one of them may succeed.
507 */
508 if (arr->entries[idx])
509 goto out_unlock;
510
511 cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
451 cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, 512 cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
452 memcg_cache_id(memcg), memcg_name); 513 css->id, memcg_name_buf);
453 if (!cache_name) 514 if (!cache_name)
454 goto out_unlock; 515 goto out_unlock;
455 516
@@ -457,49 +518,108 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
457 root_cache->size, root_cache->align, 518 root_cache->size, root_cache->align,
458 root_cache->flags, root_cache->ctor, 519 root_cache->flags, root_cache->ctor,
459 memcg, root_cache); 520 memcg, root_cache);
521 /*
522 * If we could not create a memcg cache, do not complain, because
523 * that's not critical at all as we can always proceed with the root
524 * cache.
525 */
460 if (IS_ERR(s)) { 526 if (IS_ERR(s)) {
461 kfree(cache_name); 527 kfree(cache_name);
462 s = NULL; 528 goto out_unlock;
463 } 529 }
464 530
531 list_add(&s->memcg_params.list, &root_cache->memcg_params.list);
532
533 /*
534 * Since readers won't lock (see cache_from_memcg_idx()), we need a
535 * barrier here to ensure nobody will see the kmem_cache partially
536 * initialized.
537 */
538 smp_wmb();
539 arr->entries[idx] = s;
540
465out_unlock: 541out_unlock:
466 mutex_unlock(&slab_mutex); 542 mutex_unlock(&slab_mutex);
467 543
468 put_online_mems(); 544 put_online_mems();
469 put_online_cpus(); 545 put_online_cpus();
470
471 return s;
472} 546}
473 547
474static int memcg_cleanup_cache_params(struct kmem_cache *s) 548void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
475{ 549{
476 int rc; 550 int idx;
551 struct memcg_cache_array *arr;
552 struct kmem_cache *s, *c;
477 553
478 if (!s->memcg_params || 554 idx = memcg_cache_id(memcg);
479 !s->memcg_params->is_root_cache) 555
480 return 0; 556 get_online_cpus();
557 get_online_mems();
481 558
482 mutex_unlock(&slab_mutex);
483 rc = __memcg_cleanup_cache_params(s);
484 mutex_lock(&slab_mutex); 559 mutex_lock(&slab_mutex);
560 list_for_each_entry(s, &slab_caches, list) {
561 if (!is_root_cache(s))
562 continue;
563
564 arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
565 lockdep_is_held(&slab_mutex));
566 c = arr->entries[idx];
567 if (!c)
568 continue;
569
570 __kmem_cache_shrink(c, true);
571 arr->entries[idx] = NULL;
572 }
573 mutex_unlock(&slab_mutex);
485 574
486 return rc; 575 put_online_mems();
576 put_online_cpus();
487} 577}
488#else 578
489static int memcg_cleanup_cache_params(struct kmem_cache *s) 579void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
490{ 580{
491 return 0; 581 LIST_HEAD(release);
582 bool need_rcu_barrier = false;
583 struct kmem_cache *s, *s2;
584
585 get_online_cpus();
586 get_online_mems();
587
588 mutex_lock(&slab_mutex);
589 list_for_each_entry_safe(s, s2, &slab_caches, list) {
590 if (is_root_cache(s) || s->memcg_params.memcg != memcg)
591 continue;
592 /*
593 * The cgroup is about to be freed and therefore has no charges
594 * left. Hence, all its caches must be empty by now.
595 */
596 BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier));
597 }
598 mutex_unlock(&slab_mutex);
599
600 put_online_mems();
601 put_online_cpus();
602
603 do_kmem_cache_release(&release, need_rcu_barrier);
492} 604}
493#endif /* CONFIG_MEMCG_KMEM */ 605#endif /* CONFIG_MEMCG_KMEM */
494 606
495void slab_kmem_cache_release(struct kmem_cache *s) 607void slab_kmem_cache_release(struct kmem_cache *s)
496{ 608{
497 kfree(s->name); 609 destroy_memcg_params(s);
610 kfree_const(s->name);
498 kmem_cache_free(kmem_cache, s); 611 kmem_cache_free(kmem_cache, s);
499} 612}
500 613
501void kmem_cache_destroy(struct kmem_cache *s) 614void kmem_cache_destroy(struct kmem_cache *s)
502{ 615{
616 struct kmem_cache *c, *c2;
617 LIST_HEAD(release);
618 bool need_rcu_barrier = false;
619 bool busy = false;
620
621 BUG_ON(!is_root_cache(s));
622
503 get_online_cpus(); 623 get_online_cpus();
504 get_online_mems(); 624 get_online_mems();
505 625
@@ -509,35 +629,21 @@ void kmem_cache_destroy(struct kmem_cache *s)
509 if (s->refcount) 629 if (s->refcount)
510 goto out_unlock; 630 goto out_unlock;
511 631
512 if (memcg_cleanup_cache_params(s) != 0) 632 for_each_memcg_cache_safe(c, c2, s) {
513 goto out_unlock; 633 if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
514 634 busy = true;
515 if (__kmem_cache_shutdown(s) != 0) {
516 printk(KERN_ERR "kmem_cache_destroy %s: "
517 "Slab cache still has objects\n", s->name);
518 dump_stack();
519 goto out_unlock;
520 } 635 }
521 636
522 list_del(&s->list); 637 if (!busy)
523 638 do_kmem_cache_shutdown(s, &release, &need_rcu_barrier);
524 mutex_unlock(&slab_mutex);
525 if (s->flags & SLAB_DESTROY_BY_RCU)
526 rcu_barrier();
527
528 memcg_free_cache_params(s);
529#ifdef SLAB_SUPPORTS_SYSFS
530 sysfs_slab_remove(s);
531#else
532 slab_kmem_cache_release(s);
533#endif
534 goto out;
535 639
536out_unlock: 640out_unlock:
537 mutex_unlock(&slab_mutex); 641 mutex_unlock(&slab_mutex);
538out: 642
539 put_online_mems(); 643 put_online_mems();
540 put_online_cpus(); 644 put_online_cpus();
645
646 do_kmem_cache_release(&release, need_rcu_barrier);
541} 647}
542EXPORT_SYMBOL(kmem_cache_destroy); 648EXPORT_SYMBOL(kmem_cache_destroy);
543 649
@@ -554,7 +660,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
554 660
555 get_online_cpus(); 661 get_online_cpus();
556 get_online_mems(); 662 get_online_mems();
557 ret = __kmem_cache_shrink(cachep); 663 ret = __kmem_cache_shrink(cachep, false);
558 put_online_mems(); 664 put_online_mems();
559 put_online_cpus(); 665 put_online_cpus();
560 return ret; 666 return ret;
@@ -576,6 +682,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
576 s->name = name; 682 s->name = name;
577 s->size = s->object_size = size; 683 s->size = s->object_size = size;
578 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); 684 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
685
686 slab_init_memcg_params(s);
687
579 err = __kmem_cache_create(s, flags); 688 err = __kmem_cache_create(s, flags);
580 689
581 if (err) 690 if (err)
@@ -789,6 +898,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
789 page = alloc_kmem_pages(flags, order); 898 page = alloc_kmem_pages(flags, order);
790 ret = page ? page_address(page) : NULL; 899 ret = page ? page_address(page) : NULL;
791 kmemleak_alloc(ret, size, 1, flags); 900 kmemleak_alloc(ret, size, 1, flags);
901 kasan_kmalloc_large(ret, size);
792 return ret; 902 return ret;
793} 903}
794EXPORT_SYMBOL(kmalloc_order); 904EXPORT_SYMBOL(kmalloc_order);
@@ -855,16 +965,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
855{ 965{
856 struct kmem_cache *c; 966 struct kmem_cache *c;
857 struct slabinfo sinfo; 967 struct slabinfo sinfo;
858 int i;
859 968
860 if (!is_root_cache(s)) 969 if (!is_root_cache(s))
861 return; 970 return;
862 971
863 for_each_memcg_cache_index(i) { 972 for_each_memcg_cache(c, s) {
864 c = cache_from_memcg_idx(s, i);
865 if (!c)
866 continue;
867
868 memset(&sinfo, 0, sizeof(sinfo)); 973 memset(&sinfo, 0, sizeof(sinfo));
869 get_slabinfo(c, &sinfo); 974 get_slabinfo(c, &sinfo);
870 975
@@ -916,7 +1021,7 @@ int memcg_slab_show(struct seq_file *m, void *p)
916 1021
917 if (p == slab_caches.next) 1022 if (p == slab_caches.next)
918 print_slabinfo_header(m); 1023 print_slabinfo_header(m);
919 if (!is_root_cache(s) && s->memcg_params->memcg == memcg) 1024 if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
920 cache_show(s, m); 1025 cache_show(s, m);
921 return 0; 1026 return 0;
922} 1027}
@@ -973,8 +1078,10 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
973 if (p) 1078 if (p)
974 ks = ksize(p); 1079 ks = ksize(p);
975 1080
976 if (ks >= new_size) 1081 if (ks >= new_size) {
1082 kasan_krealloc((void *)p, new_size);
977 return (void *)p; 1083 return (void *)p;
1084 }
978 1085
979 ret = kmalloc_track_caller(new_size, flags); 1086 ret = kmalloc_track_caller(new_size, flags);
980 if (ret && p) 1087 if (ret && p)
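Note: update_memcg_params() above grows the memcg_caches array with the usual RCU publish/retire idiom: allocate the larger array, copy the old entries, publish it with rcu_assign_pointer(), and retire the old array with kfree_rcu() so that lockless readers (cache_from_memcg_idx()) never see freed memory. A generic sketch of the same idiom with hypothetical names; a spinlock stands in for slab_mutex and the new size is assumed to be at least the old one:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/string.h>

struct sketch_array {
        struct rcu_head rcu;
        void *entries[];
};

/* Hypothetical resize of an RCU-protected pointer array from @old_n to
 * @new_n entries (@new_n >= @old_n). @lock serializes writers; readers
 * use rcu_read_lock() + rcu_dereference() on *slot. */
static int sketch_resize(struct sketch_array __rcu **slot,
                         int old_n, int new_n, spinlock_t *lock)
{
        struct sketch_array *old, *new;

        new = kzalloc(sizeof(*new) + new_n * sizeof(void *), GFP_KERNEL);
        if (!new)
                return -ENOMEM;

        spin_lock(lock);
        old = rcu_dereference_protected(*slot, lockdep_is_held(lock));
        if (old)
                memcpy(new->entries, old->entries, old_n * sizeof(void *));
        rcu_assign_pointer(*slot, new);
        spin_unlock(lock);

        if (old)
                kfree_rcu(old, rcu);    /* free only after a grace period */
        return 0;
}

memcg_update_all_caches() simply applies this resize to every root cache on slab_caches while holding slab_mutex.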
diff --git a/mm/slob.c b/mm/slob.c
index 96a86206a26b..94a7fede6d48 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -618,7 +618,7 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
618 return 0; 618 return 0;
619} 619}
620 620
621int __kmem_cache_shrink(struct kmem_cache *d) 621int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
622{ 622{
623 return 0; 623 return 0;
624} 624}
diff --git a/mm/slub.c b/mm/slub.c
index fe376fe1f4fe..6832c4eab104 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -20,6 +20,7 @@
20#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
21#include <linux/notifier.h> 21#include <linux/notifier.h>
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/kasan.h>
23#include <linux/kmemcheck.h> 24#include <linux/kmemcheck.h>
24#include <linux/cpu.h> 25#include <linux/cpu.h>
25#include <linux/cpuset.h> 26#include <linux/cpuset.h>
@@ -468,12 +469,30 @@ static char *slub_debug_slabs;
468static int disable_higher_order_debug; 469static int disable_higher_order_debug;
469 470
470/* 471/*
472 * slub is about to manipulate internal object metadata. This memory lies
473 * outside the range of the allocated object, so accessing it would normally
474 * be reported by kasan as a bounds error. metadata_access_enable() is used
475 * to tell kasan that these accesses are OK.
476 */
477static inline void metadata_access_enable(void)
478{
479 kasan_disable_current();
480}
481
482static inline void metadata_access_disable(void)
483{
484 kasan_enable_current();
485}
486
487/*
471 * Object debugging 488 * Object debugging
472 */ 489 */
473static void print_section(char *text, u8 *addr, unsigned int length) 490static void print_section(char *text, u8 *addr, unsigned int length)
474{ 491{
492 metadata_access_enable();
475 print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, 493 print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
476 length, 1); 494 length, 1);
495 metadata_access_disable();
477} 496}
478 497
479static struct track *get_track(struct kmem_cache *s, void *object, 498static struct track *get_track(struct kmem_cache *s, void *object,
@@ -503,7 +522,9 @@ static void set_track(struct kmem_cache *s, void *object,
503 trace.max_entries = TRACK_ADDRS_COUNT; 522 trace.max_entries = TRACK_ADDRS_COUNT;
504 trace.entries = p->addrs; 523 trace.entries = p->addrs;
505 trace.skip = 3; 524 trace.skip = 3;
525 metadata_access_enable();
506 save_stack_trace(&trace); 526 save_stack_trace(&trace);
527 metadata_access_disable();
507 528
508 /* See rant in lockdep.c */ 529 /* See rant in lockdep.c */
509 if (trace.nr_entries != 0 && 530 if (trace.nr_entries != 0 &&
@@ -629,7 +650,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
629 dump_stack(); 650 dump_stack();
630} 651}
631 652
632static void object_err(struct kmem_cache *s, struct page *page, 653void object_err(struct kmem_cache *s, struct page *page,
633 u8 *object, char *reason) 654 u8 *object, char *reason)
634{ 655{
635 slab_bug(s, "%s", reason); 656 slab_bug(s, "%s", reason);
@@ -677,7 +698,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
677 u8 *fault; 698 u8 *fault;
678 u8 *end; 699 u8 *end;
679 700
701 metadata_access_enable();
680 fault = memchr_inv(start, value, bytes); 702 fault = memchr_inv(start, value, bytes);
703 metadata_access_disable();
681 if (!fault) 704 if (!fault)
682 return 1; 705 return 1;
683 706
@@ -770,7 +793,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
770 if (!remainder) 793 if (!remainder)
771 return 1; 794 return 1;
772 795
796 metadata_access_enable();
773 fault = memchr_inv(end - remainder, POISON_INUSE, remainder); 797 fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
798 metadata_access_disable();
774 if (!fault) 799 if (!fault)
775 return 1; 800 return 1;
776 while (end > fault && end[-1] == POISON_INUSE) 801 while (end > fault && end[-1] == POISON_INUSE)
@@ -1226,11 +1251,13 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
1226static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) 1251static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
1227{ 1252{
1228 kmemleak_alloc(ptr, size, 1, flags); 1253 kmemleak_alloc(ptr, size, 1, flags);
1254 kasan_kmalloc_large(ptr, size);
1229} 1255}
1230 1256
1231static inline void kfree_hook(const void *x) 1257static inline void kfree_hook(const void *x)
1232{ 1258{
1233 kmemleak_free(x); 1259 kmemleak_free(x);
1260 kasan_kfree_large(x);
1234} 1261}
1235 1262
1236static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, 1263static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
@@ -1253,6 +1280,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
1253 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 1280 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
1254 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); 1281 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
1255 memcg_kmem_put_cache(s); 1282 memcg_kmem_put_cache(s);
1283 kasan_slab_alloc(s, object);
1256} 1284}
1257 1285
1258static inline void slab_free_hook(struct kmem_cache *s, void *x) 1286static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -1276,6 +1304,8 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
1276#endif 1304#endif
1277 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1305 if (!(s->flags & SLAB_DEBUG_OBJECTS))
1278 debug_check_no_obj_freed(x, s->object_size); 1306 debug_check_no_obj_freed(x, s->object_size);
1307
1308 kasan_slab_free(s, x);
1279} 1309}
1280 1310
1281/* 1311/*
@@ -1370,8 +1400,11 @@ static void setup_object(struct kmem_cache *s, struct page *page,
1370 void *object) 1400 void *object)
1371{ 1401{
1372 setup_object_debug(s, page, object); 1402 setup_object_debug(s, page, object);
1373 if (unlikely(s->ctor)) 1403 if (unlikely(s->ctor)) {
1404 kasan_unpoison_object_data(s, object);
1374 s->ctor(object); 1405 s->ctor(object);
1406 kasan_poison_object_data(s, object);
1407 }
1375} 1408}
1376 1409
1377static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1410static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1404,6 +1437,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1404 if (unlikely(s->flags & SLAB_POISON)) 1437 if (unlikely(s->flags & SLAB_POISON))
1405 memset(start, POISON_INUSE, PAGE_SIZE << order); 1438 memset(start, POISON_INUSE, PAGE_SIZE << order);
1406 1439
1440 kasan_poison_slab(page);
1441
1407 for_each_object_idx(p, idx, s, start, page->objects) { 1442 for_each_object_idx(p, idx, s, start, page->objects) {
1408 setup_object(s, page, p); 1443 setup_object(s, page, p);
1409 if (likely(idx < page->objects)) 1444 if (likely(idx < page->objects))
@@ -2007,6 +2042,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2007 int pages; 2042 int pages;
2008 int pobjects; 2043 int pobjects;
2009 2044
2045 preempt_disable();
2010 do { 2046 do {
2011 pages = 0; 2047 pages = 0;
2012 pobjects = 0; 2048 pobjects = 0;
@@ -2040,6 +2076,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2040 2076
2041 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) 2077 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
2042 != oldpage); 2078 != oldpage);
2079 if (unlikely(!s->cpu_partial)) {
2080 unsigned long flags;
2081
2082 local_irq_save(flags);
2083 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2084 local_irq_restore(flags);
2085 }
2086 preempt_enable();
2043#endif 2087#endif
2044} 2088}
2045 2089
@@ -2398,13 +2442,24 @@ redo:
2398 * reading from one cpu area. That does not matter as long 2442 * reading from one cpu area. That does not matter as long
2399 * as we end up on the original cpu again when doing the cmpxchg. 2443 * as we end up on the original cpu again when doing the cmpxchg.
2400 * 2444 *
2401 * Preemption is disabled for the retrieval of the tid because that 2445 * We should guarantee that tid and kmem_cache are retrieved on
2402 * must occur from the current processor. We cannot allow rescheduling 2446 * the same cpu. It could be different if CONFIG_PREEMPT so we need
2403 * on a different processor between the determination of the pointer 2447 * to check if it is matched or not.
2404 * and the retrieval of the tid.
2405 */ 2448 */
2406 preempt_disable(); 2449 do {
2407 c = this_cpu_ptr(s->cpu_slab); 2450 tid = this_cpu_read(s->cpu_slab->tid);
2451 c = raw_cpu_ptr(s->cpu_slab);
2452 } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
2453
2454 /*
2455 * Irqless object alloc/free algorithm used here depends on sequence
2456 * of fetching cpu_slab's data. tid should be fetched before anything
2457 * on c to guarantee that object and page associated with previous tid
2458 * won't be used with current tid. If we fetch tid first, object and
2459 * page could be one associated with next tid and our alloc/free
2460 * request will be failed. In this case, we will retry. So, no problem.
2461 */
2462 barrier();
2408 2463
2409 /* 2464 /*
2410 * The transaction ids are globally unique per cpu and per operation on 2465 * The transaction ids are globally unique per cpu and per operation on
@@ -2412,8 +2467,6 @@ redo:
2412 * occurs on the right processor and that there was no operation on the 2467 * occurs on the right processor and that there was no operation on the
2413 * linked list in between. 2468 * linked list in between.
2414 */ 2469 */
2415 tid = c->tid;
2416 preempt_enable();
2417 2470
2418 object = c->freelist; 2471 object = c->freelist;
2419 page = c->page; 2472 page = c->page;
@@ -2479,6 +2532,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
2479{ 2532{
2480 void *ret = slab_alloc(s, gfpflags, _RET_IP_); 2533 void *ret = slab_alloc(s, gfpflags, _RET_IP_);
2481 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); 2534 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2535 kasan_kmalloc(s, ret, size);
2482 return ret; 2536 return ret;
2483} 2537}
2484EXPORT_SYMBOL(kmem_cache_alloc_trace); 2538EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -2505,6 +2559,8 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
2505 2559
2506 trace_kmalloc_node(_RET_IP_, ret, 2560 trace_kmalloc_node(_RET_IP_, ret,
2507 size, s->size, gfpflags, node); 2561 size, s->size, gfpflags, node);
2562
2563 kasan_kmalloc(s, ret, size);
2508 return ret; 2564 return ret;
2509} 2565}
2510EXPORT_SYMBOL(kmem_cache_alloc_node_trace); 2566EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
@@ -2512,7 +2568,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
2512#endif 2568#endif
2513 2569
2514/* 2570/*
2515 * Slow patch handling. This may still be called frequently since objects 2571 * Slow path handling. This may still be called frequently since objects
2516 * have a longer lifetime than the cpu slabs in most processing loads. 2572 * have a longer lifetime than the cpu slabs in most processing loads.
2517 * 2573 *
2518 * So we still attempt to reduce cache line usage. Just take the slab 2574 * So we still attempt to reduce cache line usage. Just take the slab
@@ -2659,11 +2715,13 @@ redo:
2659 * data is retrieved via this pointer. If we are on the same cpu 2715 * data is retrieved via this pointer. If we are on the same cpu
2660 * during the cmpxchg then the free will succedd. 2716 * during the cmpxchg then the free will succedd.
2661 */ 2717 */
2662 preempt_disable(); 2718 do {
2663 c = this_cpu_ptr(s->cpu_slab); 2719 tid = this_cpu_read(s->cpu_slab->tid);
2720 c = raw_cpu_ptr(s->cpu_slab);
2721 } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
2664 2722
2665 tid = c->tid; 2723 /* Same with comment on barrier() in slab_alloc_node() */
2666 preempt_enable(); 2724 barrier();
2667 2725
2668 if (likely(page == c->page)) { 2726 if (likely(page == c->page)) {
2669 set_freepointer(s, object, c->freelist); 2727 set_freepointer(s, object, c->freelist);
@@ -2888,6 +2946,7 @@ static void early_kmem_cache_node_alloc(int node)
2888 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2946 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2889 init_tracking(kmem_cache_node, n); 2947 init_tracking(kmem_cache_node, n);
2890#endif 2948#endif
2949 kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node));
2891 init_kmem_cache_node(n); 2950 init_kmem_cache_node(n);
2892 inc_slabs_node(kmem_cache_node, node, page->objects); 2951 inc_slabs_node(kmem_cache_node, node, page->objects);
2893 2952
@@ -3260,6 +3319,8 @@ void *__kmalloc(size_t size, gfp_t flags)
3260 3319
3261 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 3320 trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
3262 3321
3322 kasan_kmalloc(s, ret, size);
3323
3263 return ret; 3324 return ret;
3264} 3325}
3265EXPORT_SYMBOL(__kmalloc); 3326EXPORT_SYMBOL(__kmalloc);
@@ -3303,12 +3364,14 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
3303 3364
3304 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); 3365 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
3305 3366
3367 kasan_kmalloc(s, ret, size);
3368
3306 return ret; 3369 return ret;
3307} 3370}
3308EXPORT_SYMBOL(__kmalloc_node); 3371EXPORT_SYMBOL(__kmalloc_node);
3309#endif 3372#endif
3310 3373
3311size_t ksize(const void *object) 3374static size_t __ksize(const void *object)
3312{ 3375{
3313 struct page *page; 3376 struct page *page;
3314 3377
@@ -3324,6 +3387,15 @@ size_t ksize(const void *object)
3324 3387
3325 return slab_ksize(page->slab_cache); 3388 return slab_ksize(page->slab_cache);
3326} 3389}
3390
3391size_t ksize(const void *object)
3392{
3393 size_t size = __ksize(object);
3394 /* We assume that ksize callers could use the whole allocated area,
3395 so we need to unpoison this area. */
3396 kasan_krealloc(object, size);
3397 return size;
3398}
3327EXPORT_SYMBOL(ksize); 3399EXPORT_SYMBOL(ksize);
3328 3400
3329void kfree(const void *x) 3401void kfree(const void *x)
@@ -3347,69 +3419,92 @@ void kfree(const void *x)
3347} 3419}
3348EXPORT_SYMBOL(kfree); 3420EXPORT_SYMBOL(kfree);
3349 3421
3422#define SHRINK_PROMOTE_MAX 32
3423
3350/* 3424/*
3351 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 3425 * kmem_cache_shrink discards empty slabs and promotes the slabs filled
3352 * the remaining slabs by the number of items in use. The slabs with the 3426 * up most to the head of the partial lists. New allocations will then
3353 * most items in use come first. New allocations will then fill those up 3427 * fill those up and thus they can be removed from the partial lists.
3354 * and thus they can be removed from the partial lists.
3355 * 3428 *
3356 * The slabs with the least items are placed last. This results in them 3429 * The slabs with the least items are placed last. This results in them
3357 * being allocated from last increasing the chance that the last objects 3430 * being allocated from last increasing the chance that the last objects
3358 * are freed in them. 3431 * are freed in them.
3359 */ 3432 */
3360int __kmem_cache_shrink(struct kmem_cache *s) 3433int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
3361{ 3434{
3362 int node; 3435 int node;
3363 int i; 3436 int i;
3364 struct kmem_cache_node *n; 3437 struct kmem_cache_node *n;
3365 struct page *page; 3438 struct page *page;
3366 struct page *t; 3439 struct page *t;
3367 int objects = oo_objects(s->max); 3440 struct list_head discard;
3368 struct list_head *slabs_by_inuse = 3441 struct list_head promote[SHRINK_PROMOTE_MAX];
3369 kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
3370 unsigned long flags; 3442 unsigned long flags;
3443 int ret = 0;
3371 3444
3372 if (!slabs_by_inuse) 3445 if (deactivate) {
3373 return -ENOMEM; 3446 /*
3447 * Disable empty slabs caching. Used to avoid pinning offline
3448 * memory cgroups by kmem pages that can be freed.
3449 */
3450 s->cpu_partial = 0;
3451 s->min_partial = 0;
3452
3453 /*
3454 * s->cpu_partial is checked locklessly (see put_cpu_partial),
3455 * so we have to make sure the change is visible.
3456 */
3457 kick_all_cpus_sync();
3458 }
3374 3459
3375 flush_all(s); 3460 flush_all(s);
3376 for_each_kmem_cache_node(s, node, n) { 3461 for_each_kmem_cache_node(s, node, n) {
3377 if (!n->nr_partial) 3462 INIT_LIST_HEAD(&discard);
3378 continue; 3463 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
3379 3464 INIT_LIST_HEAD(promote + i);
3380 for (i = 0; i < objects; i++)
3381 INIT_LIST_HEAD(slabs_by_inuse + i);
3382 3465
3383 spin_lock_irqsave(&n->list_lock, flags); 3466 spin_lock_irqsave(&n->list_lock, flags);
3384 3467
3385 /* 3468 /*
3386 * Build lists indexed by the items in use in each slab. 3469 * Build lists of slabs to discard or promote.
3387 * 3470 *
3388 * Note that concurrent frees may occur while we hold the 3471 * Note that concurrent frees may occur while we hold the
3389 * list_lock. page->inuse here is the upper limit. 3472 * list_lock. page->inuse here is the upper limit.
3390 */ 3473 */
3391 list_for_each_entry_safe(page, t, &n->partial, lru) { 3474 list_for_each_entry_safe(page, t, &n->partial, lru) {
3392 list_move(&page->lru, slabs_by_inuse + page->inuse); 3475 int free = page->objects - page->inuse;
3393 if (!page->inuse) 3476
3477 /* Do not reread page->inuse */
3478 barrier();
3479
3480 /* We do not keep full slabs on the list */
3481 BUG_ON(free <= 0);
3482
3483 if (free == page->objects) {
3484 list_move(&page->lru, &discard);
3394 n->nr_partial--; 3485 n->nr_partial--;
3486 } else if (free <= SHRINK_PROMOTE_MAX)
3487 list_move(&page->lru, promote + free - 1);
3395 } 3488 }
3396 3489
3397 /* 3490 /*
3398 * Rebuild the partial list with the slabs filled up most 3491 * Promote the slabs filled up most to the head of the
3399 * first and the least used slabs at the end. 3492 * partial list.
3400 */ 3493 */
3401 for (i = objects - 1; i > 0; i--) 3494 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
3402 list_splice(slabs_by_inuse + i, n->partial.prev); 3495 list_splice(promote + i, &n->partial);
3403 3496
3404 spin_unlock_irqrestore(&n->list_lock, flags); 3497 spin_unlock_irqrestore(&n->list_lock, flags);
3405 3498
3406 /* Release empty slabs */ 3499 /* Release empty slabs */
3407 list_for_each_entry_safe(page, t, slabs_by_inuse, lru) 3500 list_for_each_entry_safe(page, t, &discard, lru)
3408 discard_slab(s, page); 3501 discard_slab(s, page);
3502
3503 if (slabs_node(s, node))
3504 ret = 1;
3409 } 3505 }
3410 3506
3411 kfree(slabs_by_inuse); 3507 return ret;
3412 return 0;
3413} 3508}
3414 3509
3415static int slab_mem_going_offline_callback(void *arg) 3510static int slab_mem_going_offline_callback(void *arg)
@@ -3418,7 +3513,7 @@ static int slab_mem_going_offline_callback(void *arg)
3418 3513
3419 mutex_lock(&slab_mutex); 3514 mutex_lock(&slab_mutex);
3420 list_for_each_entry(s, &slab_caches, list) 3515 list_for_each_entry(s, &slab_caches, list)
3421 __kmem_cache_shrink(s); 3516 __kmem_cache_shrink(s, false);
3422 mutex_unlock(&slab_mutex); 3517 mutex_unlock(&slab_mutex);
3423 3518
3424 return 0; 3519 return 0;
@@ -3566,6 +3661,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3566 p->slab_cache = s; 3661 p->slab_cache = s;
3567#endif 3662#endif
3568 } 3663 }
3664 slab_init_memcg_params(s);
3569 list_add(&s->list, &slab_caches); 3665 list_add(&s->list, &slab_caches);
3570 return s; 3666 return s;
3571} 3667}
@@ -3624,13 +3720,10 @@ struct kmem_cache *
3624__kmem_cache_alias(const char *name, size_t size, size_t align, 3720__kmem_cache_alias(const char *name, size_t size, size_t align,
3625 unsigned long flags, void (*ctor)(void *)) 3721 unsigned long flags, void (*ctor)(void *))
3626{ 3722{
3627 struct kmem_cache *s; 3723 struct kmem_cache *s, *c;
3628 3724
3629 s = find_mergeable(size, align, flags, name, ctor); 3725 s = find_mergeable(size, align, flags, name, ctor);
3630 if (s) { 3726 if (s) {
3631 int i;
3632 struct kmem_cache *c;
3633
3634 s->refcount++; 3727 s->refcount++;
3635 3728
3636 /* 3729 /*
@@ -3640,10 +3733,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
3640 s->object_size = max(s->object_size, (int)size); 3733 s->object_size = max(s->object_size, (int)size);
3641 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3734 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3642 3735
3643 for_each_memcg_cache_index(i) { 3736 for_each_memcg_cache(c, s) {
3644 c = cache_from_memcg_idx(s, i);
3645 if (!c)
3646 continue;
3647 c->object_size = s->object_size; 3737 c->object_size = s->object_size;
3648 c->inuse = max_t(int, c->inuse, 3738 c->inuse = max_t(int, c->inuse,
3649 ALIGN(size, sizeof(void *))); 3739 ALIGN(size, sizeof(void *)));
@@ -4070,20 +4160,16 @@ static int list_locations(struct kmem_cache *s, char *buf,
4070 4160
4071 if (num_online_cpus() > 1 && 4161 if (num_online_cpus() > 1 &&
4072 !cpumask_empty(to_cpumask(l->cpus)) && 4162 !cpumask_empty(to_cpumask(l->cpus)) &&
4073 len < PAGE_SIZE - 60) { 4163 len < PAGE_SIZE - 60)
4074 len += sprintf(buf + len, " cpus="); 4164 len += scnprintf(buf + len, PAGE_SIZE - len - 50,
4075 len += cpulist_scnprintf(buf + len, 4165 " cpus=%*pbl",
4076 PAGE_SIZE - len - 50, 4166 cpumask_pr_args(to_cpumask(l->cpus)));
4077 to_cpumask(l->cpus));
4078 }
4079 4167
4080 if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && 4168 if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
4081 len < PAGE_SIZE - 60) { 4169 len < PAGE_SIZE - 60)
4082 len += sprintf(buf + len, " nodes="); 4170 len += scnprintf(buf + len, PAGE_SIZE - len - 50,
4083 len += nodelist_scnprintf(buf + len, 4171 " nodes=%*pbl",
4084 PAGE_SIZE - len - 50, 4172 nodemask_pr_args(&l->nodes));
4085 l->nodes);
4086 }
4087 4173
4088 len += sprintf(buf + len, "\n"); 4174 len += sprintf(buf + len, "\n");
4089 } 4175 }
@@ -4680,12 +4766,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4680static ssize_t shrink_store(struct kmem_cache *s, 4766static ssize_t shrink_store(struct kmem_cache *s,
4681 const char *buf, size_t length) 4767 const char *buf, size_t length)
4682{ 4768{
4683 if (buf[0] == '1') { 4769 if (buf[0] == '1')
4684 int rc = kmem_cache_shrink(s); 4770 kmem_cache_shrink(s);
4685 4771 else
4686 if (rc)
4687 return rc;
4688 } else
4689 return -EINVAL; 4772 return -EINVAL;
4690 return length; 4773 return length;
4691} 4774}
@@ -4909,7 +4992,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
4909 err = attribute->store(s, buf, len); 4992 err = attribute->store(s, buf, len);
4910#ifdef CONFIG_MEMCG_KMEM 4993#ifdef CONFIG_MEMCG_KMEM
4911 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { 4994 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
4912 int i; 4995 struct kmem_cache *c;
4913 4996
4914 mutex_lock(&slab_mutex); 4997 mutex_lock(&slab_mutex);
4915 if (s->max_attr_size < len) 4998 if (s->max_attr_size < len)
@@ -4932,11 +5015,8 @@ static ssize_t slab_attr_store(struct kobject *kobj,
4932 * directly either failed or succeeded, in which case we loop 5015 * directly either failed or succeeded, in which case we loop
4933 * through the descendants with best-effort propagation. 5016 * through the descendants with best-effort propagation.
4934 */ 5017 */
4935 for_each_memcg_cache_index(i) { 5018 for_each_memcg_cache(c, s)
4936 struct kmem_cache *c = cache_from_memcg_idx(s, i); 5019 attribute->store(c, buf, len);
4937 if (c)
4938 attribute->store(c, buf, len);
4939 }
4940 mutex_unlock(&slab_mutex); 5020 mutex_unlock(&slab_mutex);
4941 } 5021 }
4942#endif 5022#endif
@@ -4953,7 +5033,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
4953 if (is_root_cache(s)) 5033 if (is_root_cache(s))
4954 return; 5034 return;
4955 5035
4956 root_cache = s->memcg_params->root_cache; 5036 root_cache = s->memcg_params.root_cache;
4957 5037
4958 /* 5038 /*
4959 * This mean this cache had no attribute written. Therefore, no point 5039 * This mean this cache had no attribute written. Therefore, no point
@@ -5033,7 +5113,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s)
5033{ 5113{
5034#ifdef CONFIG_MEMCG_KMEM 5114#ifdef CONFIG_MEMCG_KMEM
5035 if (!is_root_cache(s)) 5115 if (!is_root_cache(s))
5036 return s->memcg_params->root_cache->memcg_kset; 5116 return s->memcg_params.root_cache->memcg_kset;
5037#endif 5117#endif
5038 return slab_kset; 5118 return slab_kset;
5039} 5119}
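Note: the slab_alloc_node()/slab_free() fast paths above drop preempt_disable()/preempt_enable() in favour of re-reading the transaction id until it pairs with the per-cpu structure it was read from. A condensed sketch of just that idiom with hypothetical names; in the real code the snapshot is followed by a this_cpu_cmpxchg_double() on (freelist, tid), which catches any remaining race by failing and retrying:

#include <linux/compiler.h>
#include <linux/percpu.h>

struct sketch_cpu_slab {
        void *freelist;
        unsigned long tid;      /* unique per cpu and per operation */
};

static DEFINE_PER_CPU(struct sketch_cpu_slab, sketch_slab);

/* Hypothetical fast-path prologue: obtain a (tid, per-cpu struct) pair that
 * came from the same CPU without disabling preemption. If we migrate
 * between the two reads, the tids disagree and we simply retry. */
static struct sketch_cpu_slab *sketch_snapshot(unsigned long *tid)
{
        struct sketch_cpu_slab *c;

        do {
                *tid = this_cpu_read(sketch_slab.tid);
                c = raw_cpu_ptr(&sketch_slab);
        } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(*tid != c->tid));

        /* Order the tid read before any later loads through c (see the
         * barrier() comment in slab_alloc_node() above). */
        barrier();
        return c;
}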
diff --git a/mm/swap.c b/mm/swap.c
index 8a12b33936b4..cd3a5e64cea9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1138,12 +1138,8 @@ void __init swap_setup(void)
1138#ifdef CONFIG_SWAP 1138#ifdef CONFIG_SWAP
1139 int i; 1139 int i;
1140 1140
1141 if (bdi_init(swapper_spaces[0].backing_dev_info)) 1141 for (i = 0; i < MAX_SWAPFILES; i++)
1142 panic("Failed to init swap bdi");
1143 for (i = 0; i < MAX_SWAPFILES; i++) {
1144 spin_lock_init(&swapper_spaces[i].tree_lock); 1142 spin_lock_init(&swapper_spaces[i].tree_lock);
1145 INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
1146 }
1147#endif 1143#endif
1148 1144
1149 /* Use a smaller cluster for small-memory machines */ 1145 /* Use a smaller cluster for small-memory machines */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 9711342987a0..405923f77334 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -32,17 +32,11 @@ static const struct address_space_operations swap_aops = {
32#endif 32#endif
33}; 33};
34 34
35static struct backing_dev_info swap_backing_dev_info = {
36 .name = "swap",
37 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
38};
39
40struct address_space swapper_spaces[MAX_SWAPFILES] = { 35struct address_space swapper_spaces[MAX_SWAPFILES] = {
41 [0 ... MAX_SWAPFILES - 1] = { 36 [0 ... MAX_SWAPFILES - 1] = {
42 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 37 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
43 .i_mmap_writable = ATOMIC_INIT(0), 38 .i_mmap_writable = ATOMIC_INIT(0),
44 .a_ops = &swap_aops, 39 .a_ops = &swap_aops,
45 .backing_dev_info = &swap_backing_dev_info,
46 } 40 }
47}; 41};
48 42
diff --git a/mm/truncate.c b/mm/truncate.c
index f1e4d6052369..ddec5a5966d7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -112,7 +112,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
112 struct address_space *mapping = page->mapping; 112 struct address_space *mapping = page->mapping;
113 if (mapping && mapping_cap_account_dirty(mapping)) { 113 if (mapping && mapping_cap_account_dirty(mapping)) {
114 dec_zone_page_state(page, NR_FILE_DIRTY); 114 dec_zone_page_state(page, NR_FILE_DIRTY);
115 dec_bdi_stat(mapping->backing_dev_info, 115 dec_bdi_stat(inode_to_bdi(mapping->host),
116 BDI_RECLAIMABLE); 116 BDI_RECLAIMABLE);
117 if (account_size) 117 if (account_size)
118 task_io_account_cancelled_write(account_size); 118 task_io_account_cancelled_write(account_size);
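Note: the cancel_dirty_page() hunk above is part of the same conversion: per-BDI statistics are now reached through the host inode rather than a cached mapping->backing_dev_info pointer. A one-line sketch of the replacement pattern; the wrapper is hypothetical, while inode_to_bdi() and dec_bdi_stat() are the helpers used above (inode_to_bdi() is assumed to come from <linux/backing-dev.h> as elsewhere in this series):

#include <linux/backing-dev.h>
#include <linux/fs.h>

/* Hypothetical wrapper around the pattern used in cancel_dirty_page(). */
static void sketch_drop_reclaimable(struct address_space *mapping)
{
        dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
}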
diff --git a/mm/util.c b/mm/util.c
index fec39d4509a9..3981ae9d1b15 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -12,10 +12,30 @@
12#include <linux/hugetlb.h> 12#include <linux/hugetlb.h>
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14 14
15#include <asm/sections.h>
15#include <asm/uaccess.h> 16#include <asm/uaccess.h>
16 17
17#include "internal.h" 18#include "internal.h"
18 19
20static inline int is_kernel_rodata(unsigned long addr)
21{
22 return addr >= (unsigned long)__start_rodata &&
23 addr < (unsigned long)__end_rodata;
24}
25
26/**
27 * kfree_const - conditionally free memory
28 * @x: pointer to the memory
29 *
30 * Function calls kfree only if @x is not in .rodata section.
31 */
32void kfree_const(const void *x)
33{
34 if (!is_kernel_rodata((unsigned long)x))
35 kfree(x);
36}
37EXPORT_SYMBOL(kfree_const);
38
19/** 39/**
20 * kstrdup - allocate space for and copy an existing string 40 * kstrdup - allocate space for and copy an existing string
21 * @s: the string to duplicate 41 * @s: the string to duplicate
@@ -38,6 +58,24 @@ char *kstrdup(const char *s, gfp_t gfp)
38EXPORT_SYMBOL(kstrdup); 58EXPORT_SYMBOL(kstrdup);
39 59
40/** 60/**
61 * kstrdup_const - conditionally duplicate an existing const string
62 * @s: the string to duplicate
63 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
64 *
 65 * Function returns the source string if it is in the .rodata section;
 66 * otherwise it falls back to kstrdup.
67 * Strings allocated by kstrdup_const should be freed by kfree_const.
68 */
69const char *kstrdup_const(const char *s, gfp_t gfp)
70{
71 if (is_kernel_rodata((unsigned long)s))
72 return s;
73
74 return kstrdup(s, gfp);
75}
76EXPORT_SYMBOL(kstrdup_const);
77
78/**
41 * kstrndup - allocate space for and copy an existing string 79 * kstrndup - allocate space for and copy an existing string
42 * @s: the string to duplicate 80 * @s: the string to duplicate
43 * @max: read at most @max chars from @s 81 * @max: read at most @max chars from @s
@@ -240,14 +278,8 @@ int __weak get_user_pages_fast(unsigned long start,
240 int nr_pages, int write, struct page **pages) 278 int nr_pages, int write, struct page **pages)
241{ 279{
242 struct mm_struct *mm = current->mm; 280 struct mm_struct *mm = current->mm;
243 int ret; 281 return get_user_pages_unlocked(current, mm, start, nr_pages,
244 282 write, 0, pages);
245 down_read(&mm->mmap_sem);
246 ret = get_user_pages(current, mm, start, nr_pages,
247 write, 0, pages, NULL);
248 up_read(&mm->mmap_sem);
249
250 return ret;
251} 283}
252EXPORT_SYMBOL_GPL(get_user_pages_fast); 284EXPORT_SYMBOL_GPL(get_user_pages_fast);
253 285
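
A minimal usage sketch for the kstrdup_const()/kfree_const() pair added above; the structure and function names are hypothetical, not from the patch.

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>

struct example_obj {
	const char *name;
};

/* "name" may be a string literal living in .rodata or a dynamically
 * built buffer; kstrdup_const() skips the copy for .rodata strings and
 * kfree_const() is the matching release. */
static int example_set_name(struct example_obj *obj, const char *name)
{
	obj->name = kstrdup_const(name, GFP_KERNEL);
	if (!obj->name)
		return -ENOMEM;
	return 0;
}

static void example_put_name(struct example_obj *obj)
{
	kfree_const(obj->name);
}

The point of the pair is that literal and duplicated names can both be released through the same call site.
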
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 39c338896416..35b25e1340ca 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1324,10 +1324,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1324 if (unlikely(!area)) 1324 if (unlikely(!area))
1325 return NULL; 1325 return NULL;
1326 1326
1327 /* 1327 if (!(flags & VM_NO_GUARD))
1328 * We always allocate a guard page. 1328 size += PAGE_SIZE;
1329 */
1330 size += PAGE_SIZE;
1331 1329
1332 va = alloc_vmap_area(size, align, start, end, node, gfp_mask); 1330 va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
1333 if (IS_ERR(va)) { 1331 if (IS_ERR(va)) {
@@ -1621,6 +1619,7 @@ fail:
1621 * @end: vm area range end 1619 * @end: vm area range end
1622 * @gfp_mask: flags for the page level allocator 1620 * @gfp_mask: flags for the page level allocator
1623 * @prot: protection mask for the allocated pages 1621 * @prot: protection mask for the allocated pages
1622 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
1624 * @node: node to use for allocation or NUMA_NO_NODE 1623 * @node: node to use for allocation or NUMA_NO_NODE
1625 * @caller: caller's return address 1624 * @caller: caller's return address
1626 * 1625 *
@@ -1630,7 +1629,8 @@ fail:
1630 */ 1629 */
1631void *__vmalloc_node_range(unsigned long size, unsigned long align, 1630void *__vmalloc_node_range(unsigned long size, unsigned long align,
1632 unsigned long start, unsigned long end, gfp_t gfp_mask, 1631 unsigned long start, unsigned long end, gfp_t gfp_mask,
1633 pgprot_t prot, int node, const void *caller) 1632 pgprot_t prot, unsigned long vm_flags, int node,
1633 const void *caller)
1634{ 1634{
1635 struct vm_struct *area; 1635 struct vm_struct *area;
1636 void *addr; 1636 void *addr;
@@ -1640,8 +1640,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1640 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1640 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1641 goto fail; 1641 goto fail;
1642 1642
1643 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED, 1643 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
1644 start, end, node, gfp_mask, caller); 1644 vm_flags, start, end, node, gfp_mask, caller);
1645 if (!area) 1645 if (!area)
1646 goto fail; 1646 goto fail;
1647 1647
@@ -1690,7 +1690,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1690 int node, const void *caller) 1690 int node, const void *caller)
1691{ 1691{
1692 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 1692 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1693 gfp_mask, prot, node, caller); 1693 gfp_mask, prot, 0, node, caller);
1694} 1694}
1695 1695
1696void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1696void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
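
A sketch of a call through the extended __vmalloc_node_range() signature shown above: passing 0 for the new vm_flags argument keeps the old behaviour, while VM_NO_GUARD skips the trailing guard page. The wrapper name is made up and the include list is approximate.

#include <linux/gfp.h>
#include <linux/vmalloc.h>

/* Map "size" bytes of vmalloc space without the usual guard page. */
static void *example_alloc_noguard(unsigned long size)
{
	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
				    GFP_KERNEL, PAGE_KERNEL, VM_NO_GUARD,
				    NUMA_NO_NODE,
				    __builtin_return_address(0));
}
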
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd9a72bc4a1b..5e8eadd71bac 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -91,6 +91,9 @@ struct scan_control {
91 /* Can pages be swapped as part of reclaim? */ 91 /* Can pages be swapped as part of reclaim? */
92 unsigned int may_swap:1; 92 unsigned int may_swap:1;
93 93
94 /* Can cgroups be reclaimed below their normal consumption range? */
95 unsigned int may_thrash:1;
96
94 unsigned int hibernation_mode:1; 97 unsigned int hibernation_mode:1;
95 98
96 /* One of the zones is ready for compaction */ 99 /* One of the zones is ready for compaction */
@@ -229,10 +232,10 @@ EXPORT_SYMBOL(unregister_shrinker);
229 232
230#define SHRINK_BATCH 128 233#define SHRINK_BATCH 128
231 234
232static unsigned long shrink_slabs(struct shrink_control *shrinkctl, 235static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
233 struct shrinker *shrinker, 236 struct shrinker *shrinker,
234 unsigned long nr_scanned, 237 unsigned long nr_scanned,
235 unsigned long nr_eligible) 238 unsigned long nr_eligible)
236{ 239{
237 unsigned long freed = 0; 240 unsigned long freed = 0;
238 unsigned long long delta; 241 unsigned long long delta;
@@ -341,9 +344,10 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
341} 344}
342 345
343/** 346/**
344 * shrink_node_slabs - shrink slab caches of a given node 347 * shrink_slab - shrink slab caches
345 * @gfp_mask: allocation context 348 * @gfp_mask: allocation context
346 * @nid: node whose slab caches to target 349 * @nid: node whose slab caches to target
350 * @memcg: memory cgroup whose slab caches to target
347 * @nr_scanned: pressure numerator 351 * @nr_scanned: pressure numerator
348 * @nr_eligible: pressure denominator 352 * @nr_eligible: pressure denominator
349 * 353 *
@@ -352,6 +356,12 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
352 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, 356 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
353 * unaware shrinkers will receive a node id of 0 instead. 357 * unaware shrinkers will receive a node id of 0 instead.
354 * 358 *
359 * @memcg specifies the memory cgroup to target. If it is not NULL,
360 * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
361 * objects from the memory cgroup specified. Otherwise all shrinkers
362 * are called, and memcg aware shrinkers are supposed to scan the
363 * global list then.
364 *
355 * @nr_scanned and @nr_eligible form a ratio that indicates how much of 365
356 * the available objects should be scanned. Page reclaim for example 366 * the available objects should be scanned. Page reclaim for example
357 * passes the number of pages scanned and the number of pages on the 367 * passes the number of pages scanned and the number of pages on the
@@ -362,13 +372,17 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
362 * 372 *
363 * Returns the number of reclaimed slab objects. 373 * Returns the number of reclaimed slab objects.
364 */ 374 */
365unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, 375static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
366 unsigned long nr_scanned, 376 struct mem_cgroup *memcg,
367 unsigned long nr_eligible) 377 unsigned long nr_scanned,
378 unsigned long nr_eligible)
368{ 379{
369 struct shrinker *shrinker; 380 struct shrinker *shrinker;
370 unsigned long freed = 0; 381 unsigned long freed = 0;
371 382
383 if (memcg && !memcg_kmem_is_active(memcg))
384 return 0;
385
372 if (nr_scanned == 0) 386 if (nr_scanned == 0)
373 nr_scanned = SWAP_CLUSTER_MAX; 387 nr_scanned = SWAP_CLUSTER_MAX;
374 388
@@ -387,12 +401,16 @@ unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
387 struct shrink_control sc = { 401 struct shrink_control sc = {
388 .gfp_mask = gfp_mask, 402 .gfp_mask = gfp_mask,
389 .nid = nid, 403 .nid = nid,
404 .memcg = memcg,
390 }; 405 };
391 406
407 if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
408 continue;
409
392 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) 410 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
393 sc.nid = 0; 411 sc.nid = 0;
394 412
395 freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); 413 freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
396 } 414 }
397 415
398 up_read(&shrinker_rwsem); 416 up_read(&shrinker_rwsem);
@@ -401,6 +419,29 @@ out:
401 return freed; 419 return freed;
402} 420}
403 421
422void drop_slab_node(int nid)
423{
424 unsigned long freed;
425
426 do {
427 struct mem_cgroup *memcg = NULL;
428
429 freed = 0;
430 do {
431 freed += shrink_slab(GFP_KERNEL, nid, memcg,
432 1000, 1000);
433 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
434 } while (freed > 10);
435}
436
437void drop_slab(void)
438{
439 int nid;
440
441 for_each_online_node(nid)
442 drop_slab_node(nid);
443}
444
404static inline int is_page_cache_freeable(struct page *page) 445static inline int is_page_cache_freeable(struct page *page)
405{ 446{
406 /* 447 /*
@@ -497,7 +538,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
497 } 538 }
498 if (mapping->a_ops->writepage == NULL) 539 if (mapping->a_ops->writepage == NULL)
499 return PAGE_ACTIVATE; 540 return PAGE_ACTIVATE;
500 if (!may_write_to_queue(mapping->backing_dev_info, sc)) 541 if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
501 return PAGE_KEEP; 542 return PAGE_KEEP;
502 543
503 if (clear_page_dirty_for_io(page)) { 544 if (clear_page_dirty_for_io(page)) {
@@ -876,7 +917,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
876 */ 917 */
877 mapping = page_mapping(page); 918 mapping = page_mapping(page);
878 if (((dirty || writeback) && mapping && 919 if (((dirty || writeback) && mapping &&
879 bdi_write_congested(mapping->backing_dev_info)) || 920 bdi_write_congested(inode_to_bdi(mapping->host))) ||
880 (writeback && PageReclaim(page))) 921 (writeback && PageReclaim(page)))
881 nr_congested++; 922 nr_congested++;
882 923
@@ -1903,8 +1944,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
1903 * latencies, so it's better to scan a minimum amount there as 1944 * latencies, so it's better to scan a minimum amount there as
1904 * well. 1945 * well.
1905 */ 1946 */
1906 if (current_is_kswapd() && !zone_reclaimable(zone)) 1947 if (current_is_kswapd()) {
1907 force_scan = true; 1948 if (!zone_reclaimable(zone))
1949 force_scan = true;
1950 if (!mem_cgroup_lruvec_online(lruvec))
1951 force_scan = true;
1952 }
1908 if (!global_reclaim(sc)) 1953 if (!global_reclaim(sc))
1909 force_scan = true; 1954 force_scan = true;
1910 1955
@@ -2269,6 +2314,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
2269static bool shrink_zone(struct zone *zone, struct scan_control *sc, 2314static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2270 bool is_classzone) 2315 bool is_classzone)
2271{ 2316{
2317 struct reclaim_state *reclaim_state = current->reclaim_state;
2272 unsigned long nr_reclaimed, nr_scanned; 2318 unsigned long nr_reclaimed, nr_scanned;
2273 bool reclaimable = false; 2319 bool reclaimable = false;
2274 2320
@@ -2287,15 +2333,28 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2287 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2333 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2288 do { 2334 do {
2289 unsigned long lru_pages; 2335 unsigned long lru_pages;
2336 unsigned long scanned;
2290 struct lruvec *lruvec; 2337 struct lruvec *lruvec;
2291 int swappiness; 2338 int swappiness;
2292 2339
2340 if (mem_cgroup_low(root, memcg)) {
2341 if (!sc->may_thrash)
2342 continue;
2343 mem_cgroup_events(memcg, MEMCG_LOW, 1);
2344 }
2345
2293 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2346 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2294 swappiness = mem_cgroup_swappiness(memcg); 2347 swappiness = mem_cgroup_swappiness(memcg);
2348 scanned = sc->nr_scanned;
2295 2349
2296 shrink_lruvec(lruvec, swappiness, sc, &lru_pages); 2350 shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
2297 zone_lru_pages += lru_pages; 2351 zone_lru_pages += lru_pages;
2298 2352
2353 if (memcg && is_classzone)
2354 shrink_slab(sc->gfp_mask, zone_to_nid(zone),
2355 memcg, sc->nr_scanned - scanned,
2356 lru_pages);
2357
2299 /* 2358 /*
2300 * Direct reclaim and kswapd have to scan all memory 2359 * Direct reclaim and kswapd have to scan all memory
2301 * cgroups to fulfill the overall scan target for the 2360 * cgroups to fulfill the overall scan target for the
@@ -2311,26 +2370,20 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2311 mem_cgroup_iter_break(root, memcg); 2370 mem_cgroup_iter_break(root, memcg);
2312 break; 2371 break;
2313 } 2372 }
2314 memcg = mem_cgroup_iter(root, memcg, &reclaim); 2373 } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
2315 } while (memcg);
2316 2374
2317 /* 2375 /*
2318 * Shrink the slab caches in the same proportion that 2376 * Shrink the slab caches in the same proportion that
2319 * the eligible LRU pages were scanned. 2377 * the eligible LRU pages were scanned.
2320 */ 2378 */
2321 if (global_reclaim(sc) && is_classzone) { 2379 if (global_reclaim(sc) && is_classzone)
2322 struct reclaim_state *reclaim_state; 2380 shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
2323 2381 sc->nr_scanned - nr_scanned,
2324 shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), 2382 zone_lru_pages);
2325 sc->nr_scanned - nr_scanned, 2383
2326 zone_lru_pages); 2384 if (reclaim_state) {
2327 2385 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2328 reclaim_state = current->reclaim_state; 2386 reclaim_state->reclaimed_slab = 0;
2329 if (reclaim_state) {
2330 sc->nr_reclaimed +=
2331 reclaim_state->reclaimed_slab;
2332 reclaim_state->reclaimed_slab = 0;
2333 }
2334 } 2387 }
2335 2388
2336 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, 2389 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
@@ -2515,10 +2568,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2515static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 2568static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2516 struct scan_control *sc) 2569 struct scan_control *sc)
2517{ 2570{
2571 int initial_priority = sc->priority;
2518 unsigned long total_scanned = 0; 2572 unsigned long total_scanned = 0;
2519 unsigned long writeback_threshold; 2573 unsigned long writeback_threshold;
2520 bool zones_reclaimable; 2574 bool zones_reclaimable;
2521 2575retry:
2522 delayacct_freepages_start(); 2576 delayacct_freepages_start();
2523 2577
2524 if (global_reclaim(sc)) 2578 if (global_reclaim(sc))
@@ -2568,6 +2622,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2568 if (sc->compaction_ready) 2622 if (sc->compaction_ready)
2569 return 1; 2623 return 1;
2570 2624
2625 /* Untapped cgroup reserves? Don't OOM, retry. */
2626 if (!sc->may_thrash) {
2627 sc->priority = initial_priority;
2628 sc->may_thrash = 1;
2629 goto retry;
2630 }
2631
2571 /* Any of the zones still reclaimable? Don't OOM. */ 2632 /* Any of the zones still reclaimable? Don't OOM. */
2572 if (zones_reclaimable) 2633 if (zones_reclaimable)
2573 return 1; 2634 return 1;
@@ -2656,7 +2717,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2656 * should make reasonable progress. 2717 * should make reasonable progress.
2657 */ 2718 */
2658 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2719 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2659 gfp_mask, nodemask) { 2720 gfp_zone(gfp_mask), nodemask) {
2660 if (zone_idx(zone) > ZONE_NORMAL) 2721 if (zone_idx(zone) > ZONE_NORMAL)
2661 continue; 2722 continue;
2662 2723
@@ -2921,18 +2982,20 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2921 return false; 2982 return false;
2922 2983
2923 /* 2984 /*
2924 * There is a potential race between when kswapd checks its watermarks 2985 * The throttled processes are normally woken up in balance_pgdat() as
2925 * and a process gets throttled. There is also a potential race if 2986 * soon as pfmemalloc_watermark_ok() is true. But there is a potential
2926 * processes get throttled, kswapd wakes, a large process exits therby 2987 * race between when kswapd checks the watermarks and a process gets
2927 * balancing the zones that causes kswapd to miss a wakeup. If kswapd 2988 * throttled. There is also a potential race if processes get
2928 * is going to sleep, no process should be sleeping on pfmemalloc_wait 2989 * throttled, kswapd wakes, a large process exits thereby balancing the
2929 * so wake them now if necessary. If necessary, processes will wake 2990 * zones, which causes kswapd to exit balance_pgdat() before reaching
2930 * kswapd and get throttled again 2991 * the wake up checks. If kswapd is going to sleep, no process should
2992 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
2993 * the wake up is premature, processes will wake kswapd and get
2994 * throttled again. The difference from wake ups in balance_pgdat() is
2995 * that here we are under prepare_to_wait().
2931 */ 2996 */
2932 if (waitqueue_active(&pgdat->pfmemalloc_wait)) { 2997 if (waitqueue_active(&pgdat->pfmemalloc_wait))
2933 wake_up(&pgdat->pfmemalloc_wait); 2998 wake_up_all(&pgdat->pfmemalloc_wait);
2934 return false;
2935 }
2936 2999
2937 return pgdat_balanced(pgdat, order, classzone_idx); 3000 return pgdat_balanced(pgdat, order, classzone_idx);
2938} 3001}
@@ -3173,7 +3236,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3173 */ 3236 */
3174 if (waitqueue_active(&pgdat->pfmemalloc_wait) && 3237 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3175 pfmemalloc_watermark_ok(pgdat)) 3238 pfmemalloc_watermark_ok(pgdat))
3176 wake_up(&pgdat->pfmemalloc_wait); 3239 wake_up_all(&pgdat->pfmemalloc_wait);
3177 3240
3178 /* 3241 /*
3179 * Fragmentation may mean that the system cannot be rebalanced 3242 * Fragmentation may mean that the system cannot be rebalanced
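
shrink_slab() above only forwards sc->memcg to shrinkers that opt in; a minimal sketch of such a shrinker follows. All names are hypothetical, and a real implementation would back the count/scan callbacks with a memcg-aware list_lru rather than returning placeholders.

#include <linux/shrinker.h>

static unsigned long example_count(struct shrinker *s,
				   struct shrink_control *sc)
{
	/* Would count only objects charged to sc->memcg on node sc->nid,
	 * e.g. via list_lru_shrink_count(); 0 means nothing to scan. */
	return 0;
}

static unsigned long example_scan(struct shrinker *s,
				  struct shrink_control *sc)
{
	/* Would reclaim up to sc->nr_to_scan objects from sc->memcg. */
	return SHRINK_STOP;
}

static struct shrinker example_shrinker = {
	.count_objects	= example_count,
	.scan_objects	= example_scan,
	.seeks		= DEFAULT_SEEKS,
	/* Without SHRINKER_MEMCG_AWARE this shrinker is skipped whenever
	 * shrink_slab() runs for a specific memory cgroup. */
	.flags		= SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};

Registration itself is unchanged: register_shrinker(&example_shrinker).
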
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1284f89fca08..4f5cd974e11a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -17,6 +17,9 @@
17#include <linux/cpu.h> 17#include <linux/cpu.h>
18#include <linux/cpumask.h> 18#include <linux/cpumask.h>
19#include <linux/vmstat.h> 19#include <linux/vmstat.h>
20#include <linux/proc_fs.h>
21#include <linux/seq_file.h>
22#include <linux/debugfs.h>
20#include <linux/sched.h> 23#include <linux/sched.h>
21#include <linux/math64.h> 24#include <linux/math64.h>
22#include <linux/writeback.h> 25#include <linux/writeback.h>
@@ -670,66 +673,6 @@ int fragmentation_index(struct zone *zone, unsigned int order)
670} 673}
671#endif 674#endif
672 675
673#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
674#include <linux/proc_fs.h>
675#include <linux/seq_file.h>
676
677static char * const migratetype_names[MIGRATE_TYPES] = {
678 "Unmovable",
679 "Reclaimable",
680 "Movable",
681 "Reserve",
682#ifdef CONFIG_CMA
683 "CMA",
684#endif
685#ifdef CONFIG_MEMORY_ISOLATION
686 "Isolate",
687#endif
688};
689
690static void *frag_start(struct seq_file *m, loff_t *pos)
691{
692 pg_data_t *pgdat;
693 loff_t node = *pos;
694 for (pgdat = first_online_pgdat();
695 pgdat && node;
696 pgdat = next_online_pgdat(pgdat))
697 --node;
698
699 return pgdat;
700}
701
702static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
703{
704 pg_data_t *pgdat = (pg_data_t *)arg;
705
706 (*pos)++;
707 return next_online_pgdat(pgdat);
708}
709
710static void frag_stop(struct seq_file *m, void *arg)
711{
712}
713
714/* Walk all the zones in a node and print using a callback */
715static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
716 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
717{
718 struct zone *zone;
719 struct zone *node_zones = pgdat->node_zones;
720 unsigned long flags;
721
722 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
723 if (!populated_zone(zone))
724 continue;
725
726 spin_lock_irqsave(&zone->lock, flags);
727 print(m, pgdat, zone);
728 spin_unlock_irqrestore(&zone->lock, flags);
729 }
730}
731#endif
732
733#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) 676#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
734#ifdef CONFIG_ZONE_DMA 677#ifdef CONFIG_ZONE_DMA
735#define TEXT_FOR_DMA(xx) xx "_dma", 678#define TEXT_FOR_DMA(xx) xx "_dma",
@@ -907,7 +850,66 @@ const char * const vmstat_text[] = {
907#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ 850#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
908 851
909 852
853#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
854 defined(CONFIG_PROC_FS)
855static void *frag_start(struct seq_file *m, loff_t *pos)
856{
857 pg_data_t *pgdat;
858 loff_t node = *pos;
859
860 for (pgdat = first_online_pgdat();
861 pgdat && node;
862 pgdat = next_online_pgdat(pgdat))
863 --node;
864
865 return pgdat;
866}
867
868static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
869{
870 pg_data_t *pgdat = (pg_data_t *)arg;
871
872 (*pos)++;
873 return next_online_pgdat(pgdat);
874}
875
876static void frag_stop(struct seq_file *m, void *arg)
877{
878}
879
880/* Walk all the zones in a node and print using a callback */
881static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
882 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
883{
884 struct zone *zone;
885 struct zone *node_zones = pgdat->node_zones;
886 unsigned long flags;
887
888 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
889 if (!populated_zone(zone))
890 continue;
891
892 spin_lock_irqsave(&zone->lock, flags);
893 print(m, pgdat, zone);
894 spin_unlock_irqrestore(&zone->lock, flags);
895 }
896}
897#endif
898
910#ifdef CONFIG_PROC_FS 899#ifdef CONFIG_PROC_FS
900static char * const migratetype_names[MIGRATE_TYPES] = {
901 "Unmovable",
902 "Reclaimable",
903 "Movable",
904 "Reserve",
905#ifdef CONFIG_CMA
906 "CMA",
907#endif
908#ifdef CONFIG_MEMORY_ISOLATION
909 "Isolate",
910#endif
911};
912
911static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, 913static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
912 struct zone *zone) 914 struct zone *zone)
913{ 915{
@@ -1435,8 +1437,8 @@ static void vmstat_shepherd(struct work_struct *w)
1435 if (need_update(cpu) && 1437 if (need_update(cpu) &&
1436 cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) 1438 cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
1437 1439
1438 schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu), 1440 schedule_delayed_work_on(cpu,
1439 __round_jiffies_relative(sysctl_stat_interval, cpu)); 1441 &per_cpu(vmstat_work, cpu), 0);
1440 1442
1441 put_online_cpus(); 1443 put_online_cpus();
1442 1444
@@ -1450,7 +1452,7 @@ static void __init start_shepherd_timer(void)
1450 int cpu; 1452 int cpu;
1451 1453
1452 for_each_possible_cpu(cpu) 1454 for_each_possible_cpu(cpu)
1453 INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), 1455 INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
1454 vmstat_update); 1456 vmstat_update);
1455 1457
1456 if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) 1458 if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
@@ -1536,8 +1538,6 @@ static int __init setup_vmstat(void)
1536module_init(setup_vmstat) 1538module_init(setup_vmstat)
1537 1539
1538#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) 1540#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
1539#include <linux/debugfs.h>
1540
1541 1541
1542/* 1542/*
1543 * Return an index indicating how much of the available free memory is 1543 * Return an index indicating how much of the available free memory is
diff --git a/mm/workingset.c b/mm/workingset.c
index f7216fa7da27..aa017133744b 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
275 275
276 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ 276 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
277 local_irq_disable(); 277 local_irq_disable();
278 shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); 278 shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
279 local_irq_enable(); 279 local_irq_enable();
280 280
281 pages = node_present_pages(sc->nid); 281 pages = node_present_pages(sc->nid);
@@ -302,6 +302,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
302} 302}
303 303
304static enum lru_status shadow_lru_isolate(struct list_head *item, 304static enum lru_status shadow_lru_isolate(struct list_head *item,
305 struct list_lru_one *lru,
305 spinlock_t *lru_lock, 306 spinlock_t *lru_lock,
306 void *arg) 307 void *arg)
307{ 308{
@@ -332,7 +333,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
332 goto out; 333 goto out;
333 } 334 }
334 335
335 list_del_init(item); 336 list_lru_isolate(lru, item);
336 spin_unlock(lru_lock); 337 spin_unlock(lru_lock);
337 338
338 /* 339 /*
@@ -376,8 +377,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
376 377
377 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ 378 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
378 local_irq_disable(); 379 local_irq_disable();
379 ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, 380 ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
380 shadow_lru_isolate, NULL, &sc->nr_to_scan); 381 shadow_lru_isolate, NULL);
381 local_irq_enable(); 382 local_irq_enable();
382 return ret; 383 return ret;
383} 384}
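
The isolate callback change above follows the reworked list_lru API; a rough sketch of a callback written against the new signature, with hypothetical names and none of the workingset-specific locking.

#include <linux/spinlock.h>
#include <linux/list_lru.h>

/* The callback now receives the per-list structure, and items are
 * detached with list_lru_isolate() instead of a bare list_del_init(). */
static enum lru_status example_isolate(struct list_head *item,
				       struct list_lru_one *lru,
				       spinlock_t *lru_lock, void *arg)
{
	list_lru_isolate(lru, item);
	return LRU_REMOVED;
}
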
diff --git a/mm/zbud.c b/mm/zbud.c
index 4e387bea702e..2ee4e4520493 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = {
130 .evict = zbud_zpool_evict 130 .evict = zbud_zpool_evict
131}; 131};
132 132
133static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) 133static void *zbud_zpool_create(char *name, gfp_t gfp,
134 struct zpool_ops *zpool_ops)
134{ 135{
135 return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); 136 return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
136} 137}
diff --git a/mm/zpool.c b/mm/zpool.c
index 739cdf0d183a..bacdab6e47de 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
129/** 129/**
130 * zpool_create_pool() - Create a new zpool 130 * zpool_create_pool() - Create a new zpool
131 * @type The type of the zpool to create (e.g. zbud, zsmalloc) 131 * @type The type of the zpool to create (e.g. zbud, zsmalloc)
132 * @name The name of the zpool (e.g. zram0, zswap)
132 * @gfp The GFP flags to use when allocating the pool. 133 * @gfp The GFP flags to use when allocating the pool.
133 * @ops The optional ops callback. 134 * @ops The optional ops callback.
134 * 135 *
@@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver)
140 * 141 *
141 * Returns: New zpool on success, NULL on failure. 142 * Returns: New zpool on success, NULL on failure.
142 */ 143 */
143struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) 144struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
145 struct zpool_ops *ops)
144{ 146{
145 struct zpool_driver *driver; 147 struct zpool_driver *driver;
146 struct zpool *zpool; 148 struct zpool *zpool;
@@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
168 170
169 zpool->type = driver->type; 171 zpool->type = driver->type;
170 zpool->driver = driver; 172 zpool->driver = driver;
171 zpool->pool = driver->create(gfp, ops); 173 zpool->pool = driver->create(name, gfp, ops);
172 zpool->ops = ops; 174 zpool->ops = ops;
173 175
174 if (!zpool->pool) { 176 if (!zpool->pool) {
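
A hedged example of the extended zpool_create_pool() call shown above; the type, name, flags and NULL ops are illustrative. The name is handed to the backend driver's create() callback, and the zsmalloc changes below use it for the per-pool debugfs directory.

#include <linux/gfp.h>
#include <linux/zpool.h>

static struct zpool *example_create_pool(void)
{
	/* The equivalent pre-patch call had no name argument. */
	return zpool_create_pool("zsmalloc", "zram0", GFP_KERNEL, NULL);
}
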
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b72403927aa4..0dec1fa5f656 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -91,6 +91,7 @@
91#include <linux/hardirq.h> 91#include <linux/hardirq.h>
92#include <linux/spinlock.h> 92#include <linux/spinlock.h>
93#include <linux/types.h> 93#include <linux/types.h>
94#include <linux/debugfs.h>
94#include <linux/zsmalloc.h> 95#include <linux/zsmalloc.h>
95#include <linux/zpool.h> 96#include <linux/zpool.h>
96 97
@@ -168,6 +169,22 @@ enum fullness_group {
168 ZS_FULL 169 ZS_FULL
169}; 170};
170 171
172enum zs_stat_type {
173 OBJ_ALLOCATED,
174 OBJ_USED,
175 NR_ZS_STAT_TYPE,
176};
177
178#ifdef CONFIG_ZSMALLOC_STAT
179
180static struct dentry *zs_stat_root;
181
182struct zs_size_stat {
183 unsigned long objs[NR_ZS_STAT_TYPE];
184};
185
186#endif
187
171/* 188/*
172 * number of size_classes 189 * number of size_classes
173 */ 190 */
@@ -200,6 +217,10 @@ struct size_class {
200 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ 217 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
201 int pages_per_zspage; 218 int pages_per_zspage;
202 219
220#ifdef CONFIG_ZSMALLOC_STAT
221 struct zs_size_stat stats;
222#endif
223
203 spinlock_t lock; 224 spinlock_t lock;
204 225
205 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; 226 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
@@ -217,10 +238,16 @@ struct link_free {
217}; 238};
218 239
219struct zs_pool { 240struct zs_pool {
241 char *name;
242
220 struct size_class **size_class; 243 struct size_class **size_class;
221 244
222 gfp_t flags; /* allocation flags used when growing pool */ 245 gfp_t flags; /* allocation flags used when growing pool */
223 atomic_long_t pages_allocated; 246 atomic_long_t pages_allocated;
247
248#ifdef CONFIG_ZSMALLOC_STAT
249 struct dentry *stat_dentry;
250#endif
224}; 251};
225 252
226/* 253/*
@@ -246,9 +273,9 @@ struct mapping_area {
246 273
247#ifdef CONFIG_ZPOOL 274#ifdef CONFIG_ZPOOL
248 275
249static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) 276static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops)
250{ 277{
251 return zs_create_pool(gfp); 278 return zs_create_pool(name, gfp);
252} 279}
253 280
254static void zs_zpool_destroy(void *pool) 281static void zs_zpool_destroy(void *pool)
@@ -942,6 +969,166 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
942 return true; 969 return true;
943} 970}
944 971
972#ifdef CONFIG_ZSMALLOC_STAT
973
974static inline void zs_stat_inc(struct size_class *class,
975 enum zs_stat_type type, unsigned long cnt)
976{
977 class->stats.objs[type] += cnt;
978}
979
980static inline void zs_stat_dec(struct size_class *class,
981 enum zs_stat_type type, unsigned long cnt)
982{
983 class->stats.objs[type] -= cnt;
984}
985
986static inline unsigned long zs_stat_get(struct size_class *class,
987 enum zs_stat_type type)
988{
989 return class->stats.objs[type];
990}
991
992static int __init zs_stat_init(void)
993{
994 if (!debugfs_initialized())
995 return -ENODEV;
996
997 zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
998 if (!zs_stat_root)
999 return -ENOMEM;
1000
1001 return 0;
1002}
1003
1004static void __exit zs_stat_exit(void)
1005{
1006 debugfs_remove_recursive(zs_stat_root);
1007}
1008
1009static int zs_stats_size_show(struct seq_file *s, void *v)
1010{
1011 int i;
1012 struct zs_pool *pool = s->private;
1013 struct size_class *class;
1014 int objs_per_zspage;
1015 unsigned long obj_allocated, obj_used, pages_used;
1016 unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
1017
1018 seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
1019 "obj_allocated", "obj_used", "pages_used");
1020
1021 for (i = 0; i < zs_size_classes; i++) {
1022 class = pool->size_class[i];
1023
1024 if (class->index != i)
1025 continue;
1026
1027 spin_lock(&class->lock);
1028 obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
1029 obj_used = zs_stat_get(class, OBJ_USED);
1030 spin_unlock(&class->lock);
1031
1032 objs_per_zspage = get_maxobj_per_zspage(class->size,
1033 class->pages_per_zspage);
1034 pages_used = obj_allocated / objs_per_zspage *
1035 class->pages_per_zspage;
1036
1037 seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i,
1038 class->size, obj_allocated, obj_used, pages_used);
1039
1040 total_objs += obj_allocated;
1041 total_used_objs += obj_used;
1042 total_pages += pages_used;
1043 }
1044
1045 seq_puts(s, "\n");
1046 seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "",
1047 total_objs, total_used_objs, total_pages);
1048
1049 return 0;
1050}
1051
1052static int zs_stats_size_open(struct inode *inode, struct file *file)
1053{
1054 return single_open(file, zs_stats_size_show, inode->i_private);
1055}
1056
1057static const struct file_operations zs_stat_size_ops = {
1058 .open = zs_stats_size_open,
1059 .read = seq_read,
1060 .llseek = seq_lseek,
1061 .release = single_release,
1062};
1063
1064static int zs_pool_stat_create(char *name, struct zs_pool *pool)
1065{
1066 struct dentry *entry;
1067
1068 if (!zs_stat_root)
1069 return -ENODEV;
1070
1071 entry = debugfs_create_dir(name, zs_stat_root);
1072 if (!entry) {
1073 pr_warn("debugfs dir <%s> creation failed\n", name);
1074 return -ENOMEM;
1075 }
1076 pool->stat_dentry = entry;
1077
1078 entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
1079 pool->stat_dentry, pool, &zs_stat_size_ops);
1080 if (!entry) {
1081 pr_warn("%s: debugfs file entry <%s> creation failed\n",
1082 name, "obj_in_classes");
1083 return -ENOMEM;
1084 }
1085
1086 return 0;
1087}
1088
1089static void zs_pool_stat_destroy(struct zs_pool *pool)
1090{
1091 debugfs_remove_recursive(pool->stat_dentry);
1092}
1093
1094#else /* CONFIG_ZSMALLOC_STAT */
1095
1096static inline void zs_stat_inc(struct size_class *class,
1097 enum zs_stat_type type, unsigned long cnt)
1098{
1099}
1100
1101static inline void zs_stat_dec(struct size_class *class,
1102 enum zs_stat_type type, unsigned long cnt)
1103{
1104}
1105
1106static inline unsigned long zs_stat_get(struct size_class *class,
1107 enum zs_stat_type type)
1108{
1109 return 0;
1110}
1111
1112static int __init zs_stat_init(void)
1113{
1114 return 0;
1115}
1116
1117static void __exit zs_stat_exit(void)
1118{
1119}
1120
1121static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
1122{
1123 return 0;
1124}
1125
1126static inline void zs_pool_stat_destroy(struct zs_pool *pool)
1127{
1128}
1129
1130#endif
1131
945unsigned long zs_get_total_pages(struct zs_pool *pool) 1132unsigned long zs_get_total_pages(struct zs_pool *pool)
946{ 1133{
947 return atomic_long_read(&pool->pages_allocated); 1134 return atomic_long_read(&pool->pages_allocated);
@@ -1074,7 +1261,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1074 set_zspage_mapping(first_page, class->index, ZS_EMPTY); 1261 set_zspage_mapping(first_page, class->index, ZS_EMPTY);
1075 atomic_long_add(class->pages_per_zspage, 1262 atomic_long_add(class->pages_per_zspage,
1076 &pool->pages_allocated); 1263 &pool->pages_allocated);
1264
1077 spin_lock(&class->lock); 1265 spin_lock(&class->lock);
1266 zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1267 class->size, class->pages_per_zspage));
1078 } 1268 }
1079 1269
1080 obj = (unsigned long)first_page->freelist; 1270 obj = (unsigned long)first_page->freelist;
@@ -1088,6 +1278,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1088 kunmap_atomic(vaddr); 1278 kunmap_atomic(vaddr);
1089 1279
1090 first_page->inuse++; 1280 first_page->inuse++;
1281 zs_stat_inc(class, OBJ_USED, 1);
1091 /* Now move the zspage to another fullness group, if required */ 1282 /* Now move the zspage to another fullness group, if required */
1092 fix_fullness_group(pool, first_page); 1283 fix_fullness_group(pool, first_page);
1093 spin_unlock(&class->lock); 1284 spin_unlock(&class->lock);
@@ -1128,6 +1319,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
1128 1319
1129 first_page->inuse--; 1320 first_page->inuse--;
1130 fullness = fix_fullness_group(pool, first_page); 1321 fullness = fix_fullness_group(pool, first_page);
1322
1323 zs_stat_dec(class, OBJ_USED, 1);
1324 if (fullness == ZS_EMPTY)
1325 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1326 class->size, class->pages_per_zspage));
1327
1131 spin_unlock(&class->lock); 1328 spin_unlock(&class->lock);
1132 1329
1133 if (fullness == ZS_EMPTY) { 1330 if (fullness == ZS_EMPTY) {
@@ -1148,7 +1345,7 @@ EXPORT_SYMBOL_GPL(zs_free);
1148 * On success, a pointer to the newly created pool is returned, 1345 * On success, a pointer to the newly created pool is returned,
1149 * otherwise NULL. 1346 * otherwise NULL.
1150 */ 1347 */
1151struct zs_pool *zs_create_pool(gfp_t flags) 1348struct zs_pool *zs_create_pool(char *name, gfp_t flags)
1152{ 1349{
1153 int i; 1350 int i;
1154 struct zs_pool *pool; 1351 struct zs_pool *pool;
@@ -1158,9 +1355,16 @@ struct zs_pool *zs_create_pool(gfp_t flags)
1158 if (!pool) 1355 if (!pool)
1159 return NULL; 1356 return NULL;
1160 1357
1358 pool->name = kstrdup(name, GFP_KERNEL);
1359 if (!pool->name) {
1360 kfree(pool);
1361 return NULL;
1362 }
1363
1161 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), 1364 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
1162 GFP_KERNEL); 1365 GFP_KERNEL);
1163 if (!pool->size_class) { 1366 if (!pool->size_class) {
1367 kfree(pool->name);
1164 kfree(pool); 1368 kfree(pool);
1165 return NULL; 1369 return NULL;
1166 } 1370 }
@@ -1210,6 +1414,9 @@ struct zs_pool *zs_create_pool(gfp_t flags)
1210 1414
1211 pool->flags = flags; 1415 pool->flags = flags;
1212 1416
1417 if (zs_pool_stat_create(name, pool))
1418 goto err;
1419
1213 return pool; 1420 return pool;
1214 1421
1215err: 1422err:
@@ -1222,6 +1429,8 @@ void zs_destroy_pool(struct zs_pool *pool)
1222{ 1429{
1223 int i; 1430 int i;
1224 1431
1432 zs_pool_stat_destroy(pool);
1433
1225 for (i = 0; i < zs_size_classes; i++) { 1434 for (i = 0; i < zs_size_classes; i++) {
1226 int fg; 1435 int fg;
1227 struct size_class *class = pool->size_class[i]; 1436 struct size_class *class = pool->size_class[i];
@@ -1242,6 +1451,7 @@ void zs_destroy_pool(struct zs_pool *pool)
1242 } 1451 }
1243 1452
1244 kfree(pool->size_class); 1453 kfree(pool->size_class);
1454 kfree(pool->name);
1245 kfree(pool); 1455 kfree(pool);
1246} 1456}
1247EXPORT_SYMBOL_GPL(zs_destroy_pool); 1457EXPORT_SYMBOL_GPL(zs_destroy_pool);
@@ -1250,17 +1460,30 @@ static int __init zs_init(void)
1250{ 1460{
1251 int ret = zs_register_cpu_notifier(); 1461 int ret = zs_register_cpu_notifier();
1252 1462
1253 if (ret) { 1463 if (ret)
1254 zs_unregister_cpu_notifier(); 1464 goto notifier_fail;
1255 return ret;
1256 }
1257 1465
1258 init_zs_size_classes(); 1466 init_zs_size_classes();
1259 1467
1260#ifdef CONFIG_ZPOOL 1468#ifdef CONFIG_ZPOOL
1261 zpool_register_driver(&zs_zpool_driver); 1469 zpool_register_driver(&zs_zpool_driver);
1262#endif 1470#endif
1471
1472 ret = zs_stat_init();
1473 if (ret) {
1474 pr_err("zs stat initialization failed\n");
1475 goto stat_fail;
1476 }
1263 return 0; 1477 return 0;
1478
1479stat_fail:
1480#ifdef CONFIG_ZPOOL
1481 zpool_unregister_driver(&zs_zpool_driver);
1482#endif
1483notifier_fail:
1484 zs_unregister_cpu_notifier();
1485
1486 return ret;
1264} 1487}
1265 1488
1266static void __exit zs_exit(void) 1489static void __exit zs_exit(void)
@@ -1269,6 +1492,8 @@ static void __exit zs_exit(void)
1269 zpool_unregister_driver(&zs_zpool_driver); 1492 zpool_unregister_driver(&zs_zpool_driver);
1270#endif 1493#endif
1271 zs_unregister_cpu_notifier(); 1494 zs_unregister_cpu_notifier();
1495
1496 zs_stat_exit();
1272} 1497}
1273 1498
1274module_init(zs_init); 1499module_init(zs_init);
diff --git a/mm/zswap.c b/mm/zswap.c
index 0cfce9bc51e4..4249e82ff934 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -906,11 +906,12 @@ static int __init init_zswap(void)
906 906
907 pr_info("loading zswap\n"); 907 pr_info("loading zswap\n");
908 908
909 zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); 909 zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
910 &zswap_zpool_ops);
910 if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { 911 if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
911 pr_info("%s zpool not available\n", zswap_zpool_type); 912 pr_info("%s zpool not available\n", zswap_zpool_type);
912 zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; 913 zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
913 zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, 914 zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
914 &zswap_zpool_ops); 915 &zswap_zpool_ops);
915 } 916 }
916 if (!zswap_pool) { 917 if (!zswap_pool) {