Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 11
-rw-r--r--  mm/Kconfig.debug | 9
-rw-r--r--  mm/Makefile | 6
-rw-r--r--  mm/backing-dev.c | 107
-rw-r--r--  mm/cleancache.c | 2
-rw-r--r--  mm/cma.c | 2
-rw-r--r--  mm/compaction.c | 181
-rw-r--r--  mm/debug.c | 4
-rw-r--r--  mm/fadvise.c | 4
-rw-r--r--  mm/filemap.c | 5
-rw-r--r--  mm/filemap_xip.c | 4
-rw-r--r--  mm/fremap.c | 283
-rw-r--r--  mm/gup.c | 244
-rw-r--r--  mm/huge_memory.c | 156
-rw-r--r--  mm/hugetlb.c | 160
-rw-r--r--  mm/hugetlb_cgroup.c | 2
-rw-r--r--  mm/internal.h | 28
-rw-r--r--  mm/interval_tree.c | 34
-rw-r--r--  mm/kasan/Makefile | 8
-rw-r--r--  mm/kasan/kasan.c | 516
-rw-r--r--  mm/kasan/kasan.h | 75
-rw-r--r--  mm/kasan/report.c | 269
-rw-r--r--  mm/kmemleak.c | 6
-rw-r--r--  mm/ksm.c | 4
-rw-r--r--  mm/list_lru.c | 467
-rw-r--r--  mm/madvise.c | 30
-rw-r--r--  mm/memcontrol.c | 1096
-rw-r--r--  mm/memory-failure.c | 13
-rw-r--r--  mm/memory.c | 330
-rw-r--r--  mm/mempolicy.c | 286
-rw-r--r--  mm/migrate.c | 45
-rw-r--r--  mm/mincore.c | 175
-rw-r--r--  mm/mm_init.c | 4
-rw-r--r--  mm/mmap.c | 113
-rw-r--r--  mm/mmzone.c | 4
-rw-r--r--  mm/mprotect.c | 50
-rw-r--r--  mm/mremap.c | 2
-rw-r--r--  mm/msync.c | 5
-rw-r--r--  mm/nommu.c | 115
-rw-r--r--  mm/oom_kill.c | 169
-rw-r--r--  mm/page-writeback.c | 89
-rw-r--r--  mm/page_alloc.c | 526
-rw-r--r--  mm/page_counter.c | 7
-rw-r--r--  mm/page_owner.c | 26
-rw-r--r--  mm/pagewalk.c | 235
-rw-r--r--  mm/percpu.c | 6
-rw-r--r--  mm/pgtable-generic.c | 2
-rw-r--r--  mm/process_vm_access.c | 7
-rw-r--r--  mm/readahead.c | 4
-rw-r--r--  mm/rmap.c | 279
-rw-r--r--  mm/shmem.c | 29
-rw-r--r--  mm/slab.c | 17
-rw-r--r--  mm/slab.h | 67
-rw-r--r--  mm/slab_common.c | 323
-rw-r--r--  mm/slob.c | 2
-rw-r--r--  mm/slub.c | 232
-rw-r--r--  mm/swap.c | 6
-rw-r--r--  mm/swap_state.c | 6
-rw-r--r--  mm/truncate.c | 2
-rw-r--r--  mm/util.c | 48
-rw-r--r--  mm/vmalloc.c | 16
-rw-r--r--  mm/vmscan.c | 147
-rw-r--r--  mm/vmstat.c | 130
-rw-r--r--  mm/workingset.c | 9
-rw-r--r--  mm/zbud.c | 3
-rw-r--r--  mm/zpool.c | 6
-rw-r--r--  mm/zsmalloc.c | 239
-rw-r--r--  mm/zswap.c | 5
68 files changed, 4573 insertions, 2919 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 1d1ae6b078fd..de5239c152f9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -325,6 +325,7 @@ config VIRT_TO_BUS | |||
325 | 325 | ||
326 | config MMU_NOTIFIER | 326 | config MMU_NOTIFIER |
327 | bool | 327 | bool |
328 | select SRCU | ||
328 | 329 | ||
329 | config KSM | 330 | config KSM |
330 | bool "Enable KSM for page merging" | 331 | bool "Enable KSM for page merging" |
@@ -601,6 +602,16 @@ config PGTABLE_MAPPING | |||
601 | You can check speed with zsmalloc benchmark: | 602 | You can check speed with zsmalloc benchmark: |
602 | https://github.com/spartacus06/zsmapbench | 603 | https://github.com/spartacus06/zsmapbench |
603 | 604 | ||
605 | config ZSMALLOC_STAT | ||
606 | bool "Export zsmalloc statistics" | ||
607 | depends on ZSMALLOC | ||
608 | select DEBUG_FS | ||
609 | help | ||
610 | This option enables code in the zsmalloc to collect various | ||
611 | statistics about whats happening in zsmalloc and exports that | ||
612 | information to userspace via debugfs. | ||
613 | If unsure, say N. | ||
614 | |||
604 | config GENERIC_EARLY_IOREMAP | 615 | config GENERIC_EARLY_IOREMAP |
605 | bool | 616 | bool |
606 | 617 | ||
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 56badfc4810a..957d3da53ddd 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -14,7 +14,6 @@ config DEBUG_PAGEALLOC | |||
14 | depends on !KMEMCHECK | 14 | depends on !KMEMCHECK |
15 | select PAGE_EXTENSION | 15 | select PAGE_EXTENSION |
16 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC | 16 | select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC |
17 | select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC | ||
18 | ---help--- | 17 | ---help--- |
19 | Unmap pages from the kernel linear mapping after free_pages(). | 18 | Unmap pages from the kernel linear mapping after free_pages(). |
20 | This results in a large slowdown, but helps to find certain types | 19 | This results in a large slowdown, but helps to find certain types |
@@ -27,13 +26,5 @@ config DEBUG_PAGEALLOC | |||
27 | that would result in incorrect warnings of memory corruption after | 26 | that would result in incorrect warnings of memory corruption after |
28 | a resume because free pages are not saved to the suspend image. | 27 | a resume because free pages are not saved to the suspend image. |
29 | 28 | ||
30 | config WANT_PAGE_DEBUG_FLAGS | ||
31 | bool | ||
32 | |||
33 | config PAGE_POISONING | 29 | config PAGE_POISONING |
34 | bool | 30 | bool |
35 | select WANT_PAGE_DEBUG_FLAGS | ||
36 | |||
37 | config PAGE_GUARD | ||
38 | bool | ||
39 | select WANT_PAGE_DEBUG_FLAGS | ||
diff --git a/mm/Makefile b/mm/Makefile
index 4bf586e66378..088c68e9ec35 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -2,8 +2,11 @@ | |||
2 | # Makefile for the linux memory manager. | 2 | # Makefile for the linux memory manager. |
3 | # | 3 | # |
4 | 4 | ||
5 | KASAN_SANITIZE_slab_common.o := n | ||
6 | KASAN_SANITIZE_slub.o := n | ||
7 | |||
5 | mmu-y := nommu.o | 8 | mmu-y := nommu.o |
6 | mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \ | 9 | mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ |
7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 10 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
8 | vmalloc.o pagewalk.o pgtable-generic.o | 11 | vmalloc.o pagewalk.o pgtable-generic.o |
9 | 12 | ||
@@ -49,6 +52,7 @@ obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o | |||
49 | obj-$(CONFIG_SLAB) += slab.o | 52 | obj-$(CONFIG_SLAB) += slab.o |
50 | obj-$(CONFIG_SLUB) += slub.o | 53 | obj-$(CONFIG_SLUB) += slub.o |
51 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o | 54 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o |
55 | obj-$(CONFIG_KASAN) += kasan/ | ||
52 | obj-$(CONFIG_FAILSLAB) += failslab.o | 56 | obj-$(CONFIG_FAILSLAB) += failslab.o |
53 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 57 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
54 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 58 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0ae0df55000b..7690ec77c722 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,19 +14,10 @@ | |||
14 | 14 | ||
15 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); | 15 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); |
16 | 16 | ||
17 | struct backing_dev_info default_backing_dev_info = { | ||
18 | .name = "default", | ||
19 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, | ||
20 | .state = 0, | ||
21 | .capabilities = BDI_CAP_MAP_COPY, | ||
22 | }; | ||
23 | EXPORT_SYMBOL_GPL(default_backing_dev_info); | ||
24 | |||
25 | struct backing_dev_info noop_backing_dev_info = { | 17 | struct backing_dev_info noop_backing_dev_info = { |
26 | .name = "noop", | 18 | .name = "noop", |
27 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 19 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
28 | }; | 20 | }; |
29 | EXPORT_SYMBOL_GPL(noop_backing_dev_info); | ||
30 | 21 | ||
31 | static struct class *bdi_class; | 22 | static struct class *bdi_class; |
32 | 23 | ||
@@ -40,17 +31,6 @@ LIST_HEAD(bdi_list); | |||
40 | /* bdi_wq serves all asynchronous writeback tasks */ | 31 | /* bdi_wq serves all asynchronous writeback tasks */ |
41 | struct workqueue_struct *bdi_wq; | 32 | struct workqueue_struct *bdi_wq; |
42 | 33 | ||
43 | static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) | ||
44 | { | ||
45 | if (wb1 < wb2) { | ||
46 | spin_lock(&wb1->list_lock); | ||
47 | spin_lock_nested(&wb2->list_lock, 1); | ||
48 | } else { | ||
49 | spin_lock(&wb2->list_lock); | ||
50 | spin_lock_nested(&wb1->list_lock, 1); | ||
51 | } | ||
52 | } | ||
53 | |||
54 | #ifdef CONFIG_DEBUG_FS | 34 | #ifdef CONFIG_DEBUG_FS |
55 | #include <linux/debugfs.h> | 35 | #include <linux/debugfs.h> |
56 | #include <linux/seq_file.h> | 36 | #include <linux/seq_file.h> |
@@ -264,9 +244,6 @@ static int __init default_bdi_init(void) | |||
264 | if (!bdi_wq) | 244 | if (!bdi_wq) |
265 | return -ENOMEM; | 245 | return -ENOMEM; |
266 | 246 | ||
267 | err = bdi_init(&default_backing_dev_info); | ||
268 | if (!err) | ||
269 | bdi_register(&default_backing_dev_info, NULL, "default"); | ||
270 | err = bdi_init(&noop_backing_dev_info); | 247 | err = bdi_init(&noop_backing_dev_info); |
271 | 248 | ||
272 | return err; | 249 | return err; |
@@ -355,19 +332,19 @@ EXPORT_SYMBOL(bdi_register_dev); | |||
355 | */ | 332 | */ |
356 | static void bdi_wb_shutdown(struct backing_dev_info *bdi) | 333 | static void bdi_wb_shutdown(struct backing_dev_info *bdi) |
357 | { | 334 | { |
358 | if (!bdi_cap_writeback_dirty(bdi)) | 335 | /* Make sure nobody queues further work */ |
336 | spin_lock_bh(&bdi->wb_lock); | ||
337 | if (!test_and_clear_bit(BDI_registered, &bdi->state)) { | ||
338 | spin_unlock_bh(&bdi->wb_lock); | ||
359 | return; | 339 | return; |
340 | } | ||
341 | spin_unlock_bh(&bdi->wb_lock); | ||
360 | 342 | ||
361 | /* | 343 | /* |
362 | * Make sure nobody finds us on the bdi_list anymore | 344 | * Make sure nobody finds us on the bdi_list anymore |
363 | */ | 345 | */ |
364 | bdi_remove_from_list(bdi); | 346 | bdi_remove_from_list(bdi); |
365 | 347 | ||
366 | /* Make sure nobody queues further work */ | ||
367 | spin_lock_bh(&bdi->wb_lock); | ||
368 | clear_bit(BDI_registered, &bdi->state); | ||
369 | spin_unlock_bh(&bdi->wb_lock); | ||
370 | |||
371 | /* | 348 | /* |
372 | * Drain work list and shutdown the delayed_work. At this point, | 349 | * Drain work list and shutdown the delayed_work. At this point, |
373 | * @bdi->bdi_list is empty telling bdi_writeback_workfn() that @bdi | 350 |
@@ -375,37 +352,22 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) | |||
375 | */ | 352 | */ |
376 | mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); | 353 | mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); |
377 | flush_delayed_work(&bdi->wb.dwork); | 354 | flush_delayed_work(&bdi->wb.dwork); |
378 | WARN_ON(!list_empty(&bdi->work_list)); | ||
379 | WARN_ON(delayed_work_pending(&bdi->wb.dwork)); | ||
380 | } | 355 | } |
381 | 356 | ||
382 | /* | 357 | /* |
383 | * This bdi is going away now, make sure that no super_blocks point to it | 358 | * Called when the device behind @bdi has been removed or ejected. |
359 | * | ||
360 | * We can't really do much here except for reducing the dirty ratio at | ||
361 | * the moment. In the future we should be able to set a flag so that | ||
362 | * the filesystem can handle errors at mark_inode_dirty time instead | ||
363 | * of only at writeback time. | ||
384 | */ | 364 | */ |
385 | static void bdi_prune_sb(struct backing_dev_info *bdi) | ||
386 | { | ||
387 | struct super_block *sb; | ||
388 | |||
389 | spin_lock(&sb_lock); | ||
390 | list_for_each_entry(sb, &super_blocks, s_list) { | ||
391 | if (sb->s_bdi == bdi) | ||
392 | sb->s_bdi = &default_backing_dev_info; | ||
393 | } | ||
394 | spin_unlock(&sb_lock); | ||
395 | } | ||
396 | |||
397 | void bdi_unregister(struct backing_dev_info *bdi) | 365 | void bdi_unregister(struct backing_dev_info *bdi) |
398 | { | 366 | { |
399 | if (bdi->dev) { | 367 | if (WARN_ON_ONCE(!bdi->dev)) |
400 | bdi_set_min_ratio(bdi, 0); | 368 | return; |
401 | trace_writeback_bdi_unregister(bdi); | ||
402 | bdi_prune_sb(bdi); | ||
403 | 369 | ||
404 | bdi_wb_shutdown(bdi); | 370 | bdi_set_min_ratio(bdi, 0); |
405 | bdi_debug_unregister(bdi); | ||
406 | device_unregister(bdi->dev); | ||
407 | bdi->dev = NULL; | ||
408 | } | ||
409 | } | 371 | } |
410 | EXPORT_SYMBOL(bdi_unregister); | 372 | EXPORT_SYMBOL(bdi_unregister); |
411 | 373 | ||
@@ -474,37 +436,19 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
474 | { | 436 | { |
475 | int i; | 437 | int i; |
476 | 438 | ||
477 | /* | 439 | bdi_wb_shutdown(bdi); |
478 | * Splice our entries to the default_backing_dev_info. This | ||
479 | * condition shouldn't happen. @wb must be empty at this point and | ||
480 | * dirty inodes on it might cause other issues. This workaround is | ||
481 | * added by ce5f8e779519 ("writeback: splice dirty inode entries to | ||
482 | * default bdi on bdi_destroy()") without root-causing the issue. | ||
483 | * | ||
484 | * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com | ||
485 | * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350 | ||
486 | * | ||
487 | * We should probably add WARN_ON() to find out whether it still | ||
488 | * happens and track it down if so. | ||
489 | */ | ||
490 | if (bdi_has_dirty_io(bdi)) { | ||
491 | struct bdi_writeback *dst = &default_backing_dev_info.wb; | ||
492 | |||
493 | bdi_lock_two(&bdi->wb, dst); | ||
494 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); | ||
495 | list_splice(&bdi->wb.b_io, &dst->b_io); | ||
496 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); | ||
497 | spin_unlock(&bdi->wb.list_lock); | ||
498 | spin_unlock(&dst->list_lock); | ||
499 | } | ||
500 | |||
501 | bdi_unregister(bdi); | ||
502 | 440 | ||
441 | WARN_ON(!list_empty(&bdi->work_list)); | ||
503 | WARN_ON(delayed_work_pending(&bdi->wb.dwork)); | 442 | WARN_ON(delayed_work_pending(&bdi->wb.dwork)); |
504 | 443 | ||
444 | if (bdi->dev) { | ||
445 | bdi_debug_unregister(bdi); | ||
446 | device_unregister(bdi->dev); | ||
447 | bdi->dev = NULL; | ||
448 | } | ||
449 | |||
505 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) | 450 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) |
506 | percpu_counter_destroy(&bdi->bdi_stat[i]); | 451 | percpu_counter_destroy(&bdi->bdi_stat[i]); |
507 | |||
508 | fprop_local_destroy_percpu(&bdi->completions); | 452 | fprop_local_destroy_percpu(&bdi->completions); |
509 | } | 453 | } |
510 | EXPORT_SYMBOL(bdi_destroy); | 454 | EXPORT_SYMBOL(bdi_destroy); |
@@ -513,13 +457,12 @@ EXPORT_SYMBOL(bdi_destroy); | |||
513 | * For use from filesystems to quickly init and register a bdi associated | 457 | * For use from filesystems to quickly init and register a bdi associated |
514 | * with dirty writeback | 458 | * with dirty writeback |
515 | */ | 459 | */ |
516 | int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, | 460 | int bdi_setup_and_register(struct backing_dev_info *bdi, char *name) |
517 | unsigned int cap) | ||
518 | { | 461 | { |
519 | int err; | 462 | int err; |
520 | 463 | ||
521 | bdi->name = name; | 464 | bdi->name = name; |
522 | bdi->capabilities = cap; | 465 | bdi->capabilities = 0; |
523 | err = bdi_init(bdi); | 466 | err = bdi_init(bdi); |
524 | if (err) | 467 | if (err) |
525 | return err; | 468 | return err; |
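With default_backing_dev_info gone and the capability argument dropped, bdi_setup_and_register() now takes only the bdi and a name, and hard-codes the capabilities to 0. A minimal sketch of what a converted caller looks like (illustrative only: foo_fs_info, foo_fill_super() and the "foofs" name are not part of this series):

/* Illustrative caller of the new two-argument helper. */
struct foo_fs_info {
	struct backing_dev_info bdi;
};

static int foo_fill_super(struct super_block *sb, struct foo_fs_info *fsi)
{
	int err;

	err = bdi_setup_and_register(&fsi->bdi, "foofs");
	if (err)
		return err;

	sb->s_bdi = &fsi->bdi;
	return 0;
}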
diff --git a/mm/cleancache.c b/mm/cleancache.c
index d0eac4350403..053bcd8f12fb 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -25,7 +25,7 @@ | |||
25 | static struct cleancache_ops *cleancache_ops __read_mostly; | 25 | static struct cleancache_ops *cleancache_ops __read_mostly; |
26 | 26 | ||
27 | /* | 27 | /* |
28 | * Counters available via /sys/kernel/debug/frontswap (if debugfs is | 28 | * Counters available via /sys/kernel/debug/cleancache (if debugfs is |
29 | * properly configured. These are for information only so are not protected | 29 | * properly configured. These are for information only so are not protected |
30 | * against increment races. | 30 | * against increment races. |
31 | */ | 31 | */ |
diff --git a/mm/cma.c b/mm/cma.c
@@ -199,6 +199,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, | |||
199 | cma->order_per_bit = order_per_bit; | 199 | cma->order_per_bit = order_per_bit; |
200 | *res_cma = cma; | 200 | *res_cma = cma; |
201 | cma_area_count++; | 201 | cma_area_count++; |
202 | totalcma_pages += (size / PAGE_SIZE); | ||
202 | 203 | ||
203 | return 0; | 204 | return 0; |
204 | } | 205 | } |
@@ -337,7 +338,6 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
337 | if (ret) | 338 | if (ret) |
338 | goto err; | 339 | goto err; |
339 | 340 | ||
340 | totalcma_pages += (size / PAGE_SIZE); | ||
341 | pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, | 341 | pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M, |
342 | &base); | 342 | &base); |
343 | return 0; | 343 | return 0; |
diff --git a/mm/compaction.c b/mm/compaction.c
index 546e571e9d60..8c0d9459b54a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
17 | #include <linux/balloon_compaction.h> | 17 | #include <linux/balloon_compaction.h> |
18 | #include <linux/page-isolation.h> | 18 | #include <linux/page-isolation.h> |
19 | #include <linux/kasan.h> | ||
19 | #include "internal.h" | 20 | #include "internal.h" |
20 | 21 | ||
21 | #ifdef CONFIG_COMPACTION | 22 | #ifdef CONFIG_COMPACTION |
@@ -34,6 +35,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta) | |||
34 | #endif | 35 | #endif |
35 | 36 | ||
36 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 37 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
38 | #ifdef CONFIG_TRACEPOINTS | ||
39 | static const char *const compaction_status_string[] = { | ||
40 | "deferred", | ||
41 | "skipped", | ||
42 | "continue", | ||
43 | "partial", | ||
44 | "complete", | ||
45 | "no_suitable_page", | ||
46 | "not_suitable_zone", | ||
47 | }; | ||
48 | #endif | ||
37 | 49 | ||
38 | #define CREATE_TRACE_POINTS | 50 | #define CREATE_TRACE_POINTS |
39 | #include <trace/events/compaction.h> | 51 | #include <trace/events/compaction.h> |
@@ -61,6 +73,7 @@ static void map_pages(struct list_head *list) | |||
61 | list_for_each_entry(page, list, lru) { | 73 | list_for_each_entry(page, list, lru) { |
62 | arch_alloc_page(page, 0); | 74 | arch_alloc_page(page, 0); |
63 | kernel_map_pages(page, 1, 1); | 75 | kernel_map_pages(page, 1, 1); |
76 | kasan_alloc_pages(page, 0); | ||
64 | } | 77 | } |
65 | } | 78 | } |
66 | 79 | ||
@@ -113,6 +126,77 @@ static struct page *pageblock_pfn_to_page(unsigned long start_pfn, | |||
113 | } | 126 | } |
114 | 127 | ||
115 | #ifdef CONFIG_COMPACTION | 128 | #ifdef CONFIG_COMPACTION |
129 | |||
130 | /* Do not skip compaction more than 64 times */ | ||
131 | #define COMPACT_MAX_DEFER_SHIFT 6 | ||
132 | |||
133 | /* | ||
134 | * Compaction is deferred when compaction fails to result in a page | ||
135 | * allocation success. 1 << compact_defer_limit compactions are skipped up | ||
136 | * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT | ||
137 | */ | ||
138 | void defer_compaction(struct zone *zone, int order) | ||
139 | { | ||
140 | zone->compact_considered = 0; | ||
141 | zone->compact_defer_shift++; | ||
142 | |||
143 | if (order < zone->compact_order_failed) | ||
144 | zone->compact_order_failed = order; | ||
145 | |||
146 | if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT) | ||
147 | zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT; | ||
148 | |||
149 | trace_mm_compaction_defer_compaction(zone, order); | ||
150 | } | ||
151 | |||
152 | /* Returns true if compaction should be skipped this time */ | ||
153 | bool compaction_deferred(struct zone *zone, int order) | ||
154 | { | ||
155 | unsigned long defer_limit = 1UL << zone->compact_defer_shift; | ||
156 | |||
157 | if (order < zone->compact_order_failed) | ||
158 | return false; | ||
159 | |||
160 | /* Avoid possible overflow */ | ||
161 | if (++zone->compact_considered > defer_limit) | ||
162 | zone->compact_considered = defer_limit; | ||
163 | |||
164 | if (zone->compact_considered >= defer_limit) | ||
165 | return false; | ||
166 | |||
167 | trace_mm_compaction_deferred(zone, order); | ||
168 | |||
169 | return true; | ||
170 | } | ||
171 | |||
172 | /* | ||
173 | * Update defer tracking counters after successful compaction of given order, | ||
174 | * which means an allocation either succeeded (alloc_success == true) or is | ||
175 | * expected to succeed. | ||
176 | */ | ||
177 | void compaction_defer_reset(struct zone *zone, int order, | ||
178 | bool alloc_success) | ||
179 | { | ||
180 | if (alloc_success) { | ||
181 | zone->compact_considered = 0; | ||
182 | zone->compact_defer_shift = 0; | ||
183 | } | ||
184 | if (order >= zone->compact_order_failed) | ||
185 | zone->compact_order_failed = order + 1; | ||
186 | |||
187 | trace_mm_compaction_defer_reset(zone, order); | ||
188 | } | ||
189 | |||
190 | /* Returns true if restarting compaction after many failures */ | ||
191 | bool compaction_restarting(struct zone *zone, int order) | ||
192 | { | ||
193 | if (order < zone->compact_order_failed) | ||
194 | return false; | ||
195 | |||
196 | return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT && | ||
197 | zone->compact_considered >= 1UL << zone->compact_defer_shift; | ||
198 | } | ||
199 | |||
116 | /* Returns true if the pageblock should be scanned for pages to isolate. */ | 200 | /* Returns true if the pageblock should be scanned for pages to isolate. */ |
117 | static inline bool isolation_suitable(struct compact_control *cc, | 201 | static inline bool isolation_suitable(struct compact_control *cc, |
118 | struct page *page) | 202 | struct page *page) |
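The deferral helpers pulled into compaction.c implement an exponential backoff: each time compaction fails to help an allocation, defer_compaction() widens the skip window (1 << compact_defer_shift attempts, capped at 1 << COMPACT_MAX_DEFER_SHIFT = 64), and compaction_deferred() consumes that window before compaction is attempted again. The standalone model below (plain userspace C, ignoring the per-order compact_order_failed bookkeeping) exercises just the two counters:

/* Standalone model of the deferral counters above; not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define COMPACT_MAX_DEFER_SHIFT 6

static unsigned int considered, defer_shift;

static void defer(void)			/* models defer_compaction() */
{
	considered = 0;
	if (++defer_shift > COMPACT_MAX_DEFER_SHIFT)
		defer_shift = COMPACT_MAX_DEFER_SHIFT;
}

static bool deferred(void)		/* models compaction_deferred() */
{
	unsigned int limit = 1U << defer_shift;

	if (++considered > limit)
		considered = limit;
	return considered < limit;
}

int main(void)
{
	int attempt, skipped = 0;

	defer();	/* one failed compaction widens the window */
	for (attempt = 0; attempt < 10; attempt++)
		if (deferred())
			skipped++;
	printf("skipped %d of 10 attempts after one failure\n", skipped);
	return 0;
}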
@@ -408,6 +492,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
408 | 492 | ||
409 | /* If a page was split, advance to the end of it */ | 493 | /* If a page was split, advance to the end of it */ |
410 | if (isolated) { | 494 | if (isolated) { |
495 | cc->nr_freepages += isolated; | ||
496 | if (!strict && | ||
497 | cc->nr_migratepages <= cc->nr_freepages) { | ||
498 | blockpfn += isolated; | ||
499 | break; | ||
500 | } | ||
501 | |||
411 | blockpfn += isolated - 1; | 502 | blockpfn += isolated - 1; |
412 | cursor += isolated - 1; | 503 | cursor += isolated - 1; |
413 | continue; | 504 | continue; |
@@ -421,11 +512,12 @@ isolate_fail: | |||
421 | 512 | ||
422 | } | 513 | } |
423 | 514 | ||
515 | trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn, | ||
516 | nr_scanned, total_isolated); | ||
517 | |||
424 | /* Record how far we have got within the block */ | 518 | /* Record how far we have got within the block */ |
425 | *start_pfn = blockpfn; | 519 | *start_pfn = blockpfn; |
426 | 520 | ||
427 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); | ||
428 | |||
429 | /* | 521 | /* |
430 | * If strict isolation is requested by CMA then check that all the | 522 | * If strict isolation is requested by CMA then check that all the |
431 | * pages requested were isolated. If there were any failures, 0 is | 523 | * pages requested were isolated. If there were any failures, 0 is |
@@ -581,6 +673,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, | |||
581 | unsigned long flags = 0; | 673 | unsigned long flags = 0; |
582 | bool locked = false; | 674 | bool locked = false; |
583 | struct page *page = NULL, *valid_page = NULL; | 675 | struct page *page = NULL, *valid_page = NULL; |
676 | unsigned long start_pfn = low_pfn; | ||
584 | 677 | ||
585 | /* | 678 | /* |
586 | * Ensure that there are not too many pages isolated from the LRU | 679 | * Ensure that there are not too many pages isolated from the LRU |
@@ -741,7 +834,8 @@ isolate_success: | |||
741 | if (low_pfn == end_pfn) | 834 | if (low_pfn == end_pfn) |
742 | update_pageblock_skip(cc, valid_page, nr_isolated, true); | 835 | update_pageblock_skip(cc, valid_page, nr_isolated, true); |
743 | 836 | ||
744 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 837 | trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn, |
838 | nr_scanned, nr_isolated); | ||
745 | 839 | ||
746 | count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); | 840 | count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned); |
747 | if (nr_isolated) | 841 | if (nr_isolated) |
@@ -814,7 +908,6 @@ static void isolate_freepages(struct compact_control *cc) | |||
814 | unsigned long isolate_start_pfn; /* exact pfn we start at */ | 908 | unsigned long isolate_start_pfn; /* exact pfn we start at */ |
815 | unsigned long block_end_pfn; /* end of current pageblock */ | 909 | unsigned long block_end_pfn; /* end of current pageblock */ |
816 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ | 910 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ |
817 | int nr_freepages = cc->nr_freepages; | ||
818 | struct list_head *freelist = &cc->freepages; | 911 | struct list_head *freelist = &cc->freepages; |
819 | 912 | ||
820 | /* | 913 | /* |
@@ -839,11 +932,11 @@ static void isolate_freepages(struct compact_control *cc) | |||
839 | * pages on cc->migratepages. We stop searching if the migrate | 932 | * pages on cc->migratepages. We stop searching if the migrate |
840 | * and free page scanners meet or enough free pages are isolated. | 933 | * and free page scanners meet or enough free pages are isolated. |
841 | */ | 934 | */ |
842 | for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; | 935 | for (; block_start_pfn >= low_pfn && |
936 | cc->nr_migratepages > cc->nr_freepages; | ||
843 | block_end_pfn = block_start_pfn, | 937 | block_end_pfn = block_start_pfn, |
844 | block_start_pfn -= pageblock_nr_pages, | 938 | block_start_pfn -= pageblock_nr_pages, |
845 | isolate_start_pfn = block_start_pfn) { | 939 | isolate_start_pfn = block_start_pfn) { |
846 | unsigned long isolated; | ||
847 | 940 | ||
848 | /* | 941 | /* |
849 | * This can iterate a massively long zone without finding any | 942 | * This can iterate a massively long zone without finding any |
@@ -868,9 +961,8 @@ static void isolate_freepages(struct compact_control *cc) | |||
868 | continue; | 961 | continue; |
869 | 962 | ||
870 | /* Found a block suitable for isolating free pages from. */ | 963 | /* Found a block suitable for isolating free pages from. */ |
871 | isolated = isolate_freepages_block(cc, &isolate_start_pfn, | 964 | isolate_freepages_block(cc, &isolate_start_pfn, |
872 | block_end_pfn, freelist, false); | 965 | block_end_pfn, freelist, false); |
873 | nr_freepages += isolated; | ||
874 | 966 | ||
875 | /* | 967 | /* |
876 | * Remember where the free scanner should restart next time, | 968 | * Remember where the free scanner should restart next time, |
@@ -902,8 +994,6 @@ static void isolate_freepages(struct compact_control *cc) | |||
902 | */ | 994 | */ |
903 | if (block_start_pfn < low_pfn) | 995 | if (block_start_pfn < low_pfn) |
904 | cc->free_pfn = cc->migrate_pfn; | 996 | cc->free_pfn = cc->migrate_pfn; |
905 | |||
906 | cc->nr_freepages = nr_freepages; | ||
907 | } | 997 | } |
908 | 998 | ||
909 | /* | 999 | /* |
@@ -1015,8 +1105,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1015 | low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, | 1105 | low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, |
1016 | isolate_mode); | 1106 | isolate_mode); |
1017 | 1107 | ||
1018 | if (!low_pfn || cc->contended) | 1108 | if (!low_pfn || cc->contended) { |
1109 | acct_isolated(zone, cc); | ||
1019 | return ISOLATE_ABORT; | 1110 | return ISOLATE_ABORT; |
1111 | } | ||
1020 | 1112 | ||
1021 | /* | 1113 | /* |
1022 | * Either we isolated something and proceed with migration. Or | 1114 | * Either we isolated something and proceed with migration. Or |
@@ -1037,7 +1129,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1037 | return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; | 1129 | return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; |
1038 | } | 1130 | } |
1039 | 1131 | ||
1040 | static int compact_finished(struct zone *zone, struct compact_control *cc, | 1132 | static int __compact_finished(struct zone *zone, struct compact_control *cc, |
1041 | const int migratetype) | 1133 | const int migratetype) |
1042 | { | 1134 | { |
1043 | unsigned int order; | 1135 | unsigned int order; |
@@ -1088,11 +1180,24 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, | |||
1088 | return COMPACT_PARTIAL; | 1180 | return COMPACT_PARTIAL; |
1089 | 1181 | ||
1090 | /* Job done if allocation would set block type */ | 1182 | /* Job done if allocation would set block type */ |
1091 | if (cc->order >= pageblock_order && area->nr_free) | 1183 | if (order >= pageblock_order && area->nr_free) |
1092 | return COMPACT_PARTIAL; | 1184 | return COMPACT_PARTIAL; |
1093 | } | 1185 | } |
1094 | 1186 | ||
1095 | return COMPACT_CONTINUE; | 1187 | return COMPACT_NO_SUITABLE_PAGE; |
1188 | } | ||
1189 | |||
1190 | static int compact_finished(struct zone *zone, struct compact_control *cc, | ||
1191 | const int migratetype) | ||
1192 | { | ||
1193 | int ret; | ||
1194 | |||
1195 | ret = __compact_finished(zone, cc, migratetype); | ||
1196 | trace_mm_compaction_finished(zone, cc->order, ret); | ||
1197 | if (ret == COMPACT_NO_SUITABLE_PAGE) | ||
1198 | ret = COMPACT_CONTINUE; | ||
1199 | |||
1200 | return ret; | ||
1096 | } | 1201 | } |
1097 | 1202 | ||
1098 | /* | 1203 | /* |
@@ -1102,7 +1207,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, | |||
1102 | * COMPACT_PARTIAL - If the allocation would succeed without compaction | 1207 | * COMPACT_PARTIAL - If the allocation would succeed without compaction |
1103 | * COMPACT_CONTINUE - If compaction should run now | 1208 | * COMPACT_CONTINUE - If compaction should run now |
1104 | */ | 1209 | */ |
1105 | unsigned long compaction_suitable(struct zone *zone, int order, | 1210 | static unsigned long __compaction_suitable(struct zone *zone, int order, |
1106 | int alloc_flags, int classzone_idx) | 1211 | int alloc_flags, int classzone_idx) |
1107 | { | 1212 | { |
1108 | int fragindex; | 1213 | int fragindex; |
@@ -1146,11 +1251,24 @@ unsigned long compaction_suitable(struct zone *zone, int order, | |||
1146 | */ | 1251 | */ |
1147 | fragindex = fragmentation_index(zone, order); | 1252 | fragindex = fragmentation_index(zone, order); |
1148 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | 1253 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) |
1149 | return COMPACT_SKIPPED; | 1254 | return COMPACT_NOT_SUITABLE_ZONE; |
1150 | 1255 | ||
1151 | return COMPACT_CONTINUE; | 1256 | return COMPACT_CONTINUE; |
1152 | } | 1257 | } |
1153 | 1258 | ||
1259 | unsigned long compaction_suitable(struct zone *zone, int order, | ||
1260 | int alloc_flags, int classzone_idx) | ||
1261 | { | ||
1262 | unsigned long ret; | ||
1263 | |||
1264 | ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); | ||
1265 | trace_mm_compaction_suitable(zone, order, ret); | ||
1266 | if (ret == COMPACT_NOT_SUITABLE_ZONE) | ||
1267 | ret = COMPACT_SKIPPED; | ||
1268 | |||
1269 | return ret; | ||
1270 | } | ||
1271 | |||
1154 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 1272 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
1155 | { | 1273 | { |
1156 | int ret; | 1274 | int ret; |
@@ -1197,7 +1315,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1197 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; | 1315 | zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; |
1198 | } | 1316 | } |
1199 | 1317 | ||
1200 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); | 1318 | trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, |
1319 | cc->free_pfn, end_pfn, sync); | ||
1201 | 1320 | ||
1202 | migrate_prep_local(); | 1321 | migrate_prep_local(); |
1203 | 1322 | ||
@@ -1299,7 +1418,8 @@ out: | |||
1299 | zone->compact_cached_free_pfn = free_pfn; | 1418 | zone->compact_cached_free_pfn = free_pfn; |
1300 | } | 1419 | } |
1301 | 1420 | ||
1302 | trace_mm_compaction_end(ret); | 1421 | trace_mm_compaction_end(start_pfn, cc->migrate_pfn, |
1422 | cc->free_pfn, end_pfn, sync, ret); | ||
1303 | 1423 | ||
1304 | return ret; | 1424 | return ret; |
1305 | } | 1425 | } |
@@ -1335,22 +1455,20 @@ int sysctl_extfrag_threshold = 500; | |||
1335 | 1455 | ||
1336 | /** | 1456 | /** |
1337 | * try_to_compact_pages - Direct compact to satisfy a high-order allocation | 1457 | * try_to_compact_pages - Direct compact to satisfy a high-order allocation |
1338 | * @zonelist: The zonelist used for the current allocation | ||
1339 | * @order: The order of the current allocation | ||
1340 | * @gfp_mask: The GFP mask of the current allocation | 1458 | * @gfp_mask: The GFP mask of the current allocation |
1341 | * @nodemask: The allowed nodes to allocate from | 1459 | * @order: The order of the current allocation |
1460 | * @alloc_flags: The allocation flags of the current allocation | ||
1461 | * @ac: The context of current allocation | ||
1342 | * @mode: The migration mode for async, sync light, or sync migration | 1462 | * @mode: The migration mode for async, sync light, or sync migration |
1343 | * @contended: Return value that determines if compaction was aborted due to | 1463 | * @contended: Return value that determines if compaction was aborted due to |
1344 | * need_resched() or lock contention | 1464 | * need_resched() or lock contention |
1345 | * | 1465 | * |
1346 | * This is the main entry point for direct page compaction. | 1466 | * This is the main entry point for direct page compaction. |
1347 | */ | 1467 | */ |
1348 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1468 | unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, |
1349 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1469 | int alloc_flags, const struct alloc_context *ac, |
1350 | enum migrate_mode mode, int *contended, | 1470 | enum migrate_mode mode, int *contended) |
1351 | int alloc_flags, int classzone_idx) | ||
1352 | { | 1471 | { |
1353 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
1354 | int may_enter_fs = gfp_mask & __GFP_FS; | 1472 | int may_enter_fs = gfp_mask & __GFP_FS; |
1355 | int may_perform_io = gfp_mask & __GFP_IO; | 1473 | int may_perform_io = gfp_mask & __GFP_IO; |
1356 | struct zoneref *z; | 1474 | struct zoneref *z; |
@@ -1364,9 +1482,11 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1364 | if (!order || !may_enter_fs || !may_perform_io) | 1482 | if (!order || !may_enter_fs || !may_perform_io) |
1365 | return COMPACT_SKIPPED; | 1483 | return COMPACT_SKIPPED; |
1366 | 1484 | ||
1485 | trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode); | ||
1486 | |||
1367 | /* Compact each zone in the list */ | 1487 | /* Compact each zone in the list */ |
1368 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 1488 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, |
1369 | nodemask) { | 1489 | ac->nodemask) { |
1370 | int status; | 1490 | int status; |
1371 | int zone_contended; | 1491 | int zone_contended; |
1372 | 1492 | ||
@@ -1374,7 +1494,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1374 | continue; | 1494 | continue; |
1375 | 1495 | ||
1376 | status = compact_zone_order(zone, order, gfp_mask, mode, | 1496 | status = compact_zone_order(zone, order, gfp_mask, mode, |
1377 | &zone_contended, alloc_flags, classzone_idx); | 1497 | &zone_contended, alloc_flags, |
1498 | ac->classzone_idx); | ||
1378 | rc = max(status, rc); | 1499 | rc = max(status, rc); |
1379 | /* | 1500 | /* |
1380 | * It takes at least one zone that wasn't lock contended | 1501 | * It takes at least one zone that wasn't lock contended |
@@ -1384,7 +1505,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
1384 | 1505 | ||
1385 | /* If a normal allocation would succeed, stop compacting */ | 1506 | /* If a normal allocation would succeed, stop compacting */ |
1386 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), | 1507 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), |
1387 | classzone_idx, alloc_flags)) { | 1508 | ac->classzone_idx, alloc_flags)) { |
1388 | /* | 1509 | /* |
1389 | * We think the allocation will succeed in this zone, | 1510 | * We think the allocation will succeed in this zone, |
1390 | * but it is not certain, hence the false. The caller | 1511 | * but it is not certain, hence the false. The caller |
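try_to_compact_pages() now receives the allocation parameters bundled into a struct alloc_context instead of a separate zonelist, nodemask and classzone_idx. Judging only from the fields this file dereferences, the context has roughly the shape sketched below; the authoritative definition lives in mm/internal.h and may carry additional members:

/* Rough shape only, inferred from the ac-> accesses above. */
struct alloc_context {
	struct zonelist *zonelist;
	nodemask_t *nodemask;
	enum zone_type high_zoneidx;
	int classzone_idx;
};

Bundling the parameters lets the page allocator hand a single pointer to the compaction and reclaim paths instead of re-plumbing each argument through every helper.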
diff --git a/mm/debug.c b/mm/debug.c
index 0e58f3211f89..3eb3ac2fcee7 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -130,7 +130,6 @@ static const struct trace_print_flags vmaflags_names[] = { | |||
130 | {VM_ACCOUNT, "account" }, | 130 | {VM_ACCOUNT, "account" }, |
131 | {VM_NORESERVE, "noreserve" }, | 131 | {VM_NORESERVE, "noreserve" }, |
132 | {VM_HUGETLB, "hugetlb" }, | 132 | {VM_HUGETLB, "hugetlb" }, |
133 | {VM_NONLINEAR, "nonlinear" }, | ||
134 | #if defined(CONFIG_X86) | 133 | #if defined(CONFIG_X86) |
135 | {VM_PAT, "pat" }, | 134 | {VM_PAT, "pat" }, |
136 | #elif defined(CONFIG_PPC) | 135 | #elif defined(CONFIG_PPC) |
@@ -174,7 +173,7 @@ void dump_mm(const struct mm_struct *mm) | |||
174 | "get_unmapped_area %p\n" | 173 | "get_unmapped_area %p\n" |
175 | #endif | 174 | #endif |
176 | "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" | 175 | "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" |
177 | "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" | 176 | "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" |
178 | "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" | 177 | "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" |
179 | "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" | 178 | "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" |
180 | "start_code %lx end_code %lx start_data %lx end_data %lx\n" | 179 | "start_code %lx end_code %lx start_data %lx end_data %lx\n" |
@@ -207,6 +206,7 @@ void dump_mm(const struct mm_struct *mm) | |||
207 | mm->pgd, atomic_read(&mm->mm_users), | 206 | mm->pgd, atomic_read(&mm->mm_users), |
208 | atomic_read(&mm->mm_count), | 207 | atomic_read(&mm->mm_count), |
209 | atomic_long_read((atomic_long_t *)&mm->nr_ptes), | 208 | atomic_long_read((atomic_long_t *)&mm->nr_ptes), |
209 | mm_nr_pmds((struct mm_struct *)mm), | ||
210 | mm->map_count, | 210 | mm->map_count, |
211 | mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, | 211 | mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, |
212 | mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, | 212 | mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, |
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 2ad7adf4f0a4..fac23ecf8d72 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -73,7 +73,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) | |||
73 | else | 73 | else |
74 | endbyte--; /* inclusive */ | 74 | endbyte--; /* inclusive */ |
75 | 75 | ||
76 | bdi = mapping->backing_dev_info; | 76 | bdi = inode_to_bdi(mapping->host); |
77 | 77 | ||
78 | switch (advice) { | 78 | switch (advice) { |
79 | case POSIX_FADV_NORMAL: | 79 | case POSIX_FADV_NORMAL: |
@@ -113,7 +113,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) | |||
113 | case POSIX_FADV_NOREUSE: | 113 | case POSIX_FADV_NOREUSE: |
114 | break; | 114 | break; |
115 | case POSIX_FADV_DONTNEED: | 115 | case POSIX_FADV_DONTNEED: |
116 | if (!bdi_write_congested(mapping->backing_dev_info)) | 116 | if (!bdi_write_congested(bdi)) |
117 | __filemap_fdatawrite_range(mapping, offset, endbyte, | 117 | __filemap_fdatawrite_range(mapping, offset, endbyte, |
118 | WB_SYNC_NONE); | 118 | WB_SYNC_NONE); |
119 | 119 | ||
diff --git a/mm/filemap.c b/mm/filemap.c
index 673e4581a2e5..d9f5336552d7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -211,7 +211,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) | |||
211 | */ | 211 | */ |
212 | if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { | 212 | if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { |
213 | dec_zone_page_state(page, NR_FILE_DIRTY); | 213 | dec_zone_page_state(page, NR_FILE_DIRTY); |
214 | dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); | 214 | dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); |
215 | } | 215 | } |
216 | } | 216 | } |
217 | 217 | ||
@@ -2087,7 +2087,6 @@ const struct vm_operations_struct generic_file_vm_ops = { | |||
2087 | .fault = filemap_fault, | 2087 | .fault = filemap_fault, |
2088 | .map_pages = filemap_map_pages, | 2088 | .map_pages = filemap_map_pages, |
2089 | .page_mkwrite = filemap_page_mkwrite, | 2089 | .page_mkwrite = filemap_page_mkwrite, |
2090 | .remap_pages = generic_file_remap_pages, | ||
2091 | }; | 2090 | }; |
2092 | 2091 | ||
2093 | /* This is used for a general mmap of a disk file */ | 2092 | /* This is used for a general mmap of a disk file */ |
@@ -2565,7 +2564,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
2565 | size_t count = iov_iter_count(from); | 2564 | size_t count = iov_iter_count(from); |
2566 | 2565 | ||
2567 | /* We can write back this queue in page reclaim */ | 2566 | /* We can write back this queue in page reclaim */ |
2568 | current->backing_dev_info = mapping->backing_dev_info; | 2567 | current->backing_dev_info = inode_to_bdi(inode); |
2569 | err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); | 2568 | err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); |
2570 | if (err) | 2569 | if (err) |
2571 | goto out; | 2570 | goto out; |
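The filemap, fadvise and XIP hunks all replace mapping->backing_dev_info with inode_to_bdi(mapping->host). The helper itself is added on the fs/ side of this series and is not visible in this mm-only diff; the sketch below conveys only the general idea (resolve the bdi through the superblock) and omits the block-device special case, so treat it as an approximation rather than the real definition:

/* Approximation of the new helper; not the actual fs/ definition. */
static inline struct backing_dev_info *inode_to_bdi_sketch(struct inode *inode)
{
	if (!inode)
		return &noop_backing_dev_info;
	return inode->i_sb->s_bdi;
}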
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 0d105aeff82f..c175f9f25210 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -9,6 +9,7 @@ | |||
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/fs.h> | 11 | #include <linux/fs.h> |
12 | #include <linux/backing-dev.h> | ||
12 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
13 | #include <linux/export.h> | 14 | #include <linux/export.h> |
14 | #include <linux/uio.h> | 15 | #include <linux/uio.h> |
@@ -301,7 +302,6 @@ out: | |||
301 | static const struct vm_operations_struct xip_file_vm_ops = { | 302 | static const struct vm_operations_struct xip_file_vm_ops = { |
302 | .fault = xip_file_fault, | 303 | .fault = xip_file_fault, |
303 | .page_mkwrite = filemap_page_mkwrite, | 304 | .page_mkwrite = filemap_page_mkwrite, |
304 | .remap_pages = generic_file_remap_pages, | ||
305 | }; | 305 | }; |
306 | 306 | ||
307 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) | 307 | int xip_file_mmap(struct file * file, struct vm_area_struct * vma) |
@@ -410,7 +410,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, | |||
410 | count = len; | 410 | count = len; |
411 | 411 | ||
412 | /* We can write back this queue in page reclaim */ | 412 | /* We can write back this queue in page reclaim */ |
413 | current->backing_dev_info = mapping->backing_dev_info; | 413 | current->backing_dev_info = inode_to_bdi(inode); |
414 | 414 | ||
415 | ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); | 415 | ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode)); |
416 | if (ret) | 416 | if (ret) |
diff --git a/mm/fremap.c b/mm/fremap.c
deleted file mode 100644
index 2805d71cf476..000000000000
--- a/mm/fremap.c
+++ /dev/null
@@ -1,283 +0,0 @@ | |||
1 | /* | ||
2 | * linux/mm/fremap.c | ||
3 | * | ||
4 | * Explicit pagetable population and nonlinear (random) mappings support. | ||
5 | * | ||
6 | * started by Ingo Molnar, Copyright (C) 2002, 2003 | ||
7 | */ | ||
8 | #include <linux/export.h> | ||
9 | #include <linux/backing-dev.h> | ||
10 | #include <linux/mm.h> | ||
11 | #include <linux/swap.h> | ||
12 | #include <linux/file.h> | ||
13 | #include <linux/mman.h> | ||
14 | #include <linux/pagemap.h> | ||
15 | #include <linux/swapops.h> | ||
16 | #include <linux/rmap.h> | ||
17 | #include <linux/syscalls.h> | ||
18 | #include <linux/mmu_notifier.h> | ||
19 | |||
20 | #include <asm/mmu_context.h> | ||
21 | #include <asm/cacheflush.h> | ||
22 | #include <asm/tlbflush.h> | ||
23 | |||
24 | #include "internal.h" | ||
25 | |||
26 | static int mm_counter(struct page *page) | ||
27 | { | ||
28 | return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; | ||
29 | } | ||
30 | |||
31 | static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | ||
32 | unsigned long addr, pte_t *ptep) | ||
33 | { | ||
34 | pte_t pte = *ptep; | ||
35 | struct page *page; | ||
36 | swp_entry_t entry; | ||
37 | |||
38 | if (pte_present(pte)) { | ||
39 | flush_cache_page(vma, addr, pte_pfn(pte)); | ||
40 | pte = ptep_clear_flush_notify(vma, addr, ptep); | ||
41 | page = vm_normal_page(vma, addr, pte); | ||
42 | if (page) { | ||
43 | if (pte_dirty(pte)) | ||
44 | set_page_dirty(page); | ||
45 | update_hiwater_rss(mm); | ||
46 | dec_mm_counter(mm, mm_counter(page)); | ||
47 | page_remove_rmap(page); | ||
48 | page_cache_release(page); | ||
49 | } | ||
50 | } else { /* zap_pte() is not called when pte_none() */ | ||
51 | if (!pte_file(pte)) { | ||
52 | update_hiwater_rss(mm); | ||
53 | entry = pte_to_swp_entry(pte); | ||
54 | if (non_swap_entry(entry)) { | ||
55 | if (is_migration_entry(entry)) { | ||
56 | page = migration_entry_to_page(entry); | ||
57 | dec_mm_counter(mm, mm_counter(page)); | ||
58 | } | ||
59 | } else { | ||
60 | free_swap_and_cache(entry); | ||
61 | dec_mm_counter(mm, MM_SWAPENTS); | ||
62 | } | ||
63 | } | ||
64 | pte_clear_not_present_full(mm, addr, ptep, 0); | ||
65 | } | ||
66 | } | ||
67 | |||
68 | /* | ||
69 | * Install a file pte to a given virtual memory address, release any | ||
70 | * previously existing mapping. | ||
71 | */ | ||
72 | static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | ||
73 | unsigned long addr, unsigned long pgoff, pgprot_t prot) | ||
74 | { | ||
75 | int err = -ENOMEM; | ||
76 | pte_t *pte, ptfile; | ||
77 | spinlock_t *ptl; | ||
78 | |||
79 | pte = get_locked_pte(mm, addr, &ptl); | ||
80 | if (!pte) | ||
81 | goto out; | ||
82 | |||
83 | ptfile = pgoff_to_pte(pgoff); | ||
84 | |||
85 | if (!pte_none(*pte)) | ||
86 | zap_pte(mm, vma, addr, pte); | ||
87 | |||
88 | set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); | ||
89 | /* | ||
90 | * We don't need to run update_mmu_cache() here because the "file pte" | ||
91 | * being installed by install_file_pte() is not a real pte - it's a | ||
92 | * non-present entry (like a swap entry), noting what file offset should | ||
93 | * be mapped there when there's a fault (in a non-linear vma where | ||
94 | * that's not obvious). | ||
95 | */ | ||
96 | pte_unmap_unlock(pte, ptl); | ||
97 | err = 0; | ||
98 | out: | ||
99 | return err; | ||
100 | } | ||
101 | |||
102 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, | ||
103 | unsigned long size, pgoff_t pgoff) | ||
104 | { | ||
105 | struct mm_struct *mm = vma->vm_mm; | ||
106 | int err; | ||
107 | |||
108 | do { | ||
109 | err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot); | ||
110 | if (err) | ||
111 | return err; | ||
112 | |||
113 | size -= PAGE_SIZE; | ||
114 | addr += PAGE_SIZE; | ||
115 | pgoff++; | ||
116 | } while (size); | ||
117 | |||
118 | return 0; | ||
119 | } | ||
120 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
121 | |||
122 | /** | ||
123 | * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma | ||
124 | * @start: start of the remapped virtual memory range | ||
125 | * @size: size of the remapped virtual memory range | ||
126 | * @prot: new protection bits of the range (see NOTE) | ||
127 | * @pgoff: to-be-mapped page of the backing store file | ||
128 | * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. | ||
129 | * | ||
130 | * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma | ||
131 | * (shared backing store file). | ||
132 | * | ||
133 | * This syscall works purely via pagetables, so it's the most efficient | ||
134 | * way to map the same (large) file into a given virtual window. Unlike | ||
135 | * mmap()/mremap() it does not create any new vmas. The new mappings are | ||
136 | * also safe across swapout. | ||
137 | * | ||
138 | * NOTE: the @prot parameter right now is ignored (but must be zero), | ||
139 | * and the vma's default protection is used. Arbitrary protections | ||
140 | * might be implemented in the future. | ||
141 | */ | ||
142 | SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | ||
143 | unsigned long, prot, unsigned long, pgoff, unsigned long, flags) | ||
144 | { | ||
145 | struct mm_struct *mm = current->mm; | ||
146 | struct address_space *mapping; | ||
147 | struct vm_area_struct *vma; | ||
148 | int err = -EINVAL; | ||
149 | int has_write_lock = 0; | ||
150 | vm_flags_t vm_flags = 0; | ||
151 | |||
152 | pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " | ||
153 | "See Documentation/vm/remap_file_pages.txt.\n", | ||
154 | current->comm, current->pid); | ||
155 | |||
156 | if (prot) | ||
157 | return err; | ||
158 | /* | ||
159 | * Sanitize the syscall parameters: | ||
160 | */ | ||
161 | start = start & PAGE_MASK; | ||
162 | size = size & PAGE_MASK; | ||
163 | |||
164 | /* Does the address range wrap, or is the span zero-sized? */ | ||
165 | if (start + size <= start) | ||
166 | return err; | ||
167 | |||
168 | /* Does pgoff wrap? */ | ||
169 | if (pgoff + (size >> PAGE_SHIFT) < pgoff) | ||
170 | return err; | ||
171 | |||
172 | /* Can we represent this offset inside this architecture's pte's? */ | ||
173 | #if PTE_FILE_MAX_BITS < BITS_PER_LONG | ||
174 | if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) | ||
175 | return err; | ||
176 | #endif | ||
177 | |||
178 | /* We need down_write() to change vma->vm_flags. */ | ||
179 | down_read(&mm->mmap_sem); | ||
180 | retry: | ||
181 | vma = find_vma(mm, start); | ||
182 | |||
183 | /* | ||
184 | * Make sure the vma is shared, that it supports prefaulting, | ||
185 | * and that the remapped range is valid and fully within | ||
186 | * the single existing vma. | ||
187 | */ | ||
188 | if (!vma || !(vma->vm_flags & VM_SHARED)) | ||
189 | goto out; | ||
190 | |||
191 | if (!vma->vm_ops || !vma->vm_ops->remap_pages) | ||
192 | goto out; | ||
193 | |||
194 | if (start < vma->vm_start || start + size > vma->vm_end) | ||
195 | goto out; | ||
196 | |||
197 | /* Must set VM_NONLINEAR before any pages are populated. */ | ||
198 | if (!(vma->vm_flags & VM_NONLINEAR)) { | ||
199 | /* | ||
200 | * vm_private_data is used as a swapout cursor | ||
201 | * in a VM_NONLINEAR vma. | ||
202 | */ | ||
203 | if (vma->vm_private_data) | ||
204 | goto out; | ||
205 | |||
206 | /* Don't need a nonlinear mapping, exit success */ | ||
207 | if (pgoff == linear_page_index(vma, start)) { | ||
208 | err = 0; | ||
209 | goto out; | ||
210 | } | ||
211 | |||
212 | if (!has_write_lock) { | ||
213 | get_write_lock: | ||
214 | up_read(&mm->mmap_sem); | ||
215 | down_write(&mm->mmap_sem); | ||
216 | has_write_lock = 1; | ||
217 | goto retry; | ||
218 | } | ||
219 | mapping = vma->vm_file->f_mapping; | ||
220 | /* | ||
221 | * page_mkclean doesn't work on nonlinear vmas, so if | ||
222 | * dirty pages need to be accounted, emulate with linear | ||
223 | * vmas. | ||
224 | */ | ||
225 | if (mapping_cap_account_dirty(mapping)) { | ||
226 | unsigned long addr; | ||
227 | struct file *file = get_file(vma->vm_file); | ||
228 | /* mmap_region may free vma; grab the info now */ | ||
229 | vm_flags = vma->vm_flags; | ||
230 | |||
231 | addr = mmap_region(file, start, size, vm_flags, pgoff); | ||
232 | fput(file); | ||
233 | if (IS_ERR_VALUE(addr)) { | ||
234 | err = addr; | ||
235 | } else { | ||
236 | BUG_ON(addr != start); | ||
237 | err = 0; | ||
238 | } | ||
239 | goto out_freed; | ||
240 | } | ||
241 | i_mmap_lock_write(mapping); | ||
242 | flush_dcache_mmap_lock(mapping); | ||
243 | vma->vm_flags |= VM_NONLINEAR; | ||
244 | vma_interval_tree_remove(vma, &mapping->i_mmap); | ||
245 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | ||
246 | flush_dcache_mmap_unlock(mapping); | ||
247 | i_mmap_unlock_write(mapping); | ||
248 | } | ||
249 | |||
250 | if (vma->vm_flags & VM_LOCKED) { | ||
251 | /* | ||
252 | * drop PG_Mlocked flag for over-mapped range | ||
253 | */ | ||
254 | if (!has_write_lock) | ||
255 | goto get_write_lock; | ||
256 | vm_flags = vma->vm_flags; | ||
257 | munlock_vma_pages_range(vma, start, start + size); | ||
258 | vma->vm_flags = vm_flags; | ||
259 | } | ||
260 | |||
261 | mmu_notifier_invalidate_range_start(mm, start, start + size); | ||
262 | err = vma->vm_ops->remap_pages(vma, start, size, pgoff); | ||
263 | mmu_notifier_invalidate_range_end(mm, start, start + size); | ||
264 | |||
265 | /* | ||
266 | * We can't clear VM_NONLINEAR because we'd have to do | ||
267 | * it after ->populate completes, and that would prevent | ||
268 | * downgrading the lock. (Locks can't be upgraded). | ||
269 | */ | ||
270 | |||
271 | out: | ||
272 | if (vma) | ||
273 | vm_flags = vma->vm_flags; | ||
274 | out_freed: | ||
275 | if (likely(!has_write_lock)) | ||
276 | up_read(&mm->mmap_sem); | ||
277 | else | ||
278 | up_write(&mm->mmap_sem); | ||
279 | if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK))) | ||
280 | mm_populate(start, size); | ||
281 | |||
282 | return err; | ||
283 | } | ||
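Deleting mm/fremap.c removes the in-kernel machinery for nonlinear (VM_NONLINEAR) mappings; the remap_file_pages(2) syscall itself survives as a small emulation added to mm/mmap.c (visible in the diffstat above) that simply creates an ordinary linear mapping at the requested offset, as the deprecation warning in the deleted code already hinted. Userspace like the illustrative snippet below keeps working, it just no longer gets pagetable-only remapping ("data.bin" and the offsets are invented for the example):

/* Userspace illustration; error handling trimmed. */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("data.bin", O_RDWR);
	size_t page = (size_t)sysconf(_SC_PAGESIZE);
	char *win = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);

	/* Point the first page of the window at file page 3 instead of 0. */
	remap_file_pages(win, page, 0, 3, 0);

	munmap(win, 4 * page);
	close(fd);
	return 0;
}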
diff --git a/mm/gup.c b/mm/gup.c
@@ -55,7 +55,7 @@ retry: | |||
55 | */ | 55 | */ |
56 | if (likely(!(flags & FOLL_MIGRATION))) | 56 | if (likely(!(flags & FOLL_MIGRATION))) |
57 | goto no_page; | 57 | goto no_page; |
58 | if (pte_none(pte) || pte_file(pte)) | 58 | if (pte_none(pte)) |
59 | goto no_page; | 59 | goto no_page; |
60 | entry = pte_to_swp_entry(pte); | 60 | entry = pte_to_swp_entry(pte); |
61 | if (!is_migration_entry(entry)) | 61 | if (!is_migration_entry(entry)) |
@@ -64,7 +64,7 @@ retry: | |||
64 | migration_entry_wait(mm, pmd, address); | 64 | migration_entry_wait(mm, pmd, address); |
65 | goto retry; | 65 | goto retry; |
66 | } | 66 | } |
67 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | 67 | if ((flags & FOLL_NUMA) && pte_protnone(pte)) |
68 | goto no_page; | 68 | goto no_page; |
69 | if ((flags & FOLL_WRITE) && !pte_write(pte)) { | 69 | if ((flags & FOLL_WRITE) && !pte_write(pte)) { |
70 | pte_unmap_unlock(ptep, ptl); | 70 | pte_unmap_unlock(ptep, ptl); |
@@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
167 | if (pud_none(*pud)) | 167 | if (pud_none(*pud)) |
168 | return no_page_table(vma, flags); | 168 | return no_page_table(vma, flags); |
169 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { | 169 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { |
170 | if (flags & FOLL_GET) | 170 | page = follow_huge_pud(mm, address, pud, flags); |
171 | return NULL; | 171 | if (page) |
172 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | 172 | return page; |
173 | return page; | 173 | return no_page_table(vma, flags); |
174 | } | 174 | } |
175 | if (unlikely(pud_bad(*pud))) | 175 | if (unlikely(pud_bad(*pud))) |
176 | return no_page_table(vma, flags); | 176 | return no_page_table(vma, flags); |
@@ -179,21 +179,12 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
179 | if (pmd_none(*pmd)) | 179 | if (pmd_none(*pmd)) |
180 | return no_page_table(vma, flags); | 180 | return no_page_table(vma, flags); |
181 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { | 181 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { |
182 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 182 | page = follow_huge_pmd(mm, address, pmd, flags); |
183 | if (flags & FOLL_GET) { | 183 | if (page) |
184 | /* | 184 | return page; |
185 | * Refcount on tail pages are not well-defined and | 185 | return no_page_table(vma, flags); |
186 | * shouldn't be taken. The caller should handle a NULL | ||
187 | * return when trying to follow tail pages. | ||
188 | */ | ||
189 | if (PageHead(page)) | ||
190 | get_page(page); | ||
191 | else | ||
192 | page = NULL; | ||
193 | } | ||
194 | return page; | ||
195 | } | 186 | } |
196 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | 187 | if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) |
197 | return no_page_table(vma, flags); | 188 | return no_page_table(vma, flags); |
198 | if (pmd_trans_huge(*pmd)) { | 189 | if (pmd_trans_huge(*pmd)) { |
199 | if (flags & FOLL_SPLIT) { | 190 | if (flags & FOLL_SPLIT) { |
@@ -296,7 +287,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, | |||
296 | return -ENOMEM; | 287 | return -ENOMEM; |
297 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | 288 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) |
298 | return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT; | 289 | return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT; |
299 | if (ret & VM_FAULT_SIGBUS) | 290 | if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) |
300 | return -EFAULT; | 291 | return -EFAULT; |
301 | BUG(); | 292 | BUG(); |
302 | } | 293 | } |
@@ -571,7 +562,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | |||
571 | return -ENOMEM; | 562 | return -ENOMEM; |
572 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) | 563 | if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) |
573 | return -EHWPOISON; | 564 | return -EHWPOISON; |
574 | if (ret & VM_FAULT_SIGBUS) | 565 | if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) |
575 | return -EFAULT; | 566 | return -EFAULT; |
576 | BUG(); | 567 | BUG(); |
577 | } | 568 | } |
@@ -584,6 +575,185 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | |||
584 | return 0; | 575 | return 0; |
585 | } | 576 | } |
586 | 577 | ||
578 | static __always_inline long __get_user_pages_locked(struct task_struct *tsk, | ||
579 | struct mm_struct *mm, | ||
580 | unsigned long start, | ||
581 | unsigned long nr_pages, | ||
582 | int write, int force, | ||
583 | struct page **pages, | ||
584 | struct vm_area_struct **vmas, | ||
585 | int *locked, bool notify_drop, | ||
586 | unsigned int flags) | ||
587 | { | ||
588 | long ret, pages_done; | ||
589 | bool lock_dropped; | ||
590 | |||
591 | if (locked) { | ||
592 | /* if VM_FAULT_RETRY can be returned, vmas become invalid */ | ||
593 | BUG_ON(vmas); | ||
594 | /* check caller initialized locked */ | ||
595 | BUG_ON(*locked != 1); | ||
596 | } | ||
597 | |||
598 | if (pages) | ||
599 | flags |= FOLL_GET; | ||
600 | if (write) | ||
601 | flags |= FOLL_WRITE; | ||
602 | if (force) | ||
603 | flags |= FOLL_FORCE; | ||
604 | |||
605 | pages_done = 0; | ||
606 | lock_dropped = false; | ||
607 | for (;;) { | ||
608 | ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages, | ||
609 | vmas, locked); | ||
610 | if (!locked) | ||
611 | /* VM_FAULT_RETRY couldn't trigger, bypass */ | ||
612 | return ret; | ||
613 | |||
614 | /* VM_FAULT_RETRY cannot return errors */ | ||
615 | if (!*locked) { | ||
616 | BUG_ON(ret < 0); | ||
617 | BUG_ON(ret >= nr_pages); | ||
618 | } | ||
619 | |||
620 | if (!pages) | ||
621 | /* If it's a prefault don't insist harder */ | ||
622 | return ret; | ||
623 | |||
624 | if (ret > 0) { | ||
625 | nr_pages -= ret; | ||
626 | pages_done += ret; | ||
627 | if (!nr_pages) | ||
628 | break; | ||
629 | } | ||
630 | if (*locked) { | ||
631 | /* VM_FAULT_RETRY didn't trigger */ | ||
632 | if (!pages_done) | ||
633 | pages_done = ret; | ||
634 | break; | ||
635 | } | ||
636 | /* VM_FAULT_RETRY triggered, so seek to the faulting offset */ | ||
637 | pages += ret; | ||
638 | start += ret << PAGE_SHIFT; | ||
639 | |||
640 | /* | ||
641 | * Repeat on the address that fired VM_FAULT_RETRY | ||
642 | * without FAULT_FLAG_ALLOW_RETRY but with | ||
643 | * FAULT_FLAG_TRIED. | ||
644 | */ | ||
645 | *locked = 1; | ||
646 | lock_dropped = true; | ||
647 | down_read(&mm->mmap_sem); | ||
648 | ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, | ||
649 | pages, NULL, NULL); | ||
650 | if (ret != 1) { | ||
651 | BUG_ON(ret > 1); | ||
652 | if (!pages_done) | ||
653 | pages_done = ret; | ||
654 | break; | ||
655 | } | ||
656 | nr_pages--; | ||
657 | pages_done++; | ||
658 | if (!nr_pages) | ||
659 | break; | ||
660 | pages++; | ||
661 | start += PAGE_SIZE; | ||
662 | } | ||
663 | if (notify_drop && lock_dropped && *locked) { | ||
664 | /* | ||
665 | * We must let the caller know we temporarily dropped the lock | ||
666 | * and so the critical section protected by it was lost. | ||
667 | */ | ||
668 | up_read(&mm->mmap_sem); | ||
669 | *locked = 0; | ||
670 | } | ||
671 | return pages_done; | ||
672 | } | ||
673 | |||
674 | /* | ||
675 | * We can leverage the VM_FAULT_RETRY functionality in the page fault | ||
676 | * paths better by using either get_user_pages_locked() or | ||
677 | * get_user_pages_unlocked(). | ||
678 | * | ||
679 | * get_user_pages_locked() is suitable to replace the form: | ||
680 | * | ||
681 | * down_read(&mm->mmap_sem); | ||
682 | * do_something() | ||
683 | * get_user_pages(tsk, mm, ..., pages, NULL); | ||
684 | * up_read(&mm->mmap_sem); | ||
685 | * | ||
686 | * to: | ||
687 | * | ||
688 | * int locked = 1; | ||
689 | * down_read(&mm->mmap_sem); | ||
690 | * do_something() | ||
691 | * get_user_pages_locked(tsk, mm, ..., pages, &locked); | ||
692 | * if (locked) | ||
693 | * up_read(&mm->mmap_sem); | ||
694 | */ | ||
695 | long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, | ||
696 | unsigned long start, unsigned long nr_pages, | ||
697 | int write, int force, struct page **pages, | ||
698 | int *locked) | ||
699 | { | ||
700 | return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, | ||
701 | pages, NULL, locked, true, FOLL_TOUCH); | ||
702 | } | ||
703 | EXPORT_SYMBOL(get_user_pages_locked); | ||
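
A minimal caller sketch of the conversion described in the comment above (not part of the patch; pin_pages_with_locked() and the write/force values are illustrative):

static long pin_pages_with_locked(unsigned long start, unsigned long nr_pages,
				  struct page **pages)
{
	struct mm_struct *mm = current->mm;
	int locked = 1;
	long ret;

	down_read(&mm->mmap_sem);
	/* ... work that genuinely needs mmap_sem held for read ... */
	ret = get_user_pages_locked(current, mm, start, nr_pages,
				    0 /* write */, 0 /* force */,
				    pages, &locked);
	/* mmap_sem may have been dropped across VM_FAULT_RETRY */
	if (locked)
		up_read(&mm->mmap_sem);
	return ret;
}
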
704 | |||
705 | /* | ||
706 | * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows the | ||
707 | * caller to pass additional gup_flags (like FOLL_HWPOISON). | ||
708 | * | ||
709 | * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the | ||
710 | * caller if required (just like with __get_user_pages). "FOLL_GET", | ||
711 | * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed | ||
712 | * according to the parameters "pages", "write", "force" | ||
713 | * respectively. | ||
714 | */ | ||
715 | __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
716 | unsigned long start, unsigned long nr_pages, | ||
717 | int write, int force, struct page **pages, | ||
718 | unsigned int gup_flags) | ||
719 | { | ||
720 | long ret; | ||
721 | int locked = 1; | ||
722 | down_read(&mm->mmap_sem); | ||
723 | ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, | ||
724 | pages, NULL, &locked, false, gup_flags); | ||
725 | if (locked) | ||
726 | up_read(&mm->mmap_sem); | ||
727 | return ret; | ||
728 | } | ||
729 | EXPORT_SYMBOL(__get_user_pages_unlocked); | ||
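
A sketch of the gup_flags variant documented above (illustrative call only): a caller that wants poisoned pages reported as -EHWPOISON rather than -EFAULT passes FOLL_HWPOISON itself, and must also pass FOLL_TOUCH explicitly because this entry point does not add it:

	ret = __get_user_pages_unlocked(tsk, mm, addr, 1,
					1 /* write */, 0 /* force */, &page,
					FOLL_TOUCH | FOLL_HWPOISON);
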
730 | |||
731 | /* | ||
732 | * get_user_pages_unlocked() is suitable to replace the form: | ||
733 | * | ||
734 | * down_read(&mm->mmap_sem); | ||
735 | * get_user_pages(tsk, mm, ..., pages, NULL); | ||
736 | * up_read(&mm->mmap_sem); | ||
737 | * | ||
738 | * with: | ||
739 | * | ||
740 | * get_user_pages_unlocked(tsk, mm, ..., pages); | ||
741 | * | ||
742 | * It is functionally equivalent to get_user_pages_fast, so | ||
743 | * get_user_pages_fast should be used instead when the two parameters | ||
744 | * "tsk" and "mm" are respectively equal to current and current->mm, | ||
745 | * unless "force" must be set to 1 (get_user_pages_fast has no | ||
746 | * "force" parameter). | ||
747 | */ | ||
748 | long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
749 | unsigned long start, unsigned long nr_pages, | ||
750 | int write, int force, struct page **pages) | ||
751 | { | ||
752 | return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, | ||
753 | force, pages, FOLL_TOUCH); | ||
754 | } | ||
755 | EXPORT_SYMBOL(get_user_pages_unlocked); | ||
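
The unlocked form collapses the three-line pattern quoted in the comment above into a single call; a sketch, with tsk, mm and pages standing in for whatever the caller already has:

	/* before */
	down_read(&mm->mmap_sem);
	ret = get_user_pages(tsk, mm, start, nr_pages, write, force, pages, NULL);
	up_read(&mm->mmap_sem);

	/* after: mmap_sem is taken and released internally, so the fault
	 * path is free to use FAULT_FLAG_ALLOW_RETRY */
	ret = get_user_pages_unlocked(tsk, mm, start, nr_pages, write, force, pages);
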
756 | |||
587 | /* | 757 | /* |
588 | * get_user_pages() - pin user pages in memory | 758 | * get_user_pages() - pin user pages in memory |
589 | * @tsk: the task_struct to use for page fault accounting, or | 759 | * @tsk: the task_struct to use for page fault accounting, or |
@@ -633,22 +803,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | |||
633 | * use the correct cache flushing APIs. | 803 | * use the correct cache flushing APIs. |
634 | * | 804 | * |
635 | * See also get_user_pages_fast, for performance critical applications. | 805 | * See also get_user_pages_fast, for performance critical applications. |
806 | * | ||
807 | * get_user_pages should be phased out in favor of | ||
808 | * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing | ||
809 | * should use get_user_pages because it cannot pass | ||
810 | * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. | ||
636 | */ | 811 | */ |
637 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 812 | long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
638 | unsigned long start, unsigned long nr_pages, int write, | 813 | unsigned long start, unsigned long nr_pages, int write, |
639 | int force, struct page **pages, struct vm_area_struct **vmas) | 814 | int force, struct page **pages, struct vm_area_struct **vmas) |
640 | { | 815 | { |
641 | int flags = FOLL_TOUCH; | 816 | return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, |
642 | 817 | pages, vmas, NULL, false, FOLL_TOUCH); | |
643 | if (pages) | ||
644 | flags |= FOLL_GET; | ||
645 | if (write) | ||
646 | flags |= FOLL_WRITE; | ||
647 | if (force) | ||
648 | flags |= FOLL_FORCE; | ||
649 | |||
650 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, | ||
651 | NULL); | ||
652 | } | 818 | } |
653 | EXPORT_SYMBOL(get_user_pages); | 819 | EXPORT_SYMBOL(get_user_pages); |
654 | 820 | ||
@@ -740,10 +906,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | |||
740 | 906 | ||
741 | /* | 907 | /* |
742 | * Similar to the PMD case below, NUMA hinting must take slow | 908 | * Similar to the PMD case below, NUMA hinting must take slow |
743 | * path | 909 | * path using the pte_protnone check. |
744 | */ | 910 | */ |
745 | if (!pte_present(pte) || pte_special(pte) || | 911 | if (!pte_present(pte) || pte_special(pte) || |
746 | pte_numa(pte) || (write && !pte_write(pte))) | 912 | pte_protnone(pte) || (write && !pte_write(pte))) |
747 | goto pte_unmap; | 913 | goto pte_unmap; |
748 | 914 | ||
749 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 915 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
@@ -938,7 +1104,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | |||
938 | * slowpath for accounting purposes and so that they | 1104 | * slowpath for accounting purposes and so that they |
939 | * can be serialised against THP migration. | 1105 | * can be serialised against THP migration. |
940 | */ | 1106 | */ |
941 | if (pmd_numa(pmd)) | 1107 | if (pmd_protnone(pmd)) |
942 | return 0; | 1108 | return 0; |
943 | 1109 | ||
944 | if (!gup_huge_pmd(pmd, pmdp, addr, next, write, | 1110 | if (!gup_huge_pmd(pmd, pmdp, addr, next, write, |
@@ -1077,10 +1243,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
1077 | start += nr << PAGE_SHIFT; | 1243 | start += nr << PAGE_SHIFT; |
1078 | pages += nr; | 1244 | pages += nr; |
1079 | 1245 | ||
1080 | down_read(&mm->mmap_sem); | 1246 | ret = get_user_pages_unlocked(current, mm, start, |
1081 | ret = get_user_pages(current, mm, start, | 1247 | nr_pages - nr, write, 0, pages); |
1082 | nr_pages - nr, write, 0, pages, NULL); | ||
1083 | up_read(&mm->mmap_sem); | ||
1084 | 1248 | ||
1085 | /* Have to be a bit careful with return values */ | 1249 | /* Have to be a bit careful with return values */ |
1086 | if (nr > 0) { | 1250 | if (nr > 0) { |
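
With the hunk above, the get_user_pages_fast() slow path also goes through the retry-capable helper. The return-value convention is unchanged (my summary, not text from the patch): if the fast path already pinned some pages and the slow path then fails, the pages pinned so far are still reported. For example, with nr_pages == 4, nr == 3 pinned by the fast path and the slow path returning -EFAULT for the last page, get_user_pages_fast() returns 3.
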
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 817a875f2b8c..fc00c8cb5a82 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -171,12 +171,7 @@ static int start_khugepaged(void) | |||
171 | } | 171 | } |
172 | 172 | ||
173 | static atomic_t huge_zero_refcount; | 173 | static atomic_t huge_zero_refcount; |
174 | static struct page *huge_zero_page __read_mostly; | 174 | struct page *huge_zero_page __read_mostly; |
175 | |||
176 | static inline bool is_huge_zero_page(struct page *page) | ||
177 | { | ||
178 | return ACCESS_ONCE(huge_zero_page) == page; | ||
179 | } | ||
180 | 175 | ||
181 | static inline bool is_huge_zero_pmd(pmd_t pmd) | 176 | static inline bool is_huge_zero_pmd(pmd_t pmd) |
182 | { | 177 | { |
@@ -766,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) | |||
766 | return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; | 761 | return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp; |
767 | } | 762 | } |
768 | 763 | ||
769 | static inline struct page *alloc_hugepage_vma(int defrag, | ||
770 | struct vm_area_struct *vma, | ||
771 | unsigned long haddr, int nd, | ||
772 | gfp_t extra_gfp) | ||
773 | { | ||
774 | return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp), | ||
775 | HPAGE_PMD_ORDER, vma, haddr, nd); | ||
776 | } | ||
777 | |||
778 | /* Caller must hold page table lock. */ | 764 | /* Caller must hold page table lock. */ |
779 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | 765 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, |
780 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | 766 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, |
@@ -795,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
795 | unsigned long address, pmd_t *pmd, | 781 | unsigned long address, pmd_t *pmd, |
796 | unsigned int flags) | 782 | unsigned int flags) |
797 | { | 783 | { |
784 | gfp_t gfp; | ||
798 | struct page *page; | 785 | struct page *page; |
799 | unsigned long haddr = address & HPAGE_PMD_MASK; | 786 | unsigned long haddr = address & HPAGE_PMD_MASK; |
800 | 787 | ||
@@ -829,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
829 | } | 816 | } |
830 | return 0; | 817 | return 0; |
831 | } | 818 | } |
832 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 819 | gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); |
833 | vma, haddr, numa_node_id(), 0); | 820 | page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); |
834 | if (unlikely(!page)) { | 821 | if (unlikely(!page)) { |
835 | count_vm_event(THP_FAULT_FALLBACK); | 822 | count_vm_event(THP_FAULT_FALLBACK); |
836 | return VM_FAULT_FALLBACK; | 823 | return VM_FAULT_FALLBACK; |
@@ -1118,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1118 | spin_unlock(ptl); | 1105 | spin_unlock(ptl); |
1119 | alloc: | 1106 | alloc: |
1120 | if (transparent_hugepage_enabled(vma) && | 1107 | if (transparent_hugepage_enabled(vma) && |
1121 | !transparent_hugepage_debug_cow()) | 1108 | !transparent_hugepage_debug_cow()) { |
1122 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 1109 | gfp_t gfp; |
1123 | vma, haddr, numa_node_id(), 0); | 1110 | |
1124 | else | 1111 | gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); |
1112 | new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); | ||
1113 | } else | ||
1125 | new_page = NULL; | 1114 | new_page = NULL; |
1126 | 1115 | ||
1127 | if (unlikely(!new_page)) { | 1116 | if (unlikely(!new_page)) { |
@@ -1222,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
1222 | return ERR_PTR(-EFAULT); | 1211 | return ERR_PTR(-EFAULT); |
1223 | 1212 | ||
1224 | /* Full NUMA hinting faults to serialise migration in fault paths */ | 1213 | /* Full NUMA hinting faults to serialise migration in fault paths */ |
1225 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | 1214 | if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) |
1226 | goto out; | 1215 | goto out; |
1227 | 1216 | ||
1228 | page = pmd_page(*pmd); | 1217 | page = pmd_page(*pmd); |
@@ -1273,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1273 | bool migrated = false; | 1262 | bool migrated = false; |
1274 | int flags = 0; | 1263 | int flags = 0; |
1275 | 1264 | ||
1265 | /* A PROT_NONE fault should not end up here */ | ||
1266 | BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); | ||
1267 | |||
1276 | ptl = pmd_lock(mm, pmdp); | 1268 | ptl = pmd_lock(mm, pmdp); |
1277 | if (unlikely(!pmd_same(pmd, *pmdp))) | 1269 | if (unlikely(!pmd_same(pmd, *pmdp))) |
1278 | goto out_unlock; | 1270 | goto out_unlock; |
@@ -1283,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1283 | * check_same as the page may no longer be mapped. | 1275 | * check_same as the page may no longer be mapped. |
1284 | */ | 1276 | */ |
1285 | if (unlikely(pmd_trans_migrating(*pmdp))) { | 1277 | if (unlikely(pmd_trans_migrating(*pmdp))) { |
1278 | page = pmd_page(*pmdp); | ||
1286 | spin_unlock(ptl); | 1279 | spin_unlock(ptl); |
1287 | wait_migrate_huge_page(vma->anon_vma, pmdp); | 1280 | wait_on_page_locked(page); |
1288 | goto out; | 1281 | goto out; |
1289 | } | 1282 | } |
1290 | 1283 | ||
@@ -1352,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1352 | 1345 | ||
1353 | /* | 1346 | /* |
1354 | * Migrate the THP to the requested node, returns with page unlocked | 1347 | * Migrate the THP to the requested node, returns with page unlocked |
1355 | * and pmd_numa cleared. | 1348 | * and access rights restored. |
1356 | */ | 1349 | */ |
1357 | spin_unlock(ptl); | 1350 | spin_unlock(ptl); |
1358 | migrated = migrate_misplaced_transhuge_page(mm, vma, | 1351 | migrated = migrate_misplaced_transhuge_page(mm, vma, |
@@ -1365,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1365 | goto out; | 1358 | goto out; |
1366 | clear_pmdnuma: | 1359 | clear_pmdnuma: |
1367 | BUG_ON(!PageLocked(page)); | 1360 | BUG_ON(!PageLocked(page)); |
1368 | pmd = pmd_mknonnuma(pmd); | 1361 | pmd = pmd_modify(pmd, vma->vm_page_prot); |
1369 | set_pmd_at(mm, haddr, pmdp, pmd); | 1362 | set_pmd_at(mm, haddr, pmdp, pmd); |
1370 | VM_BUG_ON(pmd_numa(*pmdp)); | ||
1371 | update_mmu_cache_pmd(vma, addr, pmdp); | 1363 | update_mmu_cache_pmd(vma, addr, pmdp); |
1372 | unlock_page(page); | 1364 | unlock_page(page); |
1373 | out_unlock: | 1365 | out_unlock: |
@@ -1423,26 +1415,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1423 | return ret; | 1415 | return ret; |
1424 | } | 1416 | } |
1425 | 1417 | ||
1426 | int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | ||
1427 | unsigned long addr, unsigned long end, | ||
1428 | unsigned char *vec) | ||
1429 | { | ||
1430 | spinlock_t *ptl; | ||
1431 | int ret = 0; | ||
1432 | |||
1433 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | ||
1434 | /* | ||
1435 | * All logical pages in the range are present | ||
1436 | * if backed by a huge page. | ||
1437 | */ | ||
1438 | spin_unlock(ptl); | ||
1439 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | ||
1440 | ret = 1; | ||
1441 | } | ||
1442 | |||
1443 | return ret; | ||
1444 | } | ||
1445 | |||
1446 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | 1418 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, |
1447 | unsigned long old_addr, | 1419 | unsigned long old_addr, |
1448 | unsigned long new_addr, unsigned long old_end, | 1420 | unsigned long new_addr, unsigned long old_end, |
@@ -1510,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1510 | 1482 | ||
1511 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 1483 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
1512 | pmd_t entry; | 1484 | pmd_t entry; |
1513 | ret = 1; | 1485 | |
1514 | if (!prot_numa) { | 1486 | /* |
1487 | * Avoid trapping faults against the zero page. The read-only | ||
1488 | * data is likely to be read-cached on the local CPU and | ||
1489 | * local/remote hits to the zero page are not interesting. | ||
1490 | */ | ||
1491 | if (prot_numa && is_huge_zero_pmd(*pmd)) { | ||
1492 | spin_unlock(ptl); | ||
1493 | return 0; | ||
1494 | } | ||
1495 | |||
1496 | if (!prot_numa || !pmd_protnone(*pmd)) { | ||
1497 | ret = 1; | ||
1515 | entry = pmdp_get_and_clear_notify(mm, addr, pmd); | 1498 | entry = pmdp_get_and_clear_notify(mm, addr, pmd); |
1516 | if (pmd_numa(entry)) | ||
1517 | entry = pmd_mknonnuma(entry); | ||
1518 | entry = pmd_modify(entry, newprot); | 1499 | entry = pmd_modify(entry, newprot); |
1519 | ret = HPAGE_PMD_NR; | 1500 | ret = HPAGE_PMD_NR; |
1520 | set_pmd_at(mm, addr, pmd, entry); | 1501 | set_pmd_at(mm, addr, pmd, entry); |
1521 | BUG_ON(pmd_write(entry)); | 1502 | BUG_ON(pmd_write(entry)); |
1522 | } else { | ||
1523 | struct page *page = pmd_page(*pmd); | ||
1524 | |||
1525 | /* | ||
1526 | * Do not trap faults against the zero page. The | ||
1527 | * read-only data is likely to be read-cached on the | ||
1528 | * local CPU cache and it is less useful to know about | ||
1529 | * local vs remote hits on the zero page. | ||
1530 | */ | ||
1531 | if (!is_huge_zero_page(page) && | ||
1532 | !pmd_numa(*pmd)) { | ||
1533 | pmdp_set_numa(mm, addr, pmd); | ||
1534 | ret = HPAGE_PMD_NR; | ||
1535 | } | ||
1536 | } | 1503 | } |
1537 | spin_unlock(ptl); | 1504 | spin_unlock(ptl); |
1538 | } | 1505 | } |
@@ -1797,9 +1764,9 @@ static int __split_huge_page_map(struct page *page, | |||
1797 | pte_t *pte, entry; | 1764 | pte_t *pte, entry; |
1798 | BUG_ON(PageCompound(page+i)); | 1765 | BUG_ON(PageCompound(page+i)); |
1799 | /* | 1766 | /* |
1800 | * Note that pmd_numa is not transferred deliberately | 1767 | * Note that NUMA hinting access restrictions are not |
1801 | * to avoid any possibility that pte_numa leaks to | 1768 | * transferred to avoid any possibility of altering |
1802 | * a PROT_NONE VMA by accident. | 1769 | * permissions across VMAs. |
1803 | */ | 1770 | */ |
1804 | entry = mk_pte(page + i, vma->vm_page_prot); | 1771 | entry = mk_pte(page + i, vma->vm_page_prot); |
1805 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1772 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
@@ -2148,7 +2115,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
2148 | { | 2115 | { |
2149 | struct page *page; | 2116 | struct page *page; |
2150 | pte_t *_pte; | 2117 | pte_t *_pte; |
2151 | int referenced = 0, none = 0; | 2118 | int none = 0; |
2119 | bool referenced = false, writable = false; | ||
2152 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | 2120 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; |
2153 | _pte++, address += PAGE_SIZE) { | 2121 | _pte++, address += PAGE_SIZE) { |
2154 | pte_t pteval = *_pte; | 2122 | pte_t pteval = *_pte; |
@@ -2158,7 +2126,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
2158 | else | 2126 | else |
2159 | goto out; | 2127 | goto out; |
2160 | } | 2128 | } |
2161 | if (!pte_present(pteval) || !pte_write(pteval)) | 2129 | if (!pte_present(pteval)) |
2162 | goto out; | 2130 | goto out; |
2163 | page = vm_normal_page(vma, address, pteval); | 2131 | page = vm_normal_page(vma, address, pteval); |
2164 | if (unlikely(!page)) | 2132 | if (unlikely(!page)) |
@@ -2168,9 +2136,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
2168 | VM_BUG_ON_PAGE(!PageAnon(page), page); | 2136 | VM_BUG_ON_PAGE(!PageAnon(page), page); |
2169 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | 2137 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); |
2170 | 2138 | ||
2171 | /* cannot use mapcount: can't collapse if there's a gup pin */ | ||
2172 | if (page_count(page) != 1) | ||
2173 | goto out; | ||
2174 | /* | 2139 | /* |
2175 | * We can do it before isolate_lru_page because the | 2140 | * We can do it before isolate_lru_page because the |
2176 | * page can't be freed from under us. NOTE: PG_lock | 2141 | * page can't be freed from under us. NOTE: PG_lock |
@@ -2179,6 +2144,29 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
2179 | */ | 2144 | */ |
2180 | if (!trylock_page(page)) | 2145 | if (!trylock_page(page)) |
2181 | goto out; | 2146 | goto out; |
2147 | |||
2148 | /* | ||
2149 | * cannot use mapcount: can't collapse if there's a gup pin. | ||
2150 | * The page must only be referenced by the scanned process | ||
2151 | * and page swap cache. | ||
2152 | */ | ||
2153 | if (page_count(page) != 1 + !!PageSwapCache(page)) { | ||
2154 | unlock_page(page); | ||
2155 | goto out; | ||
2156 | } | ||
2157 | if (pte_write(pteval)) { | ||
2158 | writable = true; | ||
2159 | } else { | ||
2160 | if (PageSwapCache(page) && !reuse_swap_page(page)) { | ||
2161 | unlock_page(page); | ||
2162 | goto out; | ||
2163 | } | ||
2164 | /* | ||
2165 | * Page is not in the swap cache. It can be collapsed | ||
2166 | * into a THP. | ||
2167 | */ | ||
2168 | } | ||
2169 | |||
2182 | /* | 2170 | /* |
2183 | * Isolate the page to avoid collapsing an hugepage | 2171 | * Isolate the page to avoid collapsing an hugepage |
2184 | * currently in use by the VM. | 2172 | * currently in use by the VM. |
@@ -2195,9 +2183,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
2195 | /* If there is no mapped pte young don't collapse the page */ | 2183 | /* If there is no mapped pte young don't collapse the page */ |
2196 | if (pte_young(pteval) || PageReferenced(page) || | 2184 | if (pte_young(pteval) || PageReferenced(page) || |
2197 | mmu_notifier_test_young(vma->vm_mm, address)) | 2185 | mmu_notifier_test_young(vma->vm_mm, address)) |
2198 | referenced = 1; | 2186 | referenced = true; |
2199 | } | 2187 | } |
2200 | if (likely(referenced)) | 2188 | if (likely(referenced && writable)) |
2201 | return 1; | 2189 | return 1; |
2202 | out: | 2190 | out: |
2203 | release_pte_pages(pte, _pte); | 2191 | release_pte_pages(pte, _pte); |
@@ -2550,11 +2538,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2550 | { | 2538 | { |
2551 | pmd_t *pmd; | 2539 | pmd_t *pmd; |
2552 | pte_t *pte, *_pte; | 2540 | pte_t *pte, *_pte; |
2553 | int ret = 0, referenced = 0, none = 0; | 2541 | int ret = 0, none = 0; |
2554 | struct page *page; | 2542 | struct page *page; |
2555 | unsigned long _address; | 2543 | unsigned long _address; |
2556 | spinlock_t *ptl; | 2544 | spinlock_t *ptl; |
2557 | int node = NUMA_NO_NODE; | 2545 | int node = NUMA_NO_NODE; |
2546 | bool writable = false, referenced = false; | ||
2558 | 2547 | ||
2559 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2548 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
2560 | 2549 | ||
@@ -2573,8 +2562,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2573 | else | 2562 | else |
2574 | goto out_unmap; | 2563 | goto out_unmap; |
2575 | } | 2564 | } |
2576 | if (!pte_present(pteval) || !pte_write(pteval)) | 2565 | if (!pte_present(pteval)) |
2577 | goto out_unmap; | 2566 | goto out_unmap; |
2567 | if (pte_write(pteval)) | ||
2568 | writable = true; | ||
2569 | |||
2578 | page = vm_normal_page(vma, _address, pteval); | 2570 | page = vm_normal_page(vma, _address, pteval); |
2579 | if (unlikely(!page)) | 2571 | if (unlikely(!page)) |
2580 | goto out_unmap; | 2572 | goto out_unmap; |
@@ -2591,14 +2583,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2591 | VM_BUG_ON_PAGE(PageCompound(page), page); | 2583 | VM_BUG_ON_PAGE(PageCompound(page), page); |
2592 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | 2584 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) |
2593 | goto out_unmap; | 2585 | goto out_unmap; |
2594 | /* cannot use mapcount: can't collapse if there's a gup pin */ | 2586 | /* |
2595 | if (page_count(page) != 1) | 2587 | * cannot use mapcount: can't collapse if there's a gup pin. |
2588 | * The page must only be referenced by the scanned process | ||
2589 | * and page swap cache. | ||
2590 | */ | ||
2591 | if (page_count(page) != 1 + !!PageSwapCache(page)) | ||
2596 | goto out_unmap; | 2592 | goto out_unmap; |
2597 | if (pte_young(pteval) || PageReferenced(page) || | 2593 | if (pte_young(pteval) || PageReferenced(page) || |
2598 | mmu_notifier_test_young(vma->vm_mm, address)) | 2594 | mmu_notifier_test_young(vma->vm_mm, address)) |
2599 | referenced = 1; | 2595 | referenced = true; |
2600 | } | 2596 | } |
2601 | if (referenced) | 2597 | if (referenced && writable) |
2602 | ret = 1; | 2598 | ret = 1; |
2603 | out_unmap: | 2599 | out_unmap: |
2604 | pte_unmap_unlock(pte, ptl); | 2600 | pte_unmap_unlock(pte, ptl); |
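
A worked example of the refcount test added in both khugepaged hunks above (my reading of the check, not text from the patch):

	/*
	 * For an anonymous page mapped by exactly one pte:
	 *
	 *   not in swap cache:  page_count == 1 == 1 + !!PageSwapCache  -> may collapse
	 *   in swap cache:      page_count == 2 == 1 + !!PageSwapCache  -> may collapse
	 *   extra gup pin:      page_count one higher than expected     -> skip collapse
	 *
	 * Combined with the new "writable" flag, a range is collapsed only
	 * if no unexpected reference exists and at least one pte in the
	 * range is writable.
	 */
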
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 85032de5e20f..0a9ac6c26832 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -35,7 +35,7 @@ | |||
35 | #include <linux/node.h> | 35 | #include <linux/node.h> |
36 | #include "internal.h" | 36 | #include "internal.h" |
37 | 37 | ||
38 | unsigned long hugepages_treat_as_movable; | 38 | int hugepages_treat_as_movable; |
39 | 39 | ||
40 | int hugetlb_max_hstate __read_mostly; | 40 | int hugetlb_max_hstate __read_mostly; |
41 | unsigned int default_hstate_idx; | 41 | unsigned int default_hstate_idx; |
@@ -2657,9 +2657,10 @@ again: | |||
2657 | goto unlock; | 2657 | goto unlock; |
2658 | 2658 | ||
2659 | /* | 2659 | /* |
2660 | * HWPoisoned hugepage is already unmapped and dropped reference | 2660 | * Migrating hugepage or HWPoisoned hugepage is already |
2661 | * unmapped and its refcount is dropped, so just clear pte here. | ||
2661 | */ | 2662 | */ |
2662 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { | 2663 | if (unlikely(!pte_present(pte))) { |
2663 | huge_pte_clear(mm, address, ptep); | 2664 | huge_pte_clear(mm, address, ptep); |
2664 | goto unlock; | 2665 | goto unlock; |
2665 | } | 2666 | } |
@@ -3134,6 +3135,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3134 | struct page *pagecache_page = NULL; | 3135 | struct page *pagecache_page = NULL; |
3135 | struct hstate *h = hstate_vma(vma); | 3136 | struct hstate *h = hstate_vma(vma); |
3136 | struct address_space *mapping; | 3137 | struct address_space *mapping; |
3138 | int need_wait_lock = 0; | ||
3137 | 3139 | ||
3138 | address &= huge_page_mask(h); | 3140 | address &= huge_page_mask(h); |
3139 | 3141 | ||
@@ -3172,6 +3174,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3172 | ret = 0; | 3174 | ret = 0; |
3173 | 3175 | ||
3174 | /* | 3176 | /* |
3177 | * entry could be a migration/hwpoison entry at this point, so this | ||
3178 | * check prevents the kernel from going below assuming that we have | ||
3179 | * an active hugepage in pagecache. This goto expects the 2nd page fault, | ||
3180 | * and is_hugetlb_entry_(migration|hwpoisoned) check will properly | ||
3181 | * handle it. | ||
3182 | */ | ||
3183 | if (!pte_present(entry)) | ||
3184 | goto out_mutex; | ||
3185 | |||
3186 | /* | ||
3175 | * If we are going to COW the mapping later, we examine the pending | 3187 | * If we are going to COW the mapping later, we examine the pending |
3176 | * reservations for this page now. This will ensure that any | 3188 | * reservations for this page now. This will ensure that any |
3177 | * allocations necessary to record that reservation occur outside the | 3189 | * allocations necessary to record that reservation occur outside the |
@@ -3190,30 +3202,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3190 | vma, address); | 3202 | vma, address); |
3191 | } | 3203 | } |
3192 | 3204 | ||
3205 | ptl = huge_pte_lock(h, mm, ptep); | ||
3206 | |||
3207 | /* Check for a racing update before calling hugetlb_cow */ | ||
3208 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) | ||
3209 | goto out_ptl; | ||
3210 | |||
3193 | /* | 3211 | /* |
3194 | * hugetlb_cow() requires page locks of pte_page(entry) and | 3212 | * hugetlb_cow() requires page locks of pte_page(entry) and |
3195 | * pagecache_page, so here we need take the former one | 3213 | * pagecache_page, so here we need take the former one |
3196 | * when page != pagecache_page or !pagecache_page. | 3214 | * when page != pagecache_page or !pagecache_page. |
3197 | * Note that locking order is always pagecache_page -> page, | ||
3198 | * so no worry about deadlock. | ||
3199 | */ | 3215 | */ |
3200 | page = pte_page(entry); | 3216 | page = pte_page(entry); |
3201 | get_page(page); | ||
3202 | if (page != pagecache_page) | 3217 | if (page != pagecache_page) |
3203 | lock_page(page); | 3218 | if (!trylock_page(page)) { |
3204 | 3219 | need_wait_lock = 1; | |
3205 | ptl = huge_pte_lockptr(h, mm, ptep); | 3220 | goto out_ptl; |
3206 | spin_lock(ptl); | 3221 | } |
3207 | /* Check for a racing update before calling hugetlb_cow */ | ||
3208 | if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) | ||
3209 | goto out_ptl; | ||
3210 | 3222 | ||
3223 | get_page(page); | ||
3211 | 3224 | ||
3212 | if (flags & FAULT_FLAG_WRITE) { | 3225 | if (flags & FAULT_FLAG_WRITE) { |
3213 | if (!huge_pte_write(entry)) { | 3226 | if (!huge_pte_write(entry)) { |
3214 | ret = hugetlb_cow(mm, vma, address, ptep, entry, | 3227 | ret = hugetlb_cow(mm, vma, address, ptep, entry, |
3215 | pagecache_page, ptl); | 3228 | pagecache_page, ptl); |
3216 | goto out_ptl; | 3229 | goto out_put_page; |
3217 | } | 3230 | } |
3218 | entry = huge_pte_mkdirty(entry); | 3231 | entry = huge_pte_mkdirty(entry); |
3219 | } | 3232 | } |
@@ -3221,7 +3234,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3221 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, | 3234 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, |
3222 | flags & FAULT_FLAG_WRITE)) | 3235 | flags & FAULT_FLAG_WRITE)) |
3223 | update_mmu_cache(vma, address, ptep); | 3236 | update_mmu_cache(vma, address, ptep); |
3224 | 3237 | out_put_page: | |
3238 | if (page != pagecache_page) | ||
3239 | unlock_page(page); | ||
3240 | put_page(page); | ||
3225 | out_ptl: | 3241 | out_ptl: |
3226 | spin_unlock(ptl); | 3242 | spin_unlock(ptl); |
3227 | 3243 | ||
@@ -3229,12 +3245,17 @@ out_ptl: | |||
3229 | unlock_page(pagecache_page); | 3245 | unlock_page(pagecache_page); |
3230 | put_page(pagecache_page); | 3246 | put_page(pagecache_page); |
3231 | } | 3247 | } |
3232 | if (page != pagecache_page) | ||
3233 | unlock_page(page); | ||
3234 | put_page(page); | ||
3235 | |||
3236 | out_mutex: | 3248 | out_mutex: |
3237 | mutex_unlock(&htlb_fault_mutex_table[hash]); | 3249 | mutex_unlock(&htlb_fault_mutex_table[hash]); |
3250 | /* | ||
3251 | * Generally it is safe to hold a refcount while waiting for a page | ||
3252 | * lock. Here we wait only to defer the next page fault and avoid a busy | ||
3253 | * loop; the page is not touched after it is unlocked and before the | ||
3254 | * current page fault returns, so we cannot end up accessing a freed | ||
3255 | * page even though we wait without taking a refcount. | ||
3256 | */ | ||
3257 | if (need_wait_lock) | ||
3258 | wait_on_page_locked(page); | ||
3238 | return ret; | 3259 | return ret; |
3239 | } | 3260 | } |
3240 | 3261 | ||
@@ -3364,7 +3385,26 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | |||
3364 | spin_unlock(ptl); | 3385 | spin_unlock(ptl); |
3365 | continue; | 3386 | continue; |
3366 | } | 3387 | } |
3367 | if (!huge_pte_none(huge_ptep_get(ptep))) { | 3388 | pte = huge_ptep_get(ptep); |
3389 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { | ||
3390 | spin_unlock(ptl); | ||
3391 | continue; | ||
3392 | } | ||
3393 | if (unlikely(is_hugetlb_entry_migration(pte))) { | ||
3394 | swp_entry_t entry = pte_to_swp_entry(pte); | ||
3395 | |||
3396 | if (is_write_migration_entry(entry)) { | ||
3397 | pte_t newpte; | ||
3398 | |||
3399 | make_migration_entry_read(&entry); | ||
3400 | newpte = swp_entry_to_pte(entry); | ||
3401 | set_huge_pte_at(mm, address, ptep, newpte); | ||
3402 | pages++; | ||
3403 | } | ||
3404 | spin_unlock(ptl); | ||
3405 | continue; | ||
3406 | } | ||
3407 | if (!huge_pte_none(pte)) { | ||
3368 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 3408 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
3369 | pte = pte_mkhuge(huge_pte_modify(pte, newprot)); | 3409 | pte = pte_mkhuge(huge_pte_modify(pte, newprot)); |
3370 | pte = arch_make_huge_pte(pte, vma, NULL, 0); | 3410 | pte = arch_make_huge_pte(pte, vma, NULL, 0); |
@@ -3558,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
3558 | if (saddr) { | 3598 | if (saddr) { |
3559 | spte = huge_pte_offset(svma->vm_mm, saddr); | 3599 | spte = huge_pte_offset(svma->vm_mm, saddr); |
3560 | if (spte) { | 3600 | if (spte) { |
3601 | mm_inc_nr_pmds(mm); | ||
3561 | get_page(virt_to_page(spte)); | 3602 | get_page(virt_to_page(spte)); |
3562 | break; | 3603 | break; |
3563 | } | 3604 | } |
@@ -3569,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) | |||
3569 | 3610 | ||
3570 | ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); | 3611 | ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); |
3571 | spin_lock(ptl); | 3612 | spin_lock(ptl); |
3572 | if (pud_none(*pud)) | 3613 | if (pud_none(*pud)) { |
3573 | pud_populate(mm, pud, | 3614 | pud_populate(mm, pud, |
3574 | (pmd_t *)((unsigned long)spte & PAGE_MASK)); | 3615 | (pmd_t *)((unsigned long)spte & PAGE_MASK)); |
3575 | else | 3616 | } else { |
3576 | put_page(virt_to_page(spte)); | 3617 | put_page(virt_to_page(spte)); |
3618 | mm_inc_nr_pmds(mm); | ||
3619 | } | ||
3577 | spin_unlock(ptl); | 3620 | spin_unlock(ptl); |
3578 | out: | 3621 | out: |
3579 | pte = (pte_t *)pmd_alloc(mm, pud, addr); | 3622 | pte = (pte_t *)pmd_alloc(mm, pud, addr); |
@@ -3604,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) | |||
3604 | 3647 | ||
3605 | pud_clear(pud); | 3648 | pud_clear(pud); |
3606 | put_page(virt_to_page(ptep)); | 3649 | put_page(virt_to_page(ptep)); |
3650 | mm_dec_nr_pmds(mm); | ||
3607 | *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; | 3651 | *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE; |
3608 | return 1; | 3652 | return 1; |
3609 | } | 3653 | } |
@@ -3660,42 +3704,64 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) | |||
3660 | return (pte_t *) pmd; | 3704 | return (pte_t *) pmd; |
3661 | } | 3705 | } |
3662 | 3706 | ||
3663 | struct page * | 3707 | #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ |
3664 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | ||
3665 | pmd_t *pmd, int write) | ||
3666 | { | ||
3667 | struct page *page; | ||
3668 | 3708 | ||
3669 | page = pte_page(*(pte_t *)pmd); | 3709 | /* |
3670 | if (page) | 3710 | * These functions are overwritable if your architecture needs its own |
3671 | page += ((address & ~PMD_MASK) >> PAGE_SHIFT); | 3711 | * behavior. |
3672 | return page; | 3712 | */ |
3713 | struct page * __weak | ||
3714 | follow_huge_addr(struct mm_struct *mm, unsigned long address, | ||
3715 | int write) | ||
3716 | { | ||
3717 | return ERR_PTR(-EINVAL); | ||
3673 | } | 3718 | } |
3674 | 3719 | ||
3675 | struct page * | 3720 | struct page * __weak |
3676 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | 3721 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
3677 | pud_t *pud, int write) | 3722 | pmd_t *pmd, int flags) |
3678 | { | 3723 | { |
3679 | struct page *page; | 3724 | struct page *page = NULL; |
3680 | 3725 | spinlock_t *ptl; | |
3681 | page = pte_page(*(pte_t *)pud); | 3726 | retry: |
3682 | if (page) | 3727 | ptl = pmd_lockptr(mm, pmd); |
3683 | page += ((address & ~PUD_MASK) >> PAGE_SHIFT); | 3728 | spin_lock(ptl); |
3729 | /* | ||
3730 | * make sure that the address range covered by this pmd is not | ||
3731 | * unmapped from other threads. | ||
3732 | */ | ||
3733 | if (!pmd_huge(*pmd)) | ||
3734 | goto out; | ||
3735 | if (pmd_present(*pmd)) { | ||
3736 | page = pte_page(*(pte_t *)pmd) + | ||
3737 | ((address & ~PMD_MASK) >> PAGE_SHIFT); | ||
3738 | if (flags & FOLL_GET) | ||
3739 | get_page(page); | ||
3740 | } else { | ||
3741 | if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { | ||
3742 | spin_unlock(ptl); | ||
3743 | __migration_entry_wait(mm, (pte_t *)pmd, ptl); | ||
3744 | goto retry; | ||
3745 | } | ||
3746 | /* | ||
3747 | * hwpoisoned entry is treated as no_page_table in | ||
3748 | * follow_page_mask(). | ||
3749 | */ | ||
3750 | } | ||
3751 | out: | ||
3752 | spin_unlock(ptl); | ||
3684 | return page; | 3753 | return page; |
3685 | } | 3754 | } |
3686 | 3755 | ||
3687 | #else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */ | ||
3688 | |||
3689 | /* Can be overriden by architectures */ | ||
3690 | struct page * __weak | 3756 | struct page * __weak |
3691 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | 3757 | follow_huge_pud(struct mm_struct *mm, unsigned long address, |
3692 | pud_t *pud, int write) | 3758 | pud_t *pud, int flags) |
3693 | { | 3759 | { |
3694 | BUG(); | 3760 | if (flags & FOLL_GET) |
3695 | return NULL; | 3761 | return NULL; |
3696 | } | ||
3697 | 3762 | ||
3698 | #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ | 3763 | return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); |
3764 | } | ||
3699 | 3765 | ||
3700 | #ifdef CONFIG_MEMORY_FAILURE | 3766 | #ifdef CONFIG_MEMORY_FAILURE |
3701 | 3767 | ||
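
Because follow_huge_addr(), follow_huge_pmd() and follow_huge_pud() are now plain __weak symbols (see the "overwritable" comment above), an architecture overrides them simply by providing a normal definition. A hypothetical sketch, where arch_lookup_huge_page() is an invented stand-in for an arch-specific lookup, not a real API:

	/* hypothetical arch/<arch>/mm/hugetlbpage.c */
	struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
				      int write)
	{
		/* arch_lookup_huge_page() is made up for illustration */
		struct page *page = arch_lookup_huge_page(mm, address, write);

		return page ? page : ERR_PTR(-EINVAL);
	}
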
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 037e1c00a5b7..6e0057439a46 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c | |||
@@ -279,7 +279,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, | |||
279 | return -EINVAL; | 279 | return -EINVAL; |
280 | 280 | ||
281 | buf = strstrip(buf); | 281 | buf = strstrip(buf); |
282 | ret = page_counter_memparse(buf, &nr_pages); | 282 | ret = page_counter_memparse(buf, "-1", &nr_pages); |
283 | if (ret) | 283 | if (ret) |
284 | return ret; | 284 | return ret; |
285 | 285 | ||
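
The new "-1" argument reflects a page_counter_memparse() signature change elsewhere in this series: as I read it, the second parameter names the string this interface accepts as "no limit" (written back as PAGE_COUNTER_MAX), so hugetlb_cgroup keeps recognising "-1" while other controllers can choose a different spelling.
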
diff --git a/mm/internal.h b/mm/internal.h index efad241f7014..a96da5b0029d 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -110,6 +110,28 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); | |||
110 | */ | 110 | */ |
111 | 111 | ||
112 | /* | 112 | /* |
113 | * Structure for holding the mostly immutable allocation parameters passed | ||
114 | * between functions involved in allocations, including the alloc_pages* | ||
115 | * family of functions. | ||
116 | * | ||
117 | * nodemask, migratetype and high_zoneidx are initialized only once in | ||
118 | * __alloc_pages_nodemask() and then never change. | ||
119 | * | ||
120 | * zonelist, preferred_zone and classzone_idx are set first in | ||
121 | * __alloc_pages_nodemask() for the fast path, and might be later changed | ||
122 | * in __alloc_pages_slowpath(). All other functions pass the whole structure | ||
123 | * by a const pointer. | ||
124 | */ | ||
125 | struct alloc_context { | ||
126 | struct zonelist *zonelist; | ||
127 | nodemask_t *nodemask; | ||
128 | struct zone *preferred_zone; | ||
129 | int classzone_idx; | ||
130 | int migratetype; | ||
131 | enum zone_type high_zoneidx; | ||
132 | }; | ||
133 | |||
134 | /* | ||
113 | * Locate the struct page for both the matching buddy in our | 135 | * Locate the struct page for both the matching buddy in our |
114 | * pair (buddy1) and the combined O(n+1) page they form (page). | 136 | * pair (buddy1) and the combined O(n+1) page they form (page). |
115 | * | 137 | * |
@@ -329,8 +351,10 @@ extern int mminit_loglevel; | |||
329 | #define mminit_dprintk(level, prefix, fmt, arg...) \ | 351 | #define mminit_dprintk(level, prefix, fmt, arg...) \ |
330 | do { \ | 352 | do { \ |
331 | if (level < mminit_loglevel) { \ | 353 | if (level < mminit_loglevel) { \ |
332 | printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ | 354 | if (level <= MMINIT_WARNING) \ |
333 | printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ | 355 | printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \ |
356 | else \ | ||
357 | printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ | ||
334 | } \ | 358 | } \ |
335 | } while (0) | 359 | } while (0) |
336 | 360 | ||
diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 8da581fa9060..f2c2492681bf 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c | |||
@@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) | |||
21 | return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; | 21 | return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; |
22 | } | 22 | } |
23 | 23 | ||
24 | INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb, | 24 | INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, |
25 | unsigned long, shared.linear.rb_subtree_last, | 25 | unsigned long, shared.rb_subtree_last, |
26 | vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) | 26 | vma_start_pgoff, vma_last_pgoff,, vma_interval_tree) |
27 | 27 | ||
28 | /* Insert node immediately after prev in the interval tree */ | 28 | /* Insert node immediately after prev in the interval tree */ |
@@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, | |||
36 | 36 | ||
37 | VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); | 37 | VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); |
38 | 38 | ||
39 | if (!prev->shared.linear.rb.rb_right) { | 39 | if (!prev->shared.rb.rb_right) { |
40 | parent = prev; | 40 | parent = prev; |
41 | link = &prev->shared.linear.rb.rb_right; | 41 | link = &prev->shared.rb.rb_right; |
42 | } else { | 42 | } else { |
43 | parent = rb_entry(prev->shared.linear.rb.rb_right, | 43 | parent = rb_entry(prev->shared.rb.rb_right, |
44 | struct vm_area_struct, shared.linear.rb); | 44 | struct vm_area_struct, shared.rb); |
45 | if (parent->shared.linear.rb_subtree_last < last) | 45 | if (parent->shared.rb_subtree_last < last) |
46 | parent->shared.linear.rb_subtree_last = last; | 46 | parent->shared.rb_subtree_last = last; |
47 | while (parent->shared.linear.rb.rb_left) { | 47 | while (parent->shared.rb.rb_left) { |
48 | parent = rb_entry(parent->shared.linear.rb.rb_left, | 48 | parent = rb_entry(parent->shared.rb.rb_left, |
49 | struct vm_area_struct, shared.linear.rb); | 49 | struct vm_area_struct, shared.rb); |
50 | if (parent->shared.linear.rb_subtree_last < last) | 50 | if (parent->shared.rb_subtree_last < last) |
51 | parent->shared.linear.rb_subtree_last = last; | 51 | parent->shared.rb_subtree_last = last; |
52 | } | 52 | } |
53 | link = &parent->shared.linear.rb.rb_left; | 53 | link = &parent->shared.rb.rb_left; |
54 | } | 54 | } |
55 | 55 | ||
56 | node->shared.linear.rb_subtree_last = last; | 56 | node->shared.rb_subtree_last = last; |
57 | rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link); | 57 | rb_link_node(&node->shared.rb, &parent->shared.rb, link); |
58 | rb_insert_augmented(&node->shared.linear.rb, root, | 58 | rb_insert_augmented(&node->shared.rb, root, |
59 | &vma_interval_tree_augment); | 59 | &vma_interval_tree_augment); |
60 | } | 60 | } |
61 | 61 | ||
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile new file mode 100644 index 000000000000..bd837b8c2f41 --- /dev/null +++ b/mm/kasan/Makefile | |||
@@ -0,0 +1,8 @@ | |||
1 | KASAN_SANITIZE := n | ||
2 | |||
3 | CFLAGS_REMOVE_kasan.o = -pg | ||
4 | # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 | ||
5 | # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 | ||
6 | CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) | ||
7 | |||
8 | obj-y := kasan.o report.o | ||
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c new file mode 100644 index 000000000000..78fee632a7ee --- /dev/null +++ b/mm/kasan/kasan.c | |||
@@ -0,0 +1,516 @@ | |||
1 | /* | ||
2 | * This file contains shadow memory manipulation code. | ||
3 | * | ||
4 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. | ||
5 | * Author: Andrey Ryabinin <a.ryabinin@samsung.com> | ||
6 | * | ||
7 | * Some of code borrowed from https://github.com/xairy/linux by | ||
8 | * Andrey Konovalov <adech.fo@gmail.com> | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License version 2 as | ||
12 | * published by the Free Software Foundation. | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
17 | #define DISABLE_BRANCH_PROFILING | ||
18 | |||
19 | #include <linux/export.h> | ||
20 | #include <linux/init.h> | ||
21 | #include <linux/kernel.h> | ||
22 | #include <linux/memblock.h> | ||
23 | #include <linux/memory.h> | ||
24 | #include <linux/mm.h> | ||
25 | #include <linux/module.h> | ||
26 | #include <linux/printk.h> | ||
27 | #include <linux/sched.h> | ||
28 | #include <linux/slab.h> | ||
29 | #include <linux/stacktrace.h> | ||
30 | #include <linux/string.h> | ||
31 | #include <linux/types.h> | ||
32 | #include <linux/kasan.h> | ||
33 | |||
34 | #include "kasan.h" | ||
35 | #include "../slab.h" | ||
36 | |||
37 | /* | ||
38 | * Poisons the shadow memory for 'size' bytes starting from 'addr'. | ||
39 | * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. | ||
40 | */ | ||
41 | static void kasan_poison_shadow(const void *address, size_t size, u8 value) | ||
42 | { | ||
43 | void *shadow_start, *shadow_end; | ||
44 | |||
45 | shadow_start = kasan_mem_to_shadow(address); | ||
46 | shadow_end = kasan_mem_to_shadow(address + size); | ||
47 | |||
48 | memset(shadow_start, value, shadow_end - shadow_start); | ||
49 | } | ||
50 | |||
51 | void kasan_unpoison_shadow(const void *address, size_t size) | ||
52 | { | ||
53 | kasan_poison_shadow(address, size, 0); | ||
54 | |||
55 | if (size & KASAN_SHADOW_MASK) { | ||
56 | u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); | ||
57 | *shadow = size & KASAN_SHADOW_MASK; | ||
58 | } | ||
59 | } | ||
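
A worked example of the partial-granule encoding above (my reading of the code; with KASAN_SHADOW_SCALE_SIZE == 8 one shadow byte covers eight bytes of memory):

	/*
	 * kasan_unpoison_shadow(p, 13) on an 8-byte-aligned p:
	 *
	 *   shadow[0] = 0    all of p[0..7] is addressable
	 *   shadow[1] = 5    only p[8..12] is addressable (13 & KASAN_SHADOW_MASK)
	 *
	 * memory_is_poisoned_1() below compares "addr & KASAN_SHADOW_MASK"
	 * against the shadow value: offsets 0..4 of the second granule pass,
	 * offsets 5..7 are reported as out of bounds.
	 */
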
60 | |||
61 | |||
62 | /* | ||
63 | * All functions below always inlined so compiler could | ||
64 | * perform better optimizations in each of __asan_loadX/__assn_storeX | ||
65 | * depending on memory access size X. | ||
66 | */ | ||
67 | |||
68 | static __always_inline bool memory_is_poisoned_1(unsigned long addr) | ||
69 | { | ||
70 | s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); | ||
71 | |||
72 | if (unlikely(shadow_value)) { | ||
73 | s8 last_accessible_byte = addr & KASAN_SHADOW_MASK; | ||
74 | return unlikely(last_accessible_byte >= shadow_value); | ||
75 | } | ||
76 | |||
77 | return false; | ||
78 | } | ||
79 | |||
80 | static __always_inline bool memory_is_poisoned_2(unsigned long addr) | ||
81 | { | ||
82 | u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); | ||
83 | |||
84 | if (unlikely(*shadow_addr)) { | ||
85 | if (memory_is_poisoned_1(addr + 1)) | ||
86 | return true; | ||
87 | |||
88 | if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) | ||
89 | return false; | ||
90 | |||
91 | return unlikely(*(u8 *)shadow_addr); | ||
92 | } | ||
93 | |||
94 | return false; | ||
95 | } | ||
96 | |||
97 | static __always_inline bool memory_is_poisoned_4(unsigned long addr) | ||
98 | { | ||
99 | u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); | ||
100 | |||
101 | if (unlikely(*shadow_addr)) { | ||
102 | if (memory_is_poisoned_1(addr + 3)) | ||
103 | return true; | ||
104 | |||
105 | if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) | ||
106 | return false; | ||
107 | |||
108 | return unlikely(*(u8 *)shadow_addr); | ||
109 | } | ||
110 | |||
111 | return false; | ||
112 | } | ||
113 | |||
114 | static __always_inline bool memory_is_poisoned_8(unsigned long addr) | ||
115 | { | ||
116 | u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); | ||
117 | |||
118 | if (unlikely(*shadow_addr)) { | ||
119 | if (memory_is_poisoned_1(addr + 7)) | ||
120 | return true; | ||
121 | |||
122 | if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7)) | ||
123 | return false; | ||
124 | |||
125 | return unlikely(*(u8 *)shadow_addr); | ||
126 | } | ||
127 | |||
128 | return false; | ||
129 | } | ||
130 | |||
131 | static __always_inline bool memory_is_poisoned_16(unsigned long addr) | ||
132 | { | ||
133 | u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr); | ||
134 | |||
135 | if (unlikely(*shadow_addr)) { | ||
136 | u16 shadow_first_bytes = *(u16 *)shadow_addr; | ||
137 | s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK; | ||
138 | |||
139 | if (unlikely(shadow_first_bytes)) | ||
140 | return true; | ||
141 | |||
142 | if (likely(!last_byte)) | ||
143 | return false; | ||
144 | |||
145 | return memory_is_poisoned_1(addr + 15); | ||
146 | } | ||
147 | |||
148 | return false; | ||
149 | } | ||
150 | |||
151 | static __always_inline unsigned long bytes_is_zero(const u8 *start, | ||
152 | size_t size) | ||
153 | { | ||
154 | while (size) { | ||
155 | if (unlikely(*start)) | ||
156 | return (unsigned long)start; | ||
157 | start++; | ||
158 | size--; | ||
159 | } | ||
160 | |||
161 | return 0; | ||
162 | } | ||
163 | |||
164 | static __always_inline unsigned long memory_is_zero(const void *start, | ||
165 | const void *end) | ||
166 | { | ||
167 | unsigned int words; | ||
168 | unsigned long ret; | ||
169 | unsigned int prefix = (unsigned long)start % 8; | ||
170 | |||
171 | if (end - start <= 16) | ||
172 | return bytes_is_zero(start, end - start); | ||
173 | |||
174 | if (prefix) { | ||
175 | prefix = 8 - prefix; | ||
176 | ret = bytes_is_zero(start, prefix); | ||
177 | if (unlikely(ret)) | ||
178 | return ret; | ||
179 | start += prefix; | ||
180 | } | ||
181 | |||
182 | words = (end - start) / 8; | ||
183 | while (words) { | ||
184 | if (unlikely(*(u64 *)start)) | ||
185 | return bytes_is_zero(start, 8); | ||
186 | start += 8; | ||
187 | words--; | ||
188 | } | ||
189 | |||
190 | return bytes_is_zero(start, (end - start) % 8); | ||
191 | } | ||
192 | |||
193 | static __always_inline bool memory_is_poisoned_n(unsigned long addr, | ||
194 | size_t size) | ||
195 | { | ||
196 | unsigned long ret; | ||
197 | |||
198 | ret = memory_is_zero(kasan_mem_to_shadow((void *)addr), | ||
199 | kasan_mem_to_shadow((void *)addr + size - 1) + 1); | ||
200 | |||
201 | if (unlikely(ret)) { | ||
202 | unsigned long last_byte = addr + size - 1; | ||
203 | s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); | ||
204 | |||
205 | if (unlikely(ret != (unsigned long)last_shadow || | ||
206 | ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) | ||
207 | return true; | ||
208 | } | ||
209 | return false; | ||
210 | } | ||
211 | |||
212 | static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) | ||
213 | { | ||
214 | if (__builtin_constant_p(size)) { | ||
215 | switch (size) { | ||
216 | case 1: | ||
217 | return memory_is_poisoned_1(addr); | ||
218 | case 2: | ||
219 | return memory_is_poisoned_2(addr); | ||
220 | case 4: | ||
221 | return memory_is_poisoned_4(addr); | ||
222 | case 8: | ||
223 | return memory_is_poisoned_8(addr); | ||
224 | case 16: | ||
225 | return memory_is_poisoned_16(addr); | ||
226 | default: | ||
227 | BUILD_BUG(); | ||
228 | } | ||
229 | } | ||
230 | |||
231 | return memory_is_poisoned_n(addr, size); | ||
232 | } | ||
233 | |||
234 | |||
235 | static __always_inline void check_memory_region(unsigned long addr, | ||
236 | size_t size, bool write) | ||
237 | { | ||
238 | struct kasan_access_info info; | ||
239 | |||
240 | if (unlikely(size == 0)) | ||
241 | return; | ||
242 | |||
243 | if (unlikely((void *)addr < | ||
244 | kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { | ||
245 | info.access_addr = (void *)addr; | ||
246 | info.access_size = size; | ||
247 | info.is_write = write; | ||
248 | info.ip = _RET_IP_; | ||
249 | kasan_report_user_access(&info); | ||
250 | return; | ||
251 | } | ||
252 | |||
253 | if (likely(!memory_is_poisoned(addr, size))) | ||
254 | return; | ||
255 | |||
256 | kasan_report(addr, size, write, _RET_IP_); | ||
257 | } | ||
258 | |||
259 | void __asan_loadN(unsigned long addr, size_t size); | ||
260 | void __asan_storeN(unsigned long addr, size_t size); | ||
261 | |||
262 | #undef memset | ||
263 | void *memset(void *addr, int c, size_t len) | ||
264 | { | ||
265 | __asan_storeN((unsigned long)addr, len); | ||
266 | |||
267 | return __memset(addr, c, len); | ||
268 | } | ||
269 | |||
270 | #undef memmove | ||
271 | void *memmove(void *dest, const void *src, size_t len) | ||
272 | { | ||
273 | __asan_loadN((unsigned long)src, len); | ||
274 | __asan_storeN((unsigned long)dest, len); | ||
275 | |||
276 | return __memmove(dest, src, len); | ||
277 | } | ||
278 | |||
279 | #undef memcpy | ||
280 | void *memcpy(void *dest, const void *src, size_t len) | ||
281 | { | ||
282 | __asan_loadN((unsigned long)src, len); | ||
283 | __asan_storeN((unsigned long)dest, len); | ||
284 | |||
285 | return __memcpy(dest, src, len); | ||
286 | } | ||
287 | |||
288 | void kasan_alloc_pages(struct page *page, unsigned int order) | ||
289 | { | ||
290 | if (likely(!PageHighMem(page))) | ||
291 | kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); | ||
292 | } | ||
293 | |||
294 | void kasan_free_pages(struct page *page, unsigned int order) | ||
295 | { | ||
296 | if (likely(!PageHighMem(page))) | ||
297 | kasan_poison_shadow(page_address(page), | ||
298 | PAGE_SIZE << order, | ||
299 | KASAN_FREE_PAGE); | ||
300 | } | ||
301 | |||
302 | void kasan_poison_slab(struct page *page) | ||
303 | { | ||
304 | kasan_poison_shadow(page_address(page), | ||
305 | PAGE_SIZE << compound_order(page), | ||
306 | KASAN_KMALLOC_REDZONE); | ||
307 | } | ||
308 | |||
309 | void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) | ||
310 | { | ||
311 | kasan_unpoison_shadow(object, cache->object_size); | ||
312 | } | ||
313 | |||
314 | void kasan_poison_object_data(struct kmem_cache *cache, void *object) | ||
315 | { | ||
316 | kasan_poison_shadow(object, | ||
317 | round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), | ||
318 | KASAN_KMALLOC_REDZONE); | ||
319 | } | ||
320 | |||
321 | void kasan_slab_alloc(struct kmem_cache *cache, void *object) | ||
322 | { | ||
323 | kasan_kmalloc(cache, object, cache->object_size); | ||
324 | } | ||
325 | |||
326 | void kasan_slab_free(struct kmem_cache *cache, void *object) | ||
327 | { | ||
328 | unsigned long size = cache->object_size; | ||
329 | unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); | ||
330 | |||
331 | /* RCU slabs could be legally used after free within the RCU period */ | ||
332 | if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) | ||
333 | return; | ||
334 | |||
335 | kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); | ||
336 | } | ||
337 | |||
338 | void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) | ||
339 | { | ||
340 | unsigned long redzone_start; | ||
341 | unsigned long redzone_end; | ||
342 | |||
343 | if (unlikely(object == NULL)) | ||
344 | return; | ||
345 | |||
346 | redzone_start = round_up((unsigned long)(object + size), | ||
347 | KASAN_SHADOW_SCALE_SIZE); | ||
348 | redzone_end = round_up((unsigned long)object + cache->object_size, | ||
349 | KASAN_SHADOW_SCALE_SIZE); | ||
350 | |||
351 | kasan_unpoison_shadow(object, size); | ||
352 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, | ||
353 | KASAN_KMALLOC_REDZONE); | ||
354 | } | ||
355 | EXPORT_SYMBOL(kasan_kmalloc); | ||
356 | |||
357 | void kasan_kmalloc_large(const void *ptr, size_t size) | ||
358 | { | ||
359 | struct page *page; | ||
360 | unsigned long redzone_start; | ||
361 | unsigned long redzone_end; | ||
362 | |||
363 | if (unlikely(ptr == NULL)) | ||
364 | return; | ||
365 | |||
366 | page = virt_to_page(ptr); | ||
367 | redzone_start = round_up((unsigned long)(ptr + size), | ||
368 | KASAN_SHADOW_SCALE_SIZE); | ||
369 | redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page)); | ||
370 | |||
371 | kasan_unpoison_shadow(ptr, size); | ||
372 | kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, | ||
373 | KASAN_PAGE_REDZONE); | ||
374 | } | ||
375 | |||
376 | void kasan_krealloc(const void *object, size_t size) | ||
377 | { | ||
378 | struct page *page; | ||
379 | |||
380 | if (unlikely(object == ZERO_SIZE_PTR)) | ||
381 | return; | ||
382 | |||
383 | page = virt_to_head_page(object); | ||
384 | |||
385 | if (unlikely(!PageSlab(page))) | ||
386 | kasan_kmalloc_large(object, size); | ||
387 | else | ||
388 | kasan_kmalloc(page->slab_cache, object, size); | ||
389 | } | ||
390 | |||
391 | void kasan_kfree_large(const void *ptr) | ||
392 | { | ||
393 | struct page *page = virt_to_page(ptr); | ||
394 | |||
395 | kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), | ||
396 | KASAN_FREE_PAGE); | ||
397 | } | ||
398 | |||
399 | int kasan_module_alloc(void *addr, size_t size) | ||
400 | { | ||
401 | void *ret; | ||
402 | size_t shadow_size; | ||
403 | unsigned long shadow_start; | ||
404 | |||
405 | shadow_start = (unsigned long)kasan_mem_to_shadow(addr); | ||
406 | shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT, | ||
407 | PAGE_SIZE); | ||
408 | |||
409 | if (WARN_ON(!PAGE_ALIGNED(shadow_start))) | ||
410 | return -EINVAL; | ||
411 | |||
412 | ret = __vmalloc_node_range(shadow_size, 1, shadow_start, | ||
413 | shadow_start + shadow_size, | ||
414 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | ||
415 | PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, | ||
416 | __builtin_return_address(0)); | ||
417 | return ret ? 0 : -ENOMEM; | ||
418 | } | ||
419 | |||
420 | void kasan_module_free(void *addr) | ||
421 | { | ||
422 | vfree(kasan_mem_to_shadow(addr)); | ||
423 | } | ||
424 | |||
425 | static void register_global(struct kasan_global *global) | ||
426 | { | ||
427 | size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); | ||
428 | |||
429 | kasan_unpoison_shadow(global->beg, global->size); | ||
430 | |||
431 | kasan_poison_shadow(global->beg + aligned_size, | ||
432 | global->size_with_redzone - aligned_size, | ||
433 | KASAN_GLOBAL_REDZONE); | ||
434 | } | ||
435 | |||
436 | void __asan_register_globals(struct kasan_global *globals, size_t size) | ||
437 | { | ||
438 | int i; | ||
439 | |||
440 | for (i = 0; i < size; i++) | ||
441 | register_global(&globals[i]); | ||
442 | } | ||
443 | EXPORT_SYMBOL(__asan_register_globals); | ||
444 | |||
445 | void __asan_unregister_globals(struct kasan_global *globals, size_t size) | ||
446 | { | ||
447 | } | ||
448 | EXPORT_SYMBOL(__asan_unregister_globals); | ||
449 | |||
450 | #define DEFINE_ASAN_LOAD_STORE(size) \ | ||
451 | void __asan_load##size(unsigned long addr) \ | ||
452 | { \ | ||
453 | check_memory_region(addr, size, false); \ | ||
454 | } \ | ||
455 | EXPORT_SYMBOL(__asan_load##size); \ | ||
456 | __alias(__asan_load##size) \ | ||
457 | void __asan_load##size##_noabort(unsigned long); \ | ||
458 | EXPORT_SYMBOL(__asan_load##size##_noabort); \ | ||
459 | void __asan_store##size(unsigned long addr) \ | ||
460 | { \ | ||
461 | check_memory_region(addr, size, true); \ | ||
462 | } \ | ||
463 | EXPORT_SYMBOL(__asan_store##size); \ | ||
464 | __alias(__asan_store##size) \ | ||
465 | void __asan_store##size##_noabort(unsigned long); \ | ||
466 | EXPORT_SYMBOL(__asan_store##size##_noabort) | ||
467 | |||
468 | DEFINE_ASAN_LOAD_STORE(1); | ||
469 | DEFINE_ASAN_LOAD_STORE(2); | ||
470 | DEFINE_ASAN_LOAD_STORE(4); | ||
471 | DEFINE_ASAN_LOAD_STORE(8); | ||
472 | DEFINE_ASAN_LOAD_STORE(16); | ||
473 | |||
474 | void __asan_loadN(unsigned long addr, size_t size) | ||
475 | { | ||
476 | check_memory_region(addr, size, false); | ||
477 | } | ||
478 | EXPORT_SYMBOL(__asan_loadN); | ||
479 | |||
480 | __alias(__asan_loadN) | ||
481 | void __asan_loadN_noabort(unsigned long, size_t); | ||
482 | EXPORT_SYMBOL(__asan_loadN_noabort); | ||
483 | |||
484 | void __asan_storeN(unsigned long addr, size_t size) | ||
485 | { | ||
486 | check_memory_region(addr, size, true); | ||
487 | } | ||
488 | EXPORT_SYMBOL(__asan_storeN); | ||
489 | |||
490 | __alias(__asan_storeN) | ||
491 | void __asan_storeN_noabort(unsigned long, size_t); | ||
492 | EXPORT_SYMBOL(__asan_storeN_noabort); | ||
493 | |||
494 | /* to shut up compiler complaints */ | ||
495 | void __asan_handle_no_return(void) {} | ||
496 | EXPORT_SYMBOL(__asan_handle_no_return); | ||
497 | |||
498 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
499 | static int kasan_mem_notifier(struct notifier_block *nb, | ||
500 | unsigned long action, void *data) | ||
501 | { | ||
502 | return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK; | ||
503 | } | ||
504 | |||
505 | static int __init kasan_memhotplug_init(void) | ||
506 | { | ||
507 | pr_err("WARNING: KASan doesn't support memory hot-add\n"); | ||
508 | pr_err("Memory hot-add will be disabled\n"); | ||
509 | |||
510 | hotplug_memory_notifier(kasan_mem_notifier, 0); | ||
511 | |||
512 | return 0; | ||
513 | } | ||
514 | |||
515 | module_init(kasan_memhotplug_init); | ||
516 | #endif | ||
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h new file mode 100644 index 000000000000..4986b0acab21 --- /dev/null +++ b/mm/kasan/kasan.h | |||
@@ -0,0 +1,75 @@ | |||
1 | #ifndef __MM_KASAN_KASAN_H | ||
2 | #define __MM_KASAN_KASAN_H | ||
3 | |||
4 | #include <linux/kasan.h> | ||
5 | |||
6 | #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) | ||
7 | #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) | ||
8 | |||
9 | #define KASAN_FREE_PAGE 0xFF /* page was freed */ | ||
11 | #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ | ||
12 | #define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */ | ||
13 | #define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */ | ||
14 | #define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */ | ||
15 | |||
16 | /* | ||
17 | * Stack redzone shadow values | ||
18 | * (Those are compiler's ABI, don't change them) | ||
19 | */ | ||
20 | #define KASAN_STACK_LEFT 0xF1 | ||
21 | #define KASAN_STACK_MID 0xF2 | ||
22 | #define KASAN_STACK_RIGHT 0xF3 | ||
23 | #define KASAN_STACK_PARTIAL 0xF4 | ||
24 | |||
25 | /* Don't break randconfig/all*config builds */ | ||
26 | #ifndef KASAN_ABI_VERSION | ||
27 | #define KASAN_ABI_VERSION 1 | ||
28 | #endif | ||
29 | |||
30 | struct kasan_access_info { | ||
31 | const void *access_addr; | ||
32 | const void *first_bad_addr; | ||
33 | size_t access_size; | ||
34 | bool is_write; | ||
35 | unsigned long ip; | ||
36 | }; | ||
37 | |||
38 | /* The layout of struct dictated by compiler */ | ||
39 | struct kasan_source_location { | ||
40 | const char *filename; | ||
41 | int line_no; | ||
42 | int column_no; | ||
43 | }; | ||
44 | |||
45 | /* The layout of struct dictated by compiler */ | ||
46 | struct kasan_global { | ||
47 | const void *beg; /* Address of the beginning of the global variable. */ | ||
48 | size_t size; /* Size of the global variable. */ | ||
49 | size_t size_with_redzone; /* Size of the variable + size of the red zone. 32 bytes aligned */ | ||
50 | const void *name; | ||
51 | const void *module_name; /* Name of the module where the global variable is declared. */ | ||
52 | unsigned long has_dynamic_init; /* This needed for C++ */ | ||
53 | #if KASAN_ABI_VERSION >= 4 | ||
54 | struct kasan_source_location *location; | ||
55 | #endif | ||
56 | }; | ||
57 | |||
58 | void kasan_report_error(struct kasan_access_info *info); | ||
59 | void kasan_report_user_access(struct kasan_access_info *info); | ||
60 | |||
61 | static inline const void *kasan_shadow_to_mem(const void *shadow_addr) | ||
62 | { | ||
63 | return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) | ||
64 | << KASAN_SHADOW_SCALE_SHIFT); | ||
65 | } | ||
66 | |||
67 | static inline bool kasan_enabled(void) | ||
68 | { | ||
69 | return !current->kasan_depth; | ||
70 | } | ||
71 | |||
72 | void kasan_report(unsigned long addr, size_t size, | ||
73 | bool is_write, unsigned long ip); | ||
74 | |||
75 | #endif | ||
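
kasan_shadow_to_mem() above is the inverse of kasan_mem_to_shadow() from include/linux/kasan.h, which maps a kernel address to its shadow byte as (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET. A minimal userspace model of the pair follows; it assumes the scale shift of 3 used here and an illustrative offset value, since the real KASAN_SHADOW_OFFSET is set per architecture.

#include <stdio.h>

#define SHADOW_SCALE_SHIFT	3
#define SHADOW_OFFSET		0xdffffc0000000000UL	/* illustrative, arch-specific */

/* Model of kasan_mem_to_shadow(): one shadow byte per 8-byte granule. */
static unsigned long mem_to_shadow(unsigned long addr)
{
	return (addr >> SHADOW_SCALE_SHIFT) + SHADOW_OFFSET;
}

/* Model of kasan_shadow_to_mem() above: maps back to the granule start. */
static unsigned long shadow_to_mem(unsigned long shadow)
{
	return (shadow - SHADOW_OFFSET) << SHADOW_SCALE_SHIFT;
}

int main(void)
{
	unsigned long addr = 0xffff88001234567bUL;

	printf("addr    %016lx\n", addr);
	printf("shadow  %016lx\n", mem_to_shadow(addr));
	/* round-trips to addr & ~7UL, i.e. the start of the 8-byte granule */
	printf("back    %016lx\n", shadow_to_mem(mem_to_shadow(addr)));
	return 0;
}
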
diff --git a/mm/kasan/report.c b/mm/kasan/report.c new file mode 100644 index 000000000000..680ceedf810a --- /dev/null +++ b/mm/kasan/report.c | |||
@@ -0,0 +1,269 @@ | |||
1 | /* | ||
2 | * This file contains error reporting code. | ||
3 | * | ||
4 | * Copyright (c) 2014 Samsung Electronics Co., Ltd. | ||
5 | * Author: Andrey Ryabinin <a.ryabinin@samsung.com> | ||
6 | * | ||
7 | * Some of code borrowed from https://github.com/xairy/linux by | ||
8 | * Andrey Konovalov <adech.fo@gmail.com> | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License version 2 as | ||
12 | * published by the Free Software Foundation. | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/kernel.h> | ||
17 | #include <linux/mm.h> | ||
18 | #include <linux/printk.h> | ||
19 | #include <linux/sched.h> | ||
20 | #include <linux/slab.h> | ||
21 | #include <linux/stacktrace.h> | ||
22 | #include <linux/string.h> | ||
23 | #include <linux/types.h> | ||
24 | #include <linux/kasan.h> | ||
25 | |||
26 | #include <asm/sections.h> | ||
27 | |||
28 | #include "kasan.h" | ||
29 | #include "../slab.h" | ||
30 | |||
31 | /* Shadow layout customization. */ | ||
32 | #define SHADOW_BYTES_PER_BLOCK 1 | ||
33 | #define SHADOW_BLOCKS_PER_ROW 16 | ||
34 | #define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK) | ||
35 | #define SHADOW_ROWS_AROUND_ADDR 2 | ||
36 | |||
37 | static const void *find_first_bad_addr(const void *addr, size_t size) | ||
38 | { | ||
39 | u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr); | ||
40 | const void *first_bad_addr = addr; | ||
41 | |||
42 | while (!shadow_val && first_bad_addr < addr + size) { | ||
43 | first_bad_addr += KASAN_SHADOW_SCALE_SIZE; | ||
44 | shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr); | ||
45 | } | ||
46 | return first_bad_addr; | ||
47 | } | ||
48 | |||
49 | static void print_error_description(struct kasan_access_info *info) | ||
50 | { | ||
51 | const char *bug_type = "unknown crash"; | ||
52 | u8 shadow_val; | ||
53 | |||
54 | info->first_bad_addr = find_first_bad_addr(info->access_addr, | ||
55 | info->access_size); | ||
56 | |||
57 | shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr); | ||
58 | |||
59 | switch (shadow_val) { | ||
60 | case KASAN_FREE_PAGE: | ||
61 | case KASAN_KMALLOC_FREE: | ||
62 | bug_type = "use after free"; | ||
63 | break; | ||
64 | case KASAN_PAGE_REDZONE: | ||
65 | case KASAN_KMALLOC_REDZONE: | ||
66 | case KASAN_GLOBAL_REDZONE: | ||
67 | case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: | ||
68 | bug_type = "out of bounds access"; | ||
69 | break; | ||
70 | case KASAN_STACK_LEFT: | ||
71 | case KASAN_STACK_MID: | ||
72 | case KASAN_STACK_RIGHT: | ||
73 | case KASAN_STACK_PARTIAL: | ||
74 | bug_type = "out of bounds on stack"; | ||
75 | break; | ||
76 | } | ||
77 | |||
78 | pr_err("BUG: KASan: %s in %pS at addr %p\n", | ||
79 | bug_type, (void *)info->ip, | ||
80 | info->access_addr); | ||
81 | pr_err("%s of size %zu by task %s/%d\n", | ||
82 | info->is_write ? "Write" : "Read", | ||
83 | info->access_size, current->comm, task_pid_nr(current)); | ||
84 | } | ||
85 | |||
86 | static inline bool kernel_or_module_addr(const void *addr) | ||
87 | { | ||
88 | return (addr >= (void *)_stext && addr < (void *)_end) | ||
89 | || (addr >= (void *)MODULES_VADDR | ||
90 | && addr < (void *)MODULES_END); | ||
91 | } | ||
92 | |||
93 | static inline bool init_task_stack_addr(const void *addr) | ||
94 | { | ||
95 | return addr >= (void *)&init_thread_union.stack && | ||
96 | (addr <= (void *)&init_thread_union.stack + | ||
97 | sizeof(init_thread_union.stack)); | ||
98 | } | ||
99 | |||
100 | static void print_address_description(struct kasan_access_info *info) | ||
101 | { | ||
102 | const void *addr = info->access_addr; | ||
103 | |||
104 | if ((addr >= (void *)PAGE_OFFSET) && | ||
105 | (addr < high_memory)) { | ||
106 | struct page *page = virt_to_head_page(addr); | ||
107 | |||
108 | if (PageSlab(page)) { | ||
109 | void *object; | ||
110 | struct kmem_cache *cache = page->slab_cache; | ||
111 | void *last_object; | ||
112 | |||
113 | object = virt_to_obj(cache, page_address(page), addr); | ||
114 | last_object = page_address(page) + | ||
115 | page->objects * cache->size; | ||
116 | |||
117 | if (unlikely(object > last_object)) | ||
118 | object = last_object; /* we hit into padding */ | ||
119 | |||
120 | object_err(cache, page, object, | ||
121 | "kasan: bad access detected"); | ||
122 | return; | ||
123 | } | ||
124 | dump_page(page, "kasan: bad access detected"); | ||
125 | } | ||
126 | |||
127 | if (kernel_or_module_addr(addr)) { | ||
128 | if (!init_task_stack_addr(addr)) | ||
129 | pr_err("Address belongs to variable %pS\n", addr); | ||
130 | } | ||
131 | |||
132 | dump_stack(); | ||
133 | } | ||
134 | |||
135 | static bool row_is_guilty(const void *row, const void *guilty) | ||
136 | { | ||
137 | return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW); | ||
138 | } | ||
139 | |||
140 | static int shadow_pointer_offset(const void *row, const void *shadow) | ||
141 | { | ||
142 | /* The length of ">ff00ff00ff00ff00: " is | ||
143 | * 3 + (BITS_PER_LONG/8)*2 chars. | ||
144 | */ | ||
145 | return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 + | ||
146 | (shadow - row) / SHADOW_BYTES_PER_BLOCK + 1; | ||
147 | } | ||
148 | |||
149 | static void print_shadow_for_address(const void *addr) | ||
150 | { | ||
151 | int i; | ||
152 | const void *shadow = kasan_mem_to_shadow(addr); | ||
153 | const void *shadow_row; | ||
154 | |||
155 | shadow_row = (void *)round_down((unsigned long)shadow, | ||
156 | SHADOW_BYTES_PER_ROW) | ||
157 | - SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW; | ||
158 | |||
159 | pr_err("Memory state around the buggy address:\n"); | ||
160 | |||
161 | for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) { | ||
162 | const void *kaddr = kasan_shadow_to_mem(shadow_row); | ||
163 | char buffer[4 + (BITS_PER_LONG/8)*2]; | ||
164 | |||
165 | snprintf(buffer, sizeof(buffer), | ||
166 | (i == 0) ? ">%p: " : " %p: ", kaddr); | ||
167 | |||
168 | kasan_disable_current(); | ||
169 | print_hex_dump(KERN_ERR, buffer, | ||
170 | DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, | ||
171 | shadow_row, SHADOW_BYTES_PER_ROW, 0); | ||
172 | kasan_enable_current(); | ||
173 | |||
174 | if (row_is_guilty(shadow_row, shadow)) | ||
175 | pr_err("%*c\n", | ||
176 | shadow_pointer_offset(shadow_row, shadow), | ||
177 | '^'); | ||
178 | |||
179 | shadow_row += SHADOW_BYTES_PER_ROW; | ||
180 | } | ||
181 | } | ||
182 | |||
183 | static DEFINE_SPINLOCK(report_lock); | ||
184 | |||
185 | void kasan_report_error(struct kasan_access_info *info) | ||
186 | { | ||
187 | unsigned long flags; | ||
188 | |||
189 | spin_lock_irqsave(&report_lock, flags); | ||
190 | pr_err("=================================" | ||
191 | "=================================\n"); | ||
192 | print_error_description(info); | ||
193 | print_address_description(info); | ||
194 | print_shadow_for_address(info->first_bad_addr); | ||
195 | pr_err("=================================" | ||
196 | "=================================\n"); | ||
197 | spin_unlock_irqrestore(&report_lock, flags); | ||
198 | } | ||
199 | |||
200 | void kasan_report_user_access(struct kasan_access_info *info) | ||
201 | { | ||
202 | unsigned long flags; | ||
203 | |||
204 | spin_lock_irqsave(&report_lock, flags); | ||
205 | pr_err("=================================" | ||
206 | "=================================\n"); | ||
207 | pr_err("BUG: KASan: user-memory-access on address %p\n", | ||
208 | info->access_addr); | ||
209 | pr_err("%s of size %zu by task %s/%d\n", | ||
210 | info->is_write ? "Write" : "Read", | ||
211 | info->access_size, current->comm, task_pid_nr(current)); | ||
212 | dump_stack(); | ||
213 | pr_err("=================================" | ||
214 | "=================================\n"); | ||
215 | spin_unlock_irqrestore(&report_lock, flags); | ||
216 | } | ||
217 | |||
218 | void kasan_report(unsigned long addr, size_t size, | ||
219 | bool is_write, unsigned long ip) | ||
220 | { | ||
221 | struct kasan_access_info info; | ||
222 | |||
223 | if (likely(!kasan_enabled())) | ||
224 | return; | ||
225 | |||
226 | info.access_addr = (void *)addr; | ||
227 | info.access_size = size; | ||
228 | info.is_write = is_write; | ||
229 | info.ip = ip; | ||
230 | kasan_report_error(&info); | ||
231 | } | ||
232 | |||
233 | |||
234 | #define DEFINE_ASAN_REPORT_LOAD(size) \ | ||
235 | void __asan_report_load##size##_noabort(unsigned long addr) \ | ||
236 | { \ | ||
237 | kasan_report(addr, size, false, _RET_IP_); \ | ||
238 | } \ | ||
239 | EXPORT_SYMBOL(__asan_report_load##size##_noabort) | ||
240 | |||
241 | #define DEFINE_ASAN_REPORT_STORE(size) \ | ||
242 | void __asan_report_store##size##_noabort(unsigned long addr) \ | ||
243 | { \ | ||
244 | kasan_report(addr, size, true, _RET_IP_); \ | ||
245 | } \ | ||
246 | EXPORT_SYMBOL(__asan_report_store##size##_noabort) | ||
247 | |||
248 | DEFINE_ASAN_REPORT_LOAD(1); | ||
249 | DEFINE_ASAN_REPORT_LOAD(2); | ||
250 | DEFINE_ASAN_REPORT_LOAD(4); | ||
251 | DEFINE_ASAN_REPORT_LOAD(8); | ||
252 | DEFINE_ASAN_REPORT_LOAD(16); | ||
253 | DEFINE_ASAN_REPORT_STORE(1); | ||
254 | DEFINE_ASAN_REPORT_STORE(2); | ||
255 | DEFINE_ASAN_REPORT_STORE(4); | ||
256 | DEFINE_ASAN_REPORT_STORE(8); | ||
257 | DEFINE_ASAN_REPORT_STORE(16); | ||
258 | |||
259 | void __asan_report_load_n_noabort(unsigned long addr, size_t size) | ||
260 | { | ||
261 | kasan_report(addr, size, false, _RET_IP_); | ||
262 | } | ||
263 | EXPORT_SYMBOL(__asan_report_load_n_noabort); | ||
264 | |||
265 | void __asan_report_store_n_noabort(unsigned long addr, size_t size) | ||
266 | { | ||
267 | kasan_report(addr, size, true, _RET_IP_); | ||
268 | } | ||
269 | EXPORT_SYMBOL(__asan_report_store_n_noabort); | ||
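
One detail worth spelling out is the caret placement in print_shadow_for_address(): shadow_pointer_offset() counts the 3 + (BITS_PER_LONG/8)*2 characters of the ">%p: " row prefix plus three output characters per shadow byte (two hex digits and a separator, since SHADOW_BYTES_PER_BLOCK is 1). The short userspace model below works that arithmetic through for a 64-bit kernel, where the prefix is 19 characters wide; it is only a sketch of the expression, not kernel code.

#include <stdio.h>

#define BITS_PER_LONG		64
#define SHADOW_BYTES_PER_BLOCK	1

/* Same expression as shadow_pointer_offset(), with k = shadow - row. */
static int caret_column(int k)
{
	return 3 + (BITS_PER_LONG / 8) * 2 + k * 2 + k / SHADOW_BYTES_PER_BLOCK + 1;
}

int main(void)
{
	int k;

	/* ">ffff880012345600: " is 19 chars, then each shadow byte prints as "xx " */
	for (k = 0; k < 4; k++)
		printf("shadow byte %d in the row -> '^' printed at column %d\n",
		       k, caret_column(k));
	return 0;
}
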
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 3cda50c1e394..5405aff5a590 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
@@ -98,6 +98,7 @@ | |||
98 | #include <asm/processor.h> | 98 | #include <asm/processor.h> |
99 | #include <linux/atomic.h> | 99 | #include <linux/atomic.h> |
100 | 100 | ||
101 | #include <linux/kasan.h> | ||
101 | #include <linux/kmemcheck.h> | 102 | #include <linux/kmemcheck.h> |
102 | #include <linux/kmemleak.h> | 103 | #include <linux/kmemleak.h> |
103 | #include <linux/memory_hotplug.h> | 104 | #include <linux/memory_hotplug.h> |
@@ -1113,7 +1114,10 @@ static bool update_checksum(struct kmemleak_object *object) | |||
1113 | if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) | 1114 | if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) |
1114 | return false; | 1115 | return false; |
1115 | 1116 | ||
1117 | kasan_disable_current(); | ||
1116 | object->checksum = crc32(0, (void *)object->pointer, object->size); | 1118 | object->checksum = crc32(0, (void *)object->pointer, object->size); |
1119 | kasan_enable_current(); | ||
1120 | |||
1117 | return object->checksum != old_csum; | 1121 | return object->checksum != old_csum; |
1118 | } | 1122 | } |
1119 | 1123 | ||
@@ -1164,7 +1168,9 @@ static void scan_block(void *_start, void *_end, | |||
1164 | BYTES_PER_POINTER)) | 1168 | BYTES_PER_POINTER)) |
1165 | continue; | 1169 | continue; |
1166 | 1170 | ||
1171 | kasan_disable_current(); | ||
1167 | pointer = *ptr; | 1172 | pointer = *ptr; |
1173 | kasan_enable_current(); | ||
1168 | 1174 | ||
1169 | object = find_and_get_object(pointer, 1); | 1175 | object = find_and_get_object(pointer, 1); |
1170 | if (!object) | 1176 | if (!object) |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -376,7 +376,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
376 | else | 376 | else |
377 | ret = VM_FAULT_WRITE; | 377 | ret = VM_FAULT_WRITE; |
378 | put_page(page); | 378 | put_page(page); |
379 | } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); | 379 | } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); |
380 | /* | 380 | /* |
381 | * We must loop because handle_mm_fault() may back out if there's | 381 | * We must loop because handle_mm_fault() may back out if there's |
382 | * any difficulty e.g. if pte accessed bit gets updated concurrently. | 382 | * any difficulty e.g. if pte accessed bit gets updated concurrently. |
@@ -1748,7 +1748,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
1748 | */ | 1748 | */ |
1749 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | | 1749 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | |
1750 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | 1750 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | |
1751 | VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP)) | 1751 | VM_HUGETLB | VM_MIXEDMAP)) |
1752 | return 0; /* just ignore the advice */ | 1752 | return 0; /* just ignore the advice */ |
1753 | 1753 | ||
1754 | #ifdef VM_SAO | 1754 | #ifdef VM_SAO |
diff --git a/mm/list_lru.c b/mm/list_lru.c index f1a0db194173..909eca2c820e 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c | |||
@@ -9,18 +9,100 @@ | |||
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/list_lru.h> | 10 | #include <linux/list_lru.h> |
11 | #include <linux/slab.h> | 11 | #include <linux/slab.h> |
12 | #include <linux/mutex.h> | ||
13 | #include <linux/memcontrol.h> | ||
14 | |||
15 | #ifdef CONFIG_MEMCG_KMEM | ||
16 | static LIST_HEAD(list_lrus); | ||
17 | static DEFINE_MUTEX(list_lrus_mutex); | ||
18 | |||
19 | static void list_lru_register(struct list_lru *lru) | ||
20 | { | ||
21 | mutex_lock(&list_lrus_mutex); | ||
22 | list_add(&lru->list, &list_lrus); | ||
23 | mutex_unlock(&list_lrus_mutex); | ||
24 | } | ||
25 | |||
26 | static void list_lru_unregister(struct list_lru *lru) | ||
27 | { | ||
28 | mutex_lock(&list_lrus_mutex); | ||
29 | list_del(&lru->list); | ||
30 | mutex_unlock(&list_lrus_mutex); | ||
31 | } | ||
32 | #else | ||
33 | static void list_lru_register(struct list_lru *lru) | ||
34 | { | ||
35 | } | ||
36 | |||
37 | static void list_lru_unregister(struct list_lru *lru) | ||
38 | { | ||
39 | } | ||
40 | #endif /* CONFIG_MEMCG_KMEM */ | ||
41 | |||
42 | #ifdef CONFIG_MEMCG_KMEM | ||
43 | static inline bool list_lru_memcg_aware(struct list_lru *lru) | ||
44 | { | ||
45 | return !!lru->node[0].memcg_lrus; | ||
46 | } | ||
47 | |||
48 | static inline struct list_lru_one * | ||
49 | list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) | ||
50 | { | ||
51 | /* | ||
52 | * The lock protects the array of per cgroup lists from relocation | ||
53 | * (see memcg_update_list_lru_node). | ||
54 | */ | ||
55 | lockdep_assert_held(&nlru->lock); | ||
56 | if (nlru->memcg_lrus && idx >= 0) | ||
57 | return nlru->memcg_lrus->lru[idx]; | ||
58 | |||
59 | return &nlru->lru; | ||
60 | } | ||
61 | |||
62 | static inline struct list_lru_one * | ||
63 | list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) | ||
64 | { | ||
65 | struct mem_cgroup *memcg; | ||
66 | |||
67 | if (!nlru->memcg_lrus) | ||
68 | return &nlru->lru; | ||
69 | |||
70 | memcg = mem_cgroup_from_kmem(ptr); | ||
71 | if (!memcg) | ||
72 | return &nlru->lru; | ||
73 | |||
74 | return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); | ||
75 | } | ||
76 | #else | ||
77 | static inline bool list_lru_memcg_aware(struct list_lru *lru) | ||
78 | { | ||
79 | return false; | ||
80 | } | ||
81 | |||
82 | static inline struct list_lru_one * | ||
83 | list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) | ||
84 | { | ||
85 | return &nlru->lru; | ||
86 | } | ||
87 | |||
88 | static inline struct list_lru_one * | ||
89 | list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) | ||
90 | { | ||
91 | return &nlru->lru; | ||
92 | } | ||
93 | #endif /* CONFIG_MEMCG_KMEM */ | ||
12 | 94 | ||
13 | bool list_lru_add(struct list_lru *lru, struct list_head *item) | 95 | bool list_lru_add(struct list_lru *lru, struct list_head *item) |
14 | { | 96 | { |
15 | int nid = page_to_nid(virt_to_page(item)); | 97 | int nid = page_to_nid(virt_to_page(item)); |
16 | struct list_lru_node *nlru = &lru->node[nid]; | 98 | struct list_lru_node *nlru = &lru->node[nid]; |
99 | struct list_lru_one *l; | ||
17 | 100 | ||
18 | spin_lock(&nlru->lock); | 101 | spin_lock(&nlru->lock); |
19 | WARN_ON_ONCE(nlru->nr_items < 0); | 102 | l = list_lru_from_kmem(nlru, item); |
20 | if (list_empty(item)) { | 103 | if (list_empty(item)) { |
21 | list_add_tail(item, &nlru->list); | 104 | list_add_tail(item, &l->list); |
22 | if (nlru->nr_items++ == 0) | 105 | l->nr_items++; |
23 | node_set(nid, lru->active_nodes); | ||
24 | spin_unlock(&nlru->lock); | 106 | spin_unlock(&nlru->lock); |
25 | return true; | 107 | return true; |
26 | } | 108 | } |
@@ -33,13 +115,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) | |||
33 | { | 115 | { |
34 | int nid = page_to_nid(virt_to_page(item)); | 116 | int nid = page_to_nid(virt_to_page(item)); |
35 | struct list_lru_node *nlru = &lru->node[nid]; | 117 | struct list_lru_node *nlru = &lru->node[nid]; |
118 | struct list_lru_one *l; | ||
36 | 119 | ||
37 | spin_lock(&nlru->lock); | 120 | spin_lock(&nlru->lock); |
121 | l = list_lru_from_kmem(nlru, item); | ||
38 | if (!list_empty(item)) { | 122 | if (!list_empty(item)) { |
39 | list_del_init(item); | 123 | list_del_init(item); |
40 | if (--nlru->nr_items == 0) | 124 | l->nr_items--; |
41 | node_clear(nid, lru->active_nodes); | ||
42 | WARN_ON_ONCE(nlru->nr_items < 0); | ||
43 | spin_unlock(&nlru->lock); | 125 | spin_unlock(&nlru->lock); |
44 | return true; | 126 | return true; |
45 | } | 127 | } |
@@ -48,33 +130,72 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) | |||
48 | } | 130 | } |
49 | EXPORT_SYMBOL_GPL(list_lru_del); | 131 | EXPORT_SYMBOL_GPL(list_lru_del); |
50 | 132 | ||
51 | unsigned long | 133 | void list_lru_isolate(struct list_lru_one *list, struct list_head *item) |
52 | list_lru_count_node(struct list_lru *lru, int nid) | 134 | { |
135 | list_del_init(item); | ||
136 | list->nr_items--; | ||
137 | } | ||
138 | EXPORT_SYMBOL_GPL(list_lru_isolate); | ||
139 | |||
140 | void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, | ||
141 | struct list_head *head) | ||
142 | { | ||
143 | list_move(item, head); | ||
144 | list->nr_items--; | ||
145 | } | ||
146 | EXPORT_SYMBOL_GPL(list_lru_isolate_move); | ||
147 | |||
148 | static unsigned long __list_lru_count_one(struct list_lru *lru, | ||
149 | int nid, int memcg_idx) | ||
53 | { | 150 | { |
54 | unsigned long count = 0; | ||
55 | struct list_lru_node *nlru = &lru->node[nid]; | 151 | struct list_lru_node *nlru = &lru->node[nid]; |
152 | struct list_lru_one *l; | ||
153 | unsigned long count; | ||
56 | 154 | ||
57 | spin_lock(&nlru->lock); | 155 | spin_lock(&nlru->lock); |
58 | WARN_ON_ONCE(nlru->nr_items < 0); | 156 | l = list_lru_from_memcg_idx(nlru, memcg_idx); |
59 | count += nlru->nr_items; | 157 | count = l->nr_items; |
60 | spin_unlock(&nlru->lock); | 158 | spin_unlock(&nlru->lock); |
61 | 159 | ||
62 | return count; | 160 | return count; |
63 | } | 161 | } |
162 | |||
163 | unsigned long list_lru_count_one(struct list_lru *lru, | ||
164 | int nid, struct mem_cgroup *memcg) | ||
165 | { | ||
166 | return __list_lru_count_one(lru, nid, memcg_cache_id(memcg)); | ||
167 | } | ||
168 | EXPORT_SYMBOL_GPL(list_lru_count_one); | ||
169 | |||
170 | unsigned long list_lru_count_node(struct list_lru *lru, int nid) | ||
171 | { | ||
172 | long count = 0; | ||
173 | int memcg_idx; | ||
174 | |||
175 | count += __list_lru_count_one(lru, nid, -1); | ||
176 | if (list_lru_memcg_aware(lru)) { | ||
177 | for_each_memcg_cache_index(memcg_idx) | ||
178 | count += __list_lru_count_one(lru, nid, memcg_idx); | ||
179 | } | ||
180 | return count; | ||
181 | } | ||
64 | EXPORT_SYMBOL_GPL(list_lru_count_node); | 182 | EXPORT_SYMBOL_GPL(list_lru_count_node); |
65 | 183 | ||
66 | unsigned long | 184 | static unsigned long |
67 | list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, | 185 | __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, |
68 | void *cb_arg, unsigned long *nr_to_walk) | 186 | list_lru_walk_cb isolate, void *cb_arg, |
187 | unsigned long *nr_to_walk) | ||
69 | { | 188 | { |
70 | 189 | ||
71 | struct list_lru_node *nlru = &lru->node[nid]; | 190 | struct list_lru_node *nlru = &lru->node[nid]; |
191 | struct list_lru_one *l; | ||
72 | struct list_head *item, *n; | 192 | struct list_head *item, *n; |
73 | unsigned long isolated = 0; | 193 | unsigned long isolated = 0; |
74 | 194 | ||
75 | spin_lock(&nlru->lock); | 195 | spin_lock(&nlru->lock); |
196 | l = list_lru_from_memcg_idx(nlru, memcg_idx); | ||
76 | restart: | 197 | restart: |
77 | list_for_each_safe(item, n, &nlru->list) { | 198 | list_for_each_safe(item, n, &l->list) { |
78 | enum lru_status ret; | 199 | enum lru_status ret; |
79 | 200 | ||
80 | /* | 201 | /* |
@@ -85,14 +206,11 @@ restart: | |||
85 | break; | 206 | break; |
86 | --*nr_to_walk; | 207 | --*nr_to_walk; |
87 | 208 | ||
88 | ret = isolate(item, &nlru->lock, cb_arg); | 209 | ret = isolate(item, l, &nlru->lock, cb_arg); |
89 | switch (ret) { | 210 | switch (ret) { |
90 | case LRU_REMOVED_RETRY: | 211 | case LRU_REMOVED_RETRY: |
91 | assert_spin_locked(&nlru->lock); | 212 | assert_spin_locked(&nlru->lock); |
92 | case LRU_REMOVED: | 213 | case LRU_REMOVED: |
93 | if (--nlru->nr_items == 0) | ||
94 | node_clear(nid, lru->active_nodes); | ||
95 | WARN_ON_ONCE(nlru->nr_items < 0); | ||
96 | isolated++; | 214 | isolated++; |
97 | /* | 215 | /* |
98 | * If the lru lock has been dropped, our list | 216 | * If the lru lock has been dropped, our list |
@@ -103,7 +221,7 @@ restart: | |||
103 | goto restart; | 221 | goto restart; |
104 | break; | 222 | break; |
105 | case LRU_ROTATE: | 223 | case LRU_ROTATE: |
106 | list_move_tail(item, &nlru->list); | 224 | list_move_tail(item, &l->list); |
107 | break; | 225 | break; |
108 | case LRU_SKIP: | 226 | case LRU_SKIP: |
109 | break; | 227 | break; |
@@ -122,31 +240,322 @@ restart: | |||
122 | spin_unlock(&nlru->lock); | 240 | spin_unlock(&nlru->lock); |
123 | return isolated; | 241 | return isolated; |
124 | } | 242 | } |
243 | |||
244 | unsigned long | ||
245 | list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, | ||
246 | list_lru_walk_cb isolate, void *cb_arg, | ||
247 | unsigned long *nr_to_walk) | ||
248 | { | ||
249 | return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg), | ||
250 | isolate, cb_arg, nr_to_walk); | ||
251 | } | ||
252 | EXPORT_SYMBOL_GPL(list_lru_walk_one); | ||
253 | |||
254 | unsigned long list_lru_walk_node(struct list_lru *lru, int nid, | ||
255 | list_lru_walk_cb isolate, void *cb_arg, | ||
256 | unsigned long *nr_to_walk) | ||
257 | { | ||
258 | long isolated = 0; | ||
259 | int memcg_idx; | ||
260 | |||
261 | isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg, | ||
262 | nr_to_walk); | ||
263 | if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { | ||
264 | for_each_memcg_cache_index(memcg_idx) { | ||
265 | isolated += __list_lru_walk_one(lru, nid, memcg_idx, | ||
266 | isolate, cb_arg, nr_to_walk); | ||
267 | if (*nr_to_walk <= 0) | ||
268 | break; | ||
269 | } | ||
270 | } | ||
271 | return isolated; | ||
272 | } | ||
125 | EXPORT_SYMBOL_GPL(list_lru_walk_node); | 273 | EXPORT_SYMBOL_GPL(list_lru_walk_node); |
126 | 274 | ||
127 | int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) | 275 | static void init_one_lru(struct list_lru_one *l) |
276 | { | ||
277 | INIT_LIST_HEAD(&l->list); | ||
278 | l->nr_items = 0; | ||
279 | } | ||
280 | |||
281 | #ifdef CONFIG_MEMCG_KMEM | ||
282 | static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, | ||
283 | int begin, int end) | ||
284 | { | ||
285 | int i; | ||
286 | |||
287 | for (i = begin; i < end; i++) | ||
288 | kfree(memcg_lrus->lru[i]); | ||
289 | } | ||
290 | |||
291 | static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus, | ||
292 | int begin, int end) | ||
293 | { | ||
294 | int i; | ||
295 | |||
296 | for (i = begin; i < end; i++) { | ||
297 | struct list_lru_one *l; | ||
298 | |||
299 | l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL); | ||
300 | if (!l) | ||
301 | goto fail; | ||
302 | |||
303 | init_one_lru(l); | ||
304 | memcg_lrus->lru[i] = l; | ||
305 | } | ||
306 | return 0; | ||
307 | fail: | ||
308 | __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1); | ||
309 | return -ENOMEM; | ||
310 | } | ||
311 | |||
312 | static int memcg_init_list_lru_node(struct list_lru_node *nlru) | ||
313 | { | ||
314 | int size = memcg_nr_cache_ids; | ||
315 | |||
316 | nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL); | ||
317 | if (!nlru->memcg_lrus) | ||
318 | return -ENOMEM; | ||
319 | |||
320 | if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { | ||
321 | kfree(nlru->memcg_lrus); | ||
322 | return -ENOMEM; | ||
323 | } | ||
324 | |||
325 | return 0; | ||
326 | } | ||
327 | |||
328 | static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) | ||
329 | { | ||
330 | __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); | ||
331 | kfree(nlru->memcg_lrus); | ||
332 | } | ||
333 | |||
334 | static int memcg_update_list_lru_node(struct list_lru_node *nlru, | ||
335 | int old_size, int new_size) | ||
336 | { | ||
337 | struct list_lru_memcg *old, *new; | ||
338 | |||
339 | BUG_ON(old_size > new_size); | ||
340 | |||
341 | old = nlru->memcg_lrus; | ||
342 | new = kmalloc(new_size * sizeof(void *), GFP_KERNEL); | ||
343 | if (!new) | ||
344 | return -ENOMEM; | ||
345 | |||
346 | if (__memcg_init_list_lru_node(new, old_size, new_size)) { | ||
347 | kfree(new); | ||
348 | return -ENOMEM; | ||
349 | } | ||
350 | |||
351 | memcpy(new, old, old_size * sizeof(void *)); | ||
352 | |||
353 | /* | ||
354 | * The lock guarantees that we won't race with a reader | ||
355 | * (see list_lru_from_memcg_idx). | ||
356 | * | ||
357 | * Since list_lru_{add,del} may be called under an IRQ-safe lock, | ||
358 | * we have to use IRQ-safe primitives here to avoid deadlock. | ||
359 | */ | ||
360 | spin_lock_irq(&nlru->lock); | ||
361 | nlru->memcg_lrus = new; | ||
362 | spin_unlock_irq(&nlru->lock); | ||
363 | |||
364 | kfree(old); | ||
365 | return 0; | ||
366 | } | ||
367 | |||
368 | static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, | ||
369 | int old_size, int new_size) | ||
370 | { | ||
371 | /* do not bother shrinking the array back to the old size, because we | ||
372 | * cannot handle allocation failures here */ | ||
373 | __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); | ||
374 | } | ||
375 | |||
376 | static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) | ||
377 | { | ||
378 | int i; | ||
379 | |||
380 | for (i = 0; i < nr_node_ids; i++) { | ||
381 | if (!memcg_aware) | ||
382 | lru->node[i].memcg_lrus = NULL; | ||
383 | else if (memcg_init_list_lru_node(&lru->node[i])) | ||
384 | goto fail; | ||
385 | } | ||
386 | return 0; | ||
387 | fail: | ||
388 | for (i = i - 1; i >= 0; i--) | ||
389 | memcg_destroy_list_lru_node(&lru->node[i]); | ||
390 | return -ENOMEM; | ||
391 | } | ||
392 | |||
393 | static void memcg_destroy_list_lru(struct list_lru *lru) | ||
394 | { | ||
395 | int i; | ||
396 | |||
397 | if (!list_lru_memcg_aware(lru)) | ||
398 | return; | ||
399 | |||
400 | for (i = 0; i < nr_node_ids; i++) | ||
401 | memcg_destroy_list_lru_node(&lru->node[i]); | ||
402 | } | ||
403 | |||
404 | static int memcg_update_list_lru(struct list_lru *lru, | ||
405 | int old_size, int new_size) | ||
406 | { | ||
407 | int i; | ||
408 | |||
409 | if (!list_lru_memcg_aware(lru)) | ||
410 | return 0; | ||
411 | |||
412 | for (i = 0; i < nr_node_ids; i++) { | ||
413 | if (memcg_update_list_lru_node(&lru->node[i], | ||
414 | old_size, new_size)) | ||
415 | goto fail; | ||
416 | } | ||
417 | return 0; | ||
418 | fail: | ||
419 | for (i = i - 1; i >= 0; i--) | ||
420 | memcg_cancel_update_list_lru_node(&lru->node[i], | ||
421 | old_size, new_size); | ||
422 | return -ENOMEM; | ||
423 | } | ||
424 | |||
425 | static void memcg_cancel_update_list_lru(struct list_lru *lru, | ||
426 | int old_size, int new_size) | ||
427 | { | ||
428 | int i; | ||
429 | |||
430 | if (!list_lru_memcg_aware(lru)) | ||
431 | return; | ||
432 | |||
433 | for (i = 0; i < nr_node_ids; i++) | ||
434 | memcg_cancel_update_list_lru_node(&lru->node[i], | ||
435 | old_size, new_size); | ||
436 | } | ||
437 | |||
438 | int memcg_update_all_list_lrus(int new_size) | ||
439 | { | ||
440 | int ret = 0; | ||
441 | struct list_lru *lru; | ||
442 | int old_size = memcg_nr_cache_ids; | ||
443 | |||
444 | mutex_lock(&list_lrus_mutex); | ||
445 | list_for_each_entry(lru, &list_lrus, list) { | ||
446 | ret = memcg_update_list_lru(lru, old_size, new_size); | ||
447 | if (ret) | ||
448 | goto fail; | ||
449 | } | ||
450 | out: | ||
451 | mutex_unlock(&list_lrus_mutex); | ||
452 | return ret; | ||
453 | fail: | ||
454 | list_for_each_entry_continue_reverse(lru, &list_lrus, list) | ||
455 | memcg_cancel_update_list_lru(lru, old_size, new_size); | ||
456 | goto out; | ||
457 | } | ||
458 | |||
459 | static void memcg_drain_list_lru_node(struct list_lru_node *nlru, | ||
460 | int src_idx, int dst_idx) | ||
461 | { | ||
462 | struct list_lru_one *src, *dst; | ||
463 | |||
464 | /* | ||
465 | * Since list_lru_{add,del} may be called under an IRQ-safe lock, | ||
466 | * we have to use IRQ-safe primitives here to avoid deadlock. | ||
467 | */ | ||
468 | spin_lock_irq(&nlru->lock); | ||
469 | |||
470 | src = list_lru_from_memcg_idx(nlru, src_idx); | ||
471 | dst = list_lru_from_memcg_idx(nlru, dst_idx); | ||
472 | |||
473 | list_splice_init(&src->list, &dst->list); | ||
474 | dst->nr_items += src->nr_items; | ||
475 | src->nr_items = 0; | ||
476 | |||
477 | spin_unlock_irq(&nlru->lock); | ||
478 | } | ||
479 | |||
480 | static void memcg_drain_list_lru(struct list_lru *lru, | ||
481 | int src_idx, int dst_idx) | ||
482 | { | ||
483 | int i; | ||
484 | |||
485 | if (!list_lru_memcg_aware(lru)) | ||
486 | return; | ||
487 | |||
488 | for (i = 0; i < nr_node_ids; i++) | ||
489 | memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); | ||
490 | } | ||
491 | |||
492 | void memcg_drain_all_list_lrus(int src_idx, int dst_idx) | ||
493 | { | ||
494 | struct list_lru *lru; | ||
495 | |||
496 | mutex_lock(&list_lrus_mutex); | ||
497 | list_for_each_entry(lru, &list_lrus, list) | ||
498 | memcg_drain_list_lru(lru, src_idx, dst_idx); | ||
499 | mutex_unlock(&list_lrus_mutex); | ||
500 | } | ||
501 | #else | ||
502 | static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) | ||
503 | { | ||
504 | return 0; | ||
505 | } | ||
506 | |||
507 | static void memcg_destroy_list_lru(struct list_lru *lru) | ||
508 | { | ||
509 | } | ||
510 | #endif /* CONFIG_MEMCG_KMEM */ | ||
511 | |||
512 | int __list_lru_init(struct list_lru *lru, bool memcg_aware, | ||
513 | struct lock_class_key *key) | ||
128 | { | 514 | { |
129 | int i; | 515 | int i; |
130 | size_t size = sizeof(*lru->node) * nr_node_ids; | 516 | size_t size = sizeof(*lru->node) * nr_node_ids; |
517 | int err = -ENOMEM; | ||
518 | |||
519 | memcg_get_cache_ids(); | ||
131 | 520 | ||
132 | lru->node = kzalloc(size, GFP_KERNEL); | 521 | lru->node = kzalloc(size, GFP_KERNEL); |
133 | if (!lru->node) | 522 | if (!lru->node) |
134 | return -ENOMEM; | 523 | goto out; |
135 | 524 | ||
136 | nodes_clear(lru->active_nodes); | ||
137 | for (i = 0; i < nr_node_ids; i++) { | 525 | for (i = 0; i < nr_node_ids; i++) { |
138 | spin_lock_init(&lru->node[i].lock); | 526 | spin_lock_init(&lru->node[i].lock); |
139 | if (key) | 527 | if (key) |
140 | lockdep_set_class(&lru->node[i].lock, key); | 528 | lockdep_set_class(&lru->node[i].lock, key); |
141 | INIT_LIST_HEAD(&lru->node[i].list); | 529 | init_one_lru(&lru->node[i].lru); |
142 | lru->node[i].nr_items = 0; | ||
143 | } | 530 | } |
144 | return 0; | 531 | |
532 | err = memcg_init_list_lru(lru, memcg_aware); | ||
533 | if (err) { | ||
534 | kfree(lru->node); | ||
535 | goto out; | ||
536 | } | ||
537 | |||
538 | list_lru_register(lru); | ||
539 | out: | ||
540 | memcg_put_cache_ids(); | ||
541 | return err; | ||
145 | } | 542 | } |
146 | EXPORT_SYMBOL_GPL(list_lru_init_key); | 543 | EXPORT_SYMBOL_GPL(__list_lru_init); |
147 | 544 | ||
148 | void list_lru_destroy(struct list_lru *lru) | 545 | void list_lru_destroy(struct list_lru *lru) |
149 | { | 546 | { |
547 | /* Already destroyed or not yet initialized? */ | ||
548 | if (!lru->node) | ||
549 | return; | ||
550 | |||
551 | memcg_get_cache_ids(); | ||
552 | |||
553 | list_lru_unregister(lru); | ||
554 | |||
555 | memcg_destroy_list_lru(lru); | ||
150 | kfree(lru->node); | 556 | kfree(lru->node); |
557 | lru->node = NULL; | ||
558 | |||
559 | memcg_put_cache_ids(); | ||
151 | } | 560 | } |
152 | EXPORT_SYMBOL_GPL(list_lru_destroy); | 561 | EXPORT_SYMBOL_GPL(list_lru_destroy); |
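
With this change the walk callback receives the struct list_lru_one it is iterating (ret = isolate(item, l, &nlru->lock, cb_arg) above) and item counts are kept per list, so callbacks must detach entries through list_lru_isolate()/list_lru_isolate_move() rather than a bare list_del(). Below is a hypothetical caller-side sketch under the new signature; struct my_object and my_isolate() are invented for illustration and are not part of this patch.

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/list_lru.h>
#include <linux/spinlock.h>

struct my_object {
	struct list_head lru;
	/* ... payload ... */
};

static enum lru_status my_isolate(struct list_head *item,
				  struct list_lru_one *list,
				  spinlock_t *lru_lock, void *cb_arg)
{
	struct my_object *obj = container_of(item, struct my_object, lru);
	struct list_head *freeable = cb_arg;	/* caller's private dispose list */

	/*
	 * Detach through the helper so that list->nr_items stays in sync;
	 * a bare list_del_init() would leave the count stale.
	 */
	list_lru_isolate_move(list, &obj->lru, freeable);
	return LRU_REMOVED;
}
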
diff --git a/mm/madvise.c b/mm/madvise.c index a271adc93289..1077cbdc8b52 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -155,7 +155,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, | |||
155 | pte = *(orig_pte + ((index - start) / PAGE_SIZE)); | 155 | pte = *(orig_pte + ((index - start) / PAGE_SIZE)); |
156 | pte_unmap_unlock(orig_pte, ptl); | 156 | pte_unmap_unlock(orig_pte, ptl); |
157 | 157 | ||
158 | if (pte_present(pte) || pte_none(pte) || pte_file(pte)) | 158 | if (pte_present(pte) || pte_none(pte)) |
159 | continue; | 159 | continue; |
160 | entry = pte_to_swp_entry(pte); | 160 | entry = pte_to_swp_entry(pte); |
161 | if (unlikely(non_swap_entry(entry))) | 161 | if (unlikely(non_swap_entry(entry))) |
@@ -222,19 +222,22 @@ static long madvise_willneed(struct vm_area_struct *vma, | |||
222 | struct file *file = vma->vm_file; | 222 | struct file *file = vma->vm_file; |
223 | 223 | ||
224 | #ifdef CONFIG_SWAP | 224 | #ifdef CONFIG_SWAP |
225 | if (!file || mapping_cap_swap_backed(file->f_mapping)) { | 225 | if (!file) { |
226 | *prev = vma; | 226 | *prev = vma; |
227 | if (!file) | 227 | force_swapin_readahead(vma, start, end); |
228 | force_swapin_readahead(vma, start, end); | ||
229 | else | ||
230 | force_shm_swapin_readahead(vma, start, end, | ||
231 | file->f_mapping); | ||
232 | return 0; | 228 | return 0; |
233 | } | 229 | } |
234 | #endif | ||
235 | 230 | ||
231 | if (shmem_mapping(file->f_mapping)) { | ||
232 | *prev = vma; | ||
233 | force_shm_swapin_readahead(vma, start, end, | ||
234 | file->f_mapping); | ||
235 | return 0; | ||
236 | } | ||
237 | #else | ||
236 | if (!file) | 238 | if (!file) |
237 | return -EBADF; | 239 | return -EBADF; |
240 | #endif | ||
238 | 241 | ||
239 | if (file->f_mapping->a_ops->get_xip_mem) { | 242 | if (file->f_mapping->a_ops->get_xip_mem) { |
240 | /* no bad return value, but ignore advice */ | 243 | /* no bad return value, but ignore advice */ |
@@ -278,14 +281,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, | |||
278 | if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) | 281 | if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) |
279 | return -EINVAL; | 282 | return -EINVAL; |
280 | 283 | ||
281 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) { | 284 | zap_page_range(vma, start, end - start, NULL); |
282 | struct zap_details details = { | ||
283 | .nonlinear_vma = vma, | ||
284 | .last_index = ULONG_MAX, | ||
285 | }; | ||
286 | zap_page_range(vma, start, end - start, &details); | ||
287 | } else | ||
288 | zap_page_range(vma, start, end - start, NULL); | ||
289 | return 0; | 285 | return 0; |
290 | } | 286 | } |
291 | 287 | ||
@@ -303,7 +299,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
303 | 299 | ||
304 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ | 300 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ |
305 | 301 | ||
306 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) | 302 | if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB)) |
307 | return -EINVAL; | 303 | return -EINVAL; |
308 | 304 | ||
309 | f = vma->vm_file; | 305 | f = vma->vm_file; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ef91e856c7e4..d18d3a6e7337 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -72,22 +72,13 @@ EXPORT_SYMBOL(memory_cgrp_subsys); | |||
72 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 72 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
73 | static struct mem_cgroup *root_mem_cgroup __read_mostly; | 73 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
74 | 74 | ||
75 | /* Whether the swap controller is active */ | ||
75 | #ifdef CONFIG_MEMCG_SWAP | 76 | #ifdef CONFIG_MEMCG_SWAP |
76 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | ||
77 | int do_swap_account __read_mostly; | 77 | int do_swap_account __read_mostly; |
78 | |||
79 | /* for remember boot option*/ | ||
80 | #ifdef CONFIG_MEMCG_SWAP_ENABLED | ||
81 | static int really_do_swap_account __initdata = 1; | ||
82 | #else | ||
83 | static int really_do_swap_account __initdata; | ||
84 | #endif | ||
85 | |||
86 | #else | 78 | #else |
87 | #define do_swap_account 0 | 79 | #define do_swap_account 0 |
88 | #endif | 80 | #endif |
89 | 81 | ||
90 | |||
91 | static const char * const mem_cgroup_stat_names[] = { | 82 | static const char * const mem_cgroup_stat_names[] = { |
92 | "cache", | 83 | "cache", |
93 | "rss", | 84 | "rss", |
@@ -97,14 +88,6 @@ static const char * const mem_cgroup_stat_names[] = { | |||
97 | "swap", | 88 | "swap", |
98 | }; | 89 | }; |
99 | 90 | ||
100 | enum mem_cgroup_events_index { | ||
101 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ | ||
102 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ | ||
103 | MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ | ||
104 | MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ | ||
105 | MEM_CGROUP_EVENTS_NSTATS, | ||
106 | }; | ||
107 | |||
108 | static const char * const mem_cgroup_events_names[] = { | 91 | static const char * const mem_cgroup_events_names[] = { |
109 | "pgpgin", | 92 | "pgpgin", |
110 | "pgpgout", | 93 | "pgpgout", |
@@ -138,7 +121,7 @@ enum mem_cgroup_events_target { | |||
138 | 121 | ||
139 | struct mem_cgroup_stat_cpu { | 122 | struct mem_cgroup_stat_cpu { |
140 | long count[MEM_CGROUP_STAT_NSTATS]; | 123 | long count[MEM_CGROUP_STAT_NSTATS]; |
141 | unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; | 124 | unsigned long events[MEMCG_NR_EVENTS]; |
142 | unsigned long nr_page_events; | 125 | unsigned long nr_page_events; |
143 | unsigned long targets[MEM_CGROUP_NTARGETS]; | 126 | unsigned long targets[MEM_CGROUP_NTARGETS]; |
144 | }; | 127 | }; |
@@ -284,6 +267,10 @@ struct mem_cgroup { | |||
284 | struct page_counter memsw; | 267 | struct page_counter memsw; |
285 | struct page_counter kmem; | 268 | struct page_counter kmem; |
286 | 269 | ||
270 | /* Normal memory consumption range */ | ||
271 | unsigned long low; | ||
272 | unsigned long high; | ||
273 | |||
287 | unsigned long soft_limit; | 274 | unsigned long soft_limit; |
288 | 275 | ||
289 | /* vmpressure notifications */ | 276 | /* vmpressure notifications */ |
@@ -325,9 +312,11 @@ struct mem_cgroup { | |||
325 | /* | 312 | /* |
326 | * set > 0 if pages under this cgroup are moving to other cgroup. | 313 | * set > 0 if pages under this cgroup are moving to other cgroup. |
327 | */ | 314 | */ |
328 | atomic_t moving_account; | 315 | atomic_t moving_account; |
329 | /* taken only while moving_account > 0 */ | 316 | /* taken only while moving_account > 0 */ |
330 | spinlock_t move_lock; | 317 | spinlock_t move_lock; |
318 | struct task_struct *move_lock_task; | ||
319 | unsigned long move_lock_flags; | ||
331 | /* | 320 | /* |
332 | * percpu counter. | 321 | * percpu counter. |
333 | */ | 322 | */ |
@@ -343,11 +332,10 @@ struct mem_cgroup { | |||
343 | struct cg_proto tcp_mem; | 332 | struct cg_proto tcp_mem; |
344 | #endif | 333 | #endif |
345 | #if defined(CONFIG_MEMCG_KMEM) | 334 | #if defined(CONFIG_MEMCG_KMEM) |
346 | /* analogous to slab_common's slab_caches list, but per-memcg; | 335 | /* Index in the kmem_cache->memcg_params.memcg_caches array */ |
347 | * protected by memcg_slab_mutex */ | ||
348 | struct list_head memcg_slab_caches; | ||
349 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ | ||
350 | int kmemcg_id; | 336 | int kmemcg_id; |
337 | bool kmem_acct_activated; | ||
338 | bool kmem_acct_active; | ||
351 | #endif | 339 | #endif |
352 | 340 | ||
353 | int last_scanned_node; | 341 | int last_scanned_node; |
@@ -366,29 +354,26 @@ struct mem_cgroup { | |||
366 | }; | 354 | }; |
367 | 355 | ||
368 | #ifdef CONFIG_MEMCG_KMEM | 356 | #ifdef CONFIG_MEMCG_KMEM |
369 | static bool memcg_kmem_is_active(struct mem_cgroup *memcg) | 357 | bool memcg_kmem_is_active(struct mem_cgroup *memcg) |
370 | { | 358 | { |
371 | return memcg->kmemcg_id >= 0; | 359 | return memcg->kmem_acct_active; |
372 | } | 360 | } |
373 | #endif | 361 | #endif |
374 | 362 | ||
375 | /* Stuffs for move charges at task migration. */ | 363 | /* Stuffs for move charges at task migration. */ |
376 | /* | 364 | /* |
377 | * Types of charges to be moved. "move_charge_at_immitgrate" and | 365 | * Types of charges to be moved. |
378 | * "immigrate_flags" are treated as a left-shifted bitmap of these types. | ||
379 | */ | 366 | */ |
380 | enum move_type { | 367 | #define MOVE_ANON 0x1U |
381 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | 368 | #define MOVE_FILE 0x2U |
382 | MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ | 369 | #define MOVE_MASK (MOVE_ANON | MOVE_FILE) |
383 | NR_MOVE_TYPE, | ||
384 | }; | ||
385 | 370 | ||
386 | /* "mc" and its members are protected by cgroup_mutex */ | 371 | /* "mc" and its members are protected by cgroup_mutex */ |
387 | static struct move_charge_struct { | 372 | static struct move_charge_struct { |
388 | spinlock_t lock; /* for from, to */ | 373 | spinlock_t lock; /* for from, to */ |
389 | struct mem_cgroup *from; | 374 | struct mem_cgroup *from; |
390 | struct mem_cgroup *to; | 375 | struct mem_cgroup *to; |
391 | unsigned long immigrate_flags; | 376 | unsigned long flags; |
392 | unsigned long precharge; | 377 | unsigned long precharge; |
393 | unsigned long moved_charge; | 378 | unsigned long moved_charge; |
394 | unsigned long moved_swap; | 379 | unsigned long moved_swap; |
@@ -399,16 +384,6 @@ static struct move_charge_struct { | |||
399 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | 384 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), |
400 | }; | 385 | }; |
401 | 386 | ||
402 | static bool move_anon(void) | ||
403 | { | ||
404 | return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); | ||
405 | } | ||
406 | |||
407 | static bool move_file(void) | ||
408 | { | ||
409 | return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); | ||
410 | } | ||
411 | |||
412 | /* | 387 | /* |
413 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | 388 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
414 | * limit reclaim to prevent infinite loops, if they ever occur. | 389 | * limit reclaim to prevent infinite loops, if they ever occur. |
@@ -544,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | |||
544 | } | 519 | } |
545 | EXPORT_SYMBOL(tcp_proto_cgroup); | 520 | EXPORT_SYMBOL(tcp_proto_cgroup); |
546 | 521 | ||
547 | static void disarm_sock_keys(struct mem_cgroup *memcg) | ||
548 | { | ||
549 | if (!memcg_proto_activated(&memcg->tcp_mem)) | ||
550 | return; | ||
551 | static_key_slow_dec(&memcg_socket_limit_enabled); | ||
552 | } | ||
553 | #else | ||
554 | static void disarm_sock_keys(struct mem_cgroup *memcg) | ||
555 | { | ||
556 | } | ||
557 | #endif | 522 | #endif |
558 | 523 | ||
559 | #ifdef CONFIG_MEMCG_KMEM | 524 | #ifdef CONFIG_MEMCG_KMEM |
560 | /* | 525 | /* |
561 | * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. | 526 | * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. |
562 | * The main reason for not using cgroup id for this: | 527 | * The main reason for not using cgroup id for this: |
563 | * this works better in sparse environments, where we have a lot of memcgs, | 528 | * this works better in sparse environments, where we have a lot of memcgs, |
564 | * but only a few kmem-limited. Or also, if we have, for instance, 200 | 529 | * but only a few kmem-limited. Or also, if we have, for instance, 200 |
565 | * memcgs, and none but the 200th is kmem-limited, we'd have to have a | 530 | * memcgs, and none but the 200th is kmem-limited, we'd have to have a |
566 | * 200 entry array for that. | 531 | * 200 entry array for that. |
567 | * | 532 | * |
568 | * The current size of the caches array is stored in | 533 | * The current size of the caches array is stored in memcg_nr_cache_ids. It |
569 | * memcg_limited_groups_array_size. It will double each time we have to | 534 | * will double each time we have to increase it. |
570 | * increase it. | ||
571 | */ | 535 | */ |
572 | static DEFINE_IDA(kmem_limited_groups); | 536 | static DEFINE_IDA(memcg_cache_ida); |
573 | int memcg_limited_groups_array_size; | 537 | int memcg_nr_cache_ids; |
538 | |||
539 | /* Protects memcg_nr_cache_ids */ | ||
540 | static DECLARE_RWSEM(memcg_cache_ids_sem); | ||
541 | |||
542 | void memcg_get_cache_ids(void) | ||
543 | { | ||
544 | down_read(&memcg_cache_ids_sem); | ||
545 | } | ||
546 | |||
547 | void memcg_put_cache_ids(void) | ||
548 | { | ||
549 | up_read(&memcg_cache_ids_sem); | ||
550 | } | ||
574 | 551 | ||
575 | /* | 552 | /* |
576 | * MIN_SIZE is different than 1, because we would like to avoid going through | 553 | * MIN_SIZE is different than 1, because we would like to avoid going through |
@@ -596,32 +573,8 @@ int memcg_limited_groups_array_size; | |||
596 | struct static_key memcg_kmem_enabled_key; | 573 | struct static_key memcg_kmem_enabled_key; |
597 | EXPORT_SYMBOL(memcg_kmem_enabled_key); | 574 | EXPORT_SYMBOL(memcg_kmem_enabled_key); |
598 | 575 | ||
599 | static void memcg_free_cache_id(int id); | ||
600 | |||
601 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | ||
602 | { | ||
603 | if (memcg_kmem_is_active(memcg)) { | ||
604 | static_key_slow_dec(&memcg_kmem_enabled_key); | ||
605 | memcg_free_cache_id(memcg->kmemcg_id); | ||
606 | } | ||
607 | /* | ||
608 | * This check can't live in kmem destruction function, | ||
609 | * since the charges will outlive the cgroup | ||
610 | */ | ||
611 | WARN_ON(page_counter_read(&memcg->kmem)); | ||
612 | } | ||
613 | #else | ||
614 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | ||
615 | { | ||
616 | } | ||
617 | #endif /* CONFIG_MEMCG_KMEM */ | 576 | #endif /* CONFIG_MEMCG_KMEM */ |
618 | 577 | ||
619 | static void disarm_static_keys(struct mem_cgroup *memcg) | ||
620 | { | ||
621 | disarm_sock_keys(memcg); | ||
622 | disarm_kmem_keys(memcg); | ||
623 | } | ||
624 | |||
625 | static struct mem_cgroup_per_zone * | 578 | static struct mem_cgroup_per_zone * |
626 | mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) | 579 | mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) |
627 | { | 580 | { |
@@ -1368,6 +1321,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | |||
1368 | return inactive * inactive_ratio < active; | 1321 | return inactive * inactive_ratio < active; |
1369 | } | 1322 | } |
1370 | 1323 | ||
1324 | bool mem_cgroup_lruvec_online(struct lruvec *lruvec) | ||
1325 | { | ||
1326 | struct mem_cgroup_per_zone *mz; | ||
1327 | struct mem_cgroup *memcg; | ||
1328 | |||
1329 | if (mem_cgroup_disabled()) | ||
1330 | return true; | ||
1331 | |||
1332 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); | ||
1333 | memcg = mz->memcg; | ||
1334 | |||
1335 | return !!(memcg->css.flags & CSS_ONLINE); | ||
1336 | } | ||
1337 | |||
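
The new mem_cgroup_lruvec_online() steps from the embedded lruvec back to its enclosing mem_cgroup_per_zone with container_of() and then checks CSS_ONLINE. A minimal userspace sketch of that pointer step, using invented stand-in types rather than the kernel structures:

    /* Userspace model of the container_of() step used above (types are made up). */
    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct lruvec_model { long nr_pages; };

    struct per_zone_model {
        long pad;                    /* unrelated fields before the member */
        struct lruvec_model lruvec;  /* embedded, like lruvec in mem_cgroup_per_zone */
        int online;                  /* stands in for the CSS_ONLINE flag */
    };

    int main(void)
    {
        struct per_zone_model mz = { .online = 1 };
        struct lruvec_model *lv = &mz.lruvec;

        /* Recover the enclosing structure from the embedded member. */
        struct per_zone_model *back = container_of(lv, struct per_zone_model, lruvec);
        printf("online=%d same_object=%d\n", back->online, back == &mz);
        return 0;
    }

The same offsetof() arithmetic is what lets the kernel keep the lruvec embedded without storing a back-pointer.
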
1371 | #define mem_cgroup_from_counter(counter, member) \ | 1338 | #define mem_cgroup_from_counter(counter, member) \ |
1372 | container_of(counter, struct mem_cgroup, member) | 1339 | container_of(counter, struct mem_cgroup, member) |
1373 | 1340 | ||
@@ -1477,9 +1444,9 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1477 | 1444 | ||
1478 | pr_info("Task in "); | 1445 | pr_info("Task in "); |
1479 | pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); | 1446 | pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); |
1480 | pr_info(" killed as a result of limit of "); | 1447 | pr_cont(" killed as a result of limit of "); |
1481 | pr_cont_cgroup_path(memcg->css.cgroup); | 1448 | pr_cont_cgroup_path(memcg->css.cgroup); |
1482 | pr_info("\n"); | 1449 | pr_cont("\n"); |
1483 | 1450 | ||
1484 | rcu_read_unlock(); | 1451 | rcu_read_unlock(); |
1485 | 1452 | ||
@@ -1560,7 +1527,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1560 | * quickly exit and free its memory. | 1527 | * quickly exit and free its memory. |
1561 | */ | 1528 | */ |
1562 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { | 1529 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { |
1563 | set_thread_flag(TIF_MEMDIE); | 1530 | mark_tsk_oom_victim(current); |
1564 | return; | 1531 | return; |
1565 | } | 1532 | } |
1566 | 1533 | ||
@@ -1934,7 +1901,7 @@ bool mem_cgroup_oom_synchronize(bool handle) | |||
1934 | if (!memcg) | 1901 | if (!memcg) |
1935 | return false; | 1902 | return false; |
1936 | 1903 | ||
1937 | if (!handle) | 1904 | if (!handle || oom_killer_disabled) |
1938 | goto cleanup; | 1905 | goto cleanup; |
1939 | 1906 | ||
1940 | owait.memcg = memcg; | 1907 | owait.memcg = memcg; |
@@ -1980,34 +1947,33 @@ cleanup: | |||
1980 | /** | 1947 | /** |
1981 | * mem_cgroup_begin_page_stat - begin a page state statistics transaction | 1948 | * mem_cgroup_begin_page_stat - begin a page state statistics transaction |
1982 | * @page: page that is going to change accounted state | 1949 | * @page: page that is going to change accounted state |
1983 | * @locked: &memcg->move_lock slowpath was taken | ||
1984 | * @flags: IRQ-state flags for &memcg->move_lock | ||
1985 | * | 1950 | * |
1986 | * This function must mark the beginning of an accounted page state | 1951 | * This function must mark the beginning of an accounted page state |
1987 | * change to prevent double accounting when the page is concurrently | 1952 | * change to prevent double accounting when the page is concurrently |
1988 | * being moved to another memcg: | 1953 | * being moved to another memcg: |
1989 | * | 1954 | * |
1990 | * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); | 1955 | * memcg = mem_cgroup_begin_page_stat(page); |
1991 | * if (TestClearPageState(page)) | 1956 | * if (TestClearPageState(page)) |
1992 | * mem_cgroup_update_page_stat(memcg, state, -1); | 1957 | * mem_cgroup_update_page_stat(memcg, state, -1); |
1993 | * mem_cgroup_end_page_stat(memcg, locked, flags); | 1958 | * mem_cgroup_end_page_stat(memcg); |
1994 | * | ||
1995 | * The RCU lock is held throughout the transaction. The fast path can | ||
1996 | * get away without acquiring the memcg->move_lock (@locked is false) | ||
1997 | * because page moving starts with an RCU grace period. | ||
1998 | * | ||
1999 | * The RCU lock also protects the memcg from being freed when the page | ||
2000 | * state that is going to change is the only thing preventing the page | ||
2001 | * from being uncharged. E.g. end-writeback clearing PageWriteback(), | ||
2002 | * which allows migration to go ahead and uncharge the page before the | ||
2003 | * account transaction might be complete. | ||
2004 | */ | 1959 | */ |
2005 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, | 1960 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) |
2006 | bool *locked, | ||
2007 | unsigned long *flags) | ||
2008 | { | 1961 | { |
2009 | struct mem_cgroup *memcg; | 1962 | struct mem_cgroup *memcg; |
1963 | unsigned long flags; | ||
2010 | 1964 | ||
1965 | /* | ||
1966 | * The RCU lock is held throughout the transaction. The fast | ||
1967 | * path can get away without acquiring the memcg->move_lock | ||
1968 | * because page moving starts with an RCU grace period. | ||
1969 | * | ||
1970 | * The RCU lock also protects the memcg from being freed when | ||
1971 | * the page state that is going to change is the only thing | ||
1972 | * preventing the page from being uncharged. | ||
1973 | * E.g. end-writeback clearing PageWriteback(), which allows | ||
1974 | * migration to go ahead and uncharge the page before the | ||
1975 | * account transaction might be complete. | ||
1976 | */ | ||
2011 | rcu_read_lock(); | 1977 | rcu_read_lock(); |
2012 | 1978 | ||
2013 | if (mem_cgroup_disabled()) | 1979 | if (mem_cgroup_disabled()) |
@@ -2017,16 +1983,22 @@ again: | |||
2017 | if (unlikely(!memcg)) | 1983 | if (unlikely(!memcg)) |
2018 | return NULL; | 1984 | return NULL; |
2019 | 1985 | ||
2020 | *locked = false; | ||
2021 | if (atomic_read(&memcg->moving_account) <= 0) | 1986 | if (atomic_read(&memcg->moving_account) <= 0) |
2022 | return memcg; | 1987 | return memcg; |
2023 | 1988 | ||
2024 | spin_lock_irqsave(&memcg->move_lock, *flags); | 1989 | spin_lock_irqsave(&memcg->move_lock, flags); |
2025 | if (memcg != page->mem_cgroup) { | 1990 | if (memcg != page->mem_cgroup) { |
2026 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | 1991 | spin_unlock_irqrestore(&memcg->move_lock, flags); |
2027 | goto again; | 1992 | goto again; |
2028 | } | 1993 | } |
2029 | *locked = true; | 1994 | |
1995 | /* | ||
1996 | * When charge migration first begins, we can have locked and | ||
1997 | * unlocked page stat updates happening concurrently. Track | ||
1998 | * the task who has the lock for mem_cgroup_end_page_stat(). | ||
1999 | */ | ||
2000 | memcg->move_lock_task = current; | ||
2001 | memcg->move_lock_flags = flags; | ||
2030 | 2002 | ||
2031 | return memcg; | 2003 | return memcg; |
2032 | } | 2004 | } |
@@ -2034,14 +2006,17 @@ again: | |||
2034 | /** | 2006 | /** |
2035 | * mem_cgroup_end_page_stat - finish a page state statistics transaction | 2007 | * mem_cgroup_end_page_stat - finish a page state statistics transaction |
2036 | * @memcg: the memcg that was accounted against | 2008 | * @memcg: the memcg that was accounted against |
2037 | * @locked: value received from mem_cgroup_begin_page_stat() | ||
2038 | * @flags: value received from mem_cgroup_begin_page_stat() | ||
2039 | */ | 2009 | */ |
2040 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, | 2010 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) |
2041 | unsigned long *flags) | ||
2042 | { | 2011 | { |
2043 | if (memcg && *locked) | 2012 | if (memcg && memcg->move_lock_task == current) { |
2044 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | 2013 | unsigned long flags = memcg->move_lock_flags; |
2014 | |||
2015 | memcg->move_lock_task = NULL; | ||
2016 | memcg->move_lock_flags = 0; | ||
2017 | |||
2018 | spin_unlock_irqrestore(&memcg->move_lock, flags); | ||
2019 | } | ||
2045 | 2020 | ||
2046 | rcu_read_unlock(); | 2021 | rcu_read_unlock(); |
2047 | } | 2022 | } |
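
mem_cgroup_begin_page_stat() now records the slow-path lock holder and its saved IRQ flags in the memcg itself, so mem_cgroup_end_page_stat() needs only the memcg pointer to decide whether to unlock. A rough userspace model of that owner-tracked begin/end pairing, with a pthread mutex standing in for move_lock and no IRQ handling, could be:

    /* Sketch of owner-tracked begin/end; this is a model, not the kernel API. */
    #define _GNU_SOURCE
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    struct group_model {
        pthread_mutex_t lock;       /* stands in for memcg->move_lock      */
        atomic_long     owner;      /* tid of the slow-path holder, 0 if none */
        bool            contended;  /* stands in for moving_account > 0    */
    };

    static long self_tid(void) { return (long)syscall(SYS_gettid); }

    static void begin_page_stat(struct group_model *g)
    {
        if (!g->contended)
            return;                              /* fast path, no lock taken */
        pthread_mutex_lock(&g->lock);
        atomic_store(&g->owner, self_tid());     /* remember who holds it */
    }

    static void end_page_stat(struct group_model *g)
    {
        /* Only the slow-path holder can ever match its own id here. */
        if (atomic_load(&g->owner) == self_tid()) {
            atomic_store(&g->owner, 0);
            pthread_mutex_unlock(&g->lock);
        }
    }

    static struct group_model g = {
        .lock = PTHREAD_MUTEX_INITIALIZER, .owner = 0, .contended = true,
    };

    int main(void)
    {
        begin_page_stat(&g);   /* takes the lock and records this thread */
        end_page_stat(&g);     /* matches, so it unlocks                  */
        return 0;
    }

The property the patch relies on is the same: a task that skipped the lock can never find its own identity in the owner field, so the unlock side needs no extra state from the caller.
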
@@ -2134,17 +2109,6 @@ static void drain_local_stock(struct work_struct *dummy) | |||
2134 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); | 2109 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); |
2135 | } | 2110 | } |
2136 | 2111 | ||
2137 | static void __init memcg_stock_init(void) | ||
2138 | { | ||
2139 | int cpu; | ||
2140 | |||
2141 | for_each_possible_cpu(cpu) { | ||
2142 | struct memcg_stock_pcp *stock = | ||
2143 | &per_cpu(memcg_stock, cpu); | ||
2144 | INIT_WORK(&stock->work, drain_local_stock); | ||
2145 | } | ||
2146 | } | ||
2147 | |||
2148 | /* | 2112 | /* |
2149 | * Cache charges(val) to local per_cpu area. | 2113 | * Cache charges(val) to local per_cpu area. |
2150 | * This will be consumed by consume_stock() function, later. | 2114 | * This will be consumed by consume_stock() function, later. |
@@ -2294,6 +2258,8 @@ retry: | |||
2294 | if (!(gfp_mask & __GFP_WAIT)) | 2258 | if (!(gfp_mask & __GFP_WAIT)) |
2295 | goto nomem; | 2259 | goto nomem; |
2296 | 2260 | ||
2261 | mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); | ||
2262 | |||
2297 | nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, | 2263 | nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, |
2298 | gfp_mask, may_swap); | 2264 | gfp_mask, may_swap); |
2299 | 2265 | ||
@@ -2335,6 +2301,8 @@ retry: | |||
2335 | if (fatal_signal_pending(current)) | 2301 | if (fatal_signal_pending(current)) |
2336 | goto bypass; | 2302 | goto bypass; |
2337 | 2303 | ||
2304 | mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); | ||
2305 | |||
2338 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); | 2306 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); |
2339 | nomem: | 2307 | nomem: |
2340 | if (!(gfp_mask & __GFP_NOFAIL)) | 2308 | if (!(gfp_mask & __GFP_NOFAIL)) |
@@ -2346,6 +2314,16 @@ done_restock: | |||
2346 | css_get_many(&memcg->css, batch); | 2314 | css_get_many(&memcg->css, batch); |
2347 | if (batch > nr_pages) | 2315 | if (batch > nr_pages) |
2348 | refill_stock(memcg, batch - nr_pages); | 2316 | refill_stock(memcg, batch - nr_pages); |
2317 | /* | ||
2318 | * If the hierarchy is above the normal consumption range, | ||
2319 | * make the charging task trim their excess contribution. | ||
2320 | */ | ||
2321 | do { | ||
2322 | if (page_counter_read(&memcg->memory) <= memcg->high) | ||
2323 | continue; | ||
2324 | mem_cgroup_events(memcg, MEMCG_HIGH, 1); | ||
2325 | try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); | ||
2326 | } while ((memcg = parent_mem_cgroup(memcg))); | ||
2349 | done: | 2327 | done: |
2350 | return ret; | 2328 | return ret; |
2351 | } | 2329 | } |
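
The new done_restock tail walks every ancestor and, wherever usage has crossed the high boundary, counts a MEMCG_HIGH event and makes the charging task reclaim its own excess. A toy model of that walk-up-and-trim loop, with invented field names and a print in place of real reclaim:

    /* Toy model of the over-high trimming loop; names are illustrative only. */
    #include <stdio.h>

    struct group_model {
        const char *name;
        long usage;                 /* page_counter_read(&memcg->memory) */
        long high;                  /* memcg->high                       */
        struct group_model *parent; /* parent_mem_cgroup()               */
    };

    static void reclaim(struct group_model *g, long nr)
    {
        g->usage -= nr;             /* stand-in for try_to_free_mem_cgroup_pages() */
        printf("reclaimed %ld from %s\n", nr, g->name);
    }

    static void trim_over_high(struct group_model *g, long nr_pages)
    {
        do {
            if (g->usage <= g->high)
                continue;           /* this level is fine, check the parent */
            reclaim(g, nr_pages);   /* the charger pays for its own excess */
        } while ((g = g->parent));
    }

    int main(void)
    {
        struct group_model root  = { "root",  900, 1000, NULL  };
        struct group_model child = { "child", 300,  200, &root };
        trim_over_high(&child, 64);
        return 0;
    }

As in the patch, continue in the do/while still advances to the parent, so every level of the hierarchy is checked exactly once.
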
@@ -2476,27 +2454,8 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg, | |||
2476 | } | 2454 | } |
2477 | 2455 | ||
2478 | #ifdef CONFIG_MEMCG_KMEM | 2456 | #ifdef CONFIG_MEMCG_KMEM |
2479 | /* | 2457 | int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, |
2480 | * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or | 2458 | unsigned long nr_pages) |
2481 | * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. | ||
2482 | */ | ||
2483 | static DEFINE_MUTEX(memcg_slab_mutex); | ||
2484 | |||
2485 | /* | ||
2486 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer | ||
2487 | * in the memcg_cache_params struct. | ||
2488 | */ | ||
2489 | static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) | ||
2490 | { | ||
2491 | struct kmem_cache *cachep; | ||
2492 | |||
2493 | VM_BUG_ON(p->is_root_cache); | ||
2494 | cachep = p->root_cache; | ||
2495 | return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); | ||
2496 | } | ||
2497 | |||
2498 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, | ||
2499 | unsigned long nr_pages) | ||
2500 | { | 2459 | { |
2501 | struct page_counter *counter; | 2460 | struct page_counter *counter; |
2502 | int ret = 0; | 2461 | int ret = 0; |
@@ -2533,8 +2492,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, | |||
2533 | return ret; | 2492 | return ret; |
2534 | } | 2493 | } |
2535 | 2494 | ||
2536 | static void memcg_uncharge_kmem(struct mem_cgroup *memcg, | 2495 | void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages) |
2537 | unsigned long nr_pages) | ||
2538 | { | 2496 | { |
2539 | page_counter_uncharge(&memcg->memory, nr_pages); | 2497 | page_counter_uncharge(&memcg->memory, nr_pages); |
2540 | if (do_swap_account) | 2498 | if (do_swap_account) |
@@ -2560,18 +2518,19 @@ static int memcg_alloc_cache_id(void) | |||
2560 | int id, size; | 2518 | int id, size; |
2561 | int err; | 2519 | int err; |
2562 | 2520 | ||
2563 | id = ida_simple_get(&kmem_limited_groups, | 2521 | id = ida_simple_get(&memcg_cache_ida, |
2564 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); | 2522 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); |
2565 | if (id < 0) | 2523 | if (id < 0) |
2566 | return id; | 2524 | return id; |
2567 | 2525 | ||
2568 | if (id < memcg_limited_groups_array_size) | 2526 | if (id < memcg_nr_cache_ids) |
2569 | return id; | 2527 | return id; |
2570 | 2528 | ||
2571 | /* | 2529 | /* |
2572 | * There's no space for the new id in memcg_caches arrays, | 2530 | * There's no space for the new id in memcg_caches arrays, |
2573 | * so we have to grow them. | 2531 | * so we have to grow them. |
2574 | */ | 2532 | */ |
2533 | down_write(&memcg_cache_ids_sem); | ||
2575 | 2534 | ||
2576 | size = 2 * (id + 1); | 2535 | size = 2 * (id + 1); |
2577 | if (size < MEMCG_CACHES_MIN_SIZE) | 2536 | if (size < MEMCG_CACHES_MIN_SIZE) |
@@ -2579,12 +2538,16 @@ static int memcg_alloc_cache_id(void) | |||
2579 | else if (size > MEMCG_CACHES_MAX_SIZE) | 2538 | else if (size > MEMCG_CACHES_MAX_SIZE) |
2580 | size = MEMCG_CACHES_MAX_SIZE; | 2539 | size = MEMCG_CACHES_MAX_SIZE; |
2581 | 2540 | ||
2582 | mutex_lock(&memcg_slab_mutex); | ||
2583 | err = memcg_update_all_caches(size); | 2541 | err = memcg_update_all_caches(size); |
2584 | mutex_unlock(&memcg_slab_mutex); | 2542 | if (!err) |
2543 | err = memcg_update_all_list_lrus(size); | ||
2544 | if (!err) | ||
2545 | memcg_nr_cache_ids = size; | ||
2546 | |||
2547 | up_write(&memcg_cache_ids_sem); | ||
2585 | 2548 | ||
2586 | if (err) { | 2549 | if (err) { |
2587 | ida_simple_remove(&kmem_limited_groups, id); | 2550 | ida_simple_remove(&memcg_cache_ida, id); |
2588 | return err; | 2551 | return err; |
2589 | } | 2552 | } |
2590 | return id; | 2553 | return id; |
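
memcg_alloc_cache_id() now doubles the arrays and publishes the larger memcg_nr_cache_ids under memcg_cache_ids_sem held for write, while memcg_get_cache_ids()/memcg_put_cache_ids() give readers a stable size. A userspace pthread rwlock sketch of that grow-under-the-write-lock pattern (one flat array here, not the real per-cache update):

    /* Sketch: readers see a stable size, growth happens under the write lock. */
    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    static pthread_rwlock_t ids_sem = PTHREAD_RWLOCK_INITIALIZER;
    static int nr_ids;                 /* plays memcg_nr_cache_ids           */
    static void **caches;              /* the per-id array being grown        */

    static void get_ids(void) { pthread_rwlock_rdlock(&ids_sem); }
    static void put_ids(void) { pthread_rwlock_unlock(&ids_sem); }

    static int grow_ids(int id)
    {
        int size = 2 * (id + 1);       /* same doubling rule as the patch */
        void **bigger;

        pthread_rwlock_wrlock(&ids_sem);
        bigger = calloc(size, sizeof(*bigger));
        if (!bigger) {
            pthread_rwlock_unlock(&ids_sem);
            return -1;
        }
        if (caches)
            memcpy(bigger, caches, nr_ids * sizeof(*bigger));
        free(caches);
        caches = bigger;
        nr_ids = size;                 /* published only after the array grew */
        pthread_rwlock_unlock(&ids_sem);
        return 0;
    }

    int main(void) { get_ids(); put_ids(); return grow_ids(3); }
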
@@ -2592,136 +2555,23 @@ static int memcg_alloc_cache_id(void) | |||
2592 | 2555 | ||
2593 | static void memcg_free_cache_id(int id) | 2556 | static void memcg_free_cache_id(int id) |
2594 | { | 2557 | { |
2595 | ida_simple_remove(&kmem_limited_groups, id); | 2558 | ida_simple_remove(&memcg_cache_ida, id); |
2596 | } | ||
2597 | |||
2598 | /* | ||
2599 | * We should update the current array size iff all caches updates succeed. This | ||
2600 | * can only be done from the slab side. The slab mutex needs to be held when | ||
2601 | * calling this. | ||
2602 | */ | ||
2603 | void memcg_update_array_size(int num) | ||
2604 | { | ||
2605 | memcg_limited_groups_array_size = num; | ||
2606 | } | 2559 | } |
2607 | 2560 | ||
2608 | static void memcg_register_cache(struct mem_cgroup *memcg, | 2561 | struct memcg_kmem_cache_create_work { |
2609 | struct kmem_cache *root_cache) | ||
2610 | { | ||
2611 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by | ||
2612 | memcg_slab_mutex */ | ||
2613 | struct kmem_cache *cachep; | ||
2614 | int id; | ||
2615 | |||
2616 | lockdep_assert_held(&memcg_slab_mutex); | ||
2617 | |||
2618 | id = memcg_cache_id(memcg); | ||
2619 | |||
2620 | /* | ||
2621 | * Since per-memcg caches are created asynchronously on first | ||
2622 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
2623 | * create the same cache, but only one of them may succeed. | ||
2624 | */ | ||
2625 | if (cache_from_memcg_idx(root_cache, id)) | ||
2626 | return; | ||
2627 | |||
2628 | cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); | ||
2629 | cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); | ||
2630 | /* | ||
2631 | * If we could not create a memcg cache, do not complain, because | ||
2632 | * that's not critical at all as we can always proceed with the root | ||
2633 | * cache. | ||
2634 | */ | ||
2635 | if (!cachep) | ||
2636 | return; | ||
2637 | |||
2638 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); | ||
2639 | |||
2640 | /* | ||
2641 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | ||
2642 | * barrier here to ensure nobody will see the kmem_cache partially | ||
2643 | * initialized. | ||
2644 | */ | ||
2645 | smp_wmb(); | ||
2646 | |||
2647 | BUG_ON(root_cache->memcg_params->memcg_caches[id]); | ||
2648 | root_cache->memcg_params->memcg_caches[id] = cachep; | ||
2649 | } | ||
2650 | |||
2651 | static void memcg_unregister_cache(struct kmem_cache *cachep) | ||
2652 | { | ||
2653 | struct kmem_cache *root_cache; | ||
2654 | struct mem_cgroup *memcg; | ||
2655 | int id; | ||
2656 | |||
2657 | lockdep_assert_held(&memcg_slab_mutex); | ||
2658 | |||
2659 | BUG_ON(is_root_cache(cachep)); | ||
2660 | |||
2661 | root_cache = cachep->memcg_params->root_cache; | ||
2662 | memcg = cachep->memcg_params->memcg; | ||
2663 | id = memcg_cache_id(memcg); | ||
2664 | |||
2665 | BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); | ||
2666 | root_cache->memcg_params->memcg_caches[id] = NULL; | ||
2667 | |||
2668 | list_del(&cachep->memcg_params->list); | ||
2669 | |||
2670 | kmem_cache_destroy(cachep); | ||
2671 | } | ||
2672 | |||
2673 | int __memcg_cleanup_cache_params(struct kmem_cache *s) | ||
2674 | { | ||
2675 | struct kmem_cache *c; | ||
2676 | int i, failed = 0; | ||
2677 | |||
2678 | mutex_lock(&memcg_slab_mutex); | ||
2679 | for_each_memcg_cache_index(i) { | ||
2680 | c = cache_from_memcg_idx(s, i); | ||
2681 | if (!c) | ||
2682 | continue; | ||
2683 | |||
2684 | memcg_unregister_cache(c); | ||
2685 | |||
2686 | if (cache_from_memcg_idx(s, i)) | ||
2687 | failed++; | ||
2688 | } | ||
2689 | mutex_unlock(&memcg_slab_mutex); | ||
2690 | return failed; | ||
2691 | } | ||
2692 | |||
2693 | static void memcg_unregister_all_caches(struct mem_cgroup *memcg) | ||
2694 | { | ||
2695 | struct kmem_cache *cachep; | ||
2696 | struct memcg_cache_params *params, *tmp; | ||
2697 | |||
2698 | if (!memcg_kmem_is_active(memcg)) | ||
2699 | return; | ||
2700 | |||
2701 | mutex_lock(&memcg_slab_mutex); | ||
2702 | list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { | ||
2703 | cachep = memcg_params_to_cache(params); | ||
2704 | memcg_unregister_cache(cachep); | ||
2705 | } | ||
2706 | mutex_unlock(&memcg_slab_mutex); | ||
2707 | } | ||
2708 | |||
2709 | struct memcg_register_cache_work { | ||
2710 | struct mem_cgroup *memcg; | 2562 | struct mem_cgroup *memcg; |
2711 | struct kmem_cache *cachep; | 2563 | struct kmem_cache *cachep; |
2712 | struct work_struct work; | 2564 | struct work_struct work; |
2713 | }; | 2565 | }; |
2714 | 2566 | ||
2715 | static void memcg_register_cache_func(struct work_struct *w) | 2567 | static void memcg_kmem_cache_create_func(struct work_struct *w) |
2716 | { | 2568 | { |
2717 | struct memcg_register_cache_work *cw = | 2569 | struct memcg_kmem_cache_create_work *cw = |
2718 | container_of(w, struct memcg_register_cache_work, work); | 2570 | container_of(w, struct memcg_kmem_cache_create_work, work); |
2719 | struct mem_cgroup *memcg = cw->memcg; | 2571 | struct mem_cgroup *memcg = cw->memcg; |
2720 | struct kmem_cache *cachep = cw->cachep; | 2572 | struct kmem_cache *cachep = cw->cachep; |
2721 | 2573 | ||
2722 | mutex_lock(&memcg_slab_mutex); | 2574 | memcg_create_kmem_cache(memcg, cachep); |
2723 | memcg_register_cache(memcg, cachep); | ||
2724 | mutex_unlock(&memcg_slab_mutex); | ||
2725 | 2575 | ||
2726 | css_put(&memcg->css); | 2576 | css_put(&memcg->css); |
2727 | kfree(cw); | 2577 | kfree(cw); |
@@ -2730,10 +2580,10 @@ static void memcg_register_cache_func(struct work_struct *w) | |||
2730 | /* | 2580 | /* |
2731 | * Enqueue the creation of a per-memcg kmem_cache. | 2581 | * Enqueue the creation of a per-memcg kmem_cache. |
2732 | */ | 2582 | */ |
2733 | static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, | 2583 | static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, |
2734 | struct kmem_cache *cachep) | 2584 | struct kmem_cache *cachep) |
2735 | { | 2585 | { |
2736 | struct memcg_register_cache_work *cw; | 2586 | struct memcg_kmem_cache_create_work *cw; |
2737 | 2587 | ||
2738 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); | 2588 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); |
2739 | if (!cw) | 2589 | if (!cw) |
@@ -2743,18 +2593,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, | |||
2743 | 2593 | ||
2744 | cw->memcg = memcg; | 2594 | cw->memcg = memcg; |
2745 | cw->cachep = cachep; | 2595 | cw->cachep = cachep; |
2596 | INIT_WORK(&cw->work, memcg_kmem_cache_create_func); | ||
2746 | 2597 | ||
2747 | INIT_WORK(&cw->work, memcg_register_cache_func); | ||
2748 | schedule_work(&cw->work); | 2598 | schedule_work(&cw->work); |
2749 | } | 2599 | } |
2750 | 2600 | ||
2751 | static void memcg_schedule_register_cache(struct mem_cgroup *memcg, | 2601 | static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, |
2752 | struct kmem_cache *cachep) | 2602 | struct kmem_cache *cachep) |
2753 | { | 2603 | { |
2754 | /* | 2604 | /* |
2755 | * We need to stop accounting when we kmalloc, because if the | 2605 | * We need to stop accounting when we kmalloc, because if the |
2756 | * corresponding kmalloc cache is not yet created, the first allocation | 2606 | * corresponding kmalloc cache is not yet created, the first allocation |
2757 | * in __memcg_schedule_register_cache will recurse. | 2607 | * in __memcg_schedule_kmem_cache_create will recurse. |
2758 | * | 2608 | * |
2759 | * However, it is better to enclose the whole function. Depending on | 2609 | * However, it is better to enclose the whole function. Depending on |
2760 | * the debugging options enabled, INIT_WORK(), for instance, can | 2610 | * the debugging options enabled, INIT_WORK(), for instance, can |
@@ -2763,24 +2613,10 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg, | |||
2763 | * the safest choice is to do it like this, wrapping the whole function. | 2613 | * the safest choice is to do it like this, wrapping the whole function. |
2764 | */ | 2614 | */ |
2765 | current->memcg_kmem_skip_account = 1; | 2615 | current->memcg_kmem_skip_account = 1; |
2766 | __memcg_schedule_register_cache(memcg, cachep); | 2616 | __memcg_schedule_kmem_cache_create(memcg, cachep); |
2767 | current->memcg_kmem_skip_account = 0; | 2617 | current->memcg_kmem_skip_account = 0; |
2768 | } | 2618 | } |
2769 | 2619 | ||
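
The comment above is the whole point of the wrapper: any allocation made while queuing the work item could itself need the still-missing per-memcg cache, so accounting is switched off for the current task around the scheduling step. A userspace model of that per-task skip flag, using a thread-local variable and malloc() in place of kmalloc():

    /* Userspace model of the per-task "skip accounting" guard described above. */
    #include <stdio.h>
    #include <stdlib.h>

    static _Thread_local int skip_account;   /* current->memcg_kmem_skip_account */

    static void *accounted_alloc(size_t sz)
    {
        if (!skip_account)
            printf("charging %zu bytes\n", sz);   /* would recurse into cache setup */
        return malloc(sz);
    }

    static void schedule_cache_create(void)
    {
        void *work = accounted_alloc(64);    /* like kmalloc(sizeof(*cw), GFP_NOWAIT) */
        /* ... fill in the work item and hand it to a worker ... */
        free(work);
    }

    static void safe_schedule_cache_create(void)
    {
        skip_account = 1;                    /* nothing allocated here gets charged */
        schedule_cache_create();
        skip_account = 0;
    }

    int main(void) { safe_schedule_cache_create(); return 0; }
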
2770 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) | ||
2771 | { | ||
2772 | unsigned int nr_pages = 1 << order; | ||
2773 | |||
2774 | return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); | ||
2775 | } | ||
2776 | |||
2777 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | ||
2778 | { | ||
2779 | unsigned int nr_pages = 1 << order; | ||
2780 | |||
2781 | memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); | ||
2782 | } | ||
2783 | |||
2784 | /* | 2620 | /* |
2785 | * Return the kmem_cache we're supposed to use for a slab allocation. | 2621 | * Return the kmem_cache we're supposed to use for a slab allocation. |
2786 | * We try to use the current memcg's version of the cache. | 2622 | * We try to use the current memcg's version of the cache. |
@@ -2798,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) | |||
2798 | { | 2634 | { |
2799 | struct mem_cgroup *memcg; | 2635 | struct mem_cgroup *memcg; |
2800 | struct kmem_cache *memcg_cachep; | 2636 | struct kmem_cache *memcg_cachep; |
2637 | int kmemcg_id; | ||
2801 | 2638 | ||
2802 | VM_BUG_ON(!cachep->memcg_params); | 2639 | VM_BUG_ON(!is_root_cache(cachep)); |
2803 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); | ||
2804 | 2640 | ||
2805 | if (current->memcg_kmem_skip_account) | 2641 | if (current->memcg_kmem_skip_account) |
2806 | return cachep; | 2642 | return cachep; |
2807 | 2643 | ||
2808 | memcg = get_mem_cgroup_from_mm(current->mm); | 2644 | memcg = get_mem_cgroup_from_mm(current->mm); |
2809 | if (!memcg_kmem_is_active(memcg)) | 2645 | kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); |
2646 | if (kmemcg_id < 0) | ||
2810 | goto out; | 2647 | goto out; |
2811 | 2648 | ||
2812 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); | 2649 | memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); |
2813 | if (likely(memcg_cachep)) | 2650 | if (likely(memcg_cachep)) |
2814 | return memcg_cachep; | 2651 | return memcg_cachep; |
2815 | 2652 | ||
@@ -2825,7 +2662,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) | |||
2825 | * could happen with the slab_mutex held. So it's better to | 2662 | * could happen with the slab_mutex held. So it's better to |
2826 | * defer everything. | 2663 | * defer everything. |
2827 | */ | 2664 | */ |
2828 | memcg_schedule_register_cache(memcg, cachep); | 2665 | memcg_schedule_kmem_cache_create(memcg, cachep); |
2829 | out: | 2666 | out: |
2830 | css_put(&memcg->css); | 2667 | css_put(&memcg->css); |
2831 | return cachep; | 2668 | return cachep; |
@@ -2834,7 +2671,7 @@ out: | |||
2834 | void __memcg_kmem_put_cache(struct kmem_cache *cachep) | 2671 | void __memcg_kmem_put_cache(struct kmem_cache *cachep) |
2835 | { | 2672 | { |
2836 | if (!is_root_cache(cachep)) | 2673 | if (!is_root_cache(cachep)) |
2837 | css_put(&cachep->memcg_params->memcg->css); | 2674 | css_put(&cachep->memcg_params.memcg->css); |
2838 | } | 2675 | } |
2839 | 2676 | ||
2840 | /* | 2677 | /* |
@@ -2899,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) | |||
2899 | memcg_uncharge_kmem(memcg, 1 << order); | 2736 | memcg_uncharge_kmem(memcg, 1 << order); |
2900 | page->mem_cgroup = NULL; | 2737 | page->mem_cgroup = NULL; |
2901 | } | 2738 | } |
2739 | |||
2740 | struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) | ||
2741 | { | ||
2742 | struct mem_cgroup *memcg = NULL; | ||
2743 | struct kmem_cache *cachep; | ||
2744 | struct page *page; | ||
2745 | |||
2746 | page = virt_to_head_page(ptr); | ||
2747 | if (PageSlab(page)) { | ||
2748 | cachep = page->slab_cache; | ||
2749 | if (!is_root_cache(cachep)) | ||
2750 | memcg = cachep->memcg_params.memcg; | ||
2751 | } else | ||
2752 | /* page allocated by alloc_kmem_pages */ | ||
2753 | memcg = page->mem_cgroup; | ||
2754 | |||
2755 | return memcg; | ||
2756 | } | ||
2902 | #endif /* CONFIG_MEMCG_KMEM */ | 2757 | #endif /* CONFIG_MEMCG_KMEM */ |
2903 | 2758 | ||
2904 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2759 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
@@ -3043,18 +2898,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3043 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | 2898 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { |
3044 | mem_cgroup_swap_statistics(from, false); | 2899 | mem_cgroup_swap_statistics(from, false); |
3045 | mem_cgroup_swap_statistics(to, true); | 2900 | mem_cgroup_swap_statistics(to, true); |
3046 | /* | ||
3047 | * This function is only called from task migration context now. | ||
3048 | * It postpones page_counter and refcount handling till the end | ||
3049 | * of task migration(mem_cgroup_clear_mc()) for performance | ||
3050 | * improvement. But we cannot postpone css_get(to) because if | ||
3051 | * the process that has been moved to @to does swap-in, the | ||
3052 | * refcount of @to might be decreased to 0. | ||
3053 | * | ||
3054 | * We are in attach() phase, so the cgroup is guaranteed to be | ||
3055 | * alive, so we can just call css_get(). | ||
3056 | */ | ||
3057 | css_get(&to->css); | ||
3058 | return 0; | 2901 | return 0; |
3059 | } | 2902 | } |
3060 | return -EINVAL; | 2903 | return -EINVAL; |
@@ -3445,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, | |||
3445 | int err = 0; | 3288 | int err = 0; |
3446 | int memcg_id; | 3289 | int memcg_id; |
3447 | 3290 | ||
3448 | if (memcg_kmem_is_active(memcg)) | 3291 | BUG_ON(memcg->kmemcg_id >= 0); |
3449 | return 0; | 3292 | BUG_ON(memcg->kmem_acct_activated); |
3293 | BUG_ON(memcg->kmem_acct_active); | ||
3450 | 3294 | ||
3451 | /* | 3295 | /* |
3452 | * For simplicity, we won't allow this to be disabled. It also can't | 3296 | * For simplicity, we won't allow this to be disabled. It also can't |
@@ -3489,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg, | |||
3489 | * patched. | 3333 | * patched. |
3490 | */ | 3334 | */ |
3491 | memcg->kmemcg_id = memcg_id; | 3335 | memcg->kmemcg_id = memcg_id; |
3336 | memcg->kmem_acct_activated = true; | ||
3337 | memcg->kmem_acct_active = true; | ||
3492 | out: | 3338 | out: |
3493 | return err; | 3339 | return err; |
3494 | } | 3340 | } |
@@ -3545,7 +3391,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, | |||
3545 | int ret; | 3391 | int ret; |
3546 | 3392 | ||
3547 | buf = strstrip(buf); | 3393 | buf = strstrip(buf); |
3548 | ret = page_counter_memparse(buf, &nr_pages); | 3394 | ret = page_counter_memparse(buf, "-1", &nr_pages); |
3549 | if (ret) | 3395 | if (ret) |
3550 | return ret; | 3396 | return ret; |
3551 | 3397 | ||
@@ -3621,7 +3467,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, | |||
3621 | { | 3467 | { |
3622 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 3468 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
3623 | 3469 | ||
3624 | if (val >= (1 << NR_MOVE_TYPE)) | 3470 | if (val & ~MOVE_MASK) |
3625 | return -EINVAL; | 3471 | return -EINVAL; |
3626 | 3472 | ||
3627 | /* | 3473 | /* |
@@ -3699,6 +3545,10 @@ static int memcg_stat_show(struct seq_file *m, void *v) | |||
3699 | struct mem_cgroup *mi; | 3545 | struct mem_cgroup *mi; |
3700 | unsigned int i; | 3546 | unsigned int i; |
3701 | 3547 | ||
3548 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != | ||
3549 | MEM_CGROUP_STAT_NSTATS); | ||
3550 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) != | ||
3551 | MEM_CGROUP_EVENTS_NSTATS); | ||
3702 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 3552 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
3703 | 3553 | ||
3704 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 3554 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
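
The added BUILD_BUG_ON()s pin the name tables to their enum counts at compile time, so adding a counter without naming it fails the build instead of corrupting the output. The C11 equivalent, with illustrative names:

    /* C11 equivalent of BUILD_BUG_ON() pairing a name table with its enum. */
    #include <assert.h>   /* static_assert */

    enum stat_index { STAT_CACHE, STAT_RSS, STAT_SWAP, STAT_NSTATS };

    static const char *const stat_names[] = {
        [STAT_CACHE] = "cache",
        [STAT_RSS]   = "rss",
        [STAT_SWAP]  = "swap",
    };

    static_assert(sizeof(stat_names) / sizeof(stat_names[0]) == STAT_NSTATS,
                  "stat_names[] is out of sync with enum stat_index");

    int main(void) { return 0; }
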
@@ -3913,7 +3763,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | |||
3913 | unsigned long usage; | 3763 | unsigned long usage; |
3914 | int i, size, ret; | 3764 | int i, size, ret; |
3915 | 3765 | ||
3916 | ret = page_counter_memparse(args, &threshold); | 3766 | ret = page_counter_memparse(args, "-1", &threshold); |
3917 | if (ret) | 3767 | if (ret) |
3918 | return ret; | 3768 | return ret; |
3919 | 3769 | ||
@@ -4164,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
4164 | return mem_cgroup_sockets_init(memcg, ss); | 4014 | return mem_cgroup_sockets_init(memcg, ss); |
4165 | } | 4015 | } |
4166 | 4016 | ||
4017 | static void memcg_deactivate_kmem(struct mem_cgroup *memcg) | ||
4018 | { | ||
4019 | struct cgroup_subsys_state *css; | ||
4020 | struct mem_cgroup *parent, *child; | ||
4021 | int kmemcg_id; | ||
4022 | |||
4023 | if (!memcg->kmem_acct_active) | ||
4024 | return; | ||
4025 | |||
4026 | /* | ||
4027 | * Clear the 'active' flag before clearing memcg_caches arrays entries. | ||
4028 | * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it | ||
4029 | * guarantees no cache will be created for this cgroup after we are | ||
4030 | * done (see memcg_create_kmem_cache()). | ||
4031 | */ | ||
4032 | memcg->kmem_acct_active = false; | ||
4033 | |||
4034 | memcg_deactivate_kmem_caches(memcg); | ||
4035 | |||
4036 | kmemcg_id = memcg->kmemcg_id; | ||
4037 | BUG_ON(kmemcg_id < 0); | ||
4038 | |||
4039 | parent = parent_mem_cgroup(memcg); | ||
4040 | if (!parent) | ||
4041 | parent = root_mem_cgroup; | ||
4042 | |||
4043 | /* | ||
4044 | * Change kmemcg_id of this cgroup and all its descendants to the | ||
4045 | * parent's id, and then move all entries from this cgroup's list_lrus | ||
4046 | * to ones of the parent. After we have finished, all list_lrus | ||
4047 | * corresponding to this cgroup are guaranteed to remain empty. The | ||
4048 | * ordering is imposed by list_lru_node->lock taken by | ||
4049 | * memcg_drain_all_list_lrus(). | ||
4050 | */ | ||
4051 | css_for_each_descendant_pre(css, &memcg->css) { | ||
4052 | child = mem_cgroup_from_css(css); | ||
4053 | BUG_ON(child->kmemcg_id != kmemcg_id); | ||
4054 | child->kmemcg_id = parent->kmemcg_id; | ||
4055 | if (!memcg->use_hierarchy) | ||
4056 | break; | ||
4057 | } | ||
4058 | memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); | ||
4059 | |||
4060 | memcg_free_cache_id(kmemcg_id); | ||
4061 | } | ||
4062 | |||
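
memcg_deactivate_kmem() repoints the kmemcg_id of the group and all of its descendants at the parent's id before draining the list_lrus, so the freed id can never be reached again. A toy recursion over an invented tree that mirrors just that reassignment step, without the css iteration or locking:

    /* Toy subtree walk that collapses ids onto the parent's (types invented). */
    #include <stdio.h>

    struct group_model {
        int id;
        struct group_model *child[2];
    };

    static void reparent_ids(struct group_model *g, int parent_id)
    {
        g->id = parent_id;                    /* child->kmemcg_id = parent->kmemcg_id */
        for (int i = 0; i < 2; i++)
            if (g->child[i])
                reparent_ids(g->child[i], parent_id);
    }

    int main(void)
    {
        struct group_model leaf = { 7, { 0, 0 } };
        struct group_model top  = { 7, { &leaf, 0 } };
        reparent_ids(&top, 3);                /* whole subtree now maps to id 3 */
        printf("%d %d\n", top.id, leaf.id);   /* 3 3 */
        return 0;
    }
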
4167 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) | 4063 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
4168 | { | 4064 | { |
4169 | memcg_unregister_all_caches(memcg); | 4065 | if (memcg->kmem_acct_activated) { |
4066 | memcg_destroy_kmem_caches(memcg); | ||
4067 | static_key_slow_dec(&memcg_kmem_enabled_key); | ||
4068 | WARN_ON(page_counter_read(&memcg->kmem)); | ||
4069 | } | ||
4170 | mem_cgroup_sockets_destroy(memcg); | 4070 | mem_cgroup_sockets_destroy(memcg); |
4171 | } | 4071 | } |
4172 | #else | 4072 | #else |
@@ -4175,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
4175 | return 0; | 4075 | return 0; |
4176 | } | 4076 | } |
4177 | 4077 | ||
4078 | static void memcg_deactivate_kmem(struct mem_cgroup *memcg) | ||
4079 | { | ||
4080 | } | ||
4081 | |||
4178 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) | 4082 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
4179 | { | 4083 | { |
4180 | } | 4084 | } |
@@ -4403,7 +4307,7 @@ out_kfree: | |||
4403 | return ret; | 4307 | return ret; |
4404 | } | 4308 | } |
4405 | 4309 | ||
4406 | static struct cftype mem_cgroup_files[] = { | 4310 | static struct cftype mem_cgroup_legacy_files[] = { |
4407 | { | 4311 | { |
4408 | .name = "usage_in_bytes", | 4312 | .name = "usage_in_bytes", |
4409 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 4313 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
@@ -4514,34 +4418,6 @@ static struct cftype mem_cgroup_files[] = { | |||
4514 | { }, /* terminate */ | 4418 | { }, /* terminate */ |
4515 | }; | 4419 | }; |
4516 | 4420 | ||
4517 | #ifdef CONFIG_MEMCG_SWAP | ||
4518 | static struct cftype memsw_cgroup_files[] = { | ||
4519 | { | ||
4520 | .name = "memsw.usage_in_bytes", | ||
4521 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
4522 | .read_u64 = mem_cgroup_read_u64, | ||
4523 | }, | ||
4524 | { | ||
4525 | .name = "memsw.max_usage_in_bytes", | ||
4526 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
4527 | .write = mem_cgroup_reset, | ||
4528 | .read_u64 = mem_cgroup_read_u64, | ||
4529 | }, | ||
4530 | { | ||
4531 | .name = "memsw.limit_in_bytes", | ||
4532 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
4533 | .write = mem_cgroup_write, | ||
4534 | .read_u64 = mem_cgroup_read_u64, | ||
4535 | }, | ||
4536 | { | ||
4537 | .name = "memsw.failcnt", | ||
4538 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
4539 | .write = mem_cgroup_reset, | ||
4540 | .read_u64 = mem_cgroup_read_u64, | ||
4541 | }, | ||
4542 | { }, /* terminate */ | ||
4543 | }; | ||
4544 | #endif | ||
4545 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | 4421 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
4546 | { | 4422 | { |
4547 | struct mem_cgroup_per_node *pn; | 4423 | struct mem_cgroup_per_node *pn; |
@@ -4621,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) | |||
4621 | free_mem_cgroup_per_zone_info(memcg, node); | 4497 | free_mem_cgroup_per_zone_info(memcg, node); |
4622 | 4498 | ||
4623 | free_percpu(memcg->stat); | 4499 | free_percpu(memcg->stat); |
4624 | |||
4625 | disarm_static_keys(memcg); | ||
4626 | kfree(memcg); | 4500 | kfree(memcg); |
4627 | } | 4501 | } |
4628 | 4502 | ||
@@ -4637,29 +4511,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
4637 | } | 4511 | } |
4638 | EXPORT_SYMBOL(parent_mem_cgroup); | 4512 | EXPORT_SYMBOL(parent_mem_cgroup); |
4639 | 4513 | ||
4640 | static void __init mem_cgroup_soft_limit_tree_init(void) | ||
4641 | { | ||
4642 | struct mem_cgroup_tree_per_node *rtpn; | ||
4643 | struct mem_cgroup_tree_per_zone *rtpz; | ||
4644 | int tmp, node, zone; | ||
4645 | |||
4646 | for_each_node(node) { | ||
4647 | tmp = node; | ||
4648 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
4649 | tmp = -1; | ||
4650 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
4651 | BUG_ON(!rtpn); | ||
4652 | |||
4653 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
4654 | |||
4655 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
4656 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
4657 | rtpz->rb_root = RB_ROOT; | ||
4658 | spin_lock_init(&rtpz->lock); | ||
4659 | } | ||
4660 | } | ||
4661 | } | ||
4662 | |||
4663 | static struct cgroup_subsys_state * __ref | 4514 | static struct cgroup_subsys_state * __ref |
4664 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | 4515 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) |
4665 | { | 4516 | { |
@@ -4679,6 +4530,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
4679 | if (parent_css == NULL) { | 4530 | if (parent_css == NULL) { |
4680 | root_mem_cgroup = memcg; | 4531 | root_mem_cgroup = memcg; |
4681 | page_counter_init(&memcg->memory, NULL); | 4532 | page_counter_init(&memcg->memory, NULL); |
4533 | memcg->high = PAGE_COUNTER_MAX; | ||
4534 | memcg->soft_limit = PAGE_COUNTER_MAX; | ||
4682 | page_counter_init(&memcg->memsw, NULL); | 4535 | page_counter_init(&memcg->memsw, NULL); |
4683 | page_counter_init(&memcg->kmem, NULL); | 4536 | page_counter_init(&memcg->kmem, NULL); |
4684 | } | 4537 | } |
@@ -4693,7 +4546,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | |||
4693 | spin_lock_init(&memcg->event_list_lock); | 4546 | spin_lock_init(&memcg->event_list_lock); |
4694 | #ifdef CONFIG_MEMCG_KMEM | 4547 | #ifdef CONFIG_MEMCG_KMEM |
4695 | memcg->kmemcg_id = -1; | 4548 | memcg->kmemcg_id = -1; |
4696 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
4697 | #endif | 4549 | #endif |
4698 | 4550 | ||
4699 | return &memcg->css; | 4551 | return &memcg->css; |
@@ -4724,6 +4576,8 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
4724 | 4576 | ||
4725 | if (parent->use_hierarchy) { | 4577 | if (parent->use_hierarchy) { |
4726 | page_counter_init(&memcg->memory, &parent->memory); | 4578 | page_counter_init(&memcg->memory, &parent->memory); |
4579 | memcg->high = PAGE_COUNTER_MAX; | ||
4580 | memcg->soft_limit = PAGE_COUNTER_MAX; | ||
4727 | page_counter_init(&memcg->memsw, &parent->memsw); | 4581 | page_counter_init(&memcg->memsw, &parent->memsw); |
4728 | page_counter_init(&memcg->kmem, &parent->kmem); | 4582 | page_counter_init(&memcg->kmem, &parent->kmem); |
4729 | 4583 | ||
@@ -4733,6 +4587,8 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) | |||
4733 | */ | 4587 | */ |
4734 | } else { | 4588 | } else { |
4735 | page_counter_init(&memcg->memory, NULL); | 4589 | page_counter_init(&memcg->memory, NULL); |
4590 | memcg->high = PAGE_COUNTER_MAX; | ||
4591 | memcg->soft_limit = PAGE_COUNTER_MAX; | ||
4736 | page_counter_init(&memcg->memsw, NULL); | 4592 | page_counter_init(&memcg->memsw, NULL); |
4737 | page_counter_init(&memcg->kmem, NULL); | 4593 | page_counter_init(&memcg->kmem, NULL); |
4738 | /* | 4594 | /* |
@@ -4777,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | |||
4777 | spin_unlock(&memcg->event_list_lock); | 4633 | spin_unlock(&memcg->event_list_lock); |
4778 | 4634 | ||
4779 | vmpressure_cleanup(&memcg->vmpressure); | 4635 | vmpressure_cleanup(&memcg->vmpressure); |
4636 | |||
4637 | memcg_deactivate_kmem(memcg); | ||
4780 | } | 4638 | } |
4781 | 4639 | ||
4782 | static void mem_cgroup_css_free(struct cgroup_subsys_state *css) | 4640 | static void mem_cgroup_css_free(struct cgroup_subsys_state *css) |
@@ -4807,7 +4665,9 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) | |||
4807 | mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); | 4665 | mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); |
4808 | mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); | 4666 | mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); |
4809 | memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); | 4667 | memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); |
4810 | memcg->soft_limit = 0; | 4668 | memcg->low = 0; |
4669 | memcg->high = PAGE_COUNTER_MAX; | ||
4670 | memcg->soft_limit = PAGE_COUNTER_MAX; | ||
4811 | } | 4671 | } |
4812 | 4672 | ||
4813 | #ifdef CONFIG_MMU | 4673 | #ifdef CONFIG_MMU |
@@ -4883,12 +4743,12 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | |||
4883 | if (!page || !page_mapped(page)) | 4743 | if (!page || !page_mapped(page)) |
4884 | return NULL; | 4744 | return NULL; |
4885 | if (PageAnon(page)) { | 4745 | if (PageAnon(page)) { |
4886 | /* we don't move shared anon */ | 4746 | if (!(mc.flags & MOVE_ANON)) |
4887 | if (!move_anon()) | ||
4888 | return NULL; | 4747 | return NULL; |
4889 | } else if (!move_file()) | 4748 | } else { |
4890 | /* we ignore mapcount for file pages */ | 4749 | if (!(mc.flags & MOVE_FILE)) |
4891 | return NULL; | 4750 | return NULL; |
4751 | } | ||
4892 | if (!get_page_unless_zero(page)) | 4752 | if (!get_page_unless_zero(page)) |
4893 | return NULL; | 4753 | return NULL; |
4894 | 4754 | ||
@@ -4902,7 +4762,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | |||
4902 | struct page *page = NULL; | 4762 | struct page *page = NULL; |
4903 | swp_entry_t ent = pte_to_swp_entry(ptent); | 4763 | swp_entry_t ent = pte_to_swp_entry(ptent); |
4904 | 4764 | ||
4905 | if (!move_anon() || non_swap_entry(ent)) | 4765 | if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) |
4906 | return NULL; | 4766 | return NULL; |
4907 | /* | 4767 | /* |
4908 | * Because lookup_swap_cache() updates some statistics counter, | 4768 | * Because lookup_swap_cache() updates some statistics counter, |
@@ -4931,14 +4791,11 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
4931 | 4791 | ||
4932 | if (!vma->vm_file) /* anonymous vma */ | 4792 | if (!vma->vm_file) /* anonymous vma */ |
4933 | return NULL; | 4793 | return NULL; |
4934 | if (!move_file()) | 4794 | if (!(mc.flags & MOVE_FILE)) |
4935 | return NULL; | 4795 | return NULL; |
4936 | 4796 | ||
4937 | mapping = vma->vm_file->f_mapping; | 4797 | mapping = vma->vm_file->f_mapping; |
4938 | if (pte_none(ptent)) | 4798 | pgoff = linear_page_index(vma, addr); |
4939 | pgoff = linear_page_index(vma, addr); | ||
4940 | else /* pte_file(ptent) is true */ | ||
4941 | pgoff = pte_to_pgoff(ptent); | ||
4942 | 4799 | ||
4943 | /* page is moved even if it's not RSS of this task(page-faulted). */ | 4800 | /* page is moved even if it's not RSS of this task(page-faulted). */ |
4944 | #ifdef CONFIG_SWAP | 4801 | #ifdef CONFIG_SWAP |
@@ -4970,7 +4827,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | |||
4970 | page = mc_handle_present_pte(vma, addr, ptent); | 4827 | page = mc_handle_present_pte(vma, addr, ptent); |
4971 | else if (is_swap_pte(ptent)) | 4828 | else if (is_swap_pte(ptent)) |
4972 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); | 4829 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); |
4973 | else if (pte_none(ptent) || pte_file(ptent)) | 4830 | else if (pte_none(ptent)) |
4974 | page = mc_handle_file_pte(vma, addr, ptent, &ent); | 4831 | page = mc_handle_file_pte(vma, addr, ptent, &ent); |
4975 | 4832 | ||
4976 | if (!page && !ent.val) | 4833 | if (!page && !ent.val) |
@@ -5013,7 +4870,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | |||
5013 | 4870 | ||
5014 | page = pmd_page(pmd); | 4871 | page = pmd_page(pmd); |
5015 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); | 4872 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); |
5016 | if (!move_anon()) | 4873 | if (!(mc.flags & MOVE_ANON)) |
5017 | return ret; | 4874 | return ret; |
5018 | if (page->mem_cgroup == mc.from) { | 4875 | if (page->mem_cgroup == mc.from) { |
5019 | ret = MC_TARGET_PAGE; | 4876 | ret = MC_TARGET_PAGE; |
@@ -5036,7 +4893,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
5036 | unsigned long addr, unsigned long end, | 4893 | unsigned long addr, unsigned long end, |
5037 | struct mm_walk *walk) | 4894 | struct mm_walk *walk) |
5038 | { | 4895 | { |
5039 | struct vm_area_struct *vma = walk->private; | 4896 | struct vm_area_struct *vma = walk->vma; |
5040 | pte_t *pte; | 4897 | pte_t *pte; |
5041 | spinlock_t *ptl; | 4898 | spinlock_t *ptl; |
5042 | 4899 | ||
@@ -5062,20 +4919,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
5062 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | 4919 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) |
5063 | { | 4920 | { |
5064 | unsigned long precharge; | 4921 | unsigned long precharge; |
5065 | struct vm_area_struct *vma; | ||
5066 | 4922 | ||
4923 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
4924 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
4925 | .mm = mm, | ||
4926 | }; | ||
5067 | down_read(&mm->mmap_sem); | 4927 | down_read(&mm->mmap_sem); |
5068 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 4928 | walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk); |
5069 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
5070 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
5071 | .mm = mm, | ||
5072 | .private = vma, | ||
5073 | }; | ||
5074 | if (is_vm_hugetlb_page(vma)) | ||
5075 | continue; | ||
5076 | walk_page_range(vma->vm_start, vma->vm_end, | ||
5077 | &mem_cgroup_count_precharge_walk); | ||
5078 | } | ||
5079 | up_read(&mm->mmap_sem); | 4929 | up_read(&mm->mmap_sem); |
5080 | 4930 | ||
5081 | precharge = mc.precharge; | 4931 | precharge = mc.precharge; |
@@ -5155,15 +5005,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
5155 | struct task_struct *p = cgroup_taskset_first(tset); | 5005 | struct task_struct *p = cgroup_taskset_first(tset); |
5156 | int ret = 0; | 5006 | int ret = 0; |
5157 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5007 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5158 | unsigned long move_charge_at_immigrate; | 5008 | unsigned long move_flags; |
5159 | 5009 | ||
5160 | /* | 5010 | /* |
5161 | * We are now committed to this value whatever it is. Changes in this | 5011 |
5162 | * tunable will only affect upcoming migrations, not the current one. | 5012 | * tunable will only affect upcoming migrations, not the current one. |
5163 | * So we need to save it, and keep it going. | 5013 | * So we need to save it, and keep it going. |
5164 | */ | 5014 | */ |
5165 | move_charge_at_immigrate = memcg->move_charge_at_immigrate; | 5015 | move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); |
5166 | if (move_charge_at_immigrate) { | 5016 | if (move_flags) { |
5167 | struct mm_struct *mm; | 5017 | struct mm_struct *mm; |
5168 | struct mem_cgroup *from = mem_cgroup_from_task(p); | 5018 | struct mem_cgroup *from = mem_cgroup_from_task(p); |
5169 | 5019 | ||
@@ -5183,7 +5033,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
5183 | spin_lock(&mc.lock); | 5033 | spin_lock(&mc.lock); |
5184 | mc.from = from; | 5034 | mc.from = from; |
5185 | mc.to = memcg; | 5035 | mc.to = memcg; |
5186 | mc.immigrate_flags = move_charge_at_immigrate; | 5036 | mc.flags = move_flags; |
5187 | spin_unlock(&mc.lock); | 5037 | spin_unlock(&mc.lock); |
5188 | /* We set mc.moving_task later */ | 5038 | /* We set mc.moving_task later */ |
5189 | 5039 | ||
@@ -5208,7 +5058,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
5208 | struct mm_walk *walk) | 5058 | struct mm_walk *walk) |
5209 | { | 5059 | { |
5210 | int ret = 0; | 5060 | int ret = 0; |
5211 | struct vm_area_struct *vma = walk->private; | 5061 | struct vm_area_struct *vma = walk->vma; |
5212 | pte_t *pte; | 5062 | pte_t *pte; |
5213 | spinlock_t *ptl; | 5063 | spinlock_t *ptl; |
5214 | enum mc_target_type target_type; | 5064 | enum mc_target_type target_type; |
@@ -5304,7 +5154,10 @@ put: /* get_mctgt_type() gets the page */ | |||
5304 | 5154 | ||
5305 | static void mem_cgroup_move_charge(struct mm_struct *mm) | 5155 | static void mem_cgroup_move_charge(struct mm_struct *mm) |
5306 | { | 5156 | { |
5307 | struct vm_area_struct *vma; | 5157 | struct mm_walk mem_cgroup_move_charge_walk = { |
5158 | .pmd_entry = mem_cgroup_move_charge_pte_range, | ||
5159 | .mm = mm, | ||
5160 | }; | ||
5308 | 5161 | ||
5309 | lru_add_drain_all(); | 5162 | lru_add_drain_all(); |
5310 | /* | 5163 | /* |
@@ -5327,24 +5180,11 @@ retry: | |||
5327 | cond_resched(); | 5180 | cond_resched(); |
5328 | goto retry; | 5181 | goto retry; |
5329 | } | 5182 | } |
5330 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 5183 | /* |
5331 | int ret; | 5184 | * When we have consumed all precharges and failed in doing |
5332 | struct mm_walk mem_cgroup_move_charge_walk = { | 5185 | * additional charge, the page walk just aborts. |
5333 | .pmd_entry = mem_cgroup_move_charge_pte_range, | 5186 | */ |
5334 | .mm = mm, | 5187 | walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk); |
5335 | .private = vma, | ||
5336 | }; | ||
5337 | if (is_vm_hugetlb_page(vma)) | ||
5338 | continue; | ||
5339 | ret = walk_page_range(vma->vm_start, vma->vm_end, | ||
5340 | &mem_cgroup_move_charge_walk); | ||
5341 | if (ret) | ||
5342 | /* | ||
5343 | * means we have consumed all precharges and failed in | ||
5344 | * doing additional charge. Just abandon here. | ||
5345 | */ | ||
5346 | break; | ||
5347 | } | ||
5348 | up_read(&mm->mmap_sem); | 5188 | up_read(&mm->mmap_sem); |
5349 | atomic_dec(&mc.from->moving_account); | 5189 | atomic_dec(&mc.from->moving_account); |
5350 | } | 5190 | } |
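
Both the precharge counting and the charge moving now pass one mm_walk covering the whole address space and read the current VMA from walk->vma, instead of looping over VMAs and smuggling each one through walk->private; a nonzero return from the callback simply aborts the walk. A small userspace sketch of that callback shape, with all names invented:

    /* Generic "walker owns the cursor, callback can abort" pattern. */
    #include <stdio.h>

    struct region { unsigned long start, end; };

    struct walk_state {
        const struct region *vma;   /* current region, like walk->vma    */
        void *private;              /* per-walk data, like walk->private */
    };

    typedef int (*range_fn)(unsigned long addr, struct walk_state *walk);

    static int walk_regions(const struct region *regions, int nr,
                            range_fn fn, void *private)
    {
        struct walk_state walk = { .private = private };

        for (int i = 0; i < nr; i++) {
            walk.vma = &regions[i];
            for (unsigned long a = regions[i].start; a < regions[i].end; a += 0x1000) {
                int err = fn(a, &walk);
                if (err)
                    return err;     /* nonzero from the callback aborts the walk */
            }
        }
        return 0;
    }

    static int count_cb(unsigned long addr, struct walk_state *walk)
    {
        (void)addr;
        (*(long *)walk->private)++;
        return 0;
    }

    int main(void)
    {
        struct region r[] = { { 0x1000, 0x3000 }, { 0x8000, 0x9000 } };
        long pages = 0;
        walk_regions(r, 2, count_cb, &pages);
        printf("%ld pages\n", pages);   /* 3 */
        return 0;
    }
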
@@ -5395,118 +5235,211 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) | |||
5395 | mem_cgroup_from_css(root_css)->use_hierarchy = true; | 5235 | mem_cgroup_from_css(root_css)->use_hierarchy = true; |
5396 | } | 5236 | } |
5397 | 5237 | ||
5398 | struct cgroup_subsys memory_cgrp_subsys = { | 5238 | static u64 memory_current_read(struct cgroup_subsys_state *css, |
5399 | .css_alloc = mem_cgroup_css_alloc, | 5239 | struct cftype *cft) |
5400 | .css_online = mem_cgroup_css_online, | 5240 | { |
5401 | .css_offline = mem_cgroup_css_offline, | 5241 | return mem_cgroup_usage(mem_cgroup_from_css(css), false); |
5402 | .css_free = mem_cgroup_css_free, | 5242 | } |
5403 | .css_reset = mem_cgroup_css_reset, | ||
5404 | .can_attach = mem_cgroup_can_attach, | ||
5405 | .cancel_attach = mem_cgroup_cancel_attach, | ||
5406 | .attach = mem_cgroup_move_task, | ||
5407 | .bind = mem_cgroup_bind, | ||
5408 | .legacy_cftypes = mem_cgroup_files, | ||
5409 | .early_init = 0, | ||
5410 | }; | ||
5411 | 5243 | ||
5412 | #ifdef CONFIG_MEMCG_SWAP | 5244 | static int memory_low_show(struct seq_file *m, void *v) |
5413 | static int __init enable_swap_account(char *s) | ||
5414 | { | 5245 | { |
5415 | if (!strcmp(s, "1")) | 5246 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
5416 | really_do_swap_account = 1; | 5247 | unsigned long low = ACCESS_ONCE(memcg->low); |
5417 | else if (!strcmp(s, "0")) | 5248 | |
5418 | really_do_swap_account = 0; | 5249 | if (low == PAGE_COUNTER_MAX) |
5419 | return 1; | 5250 | seq_puts(m, "infinity\n"); |
5251 | else | ||
5252 | seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); | ||
5253 | |||
5254 | return 0; | ||
5420 | } | 5255 | } |
5421 | __setup("swapaccount=", enable_swap_account); | ||
5422 | 5256 | ||
5423 | static void __init memsw_file_init(void) | 5257 | static ssize_t memory_low_write(struct kernfs_open_file *of, |
5258 | char *buf, size_t nbytes, loff_t off) | ||
5424 | { | 5259 | { |
5425 | WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, | 5260 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); |
5426 | memsw_cgroup_files)); | 5261 | unsigned long low; |
5262 | int err; | ||
5263 | |||
5264 | buf = strstrip(buf); | ||
5265 | err = page_counter_memparse(buf, "infinity", &low); | ||
5266 | if (err) | ||
5267 | return err; | ||
5268 | |||
5269 | memcg->low = low; | ||
5270 | |||
5271 | return nbytes; | ||
5427 | } | 5272 | } |
5428 | 5273 | ||
5429 | static void __init enable_swap_cgroup(void) | 5274 | static int memory_high_show(struct seq_file *m, void *v) |
5430 | { | 5275 | { |
5431 | if (!mem_cgroup_disabled() && really_do_swap_account) { | 5276 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
5432 | do_swap_account = 1; | 5277 | unsigned long high = ACCESS_ONCE(memcg->high); |
5433 | memsw_file_init(); | 5278 | |
5434 | } | 5279 | if (high == PAGE_COUNTER_MAX) |
5280 | seq_puts(m, "infinity\n"); | ||
5281 | else | ||
5282 | seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); | ||
5283 | |||
5284 | return 0; | ||
5435 | } | 5285 | } |
5436 | 5286 | ||
5437 | #else | 5287 | static ssize_t memory_high_write(struct kernfs_open_file *of, |
5438 | static void __init enable_swap_cgroup(void) | 5288 | char *buf, size_t nbytes, loff_t off) |
5439 | { | 5289 | { |
5290 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | ||
5291 | unsigned long high; | ||
5292 | int err; | ||
5293 | |||
5294 | buf = strstrip(buf); | ||
5295 | err = page_counter_memparse(buf, "infinity", &high); | ||
5296 | if (err) | ||
5297 | return err; | ||
5298 | |||
5299 | memcg->high = high; | ||
5300 | |||
5301 | return nbytes; | ||
5440 | } | 5302 | } |
5441 | #endif | ||
5442 | 5303 | ||
5443 | #ifdef CONFIG_MEMCG_SWAP | 5304 | static int memory_max_show(struct seq_file *m, void *v) |
5444 | /** | ||
5445 | * mem_cgroup_swapout - transfer a memsw charge to swap | ||
5446 | * @page: page whose memsw charge to transfer | ||
5447 | * @entry: swap entry to move the charge to | ||
5448 | * | ||
5449 | * Transfer the memsw charge of @page to @entry. | ||
5450 | */ | ||
5451 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | ||
5452 | { | 5305 | { |
5453 | struct mem_cgroup *memcg; | 5306 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
5454 | unsigned short oldid; | 5307 | unsigned long max = ACCESS_ONCE(memcg->memory.limit); |
5455 | 5308 | ||
5456 | VM_BUG_ON_PAGE(PageLRU(page), page); | 5309 | if (max == PAGE_COUNTER_MAX) |
5457 | VM_BUG_ON_PAGE(page_count(page), page); | 5310 | seq_puts(m, "infinity\n"); |
5311 | else | ||
5312 | seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); | ||
5458 | 5313 | ||
5459 | if (!do_swap_account) | 5314 | return 0; |
5460 | return; | 5315 | } |
5461 | 5316 | ||
5462 | memcg = page->mem_cgroup; | 5317 | static ssize_t memory_max_write(struct kernfs_open_file *of, |
5318 | char *buf, size_t nbytes, loff_t off) | ||
5319 | { | ||
5320 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | ||
5321 | unsigned long max; | ||
5322 | int err; | ||
5463 | 5323 | ||
5464 | /* Readahead page, never charged */ | 5324 | buf = strstrip(buf); |
5465 | if (!memcg) | 5325 | err = page_counter_memparse(buf, "infinity", &max); |
5466 | return; | 5326 | if (err) |
5327 | return err; | ||
5467 | 5328 | ||
5468 | oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); | 5329 | err = mem_cgroup_resize_limit(memcg, max); |
5469 | VM_BUG_ON_PAGE(oldid, page); | 5330 | if (err) |
5470 | mem_cgroup_swap_statistics(memcg, true); | 5331 | return err; |
5471 | 5332 | ||
5472 | page->mem_cgroup = NULL; | 5333 | return nbytes; |
5334 | } | ||
5473 | 5335 | ||
5474 | if (!mem_cgroup_is_root(memcg)) | 5336 | static int memory_events_show(struct seq_file *m, void *v) |
5475 | page_counter_uncharge(&memcg->memory, 1); | 5337 | { |
5338 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | ||
5476 | 5339 | ||
5477 | /* XXX: caller holds IRQ-safe mapping->tree_lock */ | 5340 | seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); |
5478 | VM_BUG_ON(!irqs_disabled()); | 5341 | seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); |
5342 | seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); | ||
5343 | seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); | ||
5479 | 5344 | ||
5480 | mem_cgroup_charge_statistics(memcg, page, -1); | 5345 | return 0; |
5481 | memcg_check_events(memcg, page); | ||
5482 | } | 5346 | } |
5483 | 5347 | ||
5348 | static struct cftype memory_files[] = { | ||
5349 | { | ||
5350 | .name = "current", | ||
5351 | .read_u64 = memory_current_read, | ||
5352 | }, | ||
5353 | { | ||
5354 | .name = "low", | ||
5355 | .flags = CFTYPE_NOT_ON_ROOT, | ||
5356 | .seq_show = memory_low_show, | ||
5357 | .write = memory_low_write, | ||
5358 | }, | ||
5359 | { | ||
5360 | .name = "high", | ||
5361 | .flags = CFTYPE_NOT_ON_ROOT, | ||
5362 | .seq_show = memory_high_show, | ||
5363 | .write = memory_high_write, | ||
5364 | }, | ||
5365 | { | ||
5366 | .name = "max", | ||
5367 | .flags = CFTYPE_NOT_ON_ROOT, | ||
5368 | .seq_show = memory_max_show, | ||
5369 | .write = memory_max_write, | ||
5370 | }, | ||
5371 | { | ||
5372 | .name = "events", | ||
5373 | .flags = CFTYPE_NOT_ON_ROOT, | ||
5374 | .seq_show = memory_events_show, | ||
5375 | }, | ||
5376 | { } /* terminate */ | ||
5377 | }; | ||
5378 | |||
5379 | struct cgroup_subsys memory_cgrp_subsys = { | ||
5380 | .css_alloc = mem_cgroup_css_alloc, | ||
5381 | .css_online = mem_cgroup_css_online, | ||
5382 | .css_offline = mem_cgroup_css_offline, | ||
5383 | .css_free = mem_cgroup_css_free, | ||
5384 | .css_reset = mem_cgroup_css_reset, | ||
5385 | .can_attach = mem_cgroup_can_attach, | ||
5386 | .cancel_attach = mem_cgroup_cancel_attach, | ||
5387 | .attach = mem_cgroup_move_task, | ||
5388 | .bind = mem_cgroup_bind, | ||
5389 | .dfl_cftypes = memory_files, | ||
5390 | .legacy_cftypes = mem_cgroup_legacy_files, | ||
5391 | .early_init = 0, | ||
5392 | }; | ||
5393 | |||
5484 | /** | 5394 | /** |
5485 | * mem_cgroup_uncharge_swap - uncharge a swap entry | 5395 | * mem_cgroup_events - count memory events against a cgroup |
5486 | * @entry: swap entry to uncharge | 5396 | * @memcg: the memory cgroup |
5397 | * @idx: the event index | ||
5398 | * @nr: the number of events to account for | ||
5399 | */ | ||
5400 | void mem_cgroup_events(struct mem_cgroup *memcg, | ||
5401 | enum mem_cgroup_events_index idx, | ||
5402 | unsigned int nr) | ||
5403 | { | ||
5404 | this_cpu_add(memcg->stat->events[idx], nr); | ||
5405 | } | ||
5406 | |||
5407 | /** | ||
5408 | * mem_cgroup_low - check if memory consumption is below the normal range | ||
5409 | * @root: the highest ancestor to consider | ||
5410 | * @memcg: the memory cgroup to check | ||
5487 | * | 5411 | * |
5488 | * Drop the memsw charge associated with @entry. | 5412 | * Returns %true if memory consumption of @memcg, and that of all |
5413 | * configurable ancestors up to @root, is below the normal range. | ||
5489 | */ | 5414 | */ |
5490 | void mem_cgroup_uncharge_swap(swp_entry_t entry) | 5415 | bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) |
5491 | { | 5416 | { |
5492 | struct mem_cgroup *memcg; | 5417 | if (mem_cgroup_disabled()) |
5493 | unsigned short id; | 5418 | return false; |
5494 | 5419 | ||
5495 | if (!do_swap_account) | 5420 | /* |
5496 | return; | 5421 | * The toplevel group doesn't have a configurable range, so |
5422 | * it's never low when looked at directly, and it is not | ||
5423 | * considered an ancestor when assessing the hierarchy. | ||
5424 | */ | ||
5497 | 5425 | ||
5498 | id = swap_cgroup_record(entry, 0); | 5426 | if (memcg == root_mem_cgroup) |
5499 | rcu_read_lock(); | 5427 | return false; |
5500 | memcg = mem_cgroup_lookup(id); | 5428 | |
5501 | if (memcg) { | 5429 | if (page_counter_read(&memcg->memory) > memcg->low) |
5502 | if (!mem_cgroup_is_root(memcg)) | 5430 | return false; |
5503 | page_counter_uncharge(&memcg->memsw, 1); | 5431 | |
5504 | mem_cgroup_swap_statistics(memcg, false); | 5432 | while (memcg != root) { |
5505 | css_put(&memcg->css); | 5433 | memcg = parent_mem_cgroup(memcg); |
5434 | |||
5435 | if (memcg == root_mem_cgroup) | ||
5436 | break; | ||
5437 | |||
5438 | if (page_counter_read(&memcg->memory) > memcg->low) | ||
5439 | return false; | ||
5506 | } | 5440 | } |
5507 | rcu_read_unlock(); | 5441 | return true; |
5508 | } | 5442 | } |
5509 | #endif | ||
5510 | 5443 | ||
5511 | /** | 5444 | /** |
5512 | * mem_cgroup_try_charge - try charging a page | 5445 | * mem_cgroup_try_charge - try charging a page |
@@ -5782,7 +5715,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) | |||
5782 | * mem_cgroup_migrate - migrate a charge to another page | 5715 | * mem_cgroup_migrate - migrate a charge to another page |
5783 | * @oldpage: currently charged page | 5716 | * @oldpage: currently charged page |
5784 | * @newpage: page to transfer the charge to | 5717 | * @newpage: page to transfer the charge to |
5785 | * @lrucare: both pages might be on the LRU already | 5718 | * @lrucare: either or both pages might be on the LRU already |
5786 | * | 5719 | * |
5787 | * Migrate the charge from @oldpage to @newpage. | 5720 | * Migrate the charge from @oldpage to @newpage. |
5788 | * | 5721 | * |
@@ -5840,10 +5773,155 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, | |||
5840 | */ | 5773 | */ |
5841 | static int __init mem_cgroup_init(void) | 5774 | static int __init mem_cgroup_init(void) |
5842 | { | 5775 | { |
5776 | int cpu, node; | ||
5777 | |||
5843 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | 5778 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
5844 | enable_swap_cgroup(); | 5779 | |
5845 | mem_cgroup_soft_limit_tree_init(); | 5780 | for_each_possible_cpu(cpu) |
5846 | memcg_stock_init(); | 5781 | INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, |
5782 | drain_local_stock); | ||
5783 | |||
5784 | for_each_node(node) { | ||
5785 | struct mem_cgroup_tree_per_node *rtpn; | ||
5786 | int zone; | ||
5787 | |||
5788 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, | ||
5789 | node_online(node) ? node : NUMA_NO_NODE); | ||
5790 | |||
5791 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
5792 | struct mem_cgroup_tree_per_zone *rtpz; | ||
5793 | |||
5794 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
5795 | rtpz->rb_root = RB_ROOT; | ||
5796 | spin_lock_init(&rtpz->lock); | ||
5797 | } | ||
5798 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
5799 | } | ||
5800 | |||
5847 | return 0; | 5801 | return 0; |
5848 | } | 5802 | } |
5849 | subsys_initcall(mem_cgroup_init); | 5803 | subsys_initcall(mem_cgroup_init); |
5804 | |||
5805 | #ifdef CONFIG_MEMCG_SWAP | ||
5806 | /** | ||
5807 | * mem_cgroup_swapout - transfer a memsw charge to swap | ||
5808 | * @page: page whose memsw charge to transfer | ||
5809 | * @entry: swap entry to move the charge to | ||
5810 | * | ||
5811 | * Transfer the memsw charge of @page to @entry. | ||
5812 | */ | ||
5813 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | ||
5814 | { | ||
5815 | struct mem_cgroup *memcg; | ||
5816 | unsigned short oldid; | ||
5817 | |||
5818 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
5819 | VM_BUG_ON_PAGE(page_count(page), page); | ||
5820 | |||
5821 | if (!do_swap_account) | ||
5822 | return; | ||
5823 | |||
5824 | memcg = page->mem_cgroup; | ||
5825 | |||
5826 | /* Readahead page, never charged */ | ||
5827 | if (!memcg) | ||
5828 | return; | ||
5829 | |||
5830 | oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); | ||
5831 | VM_BUG_ON_PAGE(oldid, page); | ||
5832 | mem_cgroup_swap_statistics(memcg, true); | ||
5833 | |||
5834 | page->mem_cgroup = NULL; | ||
5835 | |||
5836 | if (!mem_cgroup_is_root(memcg)) | ||
5837 | page_counter_uncharge(&memcg->memory, 1); | ||
5838 | |||
5839 | /* XXX: caller holds IRQ-safe mapping->tree_lock */ | ||
5840 | VM_BUG_ON(!irqs_disabled()); | ||
5841 | |||
5842 | mem_cgroup_charge_statistics(memcg, page, -1); | ||
5843 | memcg_check_events(memcg, page); | ||
5844 | } | ||
5845 | |||
5846 | /** | ||
5847 | * mem_cgroup_uncharge_swap - uncharge a swap entry | ||
5848 | * @entry: swap entry to uncharge | ||
5849 | * | ||
5850 | * Drop the memsw charge associated with @entry. | ||
5851 | */ | ||
5852 | void mem_cgroup_uncharge_swap(swp_entry_t entry) | ||
5853 | { | ||
5854 | struct mem_cgroup *memcg; | ||
5855 | unsigned short id; | ||
5856 | |||
5857 | if (!do_swap_account) | ||
5858 | return; | ||
5859 | |||
5860 | id = swap_cgroup_record(entry, 0); | ||
5861 | rcu_read_lock(); | ||
5862 | memcg = mem_cgroup_lookup(id); | ||
5863 | if (memcg) { | ||
5864 | if (!mem_cgroup_is_root(memcg)) | ||
5865 | page_counter_uncharge(&memcg->memsw, 1); | ||
5866 | mem_cgroup_swap_statistics(memcg, false); | ||
5867 | css_put(&memcg->css); | ||
5868 | } | ||
5869 | rcu_read_unlock(); | ||
5870 | } | ||
5871 | |||
5872 | /* remember the boot option */ | ||
5873 | #ifdef CONFIG_MEMCG_SWAP_ENABLED | ||
5874 | static int really_do_swap_account __initdata = 1; | ||
5875 | #else | ||
5876 | static int really_do_swap_account __initdata; | ||
5877 | #endif | ||
5878 | |||
5879 | static int __init enable_swap_account(char *s) | ||
5880 | { | ||
5881 | if (!strcmp(s, "1")) | ||
5882 | really_do_swap_account = 1; | ||
5883 | else if (!strcmp(s, "0")) | ||
5884 | really_do_swap_account = 0; | ||
5885 | return 1; | ||
5886 | } | ||
5887 | __setup("swapaccount=", enable_swap_account); | ||
5888 | |||
5889 | static struct cftype memsw_cgroup_files[] = { | ||
5890 | { | ||
5891 | .name = "memsw.usage_in_bytes", | ||
5892 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | ||
5893 | .read_u64 = mem_cgroup_read_u64, | ||
5894 | }, | ||
5895 | { | ||
5896 | .name = "memsw.max_usage_in_bytes", | ||
5897 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | ||
5898 | .write = mem_cgroup_reset, | ||
5899 | .read_u64 = mem_cgroup_read_u64, | ||
5900 | }, | ||
5901 | { | ||
5902 | .name = "memsw.limit_in_bytes", | ||
5903 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | ||
5904 | .write = mem_cgroup_write, | ||
5905 | .read_u64 = mem_cgroup_read_u64, | ||
5906 | }, | ||
5907 | { | ||
5908 | .name = "memsw.failcnt", | ||
5909 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | ||
5910 | .write = mem_cgroup_reset, | ||
5911 | .read_u64 = mem_cgroup_read_u64, | ||
5912 | }, | ||
5913 | { }, /* terminate */ | ||
5914 | }; | ||
5915 | |||
5916 | static int __init mem_cgroup_swap_init(void) | ||
5917 | { | ||
5918 | if (!mem_cgroup_disabled() && really_do_swap_account) { | ||
5919 | do_swap_account = 1; | ||
5920 | WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, | ||
5921 | memsw_cgroup_files)); | ||
5922 | } | ||
5923 | return 0; | ||
5924 | } | ||
5925 | subsys_initcall(mem_cgroup_swap_init); | ||
5926 | |||
5927 | #endif /* CONFIG_MEMCG_SWAP */ | ||
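For the legacy (v1) hierarchy the same file keeps the memsw.* cftypes and the swapaccount= boot parameter: swap accounting is built with CONFIG_MEMCG_SWAP and can be turned off at boot with swapaccount=0. A small sketch that reads the legacy counters follows; the v1 memory-controller mount point /sys/fs/cgroup/memory is an assumption, and the files are simply absent when swap accounting is disabled.

#include <stdio.h>

static void show(const char *path)
{
    char buf[64];
    FILE *f = fopen(path, "r");

    if (!f) {
        perror(path);   /* not there when swap accounting is off */
        return;
    }
    if (fgets(buf, sizeof(buf), f))
        printf("%s = %s", path, buf);
    fclose(f);
}

int main(void)
{
    show("/sys/fs/cgroup/memory/memory.memsw.usage_in_bytes");
    show("/sys/fs/cgroup/memory/memory.memsw.limit_in_bytes");
    show("/sys/fs/cgroup/memory/memory.memsw.failcnt");
    return 0;
}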
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index feb803bf3443..d487f8dc6d39 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -242,15 +242,8 @@ void shake_page(struct page *p, int access) | |||
242 | * Only call shrink_node_slabs here (which would also shrink | 242 | * Only call shrink_node_slabs here (which would also shrink |
243 | * other caches) if access is not potentially fatal. | 243 | * other caches) if access is not potentially fatal. |
244 | */ | 244 | */ |
245 | if (access) { | 245 | if (access) |
246 | int nr; | 246 | drop_slab_node(page_to_nid(p)); |
247 | int nid = page_to_nid(p); | ||
248 | do { | ||
249 | nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000); | ||
250 | if (page_count(p) == 1) | ||
251 | break; | ||
252 | } while (nr > 10); | ||
253 | } | ||
254 | } | 247 | } |
255 | EXPORT_SYMBOL_GPL(shake_page); | 248 | EXPORT_SYMBOL_GPL(shake_page); |
256 | 249 | ||
@@ -1654,8 +1647,6 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1654 | * setting PG_hwpoison. | 1647 | * setting PG_hwpoison. |
1655 | */ | 1648 | */ |
1656 | if (!is_free_buddy_page(page)) | 1649 | if (!is_free_buddy_page(page)) |
1657 | lru_add_drain_all(); | ||
1658 | if (!is_free_buddy_page(page)) | ||
1659 | drain_all_pages(page_zone(page)); | 1650 | drain_all_pages(page_zone(page)); |
1660 | SetPageHWPoison(page); | 1651 | SetPageHWPoison(page); |
1661 | if (!is_free_buddy_page(page)) | 1652 | if (!is_free_buddy_page(page)) |
diff --git a/mm/memory.c b/mm/memory.c
index ca920d1fd314..99275325f303 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -235,6 +235,9 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long | |||
235 | 235 | ||
236 | static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) | 236 | static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) |
237 | { | 237 | { |
238 | if (!tlb->end) | ||
239 | return; | ||
240 | |||
238 | tlb_flush(tlb); | 241 | tlb_flush(tlb); |
239 | mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end); | 242 | mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end); |
240 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE | 243 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE |
@@ -247,7 +250,7 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb) | |||
247 | { | 250 | { |
248 | struct mmu_gather_batch *batch; | 251 | struct mmu_gather_batch *batch; |
249 | 252 | ||
250 | for (batch = &tlb->local; batch; batch = batch->next) { | 253 | for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { |
251 | free_pages_and_swap_cache(batch->pages, batch->nr); | 254 | free_pages_and_swap_cache(batch->pages, batch->nr); |
252 | batch->nr = 0; | 255 | batch->nr = 0; |
253 | } | 256 | } |
@@ -256,9 +259,6 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb) | |||
256 | 259 | ||
257 | void tlb_flush_mmu(struct mmu_gather *tlb) | 260 | void tlb_flush_mmu(struct mmu_gather *tlb) |
258 | { | 261 | { |
259 | if (!tlb->end) | ||
260 | return; | ||
261 | |||
262 | tlb_flush_mmu_tlbonly(tlb); | 262 | tlb_flush_mmu_tlbonly(tlb); |
263 | tlb_flush_mmu_free(tlb); | 263 | tlb_flush_mmu_free(tlb); |
264 | } | 264 | } |
@@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |||
428 | pmd = pmd_offset(pud, start); | 428 | pmd = pmd_offset(pud, start); |
429 | pud_clear(pud); | 429 | pud_clear(pud); |
430 | pmd_free_tlb(tlb, pmd, start); | 430 | pmd_free_tlb(tlb, pmd, start); |
431 | mm_dec_nr_pmds(tlb->mm); | ||
431 | } | 432 | } |
432 | 433 | ||
433 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | 434 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, |
@@ -754,6 +755,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
754 | if (HAVE_PTE_SPECIAL) { | 755 | if (HAVE_PTE_SPECIAL) { |
755 | if (likely(!pte_special(pte))) | 756 | if (likely(!pte_special(pte))) |
756 | goto check_pfn; | 757 | goto check_pfn; |
758 | if (vma->vm_ops && vma->vm_ops->find_special_page) | ||
759 | return vma->vm_ops->find_special_page(vma, addr); | ||
757 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) | 760 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) |
758 | return NULL; | 761 | return NULL; |
759 | if (!is_zero_pfn(pfn)) | 762 | if (!is_zero_pfn(pfn)) |
@@ -811,42 +814,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
811 | 814 | ||
812 | /* pte contains position in swap or file, so copy. */ | 815 | /* pte contains position in swap or file, so copy. */ |
813 | if (unlikely(!pte_present(pte))) { | 816 | if (unlikely(!pte_present(pte))) { |
814 | if (!pte_file(pte)) { | 817 | swp_entry_t entry = pte_to_swp_entry(pte); |
815 | swp_entry_t entry = pte_to_swp_entry(pte); | 818 | |
816 | 819 | if (likely(!non_swap_entry(entry))) { | |
817 | if (likely(!non_swap_entry(entry))) { | 820 | if (swap_duplicate(entry) < 0) |
818 | if (swap_duplicate(entry) < 0) | 821 | return entry.val; |
819 | return entry.val; | 822 | |
820 | 823 | /* make sure dst_mm is on swapoff's mmlist. */ | |
821 | /* make sure dst_mm is on swapoff's mmlist. */ | 824 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
822 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 825 | spin_lock(&mmlist_lock); |
823 | spin_lock(&mmlist_lock); | 826 | if (list_empty(&dst_mm->mmlist)) |
824 | if (list_empty(&dst_mm->mmlist)) | 827 | list_add(&dst_mm->mmlist, |
825 | list_add(&dst_mm->mmlist, | 828 | &src_mm->mmlist); |
826 | &src_mm->mmlist); | 829 | spin_unlock(&mmlist_lock); |
827 | spin_unlock(&mmlist_lock); | 830 | } |
828 | } | 831 | rss[MM_SWAPENTS]++; |
829 | rss[MM_SWAPENTS]++; | 832 | } else if (is_migration_entry(entry)) { |
830 | } else if (is_migration_entry(entry)) { | 833 | page = migration_entry_to_page(entry); |
831 | page = migration_entry_to_page(entry); | 834 | |
832 | 835 | if (PageAnon(page)) | |
833 | if (PageAnon(page)) | 836 | rss[MM_ANONPAGES]++; |
834 | rss[MM_ANONPAGES]++; | 837 | else |
835 | else | 838 | rss[MM_FILEPAGES]++; |
836 | rss[MM_FILEPAGES]++; | 839 | |
837 | 840 | if (is_write_migration_entry(entry) && | |
838 | if (is_write_migration_entry(entry) && | 841 | is_cow_mapping(vm_flags)) { |
839 | is_cow_mapping(vm_flags)) { | 842 | /* |
840 | /* | 843 | * COW mappings require pages in both |
841 | * COW mappings require pages in both | 844 | * parent and child to be set to read. |
842 | * parent and child to be set to read. | 845 | */ |
843 | */ | 846 | make_migration_entry_read(&entry); |
844 | make_migration_entry_read(&entry); | 847 | pte = swp_entry_to_pte(entry); |
845 | pte = swp_entry_to_pte(entry); | 848 | if (pte_swp_soft_dirty(*src_pte)) |
846 | if (pte_swp_soft_dirty(*src_pte)) | 849 | pte = pte_swp_mksoft_dirty(pte); |
847 | pte = pte_swp_mksoft_dirty(pte); | 850 | set_pte_at(src_mm, addr, src_pte, pte); |
848 | set_pte_at(src_mm, addr, src_pte, pte); | ||
849 | } | ||
850 | } | 851 | } |
851 | } | 852 | } |
852 | goto out_set_pte; | 853 | goto out_set_pte; |
@@ -1020,11 +1021,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
1020 | * readonly mappings. The tradeoff is that copy_page_range is more | 1021 | * readonly mappings. The tradeoff is that copy_page_range is more |
1021 | * efficient than faulting. | 1022 | * efficient than faulting. |
1022 | */ | 1023 | */ |
1023 | if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR | | 1024 | if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && |
1024 | VM_PFNMAP | VM_MIXEDMAP))) { | 1025 | !vma->anon_vma) |
1025 | if (!vma->anon_vma) | 1026 | return 0; |
1026 | return 0; | ||
1027 | } | ||
1028 | 1027 | ||
1029 | if (is_vm_hugetlb_page(vma)) | 1028 | if (is_vm_hugetlb_page(vma)) |
1030 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | 1029 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); |
@@ -1082,6 +1081,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
1082 | spinlock_t *ptl; | 1081 | spinlock_t *ptl; |
1083 | pte_t *start_pte; | 1082 | pte_t *start_pte; |
1084 | pte_t *pte; | 1083 | pte_t *pte; |
1084 | swp_entry_t entry; | ||
1085 | 1085 | ||
1086 | again: | 1086 | again: |
1087 | init_rss_vec(rss); | 1087 | init_rss_vec(rss); |
@@ -1107,28 +1107,12 @@ again: | |||
1107 | if (details->check_mapping && | 1107 | if (details->check_mapping && |
1108 | details->check_mapping != page->mapping) | 1108 | details->check_mapping != page->mapping) |
1109 | continue; | 1109 | continue; |
1110 | /* | ||
1111 | * Each page->index must be checked when | ||
1112 | * invalidating or truncating nonlinear. | ||
1113 | */ | ||
1114 | if (details->nonlinear_vma && | ||
1115 | (page->index < details->first_index || | ||
1116 | page->index > details->last_index)) | ||
1117 | continue; | ||
1118 | } | 1110 | } |
1119 | ptent = ptep_get_and_clear_full(mm, addr, pte, | 1111 | ptent = ptep_get_and_clear_full(mm, addr, pte, |
1120 | tlb->fullmm); | 1112 | tlb->fullmm); |
1121 | tlb_remove_tlb_entry(tlb, pte, addr); | 1113 | tlb_remove_tlb_entry(tlb, pte, addr); |
1122 | if (unlikely(!page)) | 1114 | if (unlikely(!page)) |
1123 | continue; | 1115 | continue; |
1124 | if (unlikely(details) && details->nonlinear_vma | ||
1125 | && linear_page_index(details->nonlinear_vma, | ||
1126 | addr) != page->index) { | ||
1127 | pte_t ptfile = pgoff_to_pte(page->index); | ||
1128 | if (pte_soft_dirty(ptent)) | ||
1129 | ptfile = pte_file_mksoft_dirty(ptfile); | ||
1130 | set_pte_at(mm, addr, pte, ptfile); | ||
1131 | } | ||
1132 | if (PageAnon(page)) | 1116 | if (PageAnon(page)) |
1133 | rss[MM_ANONPAGES]--; | 1117 | rss[MM_ANONPAGES]--; |
1134 | else { | 1118 | else { |
@@ -1151,33 +1135,25 @@ again: | |||
1151 | } | 1135 | } |
1152 | continue; | 1136 | continue; |
1153 | } | 1137 | } |
1154 | /* | 1138 | /* If details->check_mapping, we leave swap entries. */ |
1155 | * If details->check_mapping, we leave swap entries; | ||
1156 | * if details->nonlinear_vma, we leave file entries. | ||
1157 | */ | ||
1158 | if (unlikely(details)) | 1139 | if (unlikely(details)) |
1159 | continue; | 1140 | continue; |
1160 | if (pte_file(ptent)) { | ||
1161 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) | ||
1162 | print_bad_pte(vma, addr, ptent, NULL); | ||
1163 | } else { | ||
1164 | swp_entry_t entry = pte_to_swp_entry(ptent); | ||
1165 | 1141 | ||
1166 | if (!non_swap_entry(entry)) | 1142 | entry = pte_to_swp_entry(ptent); |
1167 | rss[MM_SWAPENTS]--; | 1143 | if (!non_swap_entry(entry)) |
1168 | else if (is_migration_entry(entry)) { | 1144 | rss[MM_SWAPENTS]--; |
1169 | struct page *page; | 1145 | else if (is_migration_entry(entry)) { |
1146 | struct page *page; | ||
1170 | 1147 | ||
1171 | page = migration_entry_to_page(entry); | 1148 | page = migration_entry_to_page(entry); |
1172 | 1149 | ||
1173 | if (PageAnon(page)) | 1150 | if (PageAnon(page)) |
1174 | rss[MM_ANONPAGES]--; | 1151 | rss[MM_ANONPAGES]--; |
1175 | else | 1152 | else |
1176 | rss[MM_FILEPAGES]--; | 1153 | rss[MM_FILEPAGES]--; |
1177 | } | ||
1178 | if (unlikely(!free_swap_and_cache(entry))) | ||
1179 | print_bad_pte(vma, addr, ptent, NULL); | ||
1180 | } | 1154 | } |
1155 | if (unlikely(!free_swap_and_cache(entry))) | ||
1156 | print_bad_pte(vma, addr, ptent, NULL); | ||
1181 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 1157 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
1182 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1158 | } while (pte++, addr += PAGE_SIZE, addr != end); |
1183 | 1159 | ||
@@ -1277,7 +1253,7 @@ static void unmap_page_range(struct mmu_gather *tlb, | |||
1277 | pgd_t *pgd; | 1253 | pgd_t *pgd; |
1278 | unsigned long next; | 1254 | unsigned long next; |
1279 | 1255 | ||
1280 | if (details && !details->check_mapping && !details->nonlinear_vma) | 1256 | if (details && !details->check_mapping) |
1281 | details = NULL; | 1257 | details = NULL; |
1282 | 1258 | ||
1283 | BUG_ON(addr >= end); | 1259 | BUG_ON(addr >= end); |
@@ -1371,7 +1347,7 @@ void unmap_vmas(struct mmu_gather *tlb, | |||
1371 | * @vma: vm_area_struct holding the applicable pages | 1347 | * @vma: vm_area_struct holding the applicable pages |
1372 | * @start: starting address of pages to zap | 1348 | * @start: starting address of pages to zap |
1373 | * @size: number of bytes to zap | 1349 | * @size: number of bytes to zap |
1374 | * @details: details of nonlinear truncation or shared cache invalidation | 1350 | * @details: details of shared cache invalidation |
1375 | * | 1351 | * |
1376 | * Caller must protect the VMA list | 1352 | * Caller must protect the VMA list |
1377 | */ | 1353 | */ |
@@ -1397,7 +1373,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, | |||
1397 | * @vma: vm_area_struct holding the applicable pages | 1373 | * @vma: vm_area_struct holding the applicable pages |
1398 | * @address: starting address of pages to zap | 1374 | * @address: starting address of pages to zap |
1399 | * @size: number of bytes to zap | 1375 | * @size: number of bytes to zap |
1400 | * @details: details of nonlinear truncation or shared cache invalidation | 1376 | * @details: details of shared cache invalidation |
1401 | * | 1377 | * |
1402 | * The range must fit into one VMA. | 1378 | * The range must fit into one VMA. |
1403 | */ | 1379 | */ |
@@ -1922,12 +1898,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | |||
1922 | EXPORT_SYMBOL_GPL(apply_to_page_range); | 1898 | EXPORT_SYMBOL_GPL(apply_to_page_range); |
1923 | 1899 | ||
1924 | /* | 1900 | /* |
1925 | * handle_pte_fault chooses page fault handler according to an entry | 1901 | * handle_pte_fault chooses page fault handler according to an entry which was |
1926 | * which was read non-atomically. Before making any commitment, on | 1902 | * read non-atomically. Before making any commitment, on those architectures |
1927 | * those architectures or configurations (e.g. i386 with PAE) which | 1903 | * or configurations (e.g. i386 with PAE) which might give a mix of unmatched |
1928 | * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault | 1904 | * parts, do_swap_page must check under lock before unmapping the pte and |
1929 | * must check under lock before unmapping the pte and proceeding | 1905 | * proceeding (but do_wp_page is only called after already making such a check; |
1930 | * (but do_wp_page is only called after already making such a check; | ||
1931 | * and do_anonymous_page can safely check later on). | 1906 | * and do_anonymous_page can safely check later on). |
1932 | */ | 1907 | */ |
1933 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, | 1908 | static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, |
@@ -2033,7 +2008,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2033 | pte_t entry; | 2008 | pte_t entry; |
2034 | int ret = 0; | 2009 | int ret = 0; |
2035 | int page_mkwrite = 0; | 2010 | int page_mkwrite = 0; |
2036 | struct page *dirty_page = NULL; | 2011 | bool dirty_shared = false; |
2037 | unsigned long mmun_start = 0; /* For mmu_notifiers */ | 2012 | unsigned long mmun_start = 0; /* For mmu_notifiers */ |
2038 | unsigned long mmun_end = 0; /* For mmu_notifiers */ | 2013 | unsigned long mmun_end = 0; /* For mmu_notifiers */ |
2039 | struct mem_cgroup *memcg; | 2014 | struct mem_cgroup *memcg; |
@@ -2084,6 +2059,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2084 | unlock_page(old_page); | 2059 | unlock_page(old_page); |
2085 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2060 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2086 | (VM_WRITE|VM_SHARED))) { | 2061 | (VM_WRITE|VM_SHARED))) { |
2062 | page_cache_get(old_page); | ||
2087 | /* | 2063 | /* |
2088 | * Only catch write-faults on shared writable pages, | 2064 | * Only catch write-faults on shared writable pages, |
2089 | * read-only shared pages can get COWed by | 2065 | * read-only shared pages can get COWed by |
@@ -2091,7 +2067,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2091 | */ | 2067 | */ |
2092 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | 2068 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
2093 | int tmp; | 2069 | int tmp; |
2094 | page_cache_get(old_page); | 2070 | |
2095 | pte_unmap_unlock(page_table, ptl); | 2071 | pte_unmap_unlock(page_table, ptl); |
2096 | tmp = do_page_mkwrite(vma, old_page, address); | 2072 | tmp = do_page_mkwrite(vma, old_page, address); |
2097 | if (unlikely(!tmp || (tmp & | 2073 | if (unlikely(!tmp || (tmp & |
@@ -2111,11 +2087,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2111 | unlock_page(old_page); | 2087 | unlock_page(old_page); |
2112 | goto unlock; | 2088 | goto unlock; |
2113 | } | 2089 | } |
2114 | |||
2115 | page_mkwrite = 1; | 2090 | page_mkwrite = 1; |
2116 | } | 2091 | } |
2117 | dirty_page = old_page; | 2092 | |
2118 | get_page(dirty_page); | 2093 | dirty_shared = true; |
2119 | 2094 | ||
2120 | reuse: | 2095 | reuse: |
2121 | /* | 2096 | /* |
@@ -2134,38 +2109,29 @@ reuse: | |||
2134 | pte_unmap_unlock(page_table, ptl); | 2109 | pte_unmap_unlock(page_table, ptl); |
2135 | ret |= VM_FAULT_WRITE; | 2110 | ret |= VM_FAULT_WRITE; |
2136 | 2111 | ||
2137 | if (!dirty_page) | 2112 | if (dirty_shared) { |
2138 | return ret; | 2113 | struct address_space *mapping; |
2114 | int dirtied; | ||
2139 | 2115 | ||
2140 | /* | 2116 | if (!page_mkwrite) |
2141 | * Yes, Virginia, this is actually required to prevent a race | 2117 | lock_page(old_page); |
2142 | * with clear_page_dirty_for_io() from clearing the page dirty | 2118 | |
2143 | * bit after it clears all dirty ptes, but before a racing | ||
2144 | * do_wp_page installs a dirty pte. | 2120 | VM_BUG_ON_PAGE(PageAnon(old_page), old_page); |
2145 | * | 2121 | mapping = old_page->mapping; |
2146 | * do_shared_fault is protected similarly. | 2122 | unlock_page(old_page); |
2147 | */ | 2123 | page_cache_release(old_page); |
2148 | if (!page_mkwrite) { | 2124 | |
2149 | wait_on_page_locked(dirty_page); | 2125 | if ((dirtied || page_mkwrite) && mapping) { |
2150 | set_page_dirty_balance(dirty_page); | ||
2151 | /* file_update_time outside page_lock */ | ||
2152 | if (vma->vm_file) | ||
2153 | file_update_time(vma->vm_file); | ||
2154 | } | ||
2155 | put_page(dirty_page); | ||
2156 | if (page_mkwrite) { | ||
2157 | struct address_space *mapping = dirty_page->mapping; | ||
2158 | |||
2159 | set_page_dirty(dirty_page); | ||
2160 | unlock_page(dirty_page); | ||
2161 | page_cache_release(dirty_page); | ||
2162 | if (mapping) { | ||
2163 | /* | 2126 | /* |
2164 | * Some device drivers do not set page.mapping | 2127 | * Some device drivers do not set page.mapping |
2165 | * but still dirty their pages | 2128 | * but still dirty their pages |
2166 | */ | 2129 | */ |
2167 | balance_dirty_pages_ratelimited(mapping); | 2130 | balance_dirty_pages_ratelimited(mapping); |
2168 | } | 2131 | } |
2132 | |||
2133 | if (!page_mkwrite) | ||
2134 | file_update_time(vma->vm_file); | ||
2169 | } | 2135 | } |
2170 | 2136 | ||
2171 | return ret; | 2137 | return ret; |
@@ -2324,25 +2290,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, | |||
2324 | } | 2290 | } |
2325 | } | 2291 | } |
2326 | 2292 | ||
2327 | static inline void unmap_mapping_range_list(struct list_head *head, | ||
2328 | struct zap_details *details) | ||
2329 | { | ||
2330 | struct vm_area_struct *vma; | ||
2331 | |||
2332 | /* | ||
2333 | * In nonlinear VMAs there is no correspondence between virtual address | ||
2334 | * offset and file offset. So we must perform an exhaustive search | ||
2335 | * across *all* the pages in each nonlinear VMA, not just the pages | ||
2336 | * whose virtual address lies outside the file truncation point. | ||
2337 | */ | ||
2338 | list_for_each_entry(vma, head, shared.nonlinear) { | ||
2339 | details->nonlinear_vma = vma; | ||
2340 | unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); | ||
2341 | } | ||
2342 | } | ||
2343 | |||
2344 | /** | 2293 | /** |
2345 | * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. | 2294 | * unmap_mapping_range - unmap the portion of all mmaps in the specified |
2295 | * address_space corresponding to the specified page range in the underlying | ||
2296 | * file. | ||
2297 | * | ||
2346 | * @mapping: the address space containing mmaps to be unmapped. | 2298 | * @mapping: the address space containing mmaps to be unmapped. |
2347 | * @holebegin: byte in first page to unmap, relative to the start of | 2299 | * @holebegin: byte in first page to unmap, relative to the start of |
2348 | * the underlying file. This will be rounded down to a PAGE_SIZE | 2300 | * the underlying file. This will be rounded down to a PAGE_SIZE |
@@ -2371,7 +2323,6 @@ void unmap_mapping_range(struct address_space *mapping, | |||
2371 | } | 2323 | } |
2372 | 2324 | ||
2373 | details.check_mapping = even_cows? NULL: mapping; | 2325 | details.check_mapping = even_cows? NULL: mapping; |
2374 | details.nonlinear_vma = NULL; | ||
2375 | details.first_index = hba; | 2326 | details.first_index = hba; |
2376 | details.last_index = hba + hlen - 1; | 2327 | details.last_index = hba + hlen - 1; |
2377 | if (details.last_index < details.first_index) | 2328 | if (details.last_index < details.first_index) |
@@ -2381,8 +2332,6 @@ void unmap_mapping_range(struct address_space *mapping, | |||
2381 | i_mmap_lock_write(mapping); | 2332 | i_mmap_lock_write(mapping); |
2382 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) | 2333 | if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) |
2383 | unmap_mapping_range_tree(&mapping->i_mmap, &details); | 2334 | unmap_mapping_range_tree(&mapping->i_mmap, &details); |
2384 | if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) | ||
2385 | unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); | ||
2386 | i_mmap_unlock_write(mapping); | 2335 | i_mmap_unlock_write(mapping); |
2387 | } | 2336 | } |
2388 | EXPORT_SYMBOL(unmap_mapping_range); | 2337 | EXPORT_SYMBOL(unmap_mapping_range); |
@@ -2593,7 +2542,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo | |||
2593 | if (prev && prev->vm_end == address) | 2542 | if (prev && prev->vm_end == address) |
2594 | return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; | 2543 | return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; |
2595 | 2544 | ||
2596 | expand_downwards(vma, address - PAGE_SIZE); | 2545 | return expand_downwards(vma, address - PAGE_SIZE); |
2597 | } | 2546 | } |
2598 | if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { | 2547 | if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { |
2599 | struct vm_area_struct *next = vma->vm_next; | 2548 | struct vm_area_struct *next = vma->vm_next; |
@@ -2602,7 +2551,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo | |||
2602 | if (next && next->vm_start == address + PAGE_SIZE) | 2551 | if (next && next->vm_start == address + PAGE_SIZE) |
2603 | return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; | 2552 | return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; |
2604 | 2553 | ||
2605 | expand_upwards(vma, address + PAGE_SIZE); | 2554 | return expand_upwards(vma, address + PAGE_SIZE); |
2606 | } | 2555 | } |
2607 | return 0; | 2556 | return 0; |
2608 | } | 2557 | } |
@@ -2625,7 +2574,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2625 | 2574 | ||
2626 | /* Check if we need to add a guard page to the stack */ | 2575 | /* Check if we need to add a guard page to the stack */ |
2627 | if (check_stack_guard_page(vma, address) < 0) | 2576 | if (check_stack_guard_page(vma, address) < 0) |
2628 | return VM_FAULT_SIGBUS; | 2577 | return VM_FAULT_SIGSEGV; |
2629 | 2578 | ||
2630 | /* Use the zero-page for reads */ | 2579 | /* Use the zero-page for reads */ |
2631 | if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) { | 2580 | if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) { |
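The hunk above makes a failed stack-guard-page check return VM_FAULT_SIGSEGV instead of VM_FAULT_SIGBUS, which user space observes as SIGSEGV rather than SIGBUS when a stack overflow runs into the guard area. Below is a hedged demo of that user-visible behaviour: it recurses until the stack overflows and reports whichever signal arrives on an alternate signal stack. Which signal you actually see depends on the kernel version; the recursion pad size and alternate-stack size are arbitrary choices for the sketch.

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static char altstack[64 * 1024];          /* fixed size; avoids SIGSTKSZ portability issues */

static void handler(int sig, siginfo_t *si, void *ctx)
{
    char msg[96];
    int n = snprintf(msg, sizeof(msg), "caught signal %d (%s) at %p\n",
                     sig, sig == SIGSEGV ? "SIGSEGV" : "SIGBUS", si->si_addr);

    (void)ctx;
    if (n > 0)
        write(STDOUT_FILENO, msg, (size_t)n);
    _exit(0);
}

static int recurse(int depth)
{
    volatile char pad[8192];              /* keep each stack frame large */

    pad[0] = (char)depth;
    return recurse(depth + 1) + pad[0];   /* the add after the call defeats tail-call optimization */
}

int main(void)
{
    stack_t ss = { .ss_sp = altstack, .ss_size = sizeof(altstack), .ss_flags = 0 };
    struct sigaction sa;

    sigaltstack(&ss, NULL);
    memset(&sa, 0, sizeof(sa));
    sa.sa_sigaction = handler;
    sa.sa_flags = SA_SIGINFO | SA_ONSTACK;
    sigaction(SIGSEGV, &sa, NULL);
    sigaction(SIGBUS, &sa, NULL);

    return recurse(0);
}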
@@ -2743,8 +2692,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, | |||
2743 | entry = mk_pte(page, vma->vm_page_prot); | 2692 | entry = mk_pte(page, vma->vm_page_prot); |
2744 | if (write) | 2693 | if (write) |
2745 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2694 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2746 | else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) | ||
2747 | entry = pte_mksoft_dirty(entry); | ||
2748 | if (anon) { | 2695 | if (anon) { |
2749 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2696 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2750 | page_add_new_anon_rmap(page, vma, address); | 2697 | page_add_new_anon_rmap(page, vma, address); |
@@ -2879,8 +2826,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2879 | * if page by the offset is not ready to be mapped (cold cache or | 2826 | * if page by the offset is not ready to be mapped (cold cache or |
2880 | * something). | 2827 | * something). |
2881 | */ | 2828 | */ |
2882 | if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && | 2829 | if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { |
2883 | fault_around_bytes >> PAGE_SHIFT > 1) { | ||
2884 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | 2830 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); |
2885 | do_fault_around(vma, address, pte, pgoff, flags); | 2831 | do_fault_around(vma, address, pte, pgoff, flags); |
2886 | if (!pte_same(*pte, orig_pte)) | 2832 | if (!pte_same(*pte, orig_pte)) |
@@ -3012,8 +2958,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3012 | balance_dirty_pages_ratelimited(mapping); | 2958 | balance_dirty_pages_ratelimited(mapping); |
3013 | } | 2959 | } |
3014 | 2960 | ||
3015 | /* file_update_time outside page_lock */ | 2961 | if (!vma->vm_ops->page_mkwrite) |
3016 | if (vma->vm_file && !vma->vm_ops->page_mkwrite) | ||
3017 | file_update_time(vma->vm_file); | 2962 | file_update_time(vma->vm_file); |
3018 | 2963 | ||
3019 | return ret; | 2964 | return ret; |
@@ -3025,7 +2970,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3025 | * The mmap_sem may have been released depending on flags and our | 2970 | * The mmap_sem may have been released depending on flags and our |
3026 | * return value. See filemap_fault() and __lock_page_or_retry(). | 2971 | * return value. See filemap_fault() and __lock_page_or_retry(). |
3027 | */ | 2972 | */ |
3028 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2973 | static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
3029 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2974 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
3030 | unsigned int flags, pte_t orig_pte) | 2975 | unsigned int flags, pte_t orig_pte) |
3031 | { | 2976 | { |
@@ -3042,46 +2987,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3042 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 2987 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
3043 | } | 2988 | } |
3044 | 2989 | ||
3045 | /* | ||
3046 | * Fault of a previously existing named mapping. Repopulate the pte | ||
3047 | * from the encoded file_pte if possible. This enables swappable | ||
3048 | * nonlinear vmas. | ||
3049 | * | ||
3050 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
3051 | * but allow concurrent faults), and pte mapped but not yet locked. | ||
3052 | * We return with pte unmapped and unlocked. | ||
3053 | * The mmap_sem may have been released depending on flags and our | ||
3054 | * return value. See filemap_fault() and __lock_page_or_retry(). | ||
3055 | */ | ||
3056 | static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3057 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
3058 | unsigned int flags, pte_t orig_pte) | ||
3059 | { | ||
3060 | pgoff_t pgoff; | ||
3061 | |||
3062 | flags |= FAULT_FLAG_NONLINEAR; | ||
3063 | |||
3064 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | ||
3065 | return 0; | ||
3066 | |||
3067 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) { | ||
3068 | /* | ||
3069 | * Page table corrupted: show pte and kill process. | ||
3070 | */ | ||
3071 | print_bad_pte(vma, address, orig_pte, NULL); | ||
3072 | return VM_FAULT_SIGBUS; | ||
3073 | } | ||
3074 | |||
3075 | pgoff = pte_to_pgoff(orig_pte); | ||
3076 | if (!(flags & FAULT_FLAG_WRITE)) | ||
3077 | return do_read_fault(mm, vma, address, pmd, pgoff, flags, | ||
3078 | orig_pte); | ||
3079 | if (!(vma->vm_flags & VM_SHARED)) | ||
3080 | return do_cow_fault(mm, vma, address, pmd, pgoff, flags, | ||
3081 | orig_pte); | ||
3082 | return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | ||
3083 | } | ||
3084 | |||
3085 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | 2990 | static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, |
3086 | unsigned long addr, int page_nid, | 2991 | unsigned long addr, int page_nid, |
3087 | int *flags) | 2992 | int *flags) |
@@ -3108,14 +3013,17 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3108 | bool migrated = false; | 3013 | bool migrated = false; |
3109 | int flags = 0; | 3014 | int flags = 0; |
3110 | 3015 | ||
3016 | /* A PROT_NONE fault should not end up here */ | ||
3017 | BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); | ||
3018 | |||
3111 | /* | 3019 | /* |
3112 | * The "pte" at this point cannot be used safely without | 3020 | * The "pte" at this point cannot be used safely without |
3113 | * validation through pte_unmap_same(). It's of NUMA type but | 3021 | * validation through pte_unmap_same(). It's of NUMA type but |
3114 | * the pfn may be screwed if the read is non atomic. | 3022 | * the pfn may be screwed if the read is non atomic. |
3115 | * | 3023 | * |
3116 | * ptep_modify_prot_start is not called as this is clearing | 3024 | * We can safely just do a "set_pte_at()", because the old |
3117 | * the _PAGE_NUMA bit and it is not really expected that there | 3025 | * page table entry is not accessible, so there would be no |
3118 | * would be concurrent hardware modifications to the PTE. | 3026 | * concurrent hardware modifications to the PTE. |
3119 | */ | 3027 | */ |
3120 | ptl = pte_lockptr(mm, pmd); | 3028 | ptl = pte_lockptr(mm, pmd); |
3121 | spin_lock(ptl); | 3029 | spin_lock(ptl); |
@@ -3124,7 +3032,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3124 | goto out; | 3032 | goto out; |
3125 | } | 3033 | } |
3126 | 3034 | ||
3127 | pte = pte_mknonnuma(pte); | 3035 | /* Make it present again */ |
3036 | pte = pte_modify(pte, vma->vm_page_prot); | ||
3037 | pte = pte_mkyoung(pte); | ||
3128 | set_pte_at(mm, addr, ptep, pte); | 3038 | set_pte_at(mm, addr, ptep, pte); |
3129 | update_mmu_cache(vma, addr, ptep); | 3039 | update_mmu_cache(vma, addr, ptep); |
3130 | 3040 | ||
@@ -3133,7 +3043,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3133 | pte_unmap_unlock(ptep, ptl); | 3043 | pte_unmap_unlock(ptep, ptl); |
3134 | return 0; | 3044 | return 0; |
3135 | } | 3045 | } |
3136 | BUG_ON(is_zero_pfn(page_to_pfn(page))); | ||
3137 | 3046 | ||
3138 | /* | 3047 | /* |
3139 | * Avoid grouping on DSO/COW pages in specific and RO pages | 3048 | * Avoid grouping on DSO/COW pages in specific and RO pages |
@@ -3209,20 +3118,17 @@ static int handle_pte_fault(struct mm_struct *mm, | |||
3209 | if (pte_none(entry)) { | 3118 | if (pte_none(entry)) { |
3210 | if (vma->vm_ops) { | 3119 | if (vma->vm_ops) { |
3211 | if (likely(vma->vm_ops->fault)) | 3120 | if (likely(vma->vm_ops->fault)) |
3212 | return do_linear_fault(mm, vma, address, | 3121 | return do_fault(mm, vma, address, pte, |
3213 | pte, pmd, flags, entry); | 3122 | pmd, flags, entry); |
3214 | } | 3123 | } |
3215 | return do_anonymous_page(mm, vma, address, | 3124 | return do_anonymous_page(mm, vma, address, |
3216 | pte, pmd, flags); | 3125 | pte, pmd, flags); |
3217 | } | 3126 | } |
3218 | if (pte_file(entry)) | ||
3219 | return do_nonlinear_fault(mm, vma, address, | ||
3220 | pte, pmd, flags, entry); | ||
3221 | return do_swap_page(mm, vma, address, | 3127 | return do_swap_page(mm, vma, address, |
3222 | pte, pmd, flags, entry); | 3128 | pte, pmd, flags, entry); |
3223 | } | 3129 | } |
3224 | 3130 | ||
3225 | if (pte_numa(entry)) | 3131 | if (pte_protnone(entry)) |
3226 | return do_numa_page(mm, vma, address, entry, pte, pmd); | 3132 | return do_numa_page(mm, vma, address, entry, pte, pmd); |
3227 | 3133 | ||
3228 | ptl = pte_lockptr(mm, pmd); | 3134 | ptl = pte_lockptr(mm, pmd); |
@@ -3300,7 +3206,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3300 | if (pmd_trans_splitting(orig_pmd)) | 3206 | if (pmd_trans_splitting(orig_pmd)) |
3301 | return 0; | 3207 | return 0; |
3302 | 3208 | ||
3303 | if (pmd_numa(orig_pmd)) | 3209 | if (pmd_protnone(orig_pmd)) |
3304 | return do_huge_pmd_numa_page(mm, vma, address, | 3210 | return do_huge_pmd_numa_page(mm, vma, address, |
3305 | orig_pmd, pmd); | 3211 | orig_pmd, pmd); |
3306 | 3212 | ||
@@ -3421,15 +3327,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) | |||
3421 | 3327 | ||
3422 | spin_lock(&mm->page_table_lock); | 3328 | spin_lock(&mm->page_table_lock); |
3423 | #ifndef __ARCH_HAS_4LEVEL_HACK | 3329 | #ifndef __ARCH_HAS_4LEVEL_HACK |
3424 | if (pud_present(*pud)) /* Another has populated it */ | 3330 | if (!pud_present(*pud)) { |
3425 | pmd_free(mm, new); | 3331 | mm_inc_nr_pmds(mm); |
3426 | else | ||
3427 | pud_populate(mm, pud, new); | 3332 | pud_populate(mm, pud, new); |
3428 | #else | 3333 | } else /* Another has populated it */ |
3429 | if (pgd_present(*pud)) /* Another has populated it */ | ||
3430 | pmd_free(mm, new); | 3334 | pmd_free(mm, new); |
3431 | else | 3335 | #else |
3336 | if (!pgd_present(*pud)) { | ||
3337 | mm_inc_nr_pmds(mm); | ||
3432 | pgd_populate(mm, pud, new); | 3338 | pgd_populate(mm, pud, new); |
3339 | } else /* Another has populated it */ | ||
3340 | pmd_free(mm, new); | ||
3433 | #endif /* __ARCH_HAS_4LEVEL_HACK */ | 3341 | #endif /* __ARCH_HAS_4LEVEL_HACK */ |
3434 | spin_unlock(&mm->page_table_lock); | 3342 | spin_unlock(&mm->page_table_lock); |
3435 | return 0; | 3343 | return 0; |
@@ -3554,7 +3462,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, | |||
3554 | if (follow_phys(vma, addr, write, &prot, &phys_addr)) | 3462 | if (follow_phys(vma, addr, write, &prot, &phys_addr)) |
3555 | return -EINVAL; | 3463 | return -EINVAL; |
3556 | 3464 | ||
3557 | maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); | 3465 | maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); |
3558 | if (write) | 3466 | if (write) |
3559 | memcpy_toio(maddr + offset, buf, len); | 3467 | memcpy_toio(maddr + offset, buf, len); |
3560 | else | 3468 | else |
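The mm/memory.c hunks above strip out the nonlinear-mapping machinery (pte_file(), do_nonlinear_fault(), the nonlinear branch of unmap_mapping_range()), which this series replaces by emulating remap_file_pages() with ordinary linear VMAs. The sketch below shows the equivalent user-space pattern, building an out-of-order view of a file with MAP_FIXED mappings; the temporary file path is an assumption and error handling is kept minimal.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    long page = sysconf(_SC_PAGESIZE);
    size_t len = 4 * (size_t)page;
    int fd = open("/tmp/nonlinear-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
    char *base;

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (ftruncate(fd, (off_t)len)) {
        perror("ftruncate");
        return 1;
    }

    /* Reserve a four-page window backed by the start of the file. */
    base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (base == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /* Replace the first two window pages so they show file pages 3 and 2:
     * each MAP_FIXED call simply creates another linear VMA, which is why
     * the kernel no longer needs VM_NONLINEAR to describe such a layout. */
    mmap(base,        (size_t)page, PROT_READ | PROT_WRITE,
         MAP_SHARED | MAP_FIXED, fd, 3 * page);
    mmap(base + page, (size_t)page, PROT_READ | PROT_WRITE,
         MAP_SHARED | MAP_FIXED, fd, 2 * page);

    strcpy(base, "written through window page 0, lands at file offset 3*page");
    printf("%s\n", base);

    munmap(base, len);
    close(fd);
    return 0;
}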
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e0961b8c39c..4721046a134a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -471,24 +471,34 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { | |||
471 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 471 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
472 | unsigned long flags); | 472 | unsigned long flags); |
473 | 473 | ||
474 | struct queue_pages { | ||
475 | struct list_head *pagelist; | ||
476 | unsigned long flags; | ||
477 | nodemask_t *nmask; | ||
478 | struct vm_area_struct *prev; | ||
479 | }; | ||
480 | |||
474 | /* | 481 | /* |
475 | * Scan through pages checking if pages follow certain conditions, | 482 | * Scan through pages checking if pages follow certain conditions, |
476 | * and move them to the pagelist if they do. | 483 | * and move them to the pagelist if they do. |
477 | */ | 484 | */ |
478 | static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 485 | static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, |
479 | unsigned long addr, unsigned long end, | 486 | unsigned long end, struct mm_walk *walk) |
480 | const nodemask_t *nodes, unsigned long flags, | ||
481 | void *private) | ||
482 | { | 487 | { |
483 | pte_t *orig_pte; | 488 | struct vm_area_struct *vma = walk->vma; |
489 | struct page *page; | ||
490 | struct queue_pages *qp = walk->private; | ||
491 | unsigned long flags = qp->flags; | ||
492 | int nid; | ||
484 | pte_t *pte; | 493 | pte_t *pte; |
485 | spinlock_t *ptl; | 494 | spinlock_t *ptl; |
486 | 495 | ||
487 | orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 496 | split_huge_page_pmd(vma, addr, pmd); |
488 | do { | 497 | if (pmd_trans_unstable(pmd)) |
489 | struct page *page; | 498 | return 0; |
490 | int nid; | ||
491 | 499 | ||
500 | pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); | ||
501 | for (; addr != end; pte++, addr += PAGE_SIZE) { | ||
492 | if (!pte_present(*pte)) | 502 | if (!pte_present(*pte)) |
493 | continue; | 503 | continue; |
494 | page = vm_normal_page(vma, addr, *pte); | 504 | page = vm_normal_page(vma, addr, *pte); |
@@ -501,114 +511,46 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
501 | if (PageReserved(page)) | 511 | if (PageReserved(page)) |
502 | continue; | 512 | continue; |
503 | nid = page_to_nid(page); | 513 | nid = page_to_nid(page); |
504 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 514 | if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) |
505 | continue; | 515 | continue; |
506 | 516 | ||
507 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | 517 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) |
508 | migrate_page_add(page, private, flags); | 518 | migrate_page_add(page, qp->pagelist, flags); |
509 | else | 519 | } |
510 | break; | 520 | pte_unmap_unlock(pte - 1, ptl); |
511 | } while (pte++, addr += PAGE_SIZE, addr != end); | 521 | cond_resched(); |
512 | pte_unmap_unlock(orig_pte, ptl); | 522 | return 0; |
513 | return addr != end; | ||
514 | } | 523 | } |
515 | 524 | ||
516 | static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, | 525 | static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, |
517 | pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, | 526 | unsigned long addr, unsigned long end, |
518 | void *private) | 527 | struct mm_walk *walk) |
519 | { | 528 | { |
520 | #ifdef CONFIG_HUGETLB_PAGE | 529 | #ifdef CONFIG_HUGETLB_PAGE |
530 | struct queue_pages *qp = walk->private; | ||
531 | unsigned long flags = qp->flags; | ||
521 | int nid; | 532 | int nid; |
522 | struct page *page; | 533 | struct page *page; |
523 | spinlock_t *ptl; | 534 | spinlock_t *ptl; |
524 | pte_t entry; | 535 | pte_t entry; |
525 | 536 | ||
526 | ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); | 537 | ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); |
527 | entry = huge_ptep_get((pte_t *)pmd); | 538 | entry = huge_ptep_get(pte); |
528 | if (!pte_present(entry)) | 539 | if (!pte_present(entry)) |
529 | goto unlock; | 540 | goto unlock; |
530 | page = pte_page(entry); | 541 | page = pte_page(entry); |
531 | nid = page_to_nid(page); | 542 | nid = page_to_nid(page); |
532 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 543 | if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) |
533 | goto unlock; | 544 | goto unlock; |
534 | /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ | 545 | /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ |
535 | if (flags & (MPOL_MF_MOVE_ALL) || | 546 | if (flags & (MPOL_MF_MOVE_ALL) || |
536 | (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) | 547 | (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) |
537 | isolate_huge_page(page, private); | 548 | isolate_huge_page(page, qp->pagelist); |
538 | unlock: | 549 | unlock: |
539 | spin_unlock(ptl); | 550 | spin_unlock(ptl); |
540 | #else | 551 | #else |
541 | BUG(); | 552 | BUG(); |
542 | #endif | 553 | #endif |
543 | } | ||
544 | |||
545 | static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, | ||
546 | unsigned long addr, unsigned long end, | ||
547 | const nodemask_t *nodes, unsigned long flags, | ||
548 | void *private) | ||
549 | { | ||
550 | pmd_t *pmd; | ||
551 | unsigned long next; | ||
552 | |||
553 | pmd = pmd_offset(pud, addr); | ||
554 | do { | ||
555 | next = pmd_addr_end(addr, end); | ||
556 | if (!pmd_present(*pmd)) | ||
557 | continue; | ||
558 | if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { | ||
559 | queue_pages_hugetlb_pmd_range(vma, pmd, nodes, | ||
560 | flags, private); | ||
561 | continue; | ||
562 | } | ||
563 | split_huge_page_pmd(vma, addr, pmd); | ||
564 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
565 | continue; | ||
566 | if (queue_pages_pte_range(vma, pmd, addr, next, nodes, | ||
567 | flags, private)) | ||
568 | return -EIO; | ||
569 | } while (pmd++, addr = next, addr != end); | ||
570 | return 0; | ||
571 | } | ||
572 | |||
573 | static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | ||
574 | unsigned long addr, unsigned long end, | ||
575 | const nodemask_t *nodes, unsigned long flags, | ||
576 | void *private) | ||
577 | { | ||
578 | pud_t *pud; | ||
579 | unsigned long next; | ||
580 | |||
581 | pud = pud_offset(pgd, addr); | ||
582 | do { | ||
583 | next = pud_addr_end(addr, end); | ||
584 | if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) | ||
585 | continue; | ||
586 | if (pud_none_or_clear_bad(pud)) | ||
587 | continue; | ||
588 | if (queue_pages_pmd_range(vma, pud, addr, next, nodes, | ||
589 | flags, private)) | ||
590 | return -EIO; | ||
591 | } while (pud++, addr = next, addr != end); | ||
592 | return 0; | ||
593 | } | ||
594 | |||
595 | static inline int queue_pages_pgd_range(struct vm_area_struct *vma, | ||
596 | unsigned long addr, unsigned long end, | ||
597 | const nodemask_t *nodes, unsigned long flags, | ||
598 | void *private) | ||
599 | { | ||
600 | pgd_t *pgd; | ||
601 | unsigned long next; | ||
602 | |||
603 | pgd = pgd_offset(vma->vm_mm, addr); | ||
604 | do { | ||
605 | next = pgd_addr_end(addr, end); | ||
606 | if (pgd_none_or_clear_bad(pgd)) | ||
607 | continue; | ||
608 | if (queue_pages_pud_range(vma, pgd, addr, next, nodes, | ||
609 | flags, private)) | ||
610 | return -EIO; | ||
611 | } while (pgd++, addr = next, addr != end); | ||
612 | return 0; | 554 | return 0; |
613 | } | 555 | } |
614 | 556 | ||
@@ -627,7 +569,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
627 | { | 569 | { |
628 | int nr_updated; | 570 | int nr_updated; |
629 | 571 | ||
630 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); | 572 | nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1); |
631 | if (nr_updated) | 573 | if (nr_updated) |
632 | count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); | 574 | count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); |
633 | 575 | ||
@@ -641,6 +583,49 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
641 | } | 583 | } |
642 | #endif /* CONFIG_NUMA_BALANCING */ | 584 | #endif /* CONFIG_NUMA_BALANCING */ |
643 | 585 | ||
586 | static int queue_pages_test_walk(unsigned long start, unsigned long end, | ||
587 | struct mm_walk *walk) | ||
588 | { | ||
589 | struct vm_area_struct *vma = walk->vma; | ||
590 | struct queue_pages *qp = walk->private; | ||
591 | unsigned long endvma = vma->vm_end; | ||
592 | unsigned long flags = qp->flags; | ||
593 | |||
594 | if (vma->vm_flags & VM_PFNMAP) | ||
595 | return 1; | ||
596 | |||
597 | if (endvma > end) | ||
598 | endvma = end; | ||
599 | if (vma->vm_start > start) | ||
600 | start = vma->vm_start; | ||
601 | |||
602 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | ||
603 | if (!vma->vm_next && vma->vm_end < end) | ||
604 | return -EFAULT; | ||
605 | if (qp->prev && qp->prev->vm_end < vma->vm_start) | ||
606 | return -EFAULT; | ||
607 | } | ||
608 | |||
609 | qp->prev = vma; | ||
610 | |||
611 | if (vma->vm_flags & VM_PFNMAP) | ||
612 | return 1; | ||
613 | |||
614 | if (flags & MPOL_MF_LAZY) { | ||
615 | /* Similar to task_numa_work, skip inaccessible VMAs */ | ||
616 | if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) | ||
617 | change_prot_numa(vma, start, endvma); | ||
618 | return 1; | ||
619 | } | ||
620 | |||
621 | if ((flags & MPOL_MF_STRICT) || | ||
622 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | ||
623 | vma_migratable(vma))) | ||
624 | /* queue pages from current vma */ | ||
625 | return 0; | ||
626 | return 1; | ||
627 | } | ||
628 | |||
644 | /* | 629 | /* |
645 | * Walk through page tables and collect pages to be migrated. | 630 | * Walk through page tables and collect pages to be migrated. |
646 | * | 631 | * |
@@ -650,50 +635,24 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, | |||
650 | */ | 635 | */ |
651 | static int | 636 | static int |
652 | queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, | 637 | queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, |
653 | const nodemask_t *nodes, unsigned long flags, void *private) | 638 | nodemask_t *nodes, unsigned long flags, |
654 | { | 639 | struct list_head *pagelist) |
655 | int err = 0; | 640 | { |
656 | struct vm_area_struct *vma, *prev; | 641 | struct queue_pages qp = { |
657 | 642 | .pagelist = pagelist, | |
658 | vma = find_vma(mm, start); | 643 | .flags = flags, |
659 | if (!vma) | 644 | .nmask = nodes, |
660 | return -EFAULT; | 645 | .prev = NULL, |
661 | prev = NULL; | 646 | }; |
662 | for (; vma && vma->vm_start < end; vma = vma->vm_next) { | 647 | struct mm_walk queue_pages_walk = { |
663 | unsigned long endvma = vma->vm_end; | 648 | .hugetlb_entry = queue_pages_hugetlb, |
664 | 649 | .pmd_entry = queue_pages_pte_range, | |
665 | if (endvma > end) | 650 | .test_walk = queue_pages_test_walk, |
666 | endvma = end; | 651 | .mm = mm, |
667 | if (vma->vm_start > start) | 652 | .private = &qp, |
668 | start = vma->vm_start; | 653 | }; |
669 | 654 | ||
670 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | 655 | return walk_page_range(start, end, &queue_pages_walk); |
671 | if (!vma->vm_next && vma->vm_end < end) | ||
672 | return -EFAULT; | ||
673 | if (prev && prev->vm_end < vma->vm_start) | ||
674 | return -EFAULT; | ||
675 | } | ||
676 | |||
677 | if (flags & MPOL_MF_LAZY) { | ||
678 | /* Similar to task_numa_work, skip inaccessible VMAs */ | ||
679 | if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) | ||
680 | change_prot_numa(vma, start, endvma); | ||
681 | goto next; | ||
682 | } | ||
683 | |||
684 | if ((flags & MPOL_MF_STRICT) || | ||
685 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | ||
686 | vma_migratable(vma))) { | ||
687 | |||
688 | err = queue_pages_pgd_range(vma, start, endvma, nodes, | ||
689 | flags, private); | ||
690 | if (err) | ||
691 | break; | ||
692 | } | ||
693 | next: | ||
694 | prev = vma; | ||
695 | } | ||
696 | return err; | ||
697 | } | 656 | } |
698 | 657 | ||
699 | /* | 658 | /* |
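A minimal sketch of the walk_page_range() callback contract that the rewritten queue_pages_range() above relies on. The demo_* names are hypothetical; only the mm_walk fields and return-value semantics are taken from the code in this patch.

/*
 * test_walk: return 0 to descend into the VMA's page tables, 1 to skip the
 * VMA without error, or a negative errno to abort the whole walk.
 */
static int demo_test_walk(unsigned long start, unsigned long end,
                          struct mm_walk *walk)
{
        if (walk->vma->vm_flags & VM_PFNMAP)
                return 1;               /* skip this VMA, keep walking */
        return 0;                       /* walk this VMA */
}

static int demo_pmd_entry(pmd_t *pmd, unsigned long addr,
                          unsigned long end, struct mm_walk *walk)
{
        return 0;                       /* called for each pmd-sized range */
}

static int demo_walk(struct mm_struct *mm, unsigned long start,
                     unsigned long end)
{
        struct mm_walk walk = {
                .pmd_entry = demo_pmd_entry,
                .test_walk = demo_test_walk,
                .mm        = mm,
        };

        return walk_page_range(start, end, &walk);
}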
@@ -1988,43 +1947,63 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
1988 | * @order:Order of the GFP allocation. | 1947 | * @order:Order of the GFP allocation. |
1989 | * @vma: Pointer to VMA or NULL if not available. | 1948 | * @vma: Pointer to VMA or NULL if not available. |
1990 | * @addr: Virtual Address of the allocation. Must be inside the VMA. | 1949 | * @addr: Virtual Address of the allocation. Must be inside the VMA. |
1950 | * @node: Which node to prefer for allocation (modulo policy). | ||
1951 | * @hugepage: for hugepages try only the preferred node if possible | ||
1991 | * | 1952 | * |
1992 | * This function allocates a page from the kernel page pool and applies | 1953 | * This function allocates a page from the kernel page pool and applies |
1993 | * a NUMA policy associated with the VMA or the current process. | 1954 | * a NUMA policy associated with the VMA or the current process. |
1994 | * When VMA is not NULL caller must hold down_read on the mmap_sem of the | 1955 | * When VMA is not NULL caller must hold down_read on the mmap_sem of the |
1995 | * mm_struct of the VMA to prevent it from going away. Should be used for | 1956 | * mm_struct of the VMA to prevent it from going away. Should be used for |
1996 | * all allocations for pages that will be mapped into | 1957 | * all allocations for pages that will be mapped into user space. Returns |
1997 | * user space. Returns NULL when no page can be allocated. | 1958 | * NULL when no page can be allocated. |
1998 | * | ||
1999 | * Should be called with the mm_sem of the vma hold. | ||
2000 | */ | 1959 | */ |
2001 | struct page * | 1960 | struct page * |
2002 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | 1961 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, |
2003 | unsigned long addr, int node) | 1962 | unsigned long addr, int node, bool hugepage) |
2004 | { | 1963 | { |
2005 | struct mempolicy *pol; | 1964 | struct mempolicy *pol; |
2006 | struct page *page; | 1965 | struct page *page; |
2007 | unsigned int cpuset_mems_cookie; | 1966 | unsigned int cpuset_mems_cookie; |
1967 | struct zonelist *zl; | ||
1968 | nodemask_t *nmask; | ||
2008 | 1969 | ||
2009 | retry_cpuset: | 1970 | retry_cpuset: |
2010 | pol = get_vma_policy(vma, addr); | 1971 | pol = get_vma_policy(vma, addr); |
2011 | cpuset_mems_cookie = read_mems_allowed_begin(); | 1972 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2012 | 1973 | ||
2013 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 1974 | if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage && |
1975 | pol->mode != MPOL_INTERLEAVE)) { | ||
1976 | /* | ||
1977 | * For hugepage allocation and non-interleave policy which | ||
1978 | * allows the current node, we only try to allocate from the | ||
1979 | * current node and don't fall back to other nodes, as the | ||
1980 | * cost of remote accesses would likely offset THP benefits. | ||
1981 | * | ||
1982 | * If the policy is interleave, or does not allow the current | ||
1983 | * node in its nodemask, we allocate the standard way. | ||
1984 | */ | ||
1985 | nmask = policy_nodemask(gfp, pol); | ||
1986 | if (!nmask || node_isset(node, *nmask)) { | ||
1987 | mpol_cond_put(pol); | ||
1988 | page = alloc_pages_exact_node(node, gfp, order); | ||
1989 | goto out; | ||
1990 | } | ||
1991 | } | ||
1992 | |||
1993 | if (pol->mode == MPOL_INTERLEAVE) { | ||
2014 | unsigned nid; | 1994 | unsigned nid; |
2015 | 1995 | ||
2016 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); | 1996 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); |
2017 | mpol_cond_put(pol); | 1997 | mpol_cond_put(pol); |
2018 | page = alloc_page_interleave(gfp, order, nid); | 1998 | page = alloc_page_interleave(gfp, order, nid); |
2019 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 1999 | goto out; |
2020 | goto retry_cpuset; | ||
2021 | |||
2022 | return page; | ||
2023 | } | 2000 | } |
2024 | page = __alloc_pages_nodemask(gfp, order, | 2001 | |
2025 | policy_zonelist(gfp, pol, node), | 2002 | nmask = policy_nodemask(gfp, pol); |
2026 | policy_nodemask(gfp, pol)); | 2003 | zl = policy_zonelist(gfp, pol, node); |
2027 | mpol_cond_put(pol); | 2004 | mpol_cond_put(pol); |
2005 | page = __alloc_pages_nodemask(gfp, order, zl, nmask); | ||
2006 | out: | ||
2028 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 2007 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
2029 | goto retry_cpuset; | 2008 | goto retry_cpuset; |
2030 | return page; | 2009 | return page; |
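The new THP branch in alloc_pages_vma() only pins the allocation to the local node when the policy allows it. Restated as a standalone predicate (hypothetical helper name, logic copied from the hunk above):

/* Hypothetical helper: same decision as the hugepage branch above. */
static bool thp_should_alloc_local(struct mempolicy *pol, gfp_t gfp,
                                   int node, bool hugepage)
{
        nodemask_t *nmask;

        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) || !hugepage)
                return false;
        if (pol->mode == MPOL_INTERLEAVE)
                return false;           /* interleave: allocate the standard way */

        nmask = policy_nodemask(gfp, pol);
        /* No nodemask, or the local node is allowed: stay node-local. */
        return !nmask || node_isset(node, *nmask);
}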
@@ -2838,8 +2817,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | |||
2838 | p += snprintf(p, buffer + maxlen - p, "relative"); | 2817 | p += snprintf(p, buffer + maxlen - p, "relative"); |
2839 | } | 2818 | } |
2840 | 2819 | ||
2841 | if (!nodes_empty(nodes)) { | 2820 | if (!nodes_empty(nodes)) |
2842 | p += snprintf(p, buffer + maxlen - p, ":"); | 2821 | p += scnprintf(p, buffer + maxlen - p, ":%*pbl", |
2843 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); | 2822 | nodemask_pr_args(&nodes)); |
2844 | } | ||
2845 | } | 2823 | } |
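The mpol_to_str() change leans on the kernel's bitmap-list format: "%*pbl" consumes a bit count plus a bitmap pointer, which is what nodemask_pr_args() expands to. A small sketch of the output it produces (hypothetical demo function):

/* Hypothetical demo: what the ":%*pbl" conversion above prints. */
static void demo_format_nodes(void)
{
        nodemask_t nodes = NODE_MASK_NONE;
        char buf[64];

        node_set(0, nodes);
        node_set(1, nodes);
        node_set(3, nodes);
        scnprintf(buf, sizeof(buf), ":%*pbl", nodemask_pr_args(&nodes));
        /* buf now holds ":0-1,3" */
}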
diff --git a/mm/migrate.c b/mm/migrate.c index 344cdf692fc8..85e042686031 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -179,37 +179,6 @@ out: | |||
179 | } | 179 | } |
180 | 180 | ||
181 | /* | 181 | /* |
182 | * Congratulations to trinity for discovering this bug. | ||
183 | * mm/fremap.c's remap_file_pages() accepts any range within a single vma to | ||
184 | * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then | ||
185 | * replace the specified range by file ptes throughout (maybe populated after). | ||
186 | * If page migration finds a page within that range, while it's still located | ||
187 | * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: | ||
188 | * zap_pte() clears the temporary migration entry before mmap_sem is dropped. | ||
189 | * But if the migrating page is in a part of the vma outside the range to be | ||
190 | * remapped, then it will not be cleared, and remove_migration_ptes() needs to | ||
191 | * deal with it. Fortunately, this part of the vma is of course still linear, | ||
192 | * so we just need to use linear location on the nonlinear list. | ||
193 | */ | ||
194 | static int remove_linear_migration_ptes_from_nonlinear(struct page *page, | ||
195 | struct address_space *mapping, void *arg) | ||
196 | { | ||
197 | struct vm_area_struct *vma; | ||
198 | /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ | ||
199 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
200 | unsigned long addr; | ||
201 | |||
202 | list_for_each_entry(vma, | ||
203 | &mapping->i_mmap_nonlinear, shared.nonlinear) { | ||
204 | |||
205 | addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | ||
206 | if (addr >= vma->vm_start && addr < vma->vm_end) | ||
207 | remove_migration_pte(page, vma, addr, arg); | ||
208 | } | ||
209 | return SWAP_AGAIN; | ||
210 | } | ||
211 | |||
212 | /* | ||
213 | * Get rid of all migration entries and replace them by | 182 | * Get rid of all migration entries and replace them by |
214 | * references to the indicated page. | 183 | * references to the indicated page. |
215 | */ | 184 | */ |
@@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) | |||
218 | struct rmap_walk_control rwc = { | 187 | struct rmap_walk_control rwc = { |
219 | .rmap_one = remove_migration_pte, | 188 | .rmap_one = remove_migration_pte, |
220 | .arg = old, | 189 | .arg = old, |
221 | .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, | ||
222 | }; | 190 | }; |
223 | 191 | ||
224 | rmap_walk(new, &rwc); | 192 | rmap_walk(new, &rwc); |
@@ -229,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) | |||
229 | * get to the page and wait until migration is finished. | 197 | * get to the page and wait until migration is finished. |
230 | * When we return from this function the fault will be retried. | 198 | * When we return from this function the fault will be retried. |
231 | */ | 199 | */ |
232 | static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, | 200 | void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, |
233 | spinlock_t *ptl) | 201 | spinlock_t *ptl) |
234 | { | 202 | { |
235 | pte_t pte; | 203 | pte_t pte; |
@@ -1268,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
1268 | goto put_and_set; | 1236 | goto put_and_set; |
1269 | 1237 | ||
1270 | if (PageHuge(page)) { | 1238 | if (PageHuge(page)) { |
1271 | isolate_huge_page(page, &pagelist); | 1239 | if (PageHead(page)) |
1240 | isolate_huge_page(page, &pagelist); | ||
1272 | goto put_and_set; | 1241 | goto put_and_set; |
1273 | } | 1242 | } |
1274 | 1243 | ||
@@ -1685,12 +1654,6 @@ bool pmd_trans_migrating(pmd_t pmd) | |||
1685 | return PageLocked(page); | 1654 | return PageLocked(page); |
1686 | } | 1655 | } |
1687 | 1656 | ||
1688 | void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) | ||
1689 | { | ||
1690 | struct page *page = pmd_page(*pmd); | ||
1691 | wait_on_page_locked(page); | ||
1692 | } | ||
1693 | |||
1694 | /* | 1657 | /* |
1695 | * Attempt to migrate a misplaced page to the specified destination | 1658 | * Attempt to migrate a misplaced page to the specified destination |
1696 | * node. Caller is expected to have an elevated reference count on | 1659 | * node. Caller is expected to have an elevated reference count on |
@@ -1884,7 +1847,7 @@ out_fail: | |||
1884 | out_dropref: | 1847 | out_dropref: |
1885 | ptl = pmd_lock(mm, pmd); | 1848 | ptl = pmd_lock(mm, pmd); |
1886 | if (pmd_same(*pmd, entry)) { | 1849 | if (pmd_same(*pmd, entry)) { |
1887 | entry = pmd_mknonnuma(entry); | 1850 | entry = pmd_modify(entry, vma->vm_page_prot); |
1888 | set_pmd_at(mm, mmun_start, pmd, entry); | 1851 | set_pmd_at(mm, mmun_start, pmd, entry); |
1889 | update_mmu_cache_pmd(vma, address, &entry); | 1852 | update_mmu_cache_pmd(vma, address, &entry); |
1890 | } | 1853 | } |
diff --git a/mm/mincore.c b/mm/mincore.c index c8c528b36641..be25efde64a4 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -19,38 +19,25 @@ | |||
19 | #include <asm/uaccess.h> | 19 | #include <asm/uaccess.h> |
20 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
21 | 21 | ||
22 | static void mincore_hugetlb_page_range(struct vm_area_struct *vma, | 22 | static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, |
23 | unsigned long addr, unsigned long end, | 23 | unsigned long end, struct mm_walk *walk) |
24 | unsigned char *vec) | ||
25 | { | 24 | { |
26 | #ifdef CONFIG_HUGETLB_PAGE | 25 | #ifdef CONFIG_HUGETLB_PAGE |
27 | struct hstate *h; | 26 | unsigned char present; |
27 | unsigned char *vec = walk->private; | ||
28 | 28 | ||
29 | h = hstate_vma(vma); | 29 | /* |
30 | while (1) { | 30 | * Hugepages mapped by a user process are always in RAM and never |
31 | unsigned char present; | 31 | * swapped out, but theoretically it needs to be checked. |
32 | pte_t *ptep; | 32 | */ |
33 | /* | 33 | present = pte && !huge_pte_none(huge_ptep_get(pte)); |
34 | * Huge pages are always in RAM for now, but | 34 | for (; addr != end; vec++, addr += PAGE_SIZE) |
35 | * theoretically it needs to be checked. | 35 | *vec = present; |
36 | */ | 36 | walk->private = vec; |
37 | ptep = huge_pte_offset(current->mm, | ||
38 | addr & huge_page_mask(h)); | ||
39 | present = ptep && !huge_pte_none(huge_ptep_get(ptep)); | ||
40 | while (1) { | ||
41 | *vec = present; | ||
42 | vec++; | ||
43 | addr += PAGE_SIZE; | ||
44 | if (addr == end) | ||
45 | return; | ||
46 | /* check hugepage border */ | ||
47 | if (!(addr & ~huge_page_mask(h))) | ||
48 | break; | ||
49 | } | ||
50 | } | ||
51 | #else | 37 | #else |
52 | BUG(); | 38 | BUG(); |
53 | #endif | 39 | #endif |
40 | return 0; | ||
54 | } | 41 | } |
55 | 42 | ||
56 | /* | 43 | /* |
@@ -94,9 +81,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) | |||
94 | return present; | 81 | return present; |
95 | } | 82 | } |
96 | 83 | ||
97 | static void mincore_unmapped_range(struct vm_area_struct *vma, | 84 | static int __mincore_unmapped_range(unsigned long addr, unsigned long end, |
98 | unsigned long addr, unsigned long end, | 85 | struct vm_area_struct *vma, unsigned char *vec) |
99 | unsigned char *vec) | ||
100 | { | 86 | { |
101 | unsigned long nr = (end - addr) >> PAGE_SHIFT; | 87 | unsigned long nr = (end - addr) >> PAGE_SHIFT; |
102 | int i; | 88 | int i; |
@@ -111,30 +97,47 @@ static void mincore_unmapped_range(struct vm_area_struct *vma, | |||
111 | for (i = 0; i < nr; i++) | 97 | for (i = 0; i < nr; i++) |
112 | vec[i] = 0; | 98 | vec[i] = 0; |
113 | } | 99 | } |
100 | return nr; | ||
101 | } | ||
102 | |||
103 | static int mincore_unmapped_range(unsigned long addr, unsigned long end, | ||
104 | struct mm_walk *walk) | ||
105 | { | ||
106 | walk->private += __mincore_unmapped_range(addr, end, | ||
107 | walk->vma, walk->private); | ||
108 | return 0; | ||
114 | } | 109 | } |
115 | 110 | ||
116 | static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 111 | static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
117 | unsigned long addr, unsigned long end, | 112 | struct mm_walk *walk) |
118 | unsigned char *vec) | ||
119 | { | 113 | { |
120 | unsigned long next; | ||
121 | spinlock_t *ptl; | 114 | spinlock_t *ptl; |
115 | struct vm_area_struct *vma = walk->vma; | ||
122 | pte_t *ptep; | 116 | pte_t *ptep; |
117 | unsigned char *vec = walk->private; | ||
118 | int nr = (end - addr) >> PAGE_SHIFT; | ||
119 | |||
120 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | ||
121 | memset(vec, 1, nr); | ||
122 | spin_unlock(ptl); | ||
123 | goto out; | ||
124 | } | ||
125 | |||
126 | if (pmd_trans_unstable(pmd)) { | ||
127 | __mincore_unmapped_range(addr, end, vma, vec); | ||
128 | goto out; | ||
129 | } | ||
123 | 130 | ||
124 | ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 131 | ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); |
125 | do { | 132 | for (; addr != end; ptep++, addr += PAGE_SIZE) { |
126 | pte_t pte = *ptep; | 133 | pte_t pte = *ptep; |
127 | pgoff_t pgoff; | ||
128 | 134 | ||
129 | next = addr + PAGE_SIZE; | ||
130 | if (pte_none(pte)) | 135 | if (pte_none(pte)) |
131 | mincore_unmapped_range(vma, addr, next, vec); | 136 | __mincore_unmapped_range(addr, addr + PAGE_SIZE, |
137 | vma, vec); | ||
132 | else if (pte_present(pte)) | 138 | else if (pte_present(pte)) |
133 | *vec = 1; | 139 | *vec = 1; |
134 | else if (pte_file(pte)) { | 140 | else { /* pte is a swap entry */ |
135 | pgoff = pte_to_pgoff(pte); | ||
136 | *vec = mincore_page(vma->vm_file->f_mapping, pgoff); | ||
137 | } else { /* pte is a swap entry */ | ||
138 | swp_entry_t entry = pte_to_swp_entry(pte); | 141 | swp_entry_t entry = pte_to_swp_entry(pte); |
139 | 142 | ||
140 | if (non_swap_entry(entry)) { | 143 | if (non_swap_entry(entry)) { |
@@ -145,9 +148,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
145 | *vec = 1; | 148 | *vec = 1; |
146 | } else { | 149 | } else { |
147 | #ifdef CONFIG_SWAP | 150 | #ifdef CONFIG_SWAP |
148 | pgoff = entry.val; | ||
149 | *vec = mincore_page(swap_address_space(entry), | 151 | *vec = mincore_page(swap_address_space(entry), |
150 | pgoff); | 152 | entry.val); |
151 | #else | 153 | #else |
152 | WARN_ON(1); | 154 | WARN_ON(1); |
153 | *vec = 1; | 155 | *vec = 1; |
@@ -155,69 +157,12 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
155 | } | 157 | } |
156 | } | 158 | } |
157 | vec++; | 159 | vec++; |
158 | } while (ptep++, addr = next, addr != end); | 160 | } |
159 | pte_unmap_unlock(ptep - 1, ptl); | 161 | pte_unmap_unlock(ptep - 1, ptl); |
160 | } | 162 | out: |
161 | 163 | walk->private += nr; | |
162 | static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 164 | cond_resched(); |
163 | unsigned long addr, unsigned long end, | 165 | return 0; |
164 | unsigned char *vec) | ||
165 | { | ||
166 | unsigned long next; | ||
167 | pmd_t *pmd; | ||
168 | |||
169 | pmd = pmd_offset(pud, addr); | ||
170 | do { | ||
171 | next = pmd_addr_end(addr, end); | ||
172 | if (pmd_trans_huge(*pmd)) { | ||
173 | if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { | ||
174 | vec += (next - addr) >> PAGE_SHIFT; | ||
175 | continue; | ||
176 | } | ||
177 | /* fall through */ | ||
178 | } | ||
179 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
180 | mincore_unmapped_range(vma, addr, next, vec); | ||
181 | else | ||
182 | mincore_pte_range(vma, pmd, addr, next, vec); | ||
183 | vec += (next - addr) >> PAGE_SHIFT; | ||
184 | } while (pmd++, addr = next, addr != end); | ||
185 | } | ||
186 | |||
187 | static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | ||
188 | unsigned long addr, unsigned long end, | ||
189 | unsigned char *vec) | ||
190 | { | ||
191 | unsigned long next; | ||
192 | pud_t *pud; | ||
193 | |||
194 | pud = pud_offset(pgd, addr); | ||
195 | do { | ||
196 | next = pud_addr_end(addr, end); | ||
197 | if (pud_none_or_clear_bad(pud)) | ||
198 | mincore_unmapped_range(vma, addr, next, vec); | ||
199 | else | ||
200 | mincore_pmd_range(vma, pud, addr, next, vec); | ||
201 | vec += (next - addr) >> PAGE_SHIFT; | ||
202 | } while (pud++, addr = next, addr != end); | ||
203 | } | ||
204 | |||
205 | static void mincore_page_range(struct vm_area_struct *vma, | ||
206 | unsigned long addr, unsigned long end, | ||
207 | unsigned char *vec) | ||
208 | { | ||
209 | unsigned long next; | ||
210 | pgd_t *pgd; | ||
211 | |||
212 | pgd = pgd_offset(vma->vm_mm, addr); | ||
213 | do { | ||
214 | next = pgd_addr_end(addr, end); | ||
215 | if (pgd_none_or_clear_bad(pgd)) | ||
216 | mincore_unmapped_range(vma, addr, next, vec); | ||
217 | else | ||
218 | mincore_pud_range(vma, pgd, addr, next, vec); | ||
219 | vec += (next - addr) >> PAGE_SHIFT; | ||
220 | } while (pgd++, addr = next, addr != end); | ||
221 | } | 166 | } |
222 | 167 | ||
223 | /* | 168 | /* |
@@ -229,18 +174,22 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v | |||
229 | { | 174 | { |
230 | struct vm_area_struct *vma; | 175 | struct vm_area_struct *vma; |
231 | unsigned long end; | 176 | unsigned long end; |
177 | int err; | ||
178 | struct mm_walk mincore_walk = { | ||
179 | .pmd_entry = mincore_pte_range, | ||
180 | .pte_hole = mincore_unmapped_range, | ||
181 | .hugetlb_entry = mincore_hugetlb, | ||
182 | .private = vec, | ||
183 | }; | ||
232 | 184 | ||
233 | vma = find_vma(current->mm, addr); | 185 | vma = find_vma(current->mm, addr); |
234 | if (!vma || addr < vma->vm_start) | 186 | if (!vma || addr < vma->vm_start) |
235 | return -ENOMEM; | 187 | return -ENOMEM; |
236 | 188 | mincore_walk.mm = vma->vm_mm; | |
237 | end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); | 189 | end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); |
238 | 190 | err = walk_page_range(addr, end, &mincore_walk); | |
239 | if (is_vm_hugetlb_page(vma)) | 191 | if (err < 0) |
240 | mincore_hugetlb_page_range(vma, addr, end, vec); | 192 | return err; |
241 | else | ||
242 | mincore_page_range(vma, addr, end, vec); | ||
243 | |||
244 | return (end - addr) >> PAGE_SHIFT; | 193 | return (end - addr) >> PAGE_SHIFT; |
245 | } | 194 | } |
246 | 195 | ||
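For reference, the user-visible contract that the rewritten walker serves is unchanged: mincore(2) fills one byte per page, with bit 0 set when the page is resident. A standard userspace example (not part of the patch):

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        size_t len = 16 * page;
        unsigned char *vec = malloc(len / page);
        void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (addr == MAP_FAILED || !vec)
                return 1;
        memset(addr, 0, 4 * page);              /* fault in the first four pages */
        if (mincore(addr, len, vec) == 0) {
                for (size_t i = 0; i < len / page; i++)
                        printf("page %zu: %s\n", i,
                               (vec[i] & 1) ? "resident" : "not resident");
        }
        free(vec);
        munmap(addr, len);
        return 0;
}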
diff --git a/mm/mm_init.c b/mm/mm_init.c index 4074caf9936b..5f420f7fafa1 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c | |||
@@ -14,14 +14,14 @@ | |||
14 | #include "internal.h" | 14 | #include "internal.h" |
15 | 15 | ||
16 | #ifdef CONFIG_DEBUG_MEMORY_INIT | 16 | #ifdef CONFIG_DEBUG_MEMORY_INIT |
17 | int mminit_loglevel; | 17 | int __meminitdata mminit_loglevel; |
18 | 18 | ||
19 | #ifndef SECTIONS_SHIFT | 19 | #ifndef SECTIONS_SHIFT |
20 | #define SECTIONS_SHIFT 0 | 20 | #define SECTIONS_SHIFT 0 |
21 | #endif | 21 | #endif |
22 | 22 | ||
23 | /* The zonelists are simply reported, validation is manual. */ | 23 | /* The zonelists are simply reported, validation is manual. */ |
24 | void mminit_verify_zonelist(void) | 24 | void __init mminit_verify_zonelist(void) |
25 | { | 25 | { |
26 | int nid; | 26 | int nid; |
27 | 27 | ||
@@ -152,7 +152,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); | |||
152 | */ | 152 | */ |
153 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | 153 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) |
154 | { | 154 | { |
155 | unsigned long free, allowed, reserve; | 155 | long free, allowed, reserve; |
156 | 156 | ||
157 | VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < | 157 | VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < |
158 | -(s64)vm_committed_as_batch * num_online_cpus(), | 158 | -(s64)vm_committed_as_batch * num_online_cpus(), |
@@ -220,7 +220,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
220 | */ | 220 | */ |
221 | if (mm) { | 221 | if (mm) { |
222 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); | 222 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); |
223 | allowed -= min(mm->total_vm / 32, reserve); | 223 | allowed -= min_t(long, mm->total_vm / 32, reserve); |
224 | } | 224 | } |
225 | 225 | ||
226 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) | 226 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
@@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma, | |||
243 | mapping_unmap_writable(mapping); | 243 | mapping_unmap_writable(mapping); |
244 | 244 | ||
245 | flush_dcache_mmap_lock(mapping); | 245 | flush_dcache_mmap_lock(mapping); |
246 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 246 | vma_interval_tree_remove(vma, &mapping->i_mmap); |
247 | list_del_init(&vma->shared.nonlinear); | ||
248 | else | ||
249 | vma_interval_tree_remove(vma, &mapping->i_mmap); | ||
250 | flush_dcache_mmap_unlock(mapping); | 247 | flush_dcache_mmap_unlock(mapping); |
251 | } | 248 | } |
252 | 249 | ||
@@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma) | |||
649 | atomic_inc(&mapping->i_mmap_writable); | 646 | atomic_inc(&mapping->i_mmap_writable); |
650 | 647 | ||
651 | flush_dcache_mmap_lock(mapping); | 648 | flush_dcache_mmap_lock(mapping); |
652 | if (unlikely(vma->vm_flags & VM_NONLINEAR)) | 649 | vma_interval_tree_insert(vma, &mapping->i_mmap); |
653 | vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); | ||
654 | else | ||
655 | vma_interval_tree_insert(vma, &mapping->i_mmap); | ||
656 | flush_dcache_mmap_unlock(mapping); | 650 | flush_dcache_mmap_unlock(mapping); |
657 | } | 651 | } |
658 | } | 652 | } |
@@ -778,23 +772,22 @@ again: remove_next = 1 + (end > next->vm_end); | |||
778 | if (exporter && exporter->anon_vma && !importer->anon_vma) { | 772 | if (exporter && exporter->anon_vma && !importer->anon_vma) { |
779 | int error; | 773 | int error; |
780 | 774 | ||
775 | importer->anon_vma = exporter->anon_vma; | ||
781 | error = anon_vma_clone(importer, exporter); | 776 | error = anon_vma_clone(importer, exporter); |
782 | if (error) | 777 | if (error) { |
778 | importer->anon_vma = NULL; | ||
783 | return error; | 779 | return error; |
784 | importer->anon_vma = exporter->anon_vma; | 780 | } |
785 | } | 781 | } |
786 | } | 782 | } |
787 | 783 | ||
788 | if (file) { | 784 | if (file) { |
789 | mapping = file->f_mapping; | 785 | mapping = file->f_mapping; |
790 | if (!(vma->vm_flags & VM_NONLINEAR)) { | 786 | root = &mapping->i_mmap; |
791 | root = &mapping->i_mmap; | 787 | uprobe_munmap(vma, vma->vm_start, vma->vm_end); |
792 | uprobe_munmap(vma, vma->vm_start, vma->vm_end); | ||
793 | 788 | ||
794 | if (adjust_next) | 789 | if (adjust_next) |
795 | uprobe_munmap(next, next->vm_start, | 790 | uprobe_munmap(next, next->vm_start, next->vm_end); |
796 | next->vm_end); | ||
797 | } | ||
798 | 791 | ||
799 | i_mmap_lock_write(mapping); | 792 | i_mmap_lock_write(mapping); |
800 | if (insert) { | 793 | if (insert) { |
@@ -2099,14 +2092,17 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
2099 | { | 2092 | { |
2100 | struct mm_struct *mm = vma->vm_mm; | 2093 | struct mm_struct *mm = vma->vm_mm; |
2101 | struct rlimit *rlim = current->signal->rlim; | 2094 | struct rlimit *rlim = current->signal->rlim; |
2102 | unsigned long new_start; | 2095 | unsigned long new_start, actual_size; |
2103 | 2096 | ||
2104 | /* address space limit tests */ | 2097 | /* address space limit tests */ |
2105 | if (!may_expand_vm(mm, grow)) | 2098 | if (!may_expand_vm(mm, grow)) |
2106 | return -ENOMEM; | 2099 | return -ENOMEM; |
2107 | 2100 | ||
2108 | /* Stack limit test */ | 2101 | /* Stack limit test */ |
2109 | if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) | 2102 | actual_size = size; |
2103 | if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) | ||
2104 | actual_size -= PAGE_SIZE; | ||
2105 | if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) | ||
2110 | return -ENOMEM; | 2106 | return -ENOMEM; |
2111 | 2107 | ||
2112 | /* mlock limit tests */ | 2108 | /* mlock limit tests */ |
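Worked example for the guard-page adjustment above, assuming 4 KiB pages and RLIMIT_STACK = 8 MiB: a GROWSDOWN stack that expands to 8 MiB + 4 KiB holds 8 MiB of usable stack plus the one-page guard gap. The old check compared the full 8 MiB + 4 KiB against the 8 MiB limit and refused the expansion; the new check subtracts the guard page first, compares 8 MiB against 8 MiB, and allows it.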
@@ -2629,6 +2625,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) | |||
2629 | return vm_munmap(addr, len); | 2625 | return vm_munmap(addr, len); |
2630 | } | 2626 | } |
2631 | 2627 | ||
2628 | |||
2629 | /* | ||
2630 | * Emulation of deprecated remap_file_pages() syscall. | ||
2631 | */ | ||
2632 | SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, | ||
2633 | unsigned long, prot, unsigned long, pgoff, unsigned long, flags) | ||
2634 | { | ||
2635 | |||
2636 | struct mm_struct *mm = current->mm; | ||
2637 | struct vm_area_struct *vma; | ||
2638 | unsigned long populate = 0; | ||
2639 | unsigned long ret = -EINVAL; | ||
2640 | struct file *file; | ||
2641 | |||
2642 | pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " | ||
2643 | "See Documentation/vm/remap_file_pages.txt.\n", | ||
2644 | current->comm, current->pid); | ||
2645 | |||
2646 | if (prot) | ||
2647 | return ret; | ||
2648 | start = start & PAGE_MASK; | ||
2649 | size = size & PAGE_MASK; | ||
2650 | |||
2651 | if (start + size <= start) | ||
2652 | return ret; | ||
2653 | |||
2654 | /* Does pgoff wrap? */ | ||
2655 | if (pgoff + (size >> PAGE_SHIFT) < pgoff) | ||
2656 | return ret; | ||
2657 | |||
2658 | down_write(&mm->mmap_sem); | ||
2659 | vma = find_vma(mm, start); | ||
2660 | |||
2661 | if (!vma || !(vma->vm_flags & VM_SHARED)) | ||
2662 | goto out; | ||
2663 | |||
2664 | if (start < vma->vm_start || start + size > vma->vm_end) | ||
2665 | goto out; | ||
2666 | |||
2667 | if (pgoff == linear_page_index(vma, start)) { | ||
2668 | ret = 0; | ||
2669 | goto out; | ||
2670 | } | ||
2671 | |||
2672 | prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; | ||
2673 | prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0; | ||
2674 | prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0; | ||
2675 | |||
2676 | flags &= MAP_NONBLOCK; | ||
2677 | flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE; | ||
2678 | if (vma->vm_flags & VM_LOCKED) { | ||
2679 | flags |= MAP_LOCKED; | ||
2680 | /* drop PG_Mlocked flag for over-mapped range */ | ||
2681 | munlock_vma_pages_range(vma, start, start + size); | ||
2682 | } | ||
2683 | |||
2684 | file = get_file(vma->vm_file); | ||
2685 | ret = do_mmap_pgoff(vma->vm_file, start, size, | ||
2686 | prot, flags, pgoff, &populate); | ||
2687 | fput(file); | ||
2688 | out: | ||
2689 | up_write(&mm->mmap_sem); | ||
2690 | if (populate) | ||
2691 | mm_populate(ret, populate); | ||
2692 | if (!IS_ERR_VALUE(ret)) | ||
2693 | ret = 0; | ||
2694 | return ret; | ||
2695 | } | ||
2696 | |||
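Seen from userspace, the emulation above turns a nonlinear remap of a shared mapping into an ordinary fixed, populated mmap() of the same file at the new offset. A hedged sketch of the equivalence (the protection flags are an assumption here; the kernel derives them from the existing VMA):

#define _DEFAULT_SOURCE
#include <sys/types.h>
#include <sys/mman.h>

/*
 * Old call:   remap_file_pages(start, size, 0, new_pgoff, 0);
 * Equivalent: re-map the same file window at the new page offset.
 * Returns MAP_FAILED on error, just like mmap().
 */
static void *remap_window(void *start, size_t size, int fd,
                          off_t new_pgoff, long page_size)
{
        return mmap(start, size, PROT_READ | PROT_WRITE,       /* assumed prot */
                    MAP_SHARED | MAP_FIXED | MAP_POPULATE,
                    fd, new_pgoff * page_size);
}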
2632 | static inline void verify_mm_writelocked(struct mm_struct *mm) | 2697 | static inline void verify_mm_writelocked(struct mm_struct *mm) |
2633 | { | 2698 | { |
2634 | #ifdef CONFIG_DEBUG_VM | 2699 | #ifdef CONFIG_DEBUG_VM |
@@ -2786,9 +2851,6 @@ void exit_mmap(struct mm_struct *mm) | |||
2786 | vma = remove_vma(vma); | 2851 | vma = remove_vma(vma); |
2787 | } | 2852 | } |
2788 | vm_unacct_memory(nr_accounted); | 2853 | vm_unacct_memory(nr_accounted); |
2789 | |||
2790 | WARN_ON(atomic_long_read(&mm->nr_ptes) > | ||
2791 | (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); | ||
2792 | } | 2854 | } |
2793 | 2855 | ||
2794 | /* Insert vm structure into process list sorted by address | 2856 | /* Insert vm structure into process list sorted by address |
@@ -3103,8 +3165,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
3103 | * | 3165 | * |
3104 | * mmap_sem in write mode is required in order to block all operations | 3166 | * mmap_sem in write mode is required in order to block all operations |
3105 | * that could modify pagetables and free pages without need of | 3167 | * that could modify pagetables and free pages without need of |
3106 | * altering the vma layout (for example populate_range() with | 3168 | * altering the vma layout. It's also needed in write mode to avoid new |
3107 | * nonlinear vmas). It's also needed in write mode to avoid new | ||
3108 | * anon_vmas to be associated with existing vmas. | 3169 | * anon_vmas to be associated with existing vmas. |
3109 | * | 3170 | * |
3110 | * A single task can't take more than one mm_take_all_locks() in a row | 3171 | * A single task can't take more than one mm_take_all_locks() in a row |
diff --git a/mm/mmzone.c b/mm/mmzone.c index bf34fb8556db..7d87ebb0d632 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -54,8 +54,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) | |||
54 | /* Returns the next zone at or below highest_zoneidx in a zonelist */ | 54 | /* Returns the next zone at or below highest_zoneidx in a zonelist */ |
55 | struct zoneref *next_zones_zonelist(struct zoneref *z, | 55 | struct zoneref *next_zones_zonelist(struct zoneref *z, |
56 | enum zone_type highest_zoneidx, | 56 | enum zone_type highest_zoneidx, |
57 | nodemask_t *nodes, | 57 | nodemask_t *nodes) |
58 | struct zone **zone) | ||
59 | { | 58 | { |
60 | /* | 59 | /* |
61 | * Find the next suitable zone to use for the allocation. | 60 | * Find the next suitable zone to use for the allocation. |
@@ -69,7 +68,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, | |||
69 | (z->zone && !zref_in_nodemask(z, nodes))) | 68 | (z->zone && !zref_in_nodemask(z, nodes))) |
70 | z++; | 69 | z++; |
71 | 70 | ||
72 | *zone = zonelist_zone(z); | ||
73 | return z; | 71 | return z; |
74 | } | 72 | } |
75 | 73 | ||
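With the out parameter gone, callers of next_zones_zonelist() obtain the zone from the returned zoneref themselves. A sketch of the adapted pattern (hypothetical helper):

static struct zone *demo_first_allowed_zone(struct zonelist *zonelist,
                                            enum zone_type highest_zoneidx,
                                            nodemask_t *nodes)
{
        struct zoneref *z;

        z = next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes);
        return zonelist_zone(z);        /* formerly returned via *zone */
}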
diff --git a/mm/mprotect.c b/mm/mprotect.c index ace93454ce8e..44727811bf4c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -75,37 +75,35 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
75 | oldpte = *pte; | 75 | oldpte = *pte; |
76 | if (pte_present(oldpte)) { | 76 | if (pte_present(oldpte)) { |
77 | pte_t ptent; | 77 | pte_t ptent; |
78 | bool updated = false; | ||
79 | 78 | ||
80 | if (!prot_numa) { | 79 | /* |
81 | ptent = ptep_modify_prot_start(mm, addr, pte); | 80 | * Avoid trapping faults against the zero or KSM |
82 | if (pte_numa(ptent)) | 81 | * pages. See similar comment in change_huge_pmd. |
83 | ptent = pte_mknonnuma(ptent); | 82 | */ |
84 | ptent = pte_modify(ptent, newprot); | 83 | if (prot_numa) { |
85 | /* | ||
86 | * Avoid taking write faults for pages we | ||
87 | * know to be dirty. | ||
88 | */ | ||
89 | if (dirty_accountable && pte_dirty(ptent) && | ||
90 | (pte_soft_dirty(ptent) || | ||
91 | !(vma->vm_flags & VM_SOFTDIRTY))) | ||
92 | ptent = pte_mkwrite(ptent); | ||
93 | ptep_modify_prot_commit(mm, addr, pte, ptent); | ||
94 | updated = true; | ||
95 | } else { | ||
96 | struct page *page; | 84 | struct page *page; |
97 | 85 | ||
98 | page = vm_normal_page(vma, addr, oldpte); | 86 | page = vm_normal_page(vma, addr, oldpte); |
99 | if (page && !PageKsm(page)) { | 87 | if (!page || PageKsm(page)) |
100 | if (!pte_numa(oldpte)) { | 88 | continue; |
101 | ptep_set_numa(mm, addr, pte); | 89 | |
102 | updated = true; | 90 | /* Avoid TLB flush if possible */ |
103 | } | 91 | if (pte_protnone(oldpte)) |
104 | } | 92 | continue; |
105 | } | 93 | } |
106 | if (updated) | 94 | |
107 | pages++; | 95 | ptent = ptep_modify_prot_start(mm, addr, pte); |
108 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { | 96 | ptent = pte_modify(ptent, newprot); |
97 | |||
98 | /* Avoid taking write faults for known dirty pages */ | ||
99 | if (dirty_accountable && pte_dirty(ptent) && | ||
100 | (pte_soft_dirty(ptent) || | ||
101 | !(vma->vm_flags & VM_SOFTDIRTY))) { | ||
102 | ptent = pte_mkwrite(ptent); | ||
103 | } | ||
104 | ptep_modify_prot_commit(mm, addr, pte, ptent); | ||
105 | pages++; | ||
106 | } else if (IS_ENABLED(CONFIG_MIGRATION)) { | ||
109 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 107 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
110 | 108 | ||
111 | if (is_write_migration_entry(entry)) { | 109 | if (is_write_migration_entry(entry)) { |
diff --git a/mm/mremap.c b/mm/mremap.c index 17fa018f5f39..57dadc025c64 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -81,8 +81,6 @@ static pte_t move_soft_dirty_pte(pte_t pte) | |||
81 | pte = pte_mksoft_dirty(pte); | 81 | pte = pte_mksoft_dirty(pte); |
82 | else if (is_swap_pte(pte)) | 82 | else if (is_swap_pte(pte)) |
83 | pte = pte_swp_mksoft_dirty(pte); | 83 | pte = pte_swp_mksoft_dirty(pte); |
84 | else if (pte_file(pte)) | ||
85 | pte = pte_file_mksoft_dirty(pte); | ||
86 | #endif | 84 | #endif |
87 | return pte; | 85 | return pte; |
88 | } | 86 | } |
diff --git a/mm/msync.c b/mm/msync.c index 992a1673d488..bb04d53ae852 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
@@ -86,10 +86,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) | |||
86 | (vma->vm_flags & VM_SHARED)) { | 86 | (vma->vm_flags & VM_SHARED)) { |
87 | get_file(file); | 87 | get_file(file); |
88 | up_read(&mm->mmap_sem); | 88 | up_read(&mm->mmap_sem); |
89 | if (vma->vm_flags & VM_NONLINEAR) | 89 | error = vfs_fsync_range(file, fstart, fend, 1); |
90 | error = vfs_fsync(file, 1); | ||
91 | else | ||
92 | error = vfs_fsync_range(file, fstart, fend, 1); | ||
93 | fput(file); | 90 | fput(file); |
94 | if (error || start >= end) | 91 | if (error || start >= end) |
95 | goto out; | 92 | goto out; |
diff --git a/mm/nommu.c b/mm/nommu.c index b51eadf6d952..7296360fc057 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -59,6 +59,7 @@ | |||
59 | #endif | 59 | #endif |
60 | 60 | ||
61 | void *high_memory; | 61 | void *high_memory; |
62 | EXPORT_SYMBOL(high_memory); | ||
62 | struct page *mem_map; | 63 | struct page *mem_map; |
63 | unsigned long max_mapnr; | 64 | unsigned long max_mapnr; |
64 | unsigned long highest_memmap_pfn; | 65 | unsigned long highest_memmap_pfn; |
@@ -213,6 +214,39 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
213 | } | 214 | } |
214 | EXPORT_SYMBOL(get_user_pages); | 215 | EXPORT_SYMBOL(get_user_pages); |
215 | 216 | ||
217 | long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, | ||
218 | unsigned long start, unsigned long nr_pages, | ||
219 | int write, int force, struct page **pages, | ||
220 | int *locked) | ||
221 | { | ||
222 | return get_user_pages(tsk, mm, start, nr_pages, write, force, | ||
223 | pages, NULL); | ||
224 | } | ||
225 | EXPORT_SYMBOL(get_user_pages_locked); | ||
226 | |||
227 | long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
228 | unsigned long start, unsigned long nr_pages, | ||
229 | int write, int force, struct page **pages, | ||
230 | unsigned int gup_flags) | ||
231 | { | ||
232 | long ret; | ||
233 | down_read(&mm->mmap_sem); | ||
234 | ret = get_user_pages(tsk, mm, start, nr_pages, write, force, | ||
235 | pages, NULL); | ||
236 | up_read(&mm->mmap_sem); | ||
237 | return ret; | ||
238 | } | ||
239 | EXPORT_SYMBOL(__get_user_pages_unlocked); | ||
240 | |||
241 | long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, | ||
242 | unsigned long start, unsigned long nr_pages, | ||
243 | int write, int force, struct page **pages) | ||
244 | { | ||
245 | return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, | ||
246 | force, pages, 0); | ||
247 | } | ||
248 | EXPORT_SYMBOL(get_user_pages_unlocked); | ||
249 | |||
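The wrappers above let nommu callers share the same calling convention as the MMU build. A sketch of the open-coded pattern they replace versus the new helper (hypothetical caller names):

/* Before: take mmap_sem around get_user_pages() by hand. */
static long demo_pin_page_old(struct task_struct *tsk, struct mm_struct *mm,
                              unsigned long addr, struct page **page)
{
        long ret;

        down_read(&mm->mmap_sem);
        ret = get_user_pages(tsk, mm, addr, 1, 1, 0, page, NULL);
        up_read(&mm->mmap_sem);
        return ret;
}

/* After: the helper handles the locking. */
static long demo_pin_page_new(struct task_struct *tsk, struct mm_struct *mm,
                              unsigned long addr, struct page **page)
{
        return get_user_pages_unlocked(tsk, mm, addr, 1, 1, 0, page);
}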
216 | /** | 250 | /** |
217 | * follow_pfn - look up PFN at a user virtual address | 251 | * follow_pfn - look up PFN at a user virtual address |
218 | * @vma: memory mapping | 252 | * @vma: memory mapping |
@@ -946,9 +980,6 @@ static int validate_mmap_request(struct file *file, | |||
946 | return -EOVERFLOW; | 980 | return -EOVERFLOW; |
947 | 981 | ||
948 | if (file) { | 982 | if (file) { |
949 | /* validate file mapping requests */ | ||
950 | struct address_space *mapping; | ||
951 | |||
952 | /* files must support mmap */ | 983 | /* files must support mmap */ |
953 | if (!file->f_op->mmap) | 984 | if (!file->f_op->mmap) |
954 | return -ENODEV; | 985 | return -ENODEV; |
@@ -957,28 +988,22 @@ static int validate_mmap_request(struct file *file, | |||
957 | * - we support chardevs that provide their own "memory" | 988 | * - we support chardevs that provide their own "memory" |
958 | * - we support files/blockdevs that are memory backed | 989 | * - we support files/blockdevs that are memory backed |
959 | */ | 990 | */ |
960 | mapping = file->f_mapping; | 991 | if (file->f_op->mmap_capabilities) { |
961 | if (!mapping) | 992 | capabilities = file->f_op->mmap_capabilities(file); |
962 | mapping = file_inode(file)->i_mapping; | 993 | } else { |
963 | |||
964 | capabilities = 0; | ||
965 | if (mapping && mapping->backing_dev_info) | ||
966 | capabilities = mapping->backing_dev_info->capabilities; | ||
967 | |||
968 | if (!capabilities) { | ||
969 | /* no explicit capabilities set, so assume some | 994 | /* no explicit capabilities set, so assume some |
970 | * defaults */ | 995 | * defaults */ |
971 | switch (file_inode(file)->i_mode & S_IFMT) { | 996 | switch (file_inode(file)->i_mode & S_IFMT) { |
972 | case S_IFREG: | 997 | case S_IFREG: |
973 | case S_IFBLK: | 998 | case S_IFBLK: |
974 | capabilities = BDI_CAP_MAP_COPY; | 999 | capabilities = NOMMU_MAP_COPY; |
975 | break; | 1000 | break; |
976 | 1001 | ||
977 | case S_IFCHR: | 1002 | case S_IFCHR: |
978 | capabilities = | 1003 | capabilities = |
979 | BDI_CAP_MAP_DIRECT | | 1004 | NOMMU_MAP_DIRECT | |
980 | BDI_CAP_READ_MAP | | 1005 | NOMMU_MAP_READ | |
981 | BDI_CAP_WRITE_MAP; | 1006 | NOMMU_MAP_WRITE; |
982 | break; | 1007 | break; |
983 | 1008 | ||
984 | default: | 1009 | default: |
@@ -989,9 +1014,9 @@ static int validate_mmap_request(struct file *file, | |||
989 | /* eliminate any capabilities that we can't support on this | 1014 | /* eliminate any capabilities that we can't support on this |
990 | * device */ | 1015 | * device */ |
991 | if (!file->f_op->get_unmapped_area) | 1016 | if (!file->f_op->get_unmapped_area) |
992 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1017 | capabilities &= ~NOMMU_MAP_DIRECT; |
993 | if (!file->f_op->read) | 1018 | if (!file->f_op->read) |
994 | capabilities &= ~BDI_CAP_MAP_COPY; | 1019 | capabilities &= ~NOMMU_MAP_COPY; |
995 | 1020 | ||
996 | /* The file shall have been opened with read permission. */ | 1021 | /* The file shall have been opened with read permission. */ |
997 | if (!(file->f_mode & FMODE_READ)) | 1022 | if (!(file->f_mode & FMODE_READ)) |
@@ -1010,29 +1035,29 @@ static int validate_mmap_request(struct file *file, | |||
1010 | if (locks_verify_locked(file)) | 1035 | if (locks_verify_locked(file)) |
1011 | return -EAGAIN; | 1036 | return -EAGAIN; |
1012 | 1037 | ||
1013 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | 1038 | if (!(capabilities & NOMMU_MAP_DIRECT)) |
1014 | return -ENODEV; | 1039 | return -ENODEV; |
1015 | 1040 | ||
1016 | /* we mustn't privatise shared mappings */ | 1041 | /* we mustn't privatise shared mappings */ |
1017 | capabilities &= ~BDI_CAP_MAP_COPY; | 1042 | capabilities &= ~NOMMU_MAP_COPY; |
1018 | } else { | 1043 | } else { |
1019 | /* we're going to read the file into private memory we | 1044 | /* we're going to read the file into private memory we |
1020 | * allocate */ | 1045 | * allocate */ |
1021 | if (!(capabilities & BDI_CAP_MAP_COPY)) | 1046 | if (!(capabilities & NOMMU_MAP_COPY)) |
1022 | return -ENODEV; | 1047 | return -ENODEV; |
1023 | 1048 | ||
1024 | /* we don't permit a private writable mapping to be | 1049 | /* we don't permit a private writable mapping to be |
1025 | * shared with the backing device */ | 1050 | * shared with the backing device */ |
1026 | if (prot & PROT_WRITE) | 1051 | if (prot & PROT_WRITE) |
1027 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1052 | capabilities &= ~NOMMU_MAP_DIRECT; |
1028 | } | 1053 | } |
1029 | 1054 | ||
1030 | if (capabilities & BDI_CAP_MAP_DIRECT) { | 1055 | if (capabilities & NOMMU_MAP_DIRECT) { |
1031 | if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || | 1056 | if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) || |
1032 | ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || | 1057 | ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) || |
1033 | ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) | 1058 | ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC)) |
1034 | ) { | 1059 | ) { |
1035 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1060 | capabilities &= ~NOMMU_MAP_DIRECT; |
1036 | if (flags & MAP_SHARED) { | 1061 | if (flags & MAP_SHARED) { |
1037 | printk(KERN_WARNING | 1062 | printk(KERN_WARNING |
1038 | "MAP_SHARED not completely supported on !MMU\n"); | 1063 | "MAP_SHARED not completely supported on !MMU\n"); |
@@ -1049,21 +1074,21 @@ static int validate_mmap_request(struct file *file, | |||
1049 | } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { | 1074 | } else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) { |
1050 | /* handle implication of PROT_EXEC by PROT_READ */ | 1075 | /* handle implication of PROT_EXEC by PROT_READ */ |
1051 | if (current->personality & READ_IMPLIES_EXEC) { | 1076 | if (current->personality & READ_IMPLIES_EXEC) { |
1052 | if (capabilities & BDI_CAP_EXEC_MAP) | 1077 | if (capabilities & NOMMU_MAP_EXEC) |
1053 | prot |= PROT_EXEC; | 1078 | prot |= PROT_EXEC; |
1054 | } | 1079 | } |
1055 | } else if ((prot & PROT_READ) && | 1080 | } else if ((prot & PROT_READ) && |
1056 | (prot & PROT_EXEC) && | 1081 | (prot & PROT_EXEC) && |
1057 | !(capabilities & BDI_CAP_EXEC_MAP) | 1082 | !(capabilities & NOMMU_MAP_EXEC) |
1058 | ) { | 1083 | ) { |
1059 | /* backing file is not executable, try to copy */ | 1084 | /* backing file is not executable, try to copy */ |
1060 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1085 | capabilities &= ~NOMMU_MAP_DIRECT; |
1061 | } | 1086 | } |
1062 | } else { | 1087 | } else { |
1063 | /* anonymous mappings are always memory backed and can be | 1088 | /* anonymous mappings are always memory backed and can be |
1064 | * privately mapped | 1089 | * privately mapped |
1065 | */ | 1090 | */ |
1066 | capabilities = BDI_CAP_MAP_COPY; | 1091 | capabilities = NOMMU_MAP_COPY; |
1067 | 1092 | ||
1068 | /* handle PROT_EXEC implication by PROT_READ */ | 1093 | /* handle PROT_EXEC implication by PROT_READ */ |
1069 | if ((prot & PROT_READ) && | 1094 | if ((prot & PROT_READ) && |
@@ -1095,7 +1120,7 @@ static unsigned long determine_vm_flags(struct file *file, | |||
1095 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); | 1120 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); |
1096 | /* vm_flags |= mm->def_flags; */ | 1121 | /* vm_flags |= mm->def_flags; */ |
1097 | 1122 | ||
1098 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) { | 1123 | if (!(capabilities & NOMMU_MAP_DIRECT)) { |
1099 | /* attempt to share read-only copies of mapped file chunks */ | 1124 | /* attempt to share read-only copies of mapped file chunks */ |
1100 | vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; | 1125 | vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; |
1101 | if (file && !(prot & PROT_WRITE)) | 1126 | if (file && !(prot & PROT_WRITE)) |
@@ -1104,7 +1129,7 @@ static unsigned long determine_vm_flags(struct file *file, | |||
1104 | /* overlay a shareable mapping on the backing device or inode | 1129 | /* overlay a shareable mapping on the backing device or inode |
1105 | * if possible - used for chardevs, ramfs/tmpfs/shmfs and | 1130 | * if possible - used for chardevs, ramfs/tmpfs/shmfs and |
1106 | * romfs/cramfs */ | 1131 | * romfs/cramfs */ |
1107 | vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS); | 1132 | vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS); |
1108 | if (flags & MAP_SHARED) | 1133 | if (flags & MAP_SHARED) |
1109 | vm_flags |= VM_SHARED; | 1134 | vm_flags |= VM_SHARED; |
1110 | } | 1135 | } |
@@ -1157,7 +1182,7 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
1157 | * shared mappings on devices or memory | 1182 | * shared mappings on devices or memory |
1158 | * - VM_MAYSHARE will be set if it may attempt to share | 1183 | * - VM_MAYSHARE will be set if it may attempt to share |
1159 | */ | 1184 | */ |
1160 | if (capabilities & BDI_CAP_MAP_DIRECT) { | 1185 | if (capabilities & NOMMU_MAP_DIRECT) { |
1161 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | 1186 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); |
1162 | if (ret == 0) { | 1187 | if (ret == 0) { |
1163 | /* shouldn't return success if we're not sharing */ | 1188 | /* shouldn't return success if we're not sharing */ |
@@ -1346,7 +1371,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1346 | if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && | 1371 | if ((pregion->vm_pgoff != pgoff || rpglen != pglen) && |
1347 | !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { | 1372 | !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) { |
1348 | /* new mapping is not a subset of the region */ | 1373 | /* new mapping is not a subset of the region */ |
1349 | if (!(capabilities & BDI_CAP_MAP_DIRECT)) | 1374 | if (!(capabilities & NOMMU_MAP_DIRECT)) |
1350 | goto sharing_violation; | 1375 | goto sharing_violation; |
1351 | continue; | 1376 | continue; |
1352 | } | 1377 | } |
@@ -1385,7 +1410,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1385 | * - this is the hook for quasi-memory character devices to | 1410 | * - this is the hook for quasi-memory character devices to |
1386 | * tell us the location of a shared mapping | 1411 | * tell us the location of a shared mapping |
1387 | */ | 1412 | */ |
1388 | if (capabilities & BDI_CAP_MAP_DIRECT) { | 1413 | if (capabilities & NOMMU_MAP_DIRECT) { |
1389 | addr = file->f_op->get_unmapped_area(file, addr, len, | 1414 | addr = file->f_op->get_unmapped_area(file, addr, len, |
1390 | pgoff, flags); | 1415 | pgoff, flags); |
1391 | if (IS_ERR_VALUE(addr)) { | 1416 | if (IS_ERR_VALUE(addr)) { |
@@ -1397,10 +1422,10 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1397 | * the mapping so we'll have to attempt to copy | 1422 | * the mapping so we'll have to attempt to copy |
1398 | * it */ | 1423 | * it */ |
1399 | ret = -ENODEV; | 1424 | ret = -ENODEV; |
1400 | if (!(capabilities & BDI_CAP_MAP_COPY)) | 1425 | if (!(capabilities & NOMMU_MAP_COPY)) |
1401 | goto error_just_free; | 1426 | goto error_just_free; |
1402 | 1427 | ||
1403 | capabilities &= ~BDI_CAP_MAP_DIRECT; | 1428 | capabilities &= ~NOMMU_MAP_DIRECT; |
1404 | } else { | 1429 | } else { |
1405 | vma->vm_start = region->vm_start = addr; | 1430 | vma->vm_start = region->vm_start = addr; |
1406 | vma->vm_end = region->vm_end = addr + len; | 1431 | vma->vm_end = region->vm_end = addr + len; |
@@ -1411,7 +1436,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1411 | vma->vm_region = region; | 1436 | vma->vm_region = region; |
1412 | 1437 | ||
1413 | /* set up the mapping | 1438 | /* set up the mapping |
1414 | * - the region is filled in if BDI_CAP_MAP_DIRECT is still set | 1439 | * - the region is filled in if NOMMU_MAP_DIRECT is still set |
1415 | */ | 1440 | */ |
1416 | if (file && vma->vm_flags & VM_SHARED) | 1441 | if (file && vma->vm_flags & VM_SHARED) |
1417 | ret = do_mmap_shared_file(vma); | 1442 | ret = do_mmap_shared_file(vma); |
@@ -1894,7 +1919,7 @@ EXPORT_SYMBOL(unmap_mapping_range); | |||
1894 | */ | 1919 | */ |
1895 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | 1920 | int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) |
1896 | { | 1921 | { |
1897 | unsigned long free, allowed, reserve; | 1922 | long free, allowed, reserve; |
1898 | 1923 | ||
1899 | vm_acct_memory(pages); | 1924 | vm_acct_memory(pages); |
1900 | 1925 | ||
@@ -1958,7 +1983,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
1958 | */ | 1983 | */ |
1959 | if (mm) { | 1984 | if (mm) { |
1960 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); | 1985 | reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); |
1961 | allowed -= min(mm->total_vm / 32, reserve); | 1986 | allowed -= min_t(long, mm->total_vm / 32, reserve); |
1962 | } | 1987 | } |
1963 | 1988 | ||
1964 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) | 1989 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
@@ -1983,14 +2008,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1983 | } | 2008 | } |
1984 | EXPORT_SYMBOL(filemap_map_pages); | 2009 | EXPORT_SYMBOL(filemap_map_pages); |
1985 | 2010 | ||
1986 | int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr, | ||
1987 | unsigned long size, pgoff_t pgoff) | ||
1988 | { | ||
1989 | BUG(); | ||
1990 | return 0; | ||
1991 | } | ||
1992 | EXPORT_SYMBOL(generic_file_remap_pages); | ||
1993 | |||
1994 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, | 2011 | static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, |
1995 | unsigned long addr, void *buf, int len, int write) | 2012 | unsigned long addr, void *buf, int len, int write) |
1996 | { | 2013 | { |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d503e9ce1c7b..642f38cb175a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
169 | * The baseline for the badness score is the proportion of RAM that each | 169 | * The baseline for the badness score is the proportion of RAM that each |
170 | * task's rss, pagetable and swap space use. | 170 | * task's rss, pagetable and swap space use. |
171 | */ | 171 | */ |
172 | points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + | 172 | points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + |
173 | get_mm_counter(p->mm, MM_SWAPENTS); | 173 | atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); |
174 | task_unlock(p); | 174 | task_unlock(p); |
175 | 175 | ||
176 | /* | 176 | /* |
@@ -266,8 +266,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | |||
266 | * Don't allow any other task to have access to the reserves. | 266 | * Don't allow any other task to have access to the reserves. |
267 | */ | 267 | */ |
268 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { | 268 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { |
269 | if (unlikely(frozen(task))) | ||
270 | __thaw_task(task); | ||
271 | if (!force_kill) | 269 | if (!force_kill) |
272 | return OOM_SCAN_ABORT; | 270 | return OOM_SCAN_ABORT; |
273 | } | 271 | } |
@@ -353,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
353 | struct task_struct *p; | 351 | struct task_struct *p; |
354 | struct task_struct *task; | 352 | struct task_struct *task; |
355 | 353 | ||
356 | pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); | 354 | pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); |
357 | rcu_read_lock(); | 355 | rcu_read_lock(); |
358 | for_each_process(p) { | 356 | for_each_process(p) { |
359 | if (oom_unkillable_task(p, memcg, nodemask)) | 357 | if (oom_unkillable_task(p, memcg, nodemask)) |
@@ -369,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) | |||
369 | continue; | 367 | continue; |
370 | } | 368 | } |
371 | 369 | ||
372 | pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", | 370 | pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", |
373 | task->pid, from_kuid(&init_user_ns, task_uid(task)), | 371 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
374 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), | 372 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
375 | atomic_long_read(&task->mm->nr_ptes), | 373 | atomic_long_read(&task->mm->nr_ptes), |
374 | mm_nr_pmds(task->mm), | ||
376 | get_mm_counter(task->mm, MM_SWAPENTS), | 375 | get_mm_counter(task->mm, MM_SWAPENTS), |
377 | task->signal->oom_score_adj, task->comm); | 376 | task->signal->oom_score_adj, task->comm); |
378 | task_unlock(task); | 377 | task_unlock(task); |
@@ -400,20 +399,98 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
400 | } | 399 | } |
401 | 400 | ||
402 | /* | 401 | /* |
403 | * Number of OOM killer invocations (including memcg OOM killer). | 402 | * Number of OOM victims in flight |
404 | * Primarily used by PM freezer to check for potential races with | ||
405 | * OOM killed frozen task. | ||
406 | */ | 403 | */ |
407 | static atomic_t oom_kills = ATOMIC_INIT(0); | 404 | static atomic_t oom_victims = ATOMIC_INIT(0); |
405 | static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); | ||
408 | 406 | ||
409 | int oom_kills_count(void) | 407 | bool oom_killer_disabled __read_mostly; |
408 | static DECLARE_RWSEM(oom_sem); | ||
409 | |||
410 | /** | ||
411 | * mark_tsk_oom_victim - marks the given task as an OOM victim. | ||
412 | * @tsk: task to mark | ||
413 | * | ||
414 | * Has to be called with oom_sem taken for read and never after | ||
415 | * the OOM killer has been disabled. | ||
416 | */ | ||
417 | void mark_tsk_oom_victim(struct task_struct *tsk) | ||
410 | { | 418 | { |
411 | return atomic_read(&oom_kills); | 419 | WARN_ON(oom_killer_disabled); |
420 | /* OOM killer might race with memcg OOM */ | ||
421 | if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) | ||
422 | return; | ||
423 | /* | ||
424 | * Make sure that the task is woken up from uninterruptible sleep | ||
425 | * if it is frozen because OOM killer wouldn't be able to free | ||
426 | * any memory and livelock. freezing_slow_path will tell the freezer | ||
427 | * that TIF_MEMDIE tasks should be ignored. | ||
428 | */ | ||
429 | __thaw_task(tsk); | ||
430 | atomic_inc(&oom_victims); | ||
431 | } | ||
432 | |||
433 | /** | ||
434 | * unmark_oom_victim - unmarks the current task as OOM victim. | ||
435 | * | ||
436 | * Wakes up all waiters in oom_killer_disable() | ||
437 | */ | ||
438 | void unmark_oom_victim(void) | ||
439 | { | ||
440 | if (!test_and_clear_thread_flag(TIF_MEMDIE)) | ||
441 | return; | ||
442 | |||
443 | down_read(&oom_sem); | ||
444 | /* | ||
445 | * There is no need to signal the last oom_victim if there | ||
446 | * is nobody who cares. | ||
447 | */ | ||
448 | if (!atomic_dec_return(&oom_victims) && oom_killer_disabled) | ||
449 | wake_up_all(&oom_victims_wait); | ||
450 | up_read(&oom_sem); | ||
451 | } | ||
452 | |||
453 | /** | ||
454 | * oom_killer_disable - disable OOM killer | ||
455 | * | ||
456 | * Forces all page allocations to fail rather than trigger OOM killer. | ||
457 | * Will block and wait until all OOM victims are killed. | ||
458 | * | ||
459 | * The function cannot be called when there are runnable user tasks because | ||
460 | * userspace would see unexpected allocation failures as a result. Any | ||
461 | * new usage of this function should be discussed with MM people. | ||
462 | * | ||
463 | * Returns true if successful and false if the OOM killer cannot be | ||
464 | * disabled. | ||
465 | */ | ||
466 | bool oom_killer_disable(void) | ||
467 | { | ||
468 | /* | ||
469 | * Make sure to not race with an ongoing OOM killer | ||
470 | * and that the current is not the victim. | ||
471 | */ | ||
472 | down_write(&oom_sem); | ||
473 | if (test_thread_flag(TIF_MEMDIE)) { | ||
474 | up_write(&oom_sem); | ||
475 | return false; | ||
476 | } | ||
477 | |||
478 | oom_killer_disabled = true; | ||
479 | up_write(&oom_sem); | ||
480 | |||
481 | wait_event(oom_victims_wait, !atomic_read(&oom_victims)); | ||
482 | |||
483 | return true; | ||
412 | } | 484 | } |
413 | 485 | ||
414 | void note_oom_kill(void) | 486 | /** |
487 | * oom_killer_enable - enable OOM killer | ||
488 | */ | ||
489 | void oom_killer_enable(void) | ||
415 | { | 490 | { |
416 | atomic_inc(&oom_kills); | 491 | down_write(&oom_sem); |
492 | oom_killer_disabled = false; | ||
493 | up_write(&oom_sem); | ||
417 | } | 494 | } |
418 | 495 | ||
419 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 496 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
@@ -438,11 +515,14 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
438 | * If the task is already exiting, don't alarm the sysadmin or kill | 515 | * If the task is already exiting, don't alarm the sysadmin or kill |
439 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 516 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
440 | */ | 517 | */ |
441 | if (task_will_free_mem(p)) { | 518 | task_lock(p); |
442 | set_tsk_thread_flag(p, TIF_MEMDIE); | 519 | if (p->mm && task_will_free_mem(p)) { |
520 | mark_tsk_oom_victim(p); | ||
521 | task_unlock(p); | ||
443 | put_task_struct(p); | 522 | put_task_struct(p); |
444 | return; | 523 | return; |
445 | } | 524 | } |
525 | task_unlock(p); | ||
446 | 526 | ||
447 | if (__ratelimit(&oom_rs)) | 527 | if (__ratelimit(&oom_rs)) |
448 | dump_header(p, gfp_mask, order, memcg, nodemask); | 528 | dump_header(p, gfp_mask, order, memcg, nodemask); |
@@ -492,6 +572,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
492 | 572 | ||
493 | /* mm cannot safely be dereferenced after task_unlock(victim) */ | 573 | /* mm cannot safely be dereferenced after task_unlock(victim) */ |
494 | mm = victim->mm; | 574 | mm = victim->mm; |
575 | mark_tsk_oom_victim(victim); | ||
495 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | 576 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", |
496 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), | 577 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), |
497 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), | 578 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), |
@@ -522,7 +603,6 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
522 | } | 603 | } |
523 | rcu_read_unlock(); | 604 | rcu_read_unlock(); |
524 | 605 | ||
525 | set_tsk_thread_flag(victim, TIF_MEMDIE); | ||
526 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); | 606 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); |
527 | put_task_struct(victim); | 607 | put_task_struct(victim); |
528 | } | 608 | } |
@@ -611,7 +691,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
611 | } | 691 | } |
612 | 692 | ||
613 | /** | 693 | /** |
614 | * out_of_memory - kill the "best" process when we run out of memory | 694 | * __out_of_memory - kill the "best" process when we run out of memory |
615 | * @zonelist: zonelist pointer | 695 | * @zonelist: zonelist pointer |
616 | * @gfp_mask: memory allocation flags | 696 | * @gfp_mask: memory allocation flags |
617 | * @order: amount of memory being requested as a power of 2 | 697 | * @order: amount of memory being requested as a power of 2 |
@@ -623,7 +703,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
623 | * OR try to be smart about which process to kill. Note that we | 703 | * OR try to be smart about which process to kill. Note that we |
624 | * don't have to be perfect here, we just have to be good. | 704 | * don't have to be perfect here, we just have to be good. |
625 | */ | 705 | */ |
626 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | 706 | static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
627 | int order, nodemask_t *nodemask, bool force_kill) | 707 | int order, nodemask_t *nodemask, bool force_kill) |
628 | { | 708 | { |
629 | const nodemask_t *mpol_mask; | 709 | const nodemask_t *mpol_mask; |
@@ -643,9 +723,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
643 | * If current has a pending SIGKILL or is exiting, then automatically | 723 | * If current has a pending SIGKILL or is exiting, then automatically |
644 | * select it. The goal is to allow it to allocate so that it may | 724 | * select it. The goal is to allow it to allocate so that it may |
645 | * quickly exit and free its memory. | 725 | * quickly exit and free its memory. |
726 | * | ||
727 | * But don't select if current has already released its mm and cleared | ||
728 | * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur. | ||
646 | */ | 729 | */ |
647 | if (fatal_signal_pending(current) || task_will_free_mem(current)) { | 730 | if (current->mm && |
648 | set_thread_flag(TIF_MEMDIE); | 731 | (fatal_signal_pending(current) || task_will_free_mem(current))) { |
732 | mark_tsk_oom_victim(current); | ||
649 | return; | 733 | return; |
650 | } | 734 | } |
651 | 735 | ||
@@ -688,6 +772,32 @@ out: | |||
688 | schedule_timeout_killable(1); | 772 | schedule_timeout_killable(1); |
689 | } | 773 | } |
690 | 774 | ||
775 | /** | ||
776 | * out_of_memory - tries to invoke OOM killer. | ||
777 | * @zonelist: zonelist pointer | ||
778 | * @gfp_mask: memory allocation flags | ||
779 | * @order: amount of memory being requested as a power of 2 | ||
780 | * @nodemask: nodemask passed to page allocator | ||
781 | * @force_kill: true if a task must be killed, even if others are exiting | ||
782 | * | ||
783 | * Invokes __out_of_memory() and returns true if the OOM killer has not been | ||
784 | * disabled by oom_killer_disable(); otherwise returns false. | ||
785 | */ | ||
786 | bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | ||
787 | int order, nodemask_t *nodemask, bool force_kill) | ||
788 | { | ||
789 | bool ret = false; | ||
790 | |||
791 | down_read(&oom_sem); | ||
792 | if (!oom_killer_disabled) { | ||
793 | __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill); | ||
794 | ret = true; | ||
795 | } | ||
796 | up_read(&oom_sem); | ||
797 | |||
798 | return ret; | ||
799 | } | ||
800 | |||
691 | /* | 801 | /* |
692 | * The pagefault handler calls here because it is out of memory, so kill a | 802 | * The pagefault handler calls here because it is out of memory, so kill a |
693 | * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a | 803 | * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a |
@@ -697,12 +807,25 @@ void pagefault_out_of_memory(void) | |||
697 | { | 807 | { |
698 | struct zonelist *zonelist; | 808 | struct zonelist *zonelist; |
699 | 809 | ||
810 | down_read(&oom_sem); | ||
700 | if (mem_cgroup_oom_synchronize(true)) | 811 | if (mem_cgroup_oom_synchronize(true)) |
701 | return; | 812 | goto unlock; |
702 | 813 | ||
703 | zonelist = node_zonelist(first_memory_node, GFP_KERNEL); | 814 | zonelist = node_zonelist(first_memory_node, GFP_KERNEL); |
704 | if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { | 815 | if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) { |
705 | out_of_memory(NULL, 0, 0, NULL, false); | 816 | if (!oom_killer_disabled) |
817 | __out_of_memory(NULL, 0, 0, NULL, false); | ||
818 | else | ||
819 | /* | ||
820 | * There shouldn't be any user tasks runnable while the | ||
821 | * OOM killer is disabled, so the current task has to | ||
822 | * be a racing OOM victim that oom_killer_disable() | ||
823 | * is waiting for. | ||
824 | */ | ||
825 | WARN_ON(test_thread_flag(TIF_MEMDIE)); | ||
826 | |||
706 | oom_zonelist_unlock(zonelist, GFP_KERNEL); | 827 | oom_zonelist_unlock(zonelist, GFP_KERNEL); |
707 | } | 828 | } |
829 | unlock: | ||
830 | up_read(&oom_sem); | ||
708 | } | 831 | } |
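
The oom_kill.c hunks above replace the old oom_kills counter with a count of in-flight OOM victims plus an oom_sem rwsem, so that oom_killer_disable() can block until every task marked with TIF_MEMDIE has finished exiting. The user-space sketch below is my own simplification of that handshake using POSIX primitives; the names oom_sem, oom_victims and oom_killer_disabled are reused only for readability and nothing here is kernel code.

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t oom_sem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t victims_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t victims_gone = PTHREAD_COND_INITIALIZER;
static int oom_victims;                 /* victims still exiting */
static bool oom_killer_disabled;

/* Called with oom_sem held for read, mirroring mark_tsk_oom_victim(). */
static void mark_victim(void)
{
	pthread_mutex_lock(&victims_lock);
	oom_victims++;
	pthread_mutex_unlock(&victims_lock);
}

/* Mirrors unmark_oom_victim(): the last victim wakes a waiting disabler. */
static void unmark_victim(void)
{
	pthread_rwlock_rdlock(&oom_sem);
	pthread_mutex_lock(&victims_lock);
	if (--oom_victims == 0 && oom_killer_disabled)
		pthread_cond_broadcast(&victims_gone);
	pthread_mutex_unlock(&victims_lock);
	pthread_rwlock_unlock(&oom_sem);
}

/* Mirrors oom_killer_disable(): flip the flag under the write lock,
 * then wait for the in-flight victims to drain. */
static bool disable_oom_killer(void)
{
	pthread_rwlock_wrlock(&oom_sem);
	oom_killer_disabled = true;
	pthread_rwlock_unlock(&oom_sem);

	pthread_mutex_lock(&victims_lock);
	while (oom_victims > 0)
		pthread_cond_wait(&victims_gone, &victims_lock);
	pthread_mutex_unlock(&victims_lock);
	return true;
}
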
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d5d81f5384d1..45e187b2d971 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1351,7 +1351,7 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
1351 | unsigned long task_ratelimit; | 1351 | unsigned long task_ratelimit; |
1352 | unsigned long dirty_ratelimit; | 1352 | unsigned long dirty_ratelimit; |
1353 | unsigned long pos_ratio; | 1353 | unsigned long pos_ratio; |
1354 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1354 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); |
1355 | bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; | 1355 | bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; |
1356 | unsigned long start_time = jiffies; | 1356 | unsigned long start_time = jiffies; |
1357 | 1357 | ||
@@ -1541,16 +1541,6 @@ pause: | |||
1541 | bdi_start_background_writeback(bdi); | 1541 | bdi_start_background_writeback(bdi); |
1542 | } | 1542 | } |
1543 | 1543 | ||
1544 | void set_page_dirty_balance(struct page *page) | ||
1545 | { | ||
1546 | if (set_page_dirty(page)) { | ||
1547 | struct address_space *mapping = page_mapping(page); | ||
1548 | |||
1549 | if (mapping) | ||
1550 | balance_dirty_pages_ratelimited(mapping); | ||
1551 | } | ||
1552 | } | ||
1553 | |||
1554 | static DEFINE_PER_CPU(int, bdp_ratelimits); | 1544 | static DEFINE_PER_CPU(int, bdp_ratelimits); |
1555 | 1545 | ||
1556 | /* | 1546 | /* |
@@ -1584,7 +1574,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; | |||
1584 | */ | 1574 | */ |
1585 | void balance_dirty_pages_ratelimited(struct address_space *mapping) | 1575 | void balance_dirty_pages_ratelimited(struct address_space *mapping) |
1586 | { | 1576 | { |
1587 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1577 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); |
1588 | int ratelimit; | 1578 | int ratelimit; |
1589 | int *p; | 1579 | int *p; |
1590 | 1580 | ||
@@ -1939,7 +1929,7 @@ continue_unlock: | |||
1939 | if (!clear_page_dirty_for_io(page)) | 1929 | if (!clear_page_dirty_for_io(page)) |
1940 | goto continue_unlock; | 1930 | goto continue_unlock; |
1941 | 1931 | ||
1942 | trace_wbc_writepage(wbc, mapping->backing_dev_info); | 1932 | trace_wbc_writepage(wbc, inode_to_bdi(mapping->host)); |
1943 | ret = (*writepage)(page, wbc, data); | 1933 | ret = (*writepage)(page, wbc, data); |
1944 | if (unlikely(ret)) { | 1934 | if (unlikely(ret)) { |
1945 | if (ret == AOP_WRITEPAGE_ACTIVATE) { | 1935 | if (ret == AOP_WRITEPAGE_ACTIVATE) { |
@@ -2104,10 +2094,12 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) | |||
2104 | trace_writeback_dirty_page(page, mapping); | 2094 | trace_writeback_dirty_page(page, mapping); |
2105 | 2095 | ||
2106 | if (mapping_cap_account_dirty(mapping)) { | 2096 | if (mapping_cap_account_dirty(mapping)) { |
2097 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); | ||
2098 | |||
2107 | __inc_zone_page_state(page, NR_FILE_DIRTY); | 2099 | __inc_zone_page_state(page, NR_FILE_DIRTY); |
2108 | __inc_zone_page_state(page, NR_DIRTIED); | 2100 | __inc_zone_page_state(page, NR_DIRTIED); |
2109 | __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); | 2101 | __inc_bdi_stat(bdi, BDI_RECLAIMABLE); |
2110 | __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); | 2102 | __inc_bdi_stat(bdi, BDI_DIRTIED); |
2111 | task_io_account_write(PAGE_CACHE_SIZE); | 2103 | task_io_account_write(PAGE_CACHE_SIZE); |
2112 | current->nr_dirtied++; | 2104 | current->nr_dirtied++; |
2113 | this_cpu_inc(bdp_ratelimits); | 2105 | this_cpu_inc(bdp_ratelimits); |
@@ -2123,32 +2115,25 @@ EXPORT_SYMBOL(account_page_dirtied); | |||
2123 | * page dirty in that case, but not all the buffers. This is a "bottom-up" | 2115 | * page dirty in that case, but not all the buffers. This is a "bottom-up" |
2124 | * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. | 2116 | * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. |
2125 | * | 2117 | * |
2126 | * Most callers have locked the page, which pins the address_space in memory. | 2118 | * The caller must ensure this doesn't race with truncation. Most will simply |
2127 | * But zap_pte_range() does not lock the page, however in that case the | 2119 | * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and |
2128 | * mapping is pinned by the vma's ->vm_file reference. | 2120 | * the pte lock held, which also locks out truncation. |
2129 | * | ||
2130 | * We take care to handle the case where the page was truncated from the | ||
2131 | * mapping by re-checking page_mapping() inside tree_lock. | ||
2132 | */ | 2121 | */ |
2133 | int __set_page_dirty_nobuffers(struct page *page) | 2122 | int __set_page_dirty_nobuffers(struct page *page) |
2134 | { | 2123 | { |
2135 | if (!TestSetPageDirty(page)) { | 2124 | if (!TestSetPageDirty(page)) { |
2136 | struct address_space *mapping = page_mapping(page); | 2125 | struct address_space *mapping = page_mapping(page); |
2137 | struct address_space *mapping2; | ||
2138 | unsigned long flags; | 2126 | unsigned long flags; |
2139 | 2127 | ||
2140 | if (!mapping) | 2128 | if (!mapping) |
2141 | return 1; | 2129 | return 1; |
2142 | 2130 | ||
2143 | spin_lock_irqsave(&mapping->tree_lock, flags); | 2131 | spin_lock_irqsave(&mapping->tree_lock, flags); |
2144 | mapping2 = page_mapping(page); | 2132 | BUG_ON(page_mapping(page) != mapping); |
2145 | if (mapping2) { /* Race with truncate? */ | 2133 | WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); |
2146 | BUG_ON(mapping2 != mapping); | 2134 | account_page_dirtied(page, mapping); |
2147 | WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); | 2135 | radix_tree_tag_set(&mapping->page_tree, page_index(page), |
2148 | account_page_dirtied(page, mapping); | 2136 | PAGECACHE_TAG_DIRTY); |
2149 | radix_tree_tag_set(&mapping->page_tree, | ||
2150 | page_index(page), PAGECACHE_TAG_DIRTY); | ||
2151 | } | ||
2152 | spin_unlock_irqrestore(&mapping->tree_lock, flags); | 2137 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
2153 | if (mapping->host) { | 2138 | if (mapping->host) { |
2154 | /* !PageAnon && !swapper_space */ | 2139 | /* !PageAnon && !swapper_space */ |
@@ -2173,7 +2158,7 @@ void account_page_redirty(struct page *page) | |||
2173 | if (mapping && mapping_cap_account_dirty(mapping)) { | 2158 | if (mapping && mapping_cap_account_dirty(mapping)) { |
2174 | current->nr_dirtied--; | 2159 | current->nr_dirtied--; |
2175 | dec_zone_page_state(page, NR_DIRTIED); | 2160 | dec_zone_page_state(page, NR_DIRTIED); |
2176 | dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED); | 2161 | dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED); |
2177 | } | 2162 | } |
2178 | } | 2163 | } |
2179 | EXPORT_SYMBOL(account_page_redirty); | 2164 | EXPORT_SYMBOL(account_page_redirty); |
@@ -2185,9 +2170,12 @@ EXPORT_SYMBOL(account_page_redirty); | |||
2185 | */ | 2170 | */ |
2186 | int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) | 2171 | int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) |
2187 | { | 2172 | { |
2173 | int ret; | ||
2174 | |||
2188 | wbc->pages_skipped++; | 2175 | wbc->pages_skipped++; |
2176 | ret = __set_page_dirty_nobuffers(page); | ||
2189 | account_page_redirty(page); | 2177 | account_page_redirty(page); |
2190 | return __set_page_dirty_nobuffers(page); | 2178 | return ret; |
2191 | } | 2179 | } |
2192 | EXPORT_SYMBOL(redirty_page_for_writepage); | 2180 | EXPORT_SYMBOL(redirty_page_for_writepage); |
2193 | 2181 | ||
@@ -2305,16 +2293,14 @@ int clear_page_dirty_for_io(struct page *page) | |||
2305 | /* | 2293 | /* |
2306 | * We carefully synchronise fault handlers against | 2294 | * We carefully synchronise fault handlers against |
2307 | * installing a dirty pte and marking the page dirty | 2295 | * installing a dirty pte and marking the page dirty |
2308 | * at this point. We do this by having them hold the | 2296 | * at this point. We do this by having them hold the |
2309 | * page lock at some point after installing their | 2297 | * page lock while dirtying the page, and pages are |
2310 | * pte, but before marking the page dirty. | 2298 | * always locked coming in here, so we get the desired |
2311 | * Pages are always locked coming in here, so we get | 2299 | * exclusion. |
2312 | * the desired exclusion. See mm/memory.c:do_wp_page() | ||
2313 | * for more comments. | ||
2314 | */ | 2300 | */ |
2315 | if (TestClearPageDirty(page)) { | 2301 | if (TestClearPageDirty(page)) { |
2316 | dec_zone_page_state(page, NR_FILE_DIRTY); | 2302 | dec_zone_page_state(page, NR_FILE_DIRTY); |
2317 | dec_bdi_stat(mapping->backing_dev_info, | 2303 | dec_bdi_stat(inode_to_bdi(mapping->host), |
2318 | BDI_RECLAIMABLE); | 2304 | BDI_RECLAIMABLE); |
2319 | return 1; | 2305 | return 1; |
2320 | } | 2306 | } |
@@ -2327,14 +2313,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); | |||
2327 | int test_clear_page_writeback(struct page *page) | 2313 | int test_clear_page_writeback(struct page *page) |
2328 | { | 2314 | { |
2329 | struct address_space *mapping = page_mapping(page); | 2315 | struct address_space *mapping = page_mapping(page); |
2330 | unsigned long memcg_flags; | ||
2331 | struct mem_cgroup *memcg; | 2316 | struct mem_cgroup *memcg; |
2332 | bool locked; | ||
2333 | int ret; | 2317 | int ret; |
2334 | 2318 | ||
2335 | memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); | 2319 | memcg = mem_cgroup_begin_page_stat(page); |
2336 | if (mapping) { | 2320 | if (mapping) { |
2337 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2321 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); |
2338 | unsigned long flags; | 2322 | unsigned long flags; |
2339 | 2323 | ||
2340 | spin_lock_irqsave(&mapping->tree_lock, flags); | 2324 | spin_lock_irqsave(&mapping->tree_lock, flags); |
@@ -2357,21 +2341,19 @@ int test_clear_page_writeback(struct page *page) | |||
2357 | dec_zone_page_state(page, NR_WRITEBACK); | 2341 | dec_zone_page_state(page, NR_WRITEBACK); |
2358 | inc_zone_page_state(page, NR_WRITTEN); | 2342 | inc_zone_page_state(page, NR_WRITTEN); |
2359 | } | 2343 | } |
2360 | mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); | 2344 | mem_cgroup_end_page_stat(memcg); |
2361 | return ret; | 2345 | return ret; |
2362 | } | 2346 | } |
2363 | 2347 | ||
2364 | int __test_set_page_writeback(struct page *page, bool keep_write) | 2348 | int __test_set_page_writeback(struct page *page, bool keep_write) |
2365 | { | 2349 | { |
2366 | struct address_space *mapping = page_mapping(page); | 2350 | struct address_space *mapping = page_mapping(page); |
2367 | unsigned long memcg_flags; | ||
2368 | struct mem_cgroup *memcg; | 2351 | struct mem_cgroup *memcg; |
2369 | bool locked; | ||
2370 | int ret; | 2352 | int ret; |
2371 | 2353 | ||
2372 | memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags); | 2354 | memcg = mem_cgroup_begin_page_stat(page); |
2373 | if (mapping) { | 2355 | if (mapping) { |
2374 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2356 | struct backing_dev_info *bdi = inode_to_bdi(mapping->host); |
2375 | unsigned long flags; | 2357 | unsigned long flags; |
2376 | 2358 | ||
2377 | spin_lock_irqsave(&mapping->tree_lock, flags); | 2359 | spin_lock_irqsave(&mapping->tree_lock, flags); |
@@ -2399,7 +2381,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) | |||
2399 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); | 2381 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); |
2400 | inc_zone_page_state(page, NR_WRITEBACK); | 2382 | inc_zone_page_state(page, NR_WRITEBACK); |
2401 | } | 2383 | } |
2402 | mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags); | 2384 | mem_cgroup_end_page_stat(memcg); |
2403 | return ret; | 2385 | return ret; |
2404 | 2386 | ||
2405 | } | 2387 | } |
@@ -2425,12 +2407,7 @@ EXPORT_SYMBOL(mapping_tagged); | |||
2425 | */ | 2407 | */ |
2426 | void wait_for_stable_page(struct page *page) | 2408 | void wait_for_stable_page(struct page *page) |
2427 | { | 2409 | { |
2428 | struct address_space *mapping = page_mapping(page); | 2410 | if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host))) |
2429 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 2411 | wait_on_page_writeback(page); |
2430 | |||
2431 | if (!bdi_cap_stable_pages_required(bdi)) | ||
2432 | return; | ||
2433 | |||
2434 | wait_on_page_writeback(page); | ||
2435 | } | 2412 | } |
2436 | EXPORT_SYMBOL_GPL(wait_for_stable_page); | 2413 | EXPORT_SYMBOL_GPL(wait_for_stable_page); |
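
Throughout the page-writeback.c hunks, every mapping->backing_dev_info lookup becomes inode_to_bdi(mapping->host). A minimal model of that indirection is sketched below, assuming the common case where the bdi is taken from the inode's superblock; the real helper also special-cases block devices, which is ignored here.

struct backing_dev_info;

struct super_block {
	struct backing_dev_info *s_bdi;
};

struct inode {
	struct super_block *i_sb;
};

struct address_space {
	struct inode *host;	/* owning inode of this mapping */
};

/* Simplified stand-in for inode_to_bdi(): derive the bdi from the
 * inode instead of caching it on the address_space. */
static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
	return inode->i_sb->s_bdi;
}
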
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7633c503a116..a47f0b229a1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/compiler.h> | 25 | #include <linux/compiler.h> |
26 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
27 | #include <linux/kmemcheck.h> | 27 | #include <linux/kmemcheck.h> |
28 | #include <linux/kasan.h> | ||
28 | #include <linux/module.h> | 29 | #include <linux/module.h> |
29 | #include <linux/suspend.h> | 30 | #include <linux/suspend.h> |
30 | #include <linux/pagevec.h> | 31 | #include <linux/pagevec.h> |
@@ -172,7 +173,7 @@ static void __free_pages_ok(struct page *page, unsigned int order); | |||
172 | * 1G machine -> (16M dma, 784M normal, 224M high) | 173 | * 1G machine -> (16M dma, 784M normal, 224M high) |
173 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA | 174 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA |
174 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL | 175 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL |
175 | * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA | 176 | * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA |
176 | * | 177 | * |
177 | * TBD: should special case ZONE_DMA32 machines here - in those we normally | 178 | * TBD: should special case ZONE_DMA32 machines here - in those we normally |
178 | * don't need any ZONE_NORMAL reservation | 179 | * don't need any ZONE_NORMAL reservation |
@@ -244,8 +245,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype) | |||
244 | PB_migrate, PB_migrate_end); | 245 | PB_migrate, PB_migrate_end); |
245 | } | 246 | } |
246 | 247 | ||
247 | bool oom_killer_disabled __read_mostly; | ||
248 | |||
249 | #ifdef CONFIG_DEBUG_VM | 248 | #ifdef CONFIG_DEBUG_VM |
250 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 249 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
251 | { | 250 | { |
@@ -381,36 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
381 | } | 380 | } |
382 | } | 381 | } |
383 | 382 | ||
384 | /* update __split_huge_page_refcount if you change this function */ | ||
385 | static int destroy_compound_page(struct page *page, unsigned long order) | ||
386 | { | ||
387 | int i; | ||
388 | int nr_pages = 1 << order; | ||
389 | int bad = 0; | ||
390 | |||
391 | if (unlikely(compound_order(page) != order)) { | ||
392 | bad_page(page, "wrong compound order", 0); | ||
393 | bad++; | ||
394 | } | ||
395 | |||
396 | __ClearPageHead(page); | ||
397 | |||
398 | for (i = 1; i < nr_pages; i++) { | ||
399 | struct page *p = page + i; | ||
400 | |||
401 | if (unlikely(!PageTail(p))) { | ||
402 | bad_page(page, "PageTail not set", 0); | ||
403 | bad++; | ||
404 | } else if (unlikely(p->first_page != page)) { | ||
405 | bad_page(page, "first_page not consistent", 0); | ||
406 | bad++; | ||
407 | } | ||
408 | __ClearPageTail(p); | ||
409 | } | ||
410 | |||
411 | return bad; | ||
412 | } | ||
413 | |||
414 | static inline void prep_zero_page(struct page *page, unsigned int order, | 383 | static inline void prep_zero_page(struct page *page, unsigned int order, |
415 | gfp_t gfp_flags) | 384 | gfp_t gfp_flags) |
416 | { | 385 | { |
@@ -552,17 +521,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
552 | return 0; | 521 | return 0; |
553 | 522 | ||
554 | if (page_is_guard(buddy) && page_order(buddy) == order) { | 523 | if (page_is_guard(buddy) && page_order(buddy) == order) { |
555 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
556 | |||
557 | if (page_zone_id(page) != page_zone_id(buddy)) | 524 | if (page_zone_id(page) != page_zone_id(buddy)) |
558 | return 0; | 525 | return 0; |
559 | 526 | ||
527 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
528 | |||
560 | return 1; | 529 | return 1; |
561 | } | 530 | } |
562 | 531 | ||
563 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 532 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
564 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
565 | |||
566 | /* | 533 | /* |
567 | * zone check is done late to avoid uselessly | 534 | * zone check is done late to avoid uselessly |
568 | * calculating zone/node ids for pages that could | 535 | * calculating zone/node ids for pages that could |
@@ -571,6 +538,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
571 | if (page_zone_id(page) != page_zone_id(buddy)) | 538 | if (page_zone_id(page) != page_zone_id(buddy)) |
572 | return 0; | 539 | return 0; |
573 | 540 | ||
541 | VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); | ||
542 | |||
574 | return 1; | 543 | return 1; |
575 | } | 544 | } |
576 | return 0; | 545 | return 0; |
@@ -613,10 +582,7 @@ static inline void __free_one_page(struct page *page, | |||
613 | int max_order = MAX_ORDER; | 582 | int max_order = MAX_ORDER; |
614 | 583 | ||
615 | VM_BUG_ON(!zone_is_initialized(zone)); | 584 | VM_BUG_ON(!zone_is_initialized(zone)); |
616 | 585 | VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); | |
617 | if (unlikely(PageCompound(page))) | ||
618 | if (unlikely(destroy_compound_page(page, order))) | ||
619 | return; | ||
620 | 586 | ||
621 | VM_BUG_ON(migratetype == -1); | 587 | VM_BUG_ON(migratetype == -1); |
622 | if (is_migrate_isolate(migratetype)) { | 588 | if (is_migrate_isolate(migratetype)) { |
@@ -797,21 +763,41 @@ static void free_one_page(struct zone *zone, | |||
797 | spin_unlock(&zone->lock); | 763 | spin_unlock(&zone->lock); |
798 | } | 764 | } |
799 | 765 | ||
766 | static int free_tail_pages_check(struct page *head_page, struct page *page) | ||
767 | { | ||
768 | if (!IS_ENABLED(CONFIG_DEBUG_VM)) | ||
769 | return 0; | ||
770 | if (unlikely(!PageTail(page))) { | ||
771 | bad_page(page, "PageTail not set", 0); | ||
772 | return 1; | ||
773 | } | ||
774 | if (unlikely(page->first_page != head_page)) { | ||
775 | bad_page(page, "first_page not consistent", 0); | ||
776 | return 1; | ||
777 | } | ||
778 | return 0; | ||
779 | } | ||
780 | |||
800 | static bool free_pages_prepare(struct page *page, unsigned int order) | 781 | static bool free_pages_prepare(struct page *page, unsigned int order) |
801 | { | 782 | { |
802 | int i; | 783 | bool compound = PageCompound(page); |
803 | int bad = 0; | 784 | int i, bad = 0; |
804 | 785 | ||
805 | VM_BUG_ON_PAGE(PageTail(page), page); | 786 | VM_BUG_ON_PAGE(PageTail(page), page); |
806 | VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); | 787 | VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); |
807 | 788 | ||
808 | trace_mm_page_free(page, order); | 789 | trace_mm_page_free(page, order); |
809 | kmemcheck_free_shadow(page, order); | 790 | kmemcheck_free_shadow(page, order); |
791 | kasan_free_pages(page, order); | ||
810 | 792 | ||
811 | if (PageAnon(page)) | 793 | if (PageAnon(page)) |
812 | page->mapping = NULL; | 794 | page->mapping = NULL; |
813 | for (i = 0; i < (1 << order); i++) | 795 | bad += free_pages_check(page); |
796 | for (i = 1; i < (1 << order); i++) { | ||
797 | if (compound) | ||
798 | bad += free_tail_pages_check(page, page + i); | ||
814 | bad += free_pages_check(page + i); | 799 | bad += free_pages_check(page + i); |
800 | } | ||
815 | if (bad) | 801 | if (bad) |
816 | return false; | 802 | return false; |
817 | 803 | ||
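
With destroy_compound_page() removed, the tail-page sanity checks now run from free_pages_prepare() itself, and only when CONFIG_DEBUG_VM is enabled. The standalone restatement below captures just the decision logic; struct page is reduced to the two fields the check needs, so this is an illustration rather than the kernel source.

#include <stdbool.h>
#include <stdio.h>

struct page {
	bool page_tail;		 /* stands in for PageTail()      */
	struct page *first_page; /* back pointer to the head page */
};

/* Mirrors free_tail_pages_check(): returns the number of problems
 * found for one tail page of a compound page being freed. */
static int check_tail_page(struct page *head, struct page *tail)
{
	if (!tail->page_tail) {
		fprintf(stderr, "PageTail not set\n");
		return 1;
	}
	if (tail->first_page != head) {
		fprintf(stderr, "first_page not consistent\n");
		return 1;
	}
	return 0;
}
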
@@ -970,7 +956,8 @@ static inline int check_new_page(struct page *page) | |||
970 | return 0; | 956 | return 0; |
971 | } | 957 | } |
972 | 958 | ||
973 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | 959 | static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, |
960 | int alloc_flags) | ||
974 | { | 961 | { |
975 | int i; | 962 | int i; |
976 | 963 | ||
@@ -985,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | |||
985 | 972 | ||
986 | arch_alloc_page(page, order); | 973 | arch_alloc_page(page, order); |
987 | kernel_map_pages(page, 1 << order, 1); | 974 | kernel_map_pages(page, 1 << order, 1); |
975 | kasan_alloc_pages(page, order); | ||
988 | 976 | ||
989 | if (gfp_flags & __GFP_ZERO) | 977 | if (gfp_flags & __GFP_ZERO) |
990 | prep_zero_page(page, order, gfp_flags); | 978 | prep_zero_page(page, order, gfp_flags); |
@@ -994,6 +982,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) | |||
994 | 982 | ||
995 | set_page_owner(page, order, gfp_flags); | 983 | set_page_owner(page, order, gfp_flags); |
996 | 984 | ||
985 | /* | ||
986 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to | ||
987 | * allocate the page. The expectation is that the caller is taking | ||
988 | * steps that will free more memory. The caller should avoid the page | ||
989 | * being used for !PFMEMALLOC purposes. | ||
990 | */ | ||
991 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | ||
992 | |||
997 | return 0; | 993 | return 0; |
998 | } | 994 | } |
999 | 995 | ||
@@ -1130,39 +1126,34 @@ static void change_pageblock_range(struct page *pageblock_page, | |||
1130 | } | 1126 | } |
1131 | 1127 | ||
1132 | /* | 1128 | /* |
1133 | * If breaking a large block of pages, move all free pages to the preferred | 1129 | * When we are falling back to another migratetype during allocation, try to |
1134 | * allocation list. If falling back for a reclaimable kernel allocation, be | 1130 | * steal extra free pages from the same pageblocks to satisfy further |
1135 | * more aggressive about taking ownership of free pages. | 1131 | * allocations, instead of polluting multiple pageblocks. |
1136 | * | 1132 | * |
1137 | * On the other hand, never change migration type of MIGRATE_CMA pageblocks | 1133 | * If we are stealing a relatively large buddy page, it is likely there will |
1138 | * nor move CMA pages to different free lists. We don't want unmovable pages | 1134 | * be more free pages in the pageblock, so try to steal them all. For |
1139 | * to be allocated from MIGRATE_CMA areas. | 1135 | * reclaimable and unmovable allocations, we steal regardless of page size, |
1136 | * as fragmentation caused by those allocations polluting movable pageblocks | ||
1137 | * is worse than movable allocations stealing from unmovable and reclaimable | ||
1138 | * pageblocks. | ||
1140 | * | 1139 | * |
1141 | * Returns the new migratetype of the pageblock (or the same old migratetype | 1140 | * If we claim more than half of the pageblock, change pageblock's migratetype |
1142 | * if it was unchanged). | 1141 | * as well. |
1143 | */ | 1142 | */ |
1144 | static int try_to_steal_freepages(struct zone *zone, struct page *page, | 1143 | static void try_to_steal_freepages(struct zone *zone, struct page *page, |
1145 | int start_type, int fallback_type) | 1144 | int start_type, int fallback_type) |
1146 | { | 1145 | { |
1147 | int current_order = page_order(page); | 1146 | int current_order = page_order(page); |
1148 | 1147 | ||
1149 | /* | ||
1150 | * When borrowing from MIGRATE_CMA, we need to release the excess | ||
1151 | * buddy pages to CMA itself. We also ensure the freepage_migratetype | ||
1152 | * is set to CMA so it is returned to the correct freelist in case | ||
1153 | * the page ends up being not actually allocated from the pcp lists. | ||
1154 | */ | ||
1155 | if (is_migrate_cma(fallback_type)) | ||
1156 | return fallback_type; | ||
1157 | |||
1158 | /* Take ownership for orders >= pageblock_order */ | 1148 | /* Take ownership for orders >= pageblock_order */ |
1159 | if (current_order >= pageblock_order) { | 1149 | if (current_order >= pageblock_order) { |
1160 | change_pageblock_range(page, current_order, start_type); | 1150 | change_pageblock_range(page, current_order, start_type); |
1161 | return start_type; | 1151 | return; |
1162 | } | 1152 | } |
1163 | 1153 | ||
1164 | if (current_order >= pageblock_order / 2 || | 1154 | if (current_order >= pageblock_order / 2 || |
1165 | start_type == MIGRATE_RECLAIMABLE || | 1155 | start_type == MIGRATE_RECLAIMABLE || |
1156 | start_type == MIGRATE_UNMOVABLE || | ||
1166 | page_group_by_mobility_disabled) { | 1157 | page_group_by_mobility_disabled) { |
1167 | int pages; | 1158 | int pages; |
1168 | 1159 | ||
@@ -1170,15 +1161,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, | |||
1170 | 1161 | ||
1171 | /* Claim the whole block if over half of it is free */ | 1162 | /* Claim the whole block if over half of it is free */ |
1172 | if (pages >= (1 << (pageblock_order-1)) || | 1163 | if (pages >= (1 << (pageblock_order-1)) || |
1173 | page_group_by_mobility_disabled) { | 1164 | page_group_by_mobility_disabled) |
1174 | |||
1175 | set_pageblock_migratetype(page, start_type); | 1165 | set_pageblock_migratetype(page, start_type); |
1176 | return start_type; | ||
1177 | } | ||
1178 | |||
1179 | } | 1166 | } |
1180 | |||
1181 | return fallback_type; | ||
1182 | } | 1167 | } |
1183 | 1168 | ||
1184 | /* Remove an element from the buddy allocator from the fallback list */ | 1169 | /* Remove an element from the buddy allocator from the fallback list */ |
@@ -1188,14 +1173,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
1188 | struct free_area *area; | 1173 | struct free_area *area; |
1189 | unsigned int current_order; | 1174 | unsigned int current_order; |
1190 | struct page *page; | 1175 | struct page *page; |
1191 | int migratetype, new_type, i; | ||
1192 | 1176 | ||
1193 | /* Find the largest possible block of pages in the other list */ | 1177 | /* Find the largest possible block of pages in the other list */ |
1194 | for (current_order = MAX_ORDER-1; | 1178 | for (current_order = MAX_ORDER-1; |
1195 | current_order >= order && current_order <= MAX_ORDER-1; | 1179 | current_order >= order && current_order <= MAX_ORDER-1; |
1196 | --current_order) { | 1180 | --current_order) { |
1181 | int i; | ||
1197 | for (i = 0;; i++) { | 1182 | for (i = 0;; i++) { |
1198 | migratetype = fallbacks[start_migratetype][i]; | 1183 | int migratetype = fallbacks[start_migratetype][i]; |
1184 | int buddy_type = start_migratetype; | ||
1199 | 1185 | ||
1200 | /* MIGRATE_RESERVE handled later if necessary */ | 1186 | /* MIGRATE_RESERVE handled later if necessary */ |
1201 | if (migratetype == MIGRATE_RESERVE) | 1187 | if (migratetype == MIGRATE_RESERVE) |
@@ -1209,25 +1195,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
1209 | struct page, lru); | 1195 | struct page, lru); |
1210 | area->nr_free--; | 1196 | area->nr_free--; |
1211 | 1197 | ||
1212 | new_type = try_to_steal_freepages(zone, page, | 1198 | if (!is_migrate_cma(migratetype)) { |
1213 | start_migratetype, | 1199 | try_to_steal_freepages(zone, page, |
1214 | migratetype); | 1200 | start_migratetype, |
1201 | migratetype); | ||
1202 | } else { | ||
1203 | /* | ||
1204 | * When borrowing from MIGRATE_CMA, we need to | ||
1205 | * release the excess buddy pages to CMA | ||
1206 | * itself, and we do not try to steal extra | ||
1207 | * free pages. | ||
1208 | */ | ||
1209 | buddy_type = migratetype; | ||
1210 | } | ||
1215 | 1211 | ||
1216 | /* Remove the page from the freelists */ | 1212 | /* Remove the page from the freelists */ |
1217 | list_del(&page->lru); | 1213 | list_del(&page->lru); |
1218 | rmv_page_order(page); | 1214 | rmv_page_order(page); |
1219 | 1215 | ||
1220 | expand(zone, page, order, current_order, area, | 1216 | expand(zone, page, order, current_order, area, |
1221 | new_type); | 1217 | buddy_type); |
1222 | /* The freepage_migratetype may differ from pageblock's | 1218 | |
1219 | /* | ||
1220 | * The freepage_migratetype may differ from pageblock's | ||
1223 | * migratetype depending on the decisions in | 1221 | * migratetype depending on the decisions in |
1224 | * try_to_steal_freepages. This is OK as long as it does | 1222 | * try_to_steal_freepages(). This is OK as long as it |
1225 | * not differ for MIGRATE_CMA type. | 1223 | * does not differ for MIGRATE_CMA pageblocks. For CMA |
1224 | * we need to make sure unallocated pages flushed from | ||
1225 | * pcp lists are returned to the correct freelist. | ||
1226 | */ | 1226 | */ |
1227 | set_freepage_migratetype(page, new_type); | 1227 | set_freepage_migratetype(page, buddy_type); |
1228 | 1228 | ||
1229 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1229 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1230 | start_migratetype, migratetype, new_type); | 1230 | start_migratetype, migratetype); |
1231 | 1231 | ||
1232 | return page; | 1232 | return page; |
1233 | } | 1233 | } |
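
The rewritten try_to_steal_freepages() no longer reports a new migratetype back to the caller; __rmqueue_fallback() now tracks the buddy type itself and skips stealing for MIGRATE_CMA. The pure decision function below summarizes when the whole pageblock is claimed for the requested type; the enum values and PAGEBLOCK_ORDER constant are stand-ins for the real kernel definitions, so treat this as a sketch of the policy only.

#include <stdbool.h>

enum mt { MT_MOVABLE, MT_RECLAIMABLE, MT_UNMOVABLE };	/* stand-ins */

#define PAGEBLOCK_ORDER	9	/* typical value, assumed for illustration */

/*
 * Claim the whole pageblock for start_type?  current_order is the order
 * of the buddy page being stolen, free_pages the number of free pages
 * found in its pageblock.
 */
static bool claim_whole_pageblock(unsigned int current_order, enum mt start_type,
				  int free_pages, bool mobility_disabled)
{
	if (current_order >= PAGEBLOCK_ORDER)
		return true;		/* large enough to own outright */

	if (current_order >= PAGEBLOCK_ORDER / 2 ||
	    start_type == MT_RECLAIMABLE ||
	    start_type == MT_UNMOVABLE ||
	    mobility_disabled)
		return free_pages >= (1 << (PAGEBLOCK_ORDER - 1)) ||
		       mobility_disabled;

	return false;	/* movable fallback of a small buddy: don't claim */
}
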
@@ -1642,9 +1642,7 @@ int split_free_page(struct page *page) | |||
1642 | } | 1642 | } |
1643 | 1643 | ||
1644 | /* | 1644 | /* |
1645 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But | 1645 | * Allocate a page from the given zone. Use pcplists for order-0 allocations. |
1646 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | ||
1647 | * or two. | ||
1648 | */ | 1646 | */ |
1649 | static inline | 1647 | static inline |
1650 | struct page *buffered_rmqueue(struct zone *preferred_zone, | 1648 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
@@ -1655,7 +1653,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
1655 | struct page *page; | 1653 | struct page *page; |
1656 | bool cold = ((gfp_flags & __GFP_COLD) != 0); | 1654 | bool cold = ((gfp_flags & __GFP_COLD) != 0); |
1657 | 1655 | ||
1658 | again: | ||
1659 | if (likely(order == 0)) { | 1656 | if (likely(order == 0)) { |
1660 | struct per_cpu_pages *pcp; | 1657 | struct per_cpu_pages *pcp; |
1661 | struct list_head *list; | 1658 | struct list_head *list; |
@@ -1711,8 +1708,6 @@ again: | |||
1711 | local_irq_restore(flags); | 1708 | local_irq_restore(flags); |
1712 | 1709 | ||
1713 | VM_BUG_ON_PAGE(bad_range(zone, page), page); | 1710 | VM_BUG_ON_PAGE(bad_range(zone, page), page); |
1714 | if (prep_new_page(page, order, gfp_flags)) | ||
1715 | goto again; | ||
1716 | return page; | 1711 | return page; |
1717 | 1712 | ||
1718 | failed: | 1713 | failed: |
@@ -2033,10 +2028,10 @@ static void reset_alloc_batches(struct zone *preferred_zone) | |||
2033 | * a page. | 2028 | * a page. |
2034 | */ | 2029 | */ |
2035 | static struct page * | 2030 | static struct page * |
2036 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 2031 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, |
2037 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, | 2032 | const struct alloc_context *ac) |
2038 | struct zone *preferred_zone, int classzone_idx, int migratetype) | ||
2039 | { | 2033 | { |
2034 | struct zonelist *zonelist = ac->zonelist; | ||
2040 | struct zoneref *z; | 2035 | struct zoneref *z; |
2041 | struct page *page = NULL; | 2036 | struct page *page = NULL; |
2042 | struct zone *zone; | 2037 | struct zone *zone; |
@@ -2055,8 +2050,8 @@ zonelist_scan: | |||
2055 | * Scan zonelist, looking for a zone with enough free. | 2050 | * Scan zonelist, looking for a zone with enough free. |
2056 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. | 2051 | * See also __cpuset_node_allowed() comment in kernel/cpuset.c. |
2057 | */ | 2052 | */ |
2058 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2053 | for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, |
2059 | high_zoneidx, nodemask) { | 2054 | ac->nodemask) { |
2060 | unsigned long mark; | 2055 | unsigned long mark; |
2061 | 2056 | ||
2062 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && | 2057 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active && |
@@ -2073,7 +2068,7 @@ zonelist_scan: | |||
2073 | * time the page has in memory before being reclaimed. | 2068 | * time the page has in memory before being reclaimed. |
2074 | */ | 2069 | */ |
2075 | if (alloc_flags & ALLOC_FAIR) { | 2070 | if (alloc_flags & ALLOC_FAIR) { |
2076 | if (!zone_local(preferred_zone, zone)) | 2071 | if (!zone_local(ac->preferred_zone, zone)) |
2077 | break; | 2072 | break; |
2078 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { | 2073 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { |
2079 | nr_fair_skipped++; | 2074 | nr_fair_skipped++; |
@@ -2111,7 +2106,7 @@ zonelist_scan: | |||
2111 | 2106 | ||
2112 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 2107 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
2113 | if (!zone_watermark_ok(zone, order, mark, | 2108 | if (!zone_watermark_ok(zone, order, mark, |
2114 | classzone_idx, alloc_flags)) { | 2109 | ac->classzone_idx, alloc_flags)) { |
2115 | int ret; | 2110 | int ret; |
2116 | 2111 | ||
2117 | /* Checked here to keep the fast path fast */ | 2112 | /* Checked here to keep the fast path fast */ |
@@ -2132,7 +2127,7 @@ zonelist_scan: | |||
2132 | } | 2127 | } |
2133 | 2128 | ||
2134 | if (zone_reclaim_mode == 0 || | 2129 | if (zone_reclaim_mode == 0 || |
2135 | !zone_allows_reclaim(preferred_zone, zone)) | 2130 | !zone_allows_reclaim(ac->preferred_zone, zone)) |
2136 | goto this_zone_full; | 2131 | goto this_zone_full; |
2137 | 2132 | ||
2138 | /* | 2133 | /* |
@@ -2154,7 +2149,7 @@ zonelist_scan: | |||
2154 | default: | 2149 | default: |
2155 | /* did we reclaim enough */ | 2150 | /* did we reclaim enough */ |
2156 | if (zone_watermark_ok(zone, order, mark, | 2151 | if (zone_watermark_ok(zone, order, mark, |
2157 | classzone_idx, alloc_flags)) | 2152 | ac->classzone_idx, alloc_flags)) |
2158 | goto try_this_zone; | 2153 | goto try_this_zone; |
2159 | 2154 | ||
2160 | /* | 2155 | /* |
@@ -2175,27 +2170,18 @@ zonelist_scan: | |||
2175 | } | 2170 | } |
2176 | 2171 | ||
2177 | try_this_zone: | 2172 | try_this_zone: |
2178 | page = buffered_rmqueue(preferred_zone, zone, order, | 2173 | page = buffered_rmqueue(ac->preferred_zone, zone, order, |
2179 | gfp_mask, migratetype); | 2174 | gfp_mask, ac->migratetype); |
2180 | if (page) | 2175 | if (page) { |
2181 | break; | 2176 | if (prep_new_page(page, order, gfp_mask, alloc_flags)) |
2177 | goto try_this_zone; | ||
2178 | return page; | ||
2179 | } | ||
2182 | this_zone_full: | 2180 | this_zone_full: |
2183 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) | 2181 | if (IS_ENABLED(CONFIG_NUMA) && zlc_active) |
2184 | zlc_mark_zone_full(zonelist, z); | 2182 | zlc_mark_zone_full(zonelist, z); |
2185 | } | 2183 | } |
2186 | 2184 | ||
2187 | if (page) { | ||
2188 | /* | ||
2189 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | ||
2190 | * necessary to allocate the page. The expectation is | ||
2191 | * that the caller is taking steps that will free more | ||
2192 | * memory. The caller should avoid the page being used | ||
2193 | * for !PFMEMALLOC purposes. | ||
2194 | */ | ||
2195 | page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); | ||
2196 | return page; | ||
2197 | } | ||
2198 | |||
2199 | /* | 2185 | /* |
2200 | * The first pass makes sure allocations are spread fairly within the | 2186 | * The first pass makes sure allocations are spread fairly within the |
2201 | * local node. However, the local node might have free pages left | 2187 | * local node. However, the local node might have free pages left |
@@ -2208,7 +2194,7 @@ this_zone_full: | |||
2208 | alloc_flags &= ~ALLOC_FAIR; | 2194 | alloc_flags &= ~ALLOC_FAIR; |
2209 | if (nr_fair_skipped) { | 2195 | if (nr_fair_skipped) { |
2210 | zonelist_rescan = true; | 2196 | zonelist_rescan = true; |
2211 | reset_alloc_batches(preferred_zone); | 2197 | reset_alloc_batches(ac->preferred_zone); |
2212 | } | 2198 | } |
2213 | if (nr_online_nodes > 1) | 2199 | if (nr_online_nodes > 1) |
2214 | zonelist_rescan = true; | 2200 | zonelist_rescan = true; |
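
Most of the remaining page_alloc.c churn comes from bundling the long parameter lists (zonelist, nodemask, preferred_zone, classzone_idx, migratetype and high_zoneidx) into a single struct alloc_context that is filled once per allocation and passed around by const pointer. The sketch below only illustrates the shape of that refactoring; the field types are simplified placeholders, not the kernel definitions.

/* Simplified shape of the consolidated allocation context. */
struct alloc_context {
	void *zonelist;		/* struct zonelist *  in the kernel */
	void *nodemask;		/* nodemask_t *                     */
	void *preferred_zone;	/* struct zone *                    */
	int classzone_idx;
	int migratetype;
	int high_zoneidx;	/* enum zone_type                   */
};

/* Helpers such as get_page_from_freelist() then take
 * (gfp_mask, order, alloc_flags, const struct alloc_context *ac)
 * instead of six or seven separate arguments. */
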
@@ -2330,44 +2316,44 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, | |||
2330 | 2316 | ||
2331 | static inline struct page * | 2317 | static inline struct page * |
2332 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | 2318 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
2333 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2319 | const struct alloc_context *ac, unsigned long *did_some_progress) |
2334 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
2335 | int classzone_idx, int migratetype) | ||
2336 | { | 2320 | { |
2337 | struct page *page; | 2321 | struct page *page; |
2338 | 2322 | ||
2339 | /* Acquire the per-zone oom lock for each zone */ | 2323 | *did_some_progress = 0; |
2340 | if (!oom_zonelist_trylock(zonelist, gfp_mask)) { | ||
2341 | schedule_timeout_uninterruptible(1); | ||
2342 | return NULL; | ||
2343 | } | ||
2344 | 2324 | ||
2345 | /* | 2325 | /* |
2346 | * PM-freezer should be notified that there might be an OOM killer on | 2326 | * Acquire the per-zone oom lock for each zone. If that |
2347 | * its way to kill and wake somebody up. This is too early and we might | 2327 | * fails, somebody else is making progress for us. |
2348 | * end up not killing anything but false positives are acceptable. | ||
2349 | * See freeze_processes. | ||
2350 | */ | 2328 | */ |
2351 | note_oom_kill(); | 2329 | if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { |
2330 | *did_some_progress = 1; | ||
2331 | schedule_timeout_uninterruptible(1); | ||
2332 | return NULL; | ||
2333 | } | ||
2352 | 2334 | ||
2353 | /* | 2335 | /* |
2354 | * Go through the zonelist yet one more time, keep very high watermark | 2336 | * Go through the zonelist yet one more time, keep very high watermark |
2355 | * here, this is only to catch a parallel oom killing, we must fail if | 2337 | * here, this is only to catch a parallel oom killing, we must fail if |
2356 | * we're still under heavy pressure. | 2338 | * we're still under heavy pressure. |
2357 | */ | 2339 | */ |
2358 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | 2340 | page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, |
2359 | order, zonelist, high_zoneidx, | 2341 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); |
2360 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | ||
2361 | preferred_zone, classzone_idx, migratetype); | ||
2362 | if (page) | 2342 | if (page) |
2363 | goto out; | 2343 | goto out; |
2364 | 2344 | ||
2365 | if (!(gfp_mask & __GFP_NOFAIL)) { | 2345 | if (!(gfp_mask & __GFP_NOFAIL)) { |
2346 | /* Coredumps can quickly deplete all memory reserves */ | ||
2347 | if (current->flags & PF_DUMPCORE) | ||
2348 | goto out; | ||
2366 | /* The OOM killer will not help higher order allocs */ | 2349 | /* The OOM killer will not help higher order allocs */ |
2367 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 2350 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
2368 | goto out; | 2351 | goto out; |
2369 | /* The OOM killer does not needlessly kill tasks for lowmem */ | 2352 | /* The OOM killer does not needlessly kill tasks for lowmem */ |
2370 | if (high_zoneidx < ZONE_NORMAL) | 2353 | if (ac->high_zoneidx < ZONE_NORMAL) |
2354 | goto out; | ||
2355 | /* The OOM killer does not compensate for light reclaim */ | ||
2356 | if (!(gfp_mask & __GFP_FS)) | ||
2371 | goto out; | 2357 | goto out; |
2372 | /* | 2358 | /* |
2373 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. | 2359 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. |
@@ -2380,10 +2366,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
2380 | goto out; | 2366 | goto out; |
2381 | } | 2367 | } |
2382 | /* Exhausted what can be done so it's blamo time */ | 2368 | /* Exhausted what can be done so it's blamo time */ |
2383 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); | 2369 | if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) |
2384 | 2370 | *did_some_progress = 1; | |
2385 | out: | 2371 | out: |
2386 | oom_zonelist_unlock(zonelist, gfp_mask); | 2372 | oom_zonelist_unlock(ac->zonelist, gfp_mask); |
2387 | return page; | 2373 | return page; |
2388 | } | 2374 | } |
2389 | 2375 | ||
@@ -2391,10 +2377,9 @@ out: | |||
2391 | /* Try memory compaction for high-order allocations before reclaim */ | 2377 | /* Try memory compaction for high-order allocations before reclaim */ |
2392 | static struct page * | 2378 | static struct page * |
2393 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2379 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2394 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2380 | int alloc_flags, const struct alloc_context *ac, |
2395 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2381 | enum migrate_mode mode, int *contended_compaction, |
2396 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2382 | bool *deferred_compaction) |
2397 | int *contended_compaction, bool *deferred_compaction) | ||
2398 | { | 2383 | { |
2399 | unsigned long compact_result; | 2384 | unsigned long compact_result; |
2400 | struct page *page; | 2385 | struct page *page; |
@@ -2403,10 +2388,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2403 | return NULL; | 2388 | return NULL; |
2404 | 2389 | ||
2405 | current->flags |= PF_MEMALLOC; | 2390 | current->flags |= PF_MEMALLOC; |
2406 | compact_result = try_to_compact_pages(zonelist, order, gfp_mask, | 2391 | compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, |
2407 | nodemask, mode, | 2392 | mode, contended_compaction); |
2408 | contended_compaction, | ||
2409 | alloc_flags, classzone_idx); | ||
2410 | current->flags &= ~PF_MEMALLOC; | 2393 | current->flags &= ~PF_MEMALLOC; |
2411 | 2394 | ||
2412 | switch (compact_result) { | 2395 | switch (compact_result) { |
@@ -2425,10 +2408,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2425 | */ | 2408 | */ |
2426 | count_vm_event(COMPACTSTALL); | 2409 | count_vm_event(COMPACTSTALL); |
2427 | 2410 | ||
2428 | page = get_page_from_freelist(gfp_mask, nodemask, | 2411 | page = get_page_from_freelist(gfp_mask, order, |
2429 | order, zonelist, high_zoneidx, | 2412 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
2430 | alloc_flags & ~ALLOC_NO_WATERMARKS, | ||
2431 | preferred_zone, classzone_idx, migratetype); | ||
2432 | 2413 | ||
2433 | if (page) { | 2414 | if (page) { |
2434 | struct zone *zone = page_zone(page); | 2415 | struct zone *zone = page_zone(page); |
@@ -2452,10 +2433,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2452 | #else | 2433 | #else |
2453 | static inline struct page * | 2434 | static inline struct page * |
2454 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2435 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2455 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2436 | int alloc_flags, const struct alloc_context *ac, |
2456 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2437 | enum migrate_mode mode, int *contended_compaction, |
2457 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2438 | bool *deferred_compaction) |
2458 | int *contended_compaction, bool *deferred_compaction) | ||
2459 | { | 2439 | { |
2460 | return NULL; | 2440 | return NULL; |
2461 | } | 2441 | } |
@@ -2463,8 +2443,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2463 | 2443 | ||
2464 | /* Perform direct synchronous page reclaim */ | 2444 | /* Perform direct synchronous page reclaim */ |
2465 | static int | 2445 | static int |
2466 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | 2446 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, |
2467 | nodemask_t *nodemask) | 2447 | const struct alloc_context *ac) |
2468 | { | 2448 | { |
2469 | struct reclaim_state reclaim_state; | 2449 | struct reclaim_state reclaim_state; |
2470 | int progress; | 2450 | int progress; |
@@ -2478,7 +2458,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | |||
2478 | reclaim_state.reclaimed_slab = 0; | 2458 | reclaim_state.reclaimed_slab = 0; |
2479 | current->reclaim_state = &reclaim_state; | 2459 | current->reclaim_state = &reclaim_state; |
2480 | 2460 | ||
2481 | progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 2461 | progress = try_to_free_pages(ac->zonelist, order, gfp_mask, |
2462 | ac->nodemask); | ||
2482 | 2463 | ||
2483 | current->reclaim_state = NULL; | 2464 | current->reclaim_state = NULL; |
2484 | lockdep_clear_current_reclaim_state(); | 2465 | lockdep_clear_current_reclaim_state(); |
@@ -2492,28 +2473,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | |||
2492 | /* The really slow allocator path where we enter direct reclaim */ | 2473 | /* The really slow allocator path where we enter direct reclaim */ |
2493 | static inline struct page * | 2474 | static inline struct page * |
2494 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2475 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
2495 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2476 | int alloc_flags, const struct alloc_context *ac, |
2496 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2477 | unsigned long *did_some_progress) |
2497 | int classzone_idx, int migratetype, unsigned long *did_some_progress) | ||
2498 | { | 2478 | { |
2499 | struct page *page = NULL; | 2479 | struct page *page = NULL; |
2500 | bool drained = false; | 2480 | bool drained = false; |
2501 | 2481 | ||
2502 | *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | 2482 | *did_some_progress = __perform_reclaim(gfp_mask, order, ac); |
2503 | nodemask); | ||
2504 | if (unlikely(!(*did_some_progress))) | 2483 | if (unlikely(!(*did_some_progress))) |
2505 | return NULL; | 2484 | return NULL; |
2506 | 2485 | ||
2507 | /* After successful reclaim, reconsider all zones for allocation */ | 2486 | /* After successful reclaim, reconsider all zones for allocation */ |
2508 | if (IS_ENABLED(CONFIG_NUMA)) | 2487 | if (IS_ENABLED(CONFIG_NUMA)) |
2509 | zlc_clear_zones_full(zonelist); | 2488 | zlc_clear_zones_full(ac->zonelist); |
2510 | 2489 | ||
2511 | retry: | 2490 | retry: |
2512 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2491 | page = get_page_from_freelist(gfp_mask, order, |
2513 | zonelist, high_zoneidx, | 2492 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
2514 | alloc_flags & ~ALLOC_NO_WATERMARKS, | ||
2515 | preferred_zone, classzone_idx, | ||
2516 | migratetype); | ||
2517 | 2493 | ||
2518 | /* | 2494 | /* |
2519 | * If an allocation failed after direct reclaim, it could be because | 2495 | * If an allocation failed after direct reclaim, it could be because |
@@ -2534,36 +2510,30 @@ retry: | |||
2534 | */ | 2510 | */ |
2535 | static inline struct page * | 2511 | static inline struct page * |
2536 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | 2512 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
2537 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2513 | const struct alloc_context *ac) |
2538 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
2539 | int classzone_idx, int migratetype) | ||
2540 | { | 2514 | { |
2541 | struct page *page; | 2515 | struct page *page; |
2542 | 2516 | ||
2543 | do { | 2517 | do { |
2544 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2518 | page = get_page_from_freelist(gfp_mask, order, |
2545 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | 2519 | ALLOC_NO_WATERMARKS, ac); |
2546 | preferred_zone, classzone_idx, migratetype); | ||
2547 | 2520 | ||
2548 | if (!page && gfp_mask & __GFP_NOFAIL) | 2521 | if (!page && gfp_mask & __GFP_NOFAIL) |
2549 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2522 | wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, |
2523 | HZ/50); | ||
2550 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | 2524 | } while (!page && (gfp_mask & __GFP_NOFAIL)); |
2551 | 2525 | ||
2552 | return page; | 2526 | return page; |
2553 | } | 2527 | } |
2554 | 2528 | ||
2555 | static void wake_all_kswapds(unsigned int order, | 2529 | static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) |
2556 | struct zonelist *zonelist, | ||
2557 | enum zone_type high_zoneidx, | ||
2558 | struct zone *preferred_zone, | ||
2559 | nodemask_t *nodemask) | ||
2560 | { | 2530 | { |
2561 | struct zoneref *z; | 2531 | struct zoneref *z; |
2562 | struct zone *zone; | 2532 | struct zone *zone; |
2563 | 2533 | ||
2564 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2534 | for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, |
2565 | high_zoneidx, nodemask) | 2535 | ac->high_zoneidx, ac->nodemask) |
2566 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); | 2536 | wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); |
2567 | } | 2537 | } |
2568 | 2538 | ||
2569 | static inline int | 2539 | static inline int |
@@ -2622,9 +2592,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | |||
2622 | 2592 | ||
2623 | static inline struct page * | 2593 | static inline struct page * |
2624 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2594 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2625 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2595 | struct alloc_context *ac) |
2626 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
2627 | int classzone_idx, int migratetype) | ||
2628 | { | 2596 | { |
2629 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2597 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
2630 | struct page *page = NULL; | 2598 | struct page *page = NULL; |
@@ -2658,10 +2626,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2658 | (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 2626 | (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
2659 | goto nopage; | 2627 | goto nopage; |
2660 | 2628 | ||
2661 | restart: | 2629 | retry: |
2662 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2630 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2663 | wake_all_kswapds(order, zonelist, high_zoneidx, | 2631 | wake_all_kswapds(order, ac); |
2664 | preferred_zone, nodemask); | ||
2665 | 2632 | ||
2666 | /* | 2633 | /* |
2667 | * OK, we're below the kswapd watermark and have kicked background | 2634 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2674,18 +2641,16 @@ restart: | |||
2674 | * Find the true preferred zone if the allocation is unconstrained by | 2641 | * Find the true preferred zone if the allocation is unconstrained by |
2675 | * cpusets. | 2642 | * cpusets. |
2676 | */ | 2643 | */ |
2677 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { | 2644 | if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { |
2678 | struct zoneref *preferred_zoneref; | 2645 | struct zoneref *preferred_zoneref; |
2679 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, | 2646 | preferred_zoneref = first_zones_zonelist(ac->zonelist, |
2680 | NULL, &preferred_zone); | 2647 | ac->high_zoneidx, NULL, &ac->preferred_zone); |
2681 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | 2648 | ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); |
2682 | } | 2649 | } |
2683 | 2650 | ||
2684 | rebalance: | ||
2685 | /* This is the last chance, in general, before the goto nopage. */ | 2651 | /* This is the last chance, in general, before the goto nopage. */ |
2686 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2652 | page = get_page_from_freelist(gfp_mask, order, |
2687 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2653 | alloc_flags & ~ALLOC_NO_WATERMARKS, ac); |
2688 | preferred_zone, classzone_idx, migratetype); | ||
2689 | if (page) | 2654 | if (page) |
2690 | goto got_pg; | 2655 | goto got_pg; |
2691 | 2656 | ||
@@ -2696,11 +2661,10 @@ rebalance: | |||
2696 | * the allocation is high priority and these type of | 2661 | * the allocation is high priority and these type of |
2697 | * allocations are system rather than user orientated | 2662 | * allocations are system rather than user orientated |
2698 | */ | 2663 | */ |
2699 | zonelist = node_zonelist(numa_node_id(), gfp_mask); | 2664 | ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); |
2665 | |||
2666 | page = __alloc_pages_high_priority(gfp_mask, order, ac); | ||
2700 | 2667 | ||
2701 | page = __alloc_pages_high_priority(gfp_mask, order, | ||
2702 | zonelist, high_zoneidx, nodemask, | ||
2703 | preferred_zone, classzone_idx, migratetype); | ||
2704 | if (page) { | 2668 | if (page) { |
2705 | goto got_pg; | 2669 | goto got_pg; |
2706 | } | 2670 | } |
@@ -2729,11 +2693,9 @@ rebalance: | |||
2729 | * Try direct compaction. The first pass is asynchronous. Subsequent | 2693 | * Try direct compaction. The first pass is asynchronous. Subsequent |
2730 | * attempts after direct reclaim are synchronous | 2694 | * attempts after direct reclaim are synchronous |
2731 | */ | 2695 | */ |
2732 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, | 2696 | page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, |
2733 | high_zoneidx, nodemask, alloc_flags, | 2697 | migration_mode, |
2734 | preferred_zone, | 2698 | &contended_compaction, |
2735 | classzone_idx, migratetype, | ||
2736 | migration_mode, &contended_compaction, | ||
2737 | &deferred_compaction); | 2699 | &deferred_compaction); |
2738 | if (page) | 2700 | if (page) |
2739 | goto got_pg; | 2701 | goto got_pg; |
@@ -2779,74 +2741,40 @@ rebalance: | |||
2779 | migration_mode = MIGRATE_SYNC_LIGHT; | 2741 | migration_mode = MIGRATE_SYNC_LIGHT; |
2780 | 2742 | ||
2781 | /* Try direct reclaim and then allocating */ | 2743 | /* Try direct reclaim and then allocating */ |
2782 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2744 | page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, |
2783 | zonelist, high_zoneidx, | 2745 | &did_some_progress); |
2784 | nodemask, | ||
2785 | alloc_flags, preferred_zone, | ||
2786 | classzone_idx, migratetype, | ||
2787 | &did_some_progress); | ||
2788 | if (page) | 2746 | if (page) |
2789 | goto got_pg; | 2747 | goto got_pg; |
2790 | 2748 | ||
2791 | /* | ||
2792 | * If we failed to make any progress reclaiming, then we are | ||
2793 | * running out of options and have to consider going OOM | ||
2794 | */ | ||
2795 | if (!did_some_progress) { | ||
2796 | if (oom_gfp_allowed(gfp_mask)) { | ||
2797 | if (oom_killer_disabled) | ||
2798 | goto nopage; | ||
2799 | /* Coredumps can quickly deplete all memory reserves */ | ||
2800 | if ((current->flags & PF_DUMPCORE) && | ||
2801 | !(gfp_mask & __GFP_NOFAIL)) | ||
2802 | goto nopage; | ||
2803 | page = __alloc_pages_may_oom(gfp_mask, order, | ||
2804 | zonelist, high_zoneidx, | ||
2805 | nodemask, preferred_zone, | ||
2806 | classzone_idx, migratetype); | ||
2807 | if (page) | ||
2808 | goto got_pg; | ||
2809 | |||
2810 | if (!(gfp_mask & __GFP_NOFAIL)) { | ||
2811 | /* | ||
2812 | * The oom killer is not called for high-order | ||
2813 | * allocations that may fail, so if no progress | ||
2814 | * is being made, there are no other options and | ||
2815 | * retrying is unlikely to help. | ||
2816 | */ | ||
2817 | if (order > PAGE_ALLOC_COSTLY_ORDER) | ||
2818 | goto nopage; | ||
2819 | /* | ||
2820 | * The oom killer is not called for lowmem | ||
2821 | * allocations to prevent needlessly killing | ||
2822 | * innocent tasks. | ||
2823 | */ | ||
2824 | if (high_zoneidx < ZONE_NORMAL) | ||
2825 | goto nopage; | ||
2826 | } | ||
2827 | |||
2828 | goto restart; | ||
2829 | } | ||
2830 | } | ||
2831 | |||
2832 | /* Check if we should retry the allocation */ | 2749 | /* Check if we should retry the allocation */ |
2833 | pages_reclaimed += did_some_progress; | 2750 | pages_reclaimed += did_some_progress; |
2834 | if (should_alloc_retry(gfp_mask, order, did_some_progress, | 2751 | if (should_alloc_retry(gfp_mask, order, did_some_progress, |
2835 | pages_reclaimed)) { | 2752 | pages_reclaimed)) { |
2753 | /* | ||
2754 | * If we fail to make progress by freeing individual | ||
2755 | * pages, but the allocation wants us to keep going, | ||
2756 | * start OOM killing tasks. | ||
2757 | */ | ||
2758 | if (!did_some_progress) { | ||
2759 | page = __alloc_pages_may_oom(gfp_mask, order, ac, | ||
2760 | &did_some_progress); | ||
2761 | if (page) | ||
2762 | goto got_pg; | ||
2763 | if (!did_some_progress) | ||
2764 | goto nopage; | ||
2765 | } | ||
2836 | /* Wait for some write requests to complete then retry */ | 2766 | /* Wait for some write requests to complete then retry */ |
2837 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2767 | wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); |
2838 | goto rebalance; | 2768 | goto retry; |
2839 | } else { | 2769 | } else { |
2840 | /* | 2770 | /* |
2841 | * High-order allocations do not necessarily loop after | 2771 | * High-order allocations do not necessarily loop after |
2842 | * direct reclaim and reclaim/compaction depends on compaction | 2772 | * direct reclaim and reclaim/compaction depends on compaction |
2843 | * being called after reclaim so call directly if necessary | 2773 | * being called after reclaim so call directly if necessary |
2844 | */ | 2774 | */ |
2845 | page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, | 2775 | page = __alloc_pages_direct_compact(gfp_mask, order, |
2846 | high_zoneidx, nodemask, alloc_flags, | 2776 | alloc_flags, ac, migration_mode, |
2847 | preferred_zone, | 2777 | &contended_compaction, |
2848 | classzone_idx, migratetype, | ||
2849 | migration_mode, &contended_compaction, | ||
2850 | &deferred_compaction); | 2778 | &deferred_compaction); |
2851 | if (page) | 2779 | if (page) |
2852 | goto got_pg; | 2780 | goto got_pg; |
@@ -2854,11 +2782,7 @@ rebalance: | |||
2854 | 2782 | ||
2855 | nopage: | 2783 | nopage: |
2856 | warn_alloc_failed(gfp_mask, order, NULL); | 2784 | warn_alloc_failed(gfp_mask, order, NULL); |
2857 | return page; | ||
2858 | got_pg: | 2785 | got_pg: |
2859 | if (kmemcheck_enabled) | ||
2860 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
2861 | |||
2862 | return page; | 2786 | return page; |
2863 | } | 2787 | } |
2864 | 2788 | ||
@@ -2869,14 +2793,16 @@ struct page * | |||
2869 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | 2793 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, |
2870 | struct zonelist *zonelist, nodemask_t *nodemask) | 2794 | struct zonelist *zonelist, nodemask_t *nodemask) |
2871 | { | 2795 | { |
2872 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
2873 | struct zone *preferred_zone; | ||
2874 | struct zoneref *preferred_zoneref; | 2796 | struct zoneref *preferred_zoneref; |
2875 | struct page *page = NULL; | 2797 | struct page *page = NULL; |
2876 | int migratetype = gfpflags_to_migratetype(gfp_mask); | ||
2877 | unsigned int cpuset_mems_cookie; | 2798 | unsigned int cpuset_mems_cookie; |
2878 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; | 2799 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; |
2879 | int classzone_idx; | 2800 | gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ |
2801 | struct alloc_context ac = { | ||
2802 | .high_zoneidx = gfp_zone(gfp_mask), | ||
2803 | .nodemask = nodemask, | ||
2804 | .migratetype = gfpflags_to_migratetype(gfp_mask), | ||
2805 | }; | ||
2880 | 2806 | ||
2881 | gfp_mask &= gfp_allowed_mask; | 2807 | gfp_mask &= gfp_allowed_mask; |
2882 | 2808 | ||
@@ -2895,37 +2821,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2895 | if (unlikely(!zonelist->_zonerefs->zone)) | 2821 | if (unlikely(!zonelist->_zonerefs->zone)) |
2896 | return NULL; | 2822 | return NULL; |
2897 | 2823 | ||
2898 | if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) | 2824 | if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) |
2899 | alloc_flags |= ALLOC_CMA; | 2825 | alloc_flags |= ALLOC_CMA; |
2900 | 2826 | ||
2901 | retry_cpuset: | 2827 | retry_cpuset: |
2902 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2828 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2903 | 2829 | ||
2830 | /* We set it here, as __alloc_pages_slowpath might have changed it */ | ||
2831 | ac.zonelist = zonelist; | ||
2904 | /* The preferred zone is used for statistics later */ | 2832 | /* The preferred zone is used for statistics later */ |
2905 | preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, | 2833 | preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, |
2906 | nodemask ? : &cpuset_current_mems_allowed, | 2834 | ac.nodemask ? : &cpuset_current_mems_allowed, |
2907 | &preferred_zone); | 2835 | &ac.preferred_zone); |
2908 | if (!preferred_zone) | 2836 | if (!ac.preferred_zone) |
2909 | goto out; | 2837 | goto out; |
2910 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | 2838 | ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); |
2911 | 2839 | ||
2912 | /* First allocation attempt */ | 2840 | /* First allocation attempt */ |
2913 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2841 | alloc_mask = gfp_mask|__GFP_HARDWALL; |
2914 | zonelist, high_zoneidx, alloc_flags, | 2842 | page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); |
2915 | preferred_zone, classzone_idx, migratetype); | ||
2916 | if (unlikely(!page)) { | 2843 | if (unlikely(!page)) { |
2917 | /* | 2844 | /* |
2918 | * Runtime PM, block IO and its error handling path | 2845 | * Runtime PM, block IO and its error handling path |
2919 | * can deadlock because I/O on the device might not | 2846 | * can deadlock because I/O on the device might not |
2920 | * complete. | 2847 | * complete. |
2921 | */ | 2848 | */ |
2922 | gfp_mask = memalloc_noio_flags(gfp_mask); | 2849 | alloc_mask = memalloc_noio_flags(gfp_mask); |
2923 | page = __alloc_pages_slowpath(gfp_mask, order, | 2850 | |
2924 | zonelist, high_zoneidx, nodemask, | 2851 | page = __alloc_pages_slowpath(alloc_mask, order, &ac); |
2925 | preferred_zone, classzone_idx, migratetype); | ||
2926 | } | 2852 | } |
2927 | 2853 | ||
2928 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2854 | if (kmemcheck_enabled && page) |
2855 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
2856 | |||
2857 | trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); | ||
2929 | 2858 | ||
2930 | out: | 2859 | out: |
2931 | /* | 2860 | /* |
@@ -3945,18 +3874,29 @@ static int __build_all_zonelists(void *data) | |||
3945 | return 0; | 3874 | return 0; |
3946 | } | 3875 | } |
3947 | 3876 | ||
3877 | static noinline void __init | ||
3878 | build_all_zonelists_init(void) | ||
3879 | { | ||
3880 | __build_all_zonelists(NULL); | ||
3881 | mminit_verify_zonelist(); | ||
3882 | cpuset_init_current_mems_allowed(); | ||
3883 | } | ||
3884 | |||
3948 | /* | 3885 | /* |
3949 | * Called with zonelists_mutex held always | 3886 | * Called with zonelists_mutex held always |
3950 | * unless system_state == SYSTEM_BOOTING. | 3887 | * unless system_state == SYSTEM_BOOTING. |
3888 | * | ||
3889 | * __ref due to (1) call of __meminit annotated setup_zone_pageset | ||
3890 | * [we're only called with non-NULL zone through __meminit paths] and | ||
3891 | * (2) call of __init annotated helper build_all_zonelists_init | ||
3892 | * [protected by SYSTEM_BOOTING]. | ||
3951 | */ | 3893 | */ |
3952 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) | 3894 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) |
3953 | { | 3895 | { |
3954 | set_zonelist_order(); | 3896 | set_zonelist_order(); |
3955 | 3897 | ||
3956 | if (system_state == SYSTEM_BOOTING) { | 3898 | if (system_state == SYSTEM_BOOTING) { |
3957 | __build_all_zonelists(NULL); | 3899 | build_all_zonelists_init(); |
3958 | mminit_verify_zonelist(); | ||
3959 | cpuset_init_current_mems_allowed(); | ||
3960 | } else { | 3900 | } else { |
3961 | #ifdef CONFIG_MEMORY_HOTPLUG | 3901 | #ifdef CONFIG_MEMORY_HOTPLUG |
3962 | if (zone) | 3902 | if (zone) |
@@ -5059,8 +4999,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
5059 | pgdat->node_start_pfn = node_start_pfn; | 4999 | pgdat->node_start_pfn = node_start_pfn; |
5060 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 5000 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
5061 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); | 5001 | get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); |
5062 | printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, | 5002 | pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, |
5063 | (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); | 5003 | (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); |
5064 | #endif | 5004 | #endif |
5065 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, | 5005 | calculate_node_totalpages(pgdat, start_pfn, end_pfn, |
5066 | zones_size, zholes_size); | 5006 | zones_size, zholes_size); |
@@ -5432,9 +5372,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
5432 | arch_zone_highest_possible_pfn[i]) | 5372 | arch_zone_highest_possible_pfn[i]) |
5433 | pr_cont("empty\n"); | 5373 | pr_cont("empty\n"); |
5434 | else | 5374 | else |
5435 | pr_cont("[mem %0#10lx-%0#10lx]\n", | 5375 | pr_cont("[mem %#018Lx-%#018Lx]\n", |
5436 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, | 5376 | (u64)arch_zone_lowest_possible_pfn[i] |
5437 | (arch_zone_highest_possible_pfn[i] | 5377 | << PAGE_SHIFT, |
5378 | ((u64)arch_zone_highest_possible_pfn[i] | ||
5438 | << PAGE_SHIFT) - 1); | 5379 | << PAGE_SHIFT) - 1); |
5439 | } | 5380 | } |
5440 | 5381 | ||
@@ -5442,15 +5383,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
5442 | pr_info("Movable zone start for each node\n"); | 5383 | pr_info("Movable zone start for each node\n"); |
5443 | for (i = 0; i < MAX_NUMNODES; i++) { | 5384 | for (i = 0; i < MAX_NUMNODES; i++) { |
5444 | if (zone_movable_pfn[i]) | 5385 | if (zone_movable_pfn[i]) |
5445 | pr_info(" Node %d: %#010lx\n", i, | 5386 | pr_info(" Node %d: %#018Lx\n", i, |
5446 | zone_movable_pfn[i] << PAGE_SHIFT); | 5387 | (u64)zone_movable_pfn[i] << PAGE_SHIFT); |
5447 | } | 5388 | } |
5448 | 5389 | ||
5449 | /* Print out the early node map */ | 5390 | /* Print out the early node map */ |
5450 | pr_info("Early memory node ranges\n"); | 5391 | pr_info("Early memory node ranges\n"); |
5451 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 5392 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
5452 | pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, | 5393 | pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, |
5453 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); | 5394 | (u64)start_pfn << PAGE_SHIFT, |
5395 | ((u64)end_pfn << PAGE_SHIFT) - 1); | ||
5454 | 5396 | ||
5455 | /* Initialise every node */ | 5397 | /* Initialise every node */ |
5456 | mminit_verify_pageflags_layout(); | 5398 | mminit_verify_pageflags_layout(); |
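The page_alloc.c hunks above stop threading zonelist/high_zoneidx/nodemask/preferred_zone/classzone_idx/migratetype through every slow-path helper and instead fill a single struct alloc_context once in __alloc_pages_nodemask() and hand a pointer down. A minimal sketch of the shape implied by the ac-> accesses in these hunks; the actual definition lives elsewhere in the series, so member order and comments here are assumptions:

struct alloc_context {
	struct zonelist *zonelist;	/* may be overridden in the slowpath */
	nodemask_t *nodemask;
	struct zone *preferred_zone;
	int classzone_idx;
	int migratetype;
	enum zone_type high_zoneidx;
};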
diff --git a/mm/page_counter.c b/mm/page_counter.c index a009574fbba9..11b4beda14ba 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c | |||
@@ -166,18 +166,19 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit) | |||
166 | /** | 166 | /** |
167 | * page_counter_memparse - memparse() for page counter limits | 167 | * page_counter_memparse - memparse() for page counter limits |
168 | * @buf: string to parse | 168 | * @buf: string to parse |
169 | * @max: string meaning maximum possible value | ||
169 | * @nr_pages: returns the result in number of pages | 170 | * @nr_pages: returns the result in number of pages |
170 | * | 171 | * |
171 | * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be | 172 | * Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be |
172 | * limited to %PAGE_COUNTER_MAX. | 173 | * limited to %PAGE_COUNTER_MAX. |
173 | */ | 174 | */ |
174 | int page_counter_memparse(const char *buf, unsigned long *nr_pages) | 175 | int page_counter_memparse(const char *buf, const char *max, |
176 | unsigned long *nr_pages) | ||
175 | { | 177 | { |
176 | char unlimited[] = "-1"; | ||
177 | char *end; | 178 | char *end; |
178 | u64 bytes; | 179 | u64 bytes; |
179 | 180 | ||
180 | if (!strncmp(buf, unlimited, sizeof(unlimited))) { | 181 | if (!strcmp(buf, max)) { |
181 | *nr_pages = PAGE_COUNTER_MAX; | 182 | *nr_pages = PAGE_COUNTER_MAX; |
182 | return 0; | 183 | return 0; |
183 | } | 184 | } |
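page_counter_memparse() now takes the string that means "no limit" from its caller instead of hard-coding "-1". A hedged usage sketch based only on the signature and kernel-doc above; the literal "max" is illustrative, each interface passes its own spelling:

	unsigned long nr_pages;
	int err;

	err = page_counter_memparse(buf, "max", &nr_pages);	/* -EINVAL on bad input */
	if (err)
		return err;
	err = page_counter_limit(counter, nr_pages);	/* nr_pages already capped at PAGE_COUNTER_MAX */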
diff --git a/mm/page_owner.c b/mm/page_owner.c index 9ab4a9b5bc09..0993f5f36b01 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c | |||
@@ -59,20 +59,19 @@ void __reset_page_owner(struct page *page, unsigned int order) | |||
59 | 59 | ||
60 | void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) | 60 | void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) |
61 | { | 61 | { |
62 | struct page_ext *page_ext; | 62 | struct page_ext *page_ext = lookup_page_ext(page); |
63 | struct stack_trace *trace; | 63 | struct stack_trace trace = { |
64 | 64 | .nr_entries = 0, | |
65 | page_ext = lookup_page_ext(page); | 65 | .max_entries = ARRAY_SIZE(page_ext->trace_entries), |
66 | .entries = &page_ext->trace_entries[0], | ||
67 | .skip = 3, | ||
68 | }; | ||
66 | 69 | ||
67 | trace = &page_ext->trace; | 70 | save_stack_trace(&trace); |
68 | trace->nr_entries = 0; | ||
69 | trace->max_entries = ARRAY_SIZE(page_ext->trace_entries); | ||
70 | trace->entries = &page_ext->trace_entries[0]; | ||
71 | trace->skip = 3; | ||
72 | save_stack_trace(&page_ext->trace); | ||
73 | 71 | ||
74 | page_ext->order = order; | 72 | page_ext->order = order; |
75 | page_ext->gfp_mask = gfp_mask; | 73 | page_ext->gfp_mask = gfp_mask; |
74 | page_ext->nr_entries = trace.nr_entries; | ||
76 | 75 | ||
77 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); | 76 | __set_bit(PAGE_EXT_OWNER, &page_ext->flags); |
78 | } | 77 | } |
@@ -84,6 +83,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
84 | int ret; | 83 | int ret; |
85 | int pageblock_mt, page_mt; | 84 | int pageblock_mt, page_mt; |
86 | char *kbuf; | 85 | char *kbuf; |
86 | struct stack_trace trace = { | ||
87 | .nr_entries = page_ext->nr_entries, | ||
88 | .entries = &page_ext->trace_entries[0], | ||
89 | }; | ||
87 | 90 | ||
88 | kbuf = kmalloc(count, GFP_KERNEL); | 91 | kbuf = kmalloc(count, GFP_KERNEL); |
89 | if (!kbuf) | 92 | if (!kbuf) |
@@ -121,8 +124,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, | |||
121 | if (ret >= count) | 124 | if (ret >= count) |
122 | goto err; | 125 | goto err; |
123 | 126 | ||
124 | ret += snprint_stack_trace(kbuf + ret, count - ret, | 127 | ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); |
125 | &page_ext->trace, 0); | ||
126 | if (ret >= count) | 128 | if (ret >= count) |
127 | goto err; | 129 | goto err; |
128 | 130 | ||
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index ad83195521f2..75c1f2878519 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -35,7 +35,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, | |||
35 | do { | 35 | do { |
36 | again: | 36 | again: |
37 | next = pmd_addr_end(addr, end); | 37 | next = pmd_addr_end(addr, end); |
38 | if (pmd_none(*pmd)) { | 38 | if (pmd_none(*pmd) || !walk->vma) { |
39 | if (walk->pte_hole) | 39 | if (walk->pte_hole) |
40 | err = walk->pte_hole(addr, next, walk); | 40 | err = walk->pte_hole(addr, next, walk); |
41 | if (err) | 41 | if (err) |
@@ -59,7 +59,7 @@ again: | |||
59 | continue; | 59 | continue; |
60 | 60 | ||
61 | split_huge_page_pmd_mm(walk->mm, addr, pmd); | 61 | split_huge_page_pmd_mm(walk->mm, addr, pmd); |
62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 62 | if (pmd_trans_unstable(pmd)) |
63 | goto again; | 63 | goto again; |
64 | err = walk_pte_range(pmd, addr, next, walk); | 64 | err = walk_pte_range(pmd, addr, next, walk); |
65 | if (err) | 65 | if (err) |
@@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
86 | break; | 86 | break; |
87 | continue; | 87 | continue; |
88 | } | 88 | } |
89 | if (walk->pud_entry) | 89 | if (walk->pmd_entry || walk->pte_entry) |
90 | err = walk->pud_entry(pud, addr, next, walk); | ||
91 | if (!err && (walk->pmd_entry || walk->pte_entry)) | ||
92 | err = walk_pmd_range(pud, addr, next, walk); | 90 | err = walk_pmd_range(pud, addr, next, walk); |
93 | if (err) | 91 | if (err) |
94 | break; | 92 | break; |
@@ -97,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
97 | return err; | 95 | return err; |
98 | } | 96 | } |
99 | 97 | ||
98 | static int walk_pgd_range(unsigned long addr, unsigned long end, | ||
99 | struct mm_walk *walk) | ||
100 | { | ||
101 | pgd_t *pgd; | ||
102 | unsigned long next; | ||
103 | int err = 0; | ||
104 | |||
105 | pgd = pgd_offset(walk->mm, addr); | ||
106 | do { | ||
107 | next = pgd_addr_end(addr, end); | ||
108 | if (pgd_none_or_clear_bad(pgd)) { | ||
109 | if (walk->pte_hole) | ||
110 | err = walk->pte_hole(addr, next, walk); | ||
111 | if (err) | ||
112 | break; | ||
113 | continue; | ||
114 | } | ||
115 | if (walk->pmd_entry || walk->pte_entry) | ||
116 | err = walk_pud_range(pgd, addr, next, walk); | ||
117 | if (err) | ||
118 | break; | ||
119 | } while (pgd++, addr = next, addr != end); | ||
120 | |||
121 | return err; | ||
122 | } | ||
123 | |||
100 | #ifdef CONFIG_HUGETLB_PAGE | 124 | #ifdef CONFIG_HUGETLB_PAGE |
101 | static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, | 125 | static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, |
102 | unsigned long end) | 126 | unsigned long end) |
@@ -105,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, | |||
105 | return boundary < end ? boundary : end; | 129 | return boundary < end ? boundary : end; |
106 | } | 130 | } |
107 | 131 | ||
108 | static int walk_hugetlb_range(struct vm_area_struct *vma, | 132 | static int walk_hugetlb_range(unsigned long addr, unsigned long end, |
109 | unsigned long addr, unsigned long end, | ||
110 | struct mm_walk *walk) | 133 | struct mm_walk *walk) |
111 | { | 134 | { |
135 | struct vm_area_struct *vma = walk->vma; | ||
112 | struct hstate *h = hstate_vma(vma); | 136 | struct hstate *h = hstate_vma(vma); |
113 | unsigned long next; | 137 | unsigned long next; |
114 | unsigned long hmask = huge_page_mask(h); | 138 | unsigned long hmask = huge_page_mask(h); |
@@ -121,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
121 | if (pte && walk->hugetlb_entry) | 145 | if (pte && walk->hugetlb_entry) |
122 | err = walk->hugetlb_entry(pte, hmask, addr, next, walk); | 146 | err = walk->hugetlb_entry(pte, hmask, addr, next, walk); |
123 | if (err) | 147 | if (err) |
124 | return err; | 148 | break; |
125 | } while (addr = next, addr != end); | 149 | } while (addr = next, addr != end); |
126 | 150 | ||
127 | return 0; | 151 | return err; |
128 | } | 152 | } |
129 | 153 | ||
130 | #else /* CONFIG_HUGETLB_PAGE */ | 154 | #else /* CONFIG_HUGETLB_PAGE */ |
131 | static int walk_hugetlb_range(struct vm_area_struct *vma, | 155 | static int walk_hugetlb_range(unsigned long addr, unsigned long end, |
132 | unsigned long addr, unsigned long end, | ||
133 | struct mm_walk *walk) | 156 | struct mm_walk *walk) |
134 | { | 157 | { |
135 | return 0; | 158 | return 0; |
@@ -137,112 +160,138 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
137 | 160 | ||
138 | #endif /* CONFIG_HUGETLB_PAGE */ | 161 | #endif /* CONFIG_HUGETLB_PAGE */ |
139 | 162 | ||
163 | /* | ||
164 | * Decide whether we really walk over the current vma on [@start, @end) | ||
165 | * or skip it via the returned value. Return 0 if we do walk over the | ||
166 | * current vma, and return 1 if we skip the vma. Negative values means | ||
167 | * error, where we abort the current walk. | ||
168 | */ | ||
169 | static int walk_page_test(unsigned long start, unsigned long end, | ||
170 | struct mm_walk *walk) | ||
171 | { | ||
172 | struct vm_area_struct *vma = walk->vma; | ||
173 | |||
174 | if (walk->test_walk) | ||
175 | return walk->test_walk(start, end, walk); | ||
176 | |||
177 | /* | ||
178 | * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP | ||
179 | * range, so we don't walk over it as we do for normal vmas. However, | ||
180 | * Some callers are interested in handling hole range and they don't | ||
181 | * want to just ignore any single address range. Such users certainly | ||
182 | * define their ->pte_hole() callbacks, so let's delegate them to handle | ||
183 | * vma(VM_PFNMAP). | ||
184 | */ | ||
185 | if (vma->vm_flags & VM_PFNMAP) { | ||
186 | int err = 1; | ||
187 | if (walk->pte_hole) | ||
188 | err = walk->pte_hole(start, end, walk); | ||
189 | return err ? err : 1; | ||
190 | } | ||
191 | return 0; | ||
192 | } | ||
193 | |||
194 | static int __walk_page_range(unsigned long start, unsigned long end, | ||
195 | struct mm_walk *walk) | ||
196 | { | ||
197 | int err = 0; | ||
198 | struct vm_area_struct *vma = walk->vma; | ||
199 | |||
200 | if (vma && is_vm_hugetlb_page(vma)) { | ||
201 | if (walk->hugetlb_entry) | ||
202 | err = walk_hugetlb_range(start, end, walk); | ||
203 | } else | ||
204 | err = walk_pgd_range(start, end, walk); | ||
140 | 205 | ||
206 | return err; | ||
207 | } | ||
141 | 208 | ||
142 | /** | 209 | /** |
143 | * walk_page_range - walk a memory map's page tables with a callback | 210 | * walk_page_range - walk page table with caller specific callbacks |
144 | * @addr: starting address | ||
145 | * @end: ending address | ||
146 | * @walk: set of callbacks to invoke for each level of the tree | ||
147 | * | 211 | * |
148 | * Recursively walk the page table for the memory area in a VMA, | 212 | * Recursively walk the page table tree of the process represented by @walk->mm |
149 | * calling supplied callbacks. Callbacks are called in-order (first | 213 | * within the virtual address range [@start, @end). During walking, we can do |
150 | * PGD, first PUD, first PMD, first PTE, second PTE... second PMD, | 214 | * some caller-specific works for each entry, by setting up pmd_entry(), |
151 | * etc.). If lower-level callbacks are omitted, walking depth is reduced. | 215 | * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these |
216 | * callbacks, the associated entries/pages are just ignored. | ||
217 | * The return values of these callbacks are commonly defined like below: | ||
218 | * - 0 : succeeded to handle the current entry, and if you don't reach the | ||
219 | * end address yet, continue to walk. | ||
220 | * - >0 : succeeded to handle the current entry, and return to the caller | ||
221 | * with caller specific value. | ||
222 | * - <0 : failed to handle the current entry, and return to the caller | ||
223 | * with error code. | ||
152 | * | 224 | * |
153 | * Each callback receives an entry pointer and the start and end of the | 225 | * Before starting to walk page table, some callers want to check whether |
154 | * associated range, and a copy of the original mm_walk for access to | 226 | * they really want to walk over the current vma, typically by checking |
155 | * the ->private or ->mm fields. | 227 | * its vm_flags. walk_page_test() and @walk->test_walk() are used for this |
228 | * purpose. | ||
156 | * | 229 | * |
157 | * Usually no locks are taken, but splitting transparent huge page may | 230 | * struct mm_walk keeps current values of some common data like vma and pmd, |
158 | * take page table lock. And the bottom level iterator will map PTE | 231 | * which are useful for the access from callbacks. If you want to pass some |
159 | * directories from highmem if necessary. | 232 | * caller-specific data to callbacks, @walk->private should be helpful. |
160 | * | 233 | * |
161 | * If any callback returns a non-zero value, the walk is aborted and | 234 | * Locking: |
162 | * the return value is propagated back to the caller. Otherwise 0 is returned. | 235 | * Callers of walk_page_range() and walk_page_vma() should hold |
163 | * | 236 | * @walk->mm->mmap_sem, because these function traverse vma list and/or |
164 | * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry | 237 | * access to vma's data. |
165 | * is !NULL. | ||
166 | */ | 238 | */ |
167 | int walk_page_range(unsigned long addr, unsigned long end, | 239 | int walk_page_range(unsigned long start, unsigned long end, |
168 | struct mm_walk *walk) | 240 | struct mm_walk *walk) |
169 | { | 241 | { |
170 | pgd_t *pgd; | ||
171 | unsigned long next; | ||
172 | int err = 0; | 242 | int err = 0; |
243 | unsigned long next; | ||
244 | struct vm_area_struct *vma; | ||
173 | 245 | ||
174 | if (addr >= end) | 246 | if (start >= end) |
175 | return err; | 247 | return -EINVAL; |
176 | 248 | ||
177 | if (!walk->mm) | 249 | if (!walk->mm) |
178 | return -EINVAL; | 250 | return -EINVAL; |
179 | 251 | ||
180 | VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); | 252 | VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); |
181 | 253 | ||
182 | pgd = pgd_offset(walk->mm, addr); | 254 | vma = find_vma(walk->mm, start); |
183 | do { | 255 | do { |
184 | struct vm_area_struct *vma = NULL; | 256 | if (!vma) { /* after the last vma */ |
185 | 257 | walk->vma = NULL; | |
186 | next = pgd_addr_end(addr, end); | 258 | next = end; |
259 | } else if (start < vma->vm_start) { /* outside vma */ | ||
260 | walk->vma = NULL; | ||
261 | next = min(end, vma->vm_start); | ||
262 | } else { /* inside vma */ | ||
263 | walk->vma = vma; | ||
264 | next = min(end, vma->vm_end); | ||
265 | vma = vma->vm_next; | ||
187 | 266 | ||
188 | /* | 267 | err = walk_page_test(start, next, walk); |
189 | * This function was not intended to be vma based. | 268 | if (err > 0) |
190 | * But there are vma special cases to be handled: | ||
191 | * - hugetlb vma's | ||
192 | * - VM_PFNMAP vma's | ||
193 | */ | ||
194 | vma = find_vma(walk->mm, addr); | ||
195 | if (vma) { | ||
196 | /* | ||
197 | * There are no page structures backing a VM_PFNMAP | ||
198 | * range, so do not allow split_huge_page_pmd(). | ||
199 | */ | ||
200 | if ((vma->vm_start <= addr) && | ||
201 | (vma->vm_flags & VM_PFNMAP)) { | ||
202 | next = vma->vm_end; | ||
203 | pgd = pgd_offset(walk->mm, next); | ||
204 | continue; | ||
205 | } | ||
206 | /* | ||
207 | * Handle hugetlb vma individually because pagetable | ||
208 | * walk for the hugetlb page is dependent on the | ||
209 | * architecture and we can't handled it in the same | ||
210 | * manner as non-huge pages. | ||
211 | */ | ||
212 | if (walk->hugetlb_entry && (vma->vm_start <= addr) && | ||
213 | is_vm_hugetlb_page(vma)) { | ||
214 | if (vma->vm_end < next) | ||
215 | next = vma->vm_end; | ||
216 | /* | ||
217 | * Hugepage is very tightly coupled with vma, | ||
218 | * so walk through hugetlb entries within a | ||
219 | * given vma. | ||
220 | */ | ||
221 | err = walk_hugetlb_range(vma, addr, next, walk); | ||
222 | if (err) | ||
223 | break; | ||
224 | pgd = pgd_offset(walk->mm, next); | ||
225 | continue; | 269 | continue; |
226 | } | 270 | if (err < 0) |
227 | } | ||
228 | |||
229 | if (pgd_none_or_clear_bad(pgd)) { | ||
230 | if (walk->pte_hole) | ||
231 | err = walk->pte_hole(addr, next, walk); | ||
232 | if (err) | ||
233 | break; | 271 | break; |
234 | pgd++; | ||
235 | continue; | ||
236 | } | 272 | } |
237 | if (walk->pgd_entry) | 273 | if (walk->vma || walk->pte_hole) |
238 | err = walk->pgd_entry(pgd, addr, next, walk); | 274 | err = __walk_page_range(start, next, walk); |
239 | if (!err && | ||
240 | (walk->pud_entry || walk->pmd_entry || walk->pte_entry)) | ||
241 | err = walk_pud_range(pgd, addr, next, walk); | ||
242 | if (err) | 275 | if (err) |
243 | break; | 276 | break; |
244 | pgd++; | 277 | } while (start = next, start < end); |
245 | } while (addr = next, addr < end); | ||
246 | |||
247 | return err; | 278 | return err; |
248 | } | 279 | } |
280 | |||
281 | int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk) | ||
282 | { | ||
283 | int err; | ||
284 | |||
285 | if (!walk->mm) | ||
286 | return -EINVAL; | ||
287 | |||
288 | VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); | ||
289 | VM_BUG_ON(!vma); | ||
290 | walk->vma = vma; | ||
291 | err = walk_page_test(vma->vm_start, vma->vm_end, walk); | ||
292 | if (err > 0) | ||
293 | return 0; | ||
294 | if (err < 0) | ||
295 | return err; | ||
296 | return __walk_page_range(vma->vm_start, vma->vm_end, walk); | ||
297 | } | ||
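The rewritten pagewalk is vma-aware: walk_page_range() now finds vmas itself, walk_page_test()/->test_walk() decide per vma whether to descend, pte_hole() covers the gaps and VM_PFNMAP ranges, and walk_page_vma() handles a single vma. A minimal caller sketch under the documented locking rule; the callback and counter names are made up for illustration:

static int count_present_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *nr = walk->private;

	if (pte_present(*pte))
		(*nr)++;
	return 0;			/* 0: keep walking */
}

static unsigned long count_present(struct mm_struct *mm,
				   unsigned long start, unsigned long end)
{
	unsigned long nr = 0;
	struct mm_walk walk = {
		.pte_entry	= count_present_pte,
		.mm		= mm,
		.private	= &nr,
	};

	down_read(&mm->mmap_sem);	/* walk_page_range() asserts this is held */
	walk_page_range(start, end, &walk);
	up_read(&mm->mmap_sem);
	return nr;
}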
diff --git a/mm/percpu.c b/mm/percpu.c index d39e2f4e335c..73c97a5f4495 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1528,7 +1528,6 @@ static void pcpu_dump_alloc_info(const char *lvl, | |||
1528 | int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | 1528 | int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, |
1529 | void *base_addr) | 1529 | void *base_addr) |
1530 | { | 1530 | { |
1531 | static char cpus_buf[4096] __initdata; | ||
1532 | static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; | 1531 | static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; |
1533 | static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; | 1532 | static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; |
1534 | size_t dyn_size = ai->dyn_size; | 1533 | size_t dyn_size = ai->dyn_size; |
@@ -1541,12 +1540,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1541 | int *unit_map; | 1540 | int *unit_map; |
1542 | int group, unit, i; | 1541 | int group, unit, i; |
1543 | 1542 | ||
1544 | cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask); | ||
1545 | |||
1546 | #define PCPU_SETUP_BUG_ON(cond) do { \ | 1543 | #define PCPU_SETUP_BUG_ON(cond) do { \ |
1547 | if (unlikely(cond)) { \ | 1544 | if (unlikely(cond)) { \ |
1548 | pr_emerg("PERCPU: failed to initialize, %s", #cond); \ | 1545 | pr_emerg("PERCPU: failed to initialize, %s", #cond); \ |
1549 | pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ | 1546 | pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \ |
1547 | cpumask_pr_args(cpu_possible_mask)); \ | ||
1550 | pcpu_dump_alloc_info(KERN_EMERG, ai); \ | 1548 | pcpu_dump_alloc_info(KERN_EMERG, ai); \ |
1551 | BUG(); \ | 1549 | BUG(); \ |
1552 | } \ | 1550 | } \ |
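The static cpus_buf plus cpumask_scnprintf() pair goes away in favour of the %*pb printk extension; a short sketch of the format and its %*pbl range-list variant:

	pr_info("possible cpus: %*pb\n", cpumask_pr_args(cpu_possible_mask));	/* hex bitmap */
	pr_info("possible cpus: %*pbl\n", cpumask_pr_args(cpu_possible_mask));	/* list form, e.g. 0-7 */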
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index dfb79e028ecb..c25f94b33811 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, | |||
193 | pmd_t *pmdp) | 193 | pmd_t *pmdp) |
194 | { | 194 | { |
195 | pmd_t entry = *pmdp; | 195 | pmd_t entry = *pmdp; |
196 | if (pmd_numa(entry)) | ||
197 | entry = pmd_mknonnuma(entry); | ||
198 | set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); | 196 | set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); |
199 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | 197 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); |
200 | } | 198 | } |
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 5077afcd9e11..b1597690530c 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c | |||
@@ -99,11 +99,8 @@ static int process_vm_rw_single_vec(unsigned long addr, | |||
99 | size_t bytes; | 99 | size_t bytes; |
100 | 100 | ||
101 | /* Get the pages we're interested in */ | 101 | /* Get the pages we're interested in */ |
102 | down_read(&mm->mmap_sem); | 102 | pages = get_user_pages_unlocked(task, mm, pa, pages, |
103 | pages = get_user_pages(task, mm, pa, pages, | 103 | vm_write, 0, process_pages); |
104 | vm_write, 0, process_pages, NULL); | ||
105 | up_read(&mm->mmap_sem); | ||
106 | |||
107 | if (pages <= 0) | 104 | if (pages <= 0) |
108 | return -EFAULT; | 105 | return -EFAULT; |
109 | 106 | ||
diff --git a/mm/readahead.c b/mm/readahead.c index 17b9172ec37f..935675844b2e 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -27,7 +27,7 @@ | |||
27 | void | 27 | void |
28 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) | 28 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) |
29 | { | 29 | { |
30 | ra->ra_pages = mapping->backing_dev_info->ra_pages; | 30 | ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages; |
31 | ra->prev_pos = -1; | 31 | ra->prev_pos = -1; |
32 | } | 32 | } |
33 | EXPORT_SYMBOL_GPL(file_ra_state_init); | 33 | EXPORT_SYMBOL_GPL(file_ra_state_init); |
@@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping, | |||
541 | /* | 541 | /* |
542 | * Defer asynchronous read-ahead on IO congestion. | 542 | * Defer asynchronous read-ahead on IO congestion. |
543 | */ | 543 | */ |
544 | if (bdi_read_congested(mapping->backing_dev_info)) | 544 | if (bdi_read_congested(inode_to_bdi(mapping->host))) |
545 | return; | 545 | return; |
546 | 546 | ||
547 | /* do read-ahead */ | 547 | /* do read-ahead */ |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -72,6 +72,8 @@ static inline struct anon_vma *anon_vma_alloc(void) | |||
72 | anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); | 72 | anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); |
73 | if (anon_vma) { | 73 | if (anon_vma) { |
74 | atomic_set(&anon_vma->refcount, 1); | 74 | atomic_set(&anon_vma->refcount, 1); |
75 | anon_vma->degree = 1; /* Reference for first vma */ | ||
76 | anon_vma->parent = anon_vma; | ||
75 | /* | 77 | /* |
76 | * Initialise the anon_vma root to point to itself. If called | 78 | * Initialise the anon_vma root to point to itself. If called |
77 | * from fork, the root will be reset to the parents anon_vma. | 79 | * from fork, the root will be reset to the parents anon_vma. |
@@ -188,6 +190,8 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
188 | if (likely(!vma->anon_vma)) { | 190 | if (likely(!vma->anon_vma)) { |
189 | vma->anon_vma = anon_vma; | 191 | vma->anon_vma = anon_vma; |
190 | anon_vma_chain_link(vma, avc, anon_vma); | 192 | anon_vma_chain_link(vma, avc, anon_vma); |
193 | /* vma reference or self-parent link for new root */ | ||
194 | anon_vma->degree++; | ||
191 | allocated = NULL; | 195 | allocated = NULL; |
192 | avc = NULL; | 196 | avc = NULL; |
193 | } | 197 | } |
@@ -236,6 +240,14 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) | |||
236 | /* | 240 | /* |
237 | * Attach the anon_vmas from src to dst. | 241 | * Attach the anon_vmas from src to dst. |
238 | * Returns 0 on success, -ENOMEM on failure. | 242 | * Returns 0 on success, -ENOMEM on failure. |
243 | * | ||
244 | * If dst->anon_vma is NULL this function tries to find and reuse existing | ||
245 | * anon_vma which has no vmas and only one child anon_vma. This prevents | ||
246 | * degradation of anon_vma hierarchy to endless linear chain in case of | ||
247 | * constantly forking task. On the other hand, an anon_vma with more than one | ||
248 | * child isn't reused even if there was no alive vma, thus rmap walker has a | ||
249 | * good chance of avoiding scanning the whole hierarchy when it searches where | ||
250 | * page is mapped. | ||
239 | */ | 251 | */ |
240 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | 252 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) |
241 | { | 253 | { |
@@ -256,7 +268,21 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | |||
256 | anon_vma = pavc->anon_vma; | 268 | anon_vma = pavc->anon_vma; |
257 | root = lock_anon_vma_root(root, anon_vma); | 269 | root = lock_anon_vma_root(root, anon_vma); |
258 | anon_vma_chain_link(dst, avc, anon_vma); | 270 | anon_vma_chain_link(dst, avc, anon_vma); |
271 | |||
272 | /* | ||
273 | * Reuse existing anon_vma if its degree lower than two, | ||
274 | * that means it has no vma and only one anon_vma child. | ||
275 | * | ||
276 | * Do not chose parent anon_vma, otherwise first child | ||
277 | * will always reuse it. Root anon_vma is never reused: | ||
278 | * it has self-parent reference and at least one child. | ||
279 | */ | ||
280 | if (!dst->anon_vma && anon_vma != src->anon_vma && | ||
281 | anon_vma->degree < 2) | ||
282 | dst->anon_vma = anon_vma; | ||
259 | } | 283 | } |
284 | if (dst->anon_vma) | ||
285 | dst->anon_vma->degree++; | ||
260 | unlock_anon_vma_root(root); | 286 | unlock_anon_vma_root(root); |
261 | return 0; | 287 | return 0; |
262 | 288 | ||
@@ -280,6 +306,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
280 | if (!pvma->anon_vma) | 306 | if (!pvma->anon_vma) |
281 | return 0; | 307 | return 0; |
282 | 308 | ||
309 | /* Drop inherited anon_vma, we'll reuse existing or allocate new. */ | ||
310 | vma->anon_vma = NULL; | ||
311 | |||
283 | /* | 312 | /* |
284 | * First, attach the new VMA to the parent VMA's anon_vmas, | 313 | * First, attach the new VMA to the parent VMA's anon_vmas, |
285 | * so rmap can find non-COWed pages in child processes. | 314 | * so rmap can find non-COWed pages in child processes. |
@@ -288,6 +317,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
288 | if (error) | 317 | if (error) |
289 | return error; | 318 | return error; |
290 | 319 | ||
320 | /* An existing anon_vma has been reused, all done then. */ | ||
321 | if (vma->anon_vma) | ||
322 | return 0; | ||
323 | |||
291 | /* Then add our own anon_vma. */ | 324 | /* Then add our own anon_vma. */ |
292 | anon_vma = anon_vma_alloc(); | 325 | anon_vma = anon_vma_alloc(); |
293 | if (!anon_vma) | 326 | if (!anon_vma) |
@@ -301,6 +334,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
301 | * lock any of the anon_vmas in this anon_vma tree. | 334 | * lock any of the anon_vmas in this anon_vma tree. |
302 | */ | 335 | */ |
303 | anon_vma->root = pvma->anon_vma->root; | 336 | anon_vma->root = pvma->anon_vma->root; |
337 | anon_vma->parent = pvma->anon_vma; | ||
304 | /* | 338 | /* |
305 | * With refcounts, an anon_vma can stay around longer than the | 339 | * With refcounts, an anon_vma can stay around longer than the |
306 | * process it belongs to. The root anon_vma needs to be pinned until | 340 | * process it belongs to. The root anon_vma needs to be pinned until |
@@ -311,6 +345,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
311 | vma->anon_vma = anon_vma; | 345 | vma->anon_vma = anon_vma; |
312 | anon_vma_lock_write(anon_vma); | 346 | anon_vma_lock_write(anon_vma); |
313 | anon_vma_chain_link(vma, avc, anon_vma); | 347 | anon_vma_chain_link(vma, avc, anon_vma); |
348 | anon_vma->parent->degree++; | ||
314 | anon_vma_unlock_write(anon_vma); | 349 | anon_vma_unlock_write(anon_vma); |
315 | 350 | ||
316 | return 0; | 351 | return 0; |
@@ -341,12 +376,16 @@ void unlink_anon_vmas(struct vm_area_struct *vma) | |||
341 | * Leave empty anon_vmas on the list - we'll need | 376 | * Leave empty anon_vmas on the list - we'll need |
342 | * to free them outside the lock. | 377 | * to free them outside the lock. |
343 | */ | 378 | */ |
344 | if (RB_EMPTY_ROOT(&anon_vma->rb_root)) | 379 | if (RB_EMPTY_ROOT(&anon_vma->rb_root)) { |
380 | anon_vma->parent->degree--; | ||
345 | continue; | 381 | continue; |
382 | } | ||
346 | 383 | ||
347 | list_del(&avc->same_vma); | 384 | list_del(&avc->same_vma); |
348 | anon_vma_chain_free(avc); | 385 | anon_vma_chain_free(avc); |
349 | } | 386 | } |
387 | if (vma->anon_vma) | ||
388 | vma->anon_vma->degree--; | ||
350 | unlock_anon_vma_root(root); | 389 | unlock_anon_vma_root(root); |
351 | 390 | ||
352 | /* | 391 | /* |
@@ -357,6 +396,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) | |||
357 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | 396 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { |
358 | struct anon_vma *anon_vma = avc->anon_vma; | 397 | struct anon_vma *anon_vma = avc->anon_vma; |
359 | 398 | ||
399 | BUG_ON(anon_vma->degree); | ||
360 | put_anon_vma(anon_vma); | 400 | put_anon_vma(anon_vma); |
361 | 401 | ||
362 | list_del(&avc->same_vma); | 402 | list_del(&avc->same_vma); |
@@ -550,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | |||
550 | if (!vma->anon_vma || !page__anon_vma || | 590 | if (!vma->anon_vma || !page__anon_vma || |
551 | vma->anon_vma->root != page__anon_vma->root) | 591 | vma->anon_vma->root != page__anon_vma->root) |
552 | return -EFAULT; | 592 | return -EFAULT; |
553 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | 593 | } else if (page->mapping) { |
554 | if (!vma->vm_file || | 594 | if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) |
555 | vma->vm_file->f_mapping != page->mapping) | ||
556 | return -EFAULT; | 595 | return -EFAULT; |
557 | } else | 596 | } else |
558 | return -EFAULT; | 597 | return -EFAULT; |
@@ -1046,24 +1085,20 @@ void page_add_new_anon_rmap(struct page *page, | |||
1046 | void page_add_file_rmap(struct page *page) | 1085 | void page_add_file_rmap(struct page *page) |
1047 | { | 1086 | { |
1048 | struct mem_cgroup *memcg; | 1087 | struct mem_cgroup *memcg; |
1049 | unsigned long flags; | ||
1050 | bool locked; | ||
1051 | 1088 | ||
1052 | memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); | 1089 | memcg = mem_cgroup_begin_page_stat(page); |
1053 | if (atomic_inc_and_test(&page->_mapcount)) { | 1090 | if (atomic_inc_and_test(&page->_mapcount)) { |
1054 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1091 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
1055 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); | 1092 | mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); |
1056 | } | 1093 | } |
1057 | mem_cgroup_end_page_stat(memcg, &locked, &flags); | 1094 | mem_cgroup_end_page_stat(memcg); |
1058 | } | 1095 | } |
1059 | 1096 | ||
1060 | static void page_remove_file_rmap(struct page *page) | 1097 | static void page_remove_file_rmap(struct page *page) |
1061 | { | 1098 | { |
1062 | struct mem_cgroup *memcg; | 1099 | struct mem_cgroup *memcg; |
1063 | unsigned long flags; | ||
1064 | bool locked; | ||
1065 | 1100 | ||
1066 | memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); | 1101 | memcg = mem_cgroup_begin_page_stat(page); |
1067 | 1102 | ||
1068 | /* page still mapped by someone else? */ | 1103 | /* page still mapped by someone else? */ |
1069 | if (!atomic_add_negative(-1, &page->_mapcount)) | 1104 | if (!atomic_add_negative(-1, &page->_mapcount)) |
@@ -1084,7 +1119,7 @@ static void page_remove_file_rmap(struct page *page) | |||
1084 | if (unlikely(PageMlocked(page))) | 1119 | if (unlikely(PageMlocked(page))) |
1085 | clear_page_mlock(page); | 1120 | clear_page_mlock(page); |
1086 | out: | 1121 | out: |
1087 | mem_cgroup_end_page_stat(memcg, &locked, &flags); | 1122 | mem_cgroup_end_page_stat(memcg); |
1088 | } | 1123 | } |
1089 | 1124 | ||
1090 | /** | 1125 | /** |
@@ -1234,7 +1269,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1234 | if (pte_soft_dirty(pteval)) | 1269 | if (pte_soft_dirty(pteval)) |
1235 | swp_pte = pte_swp_mksoft_dirty(swp_pte); | 1270 | swp_pte = pte_swp_mksoft_dirty(swp_pte); |
1236 | set_pte_at(mm, address, pte, swp_pte); | 1271 | set_pte_at(mm, address, pte, swp_pte); |
1237 | BUG_ON(pte_file(*pte)); | ||
1238 | } else if (IS_ENABLED(CONFIG_MIGRATION) && | 1272 | } else if (IS_ENABLED(CONFIG_MIGRATION) && |
1239 | (flags & TTU_MIGRATION)) { | 1273 | (flags & TTU_MIGRATION)) { |
1240 | /* Establish migration entry for a file page */ | 1274 | /* Establish migration entry for a file page */ |
@@ -1276,211 +1310,6 @@ out_mlock: | |||
1276 | return ret; | 1310 | return ret; |
1277 | } | 1311 | } |
1278 | 1312 | ||
1279 | /* | ||
1280 | * objrmap doesn't work for nonlinear VMAs because the assumption that | ||
1281 | * offset-into-file correlates with offset-into-virtual-addresses does not hold. | ||
1282 | * Consequently, given a particular page and its ->index, we cannot locate the | ||
1283 | * ptes which are mapping that page without an exhaustive linear search. | ||
1284 | * | ||
1285 | * So what this code does is a mini "virtual scan" of each nonlinear VMA which | ||
1286 | * maps the file to which the target page belongs. The ->vm_private_data field | ||
1287 | * holds the current cursor into that scan. Successive searches will circulate | ||
1288 | * around the vma's virtual address space. | ||
1289 | * | ||
1290 | * So as more replacement pressure is applied to the pages in a nonlinear VMA, | ||
1291 | * more scanning pressure is placed against them as well. Eventually pages | ||
1292 | * will become fully unmapped and are eligible for eviction. | ||
1293 | * | ||
1294 | * For very sparsely populated VMAs this is a little inefficient - chances are | ||
1295 | * there there won't be many ptes located within the scan cluster. In this case | ||
1296 | * maybe we could scan further - to the end of the pte page, perhaps. | ||
1297 | * | ||
1298 | * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can | ||
1299 | * acquire it without blocking. If vma locked, mlock the pages in the cluster, | ||
1300 | * rather than unmapping them. If we encounter the "check_page" that vmscan is | ||
1301 | * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. | ||
1302 | */ | ||
1303 | #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) | ||
1304 | #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) | ||
1305 | |||
1306 | static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | ||
1307 | struct vm_area_struct *vma, struct page *check_page) | ||
1308 | { | ||
1309 | struct mm_struct *mm = vma->vm_mm; | ||
1310 | pmd_t *pmd; | ||
1311 | pte_t *pte; | ||
1312 | pte_t pteval; | ||
1313 | spinlock_t *ptl; | ||
1314 | struct page *page; | ||
1315 | unsigned long address; | ||
1316 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
1317 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
1318 | unsigned long end; | ||
1319 | int ret = SWAP_AGAIN; | ||
1320 | int locked_vma = 0; | ||
1321 | |||
1322 | address = (vma->vm_start + cursor) & CLUSTER_MASK; | ||
1323 | end = address + CLUSTER_SIZE; | ||
1324 | if (address < vma->vm_start) | ||
1325 | address = vma->vm_start; | ||
1326 | if (end > vma->vm_end) | ||
1327 | end = vma->vm_end; | ||
1328 | |||
1329 | pmd = mm_find_pmd(mm, address); | ||
1330 | if (!pmd) | ||
1331 | return ret; | ||
1332 | |||
1333 | mmun_start = address; | ||
1334 | mmun_end = end; | ||
1335 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1336 | |||
1337 | /* | ||
1338 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, | ||
1339 | * keep the sem while scanning the cluster for mlocking pages. | ||
1340 | */ | ||
1341 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
1342 | locked_vma = (vma->vm_flags & VM_LOCKED); | ||
1343 | if (!locked_vma) | ||
1344 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ | ||
1345 | } | ||
1346 | |||
1347 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
1348 | |||
1349 | /* Update high watermark before we lower rss */ | ||
1350 | update_hiwater_rss(mm); | ||
1351 | |||
1352 | for (; address < end; pte++, address += PAGE_SIZE) { | ||
1353 | if (!pte_present(*pte)) | ||
1354 | continue; | ||
1355 | page = vm_normal_page(vma, address, *pte); | ||
1356 | BUG_ON(!page || PageAnon(page)); | ||
1357 | |||
1358 | if (locked_vma) { | ||
1359 | if (page == check_page) { | ||
1360 | /* we know we have check_page locked */ | ||
1361 | mlock_vma_page(page); | ||
1362 | ret = SWAP_MLOCK; | ||
1363 | } else if (trylock_page(page)) { | ||
1364 | /* | ||
1365 | * If we can lock the page, perform mlock. | ||
1366 | * Otherwise leave the page alone, it will be | ||
1367 | * eventually encountered again later. | ||
1368 | */ | ||
1369 | mlock_vma_page(page); | ||
1370 | unlock_page(page); | ||
1371 | } | ||
1372 | continue; /* don't unmap */ | ||
1373 | } | ||
1374 | |||
1375 | /* | ||
1376 | * No need for _notify because we're within an | ||
1377 | * mmu_notifier_invalidate_range_ {start|end} scope. | ||
1378 | */ | ||
1379 | if (ptep_clear_flush_young(vma, address, pte)) | ||
1380 | continue; | ||
1381 | |||
1382 | /* Nuke the page table entry. */ | ||
1383 | flush_cache_page(vma, address, pte_pfn(*pte)); | ||
1384 | pteval = ptep_clear_flush_notify(vma, address, pte); | ||
1385 | |||
1386 | /* If nonlinear, store the file page offset in the pte. */ | ||
1387 | if (page->index != linear_page_index(vma, address)) { | ||
1388 | pte_t ptfile = pgoff_to_pte(page->index); | ||
1389 | if (pte_soft_dirty(pteval)) | ||
1390 | ptfile = pte_file_mksoft_dirty(ptfile); | ||
1391 | set_pte_at(mm, address, pte, ptfile); | ||
1392 | } | ||
1393 | |||
1394 | /* Move the dirty bit to the physical page now the pte is gone. */ | ||
1395 | if (pte_dirty(pteval)) | ||
1396 | set_page_dirty(page); | ||
1397 | |||
1398 | page_remove_rmap(page); | ||
1399 | page_cache_release(page); | ||
1400 | dec_mm_counter(mm, MM_FILEPAGES); | ||
1401 | (*mapcount)--; | ||
1402 | } | ||
1403 | pte_unmap_unlock(pte - 1, ptl); | ||
1404 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1405 | if (locked_vma) | ||
1406 | up_read(&vma->vm_mm->mmap_sem); | ||
1407 | return ret; | ||
1408 | } | ||
1409 | |||
1410 | static int try_to_unmap_nonlinear(struct page *page, | ||
1411 | struct address_space *mapping, void *arg) | ||
1412 | { | ||
1413 | struct vm_area_struct *vma; | ||
1414 | int ret = SWAP_AGAIN; | ||
1415 | unsigned long cursor; | ||
1416 | unsigned long max_nl_cursor = 0; | ||
1417 | unsigned long max_nl_size = 0; | ||
1418 | unsigned int mapcount; | ||
1419 | |||
1420 | list_for_each_entry(vma, | ||
1421 | &mapping->i_mmap_nonlinear, shared.nonlinear) { | ||
1422 | |||
1423 | cursor = (unsigned long) vma->vm_private_data; | ||
1424 | if (cursor > max_nl_cursor) | ||
1425 | max_nl_cursor = cursor; | ||
1426 | cursor = vma->vm_end - vma->vm_start; | ||
1427 | if (cursor > max_nl_size) | ||
1428 | max_nl_size = cursor; | ||
1429 | } | ||
1430 | |||
1431 | if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ | ||
1432 | return SWAP_FAIL; | ||
1433 | } | ||
1434 | |||
1435 | /* | ||
1436 | * We don't try to search for this page in the nonlinear vmas, | ||
1437 | * and page_referenced wouldn't have found it anyway. Instead | ||
1438 | * just walk the nonlinear vmas trying to age and unmap some. | ||
1439 | * The mapcount of the page we came in with is irrelevant, | ||
1440 | * but even so use it as a guide to how hard we should try? | ||
1441 | */ | ||
1442 | mapcount = page_mapcount(page); | ||
1443 | if (!mapcount) | ||
1444 | return ret; | ||
1445 | |||
1446 | cond_resched(); | ||
1447 | |||
1448 | max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; | ||
1449 | if (max_nl_cursor == 0) | ||
1450 | max_nl_cursor = CLUSTER_SIZE; | ||
1451 | |||
1452 | do { | ||
1453 | list_for_each_entry(vma, | ||
1454 | &mapping->i_mmap_nonlinear, shared.nonlinear) { | ||
1455 | |||
1456 | cursor = (unsigned long) vma->vm_private_data; | ||
1457 | while (cursor < max_nl_cursor && | ||
1458 | cursor < vma->vm_end - vma->vm_start) { | ||
1459 | if (try_to_unmap_cluster(cursor, &mapcount, | ||
1460 | vma, page) == SWAP_MLOCK) | ||
1461 | ret = SWAP_MLOCK; | ||
1462 | cursor += CLUSTER_SIZE; | ||
1463 | vma->vm_private_data = (void *) cursor; | ||
1464 | if ((int)mapcount <= 0) | ||
1465 | return ret; | ||
1466 | } | ||
1467 | vma->vm_private_data = (void *) max_nl_cursor; | ||
1468 | } | ||
1469 | cond_resched(); | ||
1470 | max_nl_cursor += CLUSTER_SIZE; | ||
1471 | } while (max_nl_cursor <= max_nl_size); | ||
1472 | |||
1473 | /* | ||
1474 | * Don't loop forever (perhaps all the remaining pages are | ||
1475 | * in locked vmas). Reset cursor on all unreserved nonlinear | ||
1476 | * vmas, now forgetting on which ones it had fallen behind. | ||
1477 | */ | ||
1478 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear) | ||
1479 | vma->vm_private_data = NULL; | ||
1480 | |||
1481 | return ret; | ||
1482 | } | ||
1483 | |||
1484 | bool is_vma_temporary_stack(struct vm_area_struct *vma) | 1313 | bool is_vma_temporary_stack(struct vm_area_struct *vma) |
1485 | { | 1314 | { |
1486 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); | 1315 | int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); |
@@ -1526,7 +1355,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1526 | .rmap_one = try_to_unmap_one, | 1355 | .rmap_one = try_to_unmap_one, |
1527 | .arg = (void *)flags, | 1356 | .arg = (void *)flags, |
1528 | .done = page_not_mapped, | 1357 | .done = page_not_mapped, |
1529 | .file_nonlinear = try_to_unmap_nonlinear, | ||
1530 | .anon_lock = page_lock_anon_vma_read, | 1358 | .anon_lock = page_lock_anon_vma_read, |
1531 | }; | 1359 | }; |
1532 | 1360 | ||
@@ -1572,12 +1400,6 @@ int try_to_munlock(struct page *page) | |||
1572 | .rmap_one = try_to_unmap_one, | 1400 | .rmap_one = try_to_unmap_one, |
1573 | .arg = (void *)TTU_MUNLOCK, | 1401 | .arg = (void *)TTU_MUNLOCK, |
1574 | .done = page_not_mapped, | 1402 | .done = page_not_mapped, |
1575 | /* | ||
1576 | * We don't bother to try to find the munlocked page in | ||
1577 | * nonlinears. It's costly. Instead, later, page reclaim logic | ||
1578 | * may call try_to_unmap() and recover PG_mlocked lazily. | ||
1579 | */ | ||
1580 | .file_nonlinear = NULL, | ||
1581 | .anon_lock = page_lock_anon_vma_read, | 1403 | .anon_lock = page_lock_anon_vma_read, |
1582 | 1404 | ||
1583 | }; | 1405 | }; |
@@ -1708,13 +1530,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | |||
1708 | goto done; | 1530 | goto done; |
1709 | } | 1531 | } |
1710 | 1532 | ||
1711 | if (!rwc->file_nonlinear) | ||
1712 | goto done; | ||
1713 | |||
1714 | if (list_empty(&mapping->i_mmap_nonlinear)) | ||
1715 | goto done; | ||
1716 | |||
1717 | ret = rwc->file_nonlinear(page, mapping, rwc->arg); | ||
1718 | done: | 1533 | done: |
1719 | i_mmap_unlock_read(mapping); | 1534 | i_mmap_unlock_read(mapping); |
1720 | return ret; | 1535 | return ret; |
diff --git a/mm/shmem.c b/mm/shmem.c index 73ba1df7c8ba..a63031fa3e0c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -191,11 +191,6 @@ static const struct inode_operations shmem_dir_inode_operations; | |||
191 | static const struct inode_operations shmem_special_inode_operations; | 191 | static const struct inode_operations shmem_special_inode_operations; |
192 | static const struct vm_operations_struct shmem_vm_ops; | 192 | static const struct vm_operations_struct shmem_vm_ops; |
193 | 193 | ||
194 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | ||
195 | .ra_pages = 0, /* No readahead */ | ||
196 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | ||
197 | }; | ||
198 | |||
199 | static LIST_HEAD(shmem_swaplist); | 194 | static LIST_HEAD(shmem_swaplist); |
200 | static DEFINE_MUTEX(shmem_swaplist_mutex); | 195 | static DEFINE_MUTEX(shmem_swaplist_mutex); |
201 | 196 | ||
@@ -765,11 +760,11 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
765 | goto redirty; | 760 | goto redirty; |
766 | 761 | ||
767 | /* | 762 | /* |
768 | * shmem_backing_dev_info's capabilities prevent regular writeback or | 763 | * Our capabilities prevent regular writeback or sync from ever calling |
769 | * sync from ever calling shmem_writepage; but a stacking filesystem | 764 | * shmem_writepage; but a stacking filesystem might use ->writepage of |
770 | * might use ->writepage of its underlying filesystem, in which case | 765 | * its underlying filesystem, in which case tmpfs should write out to |
771 | * tmpfs should write out to swap only in response to memory pressure, | 766 | * swap only in response to memory pressure, and not for the writeback |
772 | * and not for the writeback threads or sync. | 767 | * threads or sync. |
773 | */ | 768 | */ |
774 | if (!wbc->for_reclaim) { | 769 | if (!wbc->for_reclaim) { |
775 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ | 770 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ |
@@ -1013,7 +1008,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
1013 | */ | 1008 | */ |
1014 | oldpage = newpage; | 1009 | oldpage = newpage; |
1015 | } else { | 1010 | } else { |
1016 | mem_cgroup_migrate(oldpage, newpage, false); | 1011 | mem_cgroup_migrate(oldpage, newpage, true); |
1017 | lru_cache_add_anon(newpage); | 1012 | lru_cache_add_anon(newpage); |
1018 | *pagep = newpage; | 1013 | *pagep = newpage; |
1019 | } | 1014 | } |
@@ -1131,7 +1126,7 @@ repeat: | |||
1131 | * truncated or holepunched since swap was confirmed. | 1126 | * truncated or holepunched since swap was confirmed. |
1132 | * shmem_undo_range() will have done some of the | 1127 | * shmem_undo_range() will have done some of the |
1133 | * unaccounting, now delete_from_swap_cache() will do | 1128 | * unaccounting, now delete_from_swap_cache() will do |
1134 | * the rest (including mem_cgroup_uncharge_swapcache). | 1129 | * the rest. |
1135 | * Reset swap.val? No, leave it so "failed" goes back to | 1130 | * Reset swap.val? No, leave it so "failed" goes back to |
1136 | * "repeat": reading a hole and writing should succeed. | 1131 | * "repeat": reading a hole and writing should succeed. |
1137 | */ | 1132 | */ |
@@ -1415,7 +1410,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1415 | inode->i_ino = get_next_ino(); | 1410 | inode->i_ino = get_next_ino(); |
1416 | inode_init_owner(inode, dir, mode); | 1411 | inode_init_owner(inode, dir, mode); |
1417 | inode->i_blocks = 0; | 1412 | inode->i_blocks = 0; |
1418 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; | ||
1419 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 1413 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
1420 | inode->i_generation = get_seconds(); | 1414 | inode->i_generation = get_seconds(); |
1421 | info = SHMEM_I(inode); | 1415 | info = SHMEM_I(inode); |
@@ -1461,7 +1455,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1461 | 1455 | ||
1462 | bool shmem_mapping(struct address_space *mapping) | 1456 | bool shmem_mapping(struct address_space *mapping) |
1463 | { | 1457 | { |
1464 | return mapping->backing_dev_info == &shmem_backing_dev_info; | 1458 | return mapping->host->i_sb->s_op == &shmem_ops; |
1465 | } | 1459 | } |
1466 | 1460 | ||
1467 | #ifdef CONFIG_TMPFS | 1461 | #ifdef CONFIG_TMPFS |
@@ -3201,7 +3195,6 @@ static const struct vm_operations_struct shmem_vm_ops = { | |||
3201 | .set_policy = shmem_set_policy, | 3195 | .set_policy = shmem_set_policy, |
3202 | .get_policy = shmem_get_policy, | 3196 | .get_policy = shmem_get_policy, |
3203 | #endif | 3197 | #endif |
3204 | .remap_pages = generic_file_remap_pages, | ||
3205 | }; | 3198 | }; |
3206 | 3199 | ||
3207 | static struct dentry *shmem_mount(struct file_system_type *fs_type, | 3200 | static struct dentry *shmem_mount(struct file_system_type *fs_type, |
@@ -3226,10 +3219,6 @@ int __init shmem_init(void) | |||
3226 | if (shmem_inode_cachep) | 3219 | if (shmem_inode_cachep) |
3227 | return 0; | 3220 | return 0; |
3228 | 3221 | ||
3229 | error = bdi_init(&shmem_backing_dev_info); | ||
3230 | if (error) | ||
3231 | goto out4; | ||
3232 | |||
3233 | error = shmem_init_inodecache(); | 3222 | error = shmem_init_inodecache(); |
3234 | if (error) | 3223 | if (error) |
3235 | goto out3; | 3224 | goto out3; |
@@ -3253,8 +3242,6 @@ out1: | |||
3253 | out2: | 3242 | out2: |
3254 | shmem_destroy_inodecache(); | 3243 | shmem_destroy_inodecache(); |
3255 | out3: | 3244 | out3: |
3256 | bdi_destroy(&shmem_backing_dev_info); | ||
3257 | out4: | ||
3258 | shm_mnt = ERR_PTR(error); | 3245 | shm_mnt = ERR_PTR(error); |
3259 | return error; | 3246 | return error; |
3260 | } | 3247 | } |
@@ -2382,7 +2382,7 @@ out: | |||
2382 | return nr_freed; | 2382 | return nr_freed; |
2383 | } | 2383 | } |
2384 | 2384 | ||
2385 | int __kmem_cache_shrink(struct kmem_cache *cachep) | 2385 | int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) |
2386 | { | 2386 | { |
2387 | int ret = 0; | 2387 | int ret = 0; |
2388 | int node; | 2388 | int node; |
@@ -2404,7 +2404,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) | |||
2404 | { | 2404 | { |
2405 | int i; | 2405 | int i; |
2406 | struct kmem_cache_node *n; | 2406 | struct kmem_cache_node *n; |
2407 | int rc = __kmem_cache_shrink(cachep); | 2407 | int rc = __kmem_cache_shrink(cachep, false); |
2408 | 2408 | ||
2409 | if (rc) | 2409 | if (rc) |
2410 | return rc; | 2410 | return rc; |
@@ -3708,8 +3708,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3708 | int batchcount, int shared, gfp_t gfp) | 3708 | int batchcount, int shared, gfp_t gfp) |
3709 | { | 3709 | { |
3710 | int ret; | 3710 | int ret; |
3711 | struct kmem_cache *c = NULL; | 3711 | struct kmem_cache *c; |
3712 | int i = 0; | ||
3713 | 3712 | ||
3714 | ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); | 3713 | ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); |
3715 | 3714 | ||
@@ -3719,12 +3718,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3719 | if ((ret < 0) || !is_root_cache(cachep)) | 3718 | if ((ret < 0) || !is_root_cache(cachep)) |
3720 | return ret; | 3719 | return ret; |
3721 | 3720 | ||
3722 | VM_BUG_ON(!mutex_is_locked(&slab_mutex)); | 3721 | lockdep_assert_held(&slab_mutex); |
3723 | for_each_memcg_cache_index(i) { | 3722 | for_each_memcg_cache(c, cachep) { |
3724 | c = cache_from_memcg_idx(cachep, i); | 3723 | /* return value determined by the root cache only */ |
3725 | if (c) | 3724 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); |
3726 | /* return value determined by the parent cache only */ | ||
3727 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); | ||
3728 | } | 3725 | } |
3729 | 3726 | ||
3730 | return ret; | 3727 | return ret; |
@@ -86,8 +86,6 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, | |||
86 | extern void create_boot_cache(struct kmem_cache *, const char *name, | 86 | extern void create_boot_cache(struct kmem_cache *, const char *name, |
87 | size_t size, unsigned long flags); | 87 | size_t size, unsigned long flags); |
88 | 88 | ||
89 | struct mem_cgroup; | ||
90 | |||
91 | int slab_unmergeable(struct kmem_cache *s); | 89 | int slab_unmergeable(struct kmem_cache *s); |
92 | struct kmem_cache *find_mergeable(size_t size, size_t align, | 90 | struct kmem_cache *find_mergeable(size_t size, size_t align, |
93 | unsigned long flags, const char *name, void (*ctor)(void *)); | 91 | unsigned long flags, const char *name, void (*ctor)(void *)); |
@@ -140,7 +138,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, | |||
140 | #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) | 138 | #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) |
141 | 139 | ||
142 | int __kmem_cache_shutdown(struct kmem_cache *); | 140 | int __kmem_cache_shutdown(struct kmem_cache *); |
143 | int __kmem_cache_shrink(struct kmem_cache *); | 141 | int __kmem_cache_shrink(struct kmem_cache *, bool); |
144 | void slab_kmem_cache_release(struct kmem_cache *); | 142 | void slab_kmem_cache_release(struct kmem_cache *); |
145 | 143 | ||
146 | struct seq_file; | 144 | struct seq_file; |
@@ -165,16 +163,27 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
165 | size_t count, loff_t *ppos); | 163 | size_t count, loff_t *ppos); |
166 | 164 | ||
167 | #ifdef CONFIG_MEMCG_KMEM | 165 | #ifdef CONFIG_MEMCG_KMEM |
166 | /* | ||
167 | * Iterate over all memcg caches of the given root cache. The caller must hold | ||
168 | * slab_mutex. | ||
169 | */ | ||
170 | #define for_each_memcg_cache(iter, root) \ | ||
171 | list_for_each_entry(iter, &(root)->memcg_params.list, \ | ||
172 | memcg_params.list) | ||
173 | |||
174 | #define for_each_memcg_cache_safe(iter, tmp, root) \ | ||
175 | list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \ | ||
176 | memcg_params.list) | ||
177 | |||
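For illustration, a hypothetical helper (not part of this patch) showing how the iterator above is meant to be used under slab_mutex:

/* Hypothetical example only: count the per-memcg children of a root cache. */
static unsigned int count_memcg_children(struct kmem_cache *root)
{
        struct kmem_cache *c;
        unsigned int nr = 0;

        lockdep_assert_held(&slab_mutex);
        for_each_memcg_cache(c, root)
                nr++;
        return nr;
}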
168 | static inline bool is_root_cache(struct kmem_cache *s) | 178 | static inline bool is_root_cache(struct kmem_cache *s) |
169 | { | 179 | { |
170 | return !s->memcg_params || s->memcg_params->is_root_cache; | 180 | return s->memcg_params.is_root_cache; |
171 | } | 181 | } |
172 | 182 | ||
173 | static inline bool slab_equal_or_root(struct kmem_cache *s, | 183 | static inline bool slab_equal_or_root(struct kmem_cache *s, |
174 | struct kmem_cache *p) | 184 | struct kmem_cache *p) |
175 | { | 185 | { |
176 | return (p == s) || | 186 | return p == s || p == s->memcg_params.root_cache; |
177 | (s->memcg_params && (p == s->memcg_params->root_cache)); | ||
178 | } | 187 | } |
179 | 188 | ||
180 | /* | 189 | /* |
@@ -185,37 +194,30 @@ static inline bool slab_equal_or_root(struct kmem_cache *s, | |||
185 | static inline const char *cache_name(struct kmem_cache *s) | 194 | static inline const char *cache_name(struct kmem_cache *s) |
186 | { | 195 | { |
187 | if (!is_root_cache(s)) | 196 | if (!is_root_cache(s)) |
188 | return s->memcg_params->root_cache->name; | 197 | s = s->memcg_params.root_cache; |
189 | return s->name; | 198 | return s->name; |
190 | } | 199 | } |
191 | 200 | ||
192 | /* | 201 | /* |
193 | * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. | 202 | * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. |
194 | * That said the caller must assure the memcg's cache won't go away. Since once | 203 | * That said the caller must assure the memcg's cache won't go away by either |
195 | * created a memcg's cache is destroyed only along with the root cache, it is | 204 | * taking a css reference to the owner cgroup, or holding the slab_mutex. |
196 | * true if we are going to allocate from the cache or hold a reference to the | ||
197 | * root cache by other means. Otherwise, we should hold either the slab_mutex | ||
198 | * or the memcg's slab_caches_mutex while calling this function and accessing | ||
199 | * the returned value. | ||
200 | */ | 205 | */ |
201 | static inline struct kmem_cache * | 206 | static inline struct kmem_cache * |
202 | cache_from_memcg_idx(struct kmem_cache *s, int idx) | 207 | cache_from_memcg_idx(struct kmem_cache *s, int idx) |
203 | { | 208 | { |
204 | struct kmem_cache *cachep; | 209 | struct kmem_cache *cachep; |
205 | struct memcg_cache_params *params; | 210 | struct memcg_cache_array *arr; |
206 | |||
207 | if (!s->memcg_params) | ||
208 | return NULL; | ||
209 | 211 | ||
210 | rcu_read_lock(); | 212 | rcu_read_lock(); |
211 | params = rcu_dereference(s->memcg_params); | 213 | arr = rcu_dereference(s->memcg_params.memcg_caches); |
212 | 214 | ||
213 | /* | 215 | /* |
214 | * Make sure we will access the up-to-date value. The code updating | 216 | * Make sure we will access the up-to-date value. The code updating |
215 | * memcg_caches issues a write barrier to match this (see | 217 | * memcg_caches issues a write barrier to match this (see |
216 | * memcg_register_cache()). | 218 | * memcg_create_kmem_cache()). |
217 | */ | 219 | */ |
218 | cachep = lockless_dereference(params->memcg_caches[idx]); | 220 | cachep = lockless_dereference(arr->entries[idx]); |
219 | rcu_read_unlock(); | 221 | rcu_read_unlock(); |
220 | 222 | ||
221 | return cachep; | 223 | return cachep; |
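The ordering this comment depends on is a plain publish/consume pairing. The sketch below is a simplified illustration, with an invented type and helper names, of how the writer's smp_wmb() in memcg_create_kmem_cache() pairs with the lockless_dereference() above:

struct cache_array_sketch {
        struct kmem_cache *entries[1];    /* stand-in for memcg_cache_array */
};

static void publish_entry(struct cache_array_sketch *arr, int idx,
                          struct kmem_cache *s)
{
        /* All stores that initialised *s must be visible before the pointer. */
        smp_wmb();
        arr->entries[idx] = s;
}

static struct kmem_cache *read_entry(struct cache_array_sketch *arr, int idx)
{
        /* Dependency-ordered load: sees either NULL or a fully set up cache. */
        return lockless_dereference(arr->entries[idx]);
}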
@@ -225,7 +227,7 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | |||
225 | { | 227 | { |
226 | if (is_root_cache(s)) | 228 | if (is_root_cache(s)) |
227 | return s; | 229 | return s; |
228 | return s->memcg_params->root_cache; | 230 | return s->memcg_params.root_cache; |
229 | } | 231 | } |
230 | 232 | ||
231 | static __always_inline int memcg_charge_slab(struct kmem_cache *s, | 233 | static __always_inline int memcg_charge_slab(struct kmem_cache *s, |
@@ -235,7 +237,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, | |||
235 | return 0; | 237 | return 0; |
236 | if (is_root_cache(s)) | 238 | if (is_root_cache(s)) |
237 | return 0; | 239 | return 0; |
238 | return __memcg_charge_slab(s, gfp, order); | 240 | return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); |
239 | } | 241 | } |
240 | 242 | ||
241 | static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | 243 | static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) |
@@ -244,9 +246,18 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | |||
244 | return; | 246 | return; |
245 | if (is_root_cache(s)) | 247 | if (is_root_cache(s)) |
246 | return; | 248 | return; |
247 | __memcg_uncharge_slab(s, order); | 249 | memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); |
248 | } | 250 | } |
249 | #else | 251 | |
252 | extern void slab_init_memcg_params(struct kmem_cache *); | ||
253 | |||
254 | #else /* !CONFIG_MEMCG_KMEM */ | ||
255 | |||
256 | #define for_each_memcg_cache(iter, root) \ | ||
257 | for ((void)(iter), (void)(root); 0; ) | ||
258 | #define for_each_memcg_cache_safe(iter, tmp, root) \ | ||
259 | for ((void)(iter), (void)(tmp), (void)(root); 0; ) | ||
260 | |||
250 | static inline bool is_root_cache(struct kmem_cache *s) | 261 | static inline bool is_root_cache(struct kmem_cache *s) |
251 | { | 262 | { |
252 | return true; | 263 | return true; |
@@ -282,7 +293,11 @@ static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) | |||
282 | static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) | 293 | static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) |
283 | { | 294 | { |
284 | } | 295 | } |
285 | #endif | 296 | |
297 | static inline void slab_init_memcg_params(struct kmem_cache *s) | ||
298 | { | ||
299 | } | ||
300 | #endif /* CONFIG_MEMCG_KMEM */ | ||
286 | 301 | ||
287 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) | 302 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) |
288 | { | 303 | { |
diff --git a/mm/slab_common.c b/mm/slab_common.c index e03dd6f2a272..999bb3424d44 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -106,62 +106,67 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) | |||
106 | #endif | 106 | #endif |
107 | 107 | ||
108 | #ifdef CONFIG_MEMCG_KMEM | 108 | #ifdef CONFIG_MEMCG_KMEM |
109 | static int memcg_alloc_cache_params(struct mem_cgroup *memcg, | 109 | void slab_init_memcg_params(struct kmem_cache *s) |
110 | struct kmem_cache *s, struct kmem_cache *root_cache) | ||
111 | { | 110 | { |
112 | size_t size; | 111 | s->memcg_params.is_root_cache = true; |
112 | INIT_LIST_HEAD(&s->memcg_params.list); | ||
113 | RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); | ||
114 | } | ||
115 | |||
116 | static int init_memcg_params(struct kmem_cache *s, | ||
117 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) | ||
118 | { | ||
119 | struct memcg_cache_array *arr; | ||
113 | 120 | ||
114 | if (!memcg_kmem_enabled()) | 121 | if (memcg) { |
122 | s->memcg_params.is_root_cache = false; | ||
123 | s->memcg_params.memcg = memcg; | ||
124 | s->memcg_params.root_cache = root_cache; | ||
115 | return 0; | 125 | return 0; |
126 | } | ||
116 | 127 | ||
117 | if (!memcg) { | 128 | slab_init_memcg_params(s); |
118 | size = offsetof(struct memcg_cache_params, memcg_caches); | ||
119 | size += memcg_limited_groups_array_size * sizeof(void *); | ||
120 | } else | ||
121 | size = sizeof(struct memcg_cache_params); | ||
122 | 129 | ||
123 | s->memcg_params = kzalloc(size, GFP_KERNEL); | 130 | if (!memcg_nr_cache_ids) |
124 | if (!s->memcg_params) | 131 | return 0; |
125 | return -ENOMEM; | ||
126 | 132 | ||
127 | if (memcg) { | 133 | arr = kzalloc(sizeof(struct memcg_cache_array) + |
128 | s->memcg_params->memcg = memcg; | 134 | memcg_nr_cache_ids * sizeof(void *), |
129 | s->memcg_params->root_cache = root_cache; | 135 | GFP_KERNEL); |
130 | } else | 136 | if (!arr) |
131 | s->memcg_params->is_root_cache = true; | 137 | return -ENOMEM; |
132 | 138 | ||
139 | RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr); | ||
133 | return 0; | 140 | return 0; |
134 | } | 141 | } |
135 | 142 | ||
136 | static void memcg_free_cache_params(struct kmem_cache *s) | 143 | static void destroy_memcg_params(struct kmem_cache *s) |
137 | { | 144 | { |
138 | kfree(s->memcg_params); | 145 | if (is_root_cache(s)) |
146 | kfree(rcu_access_pointer(s->memcg_params.memcg_caches)); | ||
139 | } | 147 | } |
140 | 148 | ||
141 | static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) | 149 | static int update_memcg_params(struct kmem_cache *s, int new_array_size) |
142 | { | 150 | { |
143 | int size; | 151 | struct memcg_cache_array *old, *new; |
144 | struct memcg_cache_params *new_params, *cur_params; | ||
145 | |||
146 | BUG_ON(!is_root_cache(s)); | ||
147 | 152 | ||
148 | size = offsetof(struct memcg_cache_params, memcg_caches); | 153 | if (!is_root_cache(s)) |
149 | size += num_memcgs * sizeof(void *); | 154 | return 0; |
150 | 155 | ||
151 | new_params = kzalloc(size, GFP_KERNEL); | 156 | new = kzalloc(sizeof(struct memcg_cache_array) + |
152 | if (!new_params) | 157 | new_array_size * sizeof(void *), GFP_KERNEL); |
158 | if (!new) | ||
153 | return -ENOMEM; | 159 | return -ENOMEM; |
154 | 160 | ||
155 | cur_params = s->memcg_params; | 161 | old = rcu_dereference_protected(s->memcg_params.memcg_caches, |
156 | memcpy(new_params->memcg_caches, cur_params->memcg_caches, | 162 | lockdep_is_held(&slab_mutex)); |
157 | memcg_limited_groups_array_size * sizeof(void *)); | 163 | if (old) |
158 | 164 | memcpy(new->entries, old->entries, | |
159 | new_params->is_root_cache = true; | 165 | memcg_nr_cache_ids * sizeof(void *)); |
160 | |||
161 | rcu_assign_pointer(s->memcg_params, new_params); | ||
162 | if (cur_params) | ||
163 | kfree_rcu(cur_params, rcu_head); | ||
164 | 166 | ||
167 | rcu_assign_pointer(s->memcg_params.memcg_caches, new); | ||
168 | if (old) | ||
169 | kfree_rcu(old, rcu); | ||
165 | return 0; | 170 | return 0; |
166 | } | 171 | } |
167 | 172 | ||
@@ -169,34 +174,28 @@ int memcg_update_all_caches(int num_memcgs) | |||
169 | { | 174 | { |
170 | struct kmem_cache *s; | 175 | struct kmem_cache *s; |
171 | int ret = 0; | 176 | int ret = 0; |
172 | mutex_lock(&slab_mutex); | ||
173 | 177 | ||
178 | mutex_lock(&slab_mutex); | ||
174 | list_for_each_entry(s, &slab_caches, list) { | 179 | list_for_each_entry(s, &slab_caches, list) { |
175 | if (!is_root_cache(s)) | 180 | ret = update_memcg_params(s, num_memcgs); |
176 | continue; | ||
177 | |||
178 | ret = memcg_update_cache_params(s, num_memcgs); | ||
179 | /* | 181 | /* |
180 | * Instead of freeing the memory, we'll just leave the caches | 182 | * Instead of freeing the memory, we'll just leave the caches |
181 | * up to this point in an updated state. | 183 | * up to this point in an updated state. |
182 | */ | 184 | */ |
183 | if (ret) | 185 | if (ret) |
184 | goto out; | 186 | break; |
185 | } | 187 | } |
186 | |||
187 | memcg_update_array_size(num_memcgs); | ||
188 | out: | ||
189 | mutex_unlock(&slab_mutex); | 188 | mutex_unlock(&slab_mutex); |
190 | return ret; | 189 | return ret; |
191 | } | 190 | } |
192 | #else | 191 | #else |
193 | static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, | 192 | static inline int init_memcg_params(struct kmem_cache *s, |
194 | struct kmem_cache *s, struct kmem_cache *root_cache) | 193 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) |
195 | { | 194 | { |
196 | return 0; | 195 | return 0; |
197 | } | 196 | } |
198 | 197 | ||
199 | static inline void memcg_free_cache_params(struct kmem_cache *s) | 198 | static inline void destroy_memcg_params(struct kmem_cache *s) |
200 | { | 199 | { |
201 | } | 200 | } |
202 | #endif /* CONFIG_MEMCG_KMEM */ | 201 | #endif /* CONFIG_MEMCG_KMEM */ |
@@ -296,8 +295,8 @@ unsigned long calculate_alignment(unsigned long flags, | |||
296 | } | 295 | } |
297 | 296 | ||
298 | static struct kmem_cache * | 297 | static struct kmem_cache * |
299 | do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, | 298 | do_kmem_cache_create(const char *name, size_t object_size, size_t size, |
300 | unsigned long flags, void (*ctor)(void *), | 299 | size_t align, unsigned long flags, void (*ctor)(void *), |
301 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) | 300 | struct mem_cgroup *memcg, struct kmem_cache *root_cache) |
302 | { | 301 | { |
303 | struct kmem_cache *s; | 302 | struct kmem_cache *s; |
@@ -314,7 +313,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, | |||
314 | s->align = align; | 313 | s->align = align; |
315 | s->ctor = ctor; | 314 | s->ctor = ctor; |
316 | 315 | ||
317 | err = memcg_alloc_cache_params(memcg, s, root_cache); | 316 | err = init_memcg_params(s, memcg, root_cache); |
318 | if (err) | 317 | if (err) |
319 | goto out_free_cache; | 318 | goto out_free_cache; |
320 | 319 | ||
@@ -330,8 +329,8 @@ out: | |||
330 | return s; | 329 | return s; |
331 | 330 | ||
332 | out_free_cache: | 331 | out_free_cache: |
333 | memcg_free_cache_params(s); | 332 | destroy_memcg_params(s); |
334 | kfree(s); | 333 | kmem_cache_free(kmem_cache, s); |
335 | goto out; | 334 | goto out; |
336 | } | 335 | } |
337 | 336 | ||
@@ -364,11 +363,12 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
364 | unsigned long flags, void (*ctor)(void *)) | 363 | unsigned long flags, void (*ctor)(void *)) |
365 | { | 364 | { |
366 | struct kmem_cache *s; | 365 | struct kmem_cache *s; |
367 | char *cache_name; | 366 | const char *cache_name; |
368 | int err; | 367 | int err; |
369 | 368 | ||
370 | get_online_cpus(); | 369 | get_online_cpus(); |
371 | get_online_mems(); | 370 | get_online_mems(); |
371 | memcg_get_cache_ids(); | ||
372 | 372 | ||
373 | mutex_lock(&slab_mutex); | 373 | mutex_lock(&slab_mutex); |
374 | 374 | ||
@@ -390,7 +390,7 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
390 | if (s) | 390 | if (s) |
391 | goto out_unlock; | 391 | goto out_unlock; |
392 | 392 | ||
393 | cache_name = kstrdup(name, GFP_KERNEL); | 393 | cache_name = kstrdup_const(name, GFP_KERNEL); |
394 | if (!cache_name) { | 394 | if (!cache_name) { |
395 | err = -ENOMEM; | 395 | err = -ENOMEM; |
396 | goto out_unlock; | 396 | goto out_unlock; |
@@ -401,12 +401,13 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
401 | flags, ctor, NULL, NULL); | 401 | flags, ctor, NULL, NULL); |
402 | if (IS_ERR(s)) { | 402 | if (IS_ERR(s)) { |
403 | err = PTR_ERR(s); | 403 | err = PTR_ERR(s); |
404 | kfree(cache_name); | 404 | kfree_const(cache_name); |
405 | } | 405 | } |
406 | 406 | ||
407 | out_unlock: | 407 | out_unlock: |
408 | mutex_unlock(&slab_mutex); | 408 | mutex_unlock(&slab_mutex); |
409 | 409 | ||
410 | memcg_put_cache_ids(); | ||
410 | put_online_mems(); | 411 | put_online_mems(); |
411 | put_online_cpus(); | 412 | put_online_cpus(); |
412 | 413 | ||
@@ -425,31 +426,91 @@ out_unlock: | |||
425 | } | 426 | } |
426 | EXPORT_SYMBOL(kmem_cache_create); | 427 | EXPORT_SYMBOL(kmem_cache_create); |
427 | 428 | ||
429 | static int do_kmem_cache_shutdown(struct kmem_cache *s, | ||
430 | struct list_head *release, bool *need_rcu_barrier) | ||
431 | { | ||
432 | if (__kmem_cache_shutdown(s) != 0) { | ||
433 | printk(KERN_ERR "kmem_cache_destroy %s: " | ||
434 | "Slab cache still has objects\n", s->name); | ||
435 | dump_stack(); | ||
436 | return -EBUSY; | ||
437 | } | ||
438 | |||
439 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
440 | *need_rcu_barrier = true; | ||
441 | |||
442 | #ifdef CONFIG_MEMCG_KMEM | ||
443 | if (!is_root_cache(s)) | ||
444 | list_del(&s->memcg_params.list); | ||
445 | #endif | ||
446 | list_move(&s->list, release); | ||
447 | return 0; | ||
448 | } | ||
449 | |||
450 | static void do_kmem_cache_release(struct list_head *release, | ||
451 | bool need_rcu_barrier) | ||
452 | { | ||
453 | struct kmem_cache *s, *s2; | ||
454 | |||
455 | if (need_rcu_barrier) | ||
456 | rcu_barrier(); | ||
457 | |||
458 | list_for_each_entry_safe(s, s2, release, list) { | ||
459 | #ifdef SLAB_SUPPORTS_SYSFS | ||
460 | sysfs_slab_remove(s); | ||
461 | #else | ||
462 | slab_kmem_cache_release(s); | ||
463 | #endif | ||
464 | } | ||
465 | } | ||
466 | |||
428 | #ifdef CONFIG_MEMCG_KMEM | 467 | #ifdef CONFIG_MEMCG_KMEM |
429 | /* | 468 | /* |
430 | * memcg_create_kmem_cache - Create a cache for a memory cgroup. | 469 | * memcg_create_kmem_cache - Create a cache for a memory cgroup. |
431 | * @memcg: The memory cgroup the new cache is for. | 470 | * @memcg: The memory cgroup the new cache is for. |
432 | * @root_cache: The parent of the new cache. | 471 | * @root_cache: The parent of the new cache. |
433 | * @memcg_name: The name of the memory cgroup (used for naming the new cache). | ||
434 | * | 472 | * |
435 | * This function attempts to create a kmem cache that will serve allocation | 473 | * This function attempts to create a kmem cache that will serve allocation |
436 | * requests going from @memcg to @root_cache. The new cache inherits properties | 474 | * requests going from @memcg to @root_cache. The new cache inherits properties |
437 | * from its parent. | 475 | * from its parent. |
438 | */ | 476 | */ |
439 | struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | 477 | void memcg_create_kmem_cache(struct mem_cgroup *memcg, |
440 | struct kmem_cache *root_cache, | 478 | struct kmem_cache *root_cache) |
441 | const char *memcg_name) | ||
442 | { | 479 | { |
480 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ | ||
481 | struct cgroup_subsys_state *css = mem_cgroup_css(memcg); | ||
482 | struct memcg_cache_array *arr; | ||
443 | struct kmem_cache *s = NULL; | 483 | struct kmem_cache *s = NULL; |
444 | char *cache_name; | 484 | char *cache_name; |
485 | int idx; | ||
445 | 486 | ||
446 | get_online_cpus(); | 487 | get_online_cpus(); |
447 | get_online_mems(); | 488 | get_online_mems(); |
448 | 489 | ||
449 | mutex_lock(&slab_mutex); | 490 | mutex_lock(&slab_mutex); |
450 | 491 | ||
492 | /* | ||
493 | * The memory cgroup could have been deactivated while the cache | ||
494 | * creation work was pending. | ||
495 | */ | ||
496 | if (!memcg_kmem_is_active(memcg)) | ||
497 | goto out_unlock; | ||
498 | |||
499 | idx = memcg_cache_id(memcg); | ||
500 | arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches, | ||
501 | lockdep_is_held(&slab_mutex)); | ||
502 | |||
503 | /* | ||
504 | * Since per-memcg caches are created asynchronously on first | ||
505 | * allocation (see memcg_kmem_get_cache()), several threads can try to | ||
506 | * create the same cache, but only one of them may succeed. | ||
507 | */ | ||
508 | if (arr->entries[idx]) | ||
509 | goto out_unlock; | ||
510 | |||
511 | cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); | ||
451 | cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, | 512 | cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, |
452 | memcg_cache_id(memcg), memcg_name); | 513 | css->id, memcg_name_buf); |
453 | if (!cache_name) | 514 | if (!cache_name) |
454 | goto out_unlock; | 515 | goto out_unlock; |
455 | 516 | ||
@@ -457,49 +518,108 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | |||
457 | root_cache->size, root_cache->align, | 518 | root_cache->size, root_cache->align, |
458 | root_cache->flags, root_cache->ctor, | 519 | root_cache->flags, root_cache->ctor, |
459 | memcg, root_cache); | 520 | memcg, root_cache); |
521 | /* | ||
522 | * If we could not create a memcg cache, do not complain, because | ||
523 | * that's not critical at all as we can always proceed with the root | ||
524 | * cache. | ||
525 | */ | ||
460 | if (IS_ERR(s)) { | 526 | if (IS_ERR(s)) { |
461 | kfree(cache_name); | 527 | kfree(cache_name); |
462 | s = NULL; | 528 | goto out_unlock; |
463 | } | 529 | } |
464 | 530 | ||
531 | list_add(&s->memcg_params.list, &root_cache->memcg_params.list); | ||
532 | |||
533 | /* | ||
534 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | ||
535 | * barrier here to ensure nobody will see the kmem_cache partially | ||
536 | * initialized. | ||
537 | */ | ||
538 | smp_wmb(); | ||
539 | arr->entries[idx] = s; | ||
540 | |||
465 | out_unlock: | 541 | out_unlock: |
466 | mutex_unlock(&slab_mutex); | 542 | mutex_unlock(&slab_mutex); |
467 | 543 | ||
468 | put_online_mems(); | 544 | put_online_mems(); |
469 | put_online_cpus(); | 545 | put_online_cpus(); |
470 | |||
471 | return s; | ||
472 | } | 546 | } |
473 | 547 | ||
474 | static int memcg_cleanup_cache_params(struct kmem_cache *s) | 548 | void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) |
475 | { | 549 | { |
476 | int rc; | 550 | int idx; |
551 | struct memcg_cache_array *arr; | ||
552 | struct kmem_cache *s, *c; | ||
477 | 553 | ||
478 | if (!s->memcg_params || | 554 | idx = memcg_cache_id(memcg); |
479 | !s->memcg_params->is_root_cache) | 555 | |
480 | return 0; | 556 | get_online_cpus(); |
557 | get_online_mems(); | ||
481 | 558 | ||
482 | mutex_unlock(&slab_mutex); | ||
483 | rc = __memcg_cleanup_cache_params(s); | ||
484 | mutex_lock(&slab_mutex); | 559 | mutex_lock(&slab_mutex); |
560 | list_for_each_entry(s, &slab_caches, list) { | ||
561 | if (!is_root_cache(s)) | ||
562 | continue; | ||
563 | |||
564 | arr = rcu_dereference_protected(s->memcg_params.memcg_caches, | ||
565 | lockdep_is_held(&slab_mutex)); | ||
566 | c = arr->entries[idx]; | ||
567 | if (!c) | ||
568 | continue; | ||
569 | |||
570 | __kmem_cache_shrink(c, true); | ||
571 | arr->entries[idx] = NULL; | ||
572 | } | ||
573 | mutex_unlock(&slab_mutex); | ||
485 | 574 | ||
486 | return rc; | 575 | put_online_mems(); |
576 | put_online_cpus(); | ||
487 | } | 577 | } |
488 | #else | 578 | |
489 | static int memcg_cleanup_cache_params(struct kmem_cache *s) | 579 | void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) |
490 | { | 580 | { |
491 | return 0; | 581 | LIST_HEAD(release); |
582 | bool need_rcu_barrier = false; | ||
583 | struct kmem_cache *s, *s2; | ||
584 | |||
585 | get_online_cpus(); | ||
586 | get_online_mems(); | ||
587 | |||
588 | mutex_lock(&slab_mutex); | ||
589 | list_for_each_entry_safe(s, s2, &slab_caches, list) { | ||
590 | if (is_root_cache(s) || s->memcg_params.memcg != memcg) | ||
591 | continue; | ||
592 | /* | ||
593 | * The cgroup is about to be freed and therefore has no charges | ||
594 | * left. Hence, all its caches must be empty by now. | ||
595 | */ | ||
596 | BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier)); | ||
597 | } | ||
598 | mutex_unlock(&slab_mutex); | ||
599 | |||
600 | put_online_mems(); | ||
601 | put_online_cpus(); | ||
602 | |||
603 | do_kmem_cache_release(&release, need_rcu_barrier); | ||
492 | } | 604 | } |
493 | #endif /* CONFIG_MEMCG_KMEM */ | 605 | #endif /* CONFIG_MEMCG_KMEM */ |
494 | 606 | ||
495 | void slab_kmem_cache_release(struct kmem_cache *s) | 607 | void slab_kmem_cache_release(struct kmem_cache *s) |
496 | { | 608 | { |
497 | kfree(s->name); | 609 | destroy_memcg_params(s); |
610 | kfree_const(s->name); | ||
498 | kmem_cache_free(kmem_cache, s); | 611 | kmem_cache_free(kmem_cache, s); |
499 | } | 612 | } |
500 | 613 | ||
501 | void kmem_cache_destroy(struct kmem_cache *s) | 614 | void kmem_cache_destroy(struct kmem_cache *s) |
502 | { | 615 | { |
616 | struct kmem_cache *c, *c2; | ||
617 | LIST_HEAD(release); | ||
618 | bool need_rcu_barrier = false; | ||
619 | bool busy = false; | ||
620 | |||
621 | BUG_ON(!is_root_cache(s)); | ||
622 | |||
503 | get_online_cpus(); | 623 | get_online_cpus(); |
504 | get_online_mems(); | 624 | get_online_mems(); |
505 | 625 | ||
@@ -509,35 +629,21 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
509 | if (s->refcount) | 629 | if (s->refcount) |
510 | goto out_unlock; | 630 | goto out_unlock; |
511 | 631 | ||
512 | if (memcg_cleanup_cache_params(s) != 0) | 632 | for_each_memcg_cache_safe(c, c2, s) { |
513 | goto out_unlock; | 633 | if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) |
514 | 634 | busy = true; | |
515 | if (__kmem_cache_shutdown(s) != 0) { | ||
516 | printk(KERN_ERR "kmem_cache_destroy %s: " | ||
517 | "Slab cache still has objects\n", s->name); | ||
518 | dump_stack(); | ||
519 | goto out_unlock; | ||
520 | } | 635 | } |
521 | 636 | ||
522 | list_del(&s->list); | 637 | if (!busy) |
523 | 638 | do_kmem_cache_shutdown(s, &release, &need_rcu_barrier); | |
524 | mutex_unlock(&slab_mutex); | ||
525 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
526 | rcu_barrier(); | ||
527 | |||
528 | memcg_free_cache_params(s); | ||
529 | #ifdef SLAB_SUPPORTS_SYSFS | ||
530 | sysfs_slab_remove(s); | ||
531 | #else | ||
532 | slab_kmem_cache_release(s); | ||
533 | #endif | ||
534 | goto out; | ||
535 | 639 | ||
536 | out_unlock: | 640 | out_unlock: |
537 | mutex_unlock(&slab_mutex); | 641 | mutex_unlock(&slab_mutex); |
538 | out: | 642 | |
539 | put_online_mems(); | 643 | put_online_mems(); |
540 | put_online_cpus(); | 644 | put_online_cpus(); |
645 | |||
646 | do_kmem_cache_release(&release, need_rcu_barrier); | ||
541 | } | 647 | } |
542 | EXPORT_SYMBOL(kmem_cache_destroy); | 648 | EXPORT_SYMBOL(kmem_cache_destroy); |
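For context, a typical caller of this API looks like the hypothetical sketch below (struct foo and all names are invented); with the reworked path above, destroying the root cache also shuts down any per-memcg children before the caches are released:

/* Hypothetical module-style usage; struct foo is invented for the sketch. */
struct foo {
        int a, b;
};

static struct kmem_cache *foo_cachep;

static int __init foo_init(void)
{
        foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
                                       0, SLAB_HWCACHE_ALIGN, NULL);
        return foo_cachep ? 0 : -ENOMEM;
}

static void __exit foo_exit(void)
{
        /* Tears down the root cache together with its per-memcg children. */
        kmem_cache_destroy(foo_cachep);
}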
543 | 649 | ||
@@ -554,7 +660,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) | |||
554 | 660 | ||
555 | get_online_cpus(); | 661 | get_online_cpus(); |
556 | get_online_mems(); | 662 | get_online_mems(); |
557 | ret = __kmem_cache_shrink(cachep); | 663 | ret = __kmem_cache_shrink(cachep, false); |
558 | put_online_mems(); | 664 | put_online_mems(); |
559 | put_online_cpus(); | 665 | put_online_cpus(); |
560 | return ret; | 666 | return ret; |
@@ -576,6 +682,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz | |||
576 | s->name = name; | 682 | s->name = name; |
577 | s->size = s->object_size = size; | 683 | s->size = s->object_size = size; |
578 | s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); | 684 | s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); |
685 | |||
686 | slab_init_memcg_params(s); | ||
687 | |||
579 | err = __kmem_cache_create(s, flags); | 688 | err = __kmem_cache_create(s, flags); |
580 | 689 | ||
581 | if (err) | 690 | if (err) |
@@ -789,6 +898,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) | |||
789 | page = alloc_kmem_pages(flags, order); | 898 | page = alloc_kmem_pages(flags, order); |
790 | ret = page ? page_address(page) : NULL; | 899 | ret = page ? page_address(page) : NULL; |
791 | kmemleak_alloc(ret, size, 1, flags); | 900 | kmemleak_alloc(ret, size, 1, flags); |
901 | kasan_kmalloc_large(ret, size); | ||
792 | return ret; | 902 | return ret; |
793 | } | 903 | } |
794 | EXPORT_SYMBOL(kmalloc_order); | 904 | EXPORT_SYMBOL(kmalloc_order); |
@@ -855,16 +965,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) | |||
855 | { | 965 | { |
856 | struct kmem_cache *c; | 966 | struct kmem_cache *c; |
857 | struct slabinfo sinfo; | 967 | struct slabinfo sinfo; |
858 | int i; | ||
859 | 968 | ||
860 | if (!is_root_cache(s)) | 969 | if (!is_root_cache(s)) |
861 | return; | 970 | return; |
862 | 971 | ||
863 | for_each_memcg_cache_index(i) { | 972 | for_each_memcg_cache(c, s) { |
864 | c = cache_from_memcg_idx(s, i); | ||
865 | if (!c) | ||
866 | continue; | ||
867 | |||
868 | memset(&sinfo, 0, sizeof(sinfo)); | 973 | memset(&sinfo, 0, sizeof(sinfo)); |
869 | get_slabinfo(c, &sinfo); | 974 | get_slabinfo(c, &sinfo); |
870 | 975 | ||
@@ -916,7 +1021,7 @@ int memcg_slab_show(struct seq_file *m, void *p) | |||
916 | 1021 | ||
917 | if (p == slab_caches.next) | 1022 | if (p == slab_caches.next) |
918 | print_slabinfo_header(m); | 1023 | print_slabinfo_header(m); |
919 | if (!is_root_cache(s) && s->memcg_params->memcg == memcg) | 1024 | if (!is_root_cache(s) && s->memcg_params.memcg == memcg) |
920 | cache_show(s, m); | 1025 | cache_show(s, m); |
921 | return 0; | 1026 | return 0; |
922 | } | 1027 | } |
@@ -973,8 +1078,10 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, | |||
973 | if (p) | 1078 | if (p) |
974 | ks = ksize(p); | 1079 | ks = ksize(p); |
975 | 1080 | ||
976 | if (ks >= new_size) | 1081 | if (ks >= new_size) { |
1082 | kasan_krealloc((void *)p, new_size); | ||
977 | return (void *)p; | 1083 | return (void *)p; |
1084 | } | ||
978 | 1085 | ||
979 | ret = kmalloc_track_caller(new_size, flags); | 1086 | ret = kmalloc_track_caller(new_size, flags); |
980 | if (ret && p) | 1087 | if (ret && p) |
@@ -618,7 +618,7 @@ int __kmem_cache_shutdown(struct kmem_cache *c) | |||
618 | return 0; | 618 | return 0; |
619 | } | 619 | } |
620 | 620 | ||
621 | int __kmem_cache_shrink(struct kmem_cache *d) | 621 | int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) |
622 | { | 622 | { |
623 | return 0; | 623 | return 0; |
624 | } | 624 | } |
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
21 | #include <linux/notifier.h> | 21 | #include <linux/notifier.h> |
22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
23 | #include <linux/kasan.h> | ||
23 | #include <linux/kmemcheck.h> | 24 | #include <linux/kmemcheck.h> |
24 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
25 | #include <linux/cpuset.h> | 26 | #include <linux/cpuset.h> |
@@ -468,12 +469,30 @@ static char *slub_debug_slabs; | |||
468 | static int disable_higher_order_debug; | 469 | static int disable_higher_order_debug; |
469 | 470 | ||
470 | /* | 471 | /* |
472 | * slub is about to manipulate internal object metadata. This memory lies | ||
473 | * outside the range of the allocated object, so accessing it would normally | ||
474 | * be reported by kasan as a bounds error. metadata_access_enable() is used | ||
475 | * to tell kasan that these accesses are OK. | ||
476 | */ | ||
477 | static inline void metadata_access_enable(void) | ||
478 | { | ||
479 | kasan_disable_current(); | ||
480 | } | ||
481 | |||
482 | static inline void metadata_access_disable(void) | ||
483 | { | ||
484 | kasan_enable_current(); | ||
485 | } | ||
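A sketch of the intended usage pattern (the helper below is invented; the real call sites follow in this hunk): every read of slub-internal metadata is bracketed by the pair so KASAN does not report it as out-of-bounds.

/* Invented example of the pattern only. */
static u8 peek_metadata_byte(u8 *metadata)
{
        u8 byte;

        metadata_access_enable();
        byte = *metadata;       /* would otherwise trip a KASAN bounds report */
        metadata_access_disable();

        return byte;
}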
486 | |||
487 | /* | ||
471 | * Object debugging | 488 | * Object debugging |
472 | */ | 489 | */ |
473 | static void print_section(char *text, u8 *addr, unsigned int length) | 490 | static void print_section(char *text, u8 *addr, unsigned int length) |
474 | { | 491 | { |
492 | metadata_access_enable(); | ||
475 | print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, | 493 | print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, |
476 | length, 1); | 494 | length, 1); |
495 | metadata_access_disable(); | ||
477 | } | 496 | } |
478 | 497 | ||
479 | static struct track *get_track(struct kmem_cache *s, void *object, | 498 | static struct track *get_track(struct kmem_cache *s, void *object, |
@@ -503,7 +522,9 @@ static void set_track(struct kmem_cache *s, void *object, | |||
503 | trace.max_entries = TRACK_ADDRS_COUNT; | 522 | trace.max_entries = TRACK_ADDRS_COUNT; |
504 | trace.entries = p->addrs; | 523 | trace.entries = p->addrs; |
505 | trace.skip = 3; | 524 | trace.skip = 3; |
525 | metadata_access_enable(); | ||
506 | save_stack_trace(&trace); | 526 | save_stack_trace(&trace); |
527 | metadata_access_disable(); | ||
507 | 528 | ||
508 | /* See rant in lockdep.c */ | 529 | /* See rant in lockdep.c */ |
509 | if (trace.nr_entries != 0 && | 530 | if (trace.nr_entries != 0 && |
@@ -629,7 +650,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
629 | dump_stack(); | 650 | dump_stack(); |
630 | } | 651 | } |
631 | 652 | ||
632 | static void object_err(struct kmem_cache *s, struct page *page, | 653 | void object_err(struct kmem_cache *s, struct page *page, |
633 | u8 *object, char *reason) | 654 | u8 *object, char *reason) |
634 | { | 655 | { |
635 | slab_bug(s, "%s", reason); | 656 | slab_bug(s, "%s", reason); |
@@ -677,7 +698,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | |||
677 | u8 *fault; | 698 | u8 *fault; |
678 | u8 *end; | 699 | u8 *end; |
679 | 700 | ||
701 | metadata_access_enable(); | ||
680 | fault = memchr_inv(start, value, bytes); | 702 | fault = memchr_inv(start, value, bytes); |
703 | metadata_access_disable(); | ||
681 | if (!fault) | 704 | if (!fault) |
682 | return 1; | 705 | return 1; |
683 | 706 | ||
@@ -770,7 +793,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) | |||
770 | if (!remainder) | 793 | if (!remainder) |
771 | return 1; | 794 | return 1; |
772 | 795 | ||
796 | metadata_access_enable(); | ||
773 | fault = memchr_inv(end - remainder, POISON_INUSE, remainder); | 797 | fault = memchr_inv(end - remainder, POISON_INUSE, remainder); |
798 | metadata_access_disable(); | ||
774 | if (!fault) | 799 | if (!fault) |
775 | return 1; | 800 | return 1; |
776 | while (end > fault && end[-1] == POISON_INUSE) | 801 | while (end > fault && end[-1] == POISON_INUSE) |
@@ -1226,11 +1251,13 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, | |||
1226 | static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) | 1251 | static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) |
1227 | { | 1252 | { |
1228 | kmemleak_alloc(ptr, size, 1, flags); | 1253 | kmemleak_alloc(ptr, size, 1, flags); |
1254 | kasan_kmalloc_large(ptr, size); | ||
1229 | } | 1255 | } |
1230 | 1256 | ||
1231 | static inline void kfree_hook(const void *x) | 1257 | static inline void kfree_hook(const void *x) |
1232 | { | 1258 | { |
1233 | kmemleak_free(x); | 1259 | kmemleak_free(x); |
1260 | kasan_kfree_large(x); | ||
1234 | } | 1261 | } |
1235 | 1262 | ||
1236 | static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, | 1263 | static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, |
@@ -1253,6 +1280,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, | |||
1253 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | 1280 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); |
1254 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); | 1281 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); |
1255 | memcg_kmem_put_cache(s); | 1282 | memcg_kmem_put_cache(s); |
1283 | kasan_slab_alloc(s, object); | ||
1256 | } | 1284 | } |
1257 | 1285 | ||
1258 | static inline void slab_free_hook(struct kmem_cache *s, void *x) | 1286 | static inline void slab_free_hook(struct kmem_cache *s, void *x) |
@@ -1276,6 +1304,8 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
1276 | #endif | 1304 | #endif |
1277 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 1305 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
1278 | debug_check_no_obj_freed(x, s->object_size); | 1306 | debug_check_no_obj_freed(x, s->object_size); |
1307 | |||
1308 | kasan_slab_free(s, x); | ||
1279 | } | 1309 | } |
1280 | 1310 | ||
1281 | /* | 1311 | /* |
@@ -1370,8 +1400,11 @@ static void setup_object(struct kmem_cache *s, struct page *page, | |||
1370 | void *object) | 1400 | void *object) |
1371 | { | 1401 | { |
1372 | setup_object_debug(s, page, object); | 1402 | setup_object_debug(s, page, object); |
1373 | if (unlikely(s->ctor)) | 1403 | if (unlikely(s->ctor)) { |
1404 | kasan_unpoison_object_data(s, object); | ||
1374 | s->ctor(object); | 1405 | s->ctor(object); |
1406 | kasan_poison_object_data(s, object); | ||
1407 | } | ||
1375 | } | 1408 | } |
1376 | 1409 | ||
1377 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | 1410 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) |
@@ -1404,6 +1437,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1404 | if (unlikely(s->flags & SLAB_POISON)) | 1437 | if (unlikely(s->flags & SLAB_POISON)) |
1405 | memset(start, POISON_INUSE, PAGE_SIZE << order); | 1438 | memset(start, POISON_INUSE, PAGE_SIZE << order); |
1406 | 1439 | ||
1440 | kasan_poison_slab(page); | ||
1441 | |||
1407 | for_each_object_idx(p, idx, s, start, page->objects) { | 1442 | for_each_object_idx(p, idx, s, start, page->objects) { |
1408 | setup_object(s, page, p); | 1443 | setup_object(s, page, p); |
1409 | if (likely(idx < page->objects)) | 1444 | if (likely(idx < page->objects)) |
@@ -2007,6 +2042,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | |||
2007 | int pages; | 2042 | int pages; |
2008 | int pobjects; | 2043 | int pobjects; |
2009 | 2044 | ||
2045 | preempt_disable(); | ||
2010 | do { | 2046 | do { |
2011 | pages = 0; | 2047 | pages = 0; |
2012 | pobjects = 0; | 2048 | pobjects = 0; |
@@ -2040,6 +2076,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | |||
2040 | 2076 | ||
2041 | } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) | 2077 | } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) |
2042 | != oldpage); | 2078 | != oldpage); |
2079 | if (unlikely(!s->cpu_partial)) { | ||
2080 | unsigned long flags; | ||
2081 | |||
2082 | local_irq_save(flags); | ||
2083 | unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); | ||
2084 | local_irq_restore(flags); | ||
2085 | } | ||
2086 | preempt_enable(); | ||
2043 | #endif | 2087 | #endif |
2044 | } | 2088 | } |
2045 | 2089 | ||
@@ -2398,13 +2442,24 @@ redo: | |||
2398 | * reading from one cpu area. That does not matter as long | 2442 | * reading from one cpu area. That does not matter as long |
2399 | * as we end up on the original cpu again when doing the cmpxchg. | 2443 | * as we end up on the original cpu again when doing the cmpxchg. |
2400 | * | 2444 | * |
2401 | * Preemption is disabled for the retrieval of the tid because that | 2445 | * We should guarantee that tid and kmem_cache are retrieved on |
2402 | * must occur from the current processor. We cannot allow rescheduling | 2446 | * the same cpu. They could come from different cpus if CONFIG_PREEMPT |
2403 | * on a different processor between the determination of the pointer | 2447 | * is enabled, so we need to check that they match. |
2404 | * and the retrieval of the tid. | ||
2405 | */ | 2448 | */ |
2406 | preempt_disable(); | 2449 | do { |
2407 | c = this_cpu_ptr(s->cpu_slab); | 2450 | tid = this_cpu_read(s->cpu_slab->tid); |
2451 | c = raw_cpu_ptr(s->cpu_slab); | ||
2452 | } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); | ||
2453 | |||
2454 | /* | ||
2455 | * Irqless object alloc/free algorithm used here depends on sequence | ||
2456 | * of fetching cpu_slab's data. tid should be fetched before anything | ||
2457 | * on c to guarantee that object and page associated with previous tid | ||
2458 | * won't be used with current tid. If we fetch tid first, object and | ||
2459 | * page could be the ones associated with the next tid and our alloc/free |
2460 | * request will fail. In this case, we will retry. So, no problem. |
2461 | */ | ||
2462 | barrier(); | ||
2408 | 2463 | ||
2409 | /* | 2464 | /* |
2410 | * The transaction ids are globally unique per cpu and per operation on | 2465 | * The transaction ids are globally unique per cpu and per operation on |
@@ -2412,8 +2467,6 @@ redo: | |||
2412 | * occurs on the right processor and that there was no operation on the | 2467 | * occurs on the right processor and that there was no operation on the |
2413 | * linked list in between. | 2468 | * linked list in between. |
2414 | */ | 2469 | */ |
2415 | tid = c->tid; | ||
2416 | preempt_enable(); | ||
2417 | 2470 | ||
2418 | object = c->freelist; | 2471 | object = c->freelist; |
2419 | page = c->page; | 2472 | page = c->page; |
@@ -2479,6 +2532,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) | |||
2479 | { | 2532 | { |
2480 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); | 2533 | void *ret = slab_alloc(s, gfpflags, _RET_IP_); |
2481 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); | 2534 | trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); |
2535 | kasan_kmalloc(s, ret, size); | ||
2482 | return ret; | 2536 | return ret; |
2483 | } | 2537 | } |
2484 | EXPORT_SYMBOL(kmem_cache_alloc_trace); | 2538 | EXPORT_SYMBOL(kmem_cache_alloc_trace); |
@@ -2505,6 +2559,8 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, | |||
2505 | 2559 | ||
2506 | trace_kmalloc_node(_RET_IP_, ret, | 2560 | trace_kmalloc_node(_RET_IP_, ret, |
2507 | size, s->size, gfpflags, node); | 2561 | size, s->size, gfpflags, node); |
2562 | |||
2563 | kasan_kmalloc(s, ret, size); | ||
2508 | return ret; | 2564 | return ret; |
2509 | } | 2565 | } |
2510 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); | 2566 | EXPORT_SYMBOL(kmem_cache_alloc_node_trace); |
@@ -2512,7 +2568,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace); | |||
2512 | #endif | 2568 | #endif |
2513 | 2569 | ||
2514 | /* | 2570 | /* |
2515 | * Slow patch handling. This may still be called frequently since objects | 2571 | * Slow path handling. This may still be called frequently since objects |
2516 | * have a longer lifetime than the cpu slabs in most processing loads. | 2572 | * have a longer lifetime than the cpu slabs in most processing loads. |
2517 | * | 2573 | * |
2518 | * So we still attempt to reduce cache line usage. Just take the slab | 2574 | * So we still attempt to reduce cache line usage. Just take the slab |
@@ -2659,11 +2715,13 @@ redo: | |||
2659 | * data is retrieved via this pointer. If we are on the same cpu | 2715 | * data is retrieved via this pointer. If we are on the same cpu |
2660 | * during the cmpxchg then the free will succeed. | 2716 | * during the cmpxchg then the free will succeed. |
2661 | */ | 2717 | */ |
2662 | preempt_disable(); | 2718 | do { |
2663 | c = this_cpu_ptr(s->cpu_slab); | 2719 | tid = this_cpu_read(s->cpu_slab->tid); |
2720 | c = raw_cpu_ptr(s->cpu_slab); | ||
2721 | } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); | ||
2664 | 2722 | ||
2665 | tid = c->tid; | 2723 | /* Same with comment on barrier() in slab_alloc_node() */ |
2666 | preempt_enable(); | 2724 | barrier(); |
2667 | 2725 | ||
2668 | if (likely(page == c->page)) { | 2726 | if (likely(page == c->page)) { |
2669 | set_freepointer(s, object, c->freelist); | 2727 | set_freepointer(s, object, c->freelist); |
@@ -2888,6 +2946,7 @@ static void early_kmem_cache_node_alloc(int node) | |||
2888 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); | 2946 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); |
2889 | init_tracking(kmem_cache_node, n); | 2947 | init_tracking(kmem_cache_node, n); |
2890 | #endif | 2948 | #endif |
2949 | kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); | ||
2891 | init_kmem_cache_node(n); | 2950 | init_kmem_cache_node(n); |
2892 | inc_slabs_node(kmem_cache_node, node, page->objects); | 2951 | inc_slabs_node(kmem_cache_node, node, page->objects); |
2893 | 2952 | ||
@@ -3260,6 +3319,8 @@ void *__kmalloc(size_t size, gfp_t flags) | |||
3260 | 3319 | ||
3261 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); | 3320 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); |
3262 | 3321 | ||
3322 | kasan_kmalloc(s, ret, size); | ||
3323 | |||
3263 | return ret; | 3324 | return ret; |
3264 | } | 3325 | } |
3265 | EXPORT_SYMBOL(__kmalloc); | 3326 | EXPORT_SYMBOL(__kmalloc); |
@@ -3303,12 +3364,14 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
3303 | 3364 | ||
3304 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); | 3365 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); |
3305 | 3366 | ||
3367 | kasan_kmalloc(s, ret, size); | ||
3368 | |||
3306 | return ret; | 3369 | return ret; |
3307 | } | 3370 | } |
3308 | EXPORT_SYMBOL(__kmalloc_node); | 3371 | EXPORT_SYMBOL(__kmalloc_node); |
3309 | #endif | 3372 | #endif |
3310 | 3373 | ||
3311 | size_t ksize(const void *object) | 3374 | static size_t __ksize(const void *object) |
3312 | { | 3375 | { |
3313 | struct page *page; | 3376 | struct page *page; |
3314 | 3377 | ||
@@ -3324,6 +3387,15 @@ size_t ksize(const void *object) | |||
3324 | 3387 | ||
3325 | return slab_ksize(page->slab_cache); | 3388 | return slab_ksize(page->slab_cache); |
3326 | } | 3389 | } |
3390 | |||
3391 | size_t ksize(const void *object) | ||
3392 | { | ||
3393 | size_t size = __ksize(object); | ||
3394 | /* We assume that ksize callers could use the whole allocated area, | ||
3395 | so we need to unpoison this area. */ | ||
3396 | kasan_krealloc(object, size); | ||
3397 | return size; | ||
3398 | } | ||
3327 | EXPORT_SYMBOL(ksize); | 3399 | EXPORT_SYMBOL(ksize); |
3328 | 3400 | ||
3329 | void kfree(const void *x) | 3401 | void kfree(const void *x) |
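With KASAN in the picture, ksize() now unpoisons the whole allocated area, because a caller that asks for the usable size is entitled to use the kmalloc slack beyond its original request. A small illustrative caller (the sizes are an example and assume the object landed in the kmalloc-64 cache):

	char *buf = kmalloc(60, GFP_KERNEL);

	if (buf) {
		size_t cap = ksize(buf);	/* e.g. 64, not 60 */

		/* accessing the full reported capacity is fine after ksize() */
		memset(buf, 0, cap);
	}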
@@ -3347,69 +3419,92 @@ void kfree(const void *x) | |||
3347 | } | 3419 | } |
3348 | EXPORT_SYMBOL(kfree); | 3420 | EXPORT_SYMBOL(kfree); |
3349 | 3421 | ||
3422 | #define SHRINK_PROMOTE_MAX 32 | ||
3423 | |||
3350 | /* | 3424 | /* |
3351 | * kmem_cache_shrink removes empty slabs from the partial lists and sorts | 3425 | * kmem_cache_shrink discards empty slabs and promotes the slabs filled |
3352 | * the remaining slabs by the number of items in use. The slabs with the | 3426 | * up most to the head of the partial lists. New allocations will then |
3353 | * most items in use come first. New allocations will then fill those up | 3427 | * fill those up and thus they can be removed from the partial lists. |
3354 | * and thus they can be removed from the partial lists. | ||
3355 | * | 3428 | * |
3356 | * The slabs with the least items are placed last. This results in them | 3429 | * The slabs with the least items are placed last. This results in them |
3357 | * being allocated from last increasing the chance that the last objects | 3430 | * being allocated from last increasing the chance that the last objects |
3358 | * are freed in them. | 3431 | * are freed in them. |
3359 | */ | 3432 | */ |
3360 | int __kmem_cache_shrink(struct kmem_cache *s) | 3433 | int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) |
3361 | { | 3434 | { |
3362 | int node; | 3435 | int node; |
3363 | int i; | 3436 | int i; |
3364 | struct kmem_cache_node *n; | 3437 | struct kmem_cache_node *n; |
3365 | struct page *page; | 3438 | struct page *page; |
3366 | struct page *t; | 3439 | struct page *t; |
3367 | int objects = oo_objects(s->max); | 3440 | struct list_head discard; |
3368 | struct list_head *slabs_by_inuse = | 3441 | struct list_head promote[SHRINK_PROMOTE_MAX]; |
3369 | kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); | ||
3370 | unsigned long flags; | 3442 | unsigned long flags; |
3443 | int ret = 0; | ||
3371 | 3444 | ||
3372 | if (!slabs_by_inuse) | 3445 | if (deactivate) { |
3373 | return -ENOMEM; | 3446 | /* |
3447 | * Disable empty slabs caching. Used to avoid pinning offline | ||
3448 | * memory cgroups by kmem pages that can be freed. | ||
3449 | */ | ||
3450 | s->cpu_partial = 0; | ||
3451 | s->min_partial = 0; | ||
3452 | |||
3453 | /* | ||
3454 | * s->cpu_partial is checked locklessly (see put_cpu_partial), | ||
3455 | * so we have to make sure the change is visible. | ||
3456 | */ | ||
3457 | kick_all_cpus_sync(); | ||
3458 | } | ||
3374 | 3459 | ||
3375 | flush_all(s); | 3460 | flush_all(s); |
3376 | for_each_kmem_cache_node(s, node, n) { | 3461 | for_each_kmem_cache_node(s, node, n) { |
3377 | if (!n->nr_partial) | 3462 | INIT_LIST_HEAD(&discard); |
3378 | continue; | 3463 | for (i = 0; i < SHRINK_PROMOTE_MAX; i++) |
3379 | 3464 | INIT_LIST_HEAD(promote + i); | |
3380 | for (i = 0; i < objects; i++) | ||
3381 | INIT_LIST_HEAD(slabs_by_inuse + i); | ||
3382 | 3465 | ||
3383 | spin_lock_irqsave(&n->list_lock, flags); | 3466 | spin_lock_irqsave(&n->list_lock, flags); |
3384 | 3467 | ||
3385 | /* | 3468 | /* |
3386 | * Build lists indexed by the items in use in each slab. | 3469 | * Build lists of slabs to discard or promote. |
3387 | * | 3470 | * |
3388 | * Note that concurrent frees may occur while we hold the | 3471 | * Note that concurrent frees may occur while we hold the |
3389 | * list_lock. page->inuse here is the upper limit. | 3472 | * list_lock. page->inuse here is the upper limit. |
3390 | */ | 3473 | */ |
3391 | list_for_each_entry_safe(page, t, &n->partial, lru) { | 3474 | list_for_each_entry_safe(page, t, &n->partial, lru) { |
3392 | list_move(&page->lru, slabs_by_inuse + page->inuse); | 3475 | int free = page->objects - page->inuse; |
3393 | if (!page->inuse) | 3476 | |
3477 | /* Do not reread page->inuse */ | ||
3478 | barrier(); | ||
3479 | |||
3480 | /* We do not keep full slabs on the list */ | ||
3481 | BUG_ON(free <= 0); | ||
3482 | |||
3483 | if (free == page->objects) { | ||
3484 | list_move(&page->lru, &discard); | ||
3394 | n->nr_partial--; | 3485 | n->nr_partial--; |
3486 | } else if (free <= SHRINK_PROMOTE_MAX) | ||
3487 | list_move(&page->lru, promote + free - 1); | ||
3395 | } | 3488 | } |
3396 | 3489 | ||
3397 | /* | 3490 | /* |
3398 | * Rebuild the partial list with the slabs filled up most | 3491 | * Promote the slabs filled up most to the head of the |
3399 | * first and the least used slabs at the end. | 3492 | * partial list. |
3400 | */ | 3493 | */ |
3401 | for (i = objects - 1; i > 0; i--) | 3494 | for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) |
3402 | list_splice(slabs_by_inuse + i, n->partial.prev); | 3495 | list_splice(promote + i, &n->partial); |
3403 | 3496 | ||
3404 | spin_unlock_irqrestore(&n->list_lock, flags); | 3497 | spin_unlock_irqrestore(&n->list_lock, flags); |
3405 | 3498 | ||
3406 | /* Release empty slabs */ | 3499 | /* Release empty slabs */ |
3407 | list_for_each_entry_safe(page, t, slabs_by_inuse, lru) | 3500 | list_for_each_entry_safe(page, t, &discard, lru) |
3408 | discard_slab(s, page); | 3501 | discard_slab(s, page); |
3502 | |||
3503 | if (slabs_node(s, node)) | ||
3504 | ret = 1; | ||
3409 | } | 3505 | } |
3410 | 3506 | ||
3411 | kfree(slabs_by_inuse); | 3507 | return ret; |
3412 | return 0; | ||
3413 | } | 3508 | } |
3414 | 3509 | ||
3415 | static int slab_mem_going_offline_callback(void *arg) | 3510 | static int slab_mem_going_offline_callback(void *arg) |
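To make the new bucketing in __kmem_cache_shrink() concrete: a partial slab with page->objects == 32 and page->inuse == 29 has free == 3 and is moved to promote[2]; a slab whose objects are all free (free == page->objects) goes onto the local discard list instead. Splicing promote[SHRINK_PROMOTE_MAX - 1] down to promote[0] onto the head of n->partial leaves the slabs with the fewest free objects (the fullest) at the front, so new allocations fill those up first, while emptier slabs drift toward the tail where their remaining objects have the best chance of being freed.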
@@ -3418,7 +3513,7 @@ static int slab_mem_going_offline_callback(void *arg) | |||
3418 | 3513 | ||
3419 | mutex_lock(&slab_mutex); | 3514 | mutex_lock(&slab_mutex); |
3420 | list_for_each_entry(s, &slab_caches, list) | 3515 | list_for_each_entry(s, &slab_caches, list) |
3421 | __kmem_cache_shrink(s); | 3516 | __kmem_cache_shrink(s, false); |
3422 | mutex_unlock(&slab_mutex); | 3517 | mutex_unlock(&slab_mutex); |
3423 | 3518 | ||
3424 | return 0; | 3519 | return 0; |
@@ -3566,6 +3661,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) | |||
3566 | p->slab_cache = s; | 3661 | p->slab_cache = s; |
3567 | #endif | 3662 | #endif |
3568 | } | 3663 | } |
3664 | slab_init_memcg_params(s); | ||
3569 | list_add(&s->list, &slab_caches); | 3665 | list_add(&s->list, &slab_caches); |
3570 | return s; | 3666 | return s; |
3571 | } | 3667 | } |
@@ -3624,13 +3720,10 @@ struct kmem_cache * | |||
3624 | __kmem_cache_alias(const char *name, size_t size, size_t align, | 3720 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
3625 | unsigned long flags, void (*ctor)(void *)) | 3721 | unsigned long flags, void (*ctor)(void *)) |
3626 | { | 3722 | { |
3627 | struct kmem_cache *s; | 3723 | struct kmem_cache *s, *c; |
3628 | 3724 | ||
3629 | s = find_mergeable(size, align, flags, name, ctor); | 3725 | s = find_mergeable(size, align, flags, name, ctor); |
3630 | if (s) { | 3726 | if (s) { |
3631 | int i; | ||
3632 | struct kmem_cache *c; | ||
3633 | |||
3634 | s->refcount++; | 3727 | s->refcount++; |
3635 | 3728 | ||
3636 | /* | 3729 | /* |
@@ -3640,10 +3733,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, | |||
3640 | s->object_size = max(s->object_size, (int)size); | 3733 | s->object_size = max(s->object_size, (int)size); |
3641 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 3734 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
3642 | 3735 | ||
3643 | for_each_memcg_cache_index(i) { | 3736 | for_each_memcg_cache(c, s) { |
3644 | c = cache_from_memcg_idx(s, i); | ||
3645 | if (!c) | ||
3646 | continue; | ||
3647 | c->object_size = s->object_size; | 3737 | c->object_size = s->object_size; |
3648 | c->inuse = max_t(int, c->inuse, | 3738 | c->inuse = max_t(int, c->inuse, |
3649 | ALIGN(size, sizeof(void *))); | 3739 | ALIGN(size, sizeof(void *))); |
@@ -4070,20 +4160,16 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
4070 | 4160 | ||
4071 | if (num_online_cpus() > 1 && | 4161 | if (num_online_cpus() > 1 && |
4072 | !cpumask_empty(to_cpumask(l->cpus)) && | 4162 | !cpumask_empty(to_cpumask(l->cpus)) && |
4073 | len < PAGE_SIZE - 60) { | 4163 | len < PAGE_SIZE - 60) |
4074 | len += sprintf(buf + len, " cpus="); | 4164 | len += scnprintf(buf + len, PAGE_SIZE - len - 50, |
4075 | len += cpulist_scnprintf(buf + len, | 4165 | " cpus=%*pbl", |
4076 | PAGE_SIZE - len - 50, | 4166 | cpumask_pr_args(to_cpumask(l->cpus))); |
4077 | to_cpumask(l->cpus)); | ||
4078 | } | ||
4079 | 4167 | ||
4080 | if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && | 4168 | if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && |
4081 | len < PAGE_SIZE - 60) { | 4169 | len < PAGE_SIZE - 60) |
4082 | len += sprintf(buf + len, " nodes="); | 4170 | len += scnprintf(buf + len, PAGE_SIZE - len - 50, |
4083 | len += nodelist_scnprintf(buf + len, | 4171 | " nodes=%*pbl", |
4084 | PAGE_SIZE - len - 50, | 4172 | nodemask_pr_args(&l->nodes)); |
4085 | l->nodes); | ||
4086 | } | ||
4087 | 4173 | ||
4088 | len += sprintf(buf + len, "\n"); | 4174 | len += sprintf(buf + len, "\n"); |
4089 | } | 4175 | } |
@@ -4680,12 +4766,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf) | |||
4680 | static ssize_t shrink_store(struct kmem_cache *s, | 4766 | static ssize_t shrink_store(struct kmem_cache *s, |
4681 | const char *buf, size_t length) | 4767 | const char *buf, size_t length) |
4682 | { | 4768 | { |
4683 | if (buf[0] == '1') { | 4769 | if (buf[0] == '1') |
4684 | int rc = kmem_cache_shrink(s); | 4770 | kmem_cache_shrink(s); |
4685 | 4771 | else | |
4686 | if (rc) | ||
4687 | return rc; | ||
4688 | } else | ||
4689 | return -EINVAL; | 4772 | return -EINVAL; |
4690 | return length; | 4773 | return length; |
4691 | } | 4774 | } |
@@ -4909,7 +4992,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, | |||
4909 | err = attribute->store(s, buf, len); | 4992 | err = attribute->store(s, buf, len); |
4910 | #ifdef CONFIG_MEMCG_KMEM | 4993 | #ifdef CONFIG_MEMCG_KMEM |
4911 | if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { | 4994 | if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { |
4912 | int i; | 4995 | struct kmem_cache *c; |
4913 | 4996 | ||
4914 | mutex_lock(&slab_mutex); | 4997 | mutex_lock(&slab_mutex); |
4915 | if (s->max_attr_size < len) | 4998 | if (s->max_attr_size < len) |
@@ -4932,11 +5015,8 @@ static ssize_t slab_attr_store(struct kobject *kobj, | |||
4932 | * directly either failed or succeeded, in which case we loop | 5015 | * directly either failed or succeeded, in which case we loop |
4933 | * through the descendants with best-effort propagation. | 5016 | * through the descendants with best-effort propagation. |
4934 | */ | 5017 | */ |
4935 | for_each_memcg_cache_index(i) { | 5018 | for_each_memcg_cache(c, s) |
4936 | struct kmem_cache *c = cache_from_memcg_idx(s, i); | 5019 | attribute->store(c, buf, len); |
4937 | if (c) | ||
4938 | attribute->store(c, buf, len); | ||
4939 | } | ||
4940 | mutex_unlock(&slab_mutex); | 5020 | mutex_unlock(&slab_mutex); |
4941 | } | 5021 | } |
4942 | #endif | 5022 | #endif |
@@ -4953,7 +5033,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) | |||
4953 | if (is_root_cache(s)) | 5033 | if (is_root_cache(s)) |
4954 | return; | 5034 | return; |
4955 | 5035 | ||
4956 | root_cache = s->memcg_params->root_cache; | 5036 | root_cache = s->memcg_params.root_cache; |
4957 | 5037 | ||
4958 | /* | 5038 | /* |
4959 | * This means this cache had no attribute written. Therefore, no point | 5039 | * This means this cache had no attribute written. Therefore, no point |
@@ -5033,7 +5113,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s) | |||
5033 | { | 5113 | { |
5034 | #ifdef CONFIG_MEMCG_KMEM | 5114 | #ifdef CONFIG_MEMCG_KMEM |
5035 | if (!is_root_cache(s)) | 5115 | if (!is_root_cache(s)) |
5036 | return s->memcg_params->root_cache->memcg_kset; | 5116 | return s->memcg_params.root_cache->memcg_kset; |
5037 | #endif | 5117 | #endif |
5038 | return slab_kset; | 5118 | return slab_kset; |
5039 | } | 5119 | } |
@@ -1138,12 +1138,8 @@ void __init swap_setup(void) | |||
1138 | #ifdef CONFIG_SWAP | 1138 | #ifdef CONFIG_SWAP |
1139 | int i; | 1139 | int i; |
1140 | 1140 | ||
1141 | if (bdi_init(swapper_spaces[0].backing_dev_info)) | 1141 | for (i = 0; i < MAX_SWAPFILES; i++) |
1142 | panic("Failed to init swap bdi"); | ||
1143 | for (i = 0; i < MAX_SWAPFILES; i++) { | ||
1144 | spin_lock_init(&swapper_spaces[i].tree_lock); | 1142 | spin_lock_init(&swapper_spaces[i].tree_lock); |
1145 | INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); | ||
1146 | } | ||
1147 | #endif | 1143 | #endif |
1148 | 1144 | ||
1149 | /* Use a smaller cluster for small-memory machines */ | 1145 | /* Use a smaller cluster for small-memory machines */ |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 9711342987a0..405923f77334 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -32,17 +32,11 @@ static const struct address_space_operations swap_aops = { | |||
32 | #endif | 32 | #endif |
33 | }; | 33 | }; |
34 | 34 | ||
35 | static struct backing_dev_info swap_backing_dev_info = { | ||
36 | .name = "swap", | ||
37 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | ||
38 | }; | ||
39 | |||
40 | struct address_space swapper_spaces[MAX_SWAPFILES] = { | 35 | struct address_space swapper_spaces[MAX_SWAPFILES] = { |
41 | [0 ... MAX_SWAPFILES - 1] = { | 36 | [0 ... MAX_SWAPFILES - 1] = { |
42 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), | 37 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), |
43 | .i_mmap_writable = ATOMIC_INIT(0), | 38 | .i_mmap_writable = ATOMIC_INIT(0), |
44 | .a_ops = &swap_aops, | 39 | .a_ops = &swap_aops, |
45 | .backing_dev_info = &swap_backing_dev_info, | ||
46 | } | 40 | } |
47 | }; | 41 | }; |
48 | 42 | ||
diff --git a/mm/truncate.c b/mm/truncate.c index f1e4d6052369..ddec5a5966d7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -112,7 +112,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size) | |||
112 | struct address_space *mapping = page->mapping; | 112 | struct address_space *mapping = page->mapping; |
113 | if (mapping && mapping_cap_account_dirty(mapping)) { | 113 | if (mapping && mapping_cap_account_dirty(mapping)) { |
114 | dec_zone_page_state(page, NR_FILE_DIRTY); | 114 | dec_zone_page_state(page, NR_FILE_DIRTY); |
115 | dec_bdi_stat(mapping->backing_dev_info, | 115 | dec_bdi_stat(inode_to_bdi(mapping->host), |
116 | BDI_RECLAIMABLE); | 116 | BDI_RECLAIMABLE); |
117 | if (account_size) | 117 | if (account_size) |
118 | task_io_account_cancelled_write(account_size); | 118 | task_io_account_cancelled_write(account_size); |
@@ -12,10 +12,30 @@ | |||
12 | #include <linux/hugetlb.h> | 12 | #include <linux/hugetlb.h> |
13 | #include <linux/vmalloc.h> | 13 | #include <linux/vmalloc.h> |
14 | 14 | ||
15 | #include <asm/sections.h> | ||
15 | #include <asm/uaccess.h> | 16 | #include <asm/uaccess.h> |
16 | 17 | ||
17 | #include "internal.h" | 18 | #include "internal.h" |
18 | 19 | ||
20 | static inline int is_kernel_rodata(unsigned long addr) | ||
21 | { | ||
22 | return addr >= (unsigned long)__start_rodata && | ||
23 | addr < (unsigned long)__end_rodata; | ||
24 | } | ||
25 | |||
26 | /** | ||
27 | * kfree_const - conditionally free memory | ||
28 | * @x: pointer to the memory | ||
29 | * | ||
30 | * Function calls kfree only if @x is not in .rodata section. | ||
31 | */ | ||
32 | void kfree_const(const void *x) | ||
33 | { | ||
34 | if (!is_kernel_rodata((unsigned long)x)) | ||
35 | kfree(x); | ||
36 | } | ||
37 | EXPORT_SYMBOL(kfree_const); | ||
38 | |||
19 | /** | 39 | /** |
20 | * kstrdup - allocate space for and copy an existing string | 40 | * kstrdup - allocate space for and copy an existing string |
21 | * @s: the string to duplicate | 41 | * @s: the string to duplicate |
@@ -38,6 +58,24 @@ char *kstrdup(const char *s, gfp_t gfp) | |||
38 | EXPORT_SYMBOL(kstrdup); | 58 | EXPORT_SYMBOL(kstrdup); |
39 | 59 | ||
40 | /** | 60 | /** |
61 | * kstrdup_const - conditionally duplicate an existing const string | ||
62 | * @s: the string to duplicate | ||
63 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory | ||
64 | * | ||
65 | * Function returns the source string if it is in the .rodata section; otherwise it | ||
66 | * falls back to kstrdup. | ||
67 | * Strings allocated by kstrdup_const should be freed by kfree_const. | ||
68 | */ | ||
69 | const char *kstrdup_const(const char *s, gfp_t gfp) | ||
70 | { | ||
71 | if (is_kernel_rodata((unsigned long)s)) | ||
72 | return s; | ||
73 | |||
74 | return kstrdup(s, gfp); | ||
75 | } | ||
76 | EXPORT_SYMBOL(kstrdup_const); | ||
77 | |||
78 | /** | ||
41 | * kstrndup - allocate space for and copy an existing string | 79 | * kstrndup - allocate space for and copy an existing string |
42 | * @s: the string to duplicate | 80 | * @s: the string to duplicate |
43 | * @max: read at most @max chars from @s | 81 | * @max: read at most @max chars from @s |
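Typical use of the new kstrdup_const()/kfree_const() pair is for object names that usually point at string literals; only strings outside .rodata are duplicated, and a single release path stays correct either way. A sketch with a hypothetical structure:

	struct widget {
		const char *name;
	};

	static int widget_set_name(struct widget *w, const char *name)
	{
		w->name = kstrdup_const(name, GFP_KERNEL);	/* no copy for .rodata literals */
		return w->name ? 0 : -ENOMEM;
	}

	static void widget_release(struct widget *w)
	{
		kfree_const(w->name);	/* kfree() only if the string was actually duplicated */
	}

Name handling of the kobject/kernfs kind, where the vast majority of names are compile-time constants, is the sort of caller this is aimed at.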
@@ -240,14 +278,8 @@ int __weak get_user_pages_fast(unsigned long start, | |||
240 | int nr_pages, int write, struct page **pages) | 278 | int nr_pages, int write, struct page **pages) |
241 | { | 279 | { |
242 | struct mm_struct *mm = current->mm; | 280 | struct mm_struct *mm = current->mm; |
243 | int ret; | 281 | return get_user_pages_unlocked(current, mm, start, nr_pages, |
244 | 282 | write, 0, pages); | |
245 | down_read(&mm->mmap_sem); | ||
246 | ret = get_user_pages(current, mm, start, nr_pages, | ||
247 | write, 0, pages, NULL); | ||
248 | up_read(&mm->mmap_sem); | ||
249 | |||
250 | return ret; | ||
251 | } | 283 | } |
252 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | 284 | EXPORT_SYMBOL_GPL(get_user_pages_fast); |
253 | 285 | ||
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 39c338896416..35b25e1340ca 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1324,10 +1324,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
1324 | if (unlikely(!area)) | 1324 | if (unlikely(!area)) |
1325 | return NULL; | 1325 | return NULL; |
1326 | 1326 | ||
1327 | /* | 1327 | if (!(flags & VM_NO_GUARD)) |
1328 | * We always allocate a guard page. | 1328 | size += PAGE_SIZE; |
1329 | */ | ||
1330 | size += PAGE_SIZE; | ||
1331 | 1329 | ||
1332 | va = alloc_vmap_area(size, align, start, end, node, gfp_mask); | 1330 | va = alloc_vmap_area(size, align, start, end, node, gfp_mask); |
1333 | if (IS_ERR(va)) { | 1331 | if (IS_ERR(va)) { |
@@ -1621,6 +1619,7 @@ fail: | |||
1621 | * @end: vm area range end | 1619 | * @end: vm area range end |
1622 | * @gfp_mask: flags for the page level allocator | 1620 | * @gfp_mask: flags for the page level allocator |
1623 | * @prot: protection mask for the allocated pages | 1621 | * @prot: protection mask for the allocated pages |
1622 | * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD) | ||
1624 | * @node: node to use for allocation or NUMA_NO_NODE | 1623 | * @node: node to use for allocation or NUMA_NO_NODE |
1625 | * @caller: caller's return address | 1624 | * @caller: caller's return address |
1626 | * | 1625 | * |
@@ -1630,7 +1629,8 @@ fail: | |||
1630 | */ | 1629 | */ |
1631 | void *__vmalloc_node_range(unsigned long size, unsigned long align, | 1630 | void *__vmalloc_node_range(unsigned long size, unsigned long align, |
1632 | unsigned long start, unsigned long end, gfp_t gfp_mask, | 1631 | unsigned long start, unsigned long end, gfp_t gfp_mask, |
1633 | pgprot_t prot, int node, const void *caller) | 1632 | pgprot_t prot, unsigned long vm_flags, int node, |
1633 | const void *caller) | ||
1634 | { | 1634 | { |
1635 | struct vm_struct *area; | 1635 | struct vm_struct *area; |
1636 | void *addr; | 1636 | void *addr; |
@@ -1640,8 +1640,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, | |||
1640 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) | 1640 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
1641 | goto fail; | 1641 | goto fail; |
1642 | 1642 | ||
1643 | area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED, | 1643 | area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | |
1644 | start, end, node, gfp_mask, caller); | 1644 | vm_flags, start, end, node, gfp_mask, caller); |
1645 | if (!area) | 1645 | if (!area) |
1646 | goto fail; | 1646 | goto fail; |
1647 | 1647 | ||
@@ -1690,7 +1690,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, | |||
1690 | int node, const void *caller) | 1690 | int node, const void *caller) |
1691 | { | 1691 | { |
1692 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, | 1692 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, |
1693 | gfp_mask, prot, node, caller); | 1693 | gfp_mask, prot, 0, node, caller); |
1694 | } | 1694 | } |
1695 | 1695 | ||
1696 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 1696 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
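The extra vm_flags argument lets a caller suppress the guard page that __get_vm_area_node() would otherwise append. An illustrative call with placeholder size and alignment (the motivating user in this series is the KASAN shadow mapping for module space):

	void *p = __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
				       GFP_KERNEL, PAGE_KERNEL, VM_NO_GUARD,
				       NUMA_NO_NODE,
				       __builtin_return_address(0));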
diff --git a/mm/vmscan.c b/mm/vmscan.c index bd9a72bc4a1b..5e8eadd71bac 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -91,6 +91,9 @@ struct scan_control { | |||
91 | /* Can pages be swapped as part of reclaim? */ | 91 | /* Can pages be swapped as part of reclaim? */ |
92 | unsigned int may_swap:1; | 92 | unsigned int may_swap:1; |
93 | 93 | ||
94 | /* Can cgroups be reclaimed below their normal consumption range? */ | ||
95 | unsigned int may_thrash:1; | ||
96 | |||
94 | unsigned int hibernation_mode:1; | 97 | unsigned int hibernation_mode:1; |
95 | 98 | ||
96 | /* One of the zones is ready for compaction */ | 99 | /* One of the zones is ready for compaction */ |
@@ -229,10 +232,10 @@ EXPORT_SYMBOL(unregister_shrinker); | |||
229 | 232 | ||
230 | #define SHRINK_BATCH 128 | 233 | #define SHRINK_BATCH 128 |
231 | 234 | ||
232 | static unsigned long shrink_slabs(struct shrink_control *shrinkctl, | 235 | static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, |
233 | struct shrinker *shrinker, | 236 | struct shrinker *shrinker, |
234 | unsigned long nr_scanned, | 237 | unsigned long nr_scanned, |
235 | unsigned long nr_eligible) | 238 | unsigned long nr_eligible) |
236 | { | 239 | { |
237 | unsigned long freed = 0; | 240 | unsigned long freed = 0; |
238 | unsigned long long delta; | 241 | unsigned long long delta; |
@@ -341,9 +344,10 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, | |||
341 | } | 344 | } |
342 | 345 | ||
343 | /** | 346 | /** |
344 | * shrink_node_slabs - shrink slab caches of a given node | 347 | * shrink_slab - shrink slab caches |
345 | * @gfp_mask: allocation context | 348 | * @gfp_mask: allocation context |
346 | * @nid: node whose slab caches to target | 349 | * @nid: node whose slab caches to target |
350 | * @memcg: memory cgroup whose slab caches to target | ||
347 | * @nr_scanned: pressure numerator | 351 | * @nr_scanned: pressure numerator |
348 | * @nr_eligible: pressure denominator | 352 | * @nr_eligible: pressure denominator |
349 | * | 353 | * |
@@ -352,6 +356,12 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, | |||
352 | * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, | 356 | * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, |
353 | * unaware shrinkers will receive a node id of 0 instead. | 357 | * unaware shrinkers will receive a node id of 0 instead. |
354 | * | 358 | * |
359 | * @memcg specifies the memory cgroup to target. If it is not NULL, | ||
360 | * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan | ||
361 | * objects from the memory cgroup specified. Otherwise all shrinkers | ||
362 | * are called, and memcg aware shrinkers are supposed to scan the | ||
363 | * global list then. | ||
364 | * | ||
355 | * @nr_scanned and @nr_eligible form a ratio that indicate how much of | 365 | * @nr_scanned and @nr_eligible form a ratio that indicate how much of |
356 | * the available objects should be scanned. Page reclaim for example | 366 | * the available objects should be scanned. Page reclaim for example |
357 | * passes the number of pages scanned and the number of pages on the | 367 | * passes the number of pages scanned and the number of pages on the |
@@ -362,13 +372,17 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl, | |||
362 | * | 372 | * |
363 | * Returns the number of reclaimed slab objects. | 373 | * Returns the number of reclaimed slab objects. |
364 | */ | 374 | */ |
365 | unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, | 375 | static unsigned long shrink_slab(gfp_t gfp_mask, int nid, |
366 | unsigned long nr_scanned, | 376 | struct mem_cgroup *memcg, |
367 | unsigned long nr_eligible) | 377 | unsigned long nr_scanned, |
378 | unsigned long nr_eligible) | ||
368 | { | 379 | { |
369 | struct shrinker *shrinker; | 380 | struct shrinker *shrinker; |
370 | unsigned long freed = 0; | 381 | unsigned long freed = 0; |
371 | 382 | ||
383 | if (memcg && !memcg_kmem_is_active(memcg)) | ||
384 | return 0; | ||
385 | |||
372 | if (nr_scanned == 0) | 386 | if (nr_scanned == 0) |
373 | nr_scanned = SWAP_CLUSTER_MAX; | 387 | nr_scanned = SWAP_CLUSTER_MAX; |
374 | 388 | ||
@@ -387,12 +401,16 @@ unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, | |||
387 | struct shrink_control sc = { | 401 | struct shrink_control sc = { |
388 | .gfp_mask = gfp_mask, | 402 | .gfp_mask = gfp_mask, |
389 | .nid = nid, | 403 | .nid = nid, |
404 | .memcg = memcg, | ||
390 | }; | 405 | }; |
391 | 406 | ||
407 | if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) | ||
408 | continue; | ||
409 | |||
392 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) | 410 | if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) |
393 | sc.nid = 0; | 411 | sc.nid = 0; |
394 | 412 | ||
395 | freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); | 413 | freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); |
396 | } | 414 | } |
397 | 415 | ||
398 | up_read(&shrinker_rwsem); | 416 | up_read(&shrinker_rwsem); |
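A shrinker that wants to take part in the new per-memcg pass opts in with SHRINKER_MEMCG_AWARE and keeps its objects on a memcg-aware list_lru; shrink_slab() then hands it sc->memcg, and the list_lru helpers pick the matching per-memcg list. A sketch with hypothetical names (the lru would be set up with list_lru_init_memcg() at init time):

	static struct list_lru demo_lru;

	static unsigned long demo_count(struct shrinker *shrink,
					struct shrink_control *sc)
	{
		return list_lru_shrink_count(&demo_lru, sc);
	}

	static unsigned long demo_scan(struct shrinker *shrink,
				       struct shrink_control *sc)
	{
		/* demo_isolate(): see the callback sketch after the workingset.c hunks */
		return list_lru_shrink_walk(&demo_lru, sc, demo_isolate, NULL);
	}

	static struct shrinker demo_shrinker = {
		.count_objects	= demo_count,
		.scan_objects	= demo_scan,
		.seeks		= DEFAULT_SEEKS,
		.flags		= SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
	};

After register_shrinker(&demo_shrinker), global reclaim behaves as before, and the shrinker is additionally invoked for each memory cgroup that shrink_zone() visits on the class zone.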
@@ -401,6 +419,29 @@ out: | |||
401 | return freed; | 419 | return freed; |
402 | } | 420 | } |
403 | 421 | ||
422 | void drop_slab_node(int nid) | ||
423 | { | ||
424 | unsigned long freed; | ||
425 | |||
426 | do { | ||
427 | struct mem_cgroup *memcg = NULL; | ||
428 | |||
429 | freed = 0; | ||
430 | do { | ||
431 | freed += shrink_slab(GFP_KERNEL, nid, memcg, | ||
432 | 1000, 1000); | ||
433 | } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); | ||
434 | } while (freed > 10); | ||
435 | } | ||
436 | |||
437 | void drop_slab(void) | ||
438 | { | ||
439 | int nid; | ||
440 | |||
441 | for_each_online_node(nid) | ||
442 | drop_slab_node(nid); | ||
443 | } | ||
444 | |||
404 | static inline int is_page_cache_freeable(struct page *page) | 445 | static inline int is_page_cache_freeable(struct page *page) |
405 | { | 446 | { |
406 | /* | 447 | /* |
@@ -497,7 +538,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
497 | } | 538 | } |
498 | if (mapping->a_ops->writepage == NULL) | 539 | if (mapping->a_ops->writepage == NULL) |
499 | return PAGE_ACTIVATE; | 540 | return PAGE_ACTIVATE; |
500 | if (!may_write_to_queue(mapping->backing_dev_info, sc)) | 541 | if (!may_write_to_queue(inode_to_bdi(mapping->host), sc)) |
501 | return PAGE_KEEP; | 542 | return PAGE_KEEP; |
502 | 543 | ||
503 | if (clear_page_dirty_for_io(page)) { | 544 | if (clear_page_dirty_for_io(page)) { |
@@ -876,7 +917,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
876 | */ | 917 | */ |
877 | mapping = page_mapping(page); | 918 | mapping = page_mapping(page); |
878 | if (((dirty || writeback) && mapping && | 919 | if (((dirty || writeback) && mapping && |
879 | bdi_write_congested(mapping->backing_dev_info)) || | 920 | bdi_write_congested(inode_to_bdi(mapping->host))) || |
880 | (writeback && PageReclaim(page))) | 921 | (writeback && PageReclaim(page))) |
881 | nr_congested++; | 922 | nr_congested++; |
882 | 923 | ||
@@ -1903,8 +1944,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness, | |||
1903 | * latencies, so it's better to scan a minimum amount there as | 1944 | * latencies, so it's better to scan a minimum amount there as |
1904 | * well. | 1945 | * well. |
1905 | */ | 1946 | */ |
1906 | if (current_is_kswapd() && !zone_reclaimable(zone)) | 1947 | if (current_is_kswapd()) { |
1907 | force_scan = true; | 1948 | if (!zone_reclaimable(zone)) |
1949 | force_scan = true; | ||
1950 | if (!mem_cgroup_lruvec_online(lruvec)) | ||
1951 | force_scan = true; | ||
1952 | } | ||
1908 | if (!global_reclaim(sc)) | 1953 | if (!global_reclaim(sc)) |
1909 | force_scan = true; | 1954 | force_scan = true; |
1910 | 1955 | ||
@@ -2269,6 +2314,7 @@ static inline bool should_continue_reclaim(struct zone *zone, | |||
2269 | static bool shrink_zone(struct zone *zone, struct scan_control *sc, | 2314 | static bool shrink_zone(struct zone *zone, struct scan_control *sc, |
2270 | bool is_classzone) | 2315 | bool is_classzone) |
2271 | { | 2316 | { |
2317 | struct reclaim_state *reclaim_state = current->reclaim_state; | ||
2272 | unsigned long nr_reclaimed, nr_scanned; | 2318 | unsigned long nr_reclaimed, nr_scanned; |
2273 | bool reclaimable = false; | 2319 | bool reclaimable = false; |
2274 | 2320 | ||
@@ -2287,15 +2333,28 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
2287 | memcg = mem_cgroup_iter(root, NULL, &reclaim); | 2333 | memcg = mem_cgroup_iter(root, NULL, &reclaim); |
2288 | do { | 2334 | do { |
2289 | unsigned long lru_pages; | 2335 | unsigned long lru_pages; |
2336 | unsigned long scanned; | ||
2290 | struct lruvec *lruvec; | 2337 | struct lruvec *lruvec; |
2291 | int swappiness; | 2338 | int swappiness; |
2292 | 2339 | ||
2340 | if (mem_cgroup_low(root, memcg)) { | ||
2341 | if (!sc->may_thrash) | ||
2342 | continue; | ||
2343 | mem_cgroup_events(memcg, MEMCG_LOW, 1); | ||
2344 | } | ||
2345 | |||
2293 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 2346 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
2294 | swappiness = mem_cgroup_swappiness(memcg); | 2347 | swappiness = mem_cgroup_swappiness(memcg); |
2348 | scanned = sc->nr_scanned; | ||
2295 | 2349 | ||
2296 | shrink_lruvec(lruvec, swappiness, sc, &lru_pages); | 2350 | shrink_lruvec(lruvec, swappiness, sc, &lru_pages); |
2297 | zone_lru_pages += lru_pages; | 2351 | zone_lru_pages += lru_pages; |
2298 | 2352 | ||
2353 | if (memcg && is_classzone) | ||
2354 | shrink_slab(sc->gfp_mask, zone_to_nid(zone), | ||
2355 | memcg, sc->nr_scanned - scanned, | ||
2356 | lru_pages); | ||
2357 | |||
2299 | /* | 2358 | /* |
2300 | * Direct reclaim and kswapd have to scan all memory | 2359 | * Direct reclaim and kswapd have to scan all memory |
2301 | * cgroups to fulfill the overall scan target for the | 2360 | * cgroups to fulfill the overall scan target for the |
@@ -2311,26 +2370,20 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, | |||
2311 | mem_cgroup_iter_break(root, memcg); | 2370 | mem_cgroup_iter_break(root, memcg); |
2312 | break; | 2371 | break; |
2313 | } | 2372 | } |
2314 | memcg = mem_cgroup_iter(root, memcg, &reclaim); | 2373 | } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); |
2315 | } while (memcg); | ||
2316 | 2374 | ||
2317 | /* | 2375 | /* |
2318 | * Shrink the slab caches in the same proportion that | 2376 | * Shrink the slab caches in the same proportion that |
2319 | * the eligible LRU pages were scanned. | 2377 | * the eligible LRU pages were scanned. |
2320 | */ | 2378 | */ |
2321 | if (global_reclaim(sc) && is_classzone) { | 2379 | if (global_reclaim(sc) && is_classzone) |
2322 | struct reclaim_state *reclaim_state; | 2380 | shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, |
2323 | 2381 | sc->nr_scanned - nr_scanned, | |
2324 | shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), | 2382 | zone_lru_pages); |
2325 | sc->nr_scanned - nr_scanned, | 2383 | |
2326 | zone_lru_pages); | 2384 | if (reclaim_state) { |
2327 | 2385 | sc->nr_reclaimed += reclaim_state->reclaimed_slab; | |
2328 | reclaim_state = current->reclaim_state; | 2386 | reclaim_state->reclaimed_slab = 0; |
2329 | if (reclaim_state) { | ||
2330 | sc->nr_reclaimed += | ||
2331 | reclaim_state->reclaimed_slab; | ||
2332 | reclaim_state->reclaimed_slab = 0; | ||
2333 | } | ||
2334 | } | 2387 | } |
2335 | 2388 | ||
2336 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, | 2389 | vmpressure(sc->gfp_mask, sc->target_mem_cgroup, |
@@ -2515,10 +2568,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2515 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | 2568 | static unsigned long do_try_to_free_pages(struct zonelist *zonelist, |
2516 | struct scan_control *sc) | 2569 | struct scan_control *sc) |
2517 | { | 2570 | { |
2571 | int initial_priority = sc->priority; | ||
2518 | unsigned long total_scanned = 0; | 2572 | unsigned long total_scanned = 0; |
2519 | unsigned long writeback_threshold; | 2573 | unsigned long writeback_threshold; |
2520 | bool zones_reclaimable; | 2574 | bool zones_reclaimable; |
2521 | 2575 | retry: | |
2522 | delayacct_freepages_start(); | 2576 | delayacct_freepages_start(); |
2523 | 2577 | ||
2524 | if (global_reclaim(sc)) | 2578 | if (global_reclaim(sc)) |
@@ -2568,6 +2622,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2568 | if (sc->compaction_ready) | 2622 | if (sc->compaction_ready) |
2569 | return 1; | 2623 | return 1; |
2570 | 2624 | ||
2625 | /* Untapped cgroup reserves? Don't OOM, retry. */ | ||
2626 | if (!sc->may_thrash) { | ||
2627 | sc->priority = initial_priority; | ||
2628 | sc->may_thrash = 1; | ||
2629 | goto retry; | ||
2630 | } | ||
2631 | |||
2571 | /* Any of the zones still reclaimable? Don't OOM. */ | 2632 | /* Any of the zones still reclaimable? Don't OOM. */ |
2572 | if (zones_reclaimable) | 2633 | if (zones_reclaimable) |
2573 | return 1; | 2634 | return 1; |
@@ -2656,7 +2717,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | |||
2656 | * should make reasonable progress. | 2717 | * should make reasonable progress. |
2657 | */ | 2718 | */ |
2658 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2719 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2659 | gfp_mask, nodemask) { | 2720 | gfp_zone(gfp_mask), nodemask) { |
2660 | if (zone_idx(zone) > ZONE_NORMAL) | 2721 | if (zone_idx(zone) > ZONE_NORMAL) |
2661 | continue; | 2722 | continue; |
2662 | 2723 | ||
@@ -2921,18 +2982,20 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | |||
2921 | return false; | 2982 | return false; |
2922 | 2983 | ||
2923 | /* | 2984 | /* |
2924 | * There is a potential race between when kswapd checks its watermarks | 2985 | * The throttled processes are normally woken up in balance_pgdat() as |
2925 | * and a process gets throttled. There is also a potential race if | 2986 | * soon as pfmemalloc_watermark_ok() is true. But there is a potential |
2926 | * processes get throttled, kswapd wakes, a large process exits therby | 2987 | * race between when kswapd checks the watermarks and a process gets |
2927 | * balancing the zones that causes kswapd to miss a wakeup. If kswapd | 2988 | * throttled. There is also a potential race if processes get |
2928 | * is going to sleep, no process should be sleeping on pfmemalloc_wait | 2989 | * throttled, kswapd wakes, a large process exits thereby balancing the |
2929 | * so wake them now if necessary. If necessary, processes will wake | 2990 | * zones, which causes kswapd to exit balance_pgdat() before reaching |
2930 | * kswapd and get throttled again | 2991 | * the wake up checks. If kswapd is going to sleep, no process should |
2992 | * be sleeping on pfmemalloc_wait, so wake them now if necessary. If | ||
2993 | * the wake up is premature, processes will wake kswapd and get | ||
2994 | * throttled again. The difference from wake ups in balance_pgdat() is | ||
2995 | * that here we are under prepare_to_wait(). | ||
2931 | */ | 2996 | */ |
2932 | if (waitqueue_active(&pgdat->pfmemalloc_wait)) { | 2997 | if (waitqueue_active(&pgdat->pfmemalloc_wait)) |
2933 | wake_up(&pgdat->pfmemalloc_wait); | 2998 | wake_up_all(&pgdat->pfmemalloc_wait); |
2934 | return false; | ||
2935 | } | ||
2936 | 2999 | ||
2937 | return pgdat_balanced(pgdat, order, classzone_idx); | 3000 | return pgdat_balanced(pgdat, order, classzone_idx); |
2938 | } | 3001 | } |
@@ -3173,7 +3236,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3173 | */ | 3236 | */ |
3174 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && | 3237 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && |
3175 | pfmemalloc_watermark_ok(pgdat)) | 3238 | pfmemalloc_watermark_ok(pgdat)) |
3176 | wake_up(&pgdat->pfmemalloc_wait); | 3239 | wake_up_all(&pgdat->pfmemalloc_wait); |
3177 | 3240 | ||
3178 | /* | 3241 | /* |
3179 | * Fragmentation may mean that the system cannot be rebalanced | 3242 | * Fragmentation may mean that the system cannot be rebalanced |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 1284f89fca08..4f5cd974e11a 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -17,6 +17,9 @@ | |||
17 | #include <linux/cpu.h> | 17 | #include <linux/cpu.h> |
18 | #include <linux/cpumask.h> | 18 | #include <linux/cpumask.h> |
19 | #include <linux/vmstat.h> | 19 | #include <linux/vmstat.h> |
20 | #include <linux/proc_fs.h> | ||
21 | #include <linux/seq_file.h> | ||
22 | #include <linux/debugfs.h> | ||
20 | #include <linux/sched.h> | 23 | #include <linux/sched.h> |
21 | #include <linux/math64.h> | 24 | #include <linux/math64.h> |
22 | #include <linux/writeback.h> | 25 | #include <linux/writeback.h> |
@@ -670,66 +673,6 @@ int fragmentation_index(struct zone *zone, unsigned int order) | |||
670 | } | 673 | } |
671 | #endif | 674 | #endif |
672 | 675 | ||
673 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION) | ||
674 | #include <linux/proc_fs.h> | ||
675 | #include <linux/seq_file.h> | ||
676 | |||
677 | static char * const migratetype_names[MIGRATE_TYPES] = { | ||
678 | "Unmovable", | ||
679 | "Reclaimable", | ||
680 | "Movable", | ||
681 | "Reserve", | ||
682 | #ifdef CONFIG_CMA | ||
683 | "CMA", | ||
684 | #endif | ||
685 | #ifdef CONFIG_MEMORY_ISOLATION | ||
686 | "Isolate", | ||
687 | #endif | ||
688 | }; | ||
689 | |||
690 | static void *frag_start(struct seq_file *m, loff_t *pos) | ||
691 | { | ||
692 | pg_data_t *pgdat; | ||
693 | loff_t node = *pos; | ||
694 | for (pgdat = first_online_pgdat(); | ||
695 | pgdat && node; | ||
696 | pgdat = next_online_pgdat(pgdat)) | ||
697 | --node; | ||
698 | |||
699 | return pgdat; | ||
700 | } | ||
701 | |||
702 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | ||
703 | { | ||
704 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
705 | |||
706 | (*pos)++; | ||
707 | return next_online_pgdat(pgdat); | ||
708 | } | ||
709 | |||
710 | static void frag_stop(struct seq_file *m, void *arg) | ||
711 | { | ||
712 | } | ||
713 | |||
714 | /* Walk all the zones in a node and print using a callback */ | ||
715 | static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | ||
716 | void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) | ||
717 | { | ||
718 | struct zone *zone; | ||
719 | struct zone *node_zones = pgdat->node_zones; | ||
720 | unsigned long flags; | ||
721 | |||
722 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
723 | if (!populated_zone(zone)) | ||
724 | continue; | ||
725 | |||
726 | spin_lock_irqsave(&zone->lock, flags); | ||
727 | print(m, pgdat, zone); | ||
728 | spin_unlock_irqrestore(&zone->lock, flags); | ||
729 | } | ||
730 | } | ||
731 | #endif | ||
732 | |||
733 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) | 676 | #if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) |
734 | #ifdef CONFIG_ZONE_DMA | 677 | #ifdef CONFIG_ZONE_DMA |
735 | #define TEXT_FOR_DMA(xx) xx "_dma", | 678 | #define TEXT_FOR_DMA(xx) xx "_dma", |
@@ -907,7 +850,66 @@ const char * const vmstat_text[] = { | |||
907 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ | 850 | #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ |
908 | 851 | ||
909 | 852 | ||
853 | #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \ | ||
854 | defined(CONFIG_PROC_FS) | ||
855 | static void *frag_start(struct seq_file *m, loff_t *pos) | ||
856 | { | ||
857 | pg_data_t *pgdat; | ||
858 | loff_t node = *pos; | ||
859 | |||
860 | for (pgdat = first_online_pgdat(); | ||
861 | pgdat && node; | ||
862 | pgdat = next_online_pgdat(pgdat)) | ||
863 | --node; | ||
864 | |||
865 | return pgdat; | ||
866 | } | ||
867 | |||
868 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | ||
869 | { | ||
870 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
871 | |||
872 | (*pos)++; | ||
873 | return next_online_pgdat(pgdat); | ||
874 | } | ||
875 | |||
876 | static void frag_stop(struct seq_file *m, void *arg) | ||
877 | { | ||
878 | } | ||
879 | |||
880 | /* Walk all the zones in a node and print using a callback */ | ||
881 | static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | ||
882 | void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) | ||
883 | { | ||
884 | struct zone *zone; | ||
885 | struct zone *node_zones = pgdat->node_zones; | ||
886 | unsigned long flags; | ||
887 | |||
888 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
889 | if (!populated_zone(zone)) | ||
890 | continue; | ||
891 | |||
892 | spin_lock_irqsave(&zone->lock, flags); | ||
893 | print(m, pgdat, zone); | ||
894 | spin_unlock_irqrestore(&zone->lock, flags); | ||
895 | } | ||
896 | } | ||
897 | #endif | ||
898 | |||
910 | #ifdef CONFIG_PROC_FS | 899 | #ifdef CONFIG_PROC_FS |
900 | static char * const migratetype_names[MIGRATE_TYPES] = { | ||
901 | "Unmovable", | ||
902 | "Reclaimable", | ||
903 | "Movable", | ||
904 | "Reserve", | ||
905 | #ifdef CONFIG_CMA | ||
906 | "CMA", | ||
907 | #endif | ||
908 | #ifdef CONFIG_MEMORY_ISOLATION | ||
909 | "Isolate", | ||
910 | #endif | ||
911 | }; | ||
912 | |||
911 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, | 913 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, |
912 | struct zone *zone) | 914 | struct zone *zone) |
913 | { | 915 | { |
@@ -1435,8 +1437,8 @@ static void vmstat_shepherd(struct work_struct *w) | |||
1435 | if (need_update(cpu) && | 1437 | if (need_update(cpu) && |
1436 | cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) | 1438 | cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) |
1437 | 1439 | ||
1438 | schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu), | 1440 | schedule_delayed_work_on(cpu, |
1439 | __round_jiffies_relative(sysctl_stat_interval, cpu)); | 1441 | &per_cpu(vmstat_work, cpu), 0); |
1440 | 1442 | ||
1441 | put_online_cpus(); | 1443 | put_online_cpus(); |
1442 | 1444 | ||
@@ -1450,7 +1452,7 @@ static void __init start_shepherd_timer(void) | |||
1450 | int cpu; | 1452 | int cpu; |
1451 | 1453 | ||
1452 | for_each_possible_cpu(cpu) | 1454 | for_each_possible_cpu(cpu) |
1453 | INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), | 1455 | INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu), |
1454 | vmstat_update); | 1456 | vmstat_update); |
1455 | 1457 | ||
1456 | if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) | 1458 | if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) |
@@ -1536,8 +1538,6 @@ static int __init setup_vmstat(void) | |||
1536 | module_init(setup_vmstat) | 1538 | module_init(setup_vmstat) |
1537 | 1539 | ||
1538 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) | 1540 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) |
1539 | #include <linux/debugfs.h> | ||
1540 | |||
1541 | 1541 | ||
1542 | /* | 1542 | /* |
1543 | * Return an index indicating how much of the available free memory is | 1543 | * Return an index indicating how much of the available free memory is |
diff --git a/mm/workingset.c b/mm/workingset.c index f7216fa7da27..aa017133744b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c | |||
@@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, | |||
275 | 275 | ||
276 | /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ | 276 | /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ |
277 | local_irq_disable(); | 277 | local_irq_disable(); |
278 | shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); | 278 | shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); |
279 | local_irq_enable(); | 279 | local_irq_enable(); |
280 | 280 | ||
281 | pages = node_present_pages(sc->nid); | 281 | pages = node_present_pages(sc->nid); |
@@ -302,6 +302,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, | |||
302 | } | 302 | } |
303 | 303 | ||
304 | static enum lru_status shadow_lru_isolate(struct list_head *item, | 304 | static enum lru_status shadow_lru_isolate(struct list_head *item, |
305 | struct list_lru_one *lru, | ||
305 | spinlock_t *lru_lock, | 306 | spinlock_t *lru_lock, |
306 | void *arg) | 307 | void *arg) |
307 | { | 308 | { |
@@ -332,7 +333,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, | |||
332 | goto out; | 333 | goto out; |
333 | } | 334 | } |
334 | 335 | ||
335 | list_del_init(item); | 336 | list_lru_isolate(lru, item); |
336 | spin_unlock(lru_lock); | 337 | spin_unlock(lru_lock); |
337 | 338 | ||
338 | /* | 339 | /* |
@@ -376,8 +377,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, | |||
376 | 377 | ||
377 | /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ | 378 | /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ |
378 | local_irq_disable(); | 379 | local_irq_disable(); |
379 | ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, | 380 | ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, |
380 | shadow_lru_isolate, NULL, &sc->nr_to_scan); | 381 | shadow_lru_isolate, NULL); |
381 | local_irq_enable(); | 382 | local_irq_enable(); |
382 | return ret; | 383 | return ret; |
383 | } | 384 | } |
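The list_lru walk callbacks change signature along with this: a callback now receives the struct list_lru_one it is iterating and must remove items through list_lru_isolate() (or list_lru_isolate_move()) so the per-list counts stay in sync, as the shadow_lru_isolate() hunk above shows. A minimal callback matching the new prototype (hypothetical; real callbacks typically take object locks and may return LRU_SKIP or LRU_RETRY):

	static enum lru_status demo_isolate(struct list_head *item,
					    struct list_lru_one *lru,
					    spinlock_t *lru_lock, void *arg)
	{
		list_lru_isolate(lru, item);
		return LRU_REMOVED;
	}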
@@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = { | |||
130 | .evict = zbud_zpool_evict | 130 | .evict = zbud_zpool_evict |
131 | }; | 131 | }; |
132 | 132 | ||
133 | static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) | 133 | static void *zbud_zpool_create(char *name, gfp_t gfp, |
134 | struct zpool_ops *zpool_ops) | ||
134 | { | 135 | { |
135 | return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); | 136 | return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); |
136 | } | 137 | } |
diff --git a/mm/zpool.c b/mm/zpool.c index 739cdf0d183a..bacdab6e47de 100644 --- a/mm/zpool.c +++ b/mm/zpool.c | |||
@@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver) | |||
129 | /** | 129 | /** |
130 | * zpool_create_pool() - Create a new zpool | 130 | * zpool_create_pool() - Create a new zpool |
131 | * @type The type of the zpool to create (e.g. zbud, zsmalloc) | 131 | * @type The type of the zpool to create (e.g. zbud, zsmalloc) |
132 | * @name The name of the zpool (e.g. zram0, zswap) | ||
132 | * @gfp The GFP flags to use when allocating the pool. | 133 | * @gfp The GFP flags to use when allocating the pool. |
133 | * @ops The optional ops callback. | 134 | * @ops The optional ops callback. |
134 | * | 135 | * |
@@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver) | |||
140 | * | 141 | * |
141 | * Returns: New zpool on success, NULL on failure. | 142 | * Returns: New zpool on success, NULL on failure. |
142 | */ | 143 | */ |
143 | struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) | 144 | struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp, |
145 | struct zpool_ops *ops) | ||
144 | { | 146 | { |
145 | struct zpool_driver *driver; | 147 | struct zpool_driver *driver; |
146 | struct zpool *zpool; | 148 | struct zpool *zpool; |
@@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) | |||
168 | 170 | ||
169 | zpool->type = driver->type; | 171 | zpool->type = driver->type; |
170 | zpool->driver = driver; | 172 | zpool->driver = driver; |
171 | zpool->pool = driver->create(gfp, ops); | 173 | zpool->pool = driver->create(name, gfp, ops); |
172 | zpool->ops = ops; | 174 | zpool->ops = ops; |
173 | 175 | ||
174 | if (!zpool->pool) { | 176 | if (!zpool->pool) { |
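The name now threads through zpool to the backend so that, with ZSMALLOC_STAT enabled, each zsmalloc pool gets its own debugfs directory. A hedged usage sketch (the type, the name and the NULL ops are illustrative):

	struct zpool *pool;

	pool = zpool_create_pool("zsmalloc", "demo", GFP_KERNEL, NULL);
	if (!pool)
		return -ENOMEM;
	/* statistics then appear under /sys/kernel/debug/zsmalloc/demo/obj_in_classes */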
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b72403927aa4..0dec1fa5f656 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -91,6 +91,7 @@ | |||
91 | #include <linux/hardirq.h> | 91 | #include <linux/hardirq.h> |
92 | #include <linux/spinlock.h> | 92 | #include <linux/spinlock.h> |
93 | #include <linux/types.h> | 93 | #include <linux/types.h> |
94 | #include <linux/debugfs.h> | ||
94 | #include <linux/zsmalloc.h> | 95 | #include <linux/zsmalloc.h> |
95 | #include <linux/zpool.h> | 96 | #include <linux/zpool.h> |
96 | 97 | ||
@@ -168,6 +169,22 @@ enum fullness_group { | |||
168 | ZS_FULL | 169 | ZS_FULL |
169 | }; | 170 | }; |
170 | 171 | ||
172 | enum zs_stat_type { | ||
173 | OBJ_ALLOCATED, | ||
174 | OBJ_USED, | ||
175 | NR_ZS_STAT_TYPE, | ||
176 | }; | ||
177 | |||
178 | #ifdef CONFIG_ZSMALLOC_STAT | ||
179 | |||
180 | static struct dentry *zs_stat_root; | ||
181 | |||
182 | struct zs_size_stat { | ||
183 | unsigned long objs[NR_ZS_STAT_TYPE]; | ||
184 | }; | ||
185 | |||
186 | #endif | ||
187 | |||
171 | /* | 188 | /* |
172 | * number of size_classes | 189 | * number of size_classes |
173 | */ | 190 | */ |
@@ -200,6 +217,10 @@ struct size_class { | |||
200 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ | 217 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ |
201 | int pages_per_zspage; | 218 | int pages_per_zspage; |
202 | 219 | ||
220 | #ifdef CONFIG_ZSMALLOC_STAT | ||
221 | struct zs_size_stat stats; | ||
222 | #endif | ||
223 | |||
203 | spinlock_t lock; | 224 | spinlock_t lock; |
204 | 225 | ||
205 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; | 226 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; |
@@ -217,10 +238,16 @@ struct link_free { | |||
217 | }; | 238 | }; |
218 | 239 | ||
219 | struct zs_pool { | 240 | struct zs_pool { |
241 | char *name; | ||
242 | |||
220 | struct size_class **size_class; | 243 | struct size_class **size_class; |
221 | 244 | ||
222 | gfp_t flags; /* allocation flags used when growing pool */ | 245 | gfp_t flags; /* allocation flags used when growing pool */ |
223 | atomic_long_t pages_allocated; | 246 | atomic_long_t pages_allocated; |
247 | |||
248 | #ifdef CONFIG_ZSMALLOC_STAT | ||
249 | struct dentry *stat_dentry; | ||
250 | #endif | ||
224 | }; | 251 | }; |
225 | 252 | ||
226 | /* | 253 | /* |
@@ -246,9 +273,9 @@ struct mapping_area { | |||
246 | 273 | ||
247 | #ifdef CONFIG_ZPOOL | 274 | #ifdef CONFIG_ZPOOL |
248 | 275 | ||
249 | static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) | 276 | static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops) |
250 | { | 277 | { |
251 | return zs_create_pool(gfp); | 278 | return zs_create_pool(name, gfp); |
252 | } | 279 | } |
253 | 280 | ||
254 | static void zs_zpool_destroy(void *pool) | 281 | static void zs_zpool_destroy(void *pool) |
@@ -942,6 +969,166 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) | |||
942 | return true; | 969 | return true; |
943 | } | 970 | } |
944 | 971 | ||
972 | #ifdef CONFIG_ZSMALLOC_STAT | ||
973 | |||
974 | static inline void zs_stat_inc(struct size_class *class, | ||
975 | enum zs_stat_type type, unsigned long cnt) | ||
976 | { | ||
977 | class->stats.objs[type] += cnt; | ||
978 | } | ||
979 | |||
980 | static inline void zs_stat_dec(struct size_class *class, | ||
981 | enum zs_stat_type type, unsigned long cnt) | ||
982 | { | ||
983 | class->stats.objs[type] -= cnt; | ||
984 | } | ||
985 | |||
986 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
987 | enum zs_stat_type type) | ||
988 | { | ||
989 | return class->stats.objs[type]; | ||
990 | } | ||
991 | |||
992 | static int __init zs_stat_init(void) | ||
993 | { | ||
994 | if (!debugfs_initialized()) | ||
995 | return -ENODEV; | ||
996 | |||
997 | zs_stat_root = debugfs_create_dir("zsmalloc", NULL); | ||
998 | if (!zs_stat_root) | ||
999 | return -ENOMEM; | ||
1000 | |||
1001 | return 0; | ||
1002 | } | ||
1003 | |||
1004 | static void __exit zs_stat_exit(void) | ||
1005 | { | ||
1006 | debugfs_remove_recursive(zs_stat_root); | ||
1007 | } | ||
1008 | |||
1009 | static int zs_stats_size_show(struct seq_file *s, void *v) | ||
1010 | { | ||
1011 | int i; | ||
1012 | struct zs_pool *pool = s->private; | ||
1013 | struct size_class *class; | ||
1014 | int objs_per_zspage; | ||
1015 | unsigned long obj_allocated, obj_used, pages_used; | ||
1016 | unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; | ||
1017 | |||
1018 | seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size", | ||
1019 | "obj_allocated", "obj_used", "pages_used"); | ||
1020 | |||
1021 | for (i = 0; i < zs_size_classes; i++) { | ||
1022 | class = pool->size_class[i]; | ||
1023 | |||
1024 | if (class->index != i) | ||
1025 | continue; | ||
1026 | |||
1027 | spin_lock(&class->lock); | ||
1028 | obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); | ||
1029 | obj_used = zs_stat_get(class, OBJ_USED); | ||
1030 | spin_unlock(&class->lock); | ||
1031 | |||
1032 | objs_per_zspage = get_maxobj_per_zspage(class->size, | ||
1033 | class->pages_per_zspage); | ||
1034 | pages_used = obj_allocated / objs_per_zspage * | ||
1035 | class->pages_per_zspage; | ||
1036 | |||
1037 | seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i, | ||
1038 | class->size, obj_allocated, obj_used, pages_used); | ||
1039 | |||
1040 | total_objs += obj_allocated; | ||
1041 | total_used_objs += obj_used; | ||
1042 | total_pages += pages_used; | ||
1043 | } | ||
1044 | |||
1045 | seq_puts(s, "\n"); | ||
1046 | seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "", | ||
1047 | total_objs, total_used_objs, total_pages); | ||
1048 | |||
1049 | return 0; | ||
1050 | } | ||
1051 | |||
1052 | static int zs_stats_size_open(struct inode *inode, struct file *file) | ||
1053 | { | ||
1054 | return single_open(file, zs_stats_size_show, inode->i_private); | ||
1055 | } | ||
1056 | |||
1057 | static const struct file_operations zs_stat_size_ops = { | ||
1058 | .open = zs_stats_size_open, | ||
1059 | .read = seq_read, | ||
1060 | .llseek = seq_lseek, | ||
1061 | .release = single_release, | ||
1062 | }; | ||
1063 | |||
1064 | static int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
1065 | { | ||
1066 | struct dentry *entry; | ||
1067 | |||
1068 | if (!zs_stat_root) | ||
1069 | return -ENODEV; | ||
1070 | |||
1071 | entry = debugfs_create_dir(name, zs_stat_root); | ||
1072 | if (!entry) { | ||
1073 | pr_warn("debugfs dir <%s> creation failed\n", name); | ||
1074 | return -ENOMEM; | ||
1075 | } | ||
1076 | pool->stat_dentry = entry; | ||
1077 | |||
1078 | entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO, | ||
1079 | pool->stat_dentry, pool, &zs_stat_size_ops); | ||
1080 | if (!entry) { | ||
1081 | pr_warn("%s: debugfs file entry <%s> creation failed\n", | ||
1082 | name, "obj_in_classes"); | ||
1083 | return -ENOMEM; | ||
1084 | } | ||
1085 | |||
1086 | return 0; | ||
1087 | } | ||
1088 | |||
1089 | static void zs_pool_stat_destroy(struct zs_pool *pool) | ||
1090 | { | ||
1091 | debugfs_remove_recursive(pool->stat_dentry); | ||
1092 | } | ||
1093 | |||
1094 | #else /* CONFIG_ZSMALLOC_STAT */ | ||
1095 | |||
1096 | static inline void zs_stat_inc(struct size_class *class, | ||
1097 | enum zs_stat_type type, unsigned long cnt) | ||
1098 | { | ||
1099 | } | ||
1100 | |||
1101 | static inline void zs_stat_dec(struct size_class *class, | ||
1102 | enum zs_stat_type type, unsigned long cnt) | ||
1103 | { | ||
1104 | } | ||
1105 | |||
1106 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
1107 | enum zs_stat_type type) | ||
1108 | { | ||
1109 | return 0; | ||
1110 | } | ||
1111 | |||
1112 | static int __init zs_stat_init(void) | ||
1113 | { | ||
1114 | return 0; | ||
1115 | } | ||
1116 | |||
1117 | static void __exit zs_stat_exit(void) | ||
1118 | { | ||
1119 | } | ||
1120 | |||
1121 | static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
1122 | { | ||
1123 | return 0; | ||
1124 | } | ||
1125 | |||
1126 | static inline void zs_pool_stat_destroy(struct zs_pool *pool) | ||
1127 | { | ||
1128 | } | ||
1129 | |||
1130 | #endif | ||
1131 | |||
945 | unsigned long zs_get_total_pages(struct zs_pool *pool) | 1132 | unsigned long zs_get_total_pages(struct zs_pool *pool) |
946 | { | 1133 | { |
947 | return atomic_long_read(&pool->pages_allocated); | 1134 | return atomic_long_read(&pool->pages_allocated); |
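
[Annotation] A note on the new obj_in_classes output: obj_allocated and obj_used are sampled together under class->lock, and pages_used is not tracked separately but derived, using the fact that a class only ever grows and shrinks in whole zspages. The worked example below redoes that arithmetic in plain C; the class geometry (208-byte objects, 3 pages per zspage) is made up purely to make the integer division concrete.

    /* Worked example of the pages_used computation in zs_stats_size_show();
     * the sizes below are hypothetical, not taken from a real size class. */
    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
        unsigned long size = 208;              /* object size of this class */
        unsigned long pages_per_zspage = 3;
        unsigned long objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; /* 59 */
        unsigned long obj_allocated = 177;     /* three full zspages worth */
        unsigned long pages_used;

        /* integer division first, exactly as the kernel code orders it */
        pages_used = obj_allocated / objs_per_zspage * pages_per_zspage;

        printf("objs_per_zspage=%lu pages_used=%lu\n", objs_per_zspage, pages_used);
        return 0;
    }

Since OBJ_ALLOCATED only ever changes by whole-zspage amounts (see the zs_malloc()/zs_free() hunks below), the division is exact in practice.
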
@@ -1074,7 +1261,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
1074 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); | 1261 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); |
1075 | atomic_long_add(class->pages_per_zspage, | 1262 | atomic_long_add(class->pages_per_zspage, |
1076 | &pool->pages_allocated); | 1263 | &pool->pages_allocated); |
1264 | |||
1077 | spin_lock(&class->lock); | 1265 | spin_lock(&class->lock); |
1266 | zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | ||
1267 | class->size, class->pages_per_zspage)); | ||
1078 | } | 1268 | } |
1079 | 1269 | ||
1080 | obj = (unsigned long)first_page->freelist; | 1270 | obj = (unsigned long)first_page->freelist; |
@@ -1088,6 +1278,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
1088 | kunmap_atomic(vaddr); | 1278 | kunmap_atomic(vaddr); |
1089 | 1279 | ||
1090 | first_page->inuse++; | 1280 | first_page->inuse++; |
1281 | zs_stat_inc(class, OBJ_USED, 1); | ||
1091 | /* Now move the zspage to another fullness group, if required */ | 1282 | /* Now move the zspage to another fullness group, if required */ |
1092 | fix_fullness_group(pool, first_page); | 1283 | fix_fullness_group(pool, first_page); |
1093 | spin_unlock(&class->lock); | 1284 | spin_unlock(&class->lock); |
@@ -1128,6 +1319,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj) | |||
1128 | 1319 | ||
1129 | first_page->inuse--; | 1320 | first_page->inuse--; |
1130 | fullness = fix_fullness_group(pool, first_page); | 1321 | fullness = fix_fullness_group(pool, first_page); |
1322 | |||
1323 | zs_stat_dec(class, OBJ_USED, 1); | ||
1324 | if (fullness == ZS_EMPTY) | ||
1325 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | ||
1326 | class->size, class->pages_per_zspage)); | ||
1327 | |||
1131 | spin_unlock(&class->lock); | 1328 | spin_unlock(&class->lock); |
1132 | 1329 | ||
1133 | if (fullness == ZS_EMPTY) { | 1330 | if (fullness == ZS_EMPTY) { |
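
[Annotation] Taken together with the zs_malloc() hunk above, the accounting is deliberately asymmetric: OBJ_USED moves by one on every allocation and free, while OBJ_ALLOCATED moves in whole-zspage steps, bumped by get_maxobj_per_zspage() when a class has to grow and dropped by the same amount only once a zspage goes ZS_EMPTY and is torn down. A small userspace model of that lifecycle; the capacity of 59 objects per zspage is hypothetical:

    #include <assert.h>
    #include <stdio.h>

    static unsigned long obj_allocated, obj_used;
    static const unsigned long objs_per_zspage = 59;

    static void model_malloc(int new_zspage)
    {
        if (new_zspage)             /* class had no free slot: grew by one zspage */
            obj_allocated += objs_per_zspage;
        obj_used++;
    }

    static void model_free(int zspage_now_empty)
    {
        obj_used--;
        if (zspage_now_empty)       /* whole zspage released back to the allocator */
            obj_allocated -= objs_per_zspage;
    }

    int main(void)
    {
        model_malloc(1);    /* first object forces a new zspage */
        model_malloc(0);    /* second object reuses a free slot */
        model_free(0);
        model_free(1);      /* last object gone: zspage is ZS_EMPTY */

        assert(obj_used == 0 && obj_allocated == 0);
        printf("allocated=%lu used=%lu\n", obj_allocated, obj_used);
        return 0;
    }

The gap between the two counters is the per-class slack the new stats are meant to expose.
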
@@ -1148,7 +1345,7 @@ EXPORT_SYMBOL_GPL(zs_free); | |||
1148 | * On success, a pointer to the newly created pool is returned, | 1345 | * On success, a pointer to the newly created pool is returned, |
1149 | * otherwise NULL. | 1346 | * otherwise NULL. |
1150 | */ | 1347 | */ |
1151 | struct zs_pool *zs_create_pool(gfp_t flags) | 1348 | struct zs_pool *zs_create_pool(char *name, gfp_t flags) |
1152 | { | 1349 | { |
1153 | int i; | 1350 | int i; |
1154 | struct zs_pool *pool; | 1351 | struct zs_pool *pool; |
@@ -1158,9 +1355,16 @@ struct zs_pool *zs_create_pool(gfp_t flags) | |||
1158 | if (!pool) | 1355 | if (!pool) |
1159 | return NULL; | 1356 | return NULL; |
1160 | 1357 | ||
1358 | pool->name = kstrdup(name, GFP_KERNEL); | ||
1359 | if (!pool->name) { | ||
1360 | kfree(pool); | ||
1361 | return NULL; | ||
1362 | } | ||
1363 | |||
1161 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), | 1364 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), |
1162 | GFP_KERNEL); | 1365 | GFP_KERNEL); |
1163 | if (!pool->size_class) { | 1366 | if (!pool->size_class) { |
1367 | kfree(pool->name); | ||
1164 | kfree(pool); | 1368 | kfree(pool); |
1165 | return NULL; | 1369 | return NULL; |
1166 | } | 1370 | } |
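
[Annotation] The pool now owns a private copy of the name (it is reused for the debugfs directory and freed again in zs_destroy_pool()), so the early error paths have to release whatever was already set up, in reverse order. A userspace sketch of the same duplicate-or-unwind shape; struct pool, the class count of 16 and "example-pool" are illustrative only:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct pool {
        char *name;
        void **size_class;
    };

    static struct pool *pool_create(const char *name)
    {
        struct pool *pool = calloc(1, sizeof(*pool));

        if (!pool)
            return NULL;

        pool->name = strdup(name);          /* kstrdup() in the kernel */
        if (!pool->name) {
            free(pool);
            return NULL;
        }

        pool->size_class = calloc(16, sizeof(void *));
        if (!pool->size_class) {
            free(pool->name);               /* undo setup so far, in reverse */
            free(pool);
            return NULL;
        }

        return pool;
    }

    int main(void)
    {
        struct pool *pool = pool_create("example-pool");

        if (!pool)
            return 1;
        printf("created pool '%s'\n", pool->name);
        free(pool->size_class);
        free(pool->name);
        free(pool);
        return 0;
    }
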
@@ -1210,6 +1414,9 @@ struct zs_pool *zs_create_pool(gfp_t flags) | |||
1210 | 1414 | ||
1211 | pool->flags = flags; | 1415 | pool->flags = flags; |
1212 | 1416 | ||
1417 | if (zs_pool_stat_create(name, pool)) | ||
1418 | goto err; | ||
1419 | |||
1213 | return pool; | 1420 | return pool; |
1214 | 1421 | ||
1215 | err: | 1422 | err: |
@@ -1222,6 +1429,8 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
1222 | { | 1429 | { |
1223 | int i; | 1430 | int i; |
1224 | 1431 | ||
1432 | zs_pool_stat_destroy(pool); | ||
1433 | |||
1225 | for (i = 0; i < zs_size_classes; i++) { | 1434 | for (i = 0; i < zs_size_classes; i++) { |
1226 | int fg; | 1435 | int fg; |
1227 | struct size_class *class = pool->size_class[i]; | 1436 | struct size_class *class = pool->size_class[i]; |
@@ -1242,6 +1451,7 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
1242 | } | 1451 | } |
1243 | 1452 | ||
1244 | kfree(pool->size_class); | 1453 | kfree(pool->size_class); |
1454 | kfree(pool->name); | ||
1245 | kfree(pool); | 1455 | kfree(pool); |
1246 | } | 1456 | } |
1247 | EXPORT_SYMBOL_GPL(zs_destroy_pool); | 1457 | EXPORT_SYMBOL_GPL(zs_destroy_pool); |
@@ -1250,17 +1460,30 @@ static int __init zs_init(void) | |||
1250 | { | 1460 | { |
1251 | int ret = zs_register_cpu_notifier(); | 1461 | int ret = zs_register_cpu_notifier(); |
1252 | 1462 | ||
1253 | if (ret) { | 1463 | if (ret) |
1254 | zs_unregister_cpu_notifier(); | 1464 | goto notifier_fail; |
1255 | return ret; | ||
1256 | } | ||
1257 | 1465 | ||
1258 | init_zs_size_classes(); | 1466 | init_zs_size_classes(); |
1259 | 1467 | ||
1260 | #ifdef CONFIG_ZPOOL | 1468 | #ifdef CONFIG_ZPOOL |
1261 | zpool_register_driver(&zs_zpool_driver); | 1469 | zpool_register_driver(&zs_zpool_driver); |
1262 | #endif | 1470 | #endif |
1471 | |||
1472 | ret = zs_stat_init(); | ||
1473 | if (ret) { | ||
1474 | pr_err("zs stat initialization failed\n"); | ||
1475 | goto stat_fail; | ||
1476 | } | ||
1263 | return 0; | 1477 | return 0; |
1478 | |||
1479 | stat_fail: | ||
1480 | #ifdef CONFIG_ZPOOL | ||
1481 | zpool_unregister_driver(&zs_zpool_driver); | ||
1482 | #endif | ||
1483 | notifier_fail: | ||
1484 | zs_unregister_cpu_notifier(); | ||
1485 | |||
1486 | return ret; | ||
1264 | } | 1487 | } |
1265 | 1488 | ||
1266 | static void __exit zs_exit(void) | 1489 | static void __exit zs_exit(void) |
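
[Annotation] zs_init() is reshaped into the usual label-per-step unwind: a later failure undoes only the registrations that already succeeded, in reverse order. The sketch below shows the generalized pattern with stand-in helpers (the zpool driver registration in this series returns void, so the kernel version gets by with one label fewer); stat_init() is hard-wired to fail just to exercise the unwind path:

    #include <stdio.h>

    static int register_notifier(void)    { return 0; }
    static void unregister_notifier(void) { }
    static int register_driver(void)      { return 0; }
    static void unregister_driver(void)   { }
    static int stat_init(void)            { return -1; }  /* force the failure path */

    static int module_init_example(void)
    {
        int ret = register_notifier();

        if (ret)
            goto notifier_fail;

        ret = register_driver();
        if (ret)
            goto driver_fail;

        ret = stat_init();
        if (ret)
            goto stat_fail;

        return 0;

    stat_fail:
        unregister_driver();    /* undo step 2 */
    driver_fail:
        unregister_notifier();  /* undo step 1 */
    notifier_fail:
        return ret;
    }

    int main(void)
    {
        printf("init returned %d\n", module_init_example());
        return 0;
    }
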
@@ -1269,6 +1492,8 @@ static void __exit zs_exit(void) | |||
1269 | zpool_unregister_driver(&zs_zpool_driver); | 1492 | zpool_unregister_driver(&zs_zpool_driver); |
1270 | #endif | 1493 | #endif |
1271 | zs_unregister_cpu_notifier(); | 1494 | zs_unregister_cpu_notifier(); |
1495 | |||
1496 | zs_stat_exit(); | ||
1272 | } | 1497 | } |
1273 | 1498 | ||
1274 | module_init(zs_init); | 1499 | module_init(zs_init); |
diff --git a/mm/zswap.c b/mm/zswap.c
index 0cfce9bc51e4..4249e82ff934 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -906,11 +906,12 @@ static int __init init_zswap(void) | |||
906 | 906 | ||
907 | pr_info("loading zswap\n"); | 907 | pr_info("loading zswap\n"); |
908 | 908 | ||
909 | zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); | 909 | zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, |
910 | &zswap_zpool_ops); | ||
910 | if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { | 911 | if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { |
911 | pr_info("%s zpool not available\n", zswap_zpool_type); | 912 | pr_info("%s zpool not available\n", zswap_zpool_type); |
912 | zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; | 913 | zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; |
913 | zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, | 914 | zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp, |
914 | &zswap_zpool_ops); | 915 | &zswap_zpool_ops); |
915 | } | 916 | } |
916 | if (!zswap_pool) { | 917 | if (!zswap_pool) { |
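
[Annotation] The zswap side simply forwards a fixed "zswap" pool name through zpool_create_pool(); when zsmalloc is the backend that string becomes the per-pool debugfs directory, so with CONFIG_ZSMALLOC_STAT=y the counters should appear as /sys/kernel/debug/zsmalloc/zswap/obj_in_classes (path assumes debugfs is mounted in the usual place). A trivial reader, just to show where the file lands:

    #include <stdio.h>

    int main(void)
    {
        const char *path = "/sys/kernel/debug/zsmalloc/zswap/obj_in_classes";
        char line[256];
        FILE *f = fopen(path, "r");

        if (!f) {
            perror(path);
            return 1;
        }
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);
        fclose(f);
        return 0;
    }
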