Diffstat (limited to 'mm')
51 files changed, 1583 insertions, 1222 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 9b8fccb969dc..beb7a455915d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -312,7 +312,6 @@ config NEED_BOUNCE_POOL | |||
312 | config NR_QUICK | 312 | config NR_QUICK |
313 | int | 313 | int |
314 | depends on QUICKLIST | 314 | depends on QUICKLIST |
315 | default "2" if AVR32 | ||
316 | default "1" | 315 | default "1" |
317 | 316 | ||
318 | config VIRT_TO_BUS | 317 | config VIRT_TO_BUS |
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 79d0fd13b5b3..5b0adf1435de 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug | |||
@@ -42,7 +42,6 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT | |||
42 | 42 | ||
43 | config PAGE_POISONING | 43 | config PAGE_POISONING |
44 | bool "Poison pages after freeing" | 44 | bool "Poison pages after freeing" |
45 | select PAGE_EXTENSION | ||
46 | select PAGE_POISONING_NO_SANITY if HIBERNATION | 45 | select PAGE_POISONING_NO_SANITY if HIBERNATION |
47 | ---help--- | 46 | ---help--- |
48 | Fill the pages with poison patterns after free_pages() and verify | 47 | Fill the pages with poison patterns after free_pages() and verify |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c6f2a37028c2..f028a9a472fd 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -12,8 +12,6 @@ | |||
12 | #include <linux/device.h> | 12 | #include <linux/device.h> |
13 | #include <trace/events/writeback.h> | 13 | #include <trace/events/writeback.h> |
14 | 14 | ||
15 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); | ||
16 | |||
17 | struct backing_dev_info noop_backing_dev_info = { | 15 | struct backing_dev_info noop_backing_dev_info = { |
18 | .name = "noop", | 16 | .name = "noop", |
19 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 17 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
@@ -242,6 +240,8 @@ static __init int bdi_class_init(void) | |||
242 | } | 240 | } |
243 | postcore_initcall(bdi_class_init); | 241 | postcore_initcall(bdi_class_init); |
244 | 242 | ||
243 | static int bdi_init(struct backing_dev_info *bdi); | ||
244 | |||
245 | static int __init default_bdi_init(void) | 245 | static int __init default_bdi_init(void) |
246 | { | 246 | { |
247 | int err; | 247 | int err; |
@@ -294,6 +294,8 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, | |||
294 | 294 | ||
295 | memset(wb, 0, sizeof(*wb)); | 295 | memset(wb, 0, sizeof(*wb)); |
296 | 296 | ||
297 | if (wb != &bdi->wb) | ||
298 | bdi_get(bdi); | ||
297 | wb->bdi = bdi; | 299 | wb->bdi = bdi; |
298 | wb->last_old_flush = jiffies; | 300 | wb->last_old_flush = jiffies; |
299 | INIT_LIST_HEAD(&wb->b_dirty); | 301 | INIT_LIST_HEAD(&wb->b_dirty); |
@@ -314,8 +316,10 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, | |||
314 | wb->dirty_sleep = jiffies; | 316 | wb->dirty_sleep = jiffies; |
315 | 317 | ||
316 | wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); | 318 | wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); |
317 | if (!wb->congested) | 319 | if (!wb->congested) { |
318 | return -ENOMEM; | 320 | err = -ENOMEM; |
321 | goto out_put_bdi; | ||
322 | } | ||
319 | 323 | ||
320 | err = fprop_local_init_percpu(&wb->completions, gfp); | 324 | err = fprop_local_init_percpu(&wb->completions, gfp); |
321 | if (err) | 325 | if (err) |
@@ -335,9 +339,14 @@ out_destroy_stat: | |||
335 | fprop_local_destroy_percpu(&wb->completions); | 339 | fprop_local_destroy_percpu(&wb->completions); |
336 | out_put_cong: | 340 | out_put_cong: |
337 | wb_congested_put(wb->congested); | 341 | wb_congested_put(wb->congested); |
342 | out_put_bdi: | ||
343 | if (wb != &bdi->wb) | ||
344 | bdi_put(bdi); | ||
338 | return err; | 345 | return err; |
339 | } | 346 | } |
340 | 347 | ||
348 | static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); | ||
349 | |||
341 | /* | 350 | /* |
342 | * Remove bdi from the global list and shutdown any threads we have running | 351 | * Remove bdi from the global list and shutdown any threads we have running |
343 | */ | 352 | */ |
@@ -347,10 +356,18 @@ static void wb_shutdown(struct bdi_writeback *wb) | |||
347 | spin_lock_bh(&wb->work_lock); | 356 | spin_lock_bh(&wb->work_lock); |
348 | if (!test_and_clear_bit(WB_registered, &wb->state)) { | 357 | if (!test_and_clear_bit(WB_registered, &wb->state)) { |
349 | spin_unlock_bh(&wb->work_lock); | 358 | spin_unlock_bh(&wb->work_lock); |
359 | /* | ||
360 | * Wait for wb shutdown to finish if someone else is just | ||
361 | * running wb_shutdown(). Otherwise we could proceed to wb / | ||
362 | * bdi destruction before wb_shutdown() is finished. | ||
363 | */ | ||
364 | wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE); | ||
350 | return; | 365 | return; |
351 | } | 366 | } |
367 | set_bit(WB_shutting_down, &wb->state); | ||
352 | spin_unlock_bh(&wb->work_lock); | 368 | spin_unlock_bh(&wb->work_lock); |
353 | 369 | ||
370 | cgwb_remove_from_bdi_list(wb); | ||
354 | /* | 371 | /* |
355 | * Drain work list and shutdown the delayed_work. !WB_registered | 372 | * Drain work list and shutdown the delayed_work. !WB_registered |
356 | * tells wb_workfn() that @wb is dying and its work_list needs to | 373 | * tells wb_workfn() that @wb is dying and its work_list needs to |
@@ -359,6 +376,12 @@ static void wb_shutdown(struct bdi_writeback *wb) | |||
359 | mod_delayed_work(bdi_wq, &wb->dwork, 0); | 376 | mod_delayed_work(bdi_wq, &wb->dwork, 0); |
360 | flush_delayed_work(&wb->dwork); | 377 | flush_delayed_work(&wb->dwork); |
361 | WARN_ON(!list_empty(&wb->work_list)); | 378 | WARN_ON(!list_empty(&wb->work_list)); |
379 | /* | ||
380 | * Make sure bit gets cleared after shutdown is finished. Matches with | ||
381 | * the barrier provided by test_and_clear_bit() above. | ||
382 | */ | ||
383 | smp_wmb(); | ||
384 | clear_bit(WB_shutting_down, &wb->state); | ||
362 | } | 385 | } |
363 | 386 | ||
364 | static void wb_exit(struct bdi_writeback *wb) | 387 | static void wb_exit(struct bdi_writeback *wb) |
@@ -372,6 +395,8 @@ static void wb_exit(struct bdi_writeback *wb) | |||
372 | 395 | ||
373 | fprop_local_destroy_percpu(&wb->completions); | 396 | fprop_local_destroy_percpu(&wb->completions); |
374 | wb_congested_put(wb->congested); | 397 | wb_congested_put(wb->congested); |
398 | if (wb != &wb->bdi->wb) | ||
399 | bdi_put(wb->bdi); | ||
375 | } | 400 | } |
376 | 401 | ||
377 | #ifdef CONFIG_CGROUP_WRITEBACK | 402 | #ifdef CONFIG_CGROUP_WRITEBACK |
@@ -381,11 +406,9 @@ static void wb_exit(struct bdi_writeback *wb) | |||
381 | /* | 406 | /* |
382 | * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, | 407 | * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, |
383 | * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU | 408 | * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU |
384 | * protected. cgwb_release_wait is used to wait for the completion of cgwb | 409 | * protected. |
385 | * releases from bdi destruction path. | ||
386 | */ | 410 | */ |
387 | static DEFINE_SPINLOCK(cgwb_lock); | 411 | static DEFINE_SPINLOCK(cgwb_lock); |
388 | static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait); | ||
389 | 412 | ||
390 | /** | 413 | /** |
391 | * wb_congested_get_create - get or create a wb_congested | 414 | * wb_congested_get_create - get or create a wb_congested |
@@ -438,7 +461,7 @@ retry: | |||
438 | return NULL; | 461 | return NULL; |
439 | 462 | ||
440 | atomic_set(&new_congested->refcnt, 0); | 463 | atomic_set(&new_congested->refcnt, 0); |
441 | new_congested->bdi = bdi; | 464 | new_congested->__bdi = bdi; |
442 | new_congested->blkcg_id = blkcg_id; | 465 | new_congested->blkcg_id = blkcg_id; |
443 | goto retry; | 466 | goto retry; |
444 | 467 | ||
@@ -466,10 +489,10 @@ void wb_congested_put(struct bdi_writeback_congested *congested) | |||
466 | } | 489 | } |
467 | 490 | ||
468 | /* bdi might already have been destroyed leaving @congested unlinked */ | 491 | /* bdi might already have been destroyed leaving @congested unlinked */ |
469 | if (congested->bdi) { | 492 | if (congested->__bdi) { |
470 | rb_erase(&congested->rb_node, | 493 | rb_erase(&congested->rb_node, |
471 | &congested->bdi->cgwb_congested_tree); | 494 | &congested->__bdi->cgwb_congested_tree); |
472 | congested->bdi = NULL; | 495 | congested->__bdi = NULL; |
473 | } | 496 | } |
474 | 497 | ||
475 | spin_unlock_irqrestore(&cgwb_lock, flags); | 498 | spin_unlock_irqrestore(&cgwb_lock, flags); |
@@ -480,11 +503,6 @@ static void cgwb_release_workfn(struct work_struct *work) | |||
480 | { | 503 | { |
481 | struct bdi_writeback *wb = container_of(work, struct bdi_writeback, | 504 | struct bdi_writeback *wb = container_of(work, struct bdi_writeback, |
482 | release_work); | 505 | release_work); |
483 | struct backing_dev_info *bdi = wb->bdi; | ||
484 | |||
485 | spin_lock_irq(&cgwb_lock); | ||
486 | list_del_rcu(&wb->bdi_node); | ||
487 | spin_unlock_irq(&cgwb_lock); | ||
488 | 506 | ||
489 | wb_shutdown(wb); | 507 | wb_shutdown(wb); |
490 | 508 | ||
@@ -495,9 +513,6 @@ static void cgwb_release_workfn(struct work_struct *work) | |||
495 | percpu_ref_exit(&wb->refcnt); | 513 | percpu_ref_exit(&wb->refcnt); |
496 | wb_exit(wb); | 514 | wb_exit(wb); |
497 | kfree_rcu(wb, rcu); | 515 | kfree_rcu(wb, rcu); |
498 | |||
499 | if (atomic_dec_and_test(&bdi->usage_cnt)) | ||
500 | wake_up_all(&cgwb_release_wait); | ||
501 | } | 516 | } |
502 | 517 | ||
503 | static void cgwb_release(struct percpu_ref *refcnt) | 518 | static void cgwb_release(struct percpu_ref *refcnt) |
@@ -517,6 +532,13 @@ static void cgwb_kill(struct bdi_writeback *wb) | |||
517 | percpu_ref_kill(&wb->refcnt); | 532 | percpu_ref_kill(&wb->refcnt); |
518 | } | 533 | } |
519 | 534 | ||
535 | static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) | ||
536 | { | ||
537 | spin_lock_irq(&cgwb_lock); | ||
538 | list_del_rcu(&wb->bdi_node); | ||
539 | spin_unlock_irq(&cgwb_lock); | ||
540 | } | ||
541 | |||
520 | static int cgwb_create(struct backing_dev_info *bdi, | 542 | static int cgwb_create(struct backing_dev_info *bdi, |
521 | struct cgroup_subsys_state *memcg_css, gfp_t gfp) | 543 | struct cgroup_subsys_state *memcg_css, gfp_t gfp) |
522 | { | 544 | { |
@@ -580,7 +602,6 @@ static int cgwb_create(struct backing_dev_info *bdi, | |||
580 | /* we might have raced another instance of this function */ | 602 | /* we might have raced another instance of this function */ |
581 | ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); | 603 | ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); |
582 | if (!ret) { | 604 | if (!ret) { |
583 | atomic_inc(&bdi->usage_cnt); | ||
584 | list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); | 605 | list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); |
585 | list_add(&wb->memcg_node, memcg_cgwb_list); | 606 | list_add(&wb->memcg_node, memcg_cgwb_list); |
586 | list_add(&wb->blkcg_node, blkcg_cgwb_list); | 607 | list_add(&wb->blkcg_node, blkcg_cgwb_list); |
@@ -670,7 +691,6 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) | |||
670 | 691 | ||
671 | INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); | 692 | INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); |
672 | bdi->cgwb_congested_tree = RB_ROOT; | 693 | bdi->cgwb_congested_tree = RB_ROOT; |
673 | atomic_set(&bdi->usage_cnt, 1); | ||
674 | 694 | ||
675 | ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); | 695 | ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); |
676 | if (!ret) { | 696 | if (!ret) { |
@@ -680,29 +700,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) | |||
680 | return ret; | 700 | return ret; |
681 | } | 701 | } |
682 | 702 | ||
683 | static void cgwb_bdi_destroy(struct backing_dev_info *bdi) | 703 | static void cgwb_bdi_unregister(struct backing_dev_info *bdi) |
684 | { | 704 | { |
685 | struct radix_tree_iter iter; | 705 | struct radix_tree_iter iter; |
686 | void **slot; | 706 | void **slot; |
707 | struct bdi_writeback *wb; | ||
687 | 708 | ||
688 | WARN_ON(test_bit(WB_registered, &bdi->wb.state)); | 709 | WARN_ON(test_bit(WB_registered, &bdi->wb.state)); |
689 | 710 | ||
690 | spin_lock_irq(&cgwb_lock); | 711 | spin_lock_irq(&cgwb_lock); |
691 | radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) | 712 | radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) |
692 | cgwb_kill(*slot); | 713 | cgwb_kill(*slot); |
693 | spin_unlock_irq(&cgwb_lock); | ||
694 | 714 | ||
695 | /* | 715 | while (!list_empty(&bdi->wb_list)) { |
696 | * All cgwb's must be shutdown and released before returning. Drain | 716 | wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, |
697 | * the usage counter to wait for all cgwb's ever created on @bdi. | 717 | bdi_node); |
698 | */ | 718 | spin_unlock_irq(&cgwb_lock); |
699 | atomic_dec(&bdi->usage_cnt); | 719 | wb_shutdown(wb); |
700 | wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt)); | 720 | spin_lock_irq(&cgwb_lock); |
701 | /* | 721 | } |
702 | * Grab back our reference so that we hold it when @bdi gets | 722 | spin_unlock_irq(&cgwb_lock); |
703 | * re-registered. | ||
704 | */ | ||
705 | atomic_inc(&bdi->usage_cnt); | ||
706 | } | 723 | } |
707 | 724 | ||
708 | /** | 725 | /** |
@@ -752,11 +769,18 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi) | |||
752 | rb_entry(rbn, struct bdi_writeback_congested, rb_node); | 769 | rb_entry(rbn, struct bdi_writeback_congested, rb_node); |
753 | 770 | ||
754 | rb_erase(rbn, &bdi->cgwb_congested_tree); | 771 | rb_erase(rbn, &bdi->cgwb_congested_tree); |
755 | congested->bdi = NULL; /* mark @congested unlinked */ | 772 | congested->__bdi = NULL; /* mark @congested unlinked */ |
756 | } | 773 | } |
757 | spin_unlock_irq(&cgwb_lock); | 774 | spin_unlock_irq(&cgwb_lock); |
758 | } | 775 | } |
759 | 776 | ||
777 | static void cgwb_bdi_register(struct backing_dev_info *bdi) | ||
778 | { | ||
779 | spin_lock_irq(&cgwb_lock); | ||
780 | list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); | ||
781 | spin_unlock_irq(&cgwb_lock); | ||
782 | } | ||
783 | |||
760 | #else /* CONFIG_CGROUP_WRITEBACK */ | 784 | #else /* CONFIG_CGROUP_WRITEBACK */ |
761 | 785 | ||
762 | static int cgwb_bdi_init(struct backing_dev_info *bdi) | 786 | static int cgwb_bdi_init(struct backing_dev_info *bdi) |
@@ -777,16 +801,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) | |||
777 | return 0; | 801 | return 0; |
778 | } | 802 | } |
779 | 803 | ||
780 | static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } | 804 | static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } |
781 | 805 | ||
782 | static void cgwb_bdi_exit(struct backing_dev_info *bdi) | 806 | static void cgwb_bdi_exit(struct backing_dev_info *bdi) |
783 | { | 807 | { |
784 | wb_congested_put(bdi->wb_congested); | 808 | wb_congested_put(bdi->wb_congested); |
785 | } | 809 | } |
786 | 810 | ||
811 | static void cgwb_bdi_register(struct backing_dev_info *bdi) | ||
812 | { | ||
813 | list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); | ||
814 | } | ||
815 | |||
816 | static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) | ||
817 | { | ||
818 | list_del_rcu(&wb->bdi_node); | ||
819 | } | ||
820 | |||
787 | #endif /* CONFIG_CGROUP_WRITEBACK */ | 821 | #endif /* CONFIG_CGROUP_WRITEBACK */ |
788 | 822 | ||
789 | int bdi_init(struct backing_dev_info *bdi) | 823 | static int bdi_init(struct backing_dev_info *bdi) |
790 | { | 824 | { |
791 | int ret; | 825 | int ret; |
792 | 826 | ||
@@ -802,11 +836,8 @@ int bdi_init(struct backing_dev_info *bdi) | |||
802 | 836 | ||
803 | ret = cgwb_bdi_init(bdi); | 837 | ret = cgwb_bdi_init(bdi); |
804 | 838 | ||
805 | list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); | ||
806 | |||
807 | return ret; | 839 | return ret; |
808 | } | 840 | } |
809 | EXPORT_SYMBOL(bdi_init); | ||
810 | 841 | ||
811 | struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) | 842 | struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) |
812 | { | 843 | { |
@@ -823,22 +854,20 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) | |||
823 | } | 854 | } |
824 | return bdi; | 855 | return bdi; |
825 | } | 856 | } |
857 | EXPORT_SYMBOL(bdi_alloc_node); | ||
826 | 858 | ||
827 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, | 859 | int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) |
828 | const char *fmt, ...) | ||
829 | { | 860 | { |
830 | va_list args; | ||
831 | struct device *dev; | 861 | struct device *dev; |
832 | 862 | ||
833 | if (bdi->dev) /* The driver needs to use separate queues per device */ | 863 | if (bdi->dev) /* The driver needs to use separate queues per device */ |
834 | return 0; | 864 | return 0; |
835 | 865 | ||
836 | va_start(args, fmt); | 866 | dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args); |
837 | dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); | ||
838 | va_end(args); | ||
839 | if (IS_ERR(dev)) | 867 | if (IS_ERR(dev)) |
840 | return PTR_ERR(dev); | 868 | return PTR_ERR(dev); |
841 | 869 | ||
870 | cgwb_bdi_register(bdi); | ||
842 | bdi->dev = dev; | 871 | bdi->dev = dev; |
843 | 872 | ||
844 | bdi_debug_register(bdi, dev_name(dev)); | 873 | bdi_debug_register(bdi, dev_name(dev)); |
@@ -851,20 +880,25 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, | |||
851 | trace_writeback_bdi_register(bdi); | 880 | trace_writeback_bdi_register(bdi); |
852 | return 0; | 881 | return 0; |
853 | } | 882 | } |
854 | EXPORT_SYMBOL(bdi_register); | 883 | EXPORT_SYMBOL(bdi_register_va); |
855 | 884 | ||
856 | int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) | 885 | int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) |
857 | { | 886 | { |
858 | return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); | 887 | va_list args; |
888 | int ret; | ||
889 | |||
890 | va_start(args, fmt); | ||
891 | ret = bdi_register_va(bdi, fmt, args); | ||
892 | va_end(args); | ||
893 | return ret; | ||
859 | } | 894 | } |
860 | EXPORT_SYMBOL(bdi_register_dev); | 895 | EXPORT_SYMBOL(bdi_register); |
861 | 896 | ||
862 | int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) | 897 | int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) |
863 | { | 898 | { |
864 | int rc; | 899 | int rc; |
865 | 900 | ||
866 | rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt), | 901 | rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt)); |
867 | MINOR(owner->devt)); | ||
868 | if (rc) | 902 | if (rc) |
869 | return rc; | 903 | return rc; |
870 | /* Leaking owner reference... */ | 904 | /* Leaking owner reference... */ |
@@ -892,7 +926,7 @@ void bdi_unregister(struct backing_dev_info *bdi) | |||
892 | /* make sure nobody finds us on the bdi_list anymore */ | 926 | /* make sure nobody finds us on the bdi_list anymore */ |
893 | bdi_remove_from_list(bdi); | 927 | bdi_remove_from_list(bdi); |
894 | wb_shutdown(&bdi->wb); | 928 | wb_shutdown(&bdi->wb); |
895 | cgwb_bdi_destroy(bdi); | 929 | cgwb_bdi_unregister(bdi); |
896 | 930 | ||
897 | if (bdi->dev) { | 931 | if (bdi->dev) { |
898 | bdi_debug_unregister(bdi); | 932 | bdi_debug_unregister(bdi); |
@@ -906,19 +940,16 @@ void bdi_unregister(struct backing_dev_info *bdi) | |||
906 | } | 940 | } |
907 | } | 941 | } |
908 | 942 | ||
909 | static void bdi_exit(struct backing_dev_info *bdi) | ||
910 | { | ||
911 | WARN_ON_ONCE(bdi->dev); | ||
912 | wb_exit(&bdi->wb); | ||
913 | cgwb_bdi_exit(bdi); | ||
914 | } | ||
915 | |||
916 | static void release_bdi(struct kref *ref) | 943 | static void release_bdi(struct kref *ref) |
917 | { | 944 | { |
918 | struct backing_dev_info *bdi = | 945 | struct backing_dev_info *bdi = |
919 | container_of(ref, struct backing_dev_info, refcnt); | 946 | container_of(ref, struct backing_dev_info, refcnt); |
920 | 947 | ||
921 | bdi_exit(bdi); | 948 | if (test_bit(WB_registered, &bdi->wb.state)) |
949 | bdi_unregister(bdi); | ||
950 | WARN_ON_ONCE(bdi->dev); | ||
951 | wb_exit(&bdi->wb); | ||
952 | cgwb_bdi_exit(bdi); | ||
922 | kfree(bdi); | 953 | kfree(bdi); |
923 | } | 954 | } |
924 | 955 | ||
@@ -926,38 +957,7 @@ void bdi_put(struct backing_dev_info *bdi) | |||
926 | { | 957 | { |
927 | kref_put(&bdi->refcnt, release_bdi); | 958 | kref_put(&bdi->refcnt, release_bdi); |
928 | } | 959 | } |
929 | 960 | EXPORT_SYMBOL(bdi_put); | |
930 | void bdi_destroy(struct backing_dev_info *bdi) | ||
931 | { | ||
932 | bdi_unregister(bdi); | ||
933 | bdi_exit(bdi); | ||
934 | } | ||
935 | EXPORT_SYMBOL(bdi_destroy); | ||
936 | |||
937 | /* | ||
938 | * For use from filesystems to quickly init and register a bdi associated | ||
939 | * with dirty writeback | ||
940 | */ | ||
941 | int bdi_setup_and_register(struct backing_dev_info *bdi, char *name) | ||
942 | { | ||
943 | int err; | ||
944 | |||
945 | bdi->name = name; | ||
946 | bdi->capabilities = 0; | ||
947 | err = bdi_init(bdi); | ||
948 | if (err) | ||
949 | return err; | ||
950 | |||
951 | err = bdi_register(bdi, NULL, "%.28s-%ld", name, | ||
952 | atomic_long_inc_return(&bdi_seq)); | ||
953 | if (err) { | ||
954 | bdi_destroy(bdi); | ||
955 | return err; | ||
956 | } | ||
957 | |||
958 | return 0; | ||
959 | } | ||
960 | EXPORT_SYMBOL(bdi_setup_and_register); | ||
961 | 961 | ||
962 | static wait_queue_head_t congestion_wqh[2] = { | 962 | static wait_queue_head_t congestion_wqh[2] = { |
963 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), | 963 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), |
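The mm/backing-dev.c changes above move bdi lifetime handling onto the existing refcount (bdi_get()/bdi_put()), drop the unused "parent" argument from bdi_register(), add a va_list variant, and delete bdi_setup_and_register(), bdi_register_dev() and bdi_destroy(). As a hedged sketch (not part of this patch), a driver using only the interfaces visible in the diff would now do something like the following; "struct example_dev" and its members are hypothetical.

	/* Allocate, register and later release a per-device bdi. */
	static int example_attach_bdi(struct example_dev *edev)
	{
		struct backing_dev_info *bdi;
		int ret;

		bdi = bdi_alloc_node(GFP_KERNEL, NUMA_NO_NODE);
		if (!bdi)
			return -ENOMEM;

		ret = bdi_register(bdi, "%u:%u",
				   MAJOR(edev->devt), MINOR(edev->devt));
		if (ret) {
			bdi_put(bdi);	/* release_bdi() also unregisters if needed */
			return ret;
		}

		edev->bdi = bdi;
		return 0;
	}

	static void example_detach_bdi(struct example_dev *edev)
	{
		bdi_unregister(edev->bdi);
		bdi_put(edev->bdi);	/* final put frees the bdi */
	}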
diff --git a/mm/cma.c b/mm/cma.c --- a/mm/cma.c +++ b/mm/cma.c | |||
@@ -53,6 +53,11 @@ unsigned long cma_get_size(const struct cma *cma) | |||
53 | return cma->count << PAGE_SHIFT; | 53 | return cma->count << PAGE_SHIFT; |
54 | } | 54 | } |
55 | 55 | ||
56 | const char *cma_get_name(const struct cma *cma) | ||
57 | { | ||
58 | return cma->name ? cma->name : "(undefined)"; | ||
59 | } | ||
60 | |||
56 | static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, | 61 | static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, |
57 | int align_order) | 62 | int align_order) |
58 | { | 63 | { |
@@ -168,6 +173,7 @@ core_initcall(cma_init_reserved_areas); | |||
168 | */ | 173 | */ |
169 | int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, | 174 | int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, |
170 | unsigned int order_per_bit, | 175 | unsigned int order_per_bit, |
176 | const char *name, | ||
171 | struct cma **res_cma) | 177 | struct cma **res_cma) |
172 | { | 178 | { |
173 | struct cma *cma; | 179 | struct cma *cma; |
@@ -198,6 +204,13 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, | |||
198 | * subsystems (like slab allocator) are available. | 204 | * subsystems (like slab allocator) are available. |
199 | */ | 205 | */ |
200 | cma = &cma_areas[cma_area_count]; | 206 | cma = &cma_areas[cma_area_count]; |
207 | if (name) { | ||
208 | cma->name = name; | ||
209 | } else { | ||
210 | cma->name = kasprintf(GFP_KERNEL, "cma%d\n", cma_area_count); | ||
211 | if (!cma->name) | ||
212 | return -ENOMEM; | ||
213 | } | ||
201 | cma->base_pfn = PFN_DOWN(base); | 214 | cma->base_pfn = PFN_DOWN(base); |
202 | cma->count = size >> PAGE_SHIFT; | 215 | cma->count = size >> PAGE_SHIFT; |
203 | cma->order_per_bit = order_per_bit; | 216 | cma->order_per_bit = order_per_bit; |
@@ -229,7 +242,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, | |||
229 | int __init cma_declare_contiguous(phys_addr_t base, | 242 | int __init cma_declare_contiguous(phys_addr_t base, |
230 | phys_addr_t size, phys_addr_t limit, | 243 | phys_addr_t size, phys_addr_t limit, |
231 | phys_addr_t alignment, unsigned int order_per_bit, | 244 | phys_addr_t alignment, unsigned int order_per_bit, |
232 | bool fixed, struct cma **res_cma) | 245 | bool fixed, const char *name, struct cma **res_cma) |
233 | { | 246 | { |
234 | phys_addr_t memblock_end = memblock_end_of_DRAM(); | 247 | phys_addr_t memblock_end = memblock_end_of_DRAM(); |
235 | phys_addr_t highmem_start; | 248 | phys_addr_t highmem_start; |
@@ -335,7 +348,7 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
335 | base = addr; | 348 | base = addr; |
336 | } | 349 | } |
337 | 350 | ||
338 | ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma); | 351 | ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); |
339 | if (ret) | 352 | if (ret) |
340 | goto err; | 353 | goto err; |
341 | 354 | ||
@@ -491,3 +504,17 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) | |||
491 | 504 | ||
492 | return true; | 505 | return true; |
493 | } | 506 | } |
507 | |||
508 | int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) | ||
509 | { | ||
510 | int i; | ||
511 | |||
512 | for (i = 0; i < cma_area_count; i++) { | ||
513 | int ret = it(&cma_areas[i], data); | ||
514 | |||
515 | if (ret) | ||
516 | return ret; | ||
517 | } | ||
518 | |||
519 | return 0; | ||
520 | } | ||
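The cma.c hunks above give each CMA area a name (cma_get_name()) and add a simple iterator, cma_for_each_area(). A hedged usage sketch of the new iterator (not part of the patch); the pr_info() format is purely illustrative:

	static int example_print_cma(struct cma *cma, void *data)
	{
		pr_info("cma %s: %lu bytes\n",
			cma_get_name(cma), cma_get_size(cma));
		return 0;	/* returning non-zero stops the walk early */
	}

	static void example_dump_cma_areas(void)
	{
		cma_for_each_area(example_print_cma, NULL);
	}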
diff --git a/mm/cma.h b/mm/cma.h --- a/mm/cma.h +++ b/mm/cma.h | |||
@@ -11,6 +11,7 @@ struct cma { | |||
11 | struct hlist_head mem_head; | 11 | struct hlist_head mem_head; |
12 | spinlock_t mem_head_lock; | 12 | spinlock_t mem_head_lock; |
13 | #endif | 13 | #endif |
14 | const char *name; | ||
14 | }; | 15 | }; |
15 | 16 | ||
16 | extern struct cma cma_areas[MAX_CMA_AREAS]; | 17 | extern struct cma cma_areas[MAX_CMA_AREAS]; |
diff --git a/mm/cma_debug.c b/mm/cma_debug.c index ffc0c3d0ae64..595b757bef72 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c | |||
@@ -167,7 +167,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) | |||
167 | char name[16]; | 167 | char name[16]; |
168 | int u32s; | 168 | int u32s; |
169 | 169 | ||
170 | sprintf(name, "cma-%d", idx); | 170 | sprintf(name, "cma-%s", cma->name); |
171 | 171 | ||
172 | tmp = debugfs_create_dir(name, cma_debugfs_root); | 172 | tmp = debugfs_create_dir(name, cma_debugfs_root); |
173 | 173 | ||
diff --git a/mm/compaction.c b/mm/compaction.c index 81e1eaa2a2cf..613c59e928cb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -89,11 +89,6 @@ static void map_pages(struct list_head *list) | |||
89 | list_splice(&tmp_list, list); | 89 | list_splice(&tmp_list, list); |
90 | } | 90 | } |
91 | 91 | ||
92 | static inline bool migrate_async_suitable(int migratetype) | ||
93 | { | ||
94 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; | ||
95 | } | ||
96 | |||
97 | #ifdef CONFIG_COMPACTION | 92 | #ifdef CONFIG_COMPACTION |
98 | 93 | ||
99 | int PageMovable(struct page *page) | 94 | int PageMovable(struct page *page) |
@@ -988,13 +983,26 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, | |||
988 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ | 983 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ |
989 | #ifdef CONFIG_COMPACTION | 984 | #ifdef CONFIG_COMPACTION |
990 | 985 | ||
991 | /* Returns true if the page is within a block suitable for migration to */ | 986 | static bool suitable_migration_source(struct compact_control *cc, |
992 | static bool suitable_migration_target(struct compact_control *cc, | ||
993 | struct page *page) | 987 | struct page *page) |
994 | { | 988 | { |
995 | if (cc->ignore_block_suitable) | 989 | int block_mt; |
990 | |||
991 | if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) | ||
996 | return true; | 992 | return true; |
997 | 993 | ||
994 | block_mt = get_pageblock_migratetype(page); | ||
995 | |||
996 | if (cc->migratetype == MIGRATE_MOVABLE) | ||
997 | return is_migrate_movable(block_mt); | ||
998 | else | ||
999 | return block_mt == cc->migratetype; | ||
1000 | } | ||
1001 | |||
1002 | /* Returns true if the page is within a block suitable for migration to */ | ||
1003 | static bool suitable_migration_target(struct compact_control *cc, | ||
1004 | struct page *page) | ||
1005 | { | ||
998 | /* If the page is a large free page, then disallow migration */ | 1006 | /* If the page is a large free page, then disallow migration */ |
999 | if (PageBuddy(page)) { | 1007 | if (PageBuddy(page)) { |
1000 | /* | 1008 | /* |
@@ -1006,8 +1014,11 @@ static bool suitable_migration_target(struct compact_control *cc, | |||
1006 | return false; | 1014 | return false; |
1007 | } | 1015 | } |
1008 | 1016 | ||
1017 | if (cc->ignore_block_suitable) | ||
1018 | return true; | ||
1019 | |||
1009 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | 1020 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ |
1010 | if (migrate_async_suitable(get_pageblock_migratetype(page))) | 1021 | if (is_migrate_movable(get_pageblock_migratetype(page))) |
1011 | return true; | 1022 | return true; |
1012 | 1023 | ||
1013 | /* Otherwise skip the block */ | 1024 | /* Otherwise skip the block */ |
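For reference, the migrate_async_suitable() helper removed above is replaced by the existing is_migrate_movable() test from include/linux/mmzone.h, which (paraphrased here, not part of this patch) amounts to the same check:

	static inline bool is_migrate_movable(int mt)
	{
		return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE;
	}

The new suitable_migration_source() additionally restricts async direct compaction to source pageblocks whose migratetype matches the allocation's cc->migratetype, instead of always limiting it to movable blocks.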
@@ -1242,8 +1253,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
1242 | * Async compaction is optimistic to see if the minimum amount | 1253 | * Async compaction is optimistic to see if the minimum amount |
1243 | * of work satisfies the allocation. | 1254 | * of work satisfies the allocation. |
1244 | */ | 1255 | */ |
1245 | if (cc->mode == MIGRATE_ASYNC && | 1256 | if (!suitable_migration_source(cc, page)) |
1246 | !migrate_async_suitable(get_pageblock_migratetype(page))) | ||
1247 | continue; | 1257 | continue; |
1248 | 1258 | ||
1249 | /* Perform the isolation */ | 1259 | /* Perform the isolation */ |
@@ -1276,11 +1286,11 @@ static inline bool is_via_compact_memory(int order) | |||
1276 | return order == -1; | 1286 | return order == -1; |
1277 | } | 1287 | } |
1278 | 1288 | ||
1279 | static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc, | 1289 | static enum compact_result __compact_finished(struct zone *zone, |
1280 | const int migratetype) | 1290 | struct compact_control *cc) |
1281 | { | 1291 | { |
1282 | unsigned int order; | 1292 | unsigned int order; |
1283 | unsigned long watermark; | 1293 | const int migratetype = cc->migratetype; |
1284 | 1294 | ||
1285 | if (cc->contended || fatal_signal_pending(current)) | 1295 | if (cc->contended || fatal_signal_pending(current)) |
1286 | return COMPACT_CONTENDED; | 1296 | return COMPACT_CONTENDED; |
@@ -1308,12 +1318,16 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ | |||
1308 | if (is_via_compact_memory(cc->order)) | 1318 | if (is_via_compact_memory(cc->order)) |
1309 | return COMPACT_CONTINUE; | 1319 | return COMPACT_CONTINUE; |
1310 | 1320 | ||
1311 | /* Compaction run is not finished if the watermark is not met */ | 1321 | if (cc->finishing_block) { |
1312 | watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK]; | 1322 | /* |
1313 | 1323 | * We have finished the pageblock, but better check again that | |
1314 | if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, | 1324 | * we really succeeded. |
1315 | cc->alloc_flags)) | 1325 | */ |
1316 | return COMPACT_CONTINUE; | 1326 | if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) |
1327 | cc->finishing_block = false; | ||
1328 | else | ||
1329 | return COMPACT_CONTINUE; | ||
1330 | } | ||
1317 | 1331 | ||
1318 | /* Direct compactor: Is a suitable page free? */ | 1332 | /* Direct compactor: Is a suitable page free? */ |
1319 | for (order = cc->order; order < MAX_ORDER; order++) { | 1333 | for (order = cc->order; order < MAX_ORDER; order++) { |
@@ -1335,20 +1349,40 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ | |||
1335 | * other migratetype buddy lists. | 1349 | * other migratetype buddy lists. |
1336 | */ | 1350 | */ |
1337 | if (find_suitable_fallback(area, order, migratetype, | 1351 | if (find_suitable_fallback(area, order, migratetype, |
1338 | true, &can_steal) != -1) | 1352 | true, &can_steal) != -1) { |
1339 | return COMPACT_SUCCESS; | 1353 | |
1354 | /* movable pages are OK in any pageblock */ | ||
1355 | if (migratetype == MIGRATE_MOVABLE) | ||
1356 | return COMPACT_SUCCESS; | ||
1357 | |||
1358 | /* | ||
1359 | * We are stealing for a non-movable allocation. Make | ||
1360 | * sure we finish compacting the current pageblock | ||
1361 | * first so it is as free as possible and we won't | ||
1362 | * have to steal another one soon. This only applies | ||
1363 | * to sync compaction, as async compaction operates | ||
1364 | * on pageblocks of the same migratetype. | ||
1365 | */ | ||
1366 | if (cc->mode == MIGRATE_ASYNC || | ||
1367 | IS_ALIGNED(cc->migrate_pfn, | ||
1368 | pageblock_nr_pages)) { | ||
1369 | return COMPACT_SUCCESS; | ||
1370 | } | ||
1371 | |||
1372 | cc->finishing_block = true; | ||
1373 | return COMPACT_CONTINUE; | ||
1374 | } | ||
1340 | } | 1375 | } |
1341 | 1376 | ||
1342 | return COMPACT_NO_SUITABLE_PAGE; | 1377 | return COMPACT_NO_SUITABLE_PAGE; |
1343 | } | 1378 | } |
1344 | 1379 | ||
1345 | static enum compact_result compact_finished(struct zone *zone, | 1380 | static enum compact_result compact_finished(struct zone *zone, |
1346 | struct compact_control *cc, | 1381 | struct compact_control *cc) |
1347 | const int migratetype) | ||
1348 | { | 1382 | { |
1349 | int ret; | 1383 | int ret; |
1350 | 1384 | ||
1351 | ret = __compact_finished(zone, cc, migratetype); | 1385 | ret = __compact_finished(zone, cc); |
1352 | trace_mm_compaction_finished(zone, cc->order, ret); | 1386 | trace_mm_compaction_finished(zone, cc->order, ret); |
1353 | if (ret == COMPACT_NO_SUITABLE_PAGE) | 1387 | if (ret == COMPACT_NO_SUITABLE_PAGE) |
1354 | ret = COMPACT_CONTINUE; | 1388 | ret = COMPACT_CONTINUE; |
@@ -1481,9 +1515,9 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro | |||
1481 | enum compact_result ret; | 1515 | enum compact_result ret; |
1482 | unsigned long start_pfn = zone->zone_start_pfn; | 1516 | unsigned long start_pfn = zone->zone_start_pfn; |
1483 | unsigned long end_pfn = zone_end_pfn(zone); | 1517 | unsigned long end_pfn = zone_end_pfn(zone); |
1484 | const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); | ||
1485 | const bool sync = cc->mode != MIGRATE_ASYNC; | 1518 | const bool sync = cc->mode != MIGRATE_ASYNC; |
1486 | 1519 | ||
1520 | cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); | ||
1487 | ret = compaction_suitable(zone, cc->order, cc->alloc_flags, | 1521 | ret = compaction_suitable(zone, cc->order, cc->alloc_flags, |
1488 | cc->classzone_idx); | 1522 | cc->classzone_idx); |
1489 | /* Compaction is likely to fail */ | 1523 | /* Compaction is likely to fail */ |
@@ -1533,8 +1567,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro | |||
1533 | 1567 | ||
1534 | migrate_prep_local(); | 1568 | migrate_prep_local(); |
1535 | 1569 | ||
1536 | while ((ret = compact_finished(zone, cc, migratetype)) == | 1570 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { |
1537 | COMPACT_CONTINUE) { | ||
1538 | int err; | 1571 | int err; |
1539 | 1572 | ||
1540 | switch (isolate_migratepages(zone, cc)) { | 1573 | switch (isolate_migratepages(zone, cc)) { |
diff --git a/mm/filemap.c b/mm/filemap.c index d6e67be1802e..6f1be573a5e6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -519,7 +519,7 @@ EXPORT_SYMBOL(filemap_write_and_wait); | |||
519 | * | 519 | * |
520 | * Write out and wait upon file offsets lstart->lend, inclusive. | 520 | * Write out and wait upon file offsets lstart->lend, inclusive. |
521 | * | 521 | * |
522 | * Note that `lend' is inclusive (describes the last byte to be written) so | 522 | * Note that @lend is inclusive (describes the last byte to be written) so |
523 | * that this function can be used to write to the very end-of-file (end = -1). | 523 | * that this function can be used to write to the very end-of-file (end = -1). |
524 | */ | 524 | */ |
525 | int filemap_write_and_wait_range(struct address_space *mapping, | 525 | int filemap_write_and_wait_range(struct address_space *mapping, |
@@ -1277,12 +1277,14 @@ EXPORT_SYMBOL(find_lock_entry); | |||
1277 | * | 1277 | * |
1278 | * PCG flags modify how the page is returned. | 1278 | * PCG flags modify how the page is returned. |
1279 | * | 1279 | * |
1280 | * FGP_ACCESSED: the page will be marked accessed | 1280 | * @fgp_flags can be: |
1281 | * FGP_LOCK: Page is return locked | 1281 | * |
1282 | * FGP_CREAT: If page is not present then a new page is allocated using | 1282 | * - FGP_ACCESSED: the page will be marked accessed |
1283 | * @gfp_mask and added to the page cache and the VM's LRU | 1283 | * - FGP_LOCK: Page is return locked |
1284 | * list. The page is returned locked and with an increased | 1284 | * - FGP_CREAT: If page is not present then a new page is allocated using |
1285 | * refcount. Otherwise, %NULL is returned. | 1285 | * @gfp_mask and added to the page cache and the VM's LRU |
1286 | * list. The page is returned locked and with an increased | ||
1287 | * refcount. Otherwise, NULL is returned. | ||
1286 | * | 1288 | * |
1287 | * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even | 1289 | * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even |
1288 | * if the GFP flags specified for FGP_CREAT are atomic. | 1290 | * if the GFP flags specified for FGP_CREAT are atomic. |
@@ -2202,12 +2204,12 @@ int filemap_fault(struct vm_fault *vmf) | |||
2202 | struct file_ra_state *ra = &file->f_ra; | 2204 | struct file_ra_state *ra = &file->f_ra; |
2203 | struct inode *inode = mapping->host; | 2205 | struct inode *inode = mapping->host; |
2204 | pgoff_t offset = vmf->pgoff; | 2206 | pgoff_t offset = vmf->pgoff; |
2207 | pgoff_t max_off; | ||
2205 | struct page *page; | 2208 | struct page *page; |
2206 | loff_t size; | ||
2207 | int ret = 0; | 2209 | int ret = 0; |
2208 | 2210 | ||
2209 | size = round_up(i_size_read(inode), PAGE_SIZE); | 2211 | max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); |
2210 | if (offset >= size >> PAGE_SHIFT) | 2212 | if (unlikely(offset >= max_off)) |
2211 | return VM_FAULT_SIGBUS; | 2213 | return VM_FAULT_SIGBUS; |
2212 | 2214 | ||
2213 | /* | 2215 | /* |
@@ -2256,8 +2258,8 @@ retry_find: | |||
2256 | * Found the page and have a reference on it. | 2258 | * Found the page and have a reference on it. |
2257 | * We must recheck i_size under page lock. | 2259 | * We must recheck i_size under page lock. |
2258 | */ | 2260 | */ |
2259 | size = round_up(i_size_read(inode), PAGE_SIZE); | 2261 | max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); |
2260 | if (unlikely(offset >= size >> PAGE_SHIFT)) { | 2262 | if (unlikely(offset >= max_off)) { |
2261 | unlock_page(page); | 2263 | unlock_page(page); |
2262 | put_page(page); | 2264 | put_page(page); |
2263 | return VM_FAULT_SIGBUS; | 2265 | return VM_FAULT_SIGBUS; |
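A worked example for the i_size check above (illustrative, not from the patch): with PAGE_SIZE == 4096 and i_size_read(inode) == 5000, DIV_ROUND_UP(5000, 4096) == 2, so page offsets 0 and 1 are valid and any vmf->pgoff >= 2 faults with VM_FAULT_SIGBUS. The old round_up(i_size, PAGE_SIZE) >> PAGE_SHIFT computed the same bound; the new form simply expresses it directly as a page count in pgoff_t.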
@@ -2323,7 +2325,7 @@ void filemap_map_pages(struct vm_fault *vmf, | |||
2323 | struct file *file = vmf->vma->vm_file; | 2325 | struct file *file = vmf->vma->vm_file; |
2324 | struct address_space *mapping = file->f_mapping; | 2326 | struct address_space *mapping = file->f_mapping; |
2325 | pgoff_t last_pgoff = start_pgoff; | 2327 | pgoff_t last_pgoff = start_pgoff; |
2326 | loff_t size; | 2328 | unsigned long max_idx; |
2327 | struct page *head, *page; | 2329 | struct page *head, *page; |
2328 | 2330 | ||
2329 | rcu_read_lock(); | 2331 | rcu_read_lock(); |
@@ -2369,8 +2371,8 @@ repeat: | |||
2369 | if (page->mapping != mapping || !PageUptodate(page)) | 2371 | if (page->mapping != mapping || !PageUptodate(page)) |
2370 | goto unlock; | 2372 | goto unlock; |
2371 | 2373 | ||
2372 | size = round_up(i_size_read(mapping->host), PAGE_SIZE); | 2374 | max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); |
2373 | if (page->index >= size >> PAGE_SHIFT) | 2375 | if (page->index >= max_idx) |
2374 | goto unlock; | 2376 | goto unlock; |
2375 | 2377 | ||
2376 | if (file->f_ra.mmap_miss > 0) | 2378 | if (file->f_ra.mmap_miss > 0) |
@@ -2718,18 +2720,16 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) | |||
2718 | * about to write. We do this *before* the write so that we can return | 2720 | * about to write. We do this *before* the write so that we can return |
2719 | * without clobbering -EIOCBQUEUED from ->direct_IO(). | 2721 | * without clobbering -EIOCBQUEUED from ->direct_IO(). |
2720 | */ | 2722 | */ |
2721 | if (mapping->nrpages) { | 2723 | written = invalidate_inode_pages2_range(mapping, |
2722 | written = invalidate_inode_pages2_range(mapping, | ||
2723 | pos >> PAGE_SHIFT, end); | 2724 | pos >> PAGE_SHIFT, end); |
2724 | /* | 2725 | /* |
2725 | * If a page can not be invalidated, return 0 to fall back | 2726 | * If a page can not be invalidated, return 0 to fall back |
2726 | * to buffered write. | 2727 | * to buffered write. |
2727 | */ | 2728 | */ |
2728 | if (written) { | 2729 | if (written) { |
2729 | if (written == -EBUSY) | 2730 | if (written == -EBUSY) |
2730 | return 0; | 2731 | return 0; |
2731 | goto out; | 2732 | goto out; |
2732 | } | ||
2733 | } | 2733 | } |
2734 | 2734 | ||
2735 | written = mapping->a_ops->direct_IO(iocb, from); | 2735 | written = mapping->a_ops->direct_IO(iocb, from); |
@@ -2742,10 +2742,8 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) | |||
2742 | * so we don't support it 100%. If this invalidation | 2742 | * so we don't support it 100%. If this invalidation |
2743 | * fails, tough, the write still worked... | 2743 | * fails, tough, the write still worked... |
2744 | */ | 2744 | */ |
2745 | if (mapping->nrpages) { | 2745 | invalidate_inode_pages2_range(mapping, |
2746 | invalidate_inode_pages2_range(mapping, | 2746 | pos >> PAGE_SHIFT, end); |
2747 | pos >> PAGE_SHIFT, end); | ||
2748 | } | ||
2749 | 2747 | ||
2750 | if (written > 0) { | 2748 | if (written > 0) { |
2751 | pos += written; | 2749 | pos += written; |
@@ -2793,12 +2791,6 @@ ssize_t generic_perform_write(struct file *file, | |||
2793 | ssize_t written = 0; | 2791 | ssize_t written = 0; |
2794 | unsigned int flags = 0; | 2792 | unsigned int flags = 0; |
2795 | 2793 | ||
2796 | /* | ||
2797 | * Copies from kernel address space cannot fail (NFSD is a big user). | ||
2798 | */ | ||
2799 | if (!iter_is_iovec(i)) | ||
2800 | flags |= AOP_FLAG_UNINTERRUPTIBLE; | ||
2801 | |||
2802 | do { | 2794 | do { |
2803 | struct page *page; | 2795 | struct page *page; |
2804 | unsigned long offset; /* Offset into pagecache page */ | 2796 | unsigned long offset; /* Offset into pagecache page */ |
@@ -3000,7 +2992,7 @@ EXPORT_SYMBOL(generic_file_write_iter); | |||
3000 | * @gfp_mask: memory allocation flags (and I/O mode) | 2992 | * @gfp_mask: memory allocation flags (and I/O mode) |
3001 | * | 2993 | * |
3002 | * The address_space is to try to release any data against the page | 2994 | * The address_space is to try to release any data against the page |
3003 | * (presumably at page->private). If the release was successful, return `1'. | 2995 | * (presumably at page->private). If the release was successful, return '1'. |
3004 | * Otherwise return zero. | 2996 | * Otherwise return zero. |
3005 | * | 2997 | * |
3006 | * This may also be called if PG_fscache is set on a page, indicating that the | 2998 | * This may also be called if PG_fscache is set on a page, indicating that the |
diff --git a/mm/frame_vector.c b/mm/frame_vector.c index db77dcb38afd..72ebec18629c 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c | |||
@@ -200,10 +200,7 @@ struct frame_vector *frame_vector_create(unsigned int nr_frames) | |||
200 | * Avoid higher order allocations, use vmalloc instead. It should | 200 | * Avoid higher order allocations, use vmalloc instead. It should |
201 | * be rare anyway. | 201 | * be rare anyway. |
202 | */ | 202 | */ |
203 | if (size <= PAGE_SIZE) | 203 | vec = kvmalloc(size, GFP_KERNEL); |
204 | vec = kmalloc(size, GFP_KERNEL); | ||
205 | else | ||
206 | vec = vmalloc(size); | ||
207 | if (!vec) | 204 | if (!vec) |
208 | return NULL; | 205 | return NULL; |
209 | vec->nr_allocated = nr_frames; | 206 | vec->nr_allocated = nr_frames; |
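The frame_vector.c change above replaces the open-coded "kmalloc() for small, vmalloc() for large" pattern with kvmalloc(). A minimal hedged sketch of the general pattern (not part of this patch); the matching free is kvfree(), which handles both backing allocators:

	static void *example_big_alloc(size_t size)
	{
		/* tries kmalloc() first, falls back to vmalloc() if needed */
		return kvmalloc(size, GFP_KERNEL);
	}

	static void example_big_free(void *p)
	{
		kvfree(p);	/* correct for both kmalloc()ed and vmalloc()ed memory */
	}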
diff --git a/mm/gup.c b/mm/gup.c --- a/mm/gup.c +++ b/mm/gup.c | |||
@@ -1189,34 +1189,57 @@ struct page *get_dump_page(unsigned long addr) | |||
1189 | */ | 1189 | */ |
1190 | #ifdef CONFIG_HAVE_GENERIC_RCU_GUP | 1190 | #ifdef CONFIG_HAVE_GENERIC_RCU_GUP |
1191 | 1191 | ||
1192 | #ifndef gup_get_pte | ||
1193 | /* | ||
1194 | * We assume that the PTE can be read atomically. If this is not the case for | ||
1195 | * your architecture, please provide the helper. | ||
1196 | */ | ||
1197 | static inline pte_t gup_get_pte(pte_t *ptep) | ||
1198 | { | ||
1199 | return READ_ONCE(*ptep); | ||
1200 | } | ||
1201 | #endif | ||
1202 | |||
1203 | static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) | ||
1204 | { | ||
1205 | while ((*nr) - nr_start) { | ||
1206 | struct page *page = pages[--(*nr)]; | ||
1207 | |||
1208 | ClearPageReferenced(page); | ||
1209 | put_page(page); | ||
1210 | } | ||
1211 | } | ||
1212 | |||
1192 | #ifdef __HAVE_ARCH_PTE_SPECIAL | 1213 | #ifdef __HAVE_ARCH_PTE_SPECIAL |
1193 | static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | 1214 | static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, |
1194 | int write, struct page **pages, int *nr) | 1215 | int write, struct page **pages, int *nr) |
1195 | { | 1216 | { |
1217 | struct dev_pagemap *pgmap = NULL; | ||
1218 | int nr_start = *nr, ret = 0; | ||
1196 | pte_t *ptep, *ptem; | 1219 | pte_t *ptep, *ptem; |
1197 | int ret = 0; | ||
1198 | 1220 | ||
1199 | ptem = ptep = pte_offset_map(&pmd, addr); | 1221 | ptem = ptep = pte_offset_map(&pmd, addr); |
1200 | do { | 1222 | do { |
1201 | /* | 1223 | pte_t pte = gup_get_pte(ptep); |
1202 | * In the line below we are assuming that the pte can be read | ||
1203 | * atomically. If this is not the case for your architecture, | ||
1204 | * please wrap this in a helper function! | ||
1205 | * | ||
1206 | * for an example see gup_get_pte in arch/x86/mm/gup.c | ||
1207 | */ | ||
1208 | pte_t pte = READ_ONCE(*ptep); | ||
1209 | struct page *head, *page; | 1224 | struct page *head, *page; |
1210 | 1225 | ||
1211 | /* | 1226 | /* |
1212 | * Similar to the PMD case below, NUMA hinting must take slow | 1227 | * Similar to the PMD case below, NUMA hinting must take slow |
1213 | * path using the pte_protnone check. | 1228 | * path using the pte_protnone check. |
1214 | */ | 1229 | */ |
1215 | if (!pte_present(pte) || pte_special(pte) || | 1230 | if (pte_protnone(pte)) |
1216 | pte_protnone(pte) || (write && !pte_write(pte))) | ||
1217 | goto pte_unmap; | 1231 | goto pte_unmap; |
1218 | 1232 | ||
1219 | if (!arch_pte_access_permitted(pte, write)) | 1233 | if (!pte_access_permitted(pte, write)) |
1234 | goto pte_unmap; | ||
1235 | |||
1236 | if (pte_devmap(pte)) { | ||
1237 | pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); | ||
1238 | if (unlikely(!pgmap)) { | ||
1239 | undo_dev_pagemap(nr, nr_start, pages); | ||
1240 | goto pte_unmap; | ||
1241 | } | ||
1242 | } else if (pte_special(pte)) | ||
1220 | goto pte_unmap; | 1243 | goto pte_unmap; |
1221 | 1244 | ||
1222 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 1245 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
@@ -1232,6 +1255,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | |||
1232 | } | 1255 | } |
1233 | 1256 | ||
1234 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | 1257 | VM_BUG_ON_PAGE(compound_head(page) != head, page); |
1258 | |||
1259 | put_dev_pagemap(pgmap); | ||
1260 | SetPageReferenced(page); | ||
1235 | pages[*nr] = page; | 1261 | pages[*nr] = page; |
1236 | (*nr)++; | 1262 | (*nr)++; |
1237 | 1263 | ||
@@ -1261,15 +1287,76 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | |||
1261 | } | 1287 | } |
1262 | #endif /* __HAVE_ARCH_PTE_SPECIAL */ | 1288 | #endif /* __HAVE_ARCH_PTE_SPECIAL */ |
1263 | 1289 | ||
1290 | #ifdef __HAVE_ARCH_PTE_DEVMAP | ||
1291 | static int __gup_device_huge(unsigned long pfn, unsigned long addr, | ||
1292 | unsigned long end, struct page **pages, int *nr) | ||
1293 | { | ||
1294 | int nr_start = *nr; | ||
1295 | struct dev_pagemap *pgmap = NULL; | ||
1296 | |||
1297 | do { | ||
1298 | struct page *page = pfn_to_page(pfn); | ||
1299 | |||
1300 | pgmap = get_dev_pagemap(pfn, pgmap); | ||
1301 | if (unlikely(!pgmap)) { | ||
1302 | undo_dev_pagemap(nr, nr_start, pages); | ||
1303 | return 0; | ||
1304 | } | ||
1305 | SetPageReferenced(page); | ||
1306 | pages[*nr] = page; | ||
1307 | get_page(page); | ||
1308 | put_dev_pagemap(pgmap); | ||
1309 | (*nr)++; | ||
1310 | pfn++; | ||
1311 | } while (addr += PAGE_SIZE, addr != end); | ||
1312 | return 1; | ||
1313 | } | ||
1314 | |||
1315 | static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, | ||
1316 | unsigned long end, struct page **pages, int *nr) | ||
1317 | { | ||
1318 | unsigned long fault_pfn; | ||
1319 | |||
1320 | fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); | ||
1321 | return __gup_device_huge(fault_pfn, addr, end, pages, nr); | ||
1322 | } | ||
1323 | |||
1324 | static int __gup_device_huge_pud(pud_t pud, unsigned long addr, | ||
1325 | unsigned long end, struct page **pages, int *nr) | ||
1326 | { | ||
1327 | unsigned long fault_pfn; | ||
1328 | |||
1329 | fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); | ||
1330 | return __gup_device_huge(fault_pfn, addr, end, pages, nr); | ||
1331 | } | ||
1332 | #else | ||
1333 | static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, | ||
1334 | unsigned long end, struct page **pages, int *nr) | ||
1335 | { | ||
1336 | BUILD_BUG(); | ||
1337 | return 0; | ||
1338 | } | ||
1339 | |||
1340 | static int __gup_device_huge_pud(pud_t pud, unsigned long addr, | ||
1341 | unsigned long end, struct page **pages, int *nr) | ||
1342 | { | ||
1343 | BUILD_BUG(); | ||
1344 | return 0; | ||
1345 | } | ||
1346 | #endif | ||
1347 | |||
1264 | static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, | 1348 | static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, |
1265 | unsigned long end, int write, struct page **pages, int *nr) | 1349 | unsigned long end, int write, struct page **pages, int *nr) |
1266 | { | 1350 | { |
1267 | struct page *head, *page; | 1351 | struct page *head, *page; |
1268 | int refs; | 1352 | int refs; |
1269 | 1353 | ||
1270 | if (write && !pmd_write(orig)) | 1354 | if (!pmd_access_permitted(orig, write)) |
1271 | return 0; | 1355 | return 0; |
1272 | 1356 | ||
1357 | if (pmd_devmap(orig)) | ||
1358 | return __gup_device_huge_pmd(orig, addr, end, pages, nr); | ||
1359 | |||
1273 | refs = 0; | 1360 | refs = 0; |
1274 | head = pmd_page(orig); | 1361 | head = pmd_page(orig); |
1275 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); | 1362 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); |
@@ -1293,6 +1380,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, | |||
1293 | return 0; | 1380 | return 0; |
1294 | } | 1381 | } |
1295 | 1382 | ||
1383 | SetPageReferenced(head); | ||
1296 | return 1; | 1384 | return 1; |
1297 | } | 1385 | } |
1298 | 1386 | ||
@@ -1302,9 +1390,12 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, | |||
1302 | struct page *head, *page; | 1390 | struct page *head, *page; |
1303 | int refs; | 1391 | int refs; |
1304 | 1392 | ||
1305 | if (write && !pud_write(orig)) | 1393 | if (!pud_access_permitted(orig, write)) |
1306 | return 0; | 1394 | return 0; |
1307 | 1395 | ||
1396 | if (pud_devmap(orig)) | ||
1397 | return __gup_device_huge_pud(orig, addr, end, pages, nr); | ||
1398 | |||
1308 | refs = 0; | 1399 | refs = 0; |
1309 | head = pud_page(orig); | 1400 | head = pud_page(orig); |
1310 | page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); | 1401 | page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); |
@@ -1328,6 +1419,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, | |||
1328 | return 0; | 1419 | return 0; |
1329 | } | 1420 | } |
1330 | 1421 | ||
1422 | SetPageReferenced(head); | ||
1331 | return 1; | 1423 | return 1; |
1332 | } | 1424 | } |
1333 | 1425 | ||
@@ -1338,9 +1430,10 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, | |||
1338 | int refs; | 1430 | int refs; |
1339 | struct page *head, *page; | 1431 | struct page *head, *page; |
1340 | 1432 | ||
1341 | if (write && !pgd_write(orig)) | 1433 | if (!pgd_access_permitted(orig, write)) |
1342 | return 0; | 1434 | return 0; |
1343 | 1435 | ||
1436 | BUILD_BUG_ON(pgd_devmap(orig)); | ||
1344 | refs = 0; | 1437 | refs = 0; |
1345 | head = pgd_page(orig); | 1438 | head = pgd_page(orig); |
1346 | page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); | 1439 | page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); |
@@ -1364,6 +1457,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, | |||
1364 | return 0; | 1457 | return 0; |
1365 | } | 1458 | } |
1366 | 1459 | ||
1460 | SetPageReferenced(head); | ||
1367 | return 1; | 1461 | return 1; |
1368 | } | 1462 | } |
1369 | 1463 | ||
@@ -1481,7 +1575,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
1481 | end = start + len; | 1575 | end = start + len; |
1482 | 1576 | ||
1483 | if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, | 1577 | if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, |
1484 | start, len))) | 1578 | (void __user *)start, len))) |
1485 | return 0; | 1579 | return 0; |
1486 | 1580 | ||
1487 | /* | 1581 | /* |
@@ -1520,6 +1614,21 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
1520 | return nr; | 1614 | return nr; |
1521 | } | 1615 | } |
1522 | 1616 | ||
1617 | #ifndef gup_fast_permitted | ||
1618 | /* | ||
1619 | * Check if it's allowed to use __get_user_pages_fast() for the range, or | ||
1620 | * we need to fall back to the slow version: | ||
1621 | */ | ||
1622 | bool gup_fast_permitted(unsigned long start, int nr_pages, int write) | ||
1623 | { | ||
1624 | unsigned long len, end; | ||
1625 | |||
1626 | len = (unsigned long) nr_pages << PAGE_SHIFT; | ||
1627 | end = start + len; | ||
1628 | return end >= start; | ||
1629 | } | ||
1630 | #endif | ||
1631 | |||
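The #ifndef block above supplies a generic gup_fast_permitted() that only rejects address ranges which wrap around; get_user_pages_fast() now calls it before attempting the lockless walk. As a hedged illustration (not taken from any particular architecture), an arch needing a stricter check could override it from its headers along these lines; the TASK_SIZE bound here is purely an example:

	#define gup_fast_permitted gup_fast_permitted
	static inline bool gup_fast_permitted(unsigned long start, int nr_pages,
					      int write)
	{
		unsigned long len = (unsigned long)nr_pages << PAGE_SHIFT;
		unsigned long end = start + len;

		return end >= start && end <= TASK_SIZE;
	}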
1523 | /** | 1632 | /** |
1524 | * get_user_pages_fast() - pin user pages in memory | 1633 | * get_user_pages_fast() - pin user pages in memory |
1525 | * @start: starting user address | 1634 | * @start: starting user address |
@@ -1539,11 +1648,14 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
1539 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | 1648 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, |
1540 | struct page **pages) | 1649 | struct page **pages) |
1541 | { | 1650 | { |
1542 | int nr, ret; | 1651 | int nr = 0, ret = 0; |
1543 | 1652 | ||
1544 | start &= PAGE_MASK; | 1653 | start &= PAGE_MASK; |
1545 | nr = __get_user_pages_fast(start, nr_pages, write, pages); | 1654 | |
1546 | ret = nr; | 1655 | if (gup_fast_permitted(start, nr_pages, write)) { |
1656 | nr = __get_user_pages_fast(start, nr_pages, write, pages); | ||
1657 | ret = nr; | ||
1658 | } | ||
1547 | 1659 | ||
1548 | if (nr < nr_pages) { | 1660 | if (nr < nr_pages) { |
1549 | /* Try to get the remaining pages with get_user_pages */ | 1661 | /* Try to get the remaining pages with get_user_pages */ |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f3c4f9d22821..a84909cf20d3 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -715,7 +715,8 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf) | |||
715 | } | 715 | } |
716 | 716 | ||
717 | static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, | 717 | static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, |
718 | pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write) | 718 | pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, |
719 | pgtable_t pgtable) | ||
719 | { | 720 | { |
720 | struct mm_struct *mm = vma->vm_mm; | 721 | struct mm_struct *mm = vma->vm_mm; |
721 | pmd_t entry; | 722 | pmd_t entry; |
@@ -729,6 +730,12 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, | |||
729 | entry = pmd_mkyoung(pmd_mkdirty(entry)); | 730 | entry = pmd_mkyoung(pmd_mkdirty(entry)); |
730 | entry = maybe_pmd_mkwrite(entry, vma); | 731 | entry = maybe_pmd_mkwrite(entry, vma); |
731 | } | 732 | } |
733 | |||
734 | if (pgtable) { | ||
735 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | ||
736 | atomic_long_inc(&mm->nr_ptes); | ||
737 | } | ||
738 | |||
732 | set_pmd_at(mm, addr, pmd, entry); | 739 | set_pmd_at(mm, addr, pmd, entry); |
733 | update_mmu_cache_pmd(vma, addr, pmd); | 740 | update_mmu_cache_pmd(vma, addr, pmd); |
734 | spin_unlock(ptl); | 741 | spin_unlock(ptl); |
@@ -738,6 +745,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, | |||
738 | pmd_t *pmd, pfn_t pfn, bool write) | 745 | pmd_t *pmd, pfn_t pfn, bool write) |
739 | { | 746 | { |
740 | pgprot_t pgprot = vma->vm_page_prot; | 747 | pgprot_t pgprot = vma->vm_page_prot; |
748 | pgtable_t pgtable = NULL; | ||
741 | /* | 749 | /* |
742 | * If we had pmd_special, we could avoid all these restrictions, | 750 | * If we had pmd_special, we could avoid all these restrictions, |
743 | * but we need to be consistent with PTEs and architectures that | 751 | * but we need to be consistent with PTEs and architectures that |
@@ -752,9 +760,15 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, | |||
752 | if (addr < vma->vm_start || addr >= vma->vm_end) | 760 | if (addr < vma->vm_start || addr >= vma->vm_end) |
753 | return VM_FAULT_SIGBUS; | 761 | return VM_FAULT_SIGBUS; |
754 | 762 | ||
763 | if (arch_needs_pgtable_deposit()) { | ||
764 | pgtable = pte_alloc_one(vma->vm_mm, addr); | ||
765 | if (!pgtable) | ||
766 | return VM_FAULT_OOM; | ||
767 | } | ||
768 | |||
755 | track_pfn_insert(vma, &pgprot, pfn); | 769 | track_pfn_insert(vma, &pgprot, pfn); |
756 | 770 | ||
757 | insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write); | 771 | insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable); |
758 | return VM_FAULT_NOPAGE; | 772 | return VM_FAULT_NOPAGE; |
759 | } | 773 | } |
760 | EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); | 774 | EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); |
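Annotation: on architectures where arch_needs_pgtable_deposit() is true, vmf_insert_pfn_pmd() now pre-allocates a PTE page and insert_pfn_pmd() deposits it alongside the huge PMD (bumping mm->nr_ptes), so a later split or zap of the DAX mapping has a table to withdraw. A standalone sketch of that deposit/withdraw bookkeeping, using toy stand-ins rather than the kernel's pgtable_t and mm_struct:

#include <stdio.h>
#include <stdlib.h>

/* Toy stand-ins for the deposited-pgtable list and mm->nr_ptes. */
struct pgtable { struct pgtable *next; };
struct mm_model { struct pgtable *deposited; long nr_ptes; };

static void deposit(struct mm_model *mm, struct pgtable *pt)
{
	pt->next = mm->deposited;	/* pgtable_trans_huge_deposit() analogue */
	mm->deposited = pt;
	mm->nr_ptes++;			/* atomic_long_inc(&mm->nr_ptes) analogue */
}

static struct pgtable *withdraw(struct mm_model *mm)
{
	struct pgtable *pt = mm->deposited;

	if (pt) {
		mm->deposited = pt->next;
		mm->nr_ptes--;		/* given back when the PMD is zapped or split */
	}
	return pt;
}

int main(void)
{
	struct mm_model mm = { 0 };
	struct pgtable *pt = calloc(1, sizeof(*pt));

	deposit(&mm, pt);			/* at vmf_insert_pfn_pmd() time */
	printf("deposited: %ld\n", mm.nr_ptes);	/* 1 */
	free(withdraw(&mm));			/* at zap/split time */
	printf("deposited: %ld\n", mm.nr_ptes);	/* 0 */
	return 0;
}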
@@ -1564,9 +1578,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1564 | ClearPageDirty(page); | 1578 | ClearPageDirty(page); |
1565 | unlock_page(page); | 1579 | unlock_page(page); |
1566 | 1580 | ||
1567 | if (PageActive(page)) | ||
1568 | deactivate_page(page); | ||
1569 | |||
1570 | if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { | 1581 | if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { |
1571 | pmdp_invalidate(vma, addr, pmd); | 1582 | pmdp_invalidate(vma, addr, pmd); |
1572 | orig_pmd = pmd_mkold(orig_pmd); | 1583 | orig_pmd = pmd_mkold(orig_pmd); |
@@ -1575,6 +1586,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1575 | set_pmd_at(mm, addr, pmd, orig_pmd); | 1586 | set_pmd_at(mm, addr, pmd, orig_pmd); |
1576 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1587 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1577 | } | 1588 | } |
1589 | |||
1590 | mark_page_lazyfree(page); | ||
1578 | ret = true; | 1591 | ret = true; |
1579 | out: | 1592 | out: |
1580 | spin_unlock(ptl); | 1593 | spin_unlock(ptl); |
@@ -1612,12 +1625,13 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1612 | tlb->fullmm); | 1625 | tlb->fullmm); |
1613 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1626 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1614 | if (vma_is_dax(vma)) { | 1627 | if (vma_is_dax(vma)) { |
1628 | if (arch_needs_pgtable_deposit()) | ||
1629 | zap_deposited_table(tlb->mm, pmd); | ||
1615 | spin_unlock(ptl); | 1630 | spin_unlock(ptl); |
1616 | if (is_huge_zero_pmd(orig_pmd)) | 1631 | if (is_huge_zero_pmd(orig_pmd)) |
1617 | tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); | 1632 | tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); |
1618 | } else if (is_huge_zero_pmd(orig_pmd)) { | 1633 | } else if (is_huge_zero_pmd(orig_pmd)) { |
1619 | pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); | 1634 | zap_deposited_table(tlb->mm, pmd); |
1620 | atomic_long_dec(&tlb->mm->nr_ptes); | ||
1621 | spin_unlock(ptl); | 1635 | spin_unlock(ptl); |
1622 | tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); | 1636 | tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); |
1623 | } else { | 1637 | } else { |
@@ -1626,10 +1640,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1626 | VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); | 1640 | VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); |
1627 | VM_BUG_ON_PAGE(!PageHead(page), page); | 1641 | VM_BUG_ON_PAGE(!PageHead(page), page); |
1628 | if (PageAnon(page)) { | 1642 | if (PageAnon(page)) { |
1629 | pgtable_t pgtable; | 1643 | zap_deposited_table(tlb->mm, pmd); |
1630 | pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); | ||
1631 | pte_free(tlb->mm, pgtable); | ||
1632 | atomic_long_dec(&tlb->mm->nr_ptes); | ||
1633 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | 1644 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); |
1634 | } else { | 1645 | } else { |
1635 | if (arch_needs_pgtable_deposit()) | 1646 | if (arch_needs_pgtable_deposit()) |
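Annotation: the repeated withdraw, pte_free and nr_ptes-decrement sequences are collapsed into zap_deposited_table(), which the DAX branch now also calls when a table was deposited at insert time. A toy, standalone model of what the helper consolidates (the fields are stand-ins; only the step sequence mirrors the diff):

#include <stdio.h>

struct mm_model { int deposited_tables; long nr_ptes; };

/* Models zap_deposited_table(): withdraw + free + account in one place. */
static void zap_deposited_table_model(struct mm_model *mm)
{
	if (mm->deposited_tables > 0) {
		mm->deposited_tables--;	/* pgtable_trans_huge_withdraw() + pte_free() */
		mm->nr_ptes--;		/* atomic_long_dec(&mm->nr_ptes) */
	}
}

int main(void)
{
	struct mm_model mm = { .deposited_tables = 1, .nr_ptes = 1 };

	zap_deposited_table_model(&mm);		/* replaces three open-coded steps */
	printf("%d %ld\n", mm.deposited_tables, mm.nr_ptes);	/* 0 0 */
	return 0;
}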
@@ -2145,15 +2156,15 @@ static void freeze_page(struct page *page) | |||
2145 | { | 2156 | { |
2146 | enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | | 2157 | enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | |
2147 | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; | 2158 | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; |
2148 | int ret; | 2159 | bool unmap_success; |
2149 | 2160 | ||
2150 | VM_BUG_ON_PAGE(!PageHead(page), page); | 2161 | VM_BUG_ON_PAGE(!PageHead(page), page); |
2151 | 2162 | ||
2152 | if (PageAnon(page)) | 2163 | if (PageAnon(page)) |
2153 | ttu_flags |= TTU_MIGRATION; | 2164 | ttu_flags |= TTU_MIGRATION; |
2154 | 2165 | ||
2155 | ret = try_to_unmap(page, ttu_flags); | 2166 | unmap_success = try_to_unmap(page, ttu_flags); |
2156 | VM_BUG_ON_PAGE(ret, page); | 2167 | VM_BUG_ON_PAGE(!unmap_success, page); |
2157 | } | 2168 | } |
2158 | 2169 | ||
2159 | static void unfreeze_page(struct page *page) | 2170 | static void unfreeze_page(struct page *page) |
@@ -2399,7 +2410,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) | |||
2399 | 2410 | ||
2400 | VM_BUG_ON_PAGE(is_huge_zero_page(page), page); | 2411 | VM_BUG_ON_PAGE(is_huge_zero_page(page), page); |
2401 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 2412 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
2402 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | ||
2403 | VM_BUG_ON_PAGE(!PageCompound(page), page); | 2413 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
2404 | 2414 | ||
2405 | if (PageAnon(head)) { | 2415 | if (PageAnon(head)) { |
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 9d26fd9fefe4..356df057a2a8 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
@@ -34,8 +34,7 @@ static int hwpoison_inject(void *data, u64 val) | |||
34 | if (!hwpoison_filter_enable) | 34 | if (!hwpoison_filter_enable) |
35 | goto inject; | 35 | goto inject; |
36 | 36 | ||
37 | if (!PageLRU(hpage) && !PageHuge(p)) | 37 | shake_page(hpage, 0); |
38 | shake_page(hpage, 0); | ||
39 | /* | 38 | /* |
40 | * This implies unable to support non-LRU pages. | 39 | * This implies unable to support non-LRU pages. |
41 | */ | 40 | */ |
diff --git a/mm/internal.h b/mm/internal.h index 266efaeaa370..0e4f558412fb 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -81,11 +81,16 @@ static inline void set_page_refcounted(struct page *page) | |||
81 | extern unsigned long highest_memmap_pfn; | 81 | extern unsigned long highest_memmap_pfn; |
82 | 82 | ||
83 | /* | 83 | /* |
84 | * Maximum number of reclaim retries without progress before the OOM | ||
85 | * killer is considered the only way forward. | ||
86 | */ | ||
87 | #define MAX_RECLAIM_RETRIES 16 | ||
88 | |||
89 | /* | ||
84 | * in mm/vmscan.c: | 90 | * in mm/vmscan.c: |
85 | */ | 91 | */ |
86 | extern int isolate_lru_page(struct page *page); | 92 | extern int isolate_lru_page(struct page *page); |
87 | extern void putback_lru_page(struct page *page); | 93 | extern void putback_lru_page(struct page *page); |
88 | extern bool pgdat_reclaimable(struct pglist_data *pgdat); | ||
89 | 94 | ||
90 | /* | 95 | /* |
91 | * in mm/rmap.c: | 96 | * in mm/rmap.c: |
@@ -178,6 +183,7 @@ extern int user_min_free_kbytes; | |||
178 | struct compact_control { | 183 | struct compact_control { |
179 | struct list_head freepages; /* List of free pages to migrate to */ | 184 | struct list_head freepages; /* List of free pages to migrate to */ |
180 | struct list_head migratepages; /* List of pages being migrated */ | 185 | struct list_head migratepages; /* List of pages being migrated */ |
186 | struct zone *zone; | ||
181 | unsigned long nr_freepages; /* Number of isolated free pages */ | 187 | unsigned long nr_freepages; /* Number of isolated free pages */ |
182 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 188 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
183 | unsigned long total_migrate_scanned; | 189 | unsigned long total_migrate_scanned; |
@@ -185,17 +191,18 @@ struct compact_control { | |||
185 | unsigned long free_pfn; /* isolate_freepages search base */ | 191 | unsigned long free_pfn; /* isolate_freepages search base */ |
186 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 192 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
187 | unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ | 193 | unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ |
194 | const gfp_t gfp_mask; /* gfp mask of a direct compactor */ | ||
195 | int order; /* order a direct compactor needs */ | ||
196 | int migratetype; /* migratetype of direct compactor */ | ||
197 | const unsigned int alloc_flags; /* alloc flags of a direct compactor */ | ||
198 | const int classzone_idx; /* zone index of a direct compactor */ | ||
188 | enum migrate_mode mode; /* Async or sync migration mode */ | 199 | enum migrate_mode mode; /* Async or sync migration mode */ |
189 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ | 200 | bool ignore_skip_hint; /* Scan blocks even if marked skip */ |
190 | bool ignore_block_suitable; /* Scan blocks considered unsuitable */ | 201 | bool ignore_block_suitable; /* Scan blocks considered unsuitable */ |
191 | bool direct_compaction; /* False from kcompactd or /proc/... */ | 202 | bool direct_compaction; /* False from kcompactd or /proc/... */ |
192 | bool whole_zone; /* Whole zone should/has been scanned */ | 203 | bool whole_zone; /* Whole zone should/has been scanned */ |
193 | int order; /* order a direct compactor needs */ | ||
194 | const gfp_t gfp_mask; /* gfp mask of a direct compactor */ | ||
195 | const unsigned int alloc_flags; /* alloc flags of a direct compactor */ | ||
196 | const int classzone_idx; /* zone index of a direct compactor */ | ||
197 | struct zone *zone; | ||
198 | bool contended; /* Signal lock or sched contention */ | 204 | bool contended; /* Signal lock or sched contention */ |
205 | bool finishing_block; /* Finishing current pageblock */ | ||
199 | }; | 206 | }; |
200 | 207 | ||
201 | unsigned long | 208 | unsigned long |
@@ -505,4 +512,14 @@ extern const struct trace_print_flags pageflag_names[]; | |||
505 | extern const struct trace_print_flags vmaflag_names[]; | 512 | extern const struct trace_print_flags vmaflag_names[]; |
506 | extern const struct trace_print_flags gfpflag_names[]; | 513 | extern const struct trace_print_flags gfpflag_names[]; |
507 | 514 | ||
515 | static inline bool is_migrate_highatomic(enum migratetype migratetype) | ||
516 | { | ||
517 | return migratetype == MIGRATE_HIGHATOMIC; | ||
518 | } | ||
519 | |||
520 | static inline bool is_migrate_highatomic_page(struct page *page) | ||
521 | { | ||
522 | return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; | ||
523 | } | ||
524 | |||
508 | #endif /* __MM_INTERNAL_H */ | 525 | #endif /* __MM_INTERNAL_H */ |
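Annotation: the new helpers wrap the raw migratetype comparison so call sites stop open-coding "== MIGRATE_HIGHATOMIC". A standalone sketch of the pattern (the enum values are stand-ins; only MIGRATE_HIGHATOMIC's role matches the diff):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in migratetype enum; only HIGHATOMIC's role mirrors the kernel. */
enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_HIGHATOMIC };

static inline bool is_migrate_highatomic(enum migratetype mt)
{
	return mt == MIGRATE_HIGHATOMIC;
}

int main(void)
{
	enum migratetype mt = MIGRATE_HIGHATOMIC;

	/* Call sites now read as a predicate instead of a raw compare. */
	if (is_migrate_highatomic(mt))
		puts("highatomic pageblock: reserved for high-order atomic allocs");
	return 0;
}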
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 98b27195e38b..b10da59cf765 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c | |||
@@ -577,7 +577,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) | |||
577 | 577 | ||
578 | shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); | 578 | shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); |
579 | if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { | 579 | if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { |
580 | kasan_report_double_free(cache, object, shadow_byte); | 580 | kasan_report_double_free(cache, object, |
581 | __builtin_return_address(1)); | ||
581 | return true; | 582 | return true; |
582 | } | 583 | } |
583 | 584 | ||
@@ -690,7 +691,7 @@ int kasan_module_alloc(void *addr, size_t size) | |||
690 | 691 | ||
691 | ret = __vmalloc_node_range(shadow_size, 1, shadow_start, | 692 | ret = __vmalloc_node_range(shadow_size, 1, shadow_start, |
692 | shadow_start + shadow_size, | 693 | shadow_start + shadow_size, |
693 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | 694 | GFP_KERNEL | __GFP_ZERO, |
694 | PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, | 695 | PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, |
695 | __builtin_return_address(0)); | 696 | __builtin_return_address(0)); |
696 | 697 | ||
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index dd2dea8eb077..1229298cce64 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h | |||
@@ -99,7 +99,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) | |||
99 | void kasan_report(unsigned long addr, size_t size, | 99 | void kasan_report(unsigned long addr, size_t size, |
100 | bool is_write, unsigned long ip); | 100 | bool is_write, unsigned long ip); |
101 | void kasan_report_double_free(struct kmem_cache *cache, void *object, | 101 | void kasan_report_double_free(struct kmem_cache *cache, void *object, |
102 | s8 shadow); | 102 | void *ip); |
103 | 103 | ||
104 | #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) | 104 | #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) |
105 | void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); | 105 | void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); |
diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ab42a0803f16..beee0e980e2d 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c | |||
@@ -51,7 +51,13 @@ static const void *find_first_bad_addr(const void *addr, size_t size) | |||
51 | return first_bad_addr; | 51 | return first_bad_addr; |
52 | } | 52 | } |
53 | 53 | ||
54 | static void print_error_description(struct kasan_access_info *info) | 54 | static bool addr_has_shadow(struct kasan_access_info *info) |
55 | { | ||
56 | return (info->access_addr >= | ||
57 | kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); | ||
58 | } | ||
59 | |||
60 | static const char *get_shadow_bug_type(struct kasan_access_info *info) | ||
55 | { | 61 | { |
56 | const char *bug_type = "unknown-crash"; | 62 | const char *bug_type = "unknown-crash"; |
57 | u8 *shadow_addr; | 63 | u8 *shadow_addr; |
@@ -98,12 +104,39 @@ static void print_error_description(struct kasan_access_info *info) | |||
98 | break; | 104 | break; |
99 | } | 105 | } |
100 | 106 | ||
101 | pr_err("BUG: KASAN: %s in %pS at addr %p\n", | 107 | return bug_type; |
102 | bug_type, (void *)info->ip, | 108 | } |
103 | info->access_addr); | 109 | |
104 | pr_err("%s of size %zu by task %s/%d\n", | 110 | const char *get_wild_bug_type(struct kasan_access_info *info) |
105 | info->is_write ? "Write" : "Read", | 111 | { |
106 | info->access_size, current->comm, task_pid_nr(current)); | 112 | const char *bug_type = "unknown-crash"; |
113 | |||
114 | if ((unsigned long)info->access_addr < PAGE_SIZE) | ||
115 | bug_type = "null-ptr-deref"; | ||
116 | else if ((unsigned long)info->access_addr < TASK_SIZE) | ||
117 | bug_type = "user-memory-access"; | ||
118 | else | ||
119 | bug_type = "wild-memory-access"; | ||
120 | |||
121 | return bug_type; | ||
122 | } | ||
123 | |||
124 | static const char *get_bug_type(struct kasan_access_info *info) | ||
125 | { | ||
126 | if (addr_has_shadow(info)) | ||
127 | return get_shadow_bug_type(info); | ||
128 | return get_wild_bug_type(info); | ||
129 | } | ||
130 | |||
131 | static void print_error_description(struct kasan_access_info *info) | ||
132 | { | ||
133 | const char *bug_type = get_bug_type(info); | ||
134 | |||
135 | pr_err("BUG: KASAN: %s in %pS\n", | ||
136 | bug_type, (void *)info->ip); | ||
137 | pr_err("%s of size %zu at addr %p by task %s/%d\n", | ||
138 | info->is_write ? "Write" : "Read", info->access_size, | ||
139 | info->access_addr, current->comm, task_pid_nr(current)); | ||
107 | } | 140 | } |
108 | 141 | ||
109 | static inline bool kernel_or_module_addr(const void *addr) | 142 | static inline bool kernel_or_module_addr(const void *addr) |
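Annotation: report classification is now split in two. Addresses that have KASAN shadow are typed from the shadow byte in get_shadow_bug_type(), while addresses without shadow are bucketed purely by value in get_wild_bug_type(): below PAGE_SIZE is a NULL-pointer dereference, below TASK_SIZE a user-memory access, anything else a wild access. A runnable model of the wild-address bucketing; the PAGE_SIZE/TASK_SIZE constants are illustrative values for a 64-bit build, not any particular architecture's:

#include <stdio.h>

#define PAGE_SIZE 4096UL		/* illustrative */
#define TASK_SIZE 0x7ffffffff000UL	/* illustrative user/kernel split */

static const char *wild_bug_type(unsigned long addr)
{
	if (addr < PAGE_SIZE)
		return "null-ptr-deref";
	if (addr < TASK_SIZE)
		return "user-memory-access";
	return "wild-memory-access";
}

int main(void)
{
	printf("%s\n", wild_bug_type(0x10UL));			/* null-ptr-deref */
	printf("%s\n", wild_bug_type(0x00007f0000001000UL));	/* user-memory-access */
	printf("%s\n", wild_bug_type(0xffff880000000000UL));	/* wild-memory-access */
	return 0;
}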
@@ -144,9 +177,9 @@ static void kasan_end_report(unsigned long *flags) | |||
144 | kasan_enable_current(); | 177 | kasan_enable_current(); |
145 | } | 178 | } |
146 | 179 | ||
147 | static void print_track(struct kasan_track *track) | 180 | static void print_track(struct kasan_track *track, const char *prefix) |
148 | { | 181 | { |
149 | pr_err("PID = %u\n", track->pid); | 182 | pr_err("%s by task %u:\n", prefix, track->pid); |
150 | if (track->stack) { | 183 | if (track->stack) { |
151 | struct stack_trace trace; | 184 | struct stack_trace trace; |
152 | 185 | ||
@@ -157,59 +190,84 @@ static void print_track(struct kasan_track *track) | |||
157 | } | 190 | } |
158 | } | 191 | } |
159 | 192 | ||
160 | static void kasan_object_err(struct kmem_cache *cache, void *object) | 193 | static struct page *addr_to_page(const void *addr) |
161 | { | 194 | { |
162 | struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); | 195 | if ((addr >= (void *)PAGE_OFFSET) && |
196 | (addr < high_memory)) | ||
197 | return virt_to_head_page(addr); | ||
198 | return NULL; | ||
199 | } | ||
163 | 200 | ||
164 | dump_stack(); | 201 | static void describe_object_addr(struct kmem_cache *cache, void *object, |
165 | pr_err("Object at %p, in cache %s size: %d\n", object, cache->name, | 202 | const void *addr) |
166 | cache->object_size); | 203 | { |
204 | unsigned long access_addr = (unsigned long)addr; | ||
205 | unsigned long object_addr = (unsigned long)object; | ||
206 | const char *rel_type; | ||
207 | int rel_bytes; | ||
167 | 208 | ||
168 | if (!(cache->flags & SLAB_KASAN)) | 209 | pr_err("The buggy address belongs to the object at %p\n" |
210 | " which belongs to the cache %s of size %d\n", | ||
211 | object, cache->name, cache->object_size); | ||
212 | |||
213 | if (!addr) | ||
169 | return; | 214 | return; |
170 | 215 | ||
171 | pr_err("Allocated:\n"); | 216 | if (access_addr < object_addr) { |
172 | print_track(&alloc_info->alloc_track); | 217 | rel_type = "to the left"; |
173 | pr_err("Freed:\n"); | 218 | rel_bytes = object_addr - access_addr; |
174 | print_track(&alloc_info->free_track); | 219 | } else if (access_addr >= object_addr + cache->object_size) { |
220 | rel_type = "to the right"; | ||
221 | rel_bytes = access_addr - (object_addr + cache->object_size); | ||
222 | } else { | ||
223 | rel_type = "inside"; | ||
224 | rel_bytes = access_addr - object_addr; | ||
225 | } | ||
226 | |||
227 | pr_err("The buggy address is located %d bytes %s of\n" | ||
228 | " %d-byte region [%p, %p)\n", | ||
229 | rel_bytes, rel_type, cache->object_size, (void *)object_addr, | ||
230 | (void *)(object_addr + cache->object_size)); | ||
175 | } | 231 | } |
176 | 232 | ||
177 | void kasan_report_double_free(struct kmem_cache *cache, void *object, | 233 | static void describe_object(struct kmem_cache *cache, void *object, |
178 | s8 shadow) | 234 | const void *addr) |
179 | { | 235 | { |
180 | unsigned long flags; | 236 | struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); |
181 | 237 | ||
182 | kasan_start_report(&flags); | 238 | if (cache->flags & SLAB_KASAN) { |
183 | pr_err("BUG: Double free or freeing an invalid pointer\n"); | 239 | print_track(&alloc_info->alloc_track, "Allocated"); |
184 | pr_err("Unexpected shadow byte: 0x%hhX\n", shadow); | 240 | pr_err("\n"); |
185 | kasan_object_err(cache, object); | 241 | print_track(&alloc_info->free_track, "Freed"); |
186 | kasan_end_report(&flags); | 242 | pr_err("\n"); |
243 | } | ||
244 | |||
245 | describe_object_addr(cache, object, addr); | ||
187 | } | 246 | } |
188 | 247 | ||
189 | static void print_address_description(struct kasan_access_info *info) | 248 | static void print_address_description(void *addr) |
190 | { | 249 | { |
191 | const void *addr = info->access_addr; | 250 | struct page *page = addr_to_page(addr); |
192 | 251 | ||
193 | if ((addr >= (void *)PAGE_OFFSET) && | 252 | dump_stack(); |
194 | (addr < high_memory)) { | 253 | pr_err("\n"); |
195 | struct page *page = virt_to_head_page(addr); | 254 | |
196 | 255 | if (page && PageSlab(page)) { | |
197 | if (PageSlab(page)) { | 256 | struct kmem_cache *cache = page->slab_cache; |
198 | void *object; | 257 | void *object = nearest_obj(cache, page, addr); |
199 | struct kmem_cache *cache = page->slab_cache; | 258 | |
200 | object = nearest_obj(cache, page, | 259 | describe_object(cache, object, addr); |
201 | (void *)info->access_addr); | ||
202 | kasan_object_err(cache, object); | ||
203 | return; | ||
204 | } | ||
205 | dump_page(page, "kasan: bad access detected"); | ||
206 | } | 260 | } |
207 | 261 | ||
208 | if (kernel_or_module_addr(addr)) { | 262 | if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) { |
209 | if (!init_task_stack_addr(addr)) | 263 | pr_err("The buggy address belongs to the variable:\n"); |
210 | pr_err("Address belongs to variable %pS\n", addr); | 264 | pr_err(" %pS\n", addr); |
265 | } | ||
266 | |||
267 | if (page) { | ||
268 | pr_err("The buggy address belongs to the page:\n"); | ||
269 | dump_page(page, "kasan: bad access detected"); | ||
211 | } | 270 | } |
212 | dump_stack(); | ||
213 | } | 271 | } |
214 | 272 | ||
215 | static bool row_is_guilty(const void *row, const void *guilty) | 273 | static bool row_is_guilty(const void *row, const void *guilty) |
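Annotation: describe_object_addr() converts the raw access address into an offset relative to the slab object: addresses below the object are reported as "N bytes to the left", addresses at or past object_size as "to the right", and everything else as "inside". A small runnable model of that arithmetic with made-up object layout values:

#include <stdio.h>

static void describe(unsigned long access, unsigned long object, int size)
{
	const char *rel;
	int bytes;

	if (access < object) {
		rel = "to the left";
		bytes = object - access;
	} else if (access >= object + size) {
		rel = "to the right";
		bytes = access - (object + size);
	} else {
		rel = "inside";
		bytes = access - object;
	}
	printf("located %d bytes %s of %d-byte region [0x%lx, 0x%lx)\n",
	       bytes, rel, size, object, object + size);
}

int main(void)
{
	unsigned long obj = 0x1000;	/* made-up object address */
	int size = 64;			/* made-up cache object size */

	describe(obj - 8, obj, size);	/* 8 bytes to the left  */
	describe(obj + 70, obj, size);	/* 6 bytes to the right */
	describe(obj + 16, obj, size);	/* 16 bytes inside      */
	return 0;
}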
@@ -264,31 +322,34 @@ static void print_shadow_for_address(const void *addr) | |||
264 | } | 322 | } |
265 | } | 323 | } |
266 | 324 | ||
325 | void kasan_report_double_free(struct kmem_cache *cache, void *object, | ||
326 | void *ip) | ||
327 | { | ||
328 | unsigned long flags; | ||
329 | |||
330 | kasan_start_report(&flags); | ||
331 | pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip); | ||
332 | pr_err("\n"); | ||
333 | print_address_description(object); | ||
334 | pr_err("\n"); | ||
335 | print_shadow_for_address(object); | ||
336 | kasan_end_report(&flags); | ||
337 | } | ||
338 | |||
267 | static void kasan_report_error(struct kasan_access_info *info) | 339 | static void kasan_report_error(struct kasan_access_info *info) |
268 | { | 340 | { |
269 | unsigned long flags; | 341 | unsigned long flags; |
270 | const char *bug_type; | ||
271 | 342 | ||
272 | kasan_start_report(&flags); | 343 | kasan_start_report(&flags); |
273 | 344 | ||
274 | if (info->access_addr < | 345 | print_error_description(info); |
275 | kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { | 346 | pr_err("\n"); |
276 | if ((unsigned long)info->access_addr < PAGE_SIZE) | 347 | |
277 | bug_type = "null-ptr-deref"; | 348 | if (!addr_has_shadow(info)) { |
278 | else if ((unsigned long)info->access_addr < TASK_SIZE) | ||
279 | bug_type = "user-memory-access"; | ||
280 | else | ||
281 | bug_type = "wild-memory-access"; | ||
282 | pr_err("BUG: KASAN: %s on address %p\n", | ||
283 | bug_type, info->access_addr); | ||
284 | pr_err("%s of size %zu by task %s/%d\n", | ||
285 | info->is_write ? "Write" : "Read", | ||
286 | info->access_size, current->comm, | ||
287 | task_pid_nr(current)); | ||
288 | dump_stack(); | 349 | dump_stack(); |
289 | } else { | 350 | } else { |
290 | print_error_description(info); | 351 | print_address_description((void *)info->access_addr); |
291 | print_address_description(info); | 352 | pr_err("\n"); |
292 | print_shadow_for_address(info->first_bad_addr); | 353 | print_shadow_for_address(info->first_bad_addr); |
293 | } | 354 | } |
294 | 355 | ||
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ba40b7f673f4..7cb9c88bb4a3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c | |||
@@ -483,8 +483,7 @@ void __khugepaged_exit(struct mm_struct *mm) | |||
483 | 483 | ||
484 | static void release_pte_page(struct page *page) | 484 | static void release_pte_page(struct page *page) |
485 | { | 485 | { |
486 | /* 0 stands for page_is_file_cache(page) == false */ | 486 | dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); |
487 | dec_node_page_state(page, NR_ISOLATED_ANON + 0); | ||
488 | unlock_page(page); | 487 | unlock_page(page); |
489 | putback_lru_page(page); | 488 | putback_lru_page(page); |
490 | } | 489 | } |
@@ -532,7 +531,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
532 | 531 | ||
533 | VM_BUG_ON_PAGE(PageCompound(page), page); | 532 | VM_BUG_ON_PAGE(PageCompound(page), page); |
534 | VM_BUG_ON_PAGE(!PageAnon(page), page); | 533 | VM_BUG_ON_PAGE(!PageAnon(page), page); |
535 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | ||
536 | 534 | ||
537 | /* | 535 | /* |
538 | * We can do it before isolate_lru_page because the | 536 | * We can do it before isolate_lru_page because the |
@@ -550,7 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
550 | * The page must only be referenced by the scanned process | 548 | * The page must only be referenced by the scanned process |
551 | * and page swap cache. | 549 | * and page swap cache. |
552 | */ | 550 | */ |
553 | if (page_count(page) != 1 + !!PageSwapCache(page)) { | 551 | if (page_count(page) != 1 + PageSwapCache(page)) { |
554 | unlock_page(page); | 552 | unlock_page(page); |
555 | result = SCAN_PAGE_COUNT; | 553 | result = SCAN_PAGE_COUNT; |
556 | goto out; | 554 | goto out; |
@@ -579,8 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
579 | result = SCAN_DEL_PAGE_LRU; | 577 | result = SCAN_DEL_PAGE_LRU; |
580 | goto out; | 578 | goto out; |
581 | } | 579 | } |
582 | /* 0 stands for page_is_file_cache(page) == false */ | 580 | inc_node_page_state(page, |
583 | inc_node_page_state(page, NR_ISOLATED_ANON + 0); | 581 | NR_ISOLATED_ANON + page_is_file_cache(page)); |
584 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 582 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
585 | VM_BUG_ON_PAGE(PageLRU(page), page); | 583 | VM_BUG_ON_PAGE(PageLRU(page), page); |
586 | 584 | ||
@@ -1183,7 +1181,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
1183 | * The page must only be referenced by the scanned process | 1181 | * The page must only be referenced by the scanned process |
1184 | * and page swap cache. | 1182 | * and page swap cache. |
1185 | */ | 1183 | */ |
1186 | if (page_count(page) != 1 + !!PageSwapCache(page)) { | 1184 | if (page_count(page) != 1 + PageSwapCache(page)) { |
1187 | result = SCAN_PAGE_COUNT; | 1185 | result = SCAN_PAGE_COUNT; |
1188 | goto out_unmap; | 1186 | goto out_unmap; |
1189 | } | 1187 | } |
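Annotation: in the refcount checks, the "!!" around PageSwapCache() is dropped because page-flag tests now return bool, so the expected count is still one reference from the scanned process plus one if the page also sits in swap cache; any other value aborts the collapse with SCAN_PAGE_COUNT. A tiny, purely illustrative model of that expectation:

#include <stdbool.h>
#include <stdio.h>

/* PageSwapCache() now returns bool, so "!!" was redundant. */
static int expected_refs(bool in_swap_cache)
{
	return 1 + in_swap_cache;	/* scanned process + swap cache, if any */
}

int main(void)
{
	printf("%d %d\n", expected_refs(false), expected_refs(true));	/* 1 2 */
	return 0;
}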
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -1933,11 +1933,10 @@ struct page *ksm_might_need_to_copy(struct page *page, | |||
1933 | return new_page; | 1933 | return new_page; |
1934 | } | 1934 | } |
1935 | 1935 | ||
1936 | int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) | 1936 | void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) |
1937 | { | 1937 | { |
1938 | struct stable_node *stable_node; | 1938 | struct stable_node *stable_node; |
1939 | struct rmap_item *rmap_item; | 1939 | struct rmap_item *rmap_item; |
1940 | int ret = SWAP_AGAIN; | ||
1941 | int search_new_forks = 0; | 1940 | int search_new_forks = 0; |
1942 | 1941 | ||
1943 | VM_BUG_ON_PAGE(!PageKsm(page), page); | 1942 | VM_BUG_ON_PAGE(!PageKsm(page), page); |
@@ -1950,7 +1949,7 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) | |||
1950 | 1949 | ||
1951 | stable_node = page_stable_node(page); | 1950 | stable_node = page_stable_node(page); |
1952 | if (!stable_node) | 1951 | if (!stable_node) |
1953 | return ret; | 1952 | return; |
1954 | again: | 1953 | again: |
1955 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { | 1954 | hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) { |
1956 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1955 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
@@ -1978,23 +1977,20 @@ again: | |||
1978 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | 1977 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) |
1979 | continue; | 1978 | continue; |
1980 | 1979 | ||
1981 | ret = rwc->rmap_one(page, vma, | 1980 | if (!rwc->rmap_one(page, vma, |
1982 | rmap_item->address, rwc->arg); | 1981 | rmap_item->address, rwc->arg)) { |
1983 | if (ret != SWAP_AGAIN) { | ||
1984 | anon_vma_unlock_read(anon_vma); | 1982 | anon_vma_unlock_read(anon_vma); |
1985 | goto out; | 1983 | return; |
1986 | } | 1984 | } |
1987 | if (rwc->done && rwc->done(page)) { | 1985 | if (rwc->done && rwc->done(page)) { |
1988 | anon_vma_unlock_read(anon_vma); | 1986 | anon_vma_unlock_read(anon_vma); |
1989 | goto out; | 1987 | return; |
1990 | } | 1988 | } |
1991 | } | 1989 | } |
1992 | anon_vma_unlock_read(anon_vma); | 1990 | anon_vma_unlock_read(anon_vma); |
1993 | } | 1991 | } |
1994 | if (!search_new_forks++) | 1992 | if (!search_new_forks++) |
1995 | goto again; | 1993 | goto again; |
1996 | out: | ||
1997 | return ret; | ||
1998 | } | 1994 | } |
1999 | 1995 | ||
2000 | #ifdef CONFIG_MIGRATION | 1996 | #ifdef CONFIG_MIGRATION |
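Annotation: rmap_walk_ksm() no longer propagates SWAP_* codes: the rmap_one callback now returns a bool meaning "keep walking", and the walk simply returns void and stops when the callback or the done() hook says so. This is the same convention the freeze_page() hunk above relies on, where try_to_unmap() now returns a bool. A standalone model of that control flow (the struct and callback names are stand-ins patterned after rmap_walk_control):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for rmap_walk_control: rmap_one answers "keep walking?". */
struct walk_control {
	bool (*rmap_one)(int vma_id, void *arg);
	bool (*done)(void *arg);
	void *arg;
};

static void rmap_walk_model(const int *vmas, int nr, struct walk_control *wc)
{
	for (int i = 0; i < nr; i++) {
		if (!wc->rmap_one(vmas[i], wc->arg))
			return;			/* callback asked to stop */
		if (wc->done && wc->done(wc->arg))
			return;			/* walk finished early */
	}
}

static bool print_until_three(int vma_id, void *arg)
{
	(void)arg;
	printf("visit vma %d\n", vma_id);
	return vma_id != 3;			/* stop after vma 3 */
}

int main(void)
{
	int vmas[] = { 1, 2, 3, 4, 5 };
	struct walk_control wc = { .rmap_one = print_until_three };

	rmap_walk_model(vmas, 5, &wc);		/* prints 1, 2, 3 and stops */
	return 0;
}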
diff --git a/mm/madvise.c b/mm/madvise.c index 7a2abf0127ae..25b78ee4fc2c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -411,10 +411,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, | |||
411 | ptent = pte_mkold(ptent); | 411 | ptent = pte_mkold(ptent); |
412 | ptent = pte_mkclean(ptent); | 412 | ptent = pte_mkclean(ptent); |
413 | set_pte_at(mm, addr, pte, ptent); | 413 | set_pte_at(mm, addr, pte, ptent); |
414 | if (PageActive(page)) | ||
415 | deactivate_page(page); | ||
416 | tlb_remove_tlb_entry(tlb, pte, addr); | 414 | tlb_remove_tlb_entry(tlb, pte, addr); |
417 | } | 415 | } |
416 | mark_page_lazyfree(page); | ||
418 | } | 417 | } |
419 | out: | 418 | out: |
420 | if (nr_swap) { | 419 | if (nr_swap) { |
@@ -606,34 +605,40 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
606 | /* | 605 | /* |
607 | * Error injection support for memory error handling. | 606 | * Error injection support for memory error handling. |
608 | */ | 607 | */ |
609 | static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) | 608 | static int madvise_inject_error(int behavior, |
609 | unsigned long start, unsigned long end) | ||
610 | { | 610 | { |
611 | struct page *p; | 611 | struct page *page; |
612 | |||
612 | if (!capable(CAP_SYS_ADMIN)) | 613 | if (!capable(CAP_SYS_ADMIN)) |
613 | return -EPERM; | 614 | return -EPERM; |
615 | |||
614 | for (; start < end; start += PAGE_SIZE << | 616 | for (; start < end; start += PAGE_SIZE << |
615 | compound_order(compound_head(p))) { | 617 | compound_order(compound_head(page))) { |
616 | int ret; | 618 | int ret; |
617 | 619 | ||
618 | ret = get_user_pages_fast(start, 1, 0, &p); | 620 | ret = get_user_pages_fast(start, 1, 0, &page); |
619 | if (ret != 1) | 621 | if (ret != 1) |
620 | return ret; | 622 | return ret; |
621 | 623 | ||
622 | if (PageHWPoison(p)) { | 624 | if (PageHWPoison(page)) { |
623 | put_page(p); | 625 | put_page(page); |
624 | continue; | 626 | continue; |
625 | } | 627 | } |
626 | if (bhv == MADV_SOFT_OFFLINE) { | 628 | |
627 | pr_info("Soft offlining page %#lx at %#lx\n", | 629 | if (behavior == MADV_SOFT_OFFLINE) { |
628 | page_to_pfn(p), start); | 630 | pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n", |
629 | ret = soft_offline_page(p, MF_COUNT_INCREASED); | 631 | page_to_pfn(page), start); |
632 | |||
633 | ret = soft_offline_page(page, MF_COUNT_INCREASED); | ||
630 | if (ret) | 634 | if (ret) |
631 | return ret; | 635 | return ret; |
632 | continue; | 636 | continue; |
633 | } | 637 | } |
634 | pr_info("Injecting memory failure for page %#lx at %#lx\n", | 638 | pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", |
635 | page_to_pfn(p), start); | 639 | page_to_pfn(page), start); |
636 | ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); | 640 | |
641 | ret = memory_failure(page_to_pfn(page), 0, MF_COUNT_INCREASED); | ||
637 | if (ret) | 642 | if (ret) |
638 | return ret; | 643 | return ret; |
639 | } | 644 | } |
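Annotation: madvise_inject_error() keeps backing MADV_HWPOISON and MADV_SOFT_OFFLINE (CAP_SYS_ADMIN only, on CONFIG_MEMORY_FAILURE kernels), with the messages now printing the pfn and the process virtual address. A hedged userspace sketch of driving it; the MADV_* constants need a reasonably recent libc, and without the capability the call simply fails with EPERM:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 0xa5, psz);			/* fault the page in */

	/* Ask the kernel to soft-offline this one page (or MADV_HWPOISON). */
	if (madvise(p, psz, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");
	return 0;
}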
@@ -651,13 +656,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
651 | case MADV_WILLNEED: | 656 | case MADV_WILLNEED: |
652 | return madvise_willneed(vma, prev, start, end); | 657 | return madvise_willneed(vma, prev, start, end); |
653 | case MADV_FREE: | 658 | case MADV_FREE: |
654 | /* | 659 | return madvise_free(vma, prev, start, end); |
655 | * XXX: In this implementation, MADV_FREE works like | ||
656 | * MADV_DONTNEED on swapless system or full swap. | ||
657 | */ | ||
658 | if (get_nr_swap_pages() > 0) | ||
659 | return madvise_free(vma, prev, start, end); | ||
660 | /* passthrough */ | ||
661 | case MADV_DONTNEED: | 660 | case MADV_DONTNEED: |
662 | return madvise_dontneed(vma, prev, start, end); | 661 | return madvise_dontneed(vma, prev, start, end); |
663 | default: | 662 | default: |
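Annotation: with the swapless special case removed, MADV_FREE no longer quietly behaves like MADV_DONTNEED when no swap is configured; the lazy-free path is taken unconditionally and reclaim decides later. A small userspace illustration of the semantics (MADV_FREE needs Linux 4.5+ and a libc exposing the constant): data survives until the clean pages are reclaimed, and a later write takes the page back off the lazy-free list.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 'x', 4 * psz);

	/* Tell the kernel these pages may be reclaimed lazily. */
	if (madvise(p, 4 * psz, MADV_FREE))
		perror("madvise(MADV_FREE)");

	/* Until reclaim runs, reads still see the old data; a write dirties
	 * the page again and cancels the pending free for that page. */
	p[0] = 'y';
	printf("%c %c\n", p[0], p[1]);	/* 'y' and, absent reclaim, 'x' */
	return 0;
}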
@@ -688,6 +687,10 @@ madvise_behavior_valid(int behavior) | |||
688 | #endif | 687 | #endif |
689 | case MADV_DONTDUMP: | 688 | case MADV_DONTDUMP: |
690 | case MADV_DODUMP: | 689 | case MADV_DODUMP: |
690 | #ifdef CONFIG_MEMORY_FAILURE | ||
691 | case MADV_SOFT_OFFLINE: | ||
692 | case MADV_HWPOISON: | ||
693 | #endif | ||
691 | return true; | 694 | return true; |
692 | 695 | ||
693 | default: | 696 | default: |
@@ -761,10 +764,6 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
761 | size_t len; | 764 | size_t len; |
762 | struct blk_plug plug; | 765 | struct blk_plug plug; |
763 | 766 | ||
764 | #ifdef CONFIG_MEMORY_FAILURE | ||
765 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) | ||
766 | return madvise_hwpoison(behavior, start, start+len_in); | ||
767 | #endif | ||
768 | if (!madvise_behavior_valid(behavior)) | 767 | if (!madvise_behavior_valid(behavior)) |
769 | return error; | 768 | return error; |
770 | 769 | ||
@@ -784,6 +783,11 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
784 | if (end == start) | 783 | if (end == start) |
785 | return error; | 784 | return error; |
786 | 785 | ||
786 | #ifdef CONFIG_MEMORY_FAILURE | ||
787 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) | ||
788 | return madvise_inject_error(behavior, start, start + len_in); | ||
789 | #endif | ||
790 | |||
787 | write = madvise_need_mmap_write(behavior); | 791 | write = madvise_need_mmap_write(behavior); |
788 | if (write) { | 792 | if (write) { |
789 | if (down_write_killable(¤t->mm->mmap_sem)) | 793 | if (down_write_killable(¤t->mm->mmap_sem)) |
diff --git a/mm/memblock.c b/mm/memblock.c index 696f06d17c4e..b049c9b2dba8 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -805,6 +805,18 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) | |||
805 | } | 805 | } |
806 | 806 | ||
807 | /** | 807 | /** |
808 | * memblock_clear_nomap - Clear flag MEMBLOCK_NOMAP for a specified region. | ||
809 | * @base: the base phys addr of the region | ||
810 | * @size: the size of the region | ||
811 | * | ||
812 | * Return 0 on success, -errno on failure. | ||
813 | */ | ||
814 | int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) | ||
815 | { | ||
816 | return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP); | ||
817 | } | ||
818 | |||
819 | /** | ||
808 | * __next_reserved_mem_region - next function for for_each_reserved_region() | 820 | * __next_reserved_mem_region - next function for for_each_reserved_region() |
809 | * @idx: pointer to u64 loop variable | 821 | * @idx: pointer to u64 loop variable |
810 | * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL | 822 | * @out_start: ptr to phys_addr_t for start address of the region, can be %NULL |
@@ -1531,11 +1543,37 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) | |||
1531 | (phys_addr_t)ULLONG_MAX); | 1543 | (phys_addr_t)ULLONG_MAX); |
1532 | } | 1544 | } |
1533 | 1545 | ||
1546 | void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) | ||
1547 | { | ||
1548 | int start_rgn, end_rgn; | ||
1549 | int i, ret; | ||
1550 | |||
1551 | if (!size) | ||
1552 | return; | ||
1553 | |||
1554 | ret = memblock_isolate_range(&memblock.memory, base, size, | ||
1555 | &start_rgn, &end_rgn); | ||
1556 | if (ret) | ||
1557 | return; | ||
1558 | |||
1559 | /* remove all the MAP regions */ | ||
1560 | for (i = memblock.memory.cnt - 1; i >= end_rgn; i--) | ||
1561 | if (!memblock_is_nomap(&memblock.memory.regions[i])) | ||
1562 | memblock_remove_region(&memblock.memory, i); | ||
1563 | |||
1564 | for (i = start_rgn - 1; i >= 0; i--) | ||
1565 | if (!memblock_is_nomap(&memblock.memory.regions[i])) | ||
1566 | memblock_remove_region(&memblock.memory, i); | ||
1567 | |||
1568 | /* truncate the reserved regions */ | ||
1569 | memblock_remove_range(&memblock.reserved, 0, base); | ||
1570 | memblock_remove_range(&memblock.reserved, | ||
1571 | base + size, (phys_addr_t)ULLONG_MAX); | ||
1572 | } | ||
1573 | |||
1534 | void __init memblock_mem_limit_remove_map(phys_addr_t limit) | 1574 | void __init memblock_mem_limit_remove_map(phys_addr_t limit) |
1535 | { | 1575 | { |
1536 | struct memblock_type *type = &memblock.memory; | ||
1537 | phys_addr_t max_addr; | 1576 | phys_addr_t max_addr; |
1538 | int i, ret, start_rgn, end_rgn; | ||
1539 | 1577 | ||
1540 | if (!limit) | 1578 | if (!limit) |
1541 | return; | 1579 | return; |
@@ -1546,19 +1584,7 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit) | |||
1546 | if (max_addr == (phys_addr_t)ULLONG_MAX) | 1584 | if (max_addr == (phys_addr_t)ULLONG_MAX) |
1547 | return; | 1585 | return; |
1548 | 1586 | ||
1549 | ret = memblock_isolate_range(type, max_addr, (phys_addr_t)ULLONG_MAX, | 1587 | memblock_cap_memory_range(0, max_addr); |
1550 | &start_rgn, &end_rgn); | ||
1551 | if (ret) | ||
1552 | return; | ||
1553 | |||
1554 | /* remove all the MAP regions above the limit */ | ||
1555 | for (i = end_rgn - 1; i >= start_rgn; i--) { | ||
1556 | if (!memblock_is_nomap(&type->regions[i])) | ||
1557 | memblock_remove_region(type, i); | ||
1558 | } | ||
1559 | /* truncate the reserved regions */ | ||
1560 | memblock_remove_range(&memblock.reserved, max_addr, | ||
1561 | (phys_addr_t)ULLONG_MAX); | ||
1562 | } | 1588 | } |
1563 | 1589 | ||
1564 | static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) | 1590 | static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) |
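Annotation: memblock_cap_memory_range() generalizes the old limit logic: it isolates [base, base + size), removes every mapped (non-NOMAP) region outside the window, and trims the reserved list on both sides, which lets memblock_mem_limit_remove_map() shrink to a single call capping at [0, max_addr). A standalone model of the capping idea, simplified to plain regions with no NOMAP handling and made-up addresses:

#include <stdio.h>

struct region { unsigned long start, end; };	/* [start, end) */

/* Keep only the parts of each region that overlap [base, base + size). */
static int cap_range(struct region *r, int n, unsigned long base,
		     unsigned long size)
{
	unsigned long limit = base + size;
	int out = 0;

	for (int i = 0; i < n; i++) {
		unsigned long s = r[i].start > base ? r[i].start : base;
		unsigned long e = r[i].end < limit ? r[i].end : limit;

		if (s < e)
			r[out++] = (struct region){ s, e };
	}
	return out;	/* new number of regions */
}

int main(void)
{
	struct region mem[] = { { 0x0000, 0x4000 }, { 0x8000, 0x10000 } };
	int n = cap_range(mem, 2, 0x2000, 0x7000);	/* window [0x2000, 0x9000) */

	for (int i = 0; i < n; i++)
		printf("[%#lx, %#lx)\n", mem[i].start, mem[i].end);
	/* -> [0x2000, 0x4000) and [0x8000, 0x9000) */
	return 0;
}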
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bd7541d7c11..ff73899af61a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -100,24 +100,7 @@ static bool do_memsw_account(void) | |||
100 | return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account; | 100 | return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account; |
101 | } | 101 | } |
102 | 102 | ||
103 | static const char * const mem_cgroup_stat_names[] = { | 103 | static const char *const mem_cgroup_lru_names[] = { |
104 | "cache", | ||
105 | "rss", | ||
106 | "rss_huge", | ||
107 | "mapped_file", | ||
108 | "dirty", | ||
109 | "writeback", | ||
110 | "swap", | ||
111 | }; | ||
112 | |||
113 | static const char * const mem_cgroup_events_names[] = { | ||
114 | "pgpgin", | ||
115 | "pgpgout", | ||
116 | "pgfault", | ||
117 | "pgmajfault", | ||
118 | }; | ||
119 | |||
120 | static const char * const mem_cgroup_lru_names[] = { | ||
121 | "inactive_anon", | 104 | "inactive_anon", |
122 | "active_anon", | 105 | "active_anon", |
123 | "inactive_file", | 106 | "inactive_file", |
@@ -568,32 +551,15 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) | |||
568 | * common workload, threshold and synchronization as vmstat[] should be | 551 | * common workload, threshold and synchronization as vmstat[] should be |
569 | * implemented. | 552 | * implemented. |
570 | */ | 553 | */ |
571 | static unsigned long | ||
572 | mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) | ||
573 | { | ||
574 | long val = 0; | ||
575 | int cpu; | ||
576 | |||
577 | /* Per-cpu values can be negative, use a signed accumulator */ | ||
578 | for_each_possible_cpu(cpu) | ||
579 | val += per_cpu(memcg->stat->count[idx], cpu); | ||
580 | /* | ||
581 | * Summing races with updates, so val may be negative. Avoid exposing | ||
582 | * transient negative values. | ||
583 | */ | ||
584 | if (val < 0) | ||
585 | val = 0; | ||
586 | return val; | ||
587 | } | ||
588 | 554 | ||
589 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | 555 | static unsigned long memcg_sum_events(struct mem_cgroup *memcg, |
590 | enum mem_cgroup_events_index idx) | 556 | enum memcg_event_item event) |
591 | { | 557 | { |
592 | unsigned long val = 0; | 558 | unsigned long val = 0; |
593 | int cpu; | 559 | int cpu; |
594 | 560 | ||
595 | for_each_possible_cpu(cpu) | 561 | for_each_possible_cpu(cpu) |
596 | val += per_cpu(memcg->stat->events[idx], cpu); | 562 | val += per_cpu(memcg->stat->events[event], cpu); |
597 | return val; | 563 | return val; |
598 | } | 564 | } |
599 | 565 | ||
@@ -606,23 +572,23 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | |||
606 | * counted as CACHE even if it's on ANON LRU. | 572 | * counted as CACHE even if it's on ANON LRU. |
607 | */ | 573 | */ |
608 | if (PageAnon(page)) | 574 | if (PageAnon(page)) |
609 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], | 575 | __this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages); |
610 | nr_pages); | 576 | else { |
611 | else | 577 | __this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages); |
612 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], | 578 | if (PageSwapBacked(page)) |
613 | nr_pages); | 579 | __this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages); |
580 | } | ||
614 | 581 | ||
615 | if (compound) { | 582 | if (compound) { |
616 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | 583 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
617 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], | 584 | __this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages); |
618 | nr_pages); | ||
619 | } | 585 | } |
620 | 586 | ||
621 | /* pagein of a big page is an event. So, ignore page size */ | 587 | /* pagein of a big page is an event. So, ignore page size */ |
622 | if (nr_pages > 0) | 588 | if (nr_pages > 0) |
623 | __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); | 589 | __this_cpu_inc(memcg->stat->events[PGPGIN]); |
624 | else { | 590 | else { |
625 | __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); | 591 | __this_cpu_inc(memcg->stat->events[PGPGOUT]); |
626 | nr_pages = -nr_pages; /* for event */ | 592 | nr_pages = -nr_pages; /* for event */ |
627 | } | 593 | } |
628 | 594 | ||
@@ -1144,6 +1110,28 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) | |||
1144 | return false; | 1110 | return false; |
1145 | } | 1111 | } |
1146 | 1112 | ||
1113 | unsigned int memcg1_stats[] = { | ||
1114 | MEMCG_CACHE, | ||
1115 | MEMCG_RSS, | ||
1116 | MEMCG_RSS_HUGE, | ||
1117 | NR_SHMEM, | ||
1118 | NR_FILE_MAPPED, | ||
1119 | NR_FILE_DIRTY, | ||
1120 | NR_WRITEBACK, | ||
1121 | MEMCG_SWAP, | ||
1122 | }; | ||
1123 | |||
1124 | static const char *const memcg1_stat_names[] = { | ||
1125 | "cache", | ||
1126 | "rss", | ||
1127 | "rss_huge", | ||
1128 | "shmem", | ||
1129 | "mapped_file", | ||
1130 | "dirty", | ||
1131 | "writeback", | ||
1132 | "swap", | ||
1133 | }; | ||
1134 | |||
1147 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 1135 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
1148 | /** | 1136 | /** |
1149 | * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. | 1137 | * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. |
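Annotation: the cgroup1 statistics now come from two parallel tables, memcg1_stats (the item indices to read) and memcg1_stat_names (the labels to print), size-checked with BUILD_BUG_ON later in memcg_stat_show(); swap is skipped when swap accounting is off. A standalone model of that parallel-table pattern (the item set and values are invented for the demo; only the pattern matches):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Parallel tables: indices to read and the labels they print under. */
enum item { CACHE, RSS, SWAP, NR_ITEMS };

static const enum item stats[]        = { CACHE, RSS, SWAP };
static const char *const stat_names[] = { "cache", "rss", "swap" };

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static unsigned long read_stat(enum item i)
{
	static const unsigned long demo[NR_ITEMS] = { 123, 456, 7 };
	return demo[i];
}

int main(void)
{
	bool do_swap_account = false;

	_Static_assert(ARRAY_SIZE(stats) == ARRAY_SIZE(stat_names),
		       "tables must stay in sync");	/* BUILD_BUG_ON analogue */

	for (size_t i = 0; i < ARRAY_SIZE(stats); i++) {
		if (stats[i] == SWAP && !do_swap_account)
			continue;			/* hide swap if unaccounted */
		printf("%s %lu\n", stat_names[i], read_stat(stats[i]));
	}
	return 0;
}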
@@ -1188,11 +1176,11 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1188 | pr_cont_cgroup_path(iter->css.cgroup); | 1176 | pr_cont_cgroup_path(iter->css.cgroup); |
1189 | pr_cont(":"); | 1177 | pr_cont(":"); |
1190 | 1178 | ||
1191 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 1179 | for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { |
1192 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) | 1180 | if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account) |
1193 | continue; | 1181 | continue; |
1194 | pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], | 1182 | pr_cont(" %s:%luKB", memcg1_stat_names[i], |
1195 | K(mem_cgroup_read_stat(iter, i))); | 1183 | K(memcg_page_state(iter, memcg1_stats[i]))); |
1196 | } | 1184 | } |
1197 | 1185 | ||
1198 | for (i = 0; i < NR_LRU_LISTS; i++) | 1186 | for (i = 0; i < NR_LRU_LISTS; i++) |
@@ -1837,7 +1825,7 @@ static void reclaim_high(struct mem_cgroup *memcg, | |||
1837 | do { | 1825 | do { |
1838 | if (page_counter_read(&memcg->memory) <= memcg->high) | 1826 | if (page_counter_read(&memcg->memory) <= memcg->high) |
1839 | continue; | 1827 | continue; |
1840 | mem_cgroup_events(memcg, MEMCG_HIGH, 1); | 1828 | mem_cgroup_event(memcg, MEMCG_HIGH); |
1841 | try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); | 1829 | try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); |
1842 | } while ((memcg = parent_mem_cgroup(memcg))); | 1830 | } while ((memcg = parent_mem_cgroup(memcg))); |
1843 | } | 1831 | } |
@@ -1928,7 +1916,7 @@ retry: | |||
1928 | if (!gfpflags_allow_blocking(gfp_mask)) | 1916 | if (!gfpflags_allow_blocking(gfp_mask)) |
1929 | goto nomem; | 1917 | goto nomem; |
1930 | 1918 | ||
1931 | mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); | 1919 | mem_cgroup_event(mem_over_limit, MEMCG_MAX); |
1932 | 1920 | ||
1933 | nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, | 1921 | nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, |
1934 | gfp_mask, may_swap); | 1922 | gfp_mask, may_swap); |
@@ -1971,7 +1959,7 @@ retry: | |||
1971 | if (fatal_signal_pending(current)) | 1959 | if (fatal_signal_pending(current)) |
1972 | goto force; | 1960 | goto force; |
1973 | 1961 | ||
1974 | mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); | 1962 | mem_cgroup_event(mem_over_limit, MEMCG_OOM); |
1975 | 1963 | ||
1976 | mem_cgroup_oom(mem_over_limit, gfp_mask, | 1964 | mem_cgroup_oom(mem_over_limit, gfp_mask, |
1977 | get_order(nr_pages * PAGE_SIZE)); | 1965 | get_order(nr_pages * PAGE_SIZE)); |
@@ -2381,7 +2369,7 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
2381 | for (i = 1; i < HPAGE_PMD_NR; i++) | 2369 | for (i = 1; i < HPAGE_PMD_NR; i++) |
2382 | head[i].mem_cgroup = head->mem_cgroup; | 2370 | head[i].mem_cgroup = head->mem_cgroup; |
2383 | 2371 | ||
2384 | __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], | 2372 | __this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE], |
2385 | HPAGE_PMD_NR); | 2373 | HPAGE_PMD_NR); |
2386 | } | 2374 | } |
2387 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 2375 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
@@ -2391,7 +2379,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | |||
2391 | bool charge) | 2379 | bool charge) |
2392 | { | 2380 | { |
2393 | int val = (charge) ? 1 : -1; | 2381 | int val = (charge) ? 1 : -1; |
2394 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); | 2382 | this_cpu_add(memcg->stat->count[MEMCG_SWAP], val); |
2395 | } | 2383 | } |
2396 | 2384 | ||
2397 | /** | 2385 | /** |
@@ -2725,7 +2713,7 @@ static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) | |||
2725 | 2713 | ||
2726 | for_each_mem_cgroup_tree(iter, memcg) { | 2714 | for_each_mem_cgroup_tree(iter, memcg) { |
2727 | for (i = 0; i < MEMCG_NR_STAT; i++) | 2715 | for (i = 0; i < MEMCG_NR_STAT; i++) |
2728 | stat[i] += mem_cgroup_read_stat(iter, i); | 2716 | stat[i] += memcg_page_state(iter, i); |
2729 | } | 2717 | } |
2730 | } | 2718 | } |
2731 | 2719 | ||
@@ -2738,7 +2726,7 @@ static void tree_events(struct mem_cgroup *memcg, unsigned long *events) | |||
2738 | 2726 | ||
2739 | for_each_mem_cgroup_tree(iter, memcg) { | 2727 | for_each_mem_cgroup_tree(iter, memcg) { |
2740 | for (i = 0; i < MEMCG_NR_EVENTS; i++) | 2728 | for (i = 0; i < MEMCG_NR_EVENTS; i++) |
2741 | events[i] += mem_cgroup_read_events(iter, i); | 2729 | events[i] += memcg_sum_events(iter, i); |
2742 | } | 2730 | } |
2743 | } | 2731 | } |
2744 | 2732 | ||
@@ -2750,13 +2738,10 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
2750 | struct mem_cgroup *iter; | 2738 | struct mem_cgroup *iter; |
2751 | 2739 | ||
2752 | for_each_mem_cgroup_tree(iter, memcg) { | 2740 | for_each_mem_cgroup_tree(iter, memcg) { |
2753 | val += mem_cgroup_read_stat(iter, | 2741 | val += memcg_page_state(iter, MEMCG_CACHE); |
2754 | MEM_CGROUP_STAT_CACHE); | 2742 | val += memcg_page_state(iter, MEMCG_RSS); |
2755 | val += mem_cgroup_read_stat(iter, | ||
2756 | MEM_CGROUP_STAT_RSS); | ||
2757 | if (swap) | 2743 | if (swap) |
2758 | val += mem_cgroup_read_stat(iter, | 2744 | val += memcg_page_state(iter, MEMCG_SWAP); |
2759 | MEM_CGROUP_STAT_SWAP); | ||
2760 | } | 2745 | } |
2761 | } else { | 2746 | } else { |
2762 | if (!swap) | 2747 | if (!swap) |
@@ -3131,6 +3116,21 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) | |||
3131 | } | 3116 | } |
3132 | #endif /* CONFIG_NUMA */ | 3117 | #endif /* CONFIG_NUMA */ |
3133 | 3118 | ||
3119 | /* Universal VM events cgroup1 shows, original sort order */ | ||
3120 | unsigned int memcg1_events[] = { | ||
3121 | PGPGIN, | ||
3122 | PGPGOUT, | ||
3123 | PGFAULT, | ||
3124 | PGMAJFAULT, | ||
3125 | }; | ||
3126 | |||
3127 | static const char *const memcg1_event_names[] = { | ||
3128 | "pgpgin", | ||
3129 | "pgpgout", | ||
3130 | "pgfault", | ||
3131 | "pgmajfault", | ||
3132 | }; | ||
3133 | |||
3134 | static int memcg_stat_show(struct seq_file *m, void *v) | 3134 | static int memcg_stat_show(struct seq_file *m, void *v) |
3135 | { | 3135 | { |
3136 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 3136 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
@@ -3138,22 +3138,20 @@ static int memcg_stat_show(struct seq_file *m, void *v) | |||
3138 | struct mem_cgroup *mi; | 3138 | struct mem_cgroup *mi; |
3139 | unsigned int i; | 3139 | unsigned int i; |
3140 | 3140 | ||
3141 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) != | 3141 | BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); |
3142 | MEM_CGROUP_STAT_NSTATS); | ||
3143 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) != | ||
3144 | MEM_CGROUP_EVENTS_NSTATS); | ||
3145 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 3142 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
3146 | 3143 | ||
3147 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 3144 | for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { |
3148 | if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) | 3145 | if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) |
3149 | continue; | 3146 | continue; |
3150 | seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i], | 3147 | seq_printf(m, "%s %lu\n", memcg1_stat_names[i], |
3151 | mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); | 3148 | memcg_page_state(memcg, memcg1_stats[i]) * |
3149 | PAGE_SIZE); | ||
3152 | } | 3150 | } |
3153 | 3151 | ||
3154 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) | 3152 | for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) |
3155 | seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], | 3153 | seq_printf(m, "%s %lu\n", memcg1_event_names[i], |
3156 | mem_cgroup_read_events(memcg, i)); | 3154 | memcg_sum_events(memcg, memcg1_events[i])); |
3157 | 3155 | ||
3158 | for (i = 0; i < NR_LRU_LISTS; i++) | 3156 | for (i = 0; i < NR_LRU_LISTS; i++) |
3159 | seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], | 3157 | seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], |
@@ -3171,23 +3169,23 @@ static int memcg_stat_show(struct seq_file *m, void *v) | |||
3171 | seq_printf(m, "hierarchical_memsw_limit %llu\n", | 3169 | seq_printf(m, "hierarchical_memsw_limit %llu\n", |
3172 | (u64)memsw * PAGE_SIZE); | 3170 | (u64)memsw * PAGE_SIZE); |
3173 | 3171 | ||
3174 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 3172 | for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { |
3175 | unsigned long long val = 0; | 3173 | unsigned long long val = 0; |
3176 | 3174 | ||
3177 | if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account()) | 3175 | if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) |
3178 | continue; | 3176 | continue; |
3179 | for_each_mem_cgroup_tree(mi, memcg) | 3177 | for_each_mem_cgroup_tree(mi, memcg) |
3180 | val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; | 3178 | val += memcg_page_state(mi, memcg1_stats[i]) * |
3181 | seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val); | 3179 | PAGE_SIZE; |
3180 | seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val); | ||
3182 | } | 3181 | } |
3183 | 3182 | ||
3184 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { | 3183 | for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) { |
3185 | unsigned long long val = 0; | 3184 | unsigned long long val = 0; |
3186 | 3185 | ||
3187 | for_each_mem_cgroup_tree(mi, memcg) | 3186 | for_each_mem_cgroup_tree(mi, memcg) |
3188 | val += mem_cgroup_read_events(mi, i); | 3187 | val += memcg_sum_events(mi, memcg1_events[i]); |
3189 | seq_printf(m, "total_%s %llu\n", | 3188 | seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val); |
3190 | mem_cgroup_events_names[i], val); | ||
3191 | } | 3189 | } |
3192 | 3190 | ||
3193 | for (i = 0; i < NR_LRU_LISTS; i++) { | 3191 | for (i = 0; i < NR_LRU_LISTS; i++) { |
@@ -3652,10 +3650,10 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, | |||
3652 | struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); | 3650 | struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); |
3653 | struct mem_cgroup *parent; | 3651 | struct mem_cgroup *parent; |
3654 | 3652 | ||
3655 | *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY); | 3653 | *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); |
3656 | 3654 | ||
3657 | /* this should eventually include NR_UNSTABLE_NFS */ | 3655 | /* this should eventually include NR_UNSTABLE_NFS */ |
3658 | *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); | 3656 | *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); |
3659 | *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | | 3657 | *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | |
3660 | (1 << LRU_ACTIVE_FILE)); | 3658 | (1 << LRU_ACTIVE_FILE)); |
3661 | *pheadroom = PAGE_COUNTER_MAX; | 3659 | *pheadroom = PAGE_COUNTER_MAX; |
@@ -4511,33 +4509,29 @@ static int mem_cgroup_move_account(struct page *page, | |||
4511 | spin_lock_irqsave(&from->move_lock, flags); | 4509 | spin_lock_irqsave(&from->move_lock, flags); |
4512 | 4510 | ||
4513 | if (!anon && page_mapped(page)) { | 4511 | if (!anon && page_mapped(page)) { |
4514 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | 4512 | __this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages); |
4515 | nr_pages); | 4513 | __this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages); |
4516 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | ||
4517 | nr_pages); | ||
4518 | } | 4514 | } |
4519 | 4515 | ||
4520 | /* | 4516 | /* |
4521 | * move_lock grabbed above and caller set from->moving_account, so | 4517 | * move_lock grabbed above and caller set from->moving_account, so |
4522 | * mem_cgroup_update_page_stat() will serialize updates to PageDirty. | 4518 | * mod_memcg_page_state will serialize updates to PageDirty. |
4523 | * So mapping should be stable for dirty pages. | 4519 | * So mapping should be stable for dirty pages. |
4524 | */ | 4520 | */ |
4525 | if (!anon && PageDirty(page)) { | 4521 | if (!anon && PageDirty(page)) { |
4526 | struct address_space *mapping = page_mapping(page); | 4522 | struct address_space *mapping = page_mapping(page); |
4527 | 4523 | ||
4528 | if (mapping_cap_account_dirty(mapping)) { | 4524 | if (mapping_cap_account_dirty(mapping)) { |
4529 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY], | 4525 | __this_cpu_sub(from->stat->count[NR_FILE_DIRTY], |
4530 | nr_pages); | 4526 | nr_pages); |
4531 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY], | 4527 | __this_cpu_add(to->stat->count[NR_FILE_DIRTY], |
4532 | nr_pages); | 4528 | nr_pages); |
4533 | } | 4529 | } |
4534 | } | 4530 | } |
4535 | 4531 | ||
4536 | if (PageWriteback(page)) { | 4532 | if (PageWriteback(page)) { |
4537 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], | 4533 | __this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages); |
4538 | nr_pages); | 4534 | __this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages); |
4539 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], | ||
4540 | nr_pages); | ||
4541 | } | 4535 | } |
4542 | 4536 | ||
4543 | /* | 4537 | /* |
@@ -5154,7 +5148,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, | |||
5154 | continue; | 5148 | continue; |
5155 | } | 5149 | } |
5156 | 5150 | ||
5157 | mem_cgroup_events(memcg, MEMCG_OOM, 1); | 5151 | mem_cgroup_event(memcg, MEMCG_OOM); |
5158 | if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) | 5152 | if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) |
5159 | break; | 5153 | break; |
5160 | } | 5154 | } |
@@ -5167,10 +5161,10 @@ static int memory_events_show(struct seq_file *m, void *v) | |||
5167 | { | 5161 | { |
5168 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5162 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
5169 | 5163 | ||
5170 | seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW)); | 5164 | seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW)); |
5171 | seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH)); | 5165 | seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH)); |
5172 | seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX)); | 5166 | seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX)); |
5173 | seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM)); | 5167 | seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM)); |
5174 | 5168 | ||
5175 | return 0; | 5169 | return 0; |
5176 | } | 5170 | } |
@@ -5197,9 +5191,9 @@ static int memory_stat_show(struct seq_file *m, void *v) | |||
5197 | tree_events(memcg, events); | 5191 | tree_events(memcg, events); |
5198 | 5192 | ||
5199 | seq_printf(m, "anon %llu\n", | 5193 | seq_printf(m, "anon %llu\n", |
5200 | (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); | 5194 | (u64)stat[MEMCG_RSS] * PAGE_SIZE); |
5201 | seq_printf(m, "file %llu\n", | 5195 | seq_printf(m, "file %llu\n", |
5202 | (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); | 5196 | (u64)stat[MEMCG_CACHE] * PAGE_SIZE); |
5203 | seq_printf(m, "kernel_stack %llu\n", | 5197 | seq_printf(m, "kernel_stack %llu\n", |
5204 | (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); | 5198 | (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); |
5205 | seq_printf(m, "slab %llu\n", | 5199 | seq_printf(m, "slab %llu\n", |
@@ -5208,12 +5202,14 @@ static int memory_stat_show(struct seq_file *m, void *v) | |||
5208 | seq_printf(m, "sock %llu\n", | 5202 | seq_printf(m, "sock %llu\n", |
5209 | (u64)stat[MEMCG_SOCK] * PAGE_SIZE); | 5203 | (u64)stat[MEMCG_SOCK] * PAGE_SIZE); |
5210 | 5204 | ||
5205 | seq_printf(m, "shmem %llu\n", | ||
5206 | (u64)stat[NR_SHMEM] * PAGE_SIZE); | ||
5211 | seq_printf(m, "file_mapped %llu\n", | 5207 | seq_printf(m, "file_mapped %llu\n", |
5212 | (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); | 5208 | (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE); |
5213 | seq_printf(m, "file_dirty %llu\n", | 5209 | seq_printf(m, "file_dirty %llu\n", |
5214 | (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE); | 5210 | (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE); |
5215 | seq_printf(m, "file_writeback %llu\n", | 5211 | seq_printf(m, "file_writeback %llu\n", |
5216 | (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE); | 5212 | (u64)stat[NR_WRITEBACK] * PAGE_SIZE); |
5217 | 5213 | ||
5218 | for (i = 0; i < NR_LRU_LISTS; i++) { | 5214 | for (i = 0; i < NR_LRU_LISTS; i++) { |
5219 | struct mem_cgroup *mi; | 5215 | struct mem_cgroup *mi; |
@@ -5232,10 +5228,15 @@ static int memory_stat_show(struct seq_file *m, void *v) | |||
5232 | 5228 | ||
5233 | /* Accumulated memory events */ | 5229 | /* Accumulated memory events */ |
5234 | 5230 | ||
5235 | seq_printf(m, "pgfault %lu\n", | 5231 | seq_printf(m, "pgfault %lu\n", events[PGFAULT]); |
5236 | events[MEM_CGROUP_EVENTS_PGFAULT]); | 5232 | seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); |
5237 | seq_printf(m, "pgmajfault %lu\n", | 5233 | |
5238 | events[MEM_CGROUP_EVENTS_PGMAJFAULT]); | 5234 | seq_printf(m, "workingset_refault %lu\n", |
5235 | stat[WORKINGSET_REFAULT]); | ||
5236 | seq_printf(m, "workingset_activate %lu\n", | ||
5237 | stat[WORKINGSET_ACTIVATE]); | ||
5238 | seq_printf(m, "workingset_nodereclaim %lu\n", | ||
5239 | stat[WORKINGSET_NODERECLAIM]); | ||
5239 | 5240 | ||
5240 | return 0; | 5241 | return 0; |
5241 | } | 5242 | } |
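(Editor's note on the hunk above: the rewritten memory_stat_show() adds "shmem" and the three "workingset_*" keys to the flat key/value output of cgroup v2's memory.stat. As a hedged illustration only, here is a minimal userspace reader for those new keys; the /sys/fs/cgroup mount point is an assumption, and the program is not part of this patch.)

    /* Hypothetical reader for the new memory.stat keys; illustrative only. */
    #include <stdio.h>
    #include <string.h>

    int main(int argc, char **argv)
    {
            /* Assumed default cgroup2 path; pass another cgroup dir as argv[1]. */
            const char *dir = argc > 1 ? argv[1] : "/sys/fs/cgroup";
            char path[512], key[64];
            unsigned long long val;
            FILE *f;

            snprintf(path, sizeof(path), "%s/memory.stat", dir);
            f = fopen(path, "r");
            if (!f) {
                    perror(path);
                    return 1;
            }
            /* memory.stat is "key value\n" per line; pick out the new fields. */
            while (fscanf(f, "%63s %llu", key, &val) == 2) {
                    if (!strcmp(key, "shmem") ||
                        !strncmp(key, "workingset_", 11))
                            printf("%s = %llu\n", key, val);
            }
            fclose(f);
            return 0;
    }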
@@ -5476,8 +5477,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, | |||
5476 | 5477 | ||
5477 | static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, | 5478 | static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, |
5478 | unsigned long nr_anon, unsigned long nr_file, | 5479 | unsigned long nr_anon, unsigned long nr_file, |
5479 | unsigned long nr_huge, unsigned long nr_kmem, | 5480 | unsigned long nr_kmem, unsigned long nr_huge, |
5480 | struct page *dummy_page) | 5481 | unsigned long nr_shmem, struct page *dummy_page) |
5481 | { | 5482 | { |
5482 | unsigned long nr_pages = nr_anon + nr_file + nr_kmem; | 5483 | unsigned long nr_pages = nr_anon + nr_file + nr_kmem; |
5483 | unsigned long flags; | 5484 | unsigned long flags; |
@@ -5492,10 +5493,11 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, | |||
5492 | } | 5493 | } |
5493 | 5494 | ||
5494 | local_irq_save(flags); | 5495 | local_irq_save(flags); |
5495 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); | 5496 | __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon); |
5496 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); | 5497 | __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file); |
5497 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); | 5498 | __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge); |
5498 | __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); | 5499 | __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem); |
5500 | __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout); | ||
5499 | __this_cpu_add(memcg->stat->nr_page_events, nr_pages); | 5501 | __this_cpu_add(memcg->stat->nr_page_events, nr_pages); |
5500 | memcg_check_events(memcg, dummy_page); | 5502 | memcg_check_events(memcg, dummy_page); |
5501 | local_irq_restore(flags); | 5503 | local_irq_restore(flags); |
@@ -5507,6 +5509,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, | |||
5507 | static void uncharge_list(struct list_head *page_list) | 5509 | static void uncharge_list(struct list_head *page_list) |
5508 | { | 5510 | { |
5509 | struct mem_cgroup *memcg = NULL; | 5511 | struct mem_cgroup *memcg = NULL; |
5512 | unsigned long nr_shmem = 0; | ||
5510 | unsigned long nr_anon = 0; | 5513 | unsigned long nr_anon = 0; |
5511 | unsigned long nr_file = 0; | 5514 | unsigned long nr_file = 0; |
5512 | unsigned long nr_huge = 0; | 5515 | unsigned long nr_huge = 0; |
@@ -5539,9 +5542,9 @@ static void uncharge_list(struct list_head *page_list) | |||
5539 | if (memcg != page->mem_cgroup) { | 5542 | if (memcg != page->mem_cgroup) { |
5540 | if (memcg) { | 5543 | if (memcg) { |
5541 | uncharge_batch(memcg, pgpgout, nr_anon, nr_file, | 5544 | uncharge_batch(memcg, pgpgout, nr_anon, nr_file, |
5542 | nr_huge, nr_kmem, page); | 5545 | nr_kmem, nr_huge, nr_shmem, page); |
5543 | pgpgout = nr_anon = nr_file = | 5546 | pgpgout = nr_anon = nr_file = nr_kmem = 0; |
5544 | nr_huge = nr_kmem = 0; | 5547 | nr_huge = nr_shmem = 0; |
5545 | } | 5548 | } |
5546 | memcg = page->mem_cgroup; | 5549 | memcg = page->mem_cgroup; |
5547 | } | 5550 | } |
@@ -5555,8 +5558,11 @@ static void uncharge_list(struct list_head *page_list) | |||
5555 | } | 5558 | } |
5556 | if (PageAnon(page)) | 5559 | if (PageAnon(page)) |
5557 | nr_anon += nr_pages; | 5560 | nr_anon += nr_pages; |
5558 | else | 5561 | else { |
5559 | nr_file += nr_pages; | 5562 | nr_file += nr_pages; |
5563 | if (PageSwapBacked(page)) | ||
5564 | nr_shmem += nr_pages; | ||
5565 | } | ||
5560 | pgpgout++; | 5566 | pgpgout++; |
5561 | } else { | 5567 | } else { |
5562 | nr_kmem += 1 << compound_order(page); | 5568 | nr_kmem += 1 << compound_order(page); |
@@ -5568,7 +5574,7 @@ static void uncharge_list(struct list_head *page_list) | |||
5568 | 5574 | ||
5569 | if (memcg) | 5575 | if (memcg) |
5570 | uncharge_batch(memcg, pgpgout, nr_anon, nr_file, | 5576 | uncharge_batch(memcg, pgpgout, nr_anon, nr_file, |
5571 | nr_huge, nr_kmem, page); | 5577 | nr_kmem, nr_huge, nr_shmem, page); |
5572 | } | 5578 | } |
5573 | 5579 | ||
5574 | /** | 5580 | /** |
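(Editor's note on the memcontrol.c hunks above: uncharge_list() now keeps a separate nr_shmem count, classifying swap-backed file pages as shmem in addition to counting them as file. A minimal standalone sketch of that bucketing, with plain booleans standing in for PageAnon()/PageSwapBacked() and all names purely illustrative:)

    /* Illustrative only: mirrors the anon/file/shmem bucketing of uncharge_list(). */
    #include <stdio.h>

    struct fake_page {
            int anon;               /* stands in for PageAnon()       */
            int swap_backed;        /* stands in for PageSwapBacked() */
            unsigned long nr_pages;
    };

    static void account_uncharge(const struct fake_page *p,
                                 unsigned long *nr_anon,
                                 unsigned long *nr_file,
                                 unsigned long *nr_shmem)
    {
            if (p->anon) {
                    *nr_anon += p->nr_pages;
            } else {
                    /* file pages; swap-backed file pages are shmem/tmpfs */
                    *nr_file += p->nr_pages;
                    if (p->swap_backed)
                            *nr_shmem += p->nr_pages;
            }
    }

    int main(void)
    {
            struct fake_page pages[] = {
                    { 1, 1, 1 },    /* anon       */
                    { 0, 0, 4 },    /* plain file */
                    { 0, 1, 2 },    /* shmem      */
            };
            unsigned long anon = 0, file = 0, shmem = 0;
            unsigned int i;

            for (i = 0; i < sizeof(pages) / sizeof(pages[0]); i++)
                    account_uncharge(&pages[i], &anon, &file, &shmem);
            printf("anon=%lu file=%lu shmem=%lu\n", anon, file, shmem);
            return 0;
    }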
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 27f7210e7fab..73066b80d14a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -220,6 +220,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, | |||
220 | */ | 220 | */ |
221 | void shake_page(struct page *p, int access) | 221 | void shake_page(struct page *p, int access) |
222 | { | 222 | { |
223 | if (PageHuge(p)) | ||
224 | return; | ||
225 | |||
223 | if (!PageSlab(p)) { | 226 | if (!PageSlab(p)) { |
224 | lru_add_drain_all(); | 227 | lru_add_drain_all(); |
225 | if (PageLRU(p)) | 228 | if (PageLRU(p)) |
@@ -322,7 +325,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, | |||
322 | * wrong earlier. | 325 | * wrong earlier. |
323 | */ | 326 | */ |
324 | static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, | 327 | static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, |
325 | int fail, struct page *page, unsigned long pfn, | 328 | bool fail, struct page *page, unsigned long pfn, |
326 | int flags) | 329 | int flags) |
327 | { | 330 | { |
328 | struct to_kill *tk, *next; | 331 | struct to_kill *tk, *next; |
@@ -904,35 +907,36 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page); | |||
904 | * Do all that is necessary to remove user space mappings. Unmap | 907 | * Do all that is necessary to remove user space mappings. Unmap |
905 | * the pages and send SIGBUS to the processes if the data was dirty. | 908 | * the pages and send SIGBUS to the processes if the data was dirty. |
906 | */ | 909 | */ |
907 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | 910 | static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, |
908 | int trapno, int flags, struct page **hpagep) | 911 | int trapno, int flags, struct page **hpagep) |
909 | { | 912 | { |
910 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | 913 | enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
911 | struct address_space *mapping; | 914 | struct address_space *mapping; |
912 | LIST_HEAD(tokill); | 915 | LIST_HEAD(tokill); |
913 | int ret; | 916 | bool unmap_success; |
914 | int kill = 1, forcekill; | 917 | int kill = 1, forcekill; |
915 | struct page *hpage = *hpagep; | 918 | struct page *hpage = *hpagep; |
919 | bool mlocked = PageMlocked(hpage); | ||
916 | 920 | ||
917 | /* | 921 | /* |
918 | * Here we are interested only in user-mapped pages, so skip any | 922 | * Here we are interested only in user-mapped pages, so skip any |
919 | * other types of pages. | 923 | * other types of pages. |
920 | */ | 924 | */ |
921 | if (PageReserved(p) || PageSlab(p)) | 925 | if (PageReserved(p) || PageSlab(p)) |
922 | return SWAP_SUCCESS; | 926 | return true; |
923 | if (!(PageLRU(hpage) || PageHuge(p))) | 927 | if (!(PageLRU(hpage) || PageHuge(p))) |
924 | return SWAP_SUCCESS; | 928 | return true; |
925 | 929 | ||
926 | /* | 930 | /* |
927 | * This check implies we don't kill processes if their pages | 931 | * This check implies we don't kill processes if their pages |
928 | * are in the swap cache early. Those are always late kills. | 932 | * are in the swap cache early. Those are always late kills. |
929 | */ | 933 | */ |
930 | if (!page_mapped(hpage)) | 934 | if (!page_mapped(hpage)) |
931 | return SWAP_SUCCESS; | 935 | return true; |
932 | 936 | ||
933 | if (PageKsm(p)) { | 937 | if (PageKsm(p)) { |
934 | pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn); | 938 | pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn); |
935 | return SWAP_FAIL; | 939 | return false; |
936 | } | 940 | } |
937 | 941 | ||
938 | if (PageSwapCache(p)) { | 942 | if (PageSwapCache(p)) { |
@@ -971,12 +975,19 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
971 | if (kill) | 975 | if (kill) |
972 | collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); | 976 | collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); |
973 | 977 | ||
974 | ret = try_to_unmap(hpage, ttu); | 978 | unmap_success = try_to_unmap(hpage, ttu); |
975 | if (ret != SWAP_SUCCESS) | 979 | if (!unmap_success) |
976 | pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", | 980 | pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", |
977 | pfn, page_mapcount(hpage)); | 981 | pfn, page_mapcount(hpage)); |
978 | 982 | ||
979 | /* | 983 | /* |
984 | * try_to_unmap() might put mlocked page in lru cache, so call | ||
985 | * shake_page() again to ensure that it's flushed. | ||
986 | */ | ||
987 | if (mlocked) | ||
988 | shake_page(hpage, 0); | ||
989 | |||
990 | /* | ||
980 | * Now that the dirty bit has been propagated to the | 991 | * Now that the dirty bit has been propagated to the |
981 | * struct page and all unmaps done we can decide if | 992 | * struct page and all unmaps done we can decide if |
982 | * killing is needed or not. Only kill when the page | 993 | * killing is needed or not. Only kill when the page |
@@ -987,10 +998,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
987 | * any accesses to the poisoned memory. | 998 | * any accesses to the poisoned memory. |
988 | */ | 999 | */ |
989 | forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); | 1000 | forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); |
990 | kill_procs(&tokill, forcekill, trapno, | 1001 | kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags); |
991 | ret != SWAP_SUCCESS, p, pfn, flags); | ||
992 | 1002 | ||
993 | return ret; | 1003 | return unmap_success; |
994 | } | 1004 | } |
995 | 1005 | ||
996 | static void set_page_hwpoison_huge_page(struct page *hpage) | 1006 | static void set_page_hwpoison_huge_page(struct page *hpage) |
@@ -1138,22 +1148,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1138 | * The check (unnecessarily) ignores LRU pages being isolated and | 1148 | * The check (unnecessarily) ignores LRU pages being isolated and |
1139 | * walked by the page reclaim code, however that's not a big loss. | 1149 | * walked by the page reclaim code, however that's not a big loss. |
1140 | */ | 1150 | */ |
1141 | if (!PageHuge(p)) { | 1151 | shake_page(p, 0); |
1142 | if (!PageLRU(p)) | 1152 | /* shake_page could have turned it free. */ |
1143 | shake_page(p, 0); | 1153 | if (!PageLRU(p) && is_free_buddy_page(p)) { |
1144 | if (!PageLRU(p)) { | 1154 | if (flags & MF_COUNT_INCREASED) |
1145 | /* | 1155 | action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); |
1146 | * shake_page could have turned it free. | 1156 | else |
1147 | */ | 1157 | action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED); |
1148 | if (is_free_buddy_page(p)) { | 1158 | return 0; |
1149 | if (flags & MF_COUNT_INCREASED) | ||
1150 | action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); | ||
1151 | else | ||
1152 | action_result(pfn, MF_MSG_BUDDY_2ND, | ||
1153 | MF_DELAYED); | ||
1154 | return 0; | ||
1155 | } | ||
1156 | } | ||
1157 | } | 1159 | } |
1158 | 1160 | ||
1159 | lock_page(hpage); | 1161 | lock_page(hpage); |
@@ -1230,8 +1232,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1230 | * When the raw error page is thp tail page, hpage points to the raw | 1232 | * When the raw error page is thp tail page, hpage points to the raw |
1231 | * page after thp split. | 1233 | * page after thp split. |
1232 | */ | 1234 | */ |
1233 | if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) | 1235 | if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) { |
1234 | != SWAP_SUCCESS) { | ||
1235 | action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); | 1236 | action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); |
1236 | res = -EBUSY; | 1237 | res = -EBUSY; |
1237 | goto out; | 1238 | goto out; |
@@ -1543,8 +1544,8 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) | |||
1543 | if (ret == 1 && !PageLRU(page)) { | 1544 | if (ret == 1 && !PageLRU(page)) { |
1544 | /* Drop page reference which is from __get_any_page() */ | 1545 | /* Drop page reference which is from __get_any_page() */ |
1545 | put_hwpoison_page(page); | 1546 | put_hwpoison_page(page); |
1546 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", | 1547 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n", |
1547 | pfn, page->flags); | 1548 | pfn, page->flags, &page->flags); |
1548 | return -EIO; | 1549 | return -EIO; |
1549 | } | 1550 | } |
1550 | } | 1551 | } |
@@ -1585,8 +1586,8 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1585 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, | 1586 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
1586 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1587 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
1587 | if (ret) { | 1588 | if (ret) { |
1588 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1589 | pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", |
1589 | pfn, ret, page->flags); | 1590 | pfn, ret, page->flags, &page->flags); |
1590 | /* | 1591 | /* |
1591 | * We know that soft_offline_huge_page() tries to migrate | 1592 | * We know that soft_offline_huge_page() tries to migrate |
1592 | * only one hugepage pointed to by hpage, so we need not | 1593 | * only one hugepage pointed to by hpage, so we need not |
@@ -1677,14 +1678,14 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1677 | if (!list_empty(&pagelist)) | 1678 | if (!list_empty(&pagelist)) |
1678 | putback_movable_pages(&pagelist); | 1679 | putback_movable_pages(&pagelist); |
1679 | 1680 | ||
1680 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1681 | pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", |
1681 | pfn, ret, page->flags); | 1682 | pfn, ret, page->flags, &page->flags); |
1682 | if (ret > 0) | 1683 | if (ret > 0) |
1683 | ret = -EIO; | 1684 | ret = -EIO; |
1684 | } | 1685 | } |
1685 | } else { | 1686 | } else { |
1686 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | 1687 | pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n", |
1687 | pfn, ret, page_count(page), page->flags); | 1688 | pfn, ret, page_count(page), page->flags, &page->flags); |
1688 | } | 1689 | } |
1689 | return ret; | 1690 | return ret; |
1690 | } | 1691 | } |
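(Editor's note on the memory-failure.c hunks above: hwpoison_user_mappings() and remove_migration_pte() drop the SWAP_SUCCESS/SWAP_FAIL status codes in favour of a plain bool, and kill_procs() takes a bool "fail". A small sketch of that refactor pattern; unmap_old()/unmap_new() are illustrative names, not kernel API:)

    /* Illustrative before/after of collapsing a status enum into a bool. */
    #include <stdbool.h>
    #include <stdio.h>

    /* old style: tri-state status that callers compare against constants */
    enum unmap_status { UNMAP_SUCCESS, UNMAP_AGAIN, UNMAP_FAIL };

    static enum unmap_status unmap_old(int mapcount)
    {
            return mapcount ? UNMAP_FAIL : UNMAP_SUCCESS;
    }

    /* new style: the function answers the only question callers ask */
    static bool unmap_new(int mapcount)
    {
            return mapcount == 0;
    }

    int main(void)
    {
            int mapcount = 2;

            if (unmap_old(mapcount) != UNMAP_SUCCESS)
                    printf("old: unmap failed (mapcount=%d)\n", mapcount);
            if (!unmap_new(mapcount))
                    printf("new: unmap failed (mapcount=%d)\n", mapcount);
            return 0;
    }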
diff --git a/mm/memory.c b/mm/memory.c index 235ba51b2fbf..6ff5d729ded0 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -4298,7 +4298,7 @@ void __might_fault(const char *file, int line) | |||
4298 | * get paged out, therefore we'll never actually fault, and the | 4298 | * get paged out, therefore we'll never actually fault, and the |
4299 | * below annotations will generate false positives. | 4299 | * below annotations will generate false positives. |
4300 | */ | 4300 | */ |
4301 | if (segment_eq(get_fs(), KERNEL_DS)) | 4301 | if (uaccess_kernel()) |
4302 | return; | 4302 | return; |
4303 | if (pagefault_disabled()) | 4303 | if (pagefault_disabled()) |
4304 | return; | 4304 | return; |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6fa7208bcd56..b63d7d1239df 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -1208,7 +1208,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
1208 | 1208 | ||
1209 | arch_refresh_nodedata(nid, pgdat); | 1209 | arch_refresh_nodedata(nid, pgdat); |
1210 | } else { | 1210 | } else { |
1211 | /* Reset the nr_zones, order and classzone_idx before reuse */ | 1211 | /* |
1212 | * Reset the nr_zones, order and classzone_idx before reuse. | ||
1213 | * Note that kswapd will init kswapd_classzone_idx properly | ||
1214 | * when it starts in the near future. | ||
1215 | */ | ||
1212 | pgdat->nr_zones = 0; | 1216 | pgdat->nr_zones = 0; |
1213 | pgdat->kswapd_order = 0; | 1217 | pgdat->kswapd_order = 0; |
1214 | pgdat->kswapd_classzone_idx = 0; | 1218 | pgdat->kswapd_classzone_idx = 0; |
diff --git a/mm/migrate.c b/mm/migrate.c index ed97c2c14fa8..89a0a1707f4c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -184,9 +184,9 @@ void putback_movable_pages(struct list_head *l) | |||
184 | unlock_page(page); | 184 | unlock_page(page); |
185 | put_page(page); | 185 | put_page(page); |
186 | } else { | 186 | } else { |
187 | putback_lru_page(page); | ||
188 | dec_node_page_state(page, NR_ISOLATED_ANON + | 187 | dec_node_page_state(page, NR_ISOLATED_ANON + |
189 | page_is_file_cache(page)); | 188 | page_is_file_cache(page)); |
189 | putback_lru_page(page); | ||
190 | } | 190 | } |
191 | } | 191 | } |
192 | } | 192 | } |
@@ -194,7 +194,7 @@ void putback_movable_pages(struct list_head *l) | |||
194 | /* | 194 | /* |
195 | * Restore a potential migration pte to a working pte entry | 195 | * Restore a potential migration pte to a working pte entry |
196 | */ | 196 | */ |
197 | static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, | 197 | static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, |
198 | unsigned long addr, void *old) | 198 | unsigned long addr, void *old) |
199 | { | 199 | { |
200 | struct page_vma_mapped_walk pvmw = { | 200 | struct page_vma_mapped_walk pvmw = { |
@@ -253,7 +253,7 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma, | |||
253 | update_mmu_cache(vma, pvmw.address, pvmw.pte); | 253 | update_mmu_cache(vma, pvmw.address, pvmw.pte); |
254 | } | 254 | } |
255 | 255 | ||
256 | return SWAP_AGAIN; | 256 | return true; |
257 | } | 257 | } |
258 | 258 | ||
259 | /* | 259 | /* |
@@ -1722,9 +1722,6 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, | |||
1722 | { | 1722 | { |
1723 | int z; | 1723 | int z; |
1724 | 1724 | ||
1725 | if (!pgdat_reclaimable(pgdat)) | ||
1726 | return false; | ||
1727 | |||
1728 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { | 1725 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { |
1729 | struct zone *zone = pgdat->node_zones + z; | 1726 | struct zone *zone = pgdat->node_zones + z; |
1730 | 1727 | ||
@@ -1947,7 +1944,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1947 | 1944 | ||
1948 | /* Prepare a page as a migration target */ | 1945 | /* Prepare a page as a migration target */ |
1949 | __SetPageLocked(new_page); | 1946 | __SetPageLocked(new_page); |
1950 | __SetPageSwapBacked(new_page); | 1947 | if (PageSwapBacked(page)) |
1948 | __SetPageSwapBacked(new_page); | ||
1951 | 1949 | ||
1952 | /* anon mapping, we can simply copy page->mapping to the new page: */ | 1950 | /* anon mapping, we can simply copy page->mapping to the new page: */ |
1953 | new_page->mapping = page->mapping; | 1951 | new_page->mapping = page->mapping; |
diff --git a/mm/mlock.c b/mm/mlock.c index 0dd9ca18e19e..c483c5c20b4b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -123,17 +123,15 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage) | |||
123 | */ | 123 | */ |
124 | static void __munlock_isolated_page(struct page *page) | 124 | static void __munlock_isolated_page(struct page *page) |
125 | { | 125 | { |
126 | int ret = SWAP_AGAIN; | ||
127 | |||
128 | /* | 126 | /* |
129 | * Optimization: if the page was mapped just once, that's our mapping | 127 | * Optimization: if the page was mapped just once, that's our mapping |
130 | * and we don't need to check all the other vmas. | 128 | * and we don't need to check all the other vmas. |
131 | */ | 129 | */ |
132 | if (page_mapcount(page) > 1) | 130 | if (page_mapcount(page) > 1) |
133 | ret = try_to_munlock(page); | 131 | try_to_munlock(page); |
134 | 132 | ||
135 | /* Did try_to_unlock() succeed or punt? */ | 133 | /* Did try_to_unlock() succeed or punt? */ |
136 | if (ret != SWAP_MLOCK) | 134 | if (!PageMlocked(page)) |
137 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | 135 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); |
138 | 136 | ||
139 | putback_lru_page(page); | 137 | putback_lru_page(page); |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -1479,7 +1479,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1479 | struct user_struct *user = NULL; | 1479 | struct user_struct *user = NULL; |
1480 | struct hstate *hs; | 1480 | struct hstate *hs; |
1481 | 1481 | ||
1482 | hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); | 1482 | hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); |
1483 | if (!hs) | 1483 | if (!hs) |
1484 | return -EINVAL; | 1484 | return -EINVAL; |
1485 | 1485 | ||
diff --git a/mm/nommu.c b/mm/nommu.c index 2d131b97a851..fc184f597d59 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -237,12 +237,16 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | |||
237 | } | 237 | } |
238 | EXPORT_SYMBOL(__vmalloc); | 238 | EXPORT_SYMBOL(__vmalloc); |
239 | 239 | ||
240 | void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) | ||
241 | { | ||
242 | return __vmalloc(size, flags, PAGE_KERNEL); | ||
243 | } | ||
244 | |||
240 | void *vmalloc_user(unsigned long size) | 245 | void *vmalloc_user(unsigned long size) |
241 | { | 246 | { |
242 | void *ret; | 247 | void *ret; |
243 | 248 | ||
244 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | 249 | ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); |
245 | PAGE_KERNEL); | ||
246 | if (ret) { | 250 | if (ret) { |
247 | struct vm_area_struct *vma; | 251 | struct vm_area_struct *vma; |
248 | 252 | ||
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d083714a2bb9..04c9143a8625 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -685,6 +685,7 @@ void exit_oom_victim(void) | |||
685 | void oom_killer_enable(void) | 685 | void oom_killer_enable(void) |
686 | { | 686 | { |
687 | oom_killer_disabled = false; | 687 | oom_killer_disabled = false; |
688 | pr_info("OOM killer enabled.\n"); | ||
688 | } | 689 | } |
689 | 690 | ||
690 | /** | 691 | /** |
@@ -721,6 +722,7 @@ bool oom_killer_disable(signed long timeout) | |||
721 | oom_killer_enable(); | 722 | oom_killer_enable(); |
722 | return false; | 723 | return false; |
723 | } | 724 | } |
725 | pr_info("OOM killer disabled.\n"); | ||
724 | 726 | ||
725 | return true; | 727 | return true; |
726 | } | 728 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d8ac2a7fb9e7..143c1c25d680 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -650,9 +650,8 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) | |||
650 | 650 | ||
651 | spin_lock_init(&dom->lock); | 651 | spin_lock_init(&dom->lock); |
652 | 652 | ||
653 | init_timer_deferrable(&dom->period_timer); | 653 | setup_deferrable_timer(&dom->period_timer, writeout_period, |
654 | dom->period_timer.function = writeout_period; | 654 | (unsigned long)dom); |
655 | dom->period_timer.data = (unsigned long)dom; | ||
656 | 655 | ||
657 | dom->dirty_limit_tstamp = jiffies; | 656 | dom->dirty_limit_tstamp = jiffies; |
658 | 657 | ||
@@ -2353,10 +2352,16 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) | |||
2353 | 2352 | ||
2354 | if (wbc->nr_to_write <= 0) | 2353 | if (wbc->nr_to_write <= 0) |
2355 | return 0; | 2354 | return 0; |
2356 | if (mapping->a_ops->writepages) | 2355 | while (1) { |
2357 | ret = mapping->a_ops->writepages(mapping, wbc); | 2356 | if (mapping->a_ops->writepages) |
2358 | else | 2357 | ret = mapping->a_ops->writepages(mapping, wbc); |
2359 | ret = generic_writepages(mapping, wbc); | 2358 | else |
2359 | ret = generic_writepages(mapping, wbc); | ||
2360 | if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL)) | ||
2361 | break; | ||
2362 | cond_resched(); | ||
2363 | congestion_wait(BLK_RW_ASYNC, HZ/50); | ||
2364 | } | ||
2360 | return ret; | 2365 | return ret; |
2361 | } | 2366 | } |
2362 | 2367 | ||
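(Editor's note on the do_writepages() hunk above: the call into ->writepages()/generic_writepages() is now retried when it fails with -ENOMEM and the caller asked for data-integrity writeback (WB_SYNC_ALL), with a short congestion backoff between attempts. A standalone sketch of that retry shape; fake_writepages() and do_writeback() are stubs, and only the ~HZ/50 backoff value is taken from the hunk:)

    /* Userspace sketch of retry-on-ENOMEM with a short backoff. */
    #include <errno.h>
    #include <stdio.h>
    #include <time.h>

    /* Stub "writepages": pretend the first two attempts hit -ENOMEM. */
    static int fake_writepages(void)
    {
            static int calls;
            return ++calls < 3 ? -ENOMEM : 0;
    }

    static int do_writeback(int sync_all)
    {
            int ret;

            while (1) {
                    ret = fake_writepages();
                    /* Only data-integrity writeback keeps retrying. */
                    if (ret != -ENOMEM || !sync_all)
                            break;
                    /* roughly HZ/50, i.e. ~20ms of backoff */
                    nanosleep(&(struct timespec){ .tv_nsec = 20 * 1000 * 1000 },
                              NULL);
            }
            return ret;
    }

    int main(void)
    {
            printf("result = %d\n", do_writeback(1));
            return 0;
    }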
@@ -2428,7 +2433,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) | |||
2428 | inode_attach_wb(inode, page); | 2433 | inode_attach_wb(inode, page); |
2429 | wb = inode_to_wb(inode); | 2434 | wb = inode_to_wb(inode); |
2430 | 2435 | ||
2431 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); | 2436 | inc_memcg_page_state(page, NR_FILE_DIRTY); |
2432 | __inc_node_page_state(page, NR_FILE_DIRTY); | 2437 | __inc_node_page_state(page, NR_FILE_DIRTY); |
2433 | __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); | 2438 | __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); |
2434 | __inc_node_page_state(page, NR_DIRTIED); | 2439 | __inc_node_page_state(page, NR_DIRTIED); |
@@ -2450,7 +2455,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, | |||
2450 | struct bdi_writeback *wb) | 2455 | struct bdi_writeback *wb) |
2451 | { | 2456 | { |
2452 | if (mapping_cap_account_dirty(mapping)) { | 2457 | if (mapping_cap_account_dirty(mapping)) { |
2453 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); | 2458 | dec_memcg_page_state(page, NR_FILE_DIRTY); |
2454 | dec_node_page_state(page, NR_FILE_DIRTY); | 2459 | dec_node_page_state(page, NR_FILE_DIRTY); |
2455 | dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); | 2460 | dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); |
2456 | dec_wb_stat(wb, WB_RECLAIMABLE); | 2461 | dec_wb_stat(wb, WB_RECLAIMABLE); |
@@ -2707,7 +2712,7 @@ int clear_page_dirty_for_io(struct page *page) | |||
2707 | */ | 2712 | */ |
2708 | wb = unlocked_inode_to_wb_begin(inode, &locked); | 2713 | wb = unlocked_inode_to_wb_begin(inode, &locked); |
2709 | if (TestClearPageDirty(page)) { | 2714 | if (TestClearPageDirty(page)) { |
2710 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); | 2715 | dec_memcg_page_state(page, NR_FILE_DIRTY); |
2711 | dec_node_page_state(page, NR_FILE_DIRTY); | 2716 | dec_node_page_state(page, NR_FILE_DIRTY); |
2712 | dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); | 2717 | dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); |
2713 | dec_wb_stat(wb, WB_RECLAIMABLE); | 2718 | dec_wb_stat(wb, WB_RECLAIMABLE); |
@@ -2754,7 +2759,7 @@ int test_clear_page_writeback(struct page *page) | |||
2754 | ret = TestClearPageWriteback(page); | 2759 | ret = TestClearPageWriteback(page); |
2755 | } | 2760 | } |
2756 | if (ret) { | 2761 | if (ret) { |
2757 | mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); | 2762 | dec_memcg_page_state(page, NR_WRITEBACK); |
2758 | dec_node_page_state(page, NR_WRITEBACK); | 2763 | dec_node_page_state(page, NR_WRITEBACK); |
2759 | dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); | 2764 | dec_zone_page_state(page, NR_ZONE_WRITE_PENDING); |
2760 | inc_node_page_state(page, NR_WRITTEN); | 2765 | inc_node_page_state(page, NR_WRITTEN); |
@@ -2809,7 +2814,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) | |||
2809 | ret = TestSetPageWriteback(page); | 2814 | ret = TestSetPageWriteback(page); |
2810 | } | 2815 | } |
2811 | if (!ret) { | 2816 | if (!ret) { |
2812 | mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); | 2817 | inc_memcg_page_state(page, NR_WRITEBACK); |
2813 | inc_node_page_state(page, NR_WRITEBACK); | 2818 | inc_node_page_state(page, NR_WRITEBACK); |
2814 | inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); | 2819 | inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); |
2815 | } | 2820 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f3d603cef2c0..f9e450c6b6e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -65,6 +65,7 @@ | |||
65 | #include <linux/page_owner.h> | 65 | #include <linux/page_owner.h> |
66 | #include <linux/kthread.h> | 66 | #include <linux/kthread.h> |
67 | #include <linux/memcontrol.h> | 67 | #include <linux/memcontrol.h> |
68 | #include <linux/ftrace.h> | ||
68 | 69 | ||
69 | #include <asm/sections.h> | 70 | #include <asm/sections.h> |
70 | #include <asm/tlbflush.h> | 71 | #include <asm/tlbflush.h> |
@@ -1090,14 +1091,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
1090 | { | 1091 | { |
1091 | int migratetype = 0; | 1092 | int migratetype = 0; |
1092 | int batch_free = 0; | 1093 | int batch_free = 0; |
1093 | unsigned long nr_scanned, flags; | ||
1094 | bool isolated_pageblocks; | 1094 | bool isolated_pageblocks; |
1095 | 1095 | ||
1096 | spin_lock_irqsave(&zone->lock, flags); | 1096 | spin_lock(&zone->lock); |
1097 | isolated_pageblocks = has_isolate_pageblock(zone); | 1097 | isolated_pageblocks = has_isolate_pageblock(zone); |
1098 | nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); | ||
1099 | if (nr_scanned) | ||
1100 | __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); | ||
1101 | 1098 | ||
1102 | while (count) { | 1099 | while (count) { |
1103 | struct page *page; | 1100 | struct page *page; |
@@ -1142,7 +1139,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
1142 | trace_mm_page_pcpu_drain(page, 0, mt); | 1139 | trace_mm_page_pcpu_drain(page, 0, mt); |
1143 | } while (--count && --batch_free && !list_empty(list)); | 1140 | } while (--count && --batch_free && !list_empty(list)); |
1144 | } | 1141 | } |
1145 | spin_unlock_irqrestore(&zone->lock, flags); | 1142 | spin_unlock(&zone->lock); |
1146 | } | 1143 | } |
1147 | 1144 | ||
1148 | static void free_one_page(struct zone *zone, | 1145 | static void free_one_page(struct zone *zone, |
@@ -1150,19 +1147,13 @@ static void free_one_page(struct zone *zone, | |||
1150 | unsigned int order, | 1147 | unsigned int order, |
1151 | int migratetype) | 1148 | int migratetype) |
1152 | { | 1149 | { |
1153 | unsigned long nr_scanned, flags; | 1150 | spin_lock(&zone->lock); |
1154 | spin_lock_irqsave(&zone->lock, flags); | ||
1155 | __count_vm_events(PGFREE, 1 << order); | ||
1156 | nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); | ||
1157 | if (nr_scanned) | ||
1158 | __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); | ||
1159 | |||
1160 | if (unlikely(has_isolate_pageblock(zone) || | 1151 | if (unlikely(has_isolate_pageblock(zone) || |
1161 | is_migrate_isolate(migratetype))) { | 1152 | is_migrate_isolate(migratetype))) { |
1162 | migratetype = get_pfnblock_migratetype(page, pfn); | 1153 | migratetype = get_pfnblock_migratetype(page, pfn); |
1163 | } | 1154 | } |
1164 | __free_one_page(page, pfn, zone, order, migratetype); | 1155 | __free_one_page(page, pfn, zone, order, migratetype); |
1165 | spin_unlock_irqrestore(&zone->lock, flags); | 1156 | spin_unlock(&zone->lock); |
1166 | } | 1157 | } |
1167 | 1158 | ||
1168 | static void __meminit __init_single_page(struct page *page, unsigned long pfn, | 1159 | static void __meminit __init_single_page(struct page *page, unsigned long pfn, |
@@ -1240,6 +1231,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) | |||
1240 | 1231 | ||
1241 | static void __free_pages_ok(struct page *page, unsigned int order) | 1232 | static void __free_pages_ok(struct page *page, unsigned int order) |
1242 | { | 1233 | { |
1234 | unsigned long flags; | ||
1243 | int migratetype; | 1235 | int migratetype; |
1244 | unsigned long pfn = page_to_pfn(page); | 1236 | unsigned long pfn = page_to_pfn(page); |
1245 | 1237 | ||
@@ -1247,7 +1239,10 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
1247 | return; | 1239 | return; |
1248 | 1240 | ||
1249 | migratetype = get_pfnblock_migratetype(page, pfn); | 1241 | migratetype = get_pfnblock_migratetype(page, pfn); |
1242 | local_irq_save(flags); | ||
1243 | __count_vm_events(PGFREE, 1 << order); | ||
1250 | free_one_page(page_zone(page), page, pfn, order, migratetype); | 1244 | free_one_page(page_zone(page), page, pfn, order, migratetype); |
1245 | local_irq_restore(flags); | ||
1251 | } | 1246 | } |
1252 | 1247 | ||
1253 | static void __init __free_pages_boot_core(struct page *page, unsigned int order) | 1248 | static void __init __free_pages_boot_core(struct page *page, unsigned int order) |
@@ -1695,10 +1690,10 @@ static inline int check_new_page(struct page *page) | |||
1695 | return 1; | 1690 | return 1; |
1696 | } | 1691 | } |
1697 | 1692 | ||
1698 | static inline bool free_pages_prezeroed(bool poisoned) | 1693 | static inline bool free_pages_prezeroed(void) |
1699 | { | 1694 | { |
1700 | return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && | 1695 | return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && |
1701 | page_poisoning_enabled() && poisoned; | 1696 | page_poisoning_enabled(); |
1702 | } | 1697 | } |
1703 | 1698 | ||
1704 | #ifdef CONFIG_DEBUG_VM | 1699 | #ifdef CONFIG_DEBUG_VM |
@@ -1752,17 +1747,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags | |||
1752 | unsigned int alloc_flags) | 1747 | unsigned int alloc_flags) |
1753 | { | 1748 | { |
1754 | int i; | 1749 | int i; |
1755 | bool poisoned = true; | ||
1756 | |||
1757 | for (i = 0; i < (1 << order); i++) { | ||
1758 | struct page *p = page + i; | ||
1759 | if (poisoned) | ||
1760 | poisoned &= page_is_poisoned(p); | ||
1761 | } | ||
1762 | 1750 | ||
1763 | post_alloc_hook(page, order, gfp_flags); | 1751 | post_alloc_hook(page, order, gfp_flags); |
1764 | 1752 | ||
1765 | if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) | 1753 | if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) |
1766 | for (i = 0; i < (1 << order); i++) | 1754 | for (i = 0; i < (1 << order); i++) |
1767 | clear_highpage(page + i); | 1755 | clear_highpage(page + i); |
1768 | 1756 | ||
@@ -1844,9 +1832,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone, | |||
1844 | * Note that start_page and end_pages are not aligned on a pageblock | 1832 | * Note that start_page and end_pages are not aligned on a pageblock |
1845 | * boundary. If alignment is required, use move_freepages_block() | 1833 | * boundary. If alignment is required, use move_freepages_block() |
1846 | */ | 1834 | */ |
1847 | int move_freepages(struct zone *zone, | 1835 | static int move_freepages(struct zone *zone, |
1848 | struct page *start_page, struct page *end_page, | 1836 | struct page *start_page, struct page *end_page, |
1849 | int migratetype) | 1837 | int migratetype, int *num_movable) |
1850 | { | 1838 | { |
1851 | struct page *page; | 1839 | struct page *page; |
1852 | unsigned int order; | 1840 | unsigned int order; |
@@ -1863,6 +1851,9 @@ int move_freepages(struct zone *zone, | |||
1863 | VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); | 1851 | VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); |
1864 | #endif | 1852 | #endif |
1865 | 1853 | ||
1854 | if (num_movable) | ||
1855 | *num_movable = 0; | ||
1856 | |||
1866 | for (page = start_page; page <= end_page;) { | 1857 | for (page = start_page; page <= end_page;) { |
1867 | if (!pfn_valid_within(page_to_pfn(page))) { | 1858 | if (!pfn_valid_within(page_to_pfn(page))) { |
1868 | page++; | 1859 | page++; |
@@ -1873,6 +1864,15 @@ int move_freepages(struct zone *zone, | |||
1873 | VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); | 1864 | VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); |
1874 | 1865 | ||
1875 | if (!PageBuddy(page)) { | 1866 | if (!PageBuddy(page)) { |
1867 | /* | ||
1868 | * We assume that pages that could be isolated for | ||
1869 | * migration are movable. But we don't actually try | ||
1870 | * isolating, as that would be expensive. | ||
1871 | */ | ||
1872 | if (num_movable && | ||
1873 | (PageLRU(page) || __PageMovable(page))) | ||
1874 | (*num_movable)++; | ||
1875 | |||
1876 | page++; | 1876 | page++; |
1877 | continue; | 1877 | continue; |
1878 | } | 1878 | } |
@@ -1888,7 +1888,7 @@ int move_freepages(struct zone *zone, | |||
1888 | } | 1888 | } |
1889 | 1889 | ||
1890 | int move_freepages_block(struct zone *zone, struct page *page, | 1890 | int move_freepages_block(struct zone *zone, struct page *page, |
1891 | int migratetype) | 1891 | int migratetype, int *num_movable) |
1892 | { | 1892 | { |
1893 | unsigned long start_pfn, end_pfn; | 1893 | unsigned long start_pfn, end_pfn; |
1894 | struct page *start_page, *end_page; | 1894 | struct page *start_page, *end_page; |
@@ -1905,7 +1905,8 @@ int move_freepages_block(struct zone *zone, struct page *page, | |||
1905 | if (!zone_spans_pfn(zone, end_pfn)) | 1905 | if (!zone_spans_pfn(zone, end_pfn)) |
1906 | return 0; | 1906 | return 0; |
1907 | 1907 | ||
1908 | return move_freepages(zone, start_page, end_page, migratetype); | 1908 | return move_freepages(zone, start_page, end_page, migratetype, |
1909 | num_movable); | ||
1909 | } | 1910 | } |
1910 | 1911 | ||
1911 | static void change_pageblock_range(struct page *pageblock_page, | 1912 | static void change_pageblock_range(struct page *pageblock_page, |
@@ -1955,28 +1956,79 @@ static bool can_steal_fallback(unsigned int order, int start_mt) | |||
1955 | /* | 1956 | /* |
1956 | * This function implements actual steal behaviour. If order is large enough, | 1957 | * This function implements actual steal behaviour. If order is large enough, |
1957 | * we can steal whole pageblock. If not, we first move freepages in this | 1958 | * we can steal whole pageblock. If not, we first move freepages in this |
1958 | * pageblock and check whether half of pages are moved or not. If half of | 1959 | * pageblock to our migratetype and determine how many already-allocated pages |
1959 | * pages are moved, we can change migratetype of pageblock and permanently | 1960 | * are there in the pageblock with a compatible migratetype. If at least half |
1960 | * use it's pages as requested migratetype in the future. | 1961 | * of pages are free or compatible, we can change migratetype of the pageblock |
1962 | * itself, so pages freed in the future will be put on the correct free list. | ||
1961 | */ | 1963 | */ |
1962 | static void steal_suitable_fallback(struct zone *zone, struct page *page, | 1964 | static void steal_suitable_fallback(struct zone *zone, struct page *page, |
1963 | int start_type) | 1965 | int start_type, bool whole_block) |
1964 | { | 1966 | { |
1965 | unsigned int current_order = page_order(page); | 1967 | unsigned int current_order = page_order(page); |
1966 | int pages; | 1968 | struct free_area *area; |
1969 | int free_pages, movable_pages, alike_pages; | ||
1970 | int old_block_type; | ||
1971 | |||
1972 | old_block_type = get_pageblock_migratetype(page); | ||
1973 | |||
1974 | /* | ||
1975 | * This can happen due to races and we want to prevent broken | ||
1976 | * highatomic accounting. | ||
1977 | */ | ||
1978 | if (is_migrate_highatomic(old_block_type)) | ||
1979 | goto single_page; | ||
1967 | 1980 | ||
1968 | /* Take ownership for orders >= pageblock_order */ | 1981 | /* Take ownership for orders >= pageblock_order */ |
1969 | if (current_order >= pageblock_order) { | 1982 | if (current_order >= pageblock_order) { |
1970 | change_pageblock_range(page, current_order, start_type); | 1983 | change_pageblock_range(page, current_order, start_type); |
1971 | return; | 1984 | goto single_page; |
1985 | } | ||
1986 | |||
1987 | /* We are not allowed to try stealing from the whole block */ | ||
1988 | if (!whole_block) | ||
1989 | goto single_page; | ||
1990 | |||
1991 | free_pages = move_freepages_block(zone, page, start_type, | ||
1992 | &movable_pages); | ||
1993 | /* | ||
1994 | * Determine how many pages are compatible with our allocation. | ||
1995 | * For movable allocation, it's the number of movable pages which | ||
1996 | * we just obtained. For other types it's a bit more tricky. | ||
1997 | */ | ||
1998 | if (start_type == MIGRATE_MOVABLE) { | ||
1999 | alike_pages = movable_pages; | ||
2000 | } else { | ||
2001 | /* | ||
2002 | * If we are falling back a RECLAIMABLE or UNMOVABLE allocation | ||
2003 | * to MOVABLE pageblock, consider all non-movable pages as | ||
2004 | * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or | ||
2005 | * vice versa, be conservative since we can't distinguish the | ||
2006 | * exact migratetype of non-movable pages. | ||
2007 | */ | ||
2008 | if (old_block_type == MIGRATE_MOVABLE) | ||
2009 | alike_pages = pageblock_nr_pages | ||
2010 | - (free_pages + movable_pages); | ||
2011 | else | ||
2012 | alike_pages = 0; | ||
1972 | } | 2013 | } |
1973 | 2014 | ||
1974 | pages = move_freepages_block(zone, page, start_type); | 2015 | /* moving whole block can fail due to zone boundary conditions */ |
2016 | if (!free_pages) | ||
2017 | goto single_page; | ||
1975 | 2018 | ||
1976 | /* Claim the whole block if over half of it is free */ | 2019 | /* |
1977 | if (pages >= (1 << (pageblock_order-1)) || | 2020 | * If a sufficient number of pages in the block are either free or of |
2021 | * comparable migratability as our allocation, claim the whole block. | ||
2022 | */ | ||
2023 | if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || | ||
1978 | page_group_by_mobility_disabled) | 2024 | page_group_by_mobility_disabled) |
1979 | set_pageblock_migratetype(page, start_type); | 2025 | set_pageblock_migratetype(page, start_type); |
2026 | |||
2027 | return; | ||
2028 | |||
2029 | single_page: | ||
2030 | area = &zone->free_area[current_order]; | ||
2031 | list_move(&page->lru, &area->free_list[start_type]); | ||
1980 | } | 2032 | } |
1981 | 2033 | ||
1982 | /* | 2034 | /* |
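(Editor's note on the steal_suitable_fallback() rewrite above: the whole pageblock is now claimed only when the free pages just moved plus the already-allocated pages of a compatible migratetype cover at least half the block. A self-contained sketch of just that decision; FAKE_PAGEBLOCK_NR_PAGES, enum mt and claim_whole_block() are stand-ins, and the highatomic and page_group_by_mobility_disabled cases are deliberately left out:)

    /* Decision helper mirroring the "claim whole pageblock?" heuristic. */
    #include <stdbool.h>
    #include <stdio.h>

    #define FAKE_PAGEBLOCK_NR_PAGES 512     /* stand-in for pageblock_nr_pages */

    enum mt { MT_UNMOVABLE, MT_MOVABLE, MT_RECLAIMABLE };

    static bool claim_whole_block(enum mt start_type, enum mt old_block_type,
                                  int free_pages, int movable_pages)
    {
            int alike_pages;

            if (start_type == MT_MOVABLE) {
                    /* For movable allocations, movable neighbours are "alike". */
                    alike_pages = movable_pages;
            } else if (old_block_type == MT_MOVABLE) {
                    /*
                     * Falling back from a movable block: everything that is
                     * neither free nor movable is treated as compatible.
                     */
                    alike_pages = FAKE_PAGEBLOCK_NR_PAGES
                                  - (free_pages + movable_pages);
            } else {
                    /* Can't tell unmovable from reclaimable; be conservative. */
                    alike_pages = 0;
            }

            if (!free_pages)
                    return false;   /* moving the block failed entirely */

            return free_pages + alike_pages >= FAKE_PAGEBLOCK_NR_PAGES / 2;
    }

    int main(void)
    {
            /* unmovable alloc stealing from a movable block: claims it */
            printf("%d\n", claim_whole_block(MT_UNMOVABLE, MT_MOVABLE, 100, 60));
            /* movable alloc with too few free+movable pages: does not */
            printf("%d\n", claim_whole_block(MT_MOVABLE, MT_UNMOVABLE, 100, 60));
            return 0;
    }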
@@ -2042,11 +2094,11 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, | |||
2042 | 2094 | ||
2043 | /* Yoink! */ | 2095 | /* Yoink! */ |
2044 | mt = get_pageblock_migratetype(page); | 2096 | mt = get_pageblock_migratetype(page); |
2045 | if (mt != MIGRATE_HIGHATOMIC && | 2097 | if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) |
2046 | !is_migrate_isolate(mt) && !is_migrate_cma(mt)) { | 2098 | && !is_migrate_cma(mt)) { |
2047 | zone->nr_reserved_highatomic += pageblock_nr_pages; | 2099 | zone->nr_reserved_highatomic += pageblock_nr_pages; |
2048 | set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); | 2100 | set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); |
2049 | move_freepages_block(zone, page, MIGRATE_HIGHATOMIC); | 2101 | move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); |
2050 | } | 2102 | } |
2051 | 2103 | ||
2052 | out_unlock: | 2104 | out_unlock: |
@@ -2100,8 +2152,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, | |||
2100 | * from highatomic to ac->migratetype. So we should | 2152 | * from highatomic to ac->migratetype. So we should |
2101 | * adjust the count once. | 2153 | * adjust the count once. |
2102 | */ | 2154 | */ |
2103 | if (get_pageblock_migratetype(page) == | 2155 | if (is_migrate_highatomic_page(page)) { |
2104 | MIGRATE_HIGHATOMIC) { | ||
2105 | /* | 2156 | /* |
2106 | * It should never happen but changes to | 2157 | * It should never happen but changes to |
2107 | * locking could inadvertently allow a per-cpu | 2158 | * locking could inadvertently allow a per-cpu |
@@ -2124,7 +2175,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, | |||
2124 | * may increase. | 2175 | * may increase. |
2125 | */ | 2176 | */ |
2126 | set_pageblock_migratetype(page, ac->migratetype); | 2177 | set_pageblock_migratetype(page, ac->migratetype); |
2127 | ret = move_freepages_block(zone, page, ac->migratetype); | 2178 | ret = move_freepages_block(zone, page, ac->migratetype, |
2179 | NULL); | ||
2128 | if (ret) { | 2180 | if (ret) { |
2129 | spin_unlock_irqrestore(&zone->lock, flags); | 2181 | spin_unlock_irqrestore(&zone->lock, flags); |
2130 | return ret; | 2182 | return ret; |
@@ -2136,8 +2188,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, | |||
2136 | return false; | 2188 | return false; |
2137 | } | 2189 | } |
2138 | 2190 | ||
2139 | /* Remove an element from the buddy allocator from the fallback list */ | 2191 | /* |
2140 | static inline struct page * | 2192 | * Try finding a free buddy page on the fallback list and put it on the free |
2193 | * list of requested migratetype, possibly along with other pages from the same | ||
2194 | * block, depending on fragmentation avoidance heuristics. Returns true if | ||
2195 | * fallback was found so that __rmqueue_smallest() can grab it. | ||
2196 | */ | ||
2197 | static inline bool | ||
2141 | __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | 2198 | __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) |
2142 | { | 2199 | { |
2143 | struct free_area *area; | 2200 | struct free_area *area; |
@@ -2158,33 +2215,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
2158 | 2215 | ||
2159 | page = list_first_entry(&area->free_list[fallback_mt], | 2216 | page = list_first_entry(&area->free_list[fallback_mt], |
2160 | struct page, lru); | 2217 | struct page, lru); |
2161 | if (can_steal && | ||
2162 | get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) | ||
2163 | steal_suitable_fallback(zone, page, start_migratetype); | ||
2164 | |||
2165 | /* Remove the page from the freelists */ | ||
2166 | area->nr_free--; | ||
2167 | list_del(&page->lru); | ||
2168 | rmv_page_order(page); | ||
2169 | 2218 | ||
2170 | expand(zone, page, order, current_order, area, | 2219 | steal_suitable_fallback(zone, page, start_migratetype, |
2171 | start_migratetype); | 2220 | can_steal); |
2172 | /* | ||
2173 | * The pcppage_migratetype may differ from pageblock's | ||
2174 | * migratetype depending on the decisions in | ||
2175 | * find_suitable_fallback(). This is OK as long as it does not | ||
2176 | * differ for MIGRATE_CMA pageblocks. Those can be used as | ||
2177 | * fallback only via special __rmqueue_cma_fallback() function | ||
2178 | */ | ||
2179 | set_pcppage_migratetype(page, start_migratetype); | ||
2180 | 2221 | ||
2181 | trace_mm_page_alloc_extfrag(page, order, current_order, | 2222 | trace_mm_page_alloc_extfrag(page, order, current_order, |
2182 | start_migratetype, fallback_mt); | 2223 | start_migratetype, fallback_mt); |
2183 | 2224 | ||
2184 | return page; | 2225 | return true; |
2185 | } | 2226 | } |
2186 | 2227 | ||
2187 | return NULL; | 2228 | return false; |
2188 | } | 2229 | } |
2189 | 2230 | ||
2190 | /* | 2231 | /* |
@@ -2196,13 +2237,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, | |||
2196 | { | 2237 | { |
2197 | struct page *page; | 2238 | struct page *page; |
2198 | 2239 | ||
2240 | retry: | ||
2199 | page = __rmqueue_smallest(zone, order, migratetype); | 2241 | page = __rmqueue_smallest(zone, order, migratetype); |
2200 | if (unlikely(!page)) { | 2242 | if (unlikely(!page)) { |
2201 | if (migratetype == MIGRATE_MOVABLE) | 2243 | if (migratetype == MIGRATE_MOVABLE) |
2202 | page = __rmqueue_cma_fallback(zone, order); | 2244 | page = __rmqueue_cma_fallback(zone, order); |
2203 | 2245 | ||
2204 | if (!page) | 2246 | if (!page && __rmqueue_fallback(zone, order, migratetype)) |
2205 | page = __rmqueue_fallback(zone, order, migratetype); | 2247 | goto retry; |
2206 | } | 2248 | } |
2207 | 2249 | ||
2208 | trace_mm_page_alloc_zone_locked(page, order, migratetype); | 2250 | trace_mm_page_alloc_zone_locked(page, order, migratetype); |
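(Editor's note on the __rmqueue() hunk above: __rmqueue_fallback() no longer returns a page; it only moves pages onto the requested free list and reports success, after which __rmqueue() retries __rmqueue_smallest(). A simplified sketch of that control flow; rmqueue(), rmqueue_smallest() and rmqueue_fallback() here are stubs, not the kernel functions:)

    /* Sketch of the "fallback refills the list, then retry" control flow. */
    #include <stdbool.h>
    #include <stdio.h>

    static int wanted_list;         /* pages sitting on the requested free list */

    static int rmqueue_smallest(void)
    {
            if (!wanted_list)
                    return -1;      /* nothing of our migratetype available */
            wanted_list--;
            return 42;              /* pretend this is a page */
    }

    static bool rmqueue_fallback(void)
    {
            /* Steal from another list: just refill our own list here. */
            wanted_list += 8;
            return true;
    }

    static int rmqueue(void)
    {
            int page;

    retry:
            page = rmqueue_smallest();
            if (page < 0 && rmqueue_fallback())
                    goto retry;
            return page;
    }

    int main(void)
    {
            printf("page = %d, left on list = %d\n", rmqueue(), wanted_list);
            return 0;
    }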
@@ -2219,9 +2261,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
2219 | int migratetype, bool cold) | 2261 | int migratetype, bool cold) |
2220 | { | 2262 | { |
2221 | int i, alloced = 0; | 2263 | int i, alloced = 0; |
2222 | unsigned long flags; | ||
2223 | 2264 | ||
2224 | spin_lock_irqsave(&zone->lock, flags); | 2265 | spin_lock(&zone->lock); |
2225 | for (i = 0; i < count; ++i) { | 2266 | for (i = 0; i < count; ++i) { |
2226 | struct page *page = __rmqueue(zone, order, migratetype); | 2267 | struct page *page = __rmqueue(zone, order, migratetype); |
2227 | if (unlikely(page == NULL)) | 2268 | if (unlikely(page == NULL)) |
@@ -2257,7 +2298,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
2257 | * pages added to the pcp list. | 2298 | * pages added to the pcp list. |
2258 | */ | 2299 | */ |
2259 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | 2300 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); |
2260 | spin_unlock_irqrestore(&zone->lock, flags); | 2301 | spin_unlock(&zone->lock); |
2261 | return alloced; | 2302 | return alloced; |
2262 | } | 2303 | } |
2263 | 2304 | ||
@@ -2485,25 +2526,22 @@ void free_hot_cold_page(struct page *page, bool cold) | |||
2485 | { | 2526 | { |
2486 | struct zone *zone = page_zone(page); | 2527 | struct zone *zone = page_zone(page); |
2487 | struct per_cpu_pages *pcp; | 2528 | struct per_cpu_pages *pcp; |
2529 | unsigned long flags; | ||
2488 | unsigned long pfn = page_to_pfn(page); | 2530 | unsigned long pfn = page_to_pfn(page); |
2489 | int migratetype; | 2531 | int migratetype; |
2490 | 2532 | ||
2491 | if (in_interrupt()) { | ||
2492 | __free_pages_ok(page, 0); | ||
2493 | return; | ||
2494 | } | ||
2495 | |||
2496 | if (!free_pcp_prepare(page)) | 2533 | if (!free_pcp_prepare(page)) |
2497 | return; | 2534 | return; |
2498 | 2535 | ||
2499 | migratetype = get_pfnblock_migratetype(page, pfn); | 2536 | migratetype = get_pfnblock_migratetype(page, pfn); |
2500 | set_pcppage_migratetype(page, migratetype); | 2537 | set_pcppage_migratetype(page, migratetype); |
2501 | preempt_disable(); | 2538 | local_irq_save(flags); |
2539 | __count_vm_event(PGFREE); | ||
2502 | 2540 | ||
2503 | /* | 2541 | /* |
2504 | * We only track unmovable, reclaimable and movable on pcp lists. | 2542 | * We only track unmovable, reclaimable and movable on pcp lists. |
2505 | * Free ISOLATE pages back to the allocator because they are being | 2543 | * Free ISOLATE pages back to the allocator because they are being |
2506 | * offlined but treat RESERVE as movable pages so we can get those | 2544 | * offlined but treat HIGHATOMIC as movable pages so we can get those |
2507 | * areas back if necessary. Otherwise, we may have to free | 2545 | * areas back if necessary. Otherwise, we may have to free |
2508 | * excessively into the page allocator | 2546 | * excessively into the page allocator |
2509 | */ | 2547 | */ |
@@ -2515,7 +2553,6 @@ void free_hot_cold_page(struct page *page, bool cold) | |||
2515 | migratetype = MIGRATE_MOVABLE; | 2553 | migratetype = MIGRATE_MOVABLE; |
2516 | } | 2554 | } |
2517 | 2555 | ||
2518 | __count_vm_event(PGFREE); | ||
2519 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 2556 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
2520 | if (!cold) | 2557 | if (!cold) |
2521 | list_add(&page->lru, &pcp->lists[migratetype]); | 2558 | list_add(&page->lru, &pcp->lists[migratetype]); |
@@ -2529,7 +2566,7 @@ void free_hot_cold_page(struct page *page, bool cold) | |||
2529 | } | 2566 | } |
2530 | 2567 | ||
2531 | out: | 2568 | out: |
2532 | preempt_enable(); | 2569 | local_irq_restore(flags); |
2533 | } | 2570 | } |
2534 | 2571 | ||
2535 | /* | 2572 | /* |
@@ -2614,7 +2651,7 @@ int __isolate_free_page(struct page *page, unsigned int order) | |||
2614 | for (; page < endpage; page += pageblock_nr_pages) { | 2651 | for (; page < endpage; page += pageblock_nr_pages) { |
2615 | int mt = get_pageblock_migratetype(page); | 2652 | int mt = get_pageblock_migratetype(page); |
2616 | if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) | 2653 | if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) |
2617 | && mt != MIGRATE_HIGHATOMIC) | 2654 | && !is_migrate_highatomic(mt)) |
2618 | set_pageblock_migratetype(page, | 2655 | set_pageblock_migratetype(page, |
2619 | MIGRATE_MOVABLE); | 2656 | MIGRATE_MOVABLE); |
2620 | } | 2657 | } |
@@ -2654,8 +2691,6 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, | |||
2654 | { | 2691 | { |
2655 | struct page *page; | 2692 | struct page *page; |
2656 | 2693 | ||
2657 | VM_BUG_ON(in_interrupt()); | ||
2658 | |||
2659 | do { | 2694 | do { |
2660 | if (list_empty(list)) { | 2695 | if (list_empty(list)) { |
2661 | pcp->count += rmqueue_bulk(zone, 0, | 2696 | pcp->count += rmqueue_bulk(zone, 0, |
@@ -2686,8 +2721,9 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, | |||
2686 | struct list_head *list; | 2721 | struct list_head *list; |
2687 | bool cold = ((gfp_flags & __GFP_COLD) != 0); | 2722 | bool cold = ((gfp_flags & __GFP_COLD) != 0); |
2688 | struct page *page; | 2723 | struct page *page; |
2724 | unsigned long flags; | ||
2689 | 2725 | ||
2690 | preempt_disable(); | 2726 | local_irq_save(flags); |
2691 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 2727 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
2692 | list = &pcp->lists[migratetype]; | 2728 | list = &pcp->lists[migratetype]; |
2693 | page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); | 2729 | page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); |
@@ -2695,7 +2731,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, | |||
2695 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); | 2731 | __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); |
2696 | zone_statistics(preferred_zone, zone); | 2732 | zone_statistics(preferred_zone, zone); |
2697 | } | 2733 | } |
2698 | preempt_enable(); | 2734 | local_irq_restore(flags); |
2699 | return page; | 2735 | return page; |
2700 | } | 2736 | } |
2701 | 2737 | ||
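The hunks above move the per-CPU pagelist fast paths from preempt_disable()/preempt_enable() to local_irq_save()/local_irq_restore() and drop the in_interrupt() guards, so the pcp lists may also be touched from interrupt context. A minimal kernel-style sketch of the resulting locking rule; the demo_* names are mine, the counter stands in for the real pcp list work, and this is not buildable outside a kernel tree:

#include <linux/percpu.h>
#include <linux/irqflags.h>

static DEFINE_PER_CPU(unsigned long, demo_pcp_events);

/*
 * Per-CPU state that can also be touched from interrupt context cannot be
 * protected by disabling preemption alone: an IRQ arriving mid-update would
 * observe (or corrupt) a half-updated list. Masking interrupts for the short
 * critical section restores the invariant.
 */
static void demo_touch_pcp(void)
{
	unsigned long flags;

	local_irq_save(flags);			/* was: preempt_disable() */
	__this_cpu_inc(demo_pcp_events);	/* stands in for the pcp list update */
	local_irq_restore(flags);		/* was: preempt_enable() */
}

The cost is a slightly heavier fast path; the benefit is that callers no longer need the in_interrupt() special-casing removed above.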
@@ -2711,7 +2747,7 @@ struct page *rmqueue(struct zone *preferred_zone, | |||
2711 | unsigned long flags; | 2747 | unsigned long flags; |
2712 | struct page *page; | 2748 | struct page *page; |
2713 | 2749 | ||
2714 | if (likely(order == 0) && !in_interrupt()) { | 2750 | if (likely(order == 0)) { |
2715 | page = rmqueue_pcplist(preferred_zone, zone, order, | 2751 | page = rmqueue_pcplist(preferred_zone, zone, order, |
2716 | gfp_flags, migratetype); | 2752 | gfp_flags, migratetype); |
2717 | goto out; | 2753 | goto out; |
@@ -3113,8 +3149,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) | |||
3113 | static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, | 3149 | static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, |
3114 | DEFAULT_RATELIMIT_BURST); | 3150 | DEFAULT_RATELIMIT_BURST); |
3115 | 3151 | ||
3116 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || | 3152 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) |
3117 | debug_guardpage_minorder() > 0) | ||
3118 | return; | 3153 | return; |
3119 | 3154 | ||
3120 | pr_warn("%s: ", current->comm); | 3155 | pr_warn("%s: ", current->comm); |
@@ -3248,14 +3283,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
3248 | enum compact_priority prio, enum compact_result *compact_result) | 3283 | enum compact_priority prio, enum compact_result *compact_result) |
3249 | { | 3284 | { |
3250 | struct page *page; | 3285 | struct page *page; |
3286 | unsigned int noreclaim_flag; | ||
3251 | 3287 | ||
3252 | if (!order) | 3288 | if (!order) |
3253 | return NULL; | 3289 | return NULL; |
3254 | 3290 | ||
3255 | current->flags |= PF_MEMALLOC; | 3291 | noreclaim_flag = memalloc_noreclaim_save(); |
3256 | *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, | 3292 | *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, |
3257 | prio); | 3293 | prio); |
3258 | current->flags &= ~PF_MEMALLOC; | 3294 | memalloc_noreclaim_restore(noreclaim_flag); |
3259 | 3295 | ||
3260 | if (*compact_result <= COMPACT_INACTIVE) | 3296 | if (*compact_result <= COMPACT_INACTIVE) |
3261 | return NULL; | 3297 | return NULL; |
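The open-coded PF_MEMALLOC handling replaced above has a subtle problem: "current->flags &= ~PF_MEMALLOC" clears the flag even when the caller already had it set, whereas the save/restore helpers put back whatever was there. A hedged kernel-style sketch of how a caller uses them; run_noreclaim() is a made-up helper, not part of the patch:

#include <linux/sched/mm.h>

/*
 * Run fn(arg) with reclaim recursion suppressed. Saving and restoring the
 * old flag value (instead of clearing PF_MEMALLOC unconditionally) is what
 * makes nested use safe.
 */
static int run_noreclaim(int (*fn)(void *), void *arg)
{
	unsigned int noreclaim_flag;
	int ret;

	noreclaim_flag = memalloc_noreclaim_save();
	ret = fn(arg);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}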
@@ -3402,12 +3438,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, | |||
3402 | { | 3438 | { |
3403 | struct reclaim_state reclaim_state; | 3439 | struct reclaim_state reclaim_state; |
3404 | int progress; | 3440 | int progress; |
3441 | unsigned int noreclaim_flag; | ||
3405 | 3442 | ||
3406 | cond_resched(); | 3443 | cond_resched(); |
3407 | 3444 | ||
3408 | /* We now go into synchronous reclaim */ | 3445 | /* We now go into synchronous reclaim */ |
3409 | cpuset_memory_pressure_bump(); | 3446 | cpuset_memory_pressure_bump(); |
3410 | current->flags |= PF_MEMALLOC; | 3447 | noreclaim_flag = memalloc_noreclaim_save(); |
3411 | lockdep_set_current_reclaim_state(gfp_mask); | 3448 | lockdep_set_current_reclaim_state(gfp_mask); |
3412 | reclaim_state.reclaimed_slab = 0; | 3449 | reclaim_state.reclaimed_slab = 0; |
3413 | current->reclaim_state = &reclaim_state; | 3450 | current->reclaim_state = &reclaim_state; |
@@ -3417,7 +3454,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, | |||
3417 | 3454 | ||
3418 | current->reclaim_state = NULL; | 3455 | current->reclaim_state = NULL; |
3419 | lockdep_clear_current_reclaim_state(); | 3456 | lockdep_clear_current_reclaim_state(); |
3420 | current->flags &= ~PF_MEMALLOC; | 3457 | memalloc_noreclaim_restore(noreclaim_flag); |
3421 | 3458 | ||
3422 | cond_resched(); | 3459 | cond_resched(); |
3423 | 3460 | ||
@@ -3525,19 +3562,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | |||
3525 | } | 3562 | } |
3526 | 3563 | ||
3527 | /* | 3564 | /* |
3528 | * Maximum number of reclaim retries without any progress before OOM killer | ||
3529 | * is consider as the only way to move forward. | ||
3530 | */ | ||
3531 | #define MAX_RECLAIM_RETRIES 16 | ||
3532 | |||
3533 | /* | ||
3534 | * Checks whether it makes sense to retry the reclaim to make a forward progress | 3565 | * Checks whether it makes sense to retry the reclaim to make a forward progress |
3535 | * for the given allocation request. | 3566 | * for the given allocation request. |
3536 | * The reclaim feedback represented by did_some_progress (any progress during | 3567 | * |
3537 | * the last reclaim round) and no_progress_loops (number of reclaim rounds without | 3568 | * We give up when we either have tried MAX_RECLAIM_RETRIES in a row |
3538 | * any progress in a row) is considered as well as the reclaimable pages on the | 3569 | * without success, or when we couldn't even meet the watermark if we |
3539 | * applicable zone list (with a backoff mechanism which is a function of | 3570 | * reclaimed all remaining pages on the LRU lists. |
3540 | * no_progress_loops). | ||
3541 | * | 3571 | * |
3542 | * Returns true if a retry is viable or false to enter the oom path. | 3572 | * Returns true if a retry is viable or false to enter the oom path. |
3543 | */ | 3573 | */ |
@@ -3582,13 +3612,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, | |||
3582 | bool wmark; | 3612 | bool wmark; |
3583 | 3613 | ||
3584 | available = reclaimable = zone_reclaimable_pages(zone); | 3614 | available = reclaimable = zone_reclaimable_pages(zone); |
3585 | available -= DIV_ROUND_UP((*no_progress_loops) * available, | ||
3586 | MAX_RECLAIM_RETRIES); | ||
3587 | available += zone_page_state_snapshot(zone, NR_FREE_PAGES); | 3615 | available += zone_page_state_snapshot(zone, NR_FREE_PAGES); |
3588 | 3616 | ||
3589 | /* | 3617 | /* |
3590 | * Would the allocation succeed if we reclaimed the whole | 3618 | * Would the allocation succeed if we reclaimed all |
3591 | * available? | 3619 | * reclaimable pages? |
3592 | */ | 3620 | */ |
3593 | wmark = __zone_watermark_ok(zone, order, min_wmark, | 3621 | wmark = __zone_watermark_ok(zone, order, min_wmark, |
3594 | ac_classzone_idx(ac), alloc_flags, available); | 3622 | ac_classzone_idx(ac), alloc_flags, available); |
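With the backoff term removed, the retry decision boils down to one question: could the request pass the min watermark if every remaining reclaimable page were actually freed? A standalone toy model of that check; all names and numbers are invented, and it ignores order, classzone index and the snapshot details:

#include <stdbool.h>
#include <stdio.h>

/* Retry only if reclaiming everything left could still satisfy the watermark. */
static bool retry_viable(unsigned long free_pages,
			 unsigned long reclaimable_pages,
			 unsigned long min_wmark)
{
	unsigned long available = free_pages + reclaimable_pages;

	return available > min_wmark;
}

int main(void)
{
	/* 2000 free + 3000 reclaimable vs a watermark of 4000: keep retrying. */
	printf("%d\n", retry_viable(2000, 3000, 4000));
	/* 100 free + 200 reclaimable vs 4000: reclaim cannot help, go OOM. */
	printf("%d\n", retry_viable(100, 200, 4000));
	return 0;
}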
@@ -3639,6 +3667,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
3639 | struct alloc_context *ac) | 3667 | struct alloc_context *ac) |
3640 | { | 3668 | { |
3641 | bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; | 3669 | bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; |
3670 | const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; | ||
3642 | struct page *page = NULL; | 3671 | struct page *page = NULL; |
3643 | unsigned int alloc_flags; | 3672 | unsigned int alloc_flags; |
3644 | unsigned long did_some_progress; | 3673 | unsigned long did_some_progress; |
@@ -3706,12 +3735,17 @@ retry_cpuset: | |||
3706 | 3735 | ||
3707 | /* | 3736 | /* |
3708 | * For costly allocations, try direct compaction first, as it's likely | 3737 | * For costly allocations, try direct compaction first, as it's likely |
3709 | * that we have enough base pages and don't need to reclaim. Don't try | 3738 | * that we have enough base pages and don't need to reclaim. For non- |
3710 | * that for allocations that are allowed to ignore watermarks, as the | 3739 | * movable high-order allocations, do that as well, as compaction will |
3711 | * ALLOC_NO_WATERMARKS attempt didn't yet happen. | 3740 | * try to prevent permanent fragmentation by migrating from blocks of the |
3741 | * same migratetype. | ||
3742 | * Don't try this for allocations that are allowed to ignore | ||
3743 | * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. | ||
3712 | */ | 3744 | */ |
3713 | if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && | 3745 | if (can_direct_reclaim && |
3714 | !gfp_pfmemalloc_allowed(gfp_mask)) { | 3746 | (costly_order || |
3747 | (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) | ||
3748 | && !gfp_pfmemalloc_allowed(gfp_mask)) { | ||
3715 | page = __alloc_pages_direct_compact(gfp_mask, order, | 3749 | page = __alloc_pages_direct_compact(gfp_mask, order, |
3716 | alloc_flags, ac, | 3750 | alloc_flags, ac, |
3717 | INIT_COMPACT_PRIORITY, | 3751 | INIT_COMPACT_PRIORITY, |
@@ -3723,7 +3757,7 @@ retry_cpuset: | |||
3723 | * Checks for costly allocations with __GFP_NORETRY, which | 3757 | * Checks for costly allocations with __GFP_NORETRY, which |
3724 | * includes THP page fault allocations | 3758 | * includes THP page fault allocations |
3725 | */ | 3759 | */ |
3726 | if (gfp_mask & __GFP_NORETRY) { | 3760 | if (costly_order && (gfp_mask & __GFP_NORETRY)) { |
3727 | /* | 3761 | /* |
3728 | * If compaction is deferred for high-order allocations, | 3762 | * If compaction is deferred for high-order allocations, |
3729 | * it is because sync compaction recently failed. If | 3763 | * it is because sync compaction recently failed. If |
@@ -3774,7 +3808,7 @@ retry: | |||
3774 | 3808 | ||
3775 | /* Make sure we know about allocations which stall for too long */ | 3809 | /* Make sure we know about allocations which stall for too long */ |
3776 | if (time_after(jiffies, alloc_start + stall_timeout)) { | 3810 | if (time_after(jiffies, alloc_start + stall_timeout)) { |
3777 | warn_alloc(gfp_mask, ac->nodemask, | 3811 | warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, |
3778 | "page allocation stalls for %ums, order:%u", | 3812 | "page allocation stalls for %ums, order:%u", |
3779 | jiffies_to_msecs(jiffies-alloc_start), order); | 3813 | jiffies_to_msecs(jiffies-alloc_start), order); |
3780 | stall_timeout += 10 * HZ; | 3814 | stall_timeout += 10 * HZ; |
@@ -3804,7 +3838,7 @@ retry: | |||
3804 | * Do not retry costly high order allocations unless they are | 3838 | * Do not retry costly high order allocations unless they are |
3805 | * __GFP_REPEAT | 3839 | * __GFP_REPEAT |
3806 | */ | 3840 | */ |
3807 | if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) | 3841 | if (costly_order && !(gfp_mask & __GFP_REPEAT)) |
3808 | goto nopage; | 3842 | goto nopage; |
3809 | 3843 | ||
3810 | if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, | 3844 | if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, |
@@ -3974,10 +4008,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
3974 | goto out; | 4008 | goto out; |
3975 | 4009 | ||
3976 | /* | 4010 | /* |
3977 | * Runtime PM, block IO and its error handling path can deadlock | 4011 | * Apply scoped allocation constraints. This is mainly about GFP_NOFS |
3978 | * because I/O on the device might not complete. | 4012 | * or GFP_NOIO, which have to be inherited for all allocation requests |
4013 | * from a particular context which has been marked by | ||
4014 | * memalloc_no{fs,io}_{save,restore}. | ||
3979 | */ | 4015 | */ |
3980 | alloc_mask = memalloc_noio_flags(gfp_mask); | 4016 | alloc_mask = current_gfp_context(gfp_mask); |
3981 | ac.spread_dirty_pages = false; | 4017 | ac.spread_dirty_pages = false; |
3982 | 4018 | ||
3983 | /* | 4019 | /* |
@@ -4250,7 +4286,8 @@ EXPORT_SYMBOL(free_pages_exact); | |||
4250 | * nr_free_zone_pages() counts the number of pages which are beyond the | 4286 | * nr_free_zone_pages() counts the number of pages which are beyond the |
4251 | * high watermark within all zones at or below a given zone index. For each | 4287 | * high watermark within all zones at or below a given zone index. For each |
4252 | * zone, the number of pages is calculated as: | 4288 | * zone, the number of pages is calculated as: |
4253 | * managed_pages - high_pages | 4289 | * |
4290 | * nr_free_zone_pages = managed_pages - high_pages | ||
4254 | */ | 4291 | */ |
4255 | static unsigned long nr_free_zone_pages(int offset) | 4292 | static unsigned long nr_free_zone_pages(int offset) |
4256 | { | 4293 | { |
@@ -4512,7 +4549,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) | |||
4512 | #endif | 4549 | #endif |
4513 | " writeback_tmp:%lukB" | 4550 | " writeback_tmp:%lukB" |
4514 | " unstable:%lukB" | 4551 | " unstable:%lukB" |
4515 | " pages_scanned:%lu" | ||
4516 | " all_unreclaimable? %s" | 4552 | " all_unreclaimable? %s" |
4517 | "\n", | 4553 | "\n", |
4518 | pgdat->node_id, | 4554 | pgdat->node_id, |
@@ -4535,8 +4571,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) | |||
4535 | #endif | 4571 | #endif |
4536 | K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), | 4572 | K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), |
4537 | K(node_page_state(pgdat, NR_UNSTABLE_NFS)), | 4573 | K(node_page_state(pgdat, NR_UNSTABLE_NFS)), |
4538 | node_page_state(pgdat, NR_PAGES_SCANNED), | 4574 | pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? |
4539 | !pgdat_reclaimable(pgdat) ? "yes" : "no"); | 4575 | "yes" : "no"); |
4540 | } | 4576 | } |
4541 | 4577 | ||
4542 | for_each_populated_zone(zone) { | 4578 | for_each_populated_zone(zone) { |
@@ -7431,7 +7467,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, | |||
7431 | .zone = page_zone(pfn_to_page(start)), | 7467 | .zone = page_zone(pfn_to_page(start)), |
7432 | .mode = MIGRATE_SYNC, | 7468 | .mode = MIGRATE_SYNC, |
7433 | .ignore_skip_hint = true, | 7469 | .ignore_skip_hint = true, |
7434 | .gfp_mask = memalloc_noio_flags(gfp_mask), | 7470 | .gfp_mask = current_gfp_context(gfp_mask), |
7435 | }; | 7471 | }; |
7436 | INIT_LIST_HEAD(&cc.migratepages); | 7472 | INIT_LIST_HEAD(&cc.migratepages); |
7437 | 7473 | ||
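current_gfp_context(), used in both __alloc_pages_nodemask() and alloc_contig_range() above, is what turns the task-wide NOFS/NOIO scopes into a filtered gfp mask. A hedged kernel-style sketch of the caller side; the function name and usage are mine, and allocations inside the scope behave as if GFP_NOFS had been passed:

#include <linux/sched/mm.h>
#include <linux/slab.h>

/*
 * Everything allocated between save and restore is implicitly GFP_NOFS,
 * because the page allocator filters the caller's mask through
 * current_gfp_context() as shown in the hunk above.
 */
static void *demo_alloc_in_fs_context(size_t size)
{
	unsigned int nofs_flag;
	void *p;

	nofs_flag = memalloc_nofs_save();
	p = kmalloc(size, GFP_KERNEL);	/* effectively GFP_KERNEL without __GFP_FS */
	memalloc_nofs_restore(nofs_flag);

	return p;
}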
diff --git a/mm/page_ext.c b/mm/page_ext.c index 121dcffc4ec1..88ccc044b09a 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c | |||
@@ -59,9 +59,6 @@ | |||
59 | 59 | ||
60 | static struct page_ext_operations *page_ext_ops[] = { | 60 | static struct page_ext_operations *page_ext_ops[] = { |
61 | &debug_guardpage_ops, | 61 | &debug_guardpage_ops, |
62 | #ifdef CONFIG_PAGE_POISONING | ||
63 | &page_poisoning_ops, | ||
64 | #endif | ||
65 | #ifdef CONFIG_PAGE_OWNER | 62 | #ifdef CONFIG_PAGE_OWNER |
66 | &page_owner_ops, | 63 | &page_owner_ops, |
67 | #endif | 64 | #endif |
@@ -127,15 +124,12 @@ struct page_ext *lookup_page_ext(struct page *page) | |||
127 | struct page_ext *base; | 124 | struct page_ext *base; |
128 | 125 | ||
129 | base = NODE_DATA(page_to_nid(page))->node_page_ext; | 126 | base = NODE_DATA(page_to_nid(page))->node_page_ext; |
130 | #if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) | 127 | #if defined(CONFIG_DEBUG_VM) |
131 | /* | 128 | /* |
132 | * The sanity checks the page allocator does upon freeing a | 129 | * The sanity checks the page allocator does upon freeing a |
133 | * page can reach here before the page_ext arrays are | 130 | * page can reach here before the page_ext arrays are |
134 | * allocated when feeding a range of pages to the allocator | 131 | * allocated when feeding a range of pages to the allocator |
135 | * for the first time during bootup or memory hotplug. | 132 | * for the first time during bootup or memory hotplug. |
136 | * | ||
137 | * This check is also necessary for ensuring page poisoning | ||
138 | * works as expected when enabled | ||
139 | */ | 133 | */ |
140 | if (unlikely(!base)) | 134 | if (unlikely(!base)) |
141 | return NULL; | 135 | return NULL; |
@@ -204,15 +198,12 @@ struct page_ext *lookup_page_ext(struct page *page) | |||
204 | { | 198 | { |
205 | unsigned long pfn = page_to_pfn(page); | 199 | unsigned long pfn = page_to_pfn(page); |
206 | struct mem_section *section = __pfn_to_section(pfn); | 200 | struct mem_section *section = __pfn_to_section(pfn); |
207 | #if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) | 201 | #if defined(CONFIG_DEBUG_VM) |
208 | /* | 202 | /* |
209 | * The sanity checks the page allocator does upon freeing a | 203 | * The sanity checks the page allocator does upon freeing a |
210 | * page can reach here before the page_ext arrays are | 204 | * page can reach here before the page_ext arrays are |
211 | * allocated when feeding a range of pages to the allocator | 205 | * allocated when feeding a range of pages to the allocator |
212 | * for the first time during bootup or memory hotplug. | 206 | * for the first time during bootup or memory hotplug. |
213 | * | ||
214 | * This check is also necessary for ensuring page poisoning | ||
215 | * works as expected when enabled | ||
216 | */ | 207 | */ |
217 | if (!section->page_ext) | 208 | if (!section->page_ext) |
218 | return NULL; | 209 | return NULL; |
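Even with the poisoning-specific justification deleted, lookup_page_ext() can still legitimately return NULL while the page_ext arrays are being set up during boot or memory hotplug, so every client keeps the check. A hedged sketch of the usual caller pattern; the demo name and flag bit are mine:

#include <linux/mm.h>
#include <linux/page_ext.h>
#include <linux/bitops.h>

/* Typical page_ext client: bail out quietly if the extension array for this
 * page has not been allocated yet. */
static bool demo_page_ext_flag_set(struct page *page, int flag_bit)
{
	struct page_ext *page_ext = lookup_page_ext(page);

	if (unlikely(!page_ext))
		return false;

	return test_bit(flag_bit, &page_ext->flags);
}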
diff --git a/mm/page_idle.c b/mm/page_idle.c index b0ee56c56b58..1b0f48c62316 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c | |||
@@ -50,7 +50,7 @@ static struct page *page_idle_get_page(unsigned long pfn) | |||
50 | return page; | 50 | return page; |
51 | } | 51 | } |
52 | 52 | ||
53 | static int page_idle_clear_pte_refs_one(struct page *page, | 53 | static bool page_idle_clear_pte_refs_one(struct page *page, |
54 | struct vm_area_struct *vma, | 54 | struct vm_area_struct *vma, |
55 | unsigned long addr, void *arg) | 55 | unsigned long addr, void *arg) |
56 | { | 56 | { |
@@ -84,7 +84,7 @@ static int page_idle_clear_pte_refs_one(struct page *page, | |||
84 | */ | 84 | */ |
85 | set_page_young(page); | 85 | set_page_young(page); |
86 | } | 86 | } |
87 | return SWAP_AGAIN; | 87 | return true; |
88 | } | 88 | } |
89 | 89 | ||
90 | static void page_idle_clear_pte_refs(struct page *page) | 90 | static void page_idle_clear_pte_refs(struct page *page) |
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f4e17a57926a..5092e4ef00c8 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -66,7 +66,8 @@ out: | |||
66 | 66 | ||
67 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | 67 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); |
68 | zone->nr_isolate_pageblock++; | 68 | zone->nr_isolate_pageblock++; |
69 | nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); | 69 | nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, |
70 | NULL); | ||
70 | 71 | ||
71 | __mod_zone_freepage_state(zone, -nr_pages, migratetype); | 72 | __mod_zone_freepage_state(zone, -nr_pages, migratetype); |
72 | } | 73 | } |
@@ -88,7 +89,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) | |||
88 | 89 | ||
89 | zone = page_zone(page); | 90 | zone = page_zone(page); |
90 | spin_lock_irqsave(&zone->lock, flags); | 91 | spin_lock_irqsave(&zone->lock, flags); |
91 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 92 | if (!is_migrate_isolate_page(page)) |
92 | goto out; | 93 | goto out; |
93 | 94 | ||
94 | /* | 95 | /* |
@@ -120,7 +121,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) | |||
120 | * pageblock scanning for freepage moving. | 121 | * pageblock scanning for freepage moving. |
121 | */ | 122 | */ |
122 | if (!isolated_page) { | 123 | if (!isolated_page) { |
123 | nr_pages = move_freepages_block(zone, page, migratetype); | 124 | nr_pages = move_freepages_block(zone, page, migratetype, NULL); |
124 | __mod_zone_freepage_state(zone, nr_pages, migratetype); | 125 | __mod_zone_freepage_state(zone, nr_pages, migratetype); |
125 | } | 126 | } |
126 | set_pageblock_migratetype(page, migratetype); | 127 | set_pageblock_migratetype(page, migratetype); |
@@ -205,7 +206,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | |||
205 | pfn < end_pfn; | 206 | pfn < end_pfn; |
206 | pfn += pageblock_nr_pages) { | 207 | pfn += pageblock_nr_pages) { |
207 | page = __first_valid_page(pfn, pageblock_nr_pages); | 208 | page = __first_valid_page(pfn, pageblock_nr_pages); |
208 | if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 209 | if (!page || !is_migrate_isolate_page(page)) |
209 | continue; | 210 | continue; |
210 | unset_migratetype_isolate(page, migratetype); | 211 | unset_migratetype_isolate(page, migratetype); |
211 | } | 212 | } |
@@ -262,7 +263,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, | |||
262 | */ | 263 | */ |
263 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 264 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
264 | page = __first_valid_page(pfn, pageblock_nr_pages); | 265 | page = __first_valid_page(pfn, pageblock_nr_pages); |
265 | if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 266 | if (page && !is_migrate_isolate_page(page)) |
266 | break; | 267 | break; |
267 | } | 268 | } |
268 | page = __first_valid_page(start_pfn, end_pfn - start_pfn); | 269 | page = __first_valid_page(start_pfn, end_pfn - start_pfn); |
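Both this file and the page_alloc.c hunk earlier replace open-coded tests such as "get_pageblock_migratetype(page) != MIGRATE_ISOLATE" with named predicates (is_migrate_isolate_page(), is_migrate_highatomic()). A standalone toy illustration of the shape, with invented demo_ names and values:

#include <stdbool.h>
#include <stdio.h>

/* Toy migratetype enum; values are illustrative only. */
enum demo_migratetype {
	DEMO_MOVABLE,
	DEMO_UNMOVABLE,
	DEMO_HIGHATOMIC,
	DEMO_ISOLATE,
};

/* A named predicate gives one obvious place to change if the encoding ever
 * does, and reads better at every call site than a raw comparison. */
static bool demo_is_highatomic(enum demo_migratetype mt)
{
	return mt == DEMO_HIGHATOMIC;
}

static bool demo_is_isolate(enum demo_migratetype mt)
{
	return mt == DEMO_ISOLATE;
}

int main(void)
{
	enum demo_migratetype mt = DEMO_HIGHATOMIC;

	if (!demo_is_isolate(mt) && !demo_is_highatomic(mt))
		printf("block could be converted to MOVABLE\n");
	else
		printf("leave block alone\n");
	return 0;
}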
diff --git a/mm/page_poison.c b/mm/page_poison.c index 2e647c65916b..be19e989ccff 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c | |||
@@ -6,7 +6,6 @@ | |||
6 | #include <linux/poison.h> | 6 | #include <linux/poison.h> |
7 | #include <linux/ratelimit.h> | 7 | #include <linux/ratelimit.h> |
8 | 8 | ||
9 | static bool __page_poisoning_enabled __read_mostly; | ||
10 | static bool want_page_poisoning __read_mostly; | 9 | static bool want_page_poisoning __read_mostly; |
11 | 10 | ||
12 | static int early_page_poison_param(char *buf) | 11 | static int early_page_poison_param(char *buf) |
@@ -19,74 +18,21 @@ early_param("page_poison", early_page_poison_param); | |||
19 | 18 | ||
20 | bool page_poisoning_enabled(void) | 19 | bool page_poisoning_enabled(void) |
21 | { | 20 | { |
22 | return __page_poisoning_enabled; | ||
23 | } | ||
24 | |||
25 | static bool need_page_poisoning(void) | ||
26 | { | ||
27 | return want_page_poisoning; | ||
28 | } | ||
29 | |||
30 | static void init_page_poisoning(void) | ||
31 | { | ||
32 | /* | 21 | /* |
33 | * page poisoning is debug page alloc for some arches. If either | 22 | * Assumes that debug_pagealloc_enabled is set before |
34 | * of those options are enabled, enable poisoning | 23 | * free_all_bootmem. |
24 | * Page poisoning is debug page alloc for some arches. If | ||
25 | * either of those options are enabled, enable poisoning. | ||
35 | */ | 26 | */ |
36 | if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) { | 27 | return (want_page_poisoning || |
37 | if (!want_page_poisoning && !debug_pagealloc_enabled()) | 28 | (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && |
38 | return; | 29 | debug_pagealloc_enabled())); |
39 | } else { | ||
40 | if (!want_page_poisoning) | ||
41 | return; | ||
42 | } | ||
43 | |||
44 | __page_poisoning_enabled = true; | ||
45 | } | ||
46 | |||
47 | struct page_ext_operations page_poisoning_ops = { | ||
48 | .need = need_page_poisoning, | ||
49 | .init = init_page_poisoning, | ||
50 | }; | ||
51 | |||
52 | static inline void set_page_poison(struct page *page) | ||
53 | { | ||
54 | struct page_ext *page_ext; | ||
55 | |||
56 | page_ext = lookup_page_ext(page); | ||
57 | if (unlikely(!page_ext)) | ||
58 | return; | ||
59 | |||
60 | __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); | ||
61 | } | ||
62 | |||
63 | static inline void clear_page_poison(struct page *page) | ||
64 | { | ||
65 | struct page_ext *page_ext; | ||
66 | |||
67 | page_ext = lookup_page_ext(page); | ||
68 | if (unlikely(!page_ext)) | ||
69 | return; | ||
70 | |||
71 | __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); | ||
72 | } | ||
73 | |||
74 | bool page_is_poisoned(struct page *page) | ||
75 | { | ||
76 | struct page_ext *page_ext; | ||
77 | |||
78 | page_ext = lookup_page_ext(page); | ||
79 | if (unlikely(!page_ext)) | ||
80 | return false; | ||
81 | |||
82 | return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); | ||
83 | } | 30 | } |
84 | 31 | ||
85 | static void poison_page(struct page *page) | 32 | static void poison_page(struct page *page) |
86 | { | 33 | { |
87 | void *addr = kmap_atomic(page); | 34 | void *addr = kmap_atomic(page); |
88 | 35 | ||
89 | set_page_poison(page); | ||
90 | memset(addr, PAGE_POISON, PAGE_SIZE); | 36 | memset(addr, PAGE_POISON, PAGE_SIZE); |
91 | kunmap_atomic(addr); | 37 | kunmap_atomic(addr); |
92 | } | 38 | } |
@@ -140,12 +86,13 @@ static void unpoison_page(struct page *page) | |||
140 | { | 86 | { |
141 | void *addr; | 87 | void *addr; |
142 | 88 | ||
143 | if (!page_is_poisoned(page)) | ||
144 | return; | ||
145 | |||
146 | addr = kmap_atomic(page); | 89 | addr = kmap_atomic(page); |
90 | /* | ||
91 | * Page poisoning when enabled poisons each and every page | ||
92 | * that is freed to buddy. Thus no extra check is done to | ||
93 | * see if a page was poisoned. | ||
94 | */ | ||
147 | check_poison_mem(addr, PAGE_SIZE); | 95 | check_poison_mem(addr, PAGE_SIZE); |
148 | clear_page_poison(page); | ||
149 | kunmap_atomic(addr); | 96 | kunmap_atomic(addr); |
150 | } | 97 | } |
151 | 98 | ||
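With the page_ext bookkeeping gone, poisoning becomes unconditional: every freed page is filled with the poison byte and every page is verified on the way back out. A standalone userspace model of that fill-and-verify cycle; the buffer size and poison value are placeholders for PAGE_SIZE and PAGE_POISON:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEMO_PAGE_SIZE   4096
#define DEMO_PAGE_POISON 0xaa	/* placeholder for the kernel's PAGE_POISON */

/* Poison on free. */
static void demo_poison(unsigned char *page)
{
	memset(page, DEMO_PAGE_POISON, DEMO_PAGE_SIZE);
}

/* Verify on reallocation: any byte that changed means something wrote to the
 * page while it sat on the free list. */
static int demo_check_poison(const unsigned char *page)
{
	size_t i;

	for (i = 0; i < DEMO_PAGE_SIZE; i++)
		if (page[i] != DEMO_PAGE_POISON)
			return -1;
	return 0;
}

int main(void)
{
	unsigned char *page = malloc(DEMO_PAGE_SIZE);

	if (!page)
		return 1;
	demo_poison(page);
	page[123] = 0;	/* simulate a stray write to a "freed" page */
	printf("poison intact: %s\n", demo_check_poison(page) ? "no" : "yes");
	free(page);
	return 0;
}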
diff --git a/mm/percpu.c b/mm/percpu.c index 60a6488e9e6d..e0aa8ae7bde7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -1284,18 +1284,7 @@ void free_percpu(void __percpu *ptr) | |||
1284 | } | 1284 | } |
1285 | EXPORT_SYMBOL_GPL(free_percpu); | 1285 | EXPORT_SYMBOL_GPL(free_percpu); |
1286 | 1286 | ||
1287 | /** | 1287 | bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr) |
1288 | * is_kernel_percpu_address - test whether address is from static percpu area | ||
1289 | * @addr: address to test | ||
1290 | * | ||
1291 | * Test whether @addr belongs to in-kernel static percpu area. Module | ||
1292 | * static percpu areas are not considered. For those, use | ||
1293 | * is_module_percpu_address(). | ||
1294 | * | ||
1295 | * RETURNS: | ||
1296 | * %true if @addr is from in-kernel static percpu area, %false otherwise. | ||
1297 | */ | ||
1298 | bool is_kernel_percpu_address(unsigned long addr) | ||
1299 | { | 1288 | { |
1300 | #ifdef CONFIG_SMP | 1289 | #ifdef CONFIG_SMP |
1301 | const size_t static_size = __per_cpu_end - __per_cpu_start; | 1290 | const size_t static_size = __per_cpu_end - __per_cpu_start; |
@@ -1304,16 +1293,39 @@ bool is_kernel_percpu_address(unsigned long addr) | |||
1304 | 1293 | ||
1305 | for_each_possible_cpu(cpu) { | 1294 | for_each_possible_cpu(cpu) { |
1306 | void *start = per_cpu_ptr(base, cpu); | 1295 | void *start = per_cpu_ptr(base, cpu); |
1296 | void *va = (void *)addr; | ||
1307 | 1297 | ||
1308 | if ((void *)addr >= start && (void *)addr < start + static_size) | 1298 | if (va >= start && va < start + static_size) { |
1299 | if (can_addr) { | ||
1300 | *can_addr = (unsigned long) (va - start); | ||
1301 | *can_addr += (unsigned long) | ||
1302 | per_cpu_ptr(base, get_boot_cpu_id()); | ||
1303 | } | ||
1309 | return true; | 1304 | return true; |
1310 | } | 1305 | } |
1306 | } | ||
1311 | #endif | 1307 | #endif |
1312 | /* on UP, can't distinguish from other static vars, always false */ | 1308 | /* on UP, can't distinguish from other static vars, always false */ |
1313 | return false; | 1309 | return false; |
1314 | } | 1310 | } |
1315 | 1311 | ||
1316 | /** | 1312 | /** |
1313 | * is_kernel_percpu_address - test whether address is from static percpu area | ||
1314 | * @addr: address to test | ||
1315 | * | ||
1316 | * Test whether @addr belongs to in-kernel static percpu area. Module | ||
1317 | * static percpu areas are not considered. For those, use | ||
1318 | * is_module_percpu_address(). | ||
1319 | * | ||
1320 | * RETURNS: | ||
1321 | * %true if @addr is from in-kernel static percpu area, %false otherwise. | ||
1322 | */ | ||
1323 | bool is_kernel_percpu_address(unsigned long addr) | ||
1324 | { | ||
1325 | return __is_kernel_percpu_address(addr, NULL); | ||
1326 | } | ||
1327 | |||
1328 | /** | ||
1317 | * per_cpu_ptr_to_phys - convert translated percpu address to physical address | 1329 | * per_cpu_ptr_to_phys - convert translated percpu address to physical address |
1318 | * @addr: the address to be converted to physical address | 1330 | * @addr: the address to be converted to physical address |
1319 | * | 1331 | * |
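The new __is_kernel_percpu_address() does two things: it reports whether an address falls inside any CPU's copy of the static per-CPU area, and it optionally translates it to one canonical copy so callers can compare addresses taken on different CPUs. A standalone toy model with invented sizes and names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_NR_CPUS     4
#define DEMO_STATIC_SIZE 256	/* stand-in for __per_cpu_end - __per_cpu_start */

static unsigned char demo_percpu_area[DEMO_NR_CPUS][DEMO_STATIC_SIZE];

/* Report membership and, if asked, translate addr into CPU 0's copy so two
 * addresses of the same variable taken on different CPUs compare equal. */
static bool demo_is_percpu_address(uintptr_t addr, uintptr_t *can_addr)
{
	int cpu;

	for (cpu = 0; cpu < DEMO_NR_CPUS; cpu++) {
		uintptr_t start = (uintptr_t)demo_percpu_area[cpu];

		if (addr >= start && addr < start + DEMO_STATIC_SIZE) {
			if (can_addr)
				*can_addr = (uintptr_t)demo_percpu_area[0] +
					    (addr - start);
			return true;
		}
	}
	return false;
}

int main(void)
{
	uintptr_t addr = (uintptr_t)&demo_percpu_area[2][17];
	uintptr_t canon = 0;

	printf("percpu? %d\n", demo_is_percpu_address(addr, &canon));
	printf("canonical copy matches: %d\n",
	       canon == (uintptr_t)&demo_percpu_area[0][17]);
	return 0;
}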
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -724,7 +724,7 @@ struct page_referenced_arg { | |||
724 | /* | 724 | /* |
725 | * arg: page_referenced_arg will be passed | 725 | * arg: page_referenced_arg will be passed |
726 | */ | 726 | */ |
727 | static int page_referenced_one(struct page *page, struct vm_area_struct *vma, | 727 | static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, |
728 | unsigned long address, void *arg) | 728 | unsigned long address, void *arg) |
729 | { | 729 | { |
730 | struct page_referenced_arg *pra = arg; | 730 | struct page_referenced_arg *pra = arg; |
@@ -741,7 +741,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
741 | if (vma->vm_flags & VM_LOCKED) { | 741 | if (vma->vm_flags & VM_LOCKED) { |
742 | page_vma_mapped_walk_done(&pvmw); | 742 | page_vma_mapped_walk_done(&pvmw); |
743 | pra->vm_flags |= VM_LOCKED; | 743 | pra->vm_flags |= VM_LOCKED; |
744 | return SWAP_FAIL; /* To break the loop */ | 744 | return false; /* To break the loop */ |
745 | } | 745 | } |
746 | 746 | ||
747 | if (pvmw.pte) { | 747 | if (pvmw.pte) { |
@@ -781,9 +781,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
781 | } | 781 | } |
782 | 782 | ||
783 | if (!pra->mapcount) | 783 | if (!pra->mapcount) |
784 | return SWAP_SUCCESS; /* To break the loop */ | 784 | return false; /* To break the loop */ |
785 | 785 | ||
786 | return SWAP_AGAIN; | 786 | return true; |
787 | } | 787 | } |
788 | 788 | ||
789 | static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) | 789 | static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) |
@@ -812,7 +812,6 @@ int page_referenced(struct page *page, | |||
812 | struct mem_cgroup *memcg, | 812 | struct mem_cgroup *memcg, |
813 | unsigned long *vm_flags) | 813 | unsigned long *vm_flags) |
814 | { | 814 | { |
815 | int ret; | ||
816 | int we_locked = 0; | 815 | int we_locked = 0; |
817 | struct page_referenced_arg pra = { | 816 | struct page_referenced_arg pra = { |
818 | .mapcount = total_mapcount(page), | 817 | .mapcount = total_mapcount(page), |
@@ -846,7 +845,7 @@ int page_referenced(struct page *page, | |||
846 | rwc.invalid_vma = invalid_page_referenced_vma; | 845 | rwc.invalid_vma = invalid_page_referenced_vma; |
847 | } | 846 | } |
848 | 847 | ||
849 | ret = rmap_walk(page, &rwc); | 848 | rmap_walk(page, &rwc); |
850 | *vm_flags = pra.vm_flags; | 849 | *vm_flags = pra.vm_flags; |
851 | 850 | ||
852 | if (we_locked) | 851 | if (we_locked) |
@@ -855,7 +854,7 @@ int page_referenced(struct page *page, | |||
855 | return pra.referenced; | 854 | return pra.referenced; |
856 | } | 855 | } |
857 | 856 | ||
858 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | 857 | static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, |
859 | unsigned long address, void *arg) | 858 | unsigned long address, void *arg) |
860 | { | 859 | { |
861 | struct page_vma_mapped_walk pvmw = { | 860 | struct page_vma_mapped_walk pvmw = { |
@@ -908,7 +907,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, | |||
908 | } | 907 | } |
909 | } | 908 | } |
910 | 909 | ||
911 | return SWAP_AGAIN; | 910 | return true; |
912 | } | 911 | } |
913 | 912 | ||
914 | static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) | 913 | static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) |
@@ -1159,7 +1158,7 @@ void page_add_file_rmap(struct page *page, bool compound) | |||
1159 | goto out; | 1158 | goto out; |
1160 | } | 1159 | } |
1161 | __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); | 1160 | __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); |
1162 | mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr); | 1161 | mod_memcg_page_state(page, NR_FILE_MAPPED, nr); |
1163 | out: | 1162 | out: |
1164 | unlock_page_memcg(page); | 1163 | unlock_page_memcg(page); |
1165 | } | 1164 | } |
@@ -1199,7 +1198,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) | |||
1199 | * pte lock(a spinlock) is held, which implies preemption disabled. | 1198 | * pte lock(a spinlock) is held, which implies preemption disabled. |
1200 | */ | 1199 | */ |
1201 | __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); | 1200 | __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); |
1202 | mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr); | 1201 | mod_memcg_page_state(page, NR_FILE_MAPPED, -nr); |
1203 | 1202 | ||
1204 | if (unlikely(PageMlocked(page))) | 1203 | if (unlikely(PageMlocked(page))) |
1205 | clear_page_mlock(page); | 1204 | clear_page_mlock(page); |
@@ -1288,15 +1287,10 @@ void page_remove_rmap(struct page *page, bool compound) | |||
1288 | */ | 1287 | */ |
1289 | } | 1288 | } |
1290 | 1289 | ||
1291 | struct rmap_private { | ||
1292 | enum ttu_flags flags; | ||
1293 | int lazyfreed; | ||
1294 | }; | ||
1295 | |||
1296 | /* | 1290 | /* |
1297 | * @arg: enum ttu_flags will be passed to this argument | 1291 | * @arg: enum ttu_flags will be passed to this argument |
1298 | */ | 1292 | */ |
1299 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 1293 | static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
1300 | unsigned long address, void *arg) | 1294 | unsigned long address, void *arg) |
1301 | { | 1295 | { |
1302 | struct mm_struct *mm = vma->vm_mm; | 1296 | struct mm_struct *mm = vma->vm_mm; |
@@ -1307,13 +1301,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1307 | }; | 1301 | }; |
1308 | pte_t pteval; | 1302 | pte_t pteval; |
1309 | struct page *subpage; | 1303 | struct page *subpage; |
1310 | int ret = SWAP_AGAIN; | 1304 | bool ret = true; |
1311 | struct rmap_private *rp = arg; | 1305 | enum ttu_flags flags = (enum ttu_flags)arg; |
1312 | enum ttu_flags flags = rp->flags; | ||
1313 | 1306 | ||
1314 | /* munlock has nothing to gain from examining un-locked vmas */ | 1307 | /* munlock has nothing to gain from examining un-locked vmas */ |
1315 | if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) | 1308 | if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) |
1316 | return SWAP_AGAIN; | 1309 | return true; |
1317 | 1310 | ||
1318 | if (flags & TTU_SPLIT_HUGE_PMD) { | 1311 | if (flags & TTU_SPLIT_HUGE_PMD) { |
1319 | split_huge_pmd_address(vma, address, | 1312 | split_huge_pmd_address(vma, address, |
@@ -1336,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1336 | */ | 1329 | */ |
1337 | mlock_vma_page(page); | 1330 | mlock_vma_page(page); |
1338 | } | 1331 | } |
1339 | ret = SWAP_MLOCK; | 1332 | ret = false; |
1340 | page_vma_mapped_walk_done(&pvmw); | 1333 | page_vma_mapped_walk_done(&pvmw); |
1341 | break; | 1334 | break; |
1342 | } | 1335 | } |
@@ -1354,7 +1347,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1354 | if (!(flags & TTU_IGNORE_ACCESS)) { | 1347 | if (!(flags & TTU_IGNORE_ACCESS)) { |
1355 | if (ptep_clear_flush_young_notify(vma, address, | 1348 | if (ptep_clear_flush_young_notify(vma, address, |
1356 | pvmw.pte)) { | 1349 | pvmw.pte)) { |
1357 | ret = SWAP_FAIL; | 1350 | ret = false; |
1358 | page_vma_mapped_walk_done(&pvmw); | 1351 | page_vma_mapped_walk_done(&pvmw); |
1359 | break; | 1352 | break; |
1360 | } | 1353 | } |
@@ -1424,18 +1417,34 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1424 | * Store the swap location in the pte. | 1417 | * Store the swap location in the pte. |
1425 | * See handle_pte_fault() ... | 1418 | * See handle_pte_fault() ... |
1426 | */ | 1419 | */ |
1427 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); | 1420 | if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { |
1421 | WARN_ON_ONCE(1); | ||
1422 | ret = false; | ||
1423 | page_vma_mapped_walk_done(&pvmw); | ||
1424 | break; | ||
1425 | } | ||
1426 | |||
1427 | /* MADV_FREE page check */ | ||
1428 | if (!PageSwapBacked(page)) { | ||
1429 | if (!PageDirty(page)) { | ||
1430 | dec_mm_counter(mm, MM_ANONPAGES); | ||
1431 | goto discard; | ||
1432 | } | ||
1428 | 1433 | ||
1429 | if (!PageDirty(page) && (flags & TTU_LZFREE)) { | 1434 | /* |
1430 | /* It's a freeable page by MADV_FREE */ | 1435 | * If the page was redirtied, it cannot be |
1431 | dec_mm_counter(mm, MM_ANONPAGES); | 1436 | * discarded. Remap the page to page table. |
1432 | rp->lazyfreed++; | 1437 | */ |
1433 | goto discard; | 1438 | set_pte_at(mm, address, pvmw.pte, pteval); |
1439 | SetPageSwapBacked(page); | ||
1440 | ret = false; | ||
1441 | page_vma_mapped_walk_done(&pvmw); | ||
1442 | break; | ||
1434 | } | 1443 | } |
1435 | 1444 | ||
1436 | if (swap_duplicate(entry) < 0) { | 1445 | if (swap_duplicate(entry) < 0) { |
1437 | set_pte_at(mm, address, pvmw.pte, pteval); | 1446 | set_pte_at(mm, address, pvmw.pte, pteval); |
1438 | ret = SWAP_FAIL; | 1447 | ret = false; |
1439 | page_vma_mapped_walk_done(&pvmw); | 1448 | page_vma_mapped_walk_done(&pvmw); |
1440 | break; | 1449 | break; |
1441 | } | 1450 | } |
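The new branch above encodes the MADV_FREE rules: an anonymous page with PageSwapBacked cleared is "lazyfree"; while it is still clean it can be unmapped and dropped without going to swap, but if it was written to again it must be remapped and treated as a normal swap-backed page. A standalone toy model of just that decision; the struct and enum are invented:

#include <stdbool.h>
#include <stdio.h>

/* Toy page state; only the two bits the new logic cares about. */
struct demo_page {
	bool swap_backed;	/* normal anon page? false means lazyfree */
	bool dirty;		/* written to since MADV_FREE? */
};

enum demo_action { DEMO_DISCARD, DEMO_RESTORE_PTE, DEMO_ADD_SWAP_ENTRY };

/* Clean lazyfree pages are simply dropped; redirtied ones are put back and
 * become ordinary anonymous pages again. */
static enum demo_action demo_unmap_anon(struct demo_page *page)
{
	if (!page->swap_backed) {
		if (!page->dirty)
			return DEMO_DISCARD;
		page->swap_backed = true;	/* models SetPageSwapBacked() */
		return DEMO_RESTORE_PTE;
	}
	return DEMO_ADD_SWAP_ENTRY;
}

int main(void)
{
	struct demo_page clean = { .swap_backed = false, .dirty = false };
	struct demo_page redirtied = { .swap_backed = false, .dirty = true };

	printf("clean lazyfree -> %d (discard)\n", demo_unmap_anon(&clean));
	printf("redirtied page -> %d (restore)\n", demo_unmap_anon(&redirtied));
	return 0;
}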
@@ -1492,24 +1501,14 @@ static int page_mapcount_is_zero(struct page *page) | |||
1492 | * | 1501 | * |
1493 | * Tries to remove all the page table entries which are mapping this | 1502 | * Tries to remove all the page table entries which are mapping this |
1494 | * page, used in the pageout path. Caller must hold the page lock. | 1503 | * page, used in the pageout path. Caller must hold the page lock. |
1495 | * Return values are: | ||
1496 | * | 1504 | * |
1497 | * SWAP_SUCCESS - we succeeded in removing all mappings | 1505 | * If unmap is successful, return true. Otherwise, false. |
1498 | * SWAP_AGAIN - we missed a mapping, try again later | ||
1499 | * SWAP_FAIL - the page is unswappable | ||
1500 | * SWAP_MLOCK - page is mlocked. | ||
1501 | */ | 1506 | */ |
1502 | int try_to_unmap(struct page *page, enum ttu_flags flags) | 1507 | bool try_to_unmap(struct page *page, enum ttu_flags flags) |
1503 | { | 1508 | { |
1504 | int ret; | ||
1505 | struct rmap_private rp = { | ||
1506 | .flags = flags, | ||
1507 | .lazyfreed = 0, | ||
1508 | }; | ||
1509 | |||
1510 | struct rmap_walk_control rwc = { | 1509 | struct rmap_walk_control rwc = { |
1511 | .rmap_one = try_to_unmap_one, | 1510 | .rmap_one = try_to_unmap_one, |
1512 | .arg = &rp, | 1511 | .arg = (void *)flags, |
1513 | .done = page_mapcount_is_zero, | 1512 | .done = page_mapcount_is_zero, |
1514 | .anon_lock = page_lock_anon_vma_read, | 1513 | .anon_lock = page_lock_anon_vma_read, |
1515 | }; | 1514 | }; |
@@ -1526,16 +1525,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1526 | rwc.invalid_vma = invalid_migration_vma; | 1525 | rwc.invalid_vma = invalid_migration_vma; |
1527 | 1526 | ||
1528 | if (flags & TTU_RMAP_LOCKED) | 1527 | if (flags & TTU_RMAP_LOCKED) |
1529 | ret = rmap_walk_locked(page, &rwc); | 1528 | rmap_walk_locked(page, &rwc); |
1530 | else | 1529 | else |
1531 | ret = rmap_walk(page, &rwc); | 1530 | rmap_walk(page, &rwc); |
1532 | 1531 | ||
1533 | if (ret != SWAP_MLOCK && !page_mapcount(page)) { | 1532 | return !page_mapcount(page) ? true : false; |
1534 | ret = SWAP_SUCCESS; | ||
1535 | if (rp.lazyfreed && !PageDirty(page)) | ||
1536 | ret = SWAP_LZFREE; | ||
1537 | } | ||
1538 | return ret; | ||
1539 | } | 1533 | } |
1540 | 1534 | ||
1541 | static int page_not_mapped(struct page *page) | 1535 | static int page_not_mapped(struct page *page) |
@@ -1550,34 +1544,22 @@ static int page_not_mapped(struct page *page) | |||
1550 | * Called from munlock code. Checks all of the VMAs mapping the page | 1544 | * Called from munlock code. Checks all of the VMAs mapping the page |
1551 | * to make sure nobody else has this page mlocked. The page will be | 1545 | * to make sure nobody else has this page mlocked. The page will be |
1552 | * returned with PG_mlocked cleared if no other vmas have it mlocked. | 1546 | * returned with PG_mlocked cleared if no other vmas have it mlocked. |
1553 | * | ||
1554 | * Return values are: | ||
1555 | * | ||
1556 | * SWAP_AGAIN - no vma is holding page mlocked, or, | ||
1557 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem | ||
1558 | * SWAP_FAIL - page cannot be located at present | ||
1559 | * SWAP_MLOCK - page is now mlocked. | ||
1560 | */ | 1547 | */ |
1561 | int try_to_munlock(struct page *page) | ||
1562 | { | ||
1563 | int ret; | ||
1564 | struct rmap_private rp = { | ||
1565 | .flags = TTU_MUNLOCK, | ||
1566 | .lazyfreed = 0, | ||
1567 | }; | ||
1568 | 1548 | ||
1549 | void try_to_munlock(struct page *page) | ||
1550 | { | ||
1569 | struct rmap_walk_control rwc = { | 1551 | struct rmap_walk_control rwc = { |
1570 | .rmap_one = try_to_unmap_one, | 1552 | .rmap_one = try_to_unmap_one, |
1571 | .arg = &rp, | 1553 | .arg = (void *)TTU_MUNLOCK, |
1572 | .done = page_not_mapped, | 1554 | .done = page_not_mapped, |
1573 | .anon_lock = page_lock_anon_vma_read, | 1555 | .anon_lock = page_lock_anon_vma_read, |
1574 | 1556 | ||
1575 | }; | 1557 | }; |
1576 | 1558 | ||
1577 | VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); | 1559 | VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); |
1560 | VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); | ||
1578 | 1561 | ||
1579 | ret = rmap_walk(page, &rwc); | 1562 | rmap_walk(page, &rwc); |
1580 | return ret; | ||
1581 | } | 1563 | } |
1582 | 1564 | ||
1583 | void __put_anon_vma(struct anon_vma *anon_vma) | 1565 | void __put_anon_vma(struct anon_vma *anon_vma) |
@@ -1625,13 +1607,12 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, | |||
1625 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | 1607 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be |
1626 | * LOCKED. | 1608 | * LOCKED. |
1627 | */ | 1609 | */ |
1628 | static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, | 1610 | static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, |
1629 | bool locked) | 1611 | bool locked) |
1630 | { | 1612 | { |
1631 | struct anon_vma *anon_vma; | 1613 | struct anon_vma *anon_vma; |
1632 | pgoff_t pgoff_start, pgoff_end; | 1614 | pgoff_t pgoff_start, pgoff_end; |
1633 | struct anon_vma_chain *avc; | 1615 | struct anon_vma_chain *avc; |
1634 | int ret = SWAP_AGAIN; | ||
1635 | 1616 | ||
1636 | if (locked) { | 1617 | if (locked) { |
1637 | anon_vma = page_anon_vma(page); | 1618 | anon_vma = page_anon_vma(page); |
@@ -1641,7 +1622,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, | |||
1641 | anon_vma = rmap_walk_anon_lock(page, rwc); | 1622 | anon_vma = rmap_walk_anon_lock(page, rwc); |
1642 | } | 1623 | } |
1643 | if (!anon_vma) | 1624 | if (!anon_vma) |
1644 | return ret; | 1625 | return; |
1645 | 1626 | ||
1646 | pgoff_start = page_to_pgoff(page); | 1627 | pgoff_start = page_to_pgoff(page); |
1647 | pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; | 1628 | pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; |
@@ -1655,8 +1636,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, | |||
1655 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | 1636 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) |
1656 | continue; | 1637 | continue; |
1657 | 1638 | ||
1658 | ret = rwc->rmap_one(page, vma, address, rwc->arg); | 1639 | if (!rwc->rmap_one(page, vma, address, rwc->arg)) |
1659 | if (ret != SWAP_AGAIN) | ||
1660 | break; | 1640 | break; |
1661 | if (rwc->done && rwc->done(page)) | 1641 | if (rwc->done && rwc->done(page)) |
1662 | break; | 1642 | break; |
@@ -1664,7 +1644,6 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, | |||
1664 | 1644 | ||
1665 | if (!locked) | 1645 | if (!locked) |
1666 | anon_vma_unlock_read(anon_vma); | 1646 | anon_vma_unlock_read(anon_vma); |
1667 | return ret; | ||
1668 | } | 1647 | } |
1669 | 1648 | ||
1670 | /* | 1649 | /* |
@@ -1680,13 +1659,12 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, | |||
1680 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | 1659 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be |
1681 | * LOCKED. | 1660 | * LOCKED. |
1682 | */ | 1661 | */ |
1683 | static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, | 1662 | static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, |
1684 | bool locked) | 1663 | bool locked) |
1685 | { | 1664 | { |
1686 | struct address_space *mapping = page_mapping(page); | 1665 | struct address_space *mapping = page_mapping(page); |
1687 | pgoff_t pgoff_start, pgoff_end; | 1666 | pgoff_t pgoff_start, pgoff_end; |
1688 | struct vm_area_struct *vma; | 1667 | struct vm_area_struct *vma; |
1689 | int ret = SWAP_AGAIN; | ||
1690 | 1668 | ||
1691 | /* | 1669 | /* |
1692 | * The page lock not only makes sure that page->mapping cannot | 1670 | * The page lock not only makes sure that page->mapping cannot |
@@ -1697,7 +1675,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, | |||
1697 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 1675 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
1698 | 1676 | ||
1699 | if (!mapping) | 1677 | if (!mapping) |
1700 | return ret; | 1678 | return; |
1701 | 1679 | ||
1702 | pgoff_start = page_to_pgoff(page); | 1680 | pgoff_start = page_to_pgoff(page); |
1703 | pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; | 1681 | pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; |
@@ -1712,8 +1690,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, | |||
1712 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) | 1690 | if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) |
1713 | continue; | 1691 | continue; |
1714 | 1692 | ||
1715 | ret = rwc->rmap_one(page, vma, address, rwc->arg); | 1693 | if (!rwc->rmap_one(page, vma, address, rwc->arg)) |
1716 | if (ret != SWAP_AGAIN) | ||
1717 | goto done; | 1694 | goto done; |
1718 | if (rwc->done && rwc->done(page)) | 1695 | if (rwc->done && rwc->done(page)) |
1719 | goto done; | 1696 | goto done; |
@@ -1722,28 +1699,27 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, | |||
1722 | done: | 1699 | done: |
1723 | if (!locked) | 1700 | if (!locked) |
1724 | i_mmap_unlock_read(mapping); | 1701 | i_mmap_unlock_read(mapping); |
1725 | return ret; | ||
1726 | } | 1702 | } |
1727 | 1703 | ||
1728 | int rmap_walk(struct page *page, struct rmap_walk_control *rwc) | 1704 | void rmap_walk(struct page *page, struct rmap_walk_control *rwc) |
1729 | { | 1705 | { |
1730 | if (unlikely(PageKsm(page))) | 1706 | if (unlikely(PageKsm(page))) |
1731 | return rmap_walk_ksm(page, rwc); | 1707 | rmap_walk_ksm(page, rwc); |
1732 | else if (PageAnon(page)) | 1708 | else if (PageAnon(page)) |
1733 | return rmap_walk_anon(page, rwc, false); | 1709 | rmap_walk_anon(page, rwc, false); |
1734 | else | 1710 | else |
1735 | return rmap_walk_file(page, rwc, false); | 1711 | rmap_walk_file(page, rwc, false); |
1736 | } | 1712 | } |
1737 | 1713 | ||
1738 | /* Like rmap_walk, but caller holds relevant rmap lock */ | 1714 | /* Like rmap_walk, but caller holds relevant rmap lock */ |
1739 | int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) | 1715 | void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) |
1740 | { | 1716 | { |
1741 | /* no ksm support for now */ | 1717 | /* no ksm support for now */ |
1742 | VM_BUG_ON_PAGE(PageKsm(page), page); | 1718 | VM_BUG_ON_PAGE(PageKsm(page), page); |
1743 | if (PageAnon(page)) | 1719 | if (PageAnon(page)) |
1744 | return rmap_walk_anon(page, rwc, true); | 1720 | rmap_walk_anon(page, rwc, true); |
1745 | else | 1721 | else |
1746 | return rmap_walk_file(page, rwc, true); | 1722 | rmap_walk_file(page, rwc, true); |
1747 | } | 1723 | } |
1748 | 1724 | ||
1749 | #ifdef CONFIG_HUGETLB_PAGE | 1725 | #ifdef CONFIG_HUGETLB_PAGE |
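The rmap.c changes above convert the SWAP_AGAIN/SWAP_FAIL return-code protocol into a plain bool: the per-VMA callback returns true to keep walking and false to stop, and the walk functions no longer return anything. A standalone toy model of that control flow; all demo_ types are invented:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy stand-in for a vma; only the control flow matters here. */
struct demo_vma { int id; };

struct demo_walk_control {
	/* return true to keep walking, false to stop: the new convention */
	bool (*visit_one)(struct demo_vma *vma, void *arg);
	void *arg;
};

static void demo_walk(struct demo_vma *vmas, size_t nr,
		      struct demo_walk_control *wc)
{
	size_t i;

	for (i = 0; i < nr; i++)
		if (!wc->visit_one(&vmas[i], wc->arg))
			break;	/* callback asked to terminate early */
}

static bool stop_at(struct demo_vma *vma, void *arg)
{
	int limit = *(int *)arg;

	printf("visiting vma %d\n", vma->id);
	return vma->id != limit;
}

int main(void)
{
	struct demo_vma vmas[] = { {1}, {2}, {3}, {4} };
	int limit = 2;
	struct demo_walk_control wc = { .visit_one = stop_at, .arg = &limit };

	demo_walk(vmas, 4, &wc);	/* prints 1, 2 and then stops */
	return 0;
}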
diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 0fd21670b513..6bb4deb12e78 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c | |||
@@ -9,11 +9,12 @@ | |||
9 | * as published by the Free Software Foundation; version 2 | 9 | * as published by the Free Software Foundation; version 2 |
10 | * of the License. | 10 | * of the License. |
11 | */ | 11 | */ |
12 | #define pr_fmt(fmt) "rodata_test: " fmt | ||
13 | |||
12 | #include <linux/uaccess.h> | 14 | #include <linux/uaccess.h> |
13 | #include <asm/sections.h> | 15 | #include <asm/sections.h> |
14 | 16 | ||
15 | const int rodata_test_data = 0xC3; | 17 | const int rodata_test_data = 0xC3; |
16 | EXPORT_SYMBOL_GPL(rodata_test_data); | ||
17 | 18 | ||
18 | void rodata_test(void) | 19 | void rodata_test(void) |
19 | { | 20 | { |
@@ -23,20 +24,20 @@ void rodata_test(void) | |||
23 | /* test 1: read the value */ | 24 | /* test 1: read the value */ |
24 | /* If this test fails, some previous testrun has clobbered the state */ | 25 | /* If this test fails, some previous testrun has clobbered the state */ |
25 | if (!rodata_test_data) { | 26 | if (!rodata_test_data) { |
26 | pr_err("rodata_test: test 1 fails (start data)\n"); | 27 | pr_err("test 1 fails (start data)\n"); |
27 | return; | 28 | return; |
28 | } | 29 | } |
29 | 30 | ||
30 | /* test 2: write to the variable; this should fault */ | 31 | /* test 2: write to the variable; this should fault */ |
31 | if (!probe_kernel_write((void *)&rodata_test_data, | 32 | if (!probe_kernel_write((void *)&rodata_test_data, |
32 | (void *)&zero, sizeof(zero))) { | 33 | (void *)&zero, sizeof(zero))) { |
33 | pr_err("rodata_test: test data was not read only\n"); | 34 | pr_err("test data was not read only\n"); |
34 | return; | 35 | return; |
35 | } | 36 | } |
36 | 37 | ||
37 | /* test 3: check the value hasn't changed */ | 38 | /* test 3: check the value hasn't changed */ |
38 | if (rodata_test_data == zero) { | 39 | if (rodata_test_data == zero) { |
39 | pr_err("rodata_test: test data was changed\n"); | 40 | pr_err("test data was changed\n"); |
40 | return; | 41 | return; |
41 | } | 42 | } |
42 | 43 | ||
@@ -44,13 +45,13 @@ void rodata_test(void) | |||
44 | start = (unsigned long)__start_rodata; | 45 | start = (unsigned long)__start_rodata; |
45 | end = (unsigned long)__end_rodata; | 46 | end = (unsigned long)__end_rodata; |
46 | if (start & (PAGE_SIZE - 1)) { | 47 | if (start & (PAGE_SIZE - 1)) { |
47 | pr_err("rodata_test: start of .rodata is not page size aligned\n"); | 48 | pr_err("start of .rodata is not page size aligned\n"); |
48 | return; | 49 | return; |
49 | } | 50 | } |
50 | if (end & (PAGE_SIZE - 1)) { | 51 | if (end & (PAGE_SIZE - 1)) { |
51 | pr_err("rodata_test: end of .rodata is not page size aligned\n"); | 52 | pr_err("end of .rodata is not page size aligned\n"); |
52 | return; | 53 | return; |
53 | } | 54 | } |
54 | 55 | ||
55 | pr_info("rodata_test: all tests were successful\n"); | 56 | pr_info("all tests were successful\n"); |
56 | } | 57 | } |
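The pr_fmt() define added above is the standard way to get a per-file prefix without repeating it in every message; it has to appear before the printk headers are pulled in. A minimal kernel-style sketch, with demo_report() being my own example function:

/* Must come before any printk-related include so pr_err()/pr_info()
 * pick up the prefix, exactly as in the hunk above. */
#define pr_fmt(fmt) "rodata_test: " fmt

#include <linux/printk.h>

static void demo_report(int testno)
{
	pr_err("test %d fails\n", testno);	/* logs "rodata_test: test <n> fails" */
}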
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -3879,7 +3879,12 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3879 | 3879 | ||
3880 | prev = cachep->cpu_cache; | 3880 | prev = cachep->cpu_cache; |
3881 | cachep->cpu_cache = cpu_cache; | 3881 | cachep->cpu_cache = cpu_cache; |
3882 | kick_all_cpus_sync(); | 3882 | /* |
3883 | * Without a previous cpu_cache there's no need to synchronize remote | ||
3884 | * cpus, so skip the IPIs. | ||
3885 | */ | ||
3886 | if (prev) | ||
3887 | kick_all_cpus_sync(); | ||
3883 | 3888 | ||
3884 | check_irq_on(); | 3889 | check_irq_on(); |
3885 | cachep->batchcount = batchcount; | 3890 | cachep->batchcount = batchcount; |
diff --git a/mm/sparse.c b/mm/sparse.c index db6bf3c97ea2..6903c8fc3085 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -248,10 +248,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms, | |||
248 | 248 | ||
249 | unsigned long usemap_size(void) | 249 | unsigned long usemap_size(void) |
250 | { | 250 | { |
251 | unsigned long size_bytes; | 251 | return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long); |
252 | size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; | ||
253 | size_bytes = roundup(size_bytes, sizeof(unsigned long)); | ||
254 | return size_bytes; | ||
255 | } | 252 | } |
256 | 253 | ||
257 | #ifdef CONFIG_MEMORY_HOTPLUG | 254 | #ifdef CONFIG_MEMORY_HOTPLUG |
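The old and new usemap_size() expressions are arithmetically identical: rounding the bit count up to whole bytes and then to whole longs is the same as taking BITS_TO_LONGS() directly. A standalone check, with the macros redefined locally for the demo:

#include <stdio.h>

#define BITS_PER_LONG	 (8 * sizeof(unsigned long))
#define ROUNDUP(x, y)	 ((((x) + (y) - 1) / (y)) * (y))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

/* Old formula: bits -> bytes (rounded up) -> rounded up to whole longs. */
static unsigned long old_usemap_size(unsigned long bits)
{
	unsigned long bytes = ROUNDUP(bits, 8) / 8;

	return ROUNDUP(bytes, sizeof(unsigned long));
}

/* New formula from the hunk above: whole longs needed for the bitmap. */
static unsigned long new_usemap_size(unsigned long bits)
{
	return BITS_TO_LONGS(bits) * sizeof(unsigned long);
}

int main(void)
{
	unsigned long bits;

	/* The two expressions agree for every bit count tried here. */
	for (bits = 1; bits <= 1024; bits++)
		if (old_usemap_size(bits) != new_usemap_size(bits))
			printf("mismatch at %lu\n", bits);
	printf("e.g. %lu bits -> %lu bytes\n", 100UL, new_usemap_size(100));
	return 0;
}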
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -46,7 +46,7 @@ int page_cluster; | |||
46 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); | 46 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); |
47 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); | 47 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); |
48 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); | 48 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); |
49 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); | 49 | static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); |
50 | #ifdef CONFIG_SMP | 50 | #ifdef CONFIG_SMP |
51 | static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); | 51 | static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); |
52 | #endif | 52 | #endif |
@@ -97,6 +97,16 @@ static void __put_compound_page(struct page *page) | |||
97 | 97 | ||
98 | void __put_page(struct page *page) | 98 | void __put_page(struct page *page) |
99 | { | 99 | { |
100 | if (is_zone_device_page(page)) { | ||
101 | put_dev_pagemap(page->pgmap); | ||
102 | |||
103 | /* | ||
104 | * The page belongs to the device that created pgmap. Do | ||
105 | * not return it to page allocator. | ||
106 | */ | ||
107 | return; | ||
108 | } | ||
109 | |||
100 | if (unlikely(PageCompound(page))) | 110 | if (unlikely(PageCompound(page))) |
101 | __put_compound_page(page); | 111 | __put_compound_page(page); |
102 | else | 112 | else |
@@ -561,20 +571,27 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, | |||
561 | } | 571 | } |
562 | 572 | ||
563 | 573 | ||
564 | static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, | 574 | static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, |
565 | void *arg) | 575 | void *arg) |
566 | { | 576 | { |
567 | if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { | 577 | if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && |
568 | int file = page_is_file_cache(page); | 578 | !PageUnevictable(page)) { |
569 | int lru = page_lru_base_type(page); | 579 | bool active = PageActive(page); |
570 | 580 | ||
571 | del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); | 581 | del_page_from_lru_list(page, lruvec, |
582 | LRU_INACTIVE_ANON + active); | ||
572 | ClearPageActive(page); | 583 | ClearPageActive(page); |
573 | ClearPageReferenced(page); | 584 | ClearPageReferenced(page); |
574 | add_page_to_lru_list(page, lruvec, lru); | 585 | /* |
586 | * lazyfree pages are clean anonymous pages. They have | ||
587 | * SwapBacked flag cleared to distinguish normal anonymous | ||
588 | * pages | ||
589 | */ | ||
590 | ClearPageSwapBacked(page); | ||
591 | add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); | ||
575 | 592 | ||
576 | __count_vm_event(PGDEACTIVATE); | 593 | __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); |
577 | update_page_reclaim_stat(lruvec, file, 0); | 594 | update_page_reclaim_stat(lruvec, 1, 0); |
578 | } | 595 | } |
579 | } | 596 | } |
580 | 597 | ||
@@ -604,9 +621,9 @@ void lru_add_drain_cpu(int cpu) | |||
604 | if (pagevec_count(pvec)) | 621 | if (pagevec_count(pvec)) |
605 | pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); | 622 | pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); |
606 | 623 | ||
607 | pvec = &per_cpu(lru_deactivate_pvecs, cpu); | 624 | pvec = &per_cpu(lru_lazyfree_pvecs, cpu); |
608 | if (pagevec_count(pvec)) | 625 | if (pagevec_count(pvec)) |
609 | pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); | 626 | pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); |
610 | 627 | ||
611 | activate_page_drain(cpu); | 628 | activate_page_drain(cpu); |
612 | } | 629 | } |
@@ -638,22 +655,22 @@ void deactivate_file_page(struct page *page) | |||
638 | } | 655 | } |
639 | 656 | ||
640 | /** | 657 | /** |
641 | * deactivate_page - deactivate a page | 658 | * mark_page_lazyfree - make an anon page lazyfree |
642 | * @page: page to deactivate | 659 | * @page: page to deactivate |
643 | * | 660 | * |
644 | * deactivate_page() moves @page to the inactive list if @page was on the active | 661 | * mark_page_lazyfree() moves @page to the inactive file list. |
645 | * list and was not an unevictable page. This is done to accelerate the reclaim | 662 | * This is done to accelerate the reclaim of @page. |
646 | * of @page. | ||
647 | */ | 663 | */ |
648 | void deactivate_page(struct page *page) | 664 | void mark_page_lazyfree(struct page *page) |
649 | { | 665 | { |
650 | if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { | 666 | if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && |
651 | struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); | 667 | !PageUnevictable(page)) { |
668 | struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); | ||
652 | 669 | ||
653 | get_page(page); | 670 | get_page(page); |
654 | if (!pagevec_add(pvec, page) || PageCompound(page)) | 671 | if (!pagevec_add(pvec, page) || PageCompound(page)) |
655 | pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); | 672 | pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); |
656 | put_cpu_var(lru_deactivate_pvecs); | 673 | put_cpu_var(lru_lazyfree_pvecs); |
657 | } | 674 | } |
658 | } | 675 | } |
659 | 676 | ||
@@ -693,7 +710,7 @@ void lru_add_drain_all(void) | |||
693 | if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || | 710 | if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || |
694 | pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || | 711 | pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || |
695 | pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || | 712 | pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || |
696 | pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || | 713 | pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || |
697 | need_activate_page_drain(cpu)) { | 714 | need_activate_page_drain(cpu)) { |
698 | INIT_WORK(work, lru_add_drain_per_cpu); | 715 | INIT_WORK(work, lru_add_drain_per_cpu); |
699 | queue_work_on(cpu, mm_percpu_wq, work); | 716 | queue_work_on(cpu, mm_percpu_wq, work); |
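The hunks above rename the per-cpu deactivate path to the lazyfree path; from userspace this is driven by madvise(MADV_FREE), added in Linux 4.5. A minimal sketch, not part of the patch, of the call that ends up feeding mark_page_lazyfree():

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE 8		/* asm-generic value, for older libc headers */
#endif

int main(void)
{
	size_t len = 64UL << 20;	/* 64MB of anonymous memory */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0xaa, len);		/* dirty the pages */

	/*
	 * Contents are disposable from here on: clean anon pages can be
	 * dropped by reclaim instead of being swapped out. Writing to the
	 * range again before that happens simply keeps the page.
	 */
	if (madvise(buf, len, MADV_FREE))
		perror("madvise(MADV_FREE)");

	munmap(buf, len);
	return 0;
}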
diff --git a/mm/swap_slots.c b/mm/swap_slots.c index b1ccb58ad397..58f6c78f1dad 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/cpumask.h> | 31 | #include <linux/cpumask.h> |
32 | #include <linux/vmalloc.h> | 32 | #include <linux/vmalloc.h> |
33 | #include <linux/mutex.h> | 33 | #include <linux/mutex.h> |
34 | #include <linux/mm.h> | ||
34 | 35 | ||
35 | #ifdef CONFIG_SWAP | 36 | #ifdef CONFIG_SWAP |
36 | 37 | ||
@@ -119,16 +120,18 @@ static int alloc_swap_slot_cache(unsigned int cpu) | |||
119 | 120 | ||
120 | /* | 121 | /* |
121 | * Do allocation outside swap_slots_cache_mutex | 122 | * Do allocation outside swap_slots_cache_mutex |
122 | * as vzalloc could trigger reclaim and get_swap_page, | 123 | * as kvzalloc could trigger reclaim and get_swap_page, |
123 | * which can lock swap_slots_cache_mutex. | 124 | * which can lock swap_slots_cache_mutex. |
124 | */ | 125 | */ |
125 | slots = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE); | 126 | slots = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE, |
127 | GFP_KERNEL); | ||
126 | if (!slots) | 128 | if (!slots) |
127 | return -ENOMEM; | 129 | return -ENOMEM; |
128 | 130 | ||
129 | slots_ret = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE); | 131 | slots_ret = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE, |
132 | GFP_KERNEL); | ||
130 | if (!slots_ret) { | 133 | if (!slots_ret) { |
131 | vfree(slots); | 134 | kvfree(slots); |
132 | return -ENOMEM; | 135 | return -ENOMEM; |
133 | } | 136 | } |
134 | 137 | ||
@@ -152,9 +155,9 @@ static int alloc_swap_slot_cache(unsigned int cpu) | |||
152 | out: | 155 | out: |
153 | mutex_unlock(&swap_slots_cache_mutex); | 156 | mutex_unlock(&swap_slots_cache_mutex); |
154 | if (slots) | 157 | if (slots) |
155 | vfree(slots); | 158 | kvfree(slots); |
156 | if (slots_ret) | 159 | if (slots_ret) |
157 | vfree(slots_ret); | 160 | kvfree(slots_ret); |
158 | return 0; | 161 | return 0; |
159 | } | 162 | } |
160 | 163 | ||
@@ -171,7 +174,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, | |||
171 | cache->cur = 0; | 174 | cache->cur = 0; |
172 | cache->nr = 0; | 175 | cache->nr = 0; |
173 | if (free_slots && cache->slots) { | 176 | if (free_slots && cache->slots) { |
174 | vfree(cache->slots); | 177 | kvfree(cache->slots); |
175 | cache->slots = NULL; | 178 | cache->slots = NULL; |
176 | } | 179 | } |
177 | mutex_unlock(&cache->alloc_lock); | 180 | mutex_unlock(&cache->alloc_lock); |
@@ -186,7 +189,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, | |||
186 | } | 189 | } |
187 | spin_unlock_irq(&cache->free_lock); | 190 | spin_unlock_irq(&cache->free_lock); |
188 | if (slots) | 191 | if (slots) |
189 | vfree(slots); | 192 | kvfree(slots); |
190 | } | 193 | } |
191 | } | 194 | } |
192 | 195 | ||
@@ -241,8 +244,10 @@ int enable_swap_slots_cache(void) | |||
241 | 244 | ||
242 | ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", | 245 | ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", |
243 | alloc_swap_slot_cache, free_slot_cache); | 246 | alloc_swap_slot_cache, free_slot_cache); |
244 | if (ret < 0) | 247 | if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating " |
248 | "without swap slots cache.\n", __func__)) | ||
245 | goto out_unlock; | 249 | goto out_unlock; |
250 | |||
246 | swap_slot_cache_initialized = true; | 251 | swap_slot_cache_initialized = true; |
247 | __reenable_swap_slots_cache(); | 252 | __reenable_swap_slots_cache(); |
248 | out_unlock: | 253 | out_unlock: |
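The comment in the hunk above states a lock-ordering rule: the allocation may recurse into reclaim, and reclaim can end up in get_swap_page(), which takes swap_slots_cache_mutex, so the allocation must never happen under that mutex. A standalone userspace sketch of the same pattern, with invented names standing in for the kernel pieces:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static void *cache_slots;

/* Stand-in for reclaim: invoked by the allocator under memory pressure. */
static void shrink_cache(void)
{
	pthread_mutex_lock(&cache_lock);   /* would deadlock if the caller held it */
	/* ... drop cached entries ... */
	pthread_mutex_unlock(&cache_lock);
}

static void *alloc_may_reclaim(size_t size)
{
	void *p = calloc(1, size);

	if (!p) {
		shrink_cache();		/* allocator falls back to reclaim */
		p = calloc(1, size);
	}
	return p;
}

int init_cache(size_t size)
{
	/* Allocate first, outside cache_lock ... */
	void *slots = alloc_may_reclaim(size);

	if (!slots)
		return -1;

	/* ... then publish the result under the lock. */
	pthread_mutex_lock(&cache_lock);
	if (!cache_slots) {
		cache_slots = slots;
		slots = NULL;
	}
	pthread_mutex_unlock(&cache_lock);

	free(slots);			/* lost the race; discard our copy */
	return 0;
}

int main(void)
{
	return init_cache(4096);
}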
diff --git a/mm/swap_state.c b/mm/swap_state.c index 473b71e052a8..539b8885e3d1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -360,17 +360,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
360 | /* | 360 | /* |
361 | * We might race against get_swap_page() and stumble | 361 | * We might race against get_swap_page() and stumble |
362 | * across a SWAP_HAS_CACHE swap_map entry whose page | 362 | * across a SWAP_HAS_CACHE swap_map entry whose page |
363 | * has not been brought into the swapcache yet, while | 363 | * has not been brought into the swapcache yet. |
364 | * the other end is scheduled away waiting on discard | ||
365 | * I/O completion at scan_swap_map(). | ||
366 | * | ||
367 | * In order to avoid turning this transitory state | ||
368 | * into a permanent loop around this -EEXIST case | ||
369 | * if !CONFIG_PREEMPT and the I/O completion happens | ||
370 | * to be waiting on the CPU waitqueue where we are now | ||
371 | * busy looping, we just conditionally invoke the | ||
372 | * scheduler here, if there are some more important | ||
373 | * tasks to run. | ||
374 | */ | 364 | */ |
375 | cond_resched(); | 365 | cond_resched(); |
376 | continue; | 366 | continue; |
@@ -533,7 +523,7 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) | |||
533 | unsigned int i, nr; | 523 | unsigned int i, nr; |
534 | 524 | ||
535 | nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); | 525 | nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); |
536 | spaces = vzalloc(sizeof(struct address_space) * nr); | 526 | spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL); |
537 | if (!spaces) | 527 | if (!spaces) |
538 | return -ENOMEM; | 528 | return -ENOMEM; |
539 | for (i = 0; i < nr; i++) { | 529 | for (i = 0; i < nr; i++) { |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 178130880b90..4f6cba1b6632 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -335,7 +335,7 @@ static void cluster_list_add_tail(struct swap_cluster_list *list, | |||
335 | ci_tail = ci + tail; | 335 | ci_tail = ci + tail; |
336 | spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); | 336 | spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); |
337 | cluster_set_next(ci_tail, idx); | 337 | cluster_set_next(ci_tail, idx); |
338 | unlock_cluster(ci_tail); | 338 | spin_unlock(&ci_tail->lock); |
339 | cluster_set_next_flag(&list->tail, idx, 0); | 339 | cluster_set_next_flag(&list->tail, idx, 0); |
340 | } | 340 | } |
341 | } | 341 | } |
@@ -672,6 +672,9 @@ checks: | |||
672 | else | 672 | else |
673 | goto done; | 673 | goto done; |
674 | } | 674 | } |
675 | si->swap_map[offset] = usage; | ||
676 | inc_cluster_info_page(si, si->cluster_info, offset); | ||
677 | unlock_cluster(ci); | ||
675 | 678 | ||
676 | if (offset == si->lowest_bit) | 679 | if (offset == si->lowest_bit) |
677 | si->lowest_bit++; | 680 | si->lowest_bit++; |
@@ -685,9 +688,6 @@ checks: | |||
685 | plist_del(&si->avail_list, &swap_avail_head); | 688 | plist_del(&si->avail_list, &swap_avail_head); |
686 | spin_unlock(&swap_avail_lock); | 689 | spin_unlock(&swap_avail_lock); |
687 | } | 690 | } |
688 | si->swap_map[offset] = usage; | ||
689 | inc_cluster_info_page(si, si->cluster_info, offset); | ||
690 | unlock_cluster(ci); | ||
691 | si->cluster_next = offset + 1; | 691 | si->cluster_next = offset + 1; |
692 | slots[n_ret++] = swp_entry(si->type, offset); | 692 | slots[n_ret++] = swp_entry(si->type, offset); |
693 | 693 | ||
@@ -1079,8 +1079,6 @@ void swapcache_free_entries(swp_entry_t *entries, int n) | |||
1079 | p = swap_info_get_cont(entries[i], prev); | 1079 | p = swap_info_get_cont(entries[i], prev); |
1080 | if (p) | 1080 | if (p) |
1081 | swap_entry_free(p, entries[i]); | 1081 | swap_entry_free(p, entries[i]); |
1082 | else | ||
1083 | break; | ||
1084 | prev = p; | 1082 | prev = p; |
1085 | } | 1083 | } |
1086 | if (p) | 1084 | if (p) |
@@ -1111,6 +1109,18 @@ int page_swapcount(struct page *page) | |||
1111 | return count; | 1109 | return count; |
1112 | } | 1110 | } |
1113 | 1111 | ||
1112 | static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) | ||
1113 | { | ||
1114 | int count = 0; | ||
1115 | pgoff_t offset = swp_offset(entry); | ||
1116 | struct swap_cluster_info *ci; | ||
1117 | |||
1118 | ci = lock_cluster_or_swap_info(si, offset); | ||
1119 | count = swap_count(si->swap_map[offset]); | ||
1120 | unlock_cluster_or_swap_info(si, ci); | ||
1121 | return count; | ||
1122 | } | ||
1123 | |||
1114 | /* | 1124 | /* |
1115 | * How many references to @entry are currently swapped out? | 1125 | * How many references to @entry are currently swapped out? |
1116 | * This does not give an exact answer when swap count is continued, | 1126 | * This does not give an exact answer when swap count is continued, |
@@ -1119,17 +1129,11 @@ int page_swapcount(struct page *page) | |||
1119 | int __swp_swapcount(swp_entry_t entry) | 1129 | int __swp_swapcount(swp_entry_t entry) |
1120 | { | 1130 | { |
1121 | int count = 0; | 1131 | int count = 0; |
1122 | pgoff_t offset; | ||
1123 | struct swap_info_struct *si; | 1132 | struct swap_info_struct *si; |
1124 | struct swap_cluster_info *ci; | ||
1125 | 1133 | ||
1126 | si = __swap_info_get(entry); | 1134 | si = __swap_info_get(entry); |
1127 | if (si) { | 1135 | if (si) |
1128 | offset = swp_offset(entry); | 1136 | count = swap_swapcount(si, entry); |
1129 | ci = lock_cluster_or_swap_info(si, offset); | ||
1130 | count = swap_count(si->swap_map[offset]); | ||
1131 | unlock_cluster_or_swap_info(si, ci); | ||
1132 | } | ||
1133 | return count; | 1137 | return count; |
1134 | } | 1138 | } |
1135 | 1139 | ||
@@ -1291,7 +1295,8 @@ int free_swap_and_cache(swp_entry_t entry) | |||
1291 | * Also recheck PageSwapCache now page is locked (above). | 1295 | * Also recheck PageSwapCache now page is locked (above). |
1292 | */ | 1296 | */ |
1293 | if (PageSwapCache(page) && !PageWriteback(page) && | 1297 | if (PageSwapCache(page) && !PageWriteback(page) && |
1294 | (!page_mapped(page) || mem_cgroup_swap_full(page))) { | 1298 | (!page_mapped(page) || mem_cgroup_swap_full(page)) && |
1299 | !swap_swapcount(p, entry)) { | ||
1295 | delete_from_swap_cache(page); | 1300 | delete_from_swap_cache(page); |
1296 | SetPageDirty(page); | 1301 | SetPageDirty(page); |
1297 | } | 1302 | } |
@@ -2265,8 +2270,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
2265 | free_percpu(p->percpu_cluster); | 2270 | free_percpu(p->percpu_cluster); |
2266 | p->percpu_cluster = NULL; | 2271 | p->percpu_cluster = NULL; |
2267 | vfree(swap_map); | 2272 | vfree(swap_map); |
2268 | vfree(cluster_info); | 2273 | kvfree(cluster_info); |
2269 | vfree(frontswap_map); | 2274 | kvfree(frontswap_map); |
2270 | /* Destroy swap account information */ | 2275 | /* Destroy swap account information */ |
2271 | swap_cgroup_swapoff(p->type); | 2276 | swap_cgroup_swapoff(p->type); |
2272 | exit_swap_address_space(p->type); | 2277 | exit_swap_address_space(p->type); |
@@ -2789,7 +2794,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2789 | p->cluster_next = 1 + (prandom_u32() % p->highest_bit); | 2794 | p->cluster_next = 1 + (prandom_u32() % p->highest_bit); |
2790 | nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); | 2795 | nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); |
2791 | 2796 | ||
2792 | cluster_info = vzalloc(nr_cluster * sizeof(*cluster_info)); | 2797 | cluster_info = kvzalloc(nr_cluster * sizeof(*cluster_info), |
2798 | GFP_KERNEL); | ||
2793 | if (!cluster_info) { | 2799 | if (!cluster_info) { |
2794 | error = -ENOMEM; | 2800 | error = -ENOMEM; |
2795 | goto bad_swap; | 2801 | goto bad_swap; |
@@ -2822,7 +2828,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2822 | } | 2828 | } |
2823 | /* frontswap enabled? set up bit-per-page map for frontswap */ | 2829 | /* frontswap enabled? set up bit-per-page map for frontswap */ |
2824 | if (IS_ENABLED(CONFIG_FRONTSWAP)) | 2830 | if (IS_ENABLED(CONFIG_FRONTSWAP)) |
2825 | frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); | 2831 | frontswap_map = kvzalloc(BITS_TO_LONGS(maxpages) * sizeof(long), |
2832 | GFP_KERNEL); | ||
2826 | 2833 | ||
2827 | if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { | 2834 | if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { |
2828 | /* | 2835 | /* |
diff --git a/mm/truncate.c b/mm/truncate.c index 6263affdef88..83a059e8cd1d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -266,9 +266,8 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
266 | pgoff_t index; | 266 | pgoff_t index; |
267 | int i; | 267 | int i; |
268 | 268 | ||
269 | cleancache_invalidate_inode(mapping); | ||
270 | if (mapping->nrpages == 0 && mapping->nrexceptional == 0) | 269 | if (mapping->nrpages == 0 && mapping->nrexceptional == 0) |
271 | return; | 270 | goto out; |
272 | 271 | ||
273 | /* Offsets within partial pages */ | 272 | /* Offsets within partial pages */ |
274 | partial_start = lstart & (PAGE_SIZE - 1); | 273 | partial_start = lstart & (PAGE_SIZE - 1); |
@@ -363,7 +362,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
363 | * will be released, just zeroed, so we can bail out now. | 362 | * will be released, just zeroed, so we can bail out now. |
364 | */ | 363 | */ |
365 | if (start >= end) | 364 | if (start >= end) |
366 | return; | 365 | goto out; |
367 | 366 | ||
368 | index = start; | 367 | index = start; |
369 | for ( ; ; ) { | 368 | for ( ; ; ) { |
@@ -410,6 +409,8 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
410 | pagevec_release(&pvec); | 409 | pagevec_release(&pvec); |
411 | index++; | 410 | index++; |
412 | } | 411 | } |
412 | |||
413 | out: | ||
413 | cleancache_invalidate_inode(mapping); | 414 | cleancache_invalidate_inode(mapping); |
414 | } | 415 | } |
415 | EXPORT_SYMBOL(truncate_inode_pages_range); | 416 | EXPORT_SYMBOL(truncate_inode_pages_range); |
@@ -623,7 +624,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
623 | int ret2 = 0; | 624 | int ret2 = 0; |
624 | int did_range_unmap = 0; | 625 | int did_range_unmap = 0; |
625 | 626 | ||
626 | cleancache_invalidate_inode(mapping); | 627 | if (mapping->nrpages == 0 && mapping->nrexceptional == 0) |
628 | goto out; | ||
629 | |||
627 | pagevec_init(&pvec, 0); | 630 | pagevec_init(&pvec, 0); |
628 | index = start; | 631 | index = start; |
629 | while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, | 632 | while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, |
@@ -686,6 +689,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
686 | cond_resched(); | 689 | cond_resched(); |
687 | index++; | 690 | index++; |
688 | } | 691 | } |
692 | |||
693 | out: | ||
689 | cleancache_invalidate_inode(mapping); | 694 | cleancache_invalidate_inode(mapping); |
690 | return ret; | 695 | return ret; |
691 | } | 696 | } |
diff --git a/mm/usercopy.c b/mm/usercopy.c index d155e12563b1..a9852b24715d 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c | |||
@@ -19,15 +19,9 @@ | |||
19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
20 | #include <linux/sched/task.h> | 20 | #include <linux/sched/task.h> |
21 | #include <linux/sched/task_stack.h> | 21 | #include <linux/sched/task_stack.h> |
22 | #include <linux/thread_info.h> | ||
22 | #include <asm/sections.h> | 23 | #include <asm/sections.h> |
23 | 24 | ||
24 | enum { | ||
25 | BAD_STACK = -1, | ||
26 | NOT_STACK = 0, | ||
27 | GOOD_FRAME, | ||
28 | GOOD_STACK, | ||
29 | }; | ||
30 | |||
31 | /* | 25 | /* |
32 | * Checks if a given pointer and length is contained by the current | 26 | * Checks if a given pointer and length is contained by the current |
33 | * stack frame (if possible). | 27 | * stack frame (if possible). |
@@ -206,17 +200,6 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, | |||
206 | { | 200 | { |
207 | struct page *page; | 201 | struct page *page; |
208 | 202 | ||
209 | /* | ||
210 | * Some architectures (arm64) return true for virt_addr_valid() on | ||
211 | * vmalloced addresses. Work around this by checking for vmalloc | ||
212 | * first. | ||
213 | * | ||
214 | * We also need to check for module addresses explicitly since we | ||
215 | * may copy static data from modules to userspace | ||
216 | */ | ||
217 | if (is_vmalloc_or_module_addr(ptr)) | ||
218 | return NULL; | ||
219 | |||
220 | if (!virt_addr_valid(ptr)) | 203 | if (!virt_addr_valid(ptr)) |
221 | return NULL; | 204 | return NULL; |
222 | 205 | ||
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -329,6 +329,63 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, | |||
329 | } | 329 | } |
330 | EXPORT_SYMBOL(vm_mmap); | 330 | EXPORT_SYMBOL(vm_mmap); |
331 | 331 | ||
332 | /** | ||
333 | * kvmalloc_node - attempt to allocate physically contiguous memory, but upon | ||
334 | * failure, fall back to non-contiguous (vmalloc) allocation. | ||
335 | * @size: size of the request. | ||
336 | * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. | ||
337 | * @node: numa node to allocate from | ||
338 | * | ||
339 | * Uses kmalloc to get the memory but if the allocation fails then falls back | ||
340 | * to the vmalloc allocator. Use kvfree for freeing the memory. | ||
341 | * | ||
342 | * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. __GFP_REPEAT | ||
343 | * is supported only for large (>32kB) allocations, and it should be used only if | ||
344 | * kmalloc is preferable to the vmalloc fallback, due to visible performance drawbacks. | ||
345 | * | ||
346 | * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people. | ||
347 | */ | ||
348 | void *kvmalloc_node(size_t size, gfp_t flags, int node) | ||
349 | { | ||
350 | gfp_t kmalloc_flags = flags; | ||
351 | void *ret; | ||
352 | |||
353 | /* | ||
354 | * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) | ||
355 | * so the given set of flags has to be compatible. | ||
356 | */ | ||
357 | WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL); | ||
358 | |||
359 | /* | ||
360 | * Make sure that larger requests are not too disruptive - no OOM | ||
361 | * killer and no allocation failure warnings as we have a fallback | ||
362 | */ | ||
363 | if (size > PAGE_SIZE) { | ||
364 | kmalloc_flags |= __GFP_NOWARN; | ||
365 | |||
366 | /* | ||
367 | * We have to override __GFP_REPEAT by __GFP_NORETRY for !costly | ||
368 | * requests because there is no other way to tell the allocator | ||
369 | * that we want to fail rather than retry endlessly. | ||
370 | */ | ||
371 | if (!(kmalloc_flags & __GFP_REPEAT) || | ||
372 | (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) | ||
373 | kmalloc_flags |= __GFP_NORETRY; | ||
374 | } | ||
375 | |||
376 | ret = kmalloc_node(size, kmalloc_flags, node); | ||
377 | |||
378 | /* | ||
379 | * It doesn't really make sense to fallback to vmalloc for sub page | ||
380 | * requests | ||
381 | */ | ||
382 | if (ret || size <= PAGE_SIZE) | ||
383 | return ret; | ||
384 | |||
385 | return __vmalloc_node_flags(size, node, flags); | ||
386 | } | ||
387 | EXPORT_SYMBOL(kvmalloc_node); | ||
388 | |||
332 | void kvfree(const void *addr) | 389 | void kvfree(const void *addr) |
333 | { | 390 | { |
334 | if (is_vmalloc_addr(addr)) | 391 | if (is_vmalloc_addr(addr)) |
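The kernel-doc above describes the helper that the earlier vzalloc-to-kvzalloc conversions rely on. A minimal kernel-module-style sketch of the intended usage, assuming the kvzalloc()/kvfree() wrappers that accompany kvmalloc_node() in linux/mm.h:

#include <linux/module.h>
#include <linux/mm.h>		/* kvzalloc(), kvfree() */

static u32 *table;

static int __init kv_example_init(void)
{
	/* Physically contiguous if kmalloc succeeds, vmalloc fallback otherwise. */
	table = kvzalloc(1024 * sizeof(*table), GFP_KERNEL);
	if (!table)
		return -ENOMEM;
	return 0;
}

static void __exit kv_example_exit(void)
{
	kvfree(table);		/* correct for either backing allocator */
}

module_init(kv_example_init);
module_exit(kv_example_exit);
MODULE_LICENSE("GPL");

kvfree() checks is_vmalloc_addr() (visible in the hunk above), so callers do not need to remember which allocator satisfied the request.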
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0b057628a7ba..1dda6d8a200a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1579,7 +1579,7 @@ void vfree_atomic(const void *addr) | |||
1579 | * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling | 1579 | * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling |
1580 | * conventions for vfree() arch-dependent would be a really bad idea) | 1580 | * conventions for vfree() arch-dependent would be a really bad idea) |
1581 | * | 1581 | * |
1582 | * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) | 1582 | * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) |
1583 | */ | 1583 | */ |
1584 | void vfree(const void *addr) | 1584 | void vfree(const void *addr) |
1585 | { | 1585 | { |
@@ -1649,16 +1649,13 @@ void *vmap(struct page **pages, unsigned int count, | |||
1649 | } | 1649 | } |
1650 | EXPORT_SYMBOL(vmap); | 1650 | EXPORT_SYMBOL(vmap); |
1651 | 1651 | ||
1652 | static void *__vmalloc_node(unsigned long size, unsigned long align, | ||
1653 | gfp_t gfp_mask, pgprot_t prot, | ||
1654 | int node, const void *caller); | ||
1655 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1652 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
1656 | pgprot_t prot, int node) | 1653 | pgprot_t prot, int node) |
1657 | { | 1654 | { |
1658 | struct page **pages; | 1655 | struct page **pages; |
1659 | unsigned int nr_pages, array_size, i; | 1656 | unsigned int nr_pages, array_size, i; |
1660 | const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; | 1657 | const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; |
1661 | const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; | 1658 | const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN; |
1662 | 1659 | ||
1663 | nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; | 1660 | nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; |
1664 | array_size = (nr_pages * sizeof(struct page *)); | 1661 | array_size = (nr_pages * sizeof(struct page *)); |
@@ -1786,8 +1783,15 @@ fail: | |||
1786 | * Allocate enough pages to cover @size from the page level | 1783 | * Allocate enough pages to cover @size from the page level |
1787 | * allocator with @gfp_mask flags. Map them into contiguous | 1784 | * allocator with @gfp_mask flags. Map them into contiguous |
1788 | * kernel virtual space, using a pagetable protection of @prot. | 1785 | * kernel virtual space, using a pagetable protection of @prot. |
1786 | * | ||
1787 | * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT | ||
1788 | * and __GFP_NOFAIL are not supported | ||
1789 | * | ||
1790 | * Any use of gfp flags outside of GFP_KERNEL should be consulted | ||
1791 | * with mm people. | ||
1792 | * | ||
1789 | */ | 1793 | */ |
1790 | static void *__vmalloc_node(unsigned long size, unsigned long align, | 1794 | void *__vmalloc_node(unsigned long size, unsigned long align, |
1791 | gfp_t gfp_mask, pgprot_t prot, | 1795 | gfp_t gfp_mask, pgprot_t prot, |
1792 | int node, const void *caller) | 1796 | int node, const void *caller) |
1793 | { | 1797 | { |
@@ -1802,13 +1806,6 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | |||
1802 | } | 1806 | } |
1803 | EXPORT_SYMBOL(__vmalloc); | 1807 | EXPORT_SYMBOL(__vmalloc); |
1804 | 1808 | ||
1805 | static inline void *__vmalloc_node_flags(unsigned long size, | ||
1806 | int node, gfp_t flags) | ||
1807 | { | ||
1808 | return __vmalloc_node(size, 1, flags, PAGE_KERNEL, | ||
1809 | node, __builtin_return_address(0)); | ||
1810 | } | ||
1811 | |||
1812 | /** | 1809 | /** |
1813 | * vmalloc - allocate virtually contiguous memory | 1810 | * vmalloc - allocate virtually contiguous memory |
1814 | * @size: allocation size | 1811 | * @size: allocation size |
@@ -1821,7 +1818,7 @@ static inline void *__vmalloc_node_flags(unsigned long size, | |||
1821 | void *vmalloc(unsigned long size) | 1818 | void *vmalloc(unsigned long size) |
1822 | { | 1819 | { |
1823 | return __vmalloc_node_flags(size, NUMA_NO_NODE, | 1820 | return __vmalloc_node_flags(size, NUMA_NO_NODE, |
1824 | GFP_KERNEL | __GFP_HIGHMEM); | 1821 | GFP_KERNEL); |
1825 | } | 1822 | } |
1826 | EXPORT_SYMBOL(vmalloc); | 1823 | EXPORT_SYMBOL(vmalloc); |
1827 | 1824 | ||
@@ -1838,7 +1835,7 @@ EXPORT_SYMBOL(vmalloc); | |||
1838 | void *vzalloc(unsigned long size) | 1835 | void *vzalloc(unsigned long size) |
1839 | { | 1836 | { |
1840 | return __vmalloc_node_flags(size, NUMA_NO_NODE, | 1837 | return __vmalloc_node_flags(size, NUMA_NO_NODE, |
1841 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); | 1838 | GFP_KERNEL | __GFP_ZERO); |
1842 | } | 1839 | } |
1843 | EXPORT_SYMBOL(vzalloc); | 1840 | EXPORT_SYMBOL(vzalloc); |
1844 | 1841 | ||
@@ -1855,7 +1852,7 @@ void *vmalloc_user(unsigned long size) | |||
1855 | void *ret; | 1852 | void *ret; |
1856 | 1853 | ||
1857 | ret = __vmalloc_node(size, SHMLBA, | 1854 | ret = __vmalloc_node(size, SHMLBA, |
1858 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | 1855 | GFP_KERNEL | __GFP_ZERO, |
1859 | PAGE_KERNEL, NUMA_NO_NODE, | 1856 | PAGE_KERNEL, NUMA_NO_NODE, |
1860 | __builtin_return_address(0)); | 1857 | __builtin_return_address(0)); |
1861 | if (ret) { | 1858 | if (ret) { |
@@ -1879,7 +1876,7 @@ EXPORT_SYMBOL(vmalloc_user); | |||
1879 | */ | 1876 | */ |
1880 | void *vmalloc_node(unsigned long size, int node) | 1877 | void *vmalloc_node(unsigned long size, int node) |
1881 | { | 1878 | { |
1882 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, | 1879 | return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, |
1883 | node, __builtin_return_address(0)); | 1880 | node, __builtin_return_address(0)); |
1884 | } | 1881 | } |
1885 | EXPORT_SYMBOL(vmalloc_node); | 1882 | EXPORT_SYMBOL(vmalloc_node); |
@@ -1899,7 +1896,7 @@ EXPORT_SYMBOL(vmalloc_node); | |||
1899 | void *vzalloc_node(unsigned long size, int node) | 1896 | void *vzalloc_node(unsigned long size, int node) |
1900 | { | 1897 | { |
1901 | return __vmalloc_node_flags(size, node, | 1898 | return __vmalloc_node_flags(size, node, |
1902 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); | 1899 | GFP_KERNEL | __GFP_ZERO); |
1903 | } | 1900 | } |
1904 | EXPORT_SYMBOL(vzalloc_node); | 1901 | EXPORT_SYMBOL(vzalloc_node); |
1905 | 1902 | ||
@@ -1921,7 +1918,7 @@ EXPORT_SYMBOL(vzalloc_node); | |||
1921 | 1918 | ||
1922 | void *vmalloc_exec(unsigned long size) | 1919 | void *vmalloc_exec(unsigned long size) |
1923 | { | 1920 | { |
1924 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, | 1921 | return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, |
1925 | NUMA_NO_NODE, __builtin_return_address(0)); | 1922 | NUMA_NO_NODE, __builtin_return_address(0)); |
1926 | } | 1923 | } |
1927 | 1924 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index bc8031ef994d..2f45c0520f43 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -97,8 +97,13 @@ struct scan_control { | |||
97 | /* Can pages be swapped as part of reclaim? */ | 97 | /* Can pages be swapped as part of reclaim? */ |
98 | unsigned int may_swap:1; | 98 | unsigned int may_swap:1; |
99 | 99 | ||
100 | /* Can cgroups be reclaimed below their normal consumption range? */ | 100 | /* |
101 | unsigned int may_thrash:1; | 101 | * Cgroups are not reclaimed below their configured memory.low, |
102 | * unless we threaten to OOM. If any cgroups are skipped due to | ||
103 | * memory.low and nothing was reclaimed, go back for memory.low. | ||
104 | */ | ||
105 | unsigned int memcg_low_reclaim:1; | ||
106 | unsigned int memcg_low_skipped:1; | ||
102 | 107 | ||
103 | unsigned int hibernation_mode:1; | 108 | unsigned int hibernation_mode:1; |
104 | 109 | ||
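The new scan_control bits above encode a two-pass protocol: respect memory.low on the first pass, and only if that pass reclaims nothing while skipping protected groups is it repeated with the protection lifted (see the do_try_to_free_pages() hunk further down). An illustrative userspace sketch with invented names:

#include <stdbool.h>
#include <stdio.h>

struct group {
	long usage;	/* pages charged */
	long low;	/* protected amount, like memory.low */
};

static long reclaim_from(struct group *g)
{
	long freed = g->usage / 2;	/* pretend half is reclaimable */

	g->usage -= freed;
	return freed;
}

static long do_reclaim(struct group *groups, int n)
{
	bool low_reclaim = false;	/* like sc->memcg_low_reclaim */
	bool low_skipped;		/* like sc->memcg_low_skipped */
	long total = 0;

retry:
	low_skipped = false;
	for (int i = 0; i < n; i++) {
		if (!low_reclaim && groups[i].usage <= groups[i].low) {
			low_skipped = true;	/* honour protection for now */
			continue;
		}
		total += reclaim_from(&groups[i]);
	}
	/* Untapped protected groups and no progress? Go back for them. */
	if (!total && low_skipped) {
		low_reclaim = true;
		goto retry;
	}
	return total;
}

int main(void)
{
	struct group g[] = { { 100, 200 }, { 50, 80 } };	/* both protected */

	printf("reclaimed %ld pages\n", do_reclaim(g, 2));
	return 0;
}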
@@ -230,12 +235,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) | |||
230 | return nr; | 235 | return nr; |
231 | } | 236 | } |
232 | 237 | ||
233 | bool pgdat_reclaimable(struct pglist_data *pgdat) | ||
234 | { | ||
235 | return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < | ||
236 | pgdat_reclaimable_pages(pgdat) * 6; | ||
237 | } | ||
238 | |||
239 | /** | 238 | /** |
240 | * lruvec_lru_size - Returns the number of pages on the given LRU list. | 239 | * lruvec_lru_size - Returns the number of pages on the given LRU list. |
241 | * @lruvec: lru vector | 240 | * @lruvec: lru vector |
@@ -912,7 +911,8 @@ static void page_check_dirty_writeback(struct page *page, | |||
912 | * Anonymous pages are not handled by flushers and must be written | 911 | * Anonymous pages are not handled by flushers and must be written |
913 | * from reclaim context. Do not stall reclaim based on them | 912 | * from reclaim context. Do not stall reclaim based on them |
914 | */ | 913 | */ |
915 | if (!page_is_file_cache(page)) { | 914 | if (!page_is_file_cache(page) || |
915 | (PageAnon(page) && !PageSwapBacked(page))) { | ||
916 | *dirty = false; | 916 | *dirty = false; |
917 | *writeback = false; | 917 | *writeback = false; |
918 | return; | 918 | return; |
@@ -972,8 +972,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
972 | int may_enter_fs; | 972 | int may_enter_fs; |
973 | enum page_references references = PAGEREF_RECLAIM_CLEAN; | 973 | enum page_references references = PAGEREF_RECLAIM_CLEAN; |
974 | bool dirty, writeback; | 974 | bool dirty, writeback; |
975 | bool lazyfree = false; | ||
976 | int ret = SWAP_SUCCESS; | ||
977 | 975 | ||
978 | cond_resched(); | 976 | cond_resched(); |
979 | 977 | ||
@@ -988,13 +986,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
988 | sc->nr_scanned++; | 986 | sc->nr_scanned++; |
989 | 987 | ||
990 | if (unlikely(!page_evictable(page))) | 988 | if (unlikely(!page_evictable(page))) |
991 | goto cull_mlocked; | 989 | goto activate_locked; |
992 | 990 | ||
993 | if (!sc->may_unmap && page_mapped(page)) | 991 | if (!sc->may_unmap && page_mapped(page)) |
994 | goto keep_locked; | 992 | goto keep_locked; |
995 | 993 | ||
996 | /* Double the slab pressure for mapped and swapcache pages */ | 994 | /* Double the slab pressure for mapped and swapcache pages */ |
997 | if (page_mapped(page) || PageSwapCache(page)) | 995 | if ((page_mapped(page) || PageSwapCache(page)) && |
996 | !(PageAnon(page) && !PageSwapBacked(page))) | ||
998 | sc->nr_scanned++; | 997 | sc->nr_scanned++; |
999 | 998 | ||
1000 | may_enter_fs = (sc->gfp_mask & __GFP_FS) || | 999 | may_enter_fs = (sc->gfp_mask & __GFP_FS) || |
@@ -1120,13 +1119,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1120 | /* | 1119 | /* |
1121 | * Anonymous process memory has backing store? | 1120 | * Anonymous process memory has backing store? |
1122 | * Try to allocate it some swap space here. | 1121 | * Try to allocate it some swap space here. |
1122 | * Lazyfree page could be freed directly | ||
1123 | */ | 1123 | */ |
1124 | if (PageAnon(page) && !PageSwapCache(page)) { | 1124 | if (PageAnon(page) && PageSwapBacked(page) && |
1125 | !PageSwapCache(page)) { | ||
1125 | if (!(sc->gfp_mask & __GFP_IO)) | 1126 | if (!(sc->gfp_mask & __GFP_IO)) |
1126 | goto keep_locked; | 1127 | goto keep_locked; |
1127 | if (!add_to_swap(page, page_list)) | 1128 | if (!add_to_swap(page, page_list)) |
1128 | goto activate_locked; | 1129 | goto activate_locked; |
1129 | lazyfree = true; | ||
1130 | may_enter_fs = 1; | 1130 | may_enter_fs = 1; |
1131 | 1131 | ||
1132 | /* Adding to swap updated mapping */ | 1132 | /* Adding to swap updated mapping */ |
@@ -1143,21 +1143,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1143 | * The page is mapped into the page tables of one or more | 1143 | * The page is mapped into the page tables of one or more |
1144 | * processes. Try to unmap it here. | 1144 | * processes. Try to unmap it here. |
1145 | */ | 1145 | */ |
1146 | if (page_mapped(page) && mapping) { | 1146 | if (page_mapped(page)) { |
1147 | switch (ret = try_to_unmap(page, lazyfree ? | 1147 | if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { |
1148 | (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : | ||
1149 | (ttu_flags | TTU_BATCH_FLUSH))) { | ||
1150 | case SWAP_FAIL: | ||
1151 | nr_unmap_fail++; | 1148 | nr_unmap_fail++; |
1152 | goto activate_locked; | 1149 | goto activate_locked; |
1153 | case SWAP_AGAIN: | ||
1154 | goto keep_locked; | ||
1155 | case SWAP_MLOCK: | ||
1156 | goto cull_mlocked; | ||
1157 | case SWAP_LZFREE: | ||
1158 | goto lazyfree; | ||
1159 | case SWAP_SUCCESS: | ||
1160 | ; /* try to free the page below */ | ||
1161 | } | 1150 | } |
1162 | } | 1151 | } |
1163 | 1152 | ||
@@ -1267,10 +1256,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1267 | } | 1256 | } |
1268 | } | 1257 | } |
1269 | 1258 | ||
1270 | lazyfree: | 1259 | if (PageAnon(page) && !PageSwapBacked(page)) { |
1271 | if (!mapping || !__remove_mapping(mapping, page, true)) | 1260 | /* follow __remove_mapping for reference */ |
1272 | goto keep_locked; | 1261 | if (!page_ref_freeze(page, 1)) |
1262 | goto keep_locked; | ||
1263 | if (PageDirty(page)) { | ||
1264 | page_ref_unfreeze(page, 1); | ||
1265 | goto keep_locked; | ||
1266 | } | ||
1273 | 1267 | ||
1268 | count_vm_event(PGLAZYFREED); | ||
1269 | } else if (!mapping || !__remove_mapping(mapping, page, true)) | ||
1270 | goto keep_locked; | ||
1274 | /* | 1271 | /* |
1275 | * At this point, we have no other references and there is | 1272 | * At this point, we have no other references and there is |
1276 | * no way to pick any more up (removed from LRU, removed | 1273 | * no way to pick any more up (removed from LRU, removed |
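The new branch above frees a clean lazyfree page by freezing its refcount, mirroring __remove_mapping(): the count is atomically swapped from the expected value to zero so no new reference can appear in between, and a dirty page gets its reference handed back. A userspace sketch of that test using C11 atomics, names invented:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool ref_freeze(atomic_int *ref, int count)
{
	int expected = count;

	/* succeeds only if *ref == count, leaving it at 0 (frozen) */
	return atomic_compare_exchange_strong(ref, &expected, 0);
}

static void ref_unfreeze(atomic_int *ref, int count)
{
	atomic_store(ref, count);	/* give the reference back */
}

int main(void)
{
	atomic_int refcount = 1;	/* we hold the only reference */

	if (ref_freeze(&refcount, 1)) {
		/* the kernel re-checks PageDirty() here; dirty means unfreeze */
		bool dirty = false;

		if (dirty)
			ref_unfreeze(&refcount, 1);
		else
			printf("frozen, safe to free\n");
	}
	return 0;
}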
@@ -1280,9 +1277,6 @@ lazyfree: | |||
1280 | */ | 1277 | */ |
1281 | __ClearPageLocked(page); | 1278 | __ClearPageLocked(page); |
1282 | free_it: | 1279 | free_it: |
1283 | if (ret == SWAP_LZFREE) | ||
1284 | count_vm_event(PGLAZYFREED); | ||
1285 | |||
1286 | nr_reclaimed++; | 1280 | nr_reclaimed++; |
1287 | 1281 | ||
1288 | /* | 1282 | /* |
@@ -1292,20 +1286,16 @@ free_it: | |||
1292 | list_add(&page->lru, &free_pages); | 1286 | list_add(&page->lru, &free_pages); |
1293 | continue; | 1287 | continue; |
1294 | 1288 | ||
1295 | cull_mlocked: | ||
1296 | if (PageSwapCache(page)) | ||
1297 | try_to_free_swap(page); | ||
1298 | unlock_page(page); | ||
1299 | list_add(&page->lru, &ret_pages); | ||
1300 | continue; | ||
1301 | |||
1302 | activate_locked: | 1289 | activate_locked: |
1303 | /* Not a candidate for swapping, so reclaim swap space. */ | 1290 | /* Not a candidate for swapping, so reclaim swap space. */ |
1304 | if (PageSwapCache(page) && mem_cgroup_swap_full(page)) | 1291 | if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || |
1292 | PageMlocked(page))) | ||
1305 | try_to_free_swap(page); | 1293 | try_to_free_swap(page); |
1306 | VM_BUG_ON_PAGE(PageActive(page), page); | 1294 | VM_BUG_ON_PAGE(PageActive(page), page); |
1307 | SetPageActive(page); | 1295 | if (!PageMlocked(page)) { |
1308 | pgactivate++; | 1296 | SetPageActive(page); |
1297 | pgactivate++; | ||
1298 | } | ||
1309 | keep_locked: | 1299 | keep_locked: |
1310 | unlock_page(page); | 1300 | unlock_page(page); |
1311 | keep: | 1301 | keep: |
@@ -1354,7 +1344,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
1354 | } | 1344 | } |
1355 | 1345 | ||
1356 | ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, | 1346 | ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, |
1357 | TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true); | 1347 | TTU_IGNORE_ACCESS, NULL, true); |
1358 | list_splice(&clean_pages, page_list); | 1348 | list_splice(&clean_pages, page_list); |
1359 | mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); | 1349 | mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); |
1360 | return ret; | 1350 | return ret; |
@@ -1478,12 +1468,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1478 | unsigned long nr_taken = 0; | 1468 | unsigned long nr_taken = 0; |
1479 | unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; | 1469 | unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; |
1480 | unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; | 1470 | unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; |
1481 | unsigned long skipped = 0, total_skipped = 0; | 1471 | unsigned long skipped = 0; |
1482 | unsigned long scan, nr_pages; | 1472 | unsigned long scan, nr_pages; |
1483 | LIST_HEAD(pages_skipped); | 1473 | LIST_HEAD(pages_skipped); |
1484 | 1474 | ||
1485 | for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && | 1475 | for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && |
1486 | !list_empty(src);) { | 1476 | !list_empty(src); scan++) { |
1487 | struct page *page; | 1477 | struct page *page; |
1488 | 1478 | ||
1489 | page = lru_to_page(src); | 1479 | page = lru_to_page(src); |
@@ -1497,12 +1487,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1497 | continue; | 1487 | continue; |
1498 | } | 1488 | } |
1499 | 1489 | ||
1500 | /* | ||
1501 | * Account for scanned and skipped separetly to avoid the pgdat | ||
1502 | * being prematurely marked unreclaimable by pgdat_reclaimable. | ||
1503 | */ | ||
1504 | scan++; | ||
1505 | |||
1506 | switch (__isolate_lru_page(page, mode)) { | 1490 | switch (__isolate_lru_page(page, mode)) { |
1507 | case 0: | 1491 | case 0: |
1508 | nr_pages = hpage_nr_pages(page); | 1492 | nr_pages = hpage_nr_pages(page); |
@@ -1531,6 +1515,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1531 | if (!list_empty(&pages_skipped)) { | 1515 | if (!list_empty(&pages_skipped)) { |
1532 | int zid; | 1516 | int zid; |
1533 | 1517 | ||
1518 | list_splice(&pages_skipped, src); | ||
1534 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 1519 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
1535 | if (!nr_skipped[zid]) | 1520 | if (!nr_skipped[zid]) |
1536 | continue; | 1521 | continue; |
@@ -1538,17 +1523,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1538 | __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); | 1523 | __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); |
1539 | skipped += nr_skipped[zid]; | 1524 | skipped += nr_skipped[zid]; |
1540 | } | 1525 | } |
1541 | |||
1542 | /* | ||
1543 | * Account skipped pages as a partial scan as the pgdat may be | ||
1544 | * close to unreclaimable. If the LRU list is empty, account | ||
1545 | * skipped pages as a full scan. | ||
1546 | */ | ||
1547 | total_skipped = list_empty(src) ? skipped : skipped >> 2; | ||
1548 | |||
1549 | list_splice(&pages_skipped, src); | ||
1550 | } | 1526 | } |
1551 | *nr_scanned = scan + total_skipped; | 1527 | *nr_scanned = scan; |
1552 | trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, | 1528 | trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, |
1553 | scan, skipped, nr_taken, mode, lru); | 1529 | scan, skipped, nr_taken, mode, lru); |
1554 | update_lru_sizes(lruvec, lru, nr_zone_taken); | 1530 | update_lru_sizes(lruvec, lru, nr_zone_taken); |
@@ -1750,7 +1726,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1750 | reclaim_stat->recent_scanned[file] += nr_taken; | 1726 | reclaim_stat->recent_scanned[file] += nr_taken; |
1751 | 1727 | ||
1752 | if (global_reclaim(sc)) { | 1728 | if (global_reclaim(sc)) { |
1753 | __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); | ||
1754 | if (current_is_kswapd()) | 1729 | if (current_is_kswapd()) |
1755 | __count_vm_events(PGSCAN_KSWAPD, nr_scanned); | 1730 | __count_vm_events(PGSCAN_KSWAPD, nr_scanned); |
1756 | else | 1731 | else |
@@ -1761,7 +1736,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1761 | if (nr_taken == 0) | 1736 | if (nr_taken == 0) |
1762 | return 0; | 1737 | return 0; |
1763 | 1738 | ||
1764 | nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, | 1739 | nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, |
1765 | &stat, false); | 1740 | &stat, false); |
1766 | 1741 | ||
1767 | spin_lock_irq(&pgdat->lru_lock); | 1742 | spin_lock_irq(&pgdat->lru_lock); |
@@ -1953,8 +1928,6 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1953 | __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); | 1928 | __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); |
1954 | reclaim_stat->recent_scanned[file] += nr_taken; | 1929 | reclaim_stat->recent_scanned[file] += nr_taken; |
1955 | 1930 | ||
1956 | if (global_reclaim(sc)) | ||
1957 | __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); | ||
1958 | __count_vm_events(PGREFILL, nr_scanned); | 1931 | __count_vm_events(PGREFILL, nr_scanned); |
1959 | 1932 | ||
1960 | spin_unlock_irq(&pgdat->lru_lock); | 1933 | spin_unlock_irq(&pgdat->lru_lock); |
@@ -2033,6 +2006,8 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
2033 | * Both inactive lists should also be large enough that each inactive | 2006 | * Both inactive lists should also be large enough that each inactive |
2034 | * page has a chance to be referenced again before it is reclaimed. | 2007 | * page has a chance to be referenced again before it is reclaimed. |
2035 | * | 2008 | * |
2009 | * If that fails and refaulting is observed, the inactive list grows. | ||
2010 | * | ||
2036 | * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages | 2011 | * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages |
2037 | * on this LRU, maintained by the pageout code. A zone->inactive_ratio | 2012 | * on this LRU, maintained by the pageout code. A zone->inactive_ratio |
2038 | * of 3 means 3:1 or 25% of the pages are kept on the inactive list. | 2013 | * of 3 means 3:1 or 25% of the pages are kept on the inactive list. |
@@ -2049,12 +2024,15 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
2049 | * 10TB 320 32GB | 2024 | * 10TB 320 32GB |
2050 | */ | 2025 | */ |
2051 | static bool inactive_list_is_low(struct lruvec *lruvec, bool file, | 2026 | static bool inactive_list_is_low(struct lruvec *lruvec, bool file, |
2052 | struct scan_control *sc, bool trace) | 2027 | struct mem_cgroup *memcg, |
2028 | struct scan_control *sc, bool actual_reclaim) | ||
2053 | { | 2029 | { |
2054 | unsigned long inactive_ratio; | ||
2055 | unsigned long inactive, active; | ||
2056 | enum lru_list inactive_lru = file * LRU_FILE; | ||
2057 | enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; | 2030 | enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; |
2031 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); | ||
2032 | enum lru_list inactive_lru = file * LRU_FILE; | ||
2033 | unsigned long inactive, active; | ||
2034 | unsigned long inactive_ratio; | ||
2035 | unsigned long refaults; | ||
2058 | unsigned long gb; | 2036 | unsigned long gb; |
2059 | 2037 | ||
2060 | /* | 2038 | /* |
@@ -2067,27 +2045,42 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, | |||
2067 | inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); | 2045 | inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); |
2068 | active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); | 2046 | active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); |
2069 | 2047 | ||
2070 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | 2048 | if (memcg) |
2071 | if (gb) | 2049 | refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); |
2072 | inactive_ratio = int_sqrt(10 * gb); | ||
2073 | else | 2050 | else |
2074 | inactive_ratio = 1; | 2051 | refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); |
2052 | |||
2053 | /* | ||
2054 | * When refaults are being observed, it means a new workingset | ||
2055 | * is being established. Disable active list protection to get | ||
2056 | * rid of the stale workingset quickly. | ||
2057 | */ | ||
2058 | if (file && actual_reclaim && lruvec->refaults != refaults) { | ||
2059 | inactive_ratio = 0; | ||
2060 | } else { | ||
2061 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | ||
2062 | if (gb) | ||
2063 | inactive_ratio = int_sqrt(10 * gb); | ||
2064 | else | ||
2065 | inactive_ratio = 1; | ||
2066 | } | ||
2075 | 2067 | ||
2076 | if (trace) | 2068 | if (actual_reclaim) |
2077 | trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id, | 2069 | trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, |
2078 | sc->reclaim_idx, | 2070 | lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, |
2079 | lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, | 2071 | lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, |
2080 | lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, | 2072 | inactive_ratio, file); |
2081 | inactive_ratio, file); | ||
2082 | 2073 | ||
2083 | return inactive * inactive_ratio < active; | 2074 | return inactive * inactive_ratio < active; |
2084 | } | 2075 | } |
2085 | 2076 | ||
2086 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 2077 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
2087 | struct lruvec *lruvec, struct scan_control *sc) | 2078 | struct lruvec *lruvec, struct mem_cgroup *memcg, |
2079 | struct scan_control *sc) | ||
2088 | { | 2080 | { |
2089 | if (is_active_lru(lru)) { | 2081 | if (is_active_lru(lru)) { |
2090 | if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) | 2082 | if (inactive_list_is_low(lruvec, is_file_lru(lru), |
2083 | memcg, sc, true)) | ||
2091 | shrink_active_list(nr_to_scan, lruvec, sc, lru); | 2084 | shrink_active_list(nr_to_scan, lruvec, sc, lru); |
2092 | return 0; | 2085 | return 0; |
2093 | } | 2086 | } |
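The else branch above keeps the existing heuristic, inactive_ratio = int_sqrt(10 * gb) with a floor of 1; only the refault check in front of it is new. A standalone sketch that reproduces the ratio for a few list sizes; the last line matches the 10TB row of the table quoted in the comment:

#include <stdio.h>

static unsigned long int_sqrt(unsigned long x)	/* floor(sqrt(x)), like the kernel's */
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long gbs[] = { 1, 10, 100, 1024, 10240 };	/* 1GB up to 10TB */

	for (unsigned i = 0; i < sizeof(gbs) / sizeof(gbs[0]); i++) {
		unsigned long ratio = int_sqrt(10 * gbs[i]);

		if (!ratio)
			ratio = 1;
		/* the list is "low" when inactive * ratio < active */
		printf("%6luGB  inactive:active target 1:%lu\n", gbs[i], ratio);
	}
	return 0;
}

For example, with 4GB of pages the ratio is int_sqrt(40) = 6, so the inactive list is only grown once it drops below one sixth of the active list.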
@@ -2123,30 +2116,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |||
2123 | unsigned long anon_prio, file_prio; | 2116 | unsigned long anon_prio, file_prio; |
2124 | enum scan_balance scan_balance; | 2117 | enum scan_balance scan_balance; |
2125 | unsigned long anon, file; | 2118 | unsigned long anon, file; |
2126 | bool force_scan = false; | ||
2127 | unsigned long ap, fp; | 2119 | unsigned long ap, fp; |
2128 | enum lru_list lru; | 2120 | enum lru_list lru; |
2129 | bool some_scanned; | ||
2130 | int pass; | ||
2131 | |||
2132 | /* | ||
2133 | * If the zone or memcg is small, nr[l] can be 0. This | ||
2134 | * results in no scanning on this priority and a potential | ||
2135 | * priority drop. Global direct reclaim can go to the next | ||
2136 | * zone and tends to have no problems. Global kswapd is for | ||
2137 | * zone balancing and it needs to scan a minimum amount. When | ||
2138 | * reclaiming for a memcg, a priority drop can cause high | ||
2139 | * latencies, so it's better to scan a minimum amount there as | ||
2140 | * well. | ||
2141 | */ | ||
2142 | if (current_is_kswapd()) { | ||
2143 | if (!pgdat_reclaimable(pgdat)) | ||
2144 | force_scan = true; | ||
2145 | if (!mem_cgroup_online(memcg)) | ||
2146 | force_scan = true; | ||
2147 | } | ||
2148 | if (!global_reclaim(sc)) | ||
2149 | force_scan = true; | ||
2150 | 2121 | ||
2151 | /* If we have no swap space, do not bother scanning anon pages. */ | 2122 | /* If we have no swap space, do not bother scanning anon pages. */ |
2152 | if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { | 2123 | if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { |
@@ -2218,7 +2189,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |||
2218 | * lruvec even if it has plenty of old anonymous pages unless the | 2189 | * lruvec even if it has plenty of old anonymous pages unless the |
2219 | * system is under heavy pressure. | 2190 | * system is under heavy pressure. |
2220 | */ | 2191 | */ |
2221 | if (!inactive_list_is_low(lruvec, true, sc, false) && | 2192 | if (!inactive_list_is_low(lruvec, true, memcg, sc, false) && |
2222 | lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { | 2193 | lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { |
2223 | scan_balance = SCAN_FILE; | 2194 | scan_balance = SCAN_FILE; |
2224 | goto out; | 2195 | goto out; |
@@ -2277,55 +2248,48 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |||
2277 | fraction[1] = fp; | 2248 | fraction[1] = fp; |
2278 | denominator = ap + fp + 1; | 2249 | denominator = ap + fp + 1; |
2279 | out: | 2250 | out: |
2280 | some_scanned = false; | 2251 | *lru_pages = 0; |
2281 | /* Only use force_scan on second pass. */ | 2252 | for_each_evictable_lru(lru) { |
2282 | for (pass = 0; !some_scanned && pass < 2; pass++) { | 2253 | int file = is_file_lru(lru); |
2283 | *lru_pages = 0; | 2254 | unsigned long size; |
2284 | for_each_evictable_lru(lru) { | 2255 | unsigned long scan; |
2285 | int file = is_file_lru(lru); | ||
2286 | unsigned long size; | ||
2287 | unsigned long scan; | ||
2288 | |||
2289 | size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); | ||
2290 | scan = size >> sc->priority; | ||
2291 | |||
2292 | if (!scan && pass && force_scan) | ||
2293 | scan = min(size, SWAP_CLUSTER_MAX); | ||
2294 | |||
2295 | switch (scan_balance) { | ||
2296 | case SCAN_EQUAL: | ||
2297 | /* Scan lists relative to size */ | ||
2298 | break; | ||
2299 | case SCAN_FRACT: | ||
2300 | /* | ||
2301 | * Scan types proportional to swappiness and | ||
2302 | * their relative recent reclaim efficiency. | ||
2303 | */ | ||
2304 | scan = div64_u64(scan * fraction[file], | ||
2305 | denominator); | ||
2306 | break; | ||
2307 | case SCAN_FILE: | ||
2308 | case SCAN_ANON: | ||
2309 | /* Scan one type exclusively */ | ||
2310 | if ((scan_balance == SCAN_FILE) != file) { | ||
2311 | size = 0; | ||
2312 | scan = 0; | ||
2313 | } | ||
2314 | break; | ||
2315 | default: | ||
2316 | /* Look ma, no brain */ | ||
2317 | BUG(); | ||
2318 | } | ||
2319 | 2256 | ||
2320 | *lru_pages += size; | 2257 | size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); |
2321 | nr[lru] = scan; | 2258 | scan = size >> sc->priority; |
2259 | /* | ||
2260 | * If the cgroup's already been deleted, make sure to | ||
2261 | * scrape out the remaining cache. | ||
2262 | */ | ||
2263 | if (!scan && !mem_cgroup_online(memcg)) | ||
2264 | scan = min(size, SWAP_CLUSTER_MAX); | ||
2322 | 2265 | ||
2266 | switch (scan_balance) { | ||
2267 | case SCAN_EQUAL: | ||
2268 | /* Scan lists relative to size */ | ||
2269 | break; | ||
2270 | case SCAN_FRACT: | ||
2323 | /* | 2271 | /* |
2324 | * Skip the second pass and don't force_scan, | 2272 | * Scan types proportional to swappiness and |
2325 | * if we found something to scan. | 2273 | * their relative recent reclaim efficiency. |
2326 | */ | 2274 | */ |
2327 | some_scanned |= !!scan; | 2275 | scan = div64_u64(scan * fraction[file], |
2276 | denominator); | ||
2277 | break; | ||
2278 | case SCAN_FILE: | ||
2279 | case SCAN_ANON: | ||
2280 | /* Scan one type exclusively */ | ||
2281 | if ((scan_balance == SCAN_FILE) != file) { | ||
2282 | size = 0; | ||
2283 | scan = 0; | ||
2284 | } | ||
2285 | break; | ||
2286 | default: | ||
2287 | /* Look ma, no brain */ | ||
2288 | BUG(); | ||
2328 | } | 2289 | } |
2290 | |||
2291 | *lru_pages += size; | ||
2292 | nr[lru] = scan; | ||
2329 | } | 2293 | } |
2330 | } | 2294 | } |
2331 | 2295 | ||
@@ -2376,7 +2340,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc | |||
2376 | nr[lru] -= nr_to_scan; | 2340 | nr[lru] -= nr_to_scan; |
2377 | 2341 | ||
2378 | nr_reclaimed += shrink_list(lru, nr_to_scan, | 2342 | nr_reclaimed += shrink_list(lru, nr_to_scan, |
2379 | lruvec, sc); | 2343 | lruvec, memcg, sc); |
2380 | } | 2344 | } |
2381 | } | 2345 | } |
2382 | 2346 | ||
@@ -2443,7 +2407,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc | |||
2443 | * Even if we did not try to evict anon pages at all, we want to | 2407 | * Even if we did not try to evict anon pages at all, we want to |
2444 | * rebalance the anon lru active/inactive ratio. | 2408 | * rebalance the anon lru active/inactive ratio. |
2445 | */ | 2409 | */ |
2446 | if (inactive_list_is_low(lruvec, false, sc, true)) | 2410 | if (inactive_list_is_low(lruvec, false, memcg, sc, true)) |
2447 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, | 2411 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, |
2448 | sc, LRU_ACTIVE_ANON); | 2412 | sc, LRU_ACTIVE_ANON); |
2449 | } | 2413 | } |
@@ -2557,9 +2521,11 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |||
2557 | unsigned long scanned; | 2521 | unsigned long scanned; |
2558 | 2522 | ||
2559 | if (mem_cgroup_low(root, memcg)) { | 2523 | if (mem_cgroup_low(root, memcg)) { |
2560 | if (!sc->may_thrash) | 2524 | if (!sc->memcg_low_reclaim) { |
2525 | sc->memcg_low_skipped = 1; | ||
2561 | continue; | 2526 | continue; |
2562 | mem_cgroup_events(memcg, MEMCG_LOW, 1); | 2527 | } |
2528 | mem_cgroup_event(memcg, MEMCG_LOW); | ||
2563 | } | 2529 | } |
2564 | 2530 | ||
2565 | reclaimed = sc->nr_reclaimed; | 2531 | reclaimed = sc->nr_reclaimed; |
@@ -2620,6 +2586,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |||
2620 | } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, | 2586 | } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, |
2621 | sc->nr_scanned - nr_scanned, sc)); | 2587 | sc->nr_scanned - nr_scanned, sc)); |
2622 | 2588 | ||
2589 | /* | ||
2590 | * Kswapd gives up on balancing particular nodes after too | ||
2591 | * many failures to reclaim anything from them and goes to | ||
2592 | * sleep. On reclaim progress, reset the failure counter. A | ||
2593 | * successful direct reclaim run will revive a dormant kswapd. | ||
2594 | */ | ||
2595 | if (reclaimable) | ||
2596 | pgdat->kswapd_failures = 0; | ||
2597 | |||
2623 | return reclaimable; | 2598 | return reclaimable; |
2624 | } | 2599 | } |
2625 | 2600 | ||
@@ -2694,10 +2669,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2694 | GFP_KERNEL | __GFP_HARDWALL)) | 2669 | GFP_KERNEL | __GFP_HARDWALL)) |
2695 | continue; | 2670 | continue; |
2696 | 2671 | ||
2697 | if (sc->priority != DEF_PRIORITY && | ||
2698 | !pgdat_reclaimable(zone->zone_pgdat)) | ||
2699 | continue; /* Let kswapd poll it */ | ||
2700 | |||
2701 | /* | 2672 | /* |
2702 | * If we already have plenty of memory free for | 2673 | * If we already have plenty of memory free for |
2703 | * compaction in this zone, don't free any more. | 2674 | * compaction in this zone, don't free any more. |
@@ -2752,6 +2723,25 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2752 | sc->gfp_mask = orig_mask; | 2723 | sc->gfp_mask = orig_mask; |
2753 | } | 2724 | } |
2754 | 2725 | ||
2726 | static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) | ||
2727 | { | ||
2728 | struct mem_cgroup *memcg; | ||
2729 | |||
2730 | memcg = mem_cgroup_iter(root_memcg, NULL, NULL); | ||
2731 | do { | ||
2732 | unsigned long refaults; | ||
2733 | struct lruvec *lruvec; | ||
2734 | |||
2735 | if (memcg) | ||
2736 | refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); | ||
2737 | else | ||
2738 | refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); | ||
2739 | |||
2740 | lruvec = mem_cgroup_lruvec(pgdat, memcg); | ||
2741 | lruvec->refaults = refaults; | ||
2742 | } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); | ||
2743 | } | ||
2744 | |||
2755 | /* | 2745 | /* |
2756 | * This is the main entry point to direct page reclaim. | 2746 | * This is the main entry point to direct page reclaim. |
2757 | * | 2747 | * |
@@ -2772,6 +2762,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2772 | struct scan_control *sc) | 2762 | struct scan_control *sc) |
2773 | { | 2763 | { |
2774 | int initial_priority = sc->priority; | 2764 | int initial_priority = sc->priority; |
2765 | pg_data_t *last_pgdat; | ||
2766 | struct zoneref *z; | ||
2767 | struct zone *zone; | ||
2775 | retry: | 2768 | retry: |
2776 | delayacct_freepages_start(); | 2769 | delayacct_freepages_start(); |
2777 | 2770 | ||
@@ -2798,6 +2791,15 @@ retry: | |||
2798 | sc->may_writepage = 1; | 2791 | sc->may_writepage = 1; |
2799 | } while (--sc->priority >= 0); | 2792 | } while (--sc->priority >= 0); |
2800 | 2793 | ||
2794 | last_pgdat = NULL; | ||
2795 | for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, | ||
2796 | sc->nodemask) { | ||
2797 | if (zone->zone_pgdat == last_pgdat) | ||
2798 | continue; | ||
2799 | last_pgdat = zone->zone_pgdat; | ||
2800 | snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); | ||
2801 | } | ||
2802 | |||
2801 | delayacct_freepages_end(); | 2803 | delayacct_freepages_end(); |
2802 | 2804 | ||
2803 | if (sc->nr_reclaimed) | 2805 | if (sc->nr_reclaimed) |
@@ -2808,16 +2810,17 @@ retry: | |||
2808 | return 1; | 2810 | return 1; |
2809 | 2811 | ||
2810 | /* Untapped cgroup reserves? Don't OOM, retry. */ | 2812 | /* Untapped cgroup reserves? Don't OOM, retry. */ |
2811 | if (!sc->may_thrash) { | 2813 | if (sc->memcg_low_skipped) { |
2812 | sc->priority = initial_priority; | 2814 | sc->priority = initial_priority; |
2813 | sc->may_thrash = 1; | 2815 | sc->memcg_low_reclaim = 1; |
2816 | sc->memcg_low_skipped = 0; | ||
2814 | goto retry; | 2817 | goto retry; |
2815 | } | 2818 | } |
2816 | 2819 | ||
2817 | return 0; | 2820 | return 0; |
2818 | } | 2821 | } |
2819 | 2822 | ||
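The retry block above replaces may_thrash with an explicit pair of flags: memcg_low_skipped records that memory.low-protected cgroups were passed over, and memcg_low_reclaim arms a second pass that may reclaim from them rather than declare OOM. A hedged, standalone model of that two-pass flow (mock scan_control, and a hypothetical one_pass() standing in for the priority loop):

#include <stdbool.h>
#include <stdio.h>

struct mock_scan_control {
    bool memcg_low_reclaim;   /* second pass: allowed to dip into memory.low */
    bool memcg_low_skipped;   /* first pass skipped a protected group */
    unsigned long nr_reclaimed;
};

/* Stand-in for one full priority sweep; 'protected_only' pretends every
 * candidate group sits below its memory.low protection. */
static void one_pass(struct mock_scan_control *sc, bool protected_only)
{
    if (protected_only && !sc->memcg_low_reclaim) {
        sc->memcg_low_skipped = true;   /* noted, nothing reclaimed */
        return;
    }
    sc->nr_reclaimed += 32;
}

int main(void)
{
    struct mock_scan_control sc = { 0 };

    one_pass(&sc, true);
    if (!sc.nr_reclaimed && sc.memcg_low_skipped) {
        /* Untapped cgroup reserves? Don't OOM, retry. */
        sc.memcg_low_reclaim = true;
        sc.memcg_low_skipped = false;
        one_pass(&sc, true);
    }
    printf("reclaimed: %lu\n", sc.nr_reclaimed);
    return 0;
}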
2820 | static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | 2823 | static bool allow_direct_reclaim(pg_data_t *pgdat) |
2821 | { | 2824 | { |
2822 | struct zone *zone; | 2825 | struct zone *zone; |
2823 | unsigned long pfmemalloc_reserve = 0; | 2826 | unsigned long pfmemalloc_reserve = 0; |
@@ -2825,10 +2828,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |||
2825 | int i; | 2828 | int i; |
2826 | bool wmark_ok; | 2829 | bool wmark_ok; |
2827 | 2830 | ||
2831 | if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) | ||
2832 | return true; | ||
2833 | |||
2828 | for (i = 0; i <= ZONE_NORMAL; i++) { | 2834 | for (i = 0; i <= ZONE_NORMAL; i++) { |
2829 | zone = &pgdat->node_zones[i]; | 2835 | zone = &pgdat->node_zones[i]; |
2830 | if (!managed_zone(zone) || | 2836 | if (!managed_zone(zone)) |
2831 | pgdat_reclaimable_pages(pgdat) == 0) | 2837 | continue; |
2838 | |||
2839 | if (!zone_reclaimable_pages(zone)) | ||
2832 | continue; | 2840 | continue; |
2833 | 2841 | ||
2834 | pfmemalloc_reserve += min_wmark_pages(zone); | 2842 | pfmemalloc_reserve += min_wmark_pages(zone); |
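allow_direct_reclaim() now bails out early for a node whose kswapd has hit MAX_RECLAIM_RETRIES, and it sums the min watermarks only of lowmem zones that still have reclaimable pages. A self-contained sketch of the watermark test with mock zones; the final comparison against roughly half the reserve sits in the unchanged tail of the function and is reproduced here from memory, so treat it as illustrative:

#include <stdbool.h>
#include <stdio.h>

struct mock_zone {
    bool managed;
    unsigned long reclaimable_pages;
    unsigned long min_wmark;
    unsigned long free_pages;
};

static bool direct_reclaim_allowed(const struct mock_zone *zones, int nr_lowmem_zones)
{
    unsigned long reserve = 0, free = 0;
    int i;

    for (i = 0; i < nr_lowmem_zones; i++) {
        if (!zones[i].managed || !zones[i].reclaimable_pages)
            continue;
        reserve += zones[i].min_wmark;
        free += zones[i].free_pages;
    }
    if (!reserve)   /* no usable lowmem zone: nothing to throttle on */
        return true;
    return free > reserve / 2;
}

int main(void)
{
    struct mock_zone zones[2] = {
        { .managed = true, .reclaimable_pages = 1000, .min_wmark = 256, .free_pages = 100 },
        { .managed = true, .reclaimable_pages = 2000, .min_wmark = 512, .free_pages = 300 },
    };

    printf("allowed: %d\n", direct_reclaim_allowed(zones, 2));
    return 0;
}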
@@ -2905,7 +2913,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | |||
2905 | 2913 | ||
2906 | /* Throttle based on the first usable node */ | 2914 | /* Throttle based on the first usable node */ |
2907 | pgdat = zone->zone_pgdat; | 2915 | pgdat = zone->zone_pgdat; |
2908 | if (pfmemalloc_watermark_ok(pgdat)) | 2916 | if (allow_direct_reclaim(pgdat)) |
2909 | goto out; | 2917 | goto out; |
2910 | break; | 2918 | break; |
2911 | } | 2919 | } |
@@ -2927,14 +2935,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | |||
2927 | */ | 2935 | */ |
2928 | if (!(gfp_mask & __GFP_FS)) { | 2936 | if (!(gfp_mask & __GFP_FS)) { |
2929 | wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, | 2937 | wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, |
2930 | pfmemalloc_watermark_ok(pgdat), HZ); | 2938 | allow_direct_reclaim(pgdat), HZ); |
2931 | 2939 | ||
2932 | goto check_pending; | 2940 | goto check_pending; |
2933 | } | 2941 | } |
2934 | 2942 | ||
2935 | /* Throttle until kswapd wakes the process */ | 2943 | /* Throttle until kswapd wakes the process */ |
2936 | wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, | 2944 | wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, |
2937 | pfmemalloc_watermark_ok(pgdat)); | 2945 | allow_direct_reclaim(pgdat)); |
2938 | 2946 | ||
2939 | check_pending: | 2947 | check_pending: |
2940 | if (fatal_signal_pending(current)) | 2948 | if (fatal_signal_pending(current)) |
@@ -2950,7 +2958,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2950 | unsigned long nr_reclaimed; | 2958 | unsigned long nr_reclaimed; |
2951 | struct scan_control sc = { | 2959 | struct scan_control sc = { |
2952 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2960 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2953 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), | 2961 | .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), |
2954 | .reclaim_idx = gfp_zone(gfp_mask), | 2962 | .reclaim_idx = gfp_zone(gfp_mask), |
2955 | .order = order, | 2963 | .order = order, |
2956 | .nodemask = nodemask, | 2964 | .nodemask = nodemask, |
@@ -3028,9 +3036,10 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
3028 | struct zonelist *zonelist; | 3036 | struct zonelist *zonelist; |
3029 | unsigned long nr_reclaimed; | 3037 | unsigned long nr_reclaimed; |
3030 | int nid; | 3038 | int nid; |
3039 | unsigned int noreclaim_flag; | ||
3031 | struct scan_control sc = { | 3040 | struct scan_control sc = { |
3032 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), | 3041 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), |
3033 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 3042 | .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | |
3034 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), | 3043 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), |
3035 | .reclaim_idx = MAX_NR_ZONES - 1, | 3044 | .reclaim_idx = MAX_NR_ZONES - 1, |
3036 | .target_mem_cgroup = memcg, | 3045 | .target_mem_cgroup = memcg, |
@@ -3054,9 +3063,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
3054 | sc.gfp_mask, | 3063 | sc.gfp_mask, |
3055 | sc.reclaim_idx); | 3064 | sc.reclaim_idx); |
3056 | 3065 | ||
3057 | current->flags |= PF_MEMALLOC; | 3066 | noreclaim_flag = memalloc_noreclaim_save(); |
3058 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | 3067 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); |
3059 | current->flags &= ~PF_MEMALLOC; | 3068 | memalloc_noreclaim_restore(noreclaim_flag); |
3060 | 3069 | ||
3061 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); | 3070 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); |
3062 | 3071 | ||
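Several hunks in this series swap open-coded PF_MEMALLOC set/clear for memalloc_noreclaim_save()/memalloc_noreclaim_restore(). As those helpers are usually described, save returns the previous flag state so nested sections restore exactly what they found instead of unconditionally clearing the bit. A userspace model of that save/restore discipline (mock flag word, hypothetical names):

#include <stdio.h>

#define MOCK_PF_MEMALLOC 0x800

static unsigned int task_flags;   /* stand-in for current->flags */

static unsigned int noreclaim_save(void)
{
    unsigned int old = task_flags & MOCK_PF_MEMALLOC;

    task_flags |= MOCK_PF_MEMALLOC;
    return old;
}

static void noreclaim_restore(unsigned int old)
{
    task_flags = (task_flags & ~MOCK_PF_MEMALLOC) | old;
}

int main(void)
{
    unsigned int outer = noreclaim_save();
    unsigned int inner = noreclaim_save();   /* nested noreclaim section */

    noreclaim_restore(inner);
    printf("still set after inner restore: %d\n", !!(task_flags & MOCK_PF_MEMALLOC));
    noreclaim_restore(outer);
    printf("set after outer restore: %d\n", !!(task_flags & MOCK_PF_MEMALLOC));
    return 0;
}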
@@ -3076,7 +3085,7 @@ static void age_active_anon(struct pglist_data *pgdat, | |||
3076 | do { | 3085 | do { |
3077 | struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); | 3086 | struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); |
3078 | 3087 | ||
3079 | if (inactive_list_is_low(lruvec, false, sc, true)) | 3088 | if (inactive_list_is_low(lruvec, false, memcg, sc, true)) |
3080 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, | 3089 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, |
3081 | sc, LRU_ACTIVE_ANON); | 3090 | sc, LRU_ACTIVE_ANON); |
3082 | 3091 | ||
@@ -3084,22 +3093,44 @@ static void age_active_anon(struct pglist_data *pgdat, | |||
3084 | } while (memcg); | 3093 | } while (memcg); |
3085 | } | 3094 | } |
3086 | 3095 | ||
3087 | static bool zone_balanced(struct zone *zone, int order, int classzone_idx) | 3096 | /* |
3097 | * Returns true if there is an eligible zone balanced for the request order | ||
3098 | * and classzone_idx | ||
3099 | */ | ||
3100 | static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | ||
3088 | { | 3101 | { |
3089 | unsigned long mark = high_wmark_pages(zone); | 3102 | int i; |
3103 | unsigned long mark = -1; | ||
3104 | struct zone *zone; | ||
3090 | 3105 | ||
3091 | if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx)) | 3106 | for (i = 0; i <= classzone_idx; i++) { |
3092 | return false; | 3107 | zone = pgdat->node_zones + i; |
3108 | |||
3109 | if (!managed_zone(zone)) | ||
3110 | continue; | ||
3111 | |||
3112 | mark = high_wmark_pages(zone); | ||
3113 | if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) | ||
3114 | return true; | ||
3115 | } | ||
3093 | 3116 | ||
3094 | /* | 3117 | /* |
3095 | * If any eligible zone is balanced then the node is not considered | 3118 | * If a node has no populated zone within classzone_idx, it does not |
3096 | * to be congested or dirty | 3119 | * need balancing by definition. This can happen if a zone-restricted |
3120 | * allocation tries to wake a remote kswapd. | ||
3097 | */ | 3121 | */ |
3098 | clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); | 3122 | if (mark == -1) |
3099 | clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); | 3123 | return true; |
3100 | clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags); | ||
3101 | 3124 | ||
3102 | return true; | 3125 | return false; |
3126 | } | ||
3127 | |||
3128 | /* Clear pgdat state for congested, dirty or under writeback. */ | ||
3129 | static void clear_pgdat_congested(pg_data_t *pgdat) | ||
3130 | { | ||
3131 | clear_bit(PGDAT_CONGESTED, &pgdat->flags); | ||
3132 | clear_bit(PGDAT_DIRTY, &pgdat->flags); | ||
3133 | clear_bit(PGDAT_WRITEBACK, &pgdat->flags); | ||
3103 | } | 3134 | } |
3104 | 3135 | ||
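pgdat_balanced() above replaces the per-zone zone_balanced() check: the node counts as balanced if any managed zone up to classzone_idx meets its high watermark, and trivially balanced if no populated zone exists in that range (the mark == -1 sentinel). A standalone sketch with mock zones and a simple free-versus-watermark stand-in for zone_watermark_ok_safe():

#include <stdbool.h>
#include <stdio.h>

struct mock_zone {
    bool managed;
    unsigned long free_pages;
    unsigned long high_wmark;
};

static bool mock_pgdat_balanced(const struct mock_zone *zones, int classzone_idx)
{
    unsigned long mark = (unsigned long)-1;   /* sentinel: no populated zone seen */
    int i;

    for (i = 0; i <= classzone_idx; i++) {
        if (!zones[i].managed)
            continue;

        mark = zones[i].high_wmark;
        if (zones[i].free_pages >= mark)   /* mock watermark test */
            return true;
    }

    /* No populated zone within classzone_idx: nothing to balance. */
    if (mark == (unsigned long)-1)
        return true;

    return false;
}

int main(void)
{
    struct mock_zone zones[2] = {
        { .managed = false },
        { .managed = true, .free_pages = 512, .high_wmark = 256 },
    };

    printf("balanced: %d\n", mock_pgdat_balanced(zones, 1));
    return 0;
}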
3105 | /* | 3136 | /* |
@@ -3110,11 +3141,9 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx) | |||
3110 | */ | 3141 | */ |
3111 | static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) | 3142 | static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) |
3112 | { | 3143 | { |
3113 | int i; | ||
3114 | |||
3115 | /* | 3144 | /* |
3116 | * The throttled processes are normally woken up in balance_pgdat() as | 3145 | * The throttled processes are normally woken up in balance_pgdat() as |
3117 | * soon as pfmemalloc_watermark_ok() is true. But there is a potential | 3146 | * soon as allow_direct_reclaim() is true. But there is a potential |
3118 | * race between when kswapd checks the watermarks and a process gets | 3147 | * race between when kswapd checks the watermarks and a process gets |
3119 | * throttled. There is also a potential race if processes get | 3148 | * throttled. There is also a potential race if processes get |
3120 | * throttled, kswapd wakes, a large process exits thereby balancing the | 3149 | * throttled, kswapd wakes, a large process exits thereby balancing the |
@@ -3128,17 +3157,16 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
3128 | if (waitqueue_active(&pgdat->pfmemalloc_wait)) | 3157 | if (waitqueue_active(&pgdat->pfmemalloc_wait)) |
3129 | wake_up_all(&pgdat->pfmemalloc_wait); | 3158 | wake_up_all(&pgdat->pfmemalloc_wait); |
3130 | 3159 | ||
3131 | for (i = 0; i <= classzone_idx; i++) { | 3160 | /* Hopeless node, leave it to direct reclaim */ |
3132 | struct zone *zone = pgdat->node_zones + i; | 3161 | if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) |
3133 | 3162 | return true; | |
3134 | if (!managed_zone(zone)) | ||
3135 | continue; | ||
3136 | 3163 | ||
3137 | if (!zone_balanced(zone, order, classzone_idx)) | 3164 | if (pgdat_balanced(pgdat, order, classzone_idx)) { |
3138 | return false; | 3165 | clear_pgdat_congested(pgdat); |
3166 | return true; | ||
3139 | } | 3167 | } |
3140 | 3168 | ||
3141 | return true; | 3169 | return false; |
3142 | } | 3170 | } |
3143 | 3171 | ||
3144 | /* | 3172 | /* |
@@ -3214,9 +3242,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3214 | count_vm_event(PAGEOUTRUN); | 3242 | count_vm_event(PAGEOUTRUN); |
3215 | 3243 | ||
3216 | do { | 3244 | do { |
3245 | unsigned long nr_reclaimed = sc.nr_reclaimed; | ||
3217 | bool raise_priority = true; | 3246 | bool raise_priority = true; |
3218 | 3247 | ||
3219 | sc.nr_reclaimed = 0; | ||
3220 | sc.reclaim_idx = classzone_idx; | 3248 | sc.reclaim_idx = classzone_idx; |
3221 | 3249 | ||
3222 | /* | 3250 | /* |
@@ -3241,23 +3269,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3241 | } | 3269 | } |
3242 | 3270 | ||
3243 | /* | 3271 | /* |
3244 | * Only reclaim if there are no eligible zones. Check from | 3272 | * Only reclaim if there are no eligible zones. Note that |
3245 | * high to low zone as allocations prefer higher zones. | 3273 | * sc.reclaim_idx is not used as buffer_heads_over_limit may |
3246 | * Scanning from low to high zone would allow congestion to be | 3274 | * have adjusted it. |
3247 | * cleared during a very small window when a small low | ||
3248 | * zone was balanced even under extreme pressure when the | ||
3249 | * overall node may be congested. Note that sc.reclaim_idx | ||
3250 | * is not used as buffer_heads_over_limit may have adjusted | ||
3251 | * it. | ||
3252 | */ | 3275 | */ |
3253 | for (i = classzone_idx; i >= 0; i--) { | 3276 | if (pgdat_balanced(pgdat, sc.order, classzone_idx)) |
3254 | zone = pgdat->node_zones + i; | 3277 | goto out; |
3255 | if (!managed_zone(zone)) | ||
3256 | continue; | ||
3257 | |||
3258 | if (zone_balanced(zone, sc.order, classzone_idx)) | ||
3259 | goto out; | ||
3260 | } | ||
3261 | 3278 | ||
3262 | /* | 3279 | /* |
3263 | * Do some background aging of the anon list, to give | 3280 | * Do some background aging of the anon list, to give |
@@ -3271,7 +3288,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3271 | * If we're getting trouble reclaiming, start doing writepage | 3288 | * If we're getting trouble reclaiming, start doing writepage |
3272 | * even in laptop mode. | 3289 | * even in laptop mode. |
3273 | */ | 3290 | */ |
3274 | if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat)) | 3291 | if (sc.priority < DEF_PRIORITY - 2) |
3275 | sc.may_writepage = 1; | 3292 | sc.may_writepage = 1; |
3276 | 3293 | ||
3277 | /* Call soft limit reclaim before calling shrink_node. */ | 3294 | /* Call soft limit reclaim before calling shrink_node. */ |
@@ -3295,7 +3312,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3295 | * able to safely make forward progress. Wake them | 3312 | * able to safely make forward progress. Wake them |
3296 | */ | 3313 | */ |
3297 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && | 3314 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && |
3298 | pfmemalloc_watermark_ok(pgdat)) | 3315 | allow_direct_reclaim(pgdat)) |
3299 | wake_up_all(&pgdat->pfmemalloc_wait); | 3316 | wake_up_all(&pgdat->pfmemalloc_wait); |
3300 | 3317 | ||
3301 | /* Check if kswapd should be suspending */ | 3318 | /* Check if kswapd should be suspending */ |
@@ -3306,11 +3323,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3306 | * Raise priority if scanning rate is too low or there was no | 3323 | * Raise priority if scanning rate is too low or there was no |
3307 | * progress in reclaiming pages | 3324 | * progress in reclaiming pages |
3308 | */ | 3325 | */ |
3309 | if (raise_priority || !sc.nr_reclaimed) | 3326 | nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; |
3327 | if (raise_priority || !nr_reclaimed) | ||
3310 | sc.priority--; | 3328 | sc.priority--; |
3311 | } while (sc.priority >= 1); | 3329 | } while (sc.priority >= 1); |
3312 | 3330 | ||
3331 | if (!sc.nr_reclaimed) | ||
3332 | pgdat->kswapd_failures++; | ||
3333 | |||
3313 | out: | 3334 | out: |
3335 | snapshot_refaults(NULL, pgdat); | ||
3314 | /* | 3336 | /* |
3315 | * Return the order kswapd stopped reclaiming at as | 3337 | * Return the order kswapd stopped reclaiming at as |
3316 | * prepare_kswapd_sleep() takes it into account. If another caller | 3338 | * prepare_kswapd_sleep() takes it into account. If another caller |
@@ -3320,6 +3342,22 @@ out: | |||
3320 | return sc.order; | 3342 | return sc.order; |
3321 | } | 3343 | } |
3322 | 3344 | ||
3345 | /* | ||
3346 | * pgdat->kswapd_classzone_idx is the highest zone index that a recent | ||
3347 | * allocation request woke kswapd for. When kswapd has not woken recently, | ||
3348 | * the value is MAX_NR_ZONES which is not a valid index. This compares a | ||
3349 | * given classzone and returns it or the highest classzone index kswapd | ||
3350 | * was recently woken for. | ||
3351 | */ | ||
3352 | static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, | ||
3353 | enum zone_type classzone_idx) | ||
3354 | { | ||
3355 | if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) | ||
3356 | return classzone_idx; | ||
3357 | |||
3358 | return max(pgdat->kswapd_classzone_idx, classzone_idx); | ||
3359 | } | ||
3360 | |||
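kswapd_classzone_idx() formalises the MAX_NR_ZONES sentinel meaning "no wakeup request pending": callers either get their own classzone back or the highest index kswapd was recently woken for. A tiny model of that selection, with a mocked zone count (the real MAX_NR_ZONES is configuration dependent):

#include <stdio.h>

#define MOCK_MAX_NR_ZONES 5   /* illustrative stand-in for MAX_NR_ZONES */

static int classzone_for_run(int stored_idx, int requested_idx)
{
    if (stored_idx == MOCK_MAX_NR_ZONES)   /* nothing pending */
        return requested_idx;
    return stored_idx > requested_idx ? stored_idx : requested_idx;
}

int main(void)
{
    printf("%d\n", classzone_for_run(MOCK_MAX_NR_ZONES, 2));   /* 2: nothing pending */
    printf("%d\n", classzone_for_run(1, 3));                   /* 3: highest wins */
    return 0;
}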
3323 | static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, | 3361 | static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, |
3324 | unsigned int classzone_idx) | 3362 | unsigned int classzone_idx) |
3325 | { | 3363 | { |
@@ -3331,7 +3369,13 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o | |||
3331 | 3369 | ||
3332 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 3370 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
3333 | 3371 | ||
3334 | /* Try to sleep for a short interval */ | 3372 | /* |
3373 | * Try to sleep for a short interval. Note that kcompactd will only be | ||
3374 | * woken if it is possible to sleep for a short interval. This is | ||
3375 | * deliberate on the assumption that if reclaim cannot keep an | ||
3376 | * eligible zone balanced, it's also unlikely that compaction will | ||
3377 | * succeed. | ||
3378 | */ | ||
3335 | if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { | 3379 | if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { |
3336 | /* | 3380 | /* |
3337 | * Compaction records what page blocks it recently failed to | 3381 | * Compaction records what page blocks it recently failed to |
@@ -3355,7 +3399,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o | |||
3355 | * the previous request that slept prematurely. | 3399 | * the previous request that slept prematurely. |
3356 | */ | 3400 | */ |
3357 | if (remaining) { | 3401 | if (remaining) { |
3358 | pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); | 3402 | pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); |
3359 | pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); | 3403 | pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); |
3360 | } | 3404 | } |
3361 | 3405 | ||
@@ -3409,7 +3453,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o | |||
3409 | */ | 3453 | */ |
3410 | static int kswapd(void *p) | 3454 | static int kswapd(void *p) |
3411 | { | 3455 | { |
3412 | unsigned int alloc_order, reclaim_order, classzone_idx; | 3456 | unsigned int alloc_order, reclaim_order; |
3457 | unsigned int classzone_idx = MAX_NR_ZONES - 1; | ||
3413 | pg_data_t *pgdat = (pg_data_t*)p; | 3458 | pg_data_t *pgdat = (pg_data_t*)p; |
3414 | struct task_struct *tsk = current; | 3459 | struct task_struct *tsk = current; |
3415 | 3460 | ||
@@ -3439,20 +3484,23 @@ static int kswapd(void *p) | |||
3439 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; | 3484 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; |
3440 | set_freezable(); | 3485 | set_freezable(); |
3441 | 3486 | ||
3442 | pgdat->kswapd_order = alloc_order = reclaim_order = 0; | 3487 | pgdat->kswapd_order = 0; |
3443 | pgdat->kswapd_classzone_idx = classzone_idx = 0; | 3488 | pgdat->kswapd_classzone_idx = MAX_NR_ZONES; |
3444 | for ( ; ; ) { | 3489 | for ( ; ; ) { |
3445 | bool ret; | 3490 | bool ret; |
3446 | 3491 | ||
3492 | alloc_order = reclaim_order = pgdat->kswapd_order; | ||
3493 | classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); | ||
3494 | |||
3447 | kswapd_try_sleep: | 3495 | kswapd_try_sleep: |
3448 | kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, | 3496 | kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, |
3449 | classzone_idx); | 3497 | classzone_idx); |
3450 | 3498 | ||
3451 | /* Read the new order and classzone_idx */ | 3499 | /* Read the new order and classzone_idx */ |
3452 | alloc_order = reclaim_order = pgdat->kswapd_order; | 3500 | alloc_order = reclaim_order = pgdat->kswapd_order; |
3453 | classzone_idx = pgdat->kswapd_classzone_idx; | 3501 | classzone_idx = kswapd_classzone_idx(pgdat, 0); |
3454 | pgdat->kswapd_order = 0; | 3502 | pgdat->kswapd_order = 0; |
3455 | pgdat->kswapd_classzone_idx = 0; | 3503 | pgdat->kswapd_classzone_idx = MAX_NR_ZONES; |
3456 | 3504 | ||
3457 | ret = try_to_freeze(); | 3505 | ret = try_to_freeze(); |
3458 | if (kthread_should_stop()) | 3506 | if (kthread_should_stop()) |
@@ -3478,9 +3526,6 @@ kswapd_try_sleep: | |||
3478 | reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); | 3526 | reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); |
3479 | if (reclaim_order < alloc_order) | 3527 | if (reclaim_order < alloc_order) |
3480 | goto kswapd_try_sleep; | 3528 | goto kswapd_try_sleep; |
3481 | |||
3482 | alloc_order = reclaim_order = pgdat->kswapd_order; | ||
3483 | classzone_idx = pgdat->kswapd_classzone_idx; | ||
3484 | } | 3529 | } |
3485 | 3530 | ||
3486 | tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); | 3531 | tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); |
@@ -3496,7 +3541,6 @@ kswapd_try_sleep: | |||
3496 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) | 3541 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) |
3497 | { | 3542 | { |
3498 | pg_data_t *pgdat; | 3543 | pg_data_t *pgdat; |
3499 | int z; | ||
3500 | 3544 | ||
3501 | if (!managed_zone(zone)) | 3545 | if (!managed_zone(zone)) |
3502 | return; | 3546 | return; |
@@ -3504,22 +3548,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) | |||
3504 | if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) | 3548 | if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) |
3505 | return; | 3549 | return; |
3506 | pgdat = zone->zone_pgdat; | 3550 | pgdat = zone->zone_pgdat; |
3507 | pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); | 3551 | pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, |
3552 | classzone_idx); | ||
3508 | pgdat->kswapd_order = max(pgdat->kswapd_order, order); | 3553 | pgdat->kswapd_order = max(pgdat->kswapd_order, order); |
3509 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 3554 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
3510 | return; | 3555 | return; |
3511 | 3556 | ||
3512 | /* Only wake kswapd if all zones are unbalanced */ | 3557 | /* Hopeless node, leave it to direct reclaim */ |
3513 | for (z = 0; z <= classzone_idx; z++) { | 3558 | if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) |
3514 | zone = pgdat->node_zones + z; | 3559 | return; |
3515 | if (!managed_zone(zone)) | ||
3516 | continue; | ||
3517 | 3560 | ||
3518 | if (zone_balanced(zone, order, classzone_idx)) | 3561 | if (pgdat_balanced(pgdat, order, classzone_idx)) |
3519 | return; | 3562 | return; |
3520 | } | ||
3521 | 3563 | ||
3522 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | 3564 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order); |
3523 | wake_up_interruptible(&pgdat->kswapd_wait); | 3565 | wake_up_interruptible(&pgdat->kswapd_wait); |
3524 | } | 3566 | } |
3525 | 3567 | ||
@@ -3548,8 +3590,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
3548 | struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | 3590 | struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); |
3549 | struct task_struct *p = current; | 3591 | struct task_struct *p = current; |
3550 | unsigned long nr_reclaimed; | 3592 | unsigned long nr_reclaimed; |
3593 | unsigned int noreclaim_flag; | ||
3551 | 3594 | ||
3552 | p->flags |= PF_MEMALLOC; | 3595 | noreclaim_flag = memalloc_noreclaim_save(); |
3553 | lockdep_set_current_reclaim_state(sc.gfp_mask); | 3596 | lockdep_set_current_reclaim_state(sc.gfp_mask); |
3554 | reclaim_state.reclaimed_slab = 0; | 3597 | reclaim_state.reclaimed_slab = 0; |
3555 | p->reclaim_state = &reclaim_state; | 3598 | p->reclaim_state = &reclaim_state; |
@@ -3558,7 +3601,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
3558 | 3601 | ||
3559 | p->reclaim_state = NULL; | 3602 | p->reclaim_state = NULL; |
3560 | lockdep_clear_current_reclaim_state(); | 3603 | lockdep_clear_current_reclaim_state(); |
3561 | p->flags &= ~PF_MEMALLOC; | 3604 | memalloc_noreclaim_restore(noreclaim_flag); |
3562 | 3605 | ||
3563 | return nr_reclaimed; | 3606 | return nr_reclaimed; |
3564 | } | 3607 | } |
@@ -3723,9 +3766,10 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in | |||
3723 | struct task_struct *p = current; | 3766 | struct task_struct *p = current; |
3724 | struct reclaim_state reclaim_state; | 3767 | struct reclaim_state reclaim_state; |
3725 | int classzone_idx = gfp_zone(gfp_mask); | 3768 | int classzone_idx = gfp_zone(gfp_mask); |
3769 | unsigned int noreclaim_flag; | ||
3726 | struct scan_control sc = { | 3770 | struct scan_control sc = { |
3727 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), | 3771 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), |
3728 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), | 3772 | .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), |
3729 | .order = order, | 3773 | .order = order, |
3730 | .priority = NODE_RECLAIM_PRIORITY, | 3774 | .priority = NODE_RECLAIM_PRIORITY, |
3731 | .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), | 3775 | .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), |
@@ -3740,7 +3784,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in | |||
3740 | * and we also need to be able to write out pages for RECLAIM_WRITE | 3784 | * and we also need to be able to write out pages for RECLAIM_WRITE |
3741 | * and RECLAIM_UNMAP. | 3785 | * and RECLAIM_UNMAP. |
3742 | */ | 3786 | */ |
3743 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; | 3787 | noreclaim_flag = memalloc_noreclaim_save(); |
3788 | p->flags |= PF_SWAPWRITE; | ||
3744 | lockdep_set_current_reclaim_state(gfp_mask); | 3789 | lockdep_set_current_reclaim_state(gfp_mask); |
3745 | reclaim_state.reclaimed_slab = 0; | 3790 | reclaim_state.reclaimed_slab = 0; |
3746 | p->reclaim_state = &reclaim_state; | 3791 | p->reclaim_state = &reclaim_state; |
@@ -3756,7 +3801,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in | |||
3756 | } | 3801 | } |
3757 | 3802 | ||
3758 | p->reclaim_state = NULL; | 3803 | p->reclaim_state = NULL; |
3759 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 3804 | current->flags &= ~PF_SWAPWRITE; |
3805 | memalloc_noreclaim_restore(noreclaim_flag); | ||
3760 | lockdep_clear_current_reclaim_state(); | 3806 | lockdep_clear_current_reclaim_state(); |
3761 | return sc.nr_reclaimed >= nr_pages; | 3807 | return sc.nr_reclaimed >= nr_pages; |
3762 | } | 3808 | } |
@@ -3779,9 +3825,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) | |||
3779 | sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) | 3825 | sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) |
3780 | return NODE_RECLAIM_FULL; | 3826 | return NODE_RECLAIM_FULL; |
3781 | 3827 | ||
3782 | if (!pgdat_reclaimable(pgdat)) | ||
3783 | return NODE_RECLAIM_FULL; | ||
3784 | |||
3785 | /* | 3828 | /* |
3786 | * Do not scan if the allocation should not be delayed. | 3829 | * Do not scan if the allocation should not be delayed. |
3787 | */ | 3830 | */ |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 809025ed97ea..f5fa1bd1eb16 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -954,7 +954,6 @@ const char * const vmstat_text[] = { | |||
954 | "nr_unevictable", | 954 | "nr_unevictable", |
955 | "nr_isolated_anon", | 955 | "nr_isolated_anon", |
956 | "nr_isolated_file", | 956 | "nr_isolated_file", |
957 | "nr_pages_scanned", | ||
958 | "workingset_refault", | 957 | "workingset_refault", |
959 | "workingset_activate", | 958 | "workingset_activate", |
960 | "workingset_nodereclaim", | 959 | "workingset_nodereclaim", |
@@ -992,6 +991,7 @@ const char * const vmstat_text[] = { | |||
992 | "pgfree", | 991 | "pgfree", |
993 | "pgactivate", | 992 | "pgactivate", |
994 | "pgdeactivate", | 993 | "pgdeactivate", |
994 | "pglazyfree", | ||
995 | 995 | ||
996 | "pgfault", | 996 | "pgfault", |
997 | "pgmajfault", | 997 | "pgmajfault", |
@@ -1124,8 +1124,12 @@ static void frag_stop(struct seq_file *m, void *arg) | |||
1124 | { | 1124 | { |
1125 | } | 1125 | } |
1126 | 1126 | ||
1127 | /* Walk all the zones in a node and print using a callback */ | 1127 | /* |
1128 | * Walk zones in a node and print using a callback. | ||
1129 | * If @assert_populated is true, only use callback for zones that are populated. | ||
1130 | */ | ||
1128 | static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | 1131 | static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, |
1132 | bool assert_populated, | ||
1129 | void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) | 1133 | void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) |
1130 | { | 1134 | { |
1131 | struct zone *zone; | 1135 | struct zone *zone; |
@@ -1133,7 +1137,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | |||
1133 | unsigned long flags; | 1137 | unsigned long flags; |
1134 | 1138 | ||
1135 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | 1139 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { |
1136 | if (!populated_zone(zone)) | 1140 | if (assert_populated && !populated_zone(zone)) |
1137 | continue; | 1141 | continue; |
1138 | 1142 | ||
1139 | spin_lock_irqsave(&zone->lock, flags); | 1143 | spin_lock_irqsave(&zone->lock, flags); |
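The walker gains an assert_populated switch: most seq_file users keep skipping empty zones, while zoneinfo now passes false so unpopulated zones are still printed. A minimal standalone model of a callback walker with such a flag (mock zone type, hypothetical names):

#include <stdbool.h>
#include <stdio.h>

struct mock_zone { const char *name; unsigned long present_pages; };

static void walk_zones(const struct mock_zone *zones, int nr, bool assert_populated,
                       void (*print)(const struct mock_zone *))
{
    int i;

    for (i = 0; i < nr; i++) {
        if (assert_populated && !zones[i].present_pages)
            continue;
        print(&zones[i]);
    }
}

static void show(const struct mock_zone *zone)
{
    printf("%s: %lu pages\n", zone->name, zone->present_pages);
}

int main(void)
{
    struct mock_zone zones[] = { { "DMA", 0 }, { "Normal", 4096 } };

    walk_zones(zones, 2, true, show);    /* frag/pagetypeinfo style: skips DMA */
    walk_zones(zones, 2, false, show);   /* zoneinfo style: prints empty DMA too */
    return 0;
}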
@@ -1161,7 +1165,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
1161 | static int frag_show(struct seq_file *m, void *arg) | 1165 | static int frag_show(struct seq_file *m, void *arg) |
1162 | { | 1166 | { |
1163 | pg_data_t *pgdat = (pg_data_t *)arg; | 1167 | pg_data_t *pgdat = (pg_data_t *)arg; |
1164 | walk_zones_in_node(m, pgdat, frag_show_print); | 1168 | walk_zones_in_node(m, pgdat, true, frag_show_print); |
1165 | return 0; | 1169 | return 0; |
1166 | } | 1170 | } |
1167 | 1171 | ||
@@ -1202,7 +1206,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) | |||
1202 | seq_printf(m, "%6d ", order); | 1206 | seq_printf(m, "%6d ", order); |
1203 | seq_putc(m, '\n'); | 1207 | seq_putc(m, '\n'); |
1204 | 1208 | ||
1205 | walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); | 1209 | walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print); |
1206 | 1210 | ||
1207 | return 0; | 1211 | return 0; |
1208 | } | 1212 | } |
@@ -1254,7 +1258,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) | |||
1254 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) | 1258 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) |
1255 | seq_printf(m, "%12s ", migratetype_names[mtype]); | 1259 | seq_printf(m, "%12s ", migratetype_names[mtype]); |
1256 | seq_putc(m, '\n'); | 1260 | seq_putc(m, '\n'); |
1257 | walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); | 1261 | walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print); |
1258 | 1262 | ||
1259 | return 0; | 1263 | return 0; |
1260 | } | 1264 | } |
@@ -1280,7 +1284,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) | |||
1280 | seq_printf(m, "%12s ", migratetype_names[mtype]); | 1284 | seq_printf(m, "%12s ", migratetype_names[mtype]); |
1281 | seq_putc(m, '\n'); | 1285 | seq_putc(m, '\n'); |
1282 | 1286 | ||
1283 | walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); | 1287 | walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print); |
1284 | #endif /* CONFIG_PAGE_OWNER */ | 1288 | #endif /* CONFIG_PAGE_OWNER */ |
1285 | } | 1289 | } |
1286 | 1290 | ||
@@ -1378,7 +1382,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
1378 | "\n min %lu" | 1382 | "\n min %lu" |
1379 | "\n low %lu" | 1383 | "\n low %lu" |
1380 | "\n high %lu" | 1384 | "\n high %lu" |
1381 | "\n node_scanned %lu" | ||
1382 | "\n spanned %lu" | 1385 | "\n spanned %lu" |
1383 | "\n present %lu" | 1386 | "\n present %lu" |
1384 | "\n managed %lu", | 1387 | "\n managed %lu", |
@@ -1386,23 +1389,28 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
1386 | min_wmark_pages(zone), | 1389 | min_wmark_pages(zone), |
1387 | low_wmark_pages(zone), | 1390 | low_wmark_pages(zone), |
1388 | high_wmark_pages(zone), | 1391 | high_wmark_pages(zone), |
1389 | node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED), | ||
1390 | zone->spanned_pages, | 1392 | zone->spanned_pages, |
1391 | zone->present_pages, | 1393 | zone->present_pages, |
1392 | zone->managed_pages); | 1394 | zone->managed_pages); |
1393 | 1395 | ||
1394 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
1395 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], | ||
1396 | zone_page_state(zone, i)); | ||
1397 | |||
1398 | seq_printf(m, | 1396 | seq_printf(m, |
1399 | "\n protection: (%ld", | 1397 | "\n protection: (%ld", |
1400 | zone->lowmem_reserve[0]); | 1398 | zone->lowmem_reserve[0]); |
1401 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | 1399 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) |
1402 | seq_printf(m, ", %ld", zone->lowmem_reserve[i]); | 1400 | seq_printf(m, ", %ld", zone->lowmem_reserve[i]); |
1403 | seq_printf(m, | 1401 | seq_putc(m, ')'); |
1404 | ")" | 1402 | |
1405 | "\n pagesets"); | 1403 | /* If unpopulated, no other information is useful */ |
1404 | if (!populated_zone(zone)) { | ||
1405 | seq_putc(m, '\n'); | ||
1406 | return; | ||
1407 | } | ||
1408 | |||
1409 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
1410 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], | ||
1411 | zone_page_state(zone, i)); | ||
1412 | |||
1413 | seq_printf(m, "\n pagesets"); | ||
1406 | for_each_online_cpu(i) { | 1414 | for_each_online_cpu(i) { |
1407 | struct per_cpu_pageset *pageset; | 1415 | struct per_cpu_pageset *pageset; |
1408 | 1416 | ||
@@ -1425,19 +1433,22 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
1425 | "\n node_unreclaimable: %u" | 1433 | "\n node_unreclaimable: %u" |
1426 | "\n start_pfn: %lu" | 1434 | "\n start_pfn: %lu" |
1427 | "\n node_inactive_ratio: %u", | 1435 | "\n node_inactive_ratio: %u", |
1428 | !pgdat_reclaimable(zone->zone_pgdat), | 1436 | pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, |
1429 | zone->zone_start_pfn, | 1437 | zone->zone_start_pfn, |
1430 | zone->zone_pgdat->inactive_ratio); | 1438 | zone->zone_pgdat->inactive_ratio); |
1431 | seq_putc(m, '\n'); | 1439 | seq_putc(m, '\n'); |
1432 | } | 1440 | } |
1433 | 1441 | ||
1434 | /* | 1442 | /* |
1435 | * Output information about zones in @pgdat. | 1443 | * Output information about zones in @pgdat. All zones are printed regardless |
1444 | * of whether they are populated or not: lowmem_reserve_ratio operates on the | ||
1445 | * set of all zones and userspace would not be aware of such zones if they are | ||
1446 | * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio). | ||
1436 | */ | 1447 | */ |
1437 | static int zoneinfo_show(struct seq_file *m, void *arg) | 1448 | static int zoneinfo_show(struct seq_file *m, void *arg) |
1438 | { | 1449 | { |
1439 | pg_data_t *pgdat = (pg_data_t *)arg; | 1450 | pg_data_t *pgdat = (pg_data_t *)arg; |
1440 | walk_zones_in_node(m, pgdat, zoneinfo_show_print); | 1451 | walk_zones_in_node(m, pgdat, false, zoneinfo_show_print); |
1441 | return 0; | 1452 | return 0; |
1442 | } | 1453 | } |
1443 | 1454 | ||
@@ -1586,22 +1597,9 @@ int vmstat_refresh(struct ctl_table *table, int write, | |||
1586 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { | 1597 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { |
1587 | val = atomic_long_read(&vm_zone_stat[i]); | 1598 | val = atomic_long_read(&vm_zone_stat[i]); |
1588 | if (val < 0) { | 1599 | if (val < 0) { |
1589 | switch (i) { | 1600 | pr_warn("%s: %s %ld\n", |
1590 | case NR_PAGES_SCANNED: | 1601 | __func__, vmstat_text[i], val); |
1591 | /* | 1602 | err = -EINVAL; |
1592 | * This is often seen to go negative in | ||
1593 | * recent kernels, but not to go permanently | ||
1594 | * negative. Whilst it would be nicer not to | ||
1595 | * have exceptions, rooting them out would be | ||
1596 | * another task, of rather low priority. | ||
1597 | */ | ||
1598 | break; | ||
1599 | default: | ||
1600 | pr_warn("%s: %s %ld\n", | ||
1601 | __func__, vmstat_text[i], val); | ||
1602 | err = -EINVAL; | ||
1603 | break; | ||
1604 | } | ||
1605 | } | 1603 | } |
1606 | } | 1604 | } |
1607 | if (err) | 1605 | if (err) |
@@ -1768,8 +1766,7 @@ void __init init_mm_internals(void) | |||
1768 | { | 1766 | { |
1769 | int ret __maybe_unused; | 1767 | int ret __maybe_unused; |
1770 | 1768 | ||
1771 | mm_percpu_wq = alloc_workqueue("mm_percpu_wq", | 1769 | mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0); |
1772 | WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); | ||
1773 | 1770 | ||
1774 | #ifdef CONFIG_SMP | 1771 | #ifdef CONFIG_SMP |
1775 | ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", | 1772 | ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", |
@@ -1857,7 +1854,7 @@ static int unusable_show(struct seq_file *m, void *arg) | |||
1857 | if (!node_state(pgdat->node_id, N_MEMORY)) | 1854 | if (!node_state(pgdat->node_id, N_MEMORY)) |
1858 | return 0; | 1855 | return 0; |
1859 | 1856 | ||
1860 | walk_zones_in_node(m, pgdat, unusable_show_print); | 1857 | walk_zones_in_node(m, pgdat, true, unusable_show_print); |
1861 | 1858 | ||
1862 | return 0; | 1859 | return 0; |
1863 | } | 1860 | } |
@@ -1909,7 +1906,7 @@ static int extfrag_show(struct seq_file *m, void *arg) | |||
1909 | { | 1906 | { |
1910 | pg_data_t *pgdat = (pg_data_t *)arg; | 1907 | pg_data_t *pgdat = (pg_data_t *)arg; |
1911 | 1908 | ||
1912 | walk_zones_in_node(m, pgdat, extfrag_show_print); | 1909 | walk_zones_in_node(m, pgdat, true, extfrag_show_print); |
1913 | 1910 | ||
1914 | return 0; | 1911 | return 0; |
1915 | } | 1912 | } |
diff --git a/mm/workingset.c b/mm/workingset.c index eda05c71fa49..b8c9ab678479 100644 --- a/mm/workingset.c +++ b/mm/workingset.c | |||
@@ -269,7 +269,6 @@ bool workingset_refault(void *shadow) | |||
269 | lruvec = mem_cgroup_lruvec(pgdat, memcg); | 269 | lruvec = mem_cgroup_lruvec(pgdat, memcg); |
270 | refault = atomic_long_read(&lruvec->inactive_age); | 270 | refault = atomic_long_read(&lruvec->inactive_age); |
271 | active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); | 271 | active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); |
272 | rcu_read_unlock(); | ||
273 | 272 | ||
274 | /* | 273 | /* |
275 | * The unsigned subtraction here gives an accurate distance | 274 | * The unsigned subtraction here gives an accurate distance |
@@ -290,11 +289,15 @@ bool workingset_refault(void *shadow) | |||
290 | refault_distance = (refault - eviction) & EVICTION_MASK; | 289 | refault_distance = (refault - eviction) & EVICTION_MASK; |
291 | 290 | ||
292 | inc_node_state(pgdat, WORKINGSET_REFAULT); | 291 | inc_node_state(pgdat, WORKINGSET_REFAULT); |
292 | inc_memcg_state(memcg, WORKINGSET_REFAULT); | ||
293 | 293 | ||
294 | if (refault_distance <= active_file) { | 294 | if (refault_distance <= active_file) { |
295 | inc_node_state(pgdat, WORKINGSET_ACTIVATE); | 295 | inc_node_state(pgdat, WORKINGSET_ACTIVATE); |
296 | inc_memcg_state(memcg, WORKINGSET_ACTIVATE); | ||
297 | rcu_read_unlock(); | ||
296 | return true; | 298 | return true; |
297 | } | 299 | } |
300 | rcu_read_unlock(); | ||
298 | return false; | 301 | return false; |
299 | } | 302 | } |
300 | 303 | ||
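The workingset_refault() hunk adds per-memcg counters and keeps the RCU read lock held across them; the activation decision itself, visible in the surrounding context, compares the refault distance against the size of the active file list. A self-contained restatement of that test with mocked counters and mask width:

#include <stdbool.h>
#include <stdio.h>

#define MOCK_EVICTION_MASK 0xffffffffUL   /* placeholder for EVICTION_MASK */

static bool should_activate(unsigned long eviction_stamp,
                            unsigned long current_inactive_age,
                            unsigned long active_file_pages)
{
    unsigned long refault_distance;

    /* Unsigned subtraction keeps the distance correct across wraparound. */
    refault_distance = (current_inactive_age - eviction_stamp) & MOCK_EVICTION_MASK;

    return refault_distance <= active_file_pages;
}

int main(void)
{
    printf("%d\n", should_activate(1000, 1200, 500));   /* 1: refaulted recently */
    printf("%d\n", should_activate(1000, 9000, 500));   /* 0: distance too large */
    return 0;
}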
@@ -472,6 +475,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, | |||
472 | if (WARN_ON_ONCE(node->exceptional)) | 475 | if (WARN_ON_ONCE(node->exceptional)) |
473 | goto out_invalid; | 476 | goto out_invalid; |
474 | inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); | 477 | inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); |
478 | inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); | ||
475 | __radix_tree_delete_node(&mapping->page_tree, node, | 479 | __radix_tree_delete_node(&mapping->page_tree, node, |
476 | workingset_update_node, mapping); | 480 | workingset_update_node, mapping); |
477 | 481 | ||