Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c      |   2
-rw-r--r--  mm/compaction.c       |   3
-rw-r--r--  mm/filemap.c          |   2
-rw-r--r--  mm/huge_memory.c      |  10
-rw-r--r--  mm/hugetlb.c          | 447
-rw-r--r--  mm/hwpoison-inject.c  |   4
-rw-r--r--  mm/internal.h         |   2
-rw-r--r--  mm/kmemleak.c         |   2
-rw-r--r--  mm/ksm.c              |   6
-rw-r--r--  mm/madvise.c          |  33
-rw-r--r--  mm/memblock.c         |  18
-rw-r--r--  mm/memcontrol.c       |  17
-rw-r--r--  mm/memory-failure.c   | 174
-rw-r--r--  mm/memory.c           |  41
-rw-r--r--  mm/memory_hotplug.c   | 116
-rw-r--r--  mm/mempolicy.c        | 116
-rw-r--r--  mm/mempool.c          |   2
-rw-r--r--  mm/migrate.c          |  63
-rw-r--r--  mm/mlock.c            | 316
-rw-r--r--  mm/mmap.c             |  59
-rw-r--r--  mm/mremap.c           |   5
-rw-r--r--  mm/page-writeback.c   | 269
-rw-r--r--  mm/page_alloc.c       | 308
-rw-r--r--  mm/page_isolation.c   |  14
-rw-r--r--  mm/pgtable-generic.c  |  24
-rw-r--r--  mm/readahead.c        |   8
-rw-r--r--  mm/shmem.c            |   6
-rw-r--r--  mm/slub.c             |   8
-rw-r--r--  mm/sparse.c           | 133
-rw-r--r--  mm/swap.c             |  77
-rw-r--r--  mm/swap_state.c       |   4
-rw-r--r--  mm/swapfile.c         | 596
-rw-r--r--  mm/util.c             |   5
-rw-r--r--  mm/vmalloc.c          |  29
-rw-r--r--  mm/vmscan.c           |  80
-rw-r--r--  mm/vmstat.c           |  95
-rw-r--r--  mm/zbud.c             |   4
-rw-r--r--  mm/zswap.c            |  18
38 files changed, 2116 insertions(+), 1000 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 37d9edcd14cf..ce682f7a4f29 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -652,7 +652,7 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write,
652{ 652{
653 char kbuf[] = "0\n"; 653 char kbuf[] = "0\n";
654 654
655 if (*ppos) { 655 if (*ppos || *lenp < sizeof(kbuf)) {
656 *lenp = 0; 656 *lenp = 0;
657 return 0; 657 return 0;
658 } 658 }
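
The new check above refuses to emit the canned "0\n" reply when the caller's buffer cannot hold it. A standalone sketch of that guard follows (emit_fixed_reply and its buffer handling are illustrative only, not the procfs interface):

#include <stdio.h>
#include <string.h>

/*
 * Illustrative stand-in for a proc handler's output step: copy a fixed
 * reply only when the destination buffer is large enough, otherwise
 * report zero bytes written.
 */
static size_t emit_fixed_reply(char *dst, size_t dstlen)
{
        static const char kbuf[] = "0\n";

        if (dstlen < sizeof(kbuf))
                return 0;               /* too small: write nothing */
        memcpy(dst, kbuf, sizeof(kbuf));
        return sizeof(kbuf) - 1;        /* payload bytes, excluding the NUL */
}

int main(void)
{
        char small[2], big[8];

        printf("small buffer: %zu bytes written\n",
               emit_fixed_reply(small, sizeof(small)));
        printf("big buffer:   %zu bytes written\n",
               emit_fixed_reply(big, sizeof(big)));
        return 0;
}
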
diff --git a/mm/compaction.c b/mm/compaction.c
index 05ccb4cc0bdb..c43789388cd8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1131,6 +1131,9 @@ void compact_pgdat(pg_data_t *pgdat, int order)
1131 .sync = false, 1131 .sync = false,
1132 }; 1132 };
1133 1133
1134 if (!order)
1135 return;
1136
1134 __compact_pgdat(pgdat, &cc); 1137 __compact_pgdat(pgdat, &cc);
1135} 1138}
1136 1139
diff --git a/mm/filemap.c b/mm/filemap.c
index 731a2c24532d..e607728db4a8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -469,7 +469,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
469 if (error) 469 if (error)
470 goto out; 470 goto out;
471 471
472 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 472 error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
473 if (error == 0) { 473 if (error == 0) {
474 page_cache_get(page); 474 page_cache_get(page);
475 page->mapping = mapping; 475 page->mapping = mapping;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d94f7dee3997..d66010e0049d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -422,7 +422,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
422 unsigned long msecs; 422 unsigned long msecs;
423 int err; 423 int err;
424 424
425 err = strict_strtoul(buf, 10, &msecs); 425 err = kstrtoul(buf, 10, &msecs);
426 if (err || msecs > UINT_MAX) 426 if (err || msecs > UINT_MAX)
427 return -EINVAL; 427 return -EINVAL;
428 428
@@ -449,7 +449,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
449 unsigned long msecs; 449 unsigned long msecs;
450 int err; 450 int err;
451 451
452 err = strict_strtoul(buf, 10, &msecs); 452 err = kstrtoul(buf, 10, &msecs);
453 if (err || msecs > UINT_MAX) 453 if (err || msecs > UINT_MAX)
454 return -EINVAL; 454 return -EINVAL;
455 455
@@ -475,7 +475,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
475 int err; 475 int err;
476 unsigned long pages; 476 unsigned long pages;
477 477
478 err = strict_strtoul(buf, 10, &pages); 478 err = kstrtoul(buf, 10, &pages);
479 if (err || !pages || pages > UINT_MAX) 479 if (err || !pages || pages > UINT_MAX)
480 return -EINVAL; 480 return -EINVAL;
481 481
@@ -543,7 +543,7 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
543 int err; 543 int err;
544 unsigned long max_ptes_none; 544 unsigned long max_ptes_none;
545 545
546 err = strict_strtoul(buf, 10, &max_ptes_none); 546 err = kstrtoul(buf, 10, &max_ptes_none);
547 if (err || max_ptes_none > HPAGE_PMD_NR-1) 547 if (err || max_ptes_none > HPAGE_PMD_NR-1)
548 return -EINVAL; 548 return -EINVAL;
549 549
@@ -2301,6 +2301,8 @@ static void collapse_huge_page(struct mm_struct *mm,
2301 goto out; 2301 goto out;
2302 2302
2303 vma = find_vma(mm, address); 2303 vma = find_vma(mm, address);
2304 if (!vma)
2305 goto out;
2304 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2306 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2305 hend = vma->vm_end & HPAGE_PMD_MASK; 2307 hend = vma->vm_end & HPAGE_PMD_MASK;
2306 if (address < hstart || address + HPAGE_PMD_SIZE > hend) 2308 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
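
The strict_strtoul() calls in this file are switched to kstrtoul(). A userspace approximation of the semantics being relied on, built on strtoul(), is sketched below; parse_ulong is an invented name, and kstrtoul itself is stricter still about leading signs and whitespace:

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Userspace approximation of kstrtoul(): parse the whole string as an
 * unsigned long, tolerate one trailing newline, reject anything else.
 * Returns 0 on success or a negative errno-style value.
 */
static int parse_ulong(const char *s, unsigned int base, unsigned long *res)
{
        char *end;
        unsigned long val;

        errno = 0;
        val = strtoul(s, &end, base);
        if (errno == ERANGE)
                return -ERANGE;
        if (end == s)
                return -EINVAL;         /* no digits at all */
        if (*end == '\n')
                end++;                  /* sysfs writes usually end in '\n' */
        if (*end != '\0')
                return -EINVAL;         /* trailing junk */
        *res = val;
        return 0;
}

int main(void)
{
        unsigned long msecs;

        if (parse_ulong("10000\n", 10, &msecs) == 0 && msecs <= UINT_MAX)
                printf("parsed %lu\n", msecs);
        return 0;
}
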
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b60f33080a28..b49579c7f2a5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -21,6 +21,7 @@
21#include <linux/rmap.h> 21#include <linux/rmap.h>
22#include <linux/swap.h> 22#include <linux/swap.h>
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/page-isolation.h>
24 25
25#include <asm/page.h> 26#include <asm/page.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
@@ -33,7 +34,6 @@
33#include "internal.h" 34#include "internal.h"
34 35
35const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 36const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
36static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
37unsigned long hugepages_treat_as_movable; 37unsigned long hugepages_treat_as_movable;
38 38
39int hugetlb_max_hstate __read_mostly; 39int hugetlb_max_hstate __read_mostly;
@@ -48,7 +48,8 @@ static unsigned long __initdata default_hstate_max_huge_pages;
48static unsigned long __initdata default_hstate_size; 48static unsigned long __initdata default_hstate_size;
49 49
50/* 50/*
51 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 51 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
52 * free_huge_pages, and surplus_huge_pages.
52 */ 53 */
53DEFINE_SPINLOCK(hugetlb_lock); 54DEFINE_SPINLOCK(hugetlb_lock);
54 55
@@ -135,9 +136,9 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
135 * across the pages in a mapping. 136 * across the pages in a mapping.
136 * 137 *
137 * The region data structures are protected by a combination of the mmap_sem 138 * The region data structures are protected by a combination of the mmap_sem
138 * and the hugetlb_instantion_mutex. To access or modify a region the caller 139 * and the hugetlb_instantiation_mutex. To access or modify a region the caller
139 * must either hold the mmap_sem for write, or the mmap_sem for read and 140 * must either hold the mmap_sem for write, or the mmap_sem for read and
140 * the hugetlb_instantiation mutex: 141 * the hugetlb_instantiation_mutex:
141 * 142 *
142 * down_write(&mm->mmap_sem); 143 * down_write(&mm->mmap_sem);
143 * or 144 * or
@@ -434,25 +435,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
434 return (get_vma_private_data(vma) & flag) != 0; 435 return (get_vma_private_data(vma) & flag) != 0;
435} 436}
436 437
437/* Decrement the reserved pages in the hugepage pool by one */
438static void decrement_hugepage_resv_vma(struct hstate *h,
439 struct vm_area_struct *vma)
440{
441 if (vma->vm_flags & VM_NORESERVE)
442 return;
443
444 if (vma->vm_flags & VM_MAYSHARE) {
445 /* Shared mappings always use reserves */
446 h->resv_huge_pages--;
447 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
448 /*
449 * Only the process that called mmap() has reserves for
450 * private mappings.
451 */
452 h->resv_huge_pages--;
453 }
454}
455
456/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ 438/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
457void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 439void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
458{ 440{
@@ -462,12 +444,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
462} 444}
463 445
464/* Returns true if the VMA has associated reserve pages */ 446/* Returns true if the VMA has associated reserve pages */
465static int vma_has_reserves(struct vm_area_struct *vma) 447static int vma_has_reserves(struct vm_area_struct *vma, long chg)
466{ 448{
449 if (vma->vm_flags & VM_NORESERVE) {
450 /*
451 * This address is already reserved by other process(chg == 0),
452 * so, we should decrement reserved count. Without decrementing,
453 * reserve count remains after releasing inode, because this
454 * allocated page will go into page cache and is regarded as
455 * coming from reserved pool in releasing step. Currently, we
456 * don't have any other solution to deal with this situation
457 * properly, so add work-around here.
458 */
459 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
460 return 1;
461 else
462 return 0;
463 }
464
465 /* Shared mappings always use reserves */
467 if (vma->vm_flags & VM_MAYSHARE) 466 if (vma->vm_flags & VM_MAYSHARE)
468 return 1; 467 return 1;
468
469 /*
470 * Only the process that called mmap() has reserves for
471 * private mappings.
472 */
469 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) 473 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
470 return 1; 474 return 1;
475
471 return 0; 476 return 0;
472} 477}
473 478
@@ -517,9 +522,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
517{ 522{
518 struct page *page; 523 struct page *page;
519 524
520 if (list_empty(&h->hugepage_freelists[nid])) 525 list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
526 if (!is_migrate_isolate_page(page))
527 break;
528 /*
529 * if 'non-isolated free hugepage' not found on the list,
530 * the allocation fails.
531 */
532 if (&h->hugepage_freelists[nid] == &page->lru)
521 return NULL; 533 return NULL;
522 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
523 list_move(&page->lru, &h->hugepage_activelist); 534 list_move(&page->lru, &h->hugepage_activelist);
524 set_page_refcounted(page); 535 set_page_refcounted(page);
525 h->free_huge_pages--; 536 h->free_huge_pages--;
@@ -527,9 +538,19 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
527 return page; 538 return page;
528} 539}
529 540
541/* Movability of hugepages depends on migration support. */
542static inline gfp_t htlb_alloc_mask(struct hstate *h)
543{
544 if (hugepages_treat_as_movable || hugepage_migration_support(h))
545 return GFP_HIGHUSER_MOVABLE;
546 else
547 return GFP_HIGHUSER;
548}
549
530static struct page *dequeue_huge_page_vma(struct hstate *h, 550static struct page *dequeue_huge_page_vma(struct hstate *h,
531 struct vm_area_struct *vma, 551 struct vm_area_struct *vma,
532 unsigned long address, int avoid_reserve) 552 unsigned long address, int avoid_reserve,
553 long chg)
533{ 554{
534 struct page *page = NULL; 555 struct page *page = NULL;
535 struct mempolicy *mpol; 556 struct mempolicy *mpol;
@@ -539,16 +560,12 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
539 struct zoneref *z; 560 struct zoneref *z;
540 unsigned int cpuset_mems_cookie; 561 unsigned int cpuset_mems_cookie;
541 562
542retry_cpuset:
543 cpuset_mems_cookie = get_mems_allowed();
544 zonelist = huge_zonelist(vma, address,
545 htlb_alloc_mask, &mpol, &nodemask);
546 /* 563 /*
547 * A child process with MAP_PRIVATE mappings created by their parent 564 * A child process with MAP_PRIVATE mappings created by their parent
548 * have no page reserves. This check ensures that reservations are 565 * have no page reserves. This check ensures that reservations are
549 * not "stolen". The child may still get SIGKILLed 566 * not "stolen". The child may still get SIGKILLed
550 */ 567 */
551 if (!vma_has_reserves(vma) && 568 if (!vma_has_reserves(vma, chg) &&
552 h->free_huge_pages - h->resv_huge_pages == 0) 569 h->free_huge_pages - h->resv_huge_pages == 0)
553 goto err; 570 goto err;
554 571
@@ -556,13 +573,23 @@ retry_cpuset:
556 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 573 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
557 goto err; 574 goto err;
558 575
576retry_cpuset:
577 cpuset_mems_cookie = get_mems_allowed();
578 zonelist = huge_zonelist(vma, address,
579 htlb_alloc_mask(h), &mpol, &nodemask);
580
559 for_each_zone_zonelist_nodemask(zone, z, zonelist, 581 for_each_zone_zonelist_nodemask(zone, z, zonelist,
560 MAX_NR_ZONES - 1, nodemask) { 582 MAX_NR_ZONES - 1, nodemask) {
561 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { 583 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) {
562 page = dequeue_huge_page_node(h, zone_to_nid(zone)); 584 page = dequeue_huge_page_node(h, zone_to_nid(zone));
563 if (page) { 585 if (page) {
564 if (!avoid_reserve) 586 if (avoid_reserve)
565 decrement_hugepage_resv_vma(h, vma); 587 break;
588 if (!vma_has_reserves(vma, chg))
589 break;
590
591 SetPagePrivate(page);
592 h->resv_huge_pages--;
566 break; 593 break;
567 } 594 }
568 } 595 }
@@ -574,7 +601,6 @@ retry_cpuset:
574 return page; 601 return page;
575 602
576err: 603err:
577 mpol_cond_put(mpol);
578 return NULL; 604 return NULL;
579} 605}
580 606
@@ -620,15 +646,20 @@ static void free_huge_page(struct page *page)
620 int nid = page_to_nid(page); 646 int nid = page_to_nid(page);
621 struct hugepage_subpool *spool = 647 struct hugepage_subpool *spool =
622 (struct hugepage_subpool *)page_private(page); 648 (struct hugepage_subpool *)page_private(page);
649 bool restore_reserve;
623 650
624 set_page_private(page, 0); 651 set_page_private(page, 0);
625 page->mapping = NULL; 652 page->mapping = NULL;
626 BUG_ON(page_count(page)); 653 BUG_ON(page_count(page));
627 BUG_ON(page_mapcount(page)); 654 BUG_ON(page_mapcount(page));
655 restore_reserve = PagePrivate(page);
628 656
629 spin_lock(&hugetlb_lock); 657 spin_lock(&hugetlb_lock);
630 hugetlb_cgroup_uncharge_page(hstate_index(h), 658 hugetlb_cgroup_uncharge_page(hstate_index(h),
631 pages_per_huge_page(h), page); 659 pages_per_huge_page(h), page);
660 if (restore_reserve)
661 h->resv_huge_pages++;
662
632 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { 663 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
633 /* remove the page from active list */ 664 /* remove the page from active list */
634 list_del(&page->lru); 665 list_del(&page->lru);
@@ -715,7 +746,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
715 return NULL; 746 return NULL;
716 747
717 page = alloc_pages_exact_node(nid, 748 page = alloc_pages_exact_node(nid,
718 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 749 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
719 __GFP_REPEAT|__GFP_NOWARN, 750 __GFP_REPEAT|__GFP_NOWARN,
720 huge_page_order(h)); 751 huge_page_order(h));
721 if (page) { 752 if (page) {
@@ -772,33 +803,6 @@ static int hstate_next_node_to_alloc(struct hstate *h,
772 return nid; 803 return nid;
773} 804}
774 805
775static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
776{
777 struct page *page;
778 int start_nid;
779 int next_nid;
780 int ret = 0;
781
782 start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
783 next_nid = start_nid;
784
785 do {
786 page = alloc_fresh_huge_page_node(h, next_nid);
787 if (page) {
788 ret = 1;
789 break;
790 }
791 next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
792 } while (next_nid != start_nid);
793
794 if (ret)
795 count_vm_event(HTLB_BUDDY_PGALLOC);
796 else
797 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
798
799 return ret;
800}
801
802/* 806/*
803 * helper for free_pool_huge_page() - return the previously saved 807 * helper for free_pool_huge_page() - return the previously saved
804 * node ["this node"] from which to free a huge page. Advance the 808 * node ["this node"] from which to free a huge page. Advance the
@@ -817,6 +821,40 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
817 return nid; 821 return nid;
818} 822}
819 823
824#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \
825 for (nr_nodes = nodes_weight(*mask); \
826 nr_nodes > 0 && \
827 ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \
828 nr_nodes--)
829
830#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \
831 for (nr_nodes = nodes_weight(*mask); \
832 nr_nodes > 0 && \
833 ((node = hstate_next_node_to_free(hs, mask)) || 1); \
834 nr_nodes--)
835
836static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
837{
838 struct page *page;
839 int nr_nodes, node;
840 int ret = 0;
841
842 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
843 page = alloc_fresh_huge_page_node(h, node);
844 if (page) {
845 ret = 1;
846 break;
847 }
848 }
849
850 if (ret)
851 count_vm_event(HTLB_BUDDY_PGALLOC);
852 else
853 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
854
855 return ret;
856}
857
820/* 858/*
821 * Free huge page from pool from next node to free. 859 * Free huge page from pool from next node to free.
822 * Attempt to keep persistent huge pages more or less 860 * Attempt to keep persistent huge pages more or less
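
The for_each_node_mask_to_alloc()/for_each_node_mask_to_free() helpers above hide the round-robin walk that the old start_nid/next_nid loops spelled out. A minimal standalone sketch of the same pattern, with an invented rr_cursor/rr_next in place of the hstate bookkeeping:

#include <stdio.h>

/*
 * Minimal userspace analogue of the round-robin node walk: a persistent
 * cursor advances through a fixed set of "nodes", each walk visits every
 * node at most once, and the next walk resumes where the previous one
 * stopped.
 */
struct rr_cursor {
        int next;                       /* index handed out next */
};

static int rr_next(struct rr_cursor *c, const int *nodes, int nr)
{
        int node = nodes[c->next];

        c->next = (c->next + 1) % nr;   /* wrap around */
        return node;
}

#define for_each_node_round_robin(node, cur, nodes, nr, left)          \
        for ((left) = (nr);                                            \
             (left) > 0 && (((node) = rr_next((cur), (nodes), (nr))) || 1); \
             (left)--)

int main(void)
{
        const int nodes[] = { 0, 1, 3 };
        struct rr_cursor cur = { .next = 0 };
        int node, left;

        for_each_node_round_robin(node, &cur, nodes, 3, left) {
                printf("try node %d\n", node);
                if (node == 1)          /* pretend the allocation succeeded */
                        break;
        }
        /* The next walk resumes at the node after the last one tried. */
        for_each_node_round_robin(node, &cur, nodes, 3, left)
                printf("second pass: node %d\n", node);
        return 0;
}

The "|| 1" in the loop condition mirrors the kernel macros: it keeps the condition true even when the node id just assigned happens to be 0.
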
@@ -826,40 +864,73 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
826static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, 864static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
827 bool acct_surplus) 865 bool acct_surplus)
828{ 866{
829 int start_nid; 867 int nr_nodes, node;
830 int next_nid;
831 int ret = 0; 868 int ret = 0;
832 869
833 start_nid = hstate_next_node_to_free(h, nodes_allowed); 870 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
834 next_nid = start_nid;
835
836 do {
837 /* 871 /*
838 * If we're returning unused surplus pages, only examine 872 * If we're returning unused surplus pages, only examine
839 * nodes with surplus pages. 873 * nodes with surplus pages.
840 */ 874 */
841 if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && 875 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
842 !list_empty(&h->hugepage_freelists[next_nid])) { 876 !list_empty(&h->hugepage_freelists[node])) {
843 struct page *page = 877 struct page *page =
844 list_entry(h->hugepage_freelists[next_nid].next, 878 list_entry(h->hugepage_freelists[node].next,
845 struct page, lru); 879 struct page, lru);
846 list_del(&page->lru); 880 list_del(&page->lru);
847 h->free_huge_pages--; 881 h->free_huge_pages--;
848 h->free_huge_pages_node[next_nid]--; 882 h->free_huge_pages_node[node]--;
849 if (acct_surplus) { 883 if (acct_surplus) {
850 h->surplus_huge_pages--; 884 h->surplus_huge_pages--;
851 h->surplus_huge_pages_node[next_nid]--; 885 h->surplus_huge_pages_node[node]--;
852 } 886 }
853 update_and_free_page(h, page); 887 update_and_free_page(h, page);
854 ret = 1; 888 ret = 1;
855 break; 889 break;
856 } 890 }
857 next_nid = hstate_next_node_to_free(h, nodes_allowed); 891 }
858 } while (next_nid != start_nid);
859 892
860 return ret; 893 return ret;
861} 894}
862 895
896/*
897 * Dissolve a given free hugepage into free buddy pages. This function does
898 * nothing for in-use (including surplus) hugepages.
899 */
900static void dissolve_free_huge_page(struct page *page)
901{
902 spin_lock(&hugetlb_lock);
903 if (PageHuge(page) && !page_count(page)) {
904 struct hstate *h = page_hstate(page);
905 int nid = page_to_nid(page);
906 list_del(&page->lru);
907 h->free_huge_pages--;
908 h->free_huge_pages_node[nid]--;
909 update_and_free_page(h, page);
910 }
911 spin_unlock(&hugetlb_lock);
912}
913
914/*
915 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
916 * make specified memory blocks removable from the system.
917 * Note that start_pfn should aligned with (minimum) hugepage size.
918 */
919void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
920{
921 unsigned int order = 8 * sizeof(void *);
922 unsigned long pfn;
923 struct hstate *h;
924
925 /* Set scan step to minimum hugepage size */
926 for_each_hstate(h)
927 if (order > huge_page_order(h))
928 order = huge_page_order(h);
929 VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order));
930 for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order)
931 dissolve_free_huge_page(pfn_to_page(pfn));
932}
933
863static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) 934static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
864{ 935{
865 struct page *page; 936 struct page *page;
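
dissolve_free_huge_pages() above walks the pfn range at the granularity of the smallest configured huge page. A standalone sketch of that step and alignment computation, using example orders for 4 KB base pages (2 MB and 1 GB huge pages) rather than real hstate data:

#include <assert.h>
#include <stdio.h>

#define IS_ALIGNED(x, a)        (((x) & ((a) - 1)) == 0)

int main(void)
{
        const unsigned int orders[] = { 9, 18 };        /* 2 MB, 1 GB in 4 KB pages */
        unsigned int order = 8 * sizeof(void *);        /* oversized sentinel, as above */
        unsigned long start_pfn = 0x40000, end_pfn = 0x40000 + 4096;
        unsigned long pfn;

        /* Pick the smallest order as the scan step. */
        for (unsigned int i = 0; i < 2; i++)
                if (order > orders[i])
                        order = orders[i];

        assert(IS_ALIGNED(start_pfn, 1UL << order));
        for (pfn = start_pfn; pfn < end_pfn; pfn += 1UL << order)
                printf("would dissolve free huge page at pfn %#lx\n", pfn);
        return 0;
}
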
@@ -902,12 +973,12 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
902 spin_unlock(&hugetlb_lock); 973 spin_unlock(&hugetlb_lock);
903 974
904 if (nid == NUMA_NO_NODE) 975 if (nid == NUMA_NO_NODE)
905 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 976 page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
906 __GFP_REPEAT|__GFP_NOWARN, 977 __GFP_REPEAT|__GFP_NOWARN,
907 huge_page_order(h)); 978 huge_page_order(h));
908 else 979 else
909 page = alloc_pages_exact_node(nid, 980 page = alloc_pages_exact_node(nid,
910 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 981 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
911 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); 982 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
912 983
913 if (page && arch_prepare_hugepage(page)) { 984 if (page && arch_prepare_hugepage(page)) {
@@ -944,10 +1015,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
944 */ 1015 */
945struct page *alloc_huge_page_node(struct hstate *h, int nid) 1016struct page *alloc_huge_page_node(struct hstate *h, int nid)
946{ 1017{
947 struct page *page; 1018 struct page *page = NULL;
948 1019
949 spin_lock(&hugetlb_lock); 1020 spin_lock(&hugetlb_lock);
950 page = dequeue_huge_page_node(h, nid); 1021 if (h->free_huge_pages - h->resv_huge_pages > 0)
1022 page = dequeue_huge_page_node(h, nid);
951 spin_unlock(&hugetlb_lock); 1023 spin_unlock(&hugetlb_lock);
952 1024
953 if (!page) 1025 if (!page)
@@ -1035,11 +1107,8 @@ free:
1035 spin_unlock(&hugetlb_lock); 1107 spin_unlock(&hugetlb_lock);
1036 1108
1037 /* Free unnecessary surplus pages to the buddy allocator */ 1109 /* Free unnecessary surplus pages to the buddy allocator */
1038 if (!list_empty(&surplus_list)) { 1110 list_for_each_entry_safe(page, tmp, &surplus_list, lru)
1039 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1111 put_page(page);
1040 put_page(page);
1041 }
1042 }
1043 spin_lock(&hugetlb_lock); 1112 spin_lock(&hugetlb_lock);
1044 1113
1045 return ret; 1114 return ret;
@@ -1106,9 +1175,9 @@ static long vma_needs_reservation(struct hstate *h,
1106 } else { 1175 } else {
1107 long err; 1176 long err;
1108 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 1177 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1109 struct resv_map *reservations = vma_resv_map(vma); 1178 struct resv_map *resv = vma_resv_map(vma);
1110 1179
1111 err = region_chg(&reservations->regions, idx, idx + 1); 1180 err = region_chg(&resv->regions, idx, idx + 1);
1112 if (err < 0) 1181 if (err < 0)
1113 return err; 1182 return err;
1114 return 0; 1183 return 0;
@@ -1126,10 +1195,10 @@ static void vma_commit_reservation(struct hstate *h,
1126 1195
1127 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { 1196 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
1128 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 1197 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
1129 struct resv_map *reservations = vma_resv_map(vma); 1198 struct resv_map *resv = vma_resv_map(vma);
1130 1199
1131 /* Mark this page used in the map. */ 1200 /* Mark this page used in the map. */
1132 region_add(&reservations->regions, idx, idx + 1); 1201 region_add(&resv->regions, idx, idx + 1);
1133 } 1202 }
1134} 1203}
1135 1204
@@ -1155,38 +1224,35 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1155 chg = vma_needs_reservation(h, vma, addr); 1224 chg = vma_needs_reservation(h, vma, addr);
1156 if (chg < 0) 1225 if (chg < 0)
1157 return ERR_PTR(-ENOMEM); 1226 return ERR_PTR(-ENOMEM);
1158 if (chg) 1227 if (chg || avoid_reserve)
1159 if (hugepage_subpool_get_pages(spool, chg)) 1228 if (hugepage_subpool_get_pages(spool, 1))
1160 return ERR_PTR(-ENOSPC); 1229 return ERR_PTR(-ENOSPC);
1161 1230
1162 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); 1231 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1163 if (ret) { 1232 if (ret) {
1164 hugepage_subpool_put_pages(spool, chg); 1233 if (chg || avoid_reserve)
1234 hugepage_subpool_put_pages(spool, 1);
1165 return ERR_PTR(-ENOSPC); 1235 return ERR_PTR(-ENOSPC);
1166 } 1236 }
1167 spin_lock(&hugetlb_lock); 1237 spin_lock(&hugetlb_lock);
1168 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); 1238 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
1169 if (page) { 1239 if (!page) {
1170 /* update page cgroup details */
1171 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1172 h_cg, page);
1173 spin_unlock(&hugetlb_lock);
1174 } else {
1175 spin_unlock(&hugetlb_lock); 1240 spin_unlock(&hugetlb_lock);
1176 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1241 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1177 if (!page) { 1242 if (!page) {
1178 hugetlb_cgroup_uncharge_cgroup(idx, 1243 hugetlb_cgroup_uncharge_cgroup(idx,
1179 pages_per_huge_page(h), 1244 pages_per_huge_page(h),
1180 h_cg); 1245 h_cg);
1181 hugepage_subpool_put_pages(spool, chg); 1246 if (chg || avoid_reserve)
1247 hugepage_subpool_put_pages(spool, 1);
1182 return ERR_PTR(-ENOSPC); 1248 return ERR_PTR(-ENOSPC);
1183 } 1249 }
1184 spin_lock(&hugetlb_lock); 1250 spin_lock(&hugetlb_lock);
1185 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1186 h_cg, page);
1187 list_move(&page->lru, &h->hugepage_activelist); 1251 list_move(&page->lru, &h->hugepage_activelist);
1188 spin_unlock(&hugetlb_lock); 1252 /* Fall through */
1189 } 1253 }
1254 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
1255 spin_unlock(&hugetlb_lock);
1190 1256
1191 set_page_private(page, (unsigned long)spool); 1257 set_page_private(page, (unsigned long)spool);
1192 1258
@@ -1194,17 +1260,29 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1194 return page; 1260 return page;
1195} 1261}
1196 1262
1263/*
1264 * alloc_huge_page()'s wrapper which simply returns the page if allocation
1265 * succeeds, otherwise NULL. This function is called from new_vma_page(),
1266 * where no ERR_VALUE is expected to be returned.
1267 */
1268struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
1269 unsigned long addr, int avoid_reserve)
1270{
1271 struct page *page = alloc_huge_page(vma, addr, avoid_reserve);
1272 if (IS_ERR(page))
1273 page = NULL;
1274 return page;
1275}
1276
1197int __weak alloc_bootmem_huge_page(struct hstate *h) 1277int __weak alloc_bootmem_huge_page(struct hstate *h)
1198{ 1278{
1199 struct huge_bootmem_page *m; 1279 struct huge_bootmem_page *m;
1200 int nr_nodes = nodes_weight(node_states[N_MEMORY]); 1280 int nr_nodes, node;
1201 1281
1202 while (nr_nodes) { 1282 for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
1203 void *addr; 1283 void *addr;
1204 1284
1205 addr = __alloc_bootmem_node_nopanic( 1285 addr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
1206 NODE_DATA(hstate_next_node_to_alloc(h,
1207 &node_states[N_MEMORY])),
1208 huge_page_size(h), huge_page_size(h), 0); 1286 huge_page_size(h), huge_page_size(h), 0);
1209 1287
1210 if (addr) { 1288 if (addr) {
@@ -1216,7 +1294,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
1216 m = addr; 1294 m = addr;
1217 goto found; 1295 goto found;
1218 } 1296 }
1219 nr_nodes--;
1220 } 1297 }
1221 return 0; 1298 return 0;
1222 1299
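
alloc_huge_page_noerr() converts the ERR_PTR-style return of alloc_huge_page() into a plain pointer-or-NULL for callers such as new_vma_page(). A userspace re-creation of that convention, with alloc_thing/alloc_thing_noerr as invented stand-ins:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * The kernel encodes errors as pointers in the last page of the address
 * space, so one return value can carry either a valid pointer or a
 * -errno code.
 */
#define MAX_ERRNO       4095
#define ERR_PTR(err)    ((void *)(intptr_t)(err))
#define PTR_ERR(ptr)    ((long)(intptr_t)(ptr))
#define IS_ERR(ptr)     ((uintptr_t)(ptr) >= (uintptr_t)-MAX_ERRNO)

static void *alloc_thing(int fail)
{
        if (fail)
                return ERR_PTR(-ENOSPC);
        return malloc(64);
}

/*
 * Wrapper in the style of alloc_huge_page_noerr(): callers that only
 * care about success or failure get a plain pointer or NULL.
 */
static void *alloc_thing_noerr(int fail)
{
        void *p = alloc_thing(fail);

        return IS_ERR(p) ? NULL : p;
}

int main(void)
{
        void *p = alloc_thing(1);

        if (IS_ERR(p))
                printf("alloc_thing failed: %ld\n", PTR_ERR(p));
        printf("noerr wrapper returned %p\n", alloc_thing_noerr(1));
        free(alloc_thing_noerr(0));
        return 0;
}
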
@@ -1355,48 +1432,28 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count,
1355static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, 1432static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
1356 int delta) 1433 int delta)
1357{ 1434{
1358 int start_nid, next_nid; 1435 int nr_nodes, node;
1359 int ret = 0;
1360 1436
1361 VM_BUG_ON(delta != -1 && delta != 1); 1437 VM_BUG_ON(delta != -1 && delta != 1);
1362 1438
1363 if (delta < 0) 1439 if (delta < 0) {
1364 start_nid = hstate_next_node_to_alloc(h, nodes_allowed); 1440 for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
1365 else 1441 if (h->surplus_huge_pages_node[node])
1366 start_nid = hstate_next_node_to_free(h, nodes_allowed); 1442 goto found;
1367 next_nid = start_nid;
1368
1369 do {
1370 int nid = next_nid;
1371 if (delta < 0) {
1372 /*
1373 * To shrink on this node, there must be a surplus page
1374 */
1375 if (!h->surplus_huge_pages_node[nid]) {
1376 next_nid = hstate_next_node_to_alloc(h,
1377 nodes_allowed);
1378 continue;
1379 }
1380 } 1443 }
1381 if (delta > 0) { 1444 } else {
1382 /* 1445 for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
1383 * Surplus cannot exceed the total number of pages 1446 if (h->surplus_huge_pages_node[node] <
1384 */ 1447 h->nr_huge_pages_node[node])
1385 if (h->surplus_huge_pages_node[nid] >= 1448 goto found;
1386 h->nr_huge_pages_node[nid]) {
1387 next_nid = hstate_next_node_to_free(h,
1388 nodes_allowed);
1389 continue;
1390 }
1391 } 1449 }
1450 }
1451 return 0;
1392 1452
1393 h->surplus_huge_pages += delta; 1453found:
1394 h->surplus_huge_pages_node[nid] += delta; 1454 h->surplus_huge_pages += delta;
1395 ret = 1; 1455 h->surplus_huge_pages_node[node] += delta;
1396 break; 1456 return 1;
1397 } while (next_nid != start_nid);
1398
1399 return ret;
1400} 1457}
1401 1458
1402#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) 1459#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
@@ -1526,7 +1583,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1526 struct hstate *h; 1583 struct hstate *h;
1527 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); 1584 NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
1528 1585
1529 err = strict_strtoul(buf, 10, &count); 1586 err = kstrtoul(buf, 10, &count);
1530 if (err) 1587 if (err)
1531 goto out; 1588 goto out;
1532 1589
@@ -1617,7 +1674,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1617 if (h->order >= MAX_ORDER) 1674 if (h->order >= MAX_ORDER)
1618 return -EINVAL; 1675 return -EINVAL;
1619 1676
1620 err = strict_strtoul(buf, 10, &input); 1677 err = kstrtoul(buf, 10, &input);
1621 if (err) 1678 if (err)
1622 return err; 1679 return err;
1623 1680
@@ -2068,18 +2125,6 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
2068} 2125}
2069#endif /* CONFIG_NUMA */ 2126#endif /* CONFIG_NUMA */
2070 2127
2071int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
2072 void __user *buffer,
2073 size_t *length, loff_t *ppos)
2074{
2075 proc_dointvec(table, write, buffer, length, ppos);
2076 if (hugepages_treat_as_movable)
2077 htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
2078 else
2079 htlb_alloc_mask = GFP_HIGHUSER;
2080 return 0;
2081}
2082
2083int hugetlb_overcommit_handler(struct ctl_table *table, int write, 2128int hugetlb_overcommit_handler(struct ctl_table *table, int write,
2084 void __user *buffer, 2129 void __user *buffer,
2085 size_t *length, loff_t *ppos) 2130 size_t *length, loff_t *ppos)
@@ -2207,7 +2252,7 @@ out:
2207 2252
2208static void hugetlb_vm_op_open(struct vm_area_struct *vma) 2253static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2209{ 2254{
2210 struct resv_map *reservations = vma_resv_map(vma); 2255 struct resv_map *resv = vma_resv_map(vma);
2211 2256
2212 /* 2257 /*
2213 * This new VMA should share its siblings reservation map if present. 2258 * This new VMA should share its siblings reservation map if present.
@@ -2217,34 +2262,34 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2217 * after this open call completes. It is therefore safe to take a 2262 * after this open call completes. It is therefore safe to take a
2218 * new reference here without additional locking. 2263 * new reference here without additional locking.
2219 */ 2264 */
2220 if (reservations) 2265 if (resv)
2221 kref_get(&reservations->refs); 2266 kref_get(&resv->refs);
2222} 2267}
2223 2268
2224static void resv_map_put(struct vm_area_struct *vma) 2269static void resv_map_put(struct vm_area_struct *vma)
2225{ 2270{
2226 struct resv_map *reservations = vma_resv_map(vma); 2271 struct resv_map *resv = vma_resv_map(vma);
2227 2272
2228 if (!reservations) 2273 if (!resv)
2229 return; 2274 return;
2230 kref_put(&reservations->refs, resv_map_release); 2275 kref_put(&resv->refs, resv_map_release);
2231} 2276}
2232 2277
2233static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2278static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2234{ 2279{
2235 struct hstate *h = hstate_vma(vma); 2280 struct hstate *h = hstate_vma(vma);
2236 struct resv_map *reservations = vma_resv_map(vma); 2281 struct resv_map *resv = vma_resv_map(vma);
2237 struct hugepage_subpool *spool = subpool_vma(vma); 2282 struct hugepage_subpool *spool = subpool_vma(vma);
2238 unsigned long reserve; 2283 unsigned long reserve;
2239 unsigned long start; 2284 unsigned long start;
2240 unsigned long end; 2285 unsigned long end;
2241 2286
2242 if (reservations) { 2287 if (resv) {
2243 start = vma_hugecache_offset(h, vma, vma->vm_start); 2288 start = vma_hugecache_offset(h, vma, vma->vm_start);
2244 end = vma_hugecache_offset(h, vma, vma->vm_end); 2289 end = vma_hugecache_offset(h, vma, vma->vm_end);
2245 2290
2246 reserve = (end - start) - 2291 reserve = (end - start) -
2247 region_count(&reservations->regions, start, end); 2292 region_count(&resv->regions, start, end);
2248 2293
2249 resv_map_put(vma); 2294 resv_map_put(vma);
2250 2295
@@ -2557,7 +2602,6 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2557{ 2602{
2558 struct hstate *h = hstate_vma(vma); 2603 struct hstate *h = hstate_vma(vma);
2559 struct page *old_page, *new_page; 2604 struct page *old_page, *new_page;
2560 int avoidcopy;
2561 int outside_reserve = 0; 2605 int outside_reserve = 0;
2562 unsigned long mmun_start; /* For mmu_notifiers */ 2606 unsigned long mmun_start; /* For mmu_notifiers */
2563 unsigned long mmun_end; /* For mmu_notifiers */ 2607 unsigned long mmun_end; /* For mmu_notifiers */
@@ -2567,10 +2611,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2567retry_avoidcopy: 2611retry_avoidcopy:
2568 /* If no-one else is actually using this page, avoid the copy 2612 /* If no-one else is actually using this page, avoid the copy
2569 * and just make the page writable */ 2613 * and just make the page writable */
2570 avoidcopy = (page_mapcount(old_page) == 1); 2614 if (page_mapcount(old_page) == 1 && PageAnon(old_page)) {
2571 if (avoidcopy) { 2615 page_move_anon_rmap(old_page, vma, address);
2572 if (PageAnon(old_page))
2573 page_move_anon_rmap(old_page, vma, address);
2574 set_huge_ptep_writable(vma, address, ptep); 2616 set_huge_ptep_writable(vma, address, ptep);
2575 return 0; 2617 return 0;
2576 } 2618 }
@@ -2584,8 +2626,7 @@ retry_avoidcopy:
2584 * at the time of fork() could consume its reserves on COW instead 2626 * at the time of fork() could consume its reserves on COW instead
2585 * of the full address range. 2627 * of the full address range.
2586 */ 2628 */
2587 if (!(vma->vm_flags & VM_MAYSHARE) && 2629 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
2588 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
2589 old_page != pagecache_page) 2630 old_page != pagecache_page)
2590 outside_reserve = 1; 2631 outside_reserve = 1;
2591 2632
@@ -2657,6 +2698,8 @@ retry_avoidcopy:
2657 spin_lock(&mm->page_table_lock); 2698 spin_lock(&mm->page_table_lock);
2658 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2699 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2659 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2700 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2701 ClearPagePrivate(new_page);
2702
2660 /* Break COW */ 2703 /* Break COW */
2661 huge_ptep_clear_flush(vma, address, ptep); 2704 huge_ptep_clear_flush(vma, address, ptep);
2662 set_huge_pte_at(mm, address, ptep, 2705 set_huge_pte_at(mm, address, ptep,
@@ -2668,10 +2711,11 @@ retry_avoidcopy:
2668 } 2711 }
2669 spin_unlock(&mm->page_table_lock); 2712 spin_unlock(&mm->page_table_lock);
2670 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2713 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2671 /* Caller expects lock to be held */
2672 spin_lock(&mm->page_table_lock);
2673 page_cache_release(new_page); 2714 page_cache_release(new_page);
2674 page_cache_release(old_page); 2715 page_cache_release(old_page);
2716
2717 /* Caller expects lock to be held */
2718 spin_lock(&mm->page_table_lock);
2675 return 0; 2719 return 0;
2676} 2720}
2677 2721
@@ -2767,6 +2811,7 @@ retry:
2767 goto retry; 2811 goto retry;
2768 goto out; 2812 goto out;
2769 } 2813 }
2814 ClearPagePrivate(page);
2770 2815
2771 spin_lock(&inode->i_lock); 2816 spin_lock(&inode->i_lock);
2772 inode->i_blocks += blocks_per_huge_page(h); 2817 inode->i_blocks += blocks_per_huge_page(h);
@@ -2813,8 +2858,10 @@ retry:
2813 if (!huge_pte_none(huge_ptep_get(ptep))) 2858 if (!huge_pte_none(huge_ptep_get(ptep)))
2814 goto backout; 2859 goto backout;
2815 2860
2816 if (anon_rmap) 2861 if (anon_rmap) {
2862 ClearPagePrivate(page);
2817 hugepage_add_new_anon_rmap(page, vma, address); 2863 hugepage_add_new_anon_rmap(page, vma, address);
2864 }
2818 else 2865 else
2819 page_dup_rmap(page); 2866 page_dup_rmap(page);
2820 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 2867 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
@@ -3431,3 +3478,45 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
3431 return ret; 3478 return ret;
3432} 3479}
3433#endif 3480#endif
3481
3482bool isolate_huge_page(struct page *page, struct list_head *list)
3483{
3484 VM_BUG_ON(!PageHead(page));
3485 if (!get_page_unless_zero(page))
3486 return false;
3487 spin_lock(&hugetlb_lock);
3488 list_move_tail(&page->lru, list);
3489 spin_unlock(&hugetlb_lock);
3490 return true;
3491}
3492
3493void putback_active_hugepage(struct page *page)
3494{
3495 VM_BUG_ON(!PageHead(page));
3496 spin_lock(&hugetlb_lock);
3497 list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
3498 spin_unlock(&hugetlb_lock);
3499 put_page(page);
3500}
3501
3502bool is_hugepage_active(struct page *page)
3503{
3504 VM_BUG_ON(!PageHuge(page));
3505 /*
3506 * This function can be called for a tail page because the caller,
3507 * scan_movable_pages, scans through a given pfn-range which typically
3508 * covers one memory block. In systems using gigantic hugepage (1GB
3509 * for x86_64,) a hugepage is larger than a memory block, and we don't
3510 * support migrating such large hugepages for now, so return false
3511 * when called for tail pages.
3512 */
3513 if (PageTail(page))
3514 return false;
3515 /*
3516 * Refcount of a hwpoisoned hugepages is 1, but they are not active,
3517 * so we should return false for them.
3518 */
3519 if (unlikely(PageHWPoison(page)))
3520 return false;
3521 return page_count(page) > 0;
3522}
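
isolate_huge_page() above only moves a hugepage onto the caller's list if get_page_unless_zero() manages to pin it first. A userspace sketch of that "take a reference only while the refcount is still non-zero" idiom, using C11 atomics instead of the page refcount:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool get_ref_unless_zero(atomic_int *refcount)
{
        int old = atomic_load(refcount);

        while (old != 0) {
                if (atomic_compare_exchange_weak(refcount, &old, old + 1))
                        return true;
                /* old was reloaded by the failed CAS; retry */
        }
        return false;   /* object already dying: do not touch it */
}

int main(void)
{
        atomic_int live = 1, dead = 0;

        printf("live: %d (refcount now %d)\n",
               get_ref_unless_zero(&live), atomic_load(&live));
        printf("dead: %d (refcount now %d)\n",
               get_ref_unless_zero(&dead), atomic_load(&dead));
        return 0;
}
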
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 3a61efc518d5..afc2daa91c60 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -88,12 +88,12 @@ static int pfn_inject_init(void)
88 * hardware status change, hence do not require hardware support. 88 * hardware status change, hence do not require hardware support.
89 * They are mainly for testing hwpoison in software level. 89 * They are mainly for testing hwpoison in software level.
90 */ 90 */
91 dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, 91 dentry = debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir,
92 NULL, &hwpoison_fops); 92 NULL, &hwpoison_fops);
93 if (!dentry) 93 if (!dentry)
94 goto fail; 94 goto fail;
95 95
96 dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, 96 dentry = debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir,
97 NULL, &unpoison_fops); 97 NULL, &unpoison_fops);
98 if (!dentry) 98 if (!dentry)
99 goto fail; 99 goto fail;
diff --git a/mm/internal.h b/mm/internal.h
index 4390ac6c106e..684f7aa9692a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -85,6 +85,8 @@ extern unsigned long highest_memmap_pfn;
85 */ 85 */
86extern int isolate_lru_page(struct page *page); 86extern int isolate_lru_page(struct page *page);
87extern void putback_lru_page(struct page *page); 87extern void putback_lru_page(struct page *page);
88extern unsigned long zone_reclaimable_pages(struct zone *zone);
89extern bool zone_reclaimable(struct zone *zone);
88 90
89/* 91/*
90 * in mm/rmap.c: 92 * in mm/rmap.c:
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c8d7f3110fd0..e126b0ef9ad2 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1639,7 +1639,7 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
1639 else if (strncmp(buf, "scan=", 5) == 0) { 1639 else if (strncmp(buf, "scan=", 5) == 0) {
1640 unsigned long secs; 1640 unsigned long secs;
1641 1641
1642 ret = strict_strtoul(buf + 5, 0, &secs); 1642 ret = kstrtoul(buf + 5, 0, &secs);
1643 if (ret < 0) 1643 if (ret < 0)
1644 goto out; 1644 goto out;
1645 stop_scan_thread(); 1645 stop_scan_thread();
diff --git a/mm/ksm.c b/mm/ksm.c
index b6afe0c440d8..0bea2b262a47 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2194,7 +2194,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj,
2194 unsigned long msecs; 2194 unsigned long msecs;
2195 int err; 2195 int err;
2196 2196
2197 err = strict_strtoul(buf, 10, &msecs); 2197 err = kstrtoul(buf, 10, &msecs);
2198 if (err || msecs > UINT_MAX) 2198 if (err || msecs > UINT_MAX)
2199 return -EINVAL; 2199 return -EINVAL;
2200 2200
@@ -2217,7 +2217,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
2217 int err; 2217 int err;
2218 unsigned long nr_pages; 2218 unsigned long nr_pages;
2219 2219
2220 err = strict_strtoul(buf, 10, &nr_pages); 2220 err = kstrtoul(buf, 10, &nr_pages);
2221 if (err || nr_pages > UINT_MAX) 2221 if (err || nr_pages > UINT_MAX)
2222 return -EINVAL; 2222 return -EINVAL;
2223 2223
@@ -2239,7 +2239,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
2239 int err; 2239 int err;
2240 unsigned long flags; 2240 unsigned long flags;
2241 2241
2242 err = strict_strtoul(buf, 10, &flags); 2242 err = kstrtoul(buf, 10, &flags);
2243 if (err || flags > UINT_MAX) 2243 if (err || flags > UINT_MAX)
2244 return -EINVAL; 2244 return -EINVAL;
2245 if (flags > KSM_RUN_UNMERGE) 2245 if (flags > KSM_RUN_UNMERGE)
diff --git a/mm/madvise.c b/mm/madvise.c
index 7055883e6e25..6975bc812542 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -42,11 +42,11 @@ static int madvise_need_mmap_write(int behavior)
42 * We can potentially split a vm area into separate 42 * We can potentially split a vm area into separate
43 * areas, each area with its own behavior. 43 * areas, each area with its own behavior.
44 */ 44 */
45static long madvise_behavior(struct vm_area_struct * vma, 45static long madvise_behavior(struct vm_area_struct *vma,
46 struct vm_area_struct **prev, 46 struct vm_area_struct **prev,
47 unsigned long start, unsigned long end, int behavior) 47 unsigned long start, unsigned long end, int behavior)
48{ 48{
49 struct mm_struct * mm = vma->vm_mm; 49 struct mm_struct *mm = vma->vm_mm;
50 int error = 0; 50 int error = 0;
51 pgoff_t pgoff; 51 pgoff_t pgoff;
52 unsigned long new_flags = vma->vm_flags; 52 unsigned long new_flags = vma->vm_flags;
@@ -215,8 +215,8 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
215/* 215/*
216 * Schedule all required I/O operations. Do not wait for completion. 216 * Schedule all required I/O operations. Do not wait for completion.
217 */ 217 */
218static long madvise_willneed(struct vm_area_struct * vma, 218static long madvise_willneed(struct vm_area_struct *vma,
219 struct vm_area_struct ** prev, 219 struct vm_area_struct **prev,
220 unsigned long start, unsigned long end) 220 unsigned long start, unsigned long end)
221{ 221{
222 struct file *file = vma->vm_file; 222 struct file *file = vma->vm_file;
@@ -270,8 +270,8 @@ static long madvise_willneed(struct vm_area_struct * vma,
270 * An interface that causes the system to free clean pages and flush 270 * An interface that causes the system to free clean pages and flush
271 * dirty pages is already available as msync(MS_INVALIDATE). 271 * dirty pages is already available as msync(MS_INVALIDATE).
272 */ 272 */
273static long madvise_dontneed(struct vm_area_struct * vma, 273static long madvise_dontneed(struct vm_area_struct *vma,
274 struct vm_area_struct ** prev, 274 struct vm_area_struct **prev,
275 unsigned long start, unsigned long end) 275 unsigned long start, unsigned long end)
276{ 276{
277 *prev = vma; 277 *prev = vma;
@@ -343,29 +343,34 @@ static long madvise_remove(struct vm_area_struct *vma,
343 */ 343 */
344static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) 344static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
345{ 345{
346 int ret = 0;
347
348 if (!capable(CAP_SYS_ADMIN)) 346 if (!capable(CAP_SYS_ADMIN))
349 return -EPERM; 347 return -EPERM;
350 for (; start < end; start += PAGE_SIZE) { 348 for (; start < end; start += PAGE_SIZE) {
351 struct page *p; 349 struct page *p;
352 int ret = get_user_pages_fast(start, 1, 0, &p); 350 int ret;
351
352 ret = get_user_pages_fast(start, 1, 0, &p);
353 if (ret != 1) 353 if (ret != 1)
354 return ret; 354 return ret;
355
356 if (PageHWPoison(p)) {
357 put_page(p);
358 continue;
359 }
355 if (bhv == MADV_SOFT_OFFLINE) { 360 if (bhv == MADV_SOFT_OFFLINE) {
356 printk(KERN_INFO "Soft offlining page %lx at %lx\n", 361 pr_info("Soft offlining page %#lx at %#lx\n",
357 page_to_pfn(p), start); 362 page_to_pfn(p), start);
358 ret = soft_offline_page(p, MF_COUNT_INCREASED); 363 ret = soft_offline_page(p, MF_COUNT_INCREASED);
359 if (ret) 364 if (ret)
360 break; 365 return ret;
361 continue; 366 continue;
362 } 367 }
363 printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", 368 pr_info("Injecting memory failure for page %#lx at %#lx\n",
364 page_to_pfn(p), start); 369 page_to_pfn(p), start);
365 /* Ignore return value for now */ 370 /* Ignore return value for now */
366 memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); 371 memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
367 } 372 }
368 return ret; 373 return 0;
369} 374}
370#endif 375#endif
371 376
@@ -459,7 +464,7 @@ madvise_behavior_valid(int behavior)
459SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 464SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
460{ 465{
461 unsigned long end, tmp; 466 unsigned long end, tmp;
462 struct vm_area_struct * vma, *prev; 467 struct vm_area_struct *vma, *prev;
463 int unmapped_error = 0; 468 int unmapped_error = 0;
464 int error = -EINVAL; 469 int error = -EINVAL;
465 int write; 470 int write;
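
The handler above backs MADV_HWPOISON and MADV_SOFT_OFFLINE in the madvise() system call. A hedged userspace sketch of driving it for testing; the MADV_HWPOISON value is taken from asm-generic/mman-common.h and guarded in case the libc headers do not define it:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_HWPOISON
#define MADV_HWPOISON   100             /* from asm-generic/mman-common.h */
#endif

/*
 * Poison one of our own pages for testing.  Needs CAP_SYS_ADMIN and a
 * kernel built with CONFIG_MEMORY_FAILURE; touching the page afterwards
 * is expected to raise SIGBUS.
 */
int main(void)
{
        long pagesize = sysconf(_SC_PAGESIZE);
        char *p = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        p[0] = 1;                       /* fault the page in first */
        if (madvise(p, pagesize, MADV_HWPOISON) != 0)
                perror("madvise(MADV_HWPOISON)");
        else
                printf("page at %p marked hwpoisoned\n", (void *)p);
        return 0;
}
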
diff --git a/mm/memblock.c b/mm/memblock.c
index a847bfe6f3ba..0ac412a0a7ee 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -914,6 +914,24 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
914 return memblock_search(&memblock.memory, addr) != -1; 914 return memblock_search(&memblock.memory, addr) != -1;
915} 915}
916 916
917#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
918int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
919 unsigned long *start_pfn, unsigned long *end_pfn)
920{
921 struct memblock_type *type = &memblock.memory;
922 int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT);
923
924 if (mid == -1)
925 return -1;
926
927 *start_pfn = type->regions[mid].base >> PAGE_SHIFT;
928 *end_pfn = (type->regions[mid].base + type->regions[mid].size)
929 >> PAGE_SHIFT;
930
931 return type->regions[mid].nid;
932}
933#endif
934
917/** 935/**
918 * memblock_is_region_memory - check if a region is a subset of memory 936 * memblock_is_region_memory - check if a region is a subset of memory
919 * @base: base of region to check 937 * @base: base of region to check
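
memblock_search_pfn_nid() above resolves a pfn to its memblock region via binary search and reports the region's node id and pfn range. A simplified standalone analogue over sorted, non-overlapping regions; struct region and region_search are invented for illustration:

#include <stdio.h>

struct region {
        unsigned long base;     /* first pfn of the region */
        unsigned long size;     /* number of pfns */
        int nid;
};

static int region_search(const struct region *r, int nr, unsigned long pfn)
{
        int lo = 0, hi = nr;

        while (lo < hi) {
                int mid = lo + (hi - lo) / 2;

                if (pfn < r[mid].base)
                        hi = mid;
                else if (pfn >= r[mid].base + r[mid].size)
                        lo = mid + 1;
                else
                        return mid;     /* pfn falls inside this region */
        }
        return -1;                      /* not covered by any region */
}

int main(void)
{
        const struct region mem[] = {
                { .base = 0x00000,  .size = 0x80000, .nid = 0 },
                { .base = 0x100000, .size = 0x80000, .nid = 1 },
        };
        int idx = region_search(mem, 2, 0x120000);

        if (idx >= 0)
                printf("pfn 0x120000: node %d, range [%#lx, %#lx)\n",
                       mem[idx].nid, mem[idx].base,
                       mem[idx].base + mem[idx].size);
        return 0;
}
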
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3b83957b6439..c6bd28edd533 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3121,7 +3121,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3121 ssize_t size = memcg_caches_array_size(num_groups); 3121 ssize_t size = memcg_caches_array_size(num_groups);
3122 3122
3123 size *= sizeof(void *); 3123 size *= sizeof(void *);
3124 size += sizeof(struct memcg_cache_params); 3124 size += offsetof(struct memcg_cache_params, memcg_caches);
3125 3125
3126 s->memcg_params = kzalloc(size, GFP_KERNEL); 3126 s->memcg_params = kzalloc(size, GFP_KERNEL);
3127 if (!s->memcg_params) { 3127 if (!s->memcg_params) {
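
The sizing change above counts only the fixed head of memcg_cache_params plus the runtime-sized pointer array, instead of taking sizeof() of the whole struct. A standalone illustration of the offsetof() pattern; struct params is invented and the real layout differs:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct params {
        void *owner;
        int flags;
        void *caches[];         /* per-group pointers, sized at runtime */
};

int main(void)
{
        size_t groups = 8;
        /*
         * offsetof() counts only the fixed header; sizeof(struct params)
         * may include trailing padding, so it is not the right base when
         * appending a runtime-sized tail.
         */
        size_t size = offsetof(struct params, caches) + groups * sizeof(void *);
        struct params *p = calloc(1, size);

        if (!p)
                return 1;
        printf("sizeof=%zu offsetof=%zu alloc=%zu\n",
               sizeof(struct params), offsetof(struct params, caches), size);
        free(p);
        return 0;
}
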
@@ -3164,13 +3164,16 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3164int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, 3164int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3165 struct kmem_cache *root_cache) 3165 struct kmem_cache *root_cache)
3166{ 3166{
3167 size_t size = sizeof(struct memcg_cache_params); 3167 size_t size;
3168 3168
3169 if (!memcg_kmem_enabled()) 3169 if (!memcg_kmem_enabled())
3170 return 0; 3170 return 0;
3171 3171
3172 if (!memcg) 3172 if (!memcg) {
3173 size = offsetof(struct memcg_cache_params, memcg_caches);
3173 size += memcg_limited_groups_array_size * sizeof(void *); 3174 size += memcg_limited_groups_array_size * sizeof(void *);
3175 } else
3176 size = sizeof(struct memcg_cache_params);
3174 3177
3175 s->memcg_params = kzalloc(size, GFP_KERNEL); 3178 s->memcg_params = kzalloc(size, GFP_KERNEL);
3176 if (!s->memcg_params) 3179 if (!s->memcg_params)
@@ -5588,7 +5591,13 @@ static int compare_thresholds(const void *a, const void *b)
5588 const struct mem_cgroup_threshold *_a = a; 5591 const struct mem_cgroup_threshold *_a = a;
5589 const struct mem_cgroup_threshold *_b = b; 5592 const struct mem_cgroup_threshold *_b = b;
5590 5593
5591 return _a->threshold - _b->threshold; 5594 if (_a->threshold > _b->threshold)
5595 return 1;
5596
5597 if (_a->threshold < _b->threshold)
5598 return -1;
5599
5600 return 0;
5592} 5601}
5593 5602
5594static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 5603static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
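
compare_thresholds() above stops returning the raw difference of two u64 values, which can truncate or wrap when narrowed to int and corrupt the sort order. The safe three-way comparison, shown standalone with qsort():

#include <stdio.h>
#include <stdlib.h>

struct threshold {
        unsigned long long threshold;
};

static int compare_thresholds(const void *a, const void *b)
{
        const struct threshold *ta = a, *tb = b;

        /* (1ULL << 33) - 1 narrowed to int would be -1: wrong order. */
        if (ta->threshold > tb->threshold)
                return 1;
        if (ta->threshold < tb->threshold)
                return -1;
        return 0;
}

int main(void)
{
        struct threshold t[] = { { 1ULL << 33 }, { 1 }, { 1ULL << 32 } };

        qsort(t, 3, sizeof(t[0]), compare_thresholds);
        for (int i = 0; i < 3; i++)
                printf("%llu\n", t[i].threshold);
        return 0;
}
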
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index baa4e0a45dec..947ed5413279 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -206,7 +206,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
206#ifdef __ARCH_SI_TRAPNO 206#ifdef __ARCH_SI_TRAPNO
207 si.si_trapno = trapno; 207 si.si_trapno = trapno;
208#endif 208#endif
209 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; 209 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
210 210
211 if ((flags & MF_ACTION_REQUIRED) && t == current) { 211 if ((flags & MF_ACTION_REQUIRED) && t == current) {
212 si.si_code = BUS_MCEERR_AR; 212 si.si_code = BUS_MCEERR_AR;
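
si_addr_lsb tells the SIGBUS recipient the least significant valid bit of the reported address, i.e. log2 of the size of the poisoned region, and compound_order() now supplies that for any compound page rather than only THP. A small standalone calculation of the values involved, assuming PAGE_SHIFT 12 as on x86_64 with 4 KB pages:

#include <stdio.h>

#define PAGE_SHIFT      12

int main(void)
{
        const unsigned int orders[] = { 0, 9, 18 };     /* 4 KB, 2 MB, 1 GB */

        for (unsigned int i = 0; i < 3; i++) {
                unsigned int lsb = orders[i] + PAGE_SHIFT;

                printf("order %2u -> si_addr_lsb %2u (%lu bytes)\n",
                       orders[i], lsb, 1UL << lsb);
        }
        return 0;
}
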
@@ -985,7 +985,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
985static void set_page_hwpoison_huge_page(struct page *hpage) 985static void set_page_hwpoison_huge_page(struct page *hpage)
986{ 986{
987 int i; 987 int i;
988 int nr_pages = 1 << compound_trans_order(hpage); 988 int nr_pages = 1 << compound_order(hpage);
989 for (i = 0; i < nr_pages; i++) 989 for (i = 0; i < nr_pages; i++)
990 SetPageHWPoison(hpage + i); 990 SetPageHWPoison(hpage + i);
991} 991}
@@ -993,7 +993,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
993static void clear_page_hwpoison_huge_page(struct page *hpage) 993static void clear_page_hwpoison_huge_page(struct page *hpage)
994{ 994{
995 int i; 995 int i;
996 int nr_pages = 1 << compound_trans_order(hpage); 996 int nr_pages = 1 << compound_order(hpage);
997 for (i = 0; i < nr_pages; i++) 997 for (i = 0; i < nr_pages; i++)
998 ClearPageHWPoison(hpage + i); 998 ClearPageHWPoison(hpage + i);
999} 999}
@@ -1206,6 +1206,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
1206 for (ps = error_states;; ps++) 1206 for (ps = error_states;; ps++)
1207 if ((p->flags & ps->mask) == ps->res) 1207 if ((p->flags & ps->mask) == ps->res)
1208 break; 1208 break;
1209
1210 page_flags |= (p->flags & (1UL << PG_dirty));
1211
1209 if (!ps->mask) 1212 if (!ps->mask)
1210 for (ps = error_states;; ps++) 1213 for (ps = error_states;; ps++)
1211 if ((page_flags & ps->mask) == ps->res) 1214 if ((page_flags & ps->mask) == ps->res)
@@ -1341,7 +1344,17 @@ int unpoison_memory(unsigned long pfn)
1341 return 0; 1344 return 0;
1342 } 1345 }
1343 1346
1344 nr_pages = 1 << compound_trans_order(page); 1347 /*
1348 * unpoison_memory() can encounter thp only when the thp is being
1349 * worked by memory_failure() and the page lock is not held yet.
1350 * In such case, we yield to memory_failure() and make unpoison fail.
1351 */
1352 if (PageTransHuge(page)) {
1353 pr_info("MCE: Memory failure is now running on %#lx\n", pfn);
1354 return 0;
1355 }
1356
1357 nr_pages = 1 << compound_order(page);
1345 1358
1346 if (!get_page_unless_zero(page)) { 1359 if (!get_page_unless_zero(page)) {
1347 /* 1360 /*
@@ -1355,7 +1368,7 @@ int unpoison_memory(unsigned long pfn)
1355 return 0; 1368 return 0;
1356 } 1369 }
1357 if (TestClearPageHWPoison(p)) 1370 if (TestClearPageHWPoison(p))
1358 atomic_long_sub(nr_pages, &num_poisoned_pages); 1371 atomic_long_dec(&num_poisoned_pages);
1359 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); 1372 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1360 return 0; 1373 return 0;
1361 } 1374 }
@@ -1377,7 +1390,7 @@ int unpoison_memory(unsigned long pfn)
1377 unlock_page(page); 1390 unlock_page(page);
1378 1391
1379 put_page(page); 1392 put_page(page);
1380 if (freeit) 1393 if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
1381 put_page(page); 1394 put_page(page);
1382 1395
1383 return 0; 1396 return 0;
@@ -1418,7 +1431,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
1418 * was free. This flag should be kept set until the source page 1431 * was free. This flag should be kept set until the source page
1419 * is freed and PG_hwpoison on it is set. 1432 * is freed and PG_hwpoison on it is set.
1420 */ 1433 */
1421 set_migratetype_isolate(p, true); 1434 if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE)
1435 set_migratetype_isolate(p, true);
1422 /* 1436 /*
1423 * When the target page is a free hugepage, just remove it 1437 * When the target page is a free hugepage, just remove it
1424 * from free hugepage list. 1438 * from free hugepage list.
@@ -1472,6 +1486,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1472 int ret; 1486 int ret;
1473 unsigned long pfn = page_to_pfn(page); 1487 unsigned long pfn = page_to_pfn(page);
1474 struct page *hpage = compound_head(page); 1488 struct page *hpage = compound_head(page);
1489 LIST_HEAD(pagelist);
1475 1490
1476 /* 1491 /*
1477 * This double-check of PageHWPoison is to avoid the race with 1492 * This double-check of PageHWPoison is to avoid the race with
@@ -1487,86 +1502,29 @@ static int soft_offline_huge_page(struct page *page, int flags)
1487 unlock_page(hpage); 1502 unlock_page(hpage);
1488 1503
1489 /* Keep page count to indicate a given hugepage is isolated. */ 1504 /* Keep page count to indicate a given hugepage is isolated. */
1490 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, 1505 list_move(&hpage->lru, &pagelist);
1491 MIGRATE_SYNC); 1506 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1492 put_page(hpage); 1507 MIGRATE_SYNC, MR_MEMORY_FAILURE);
1493 if (ret) { 1508 if (ret) {
1494 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1509 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1495 pfn, ret, page->flags); 1510 pfn, ret, page->flags);
1511 /*
1512 * We know that soft_offline_huge_page() tries to migrate
1513 * only one hugepage pointed to by hpage, so we need not
1514 * run through the pagelist here.
1515 */
1516 putback_active_hugepage(hpage);
1517 if (ret > 0)
1518 ret = -EIO;
1496 } else { 1519 } else {
1497 set_page_hwpoison_huge_page(hpage); 1520 set_page_hwpoison_huge_page(hpage);
1498 dequeue_hwpoisoned_huge_page(hpage); 1521 dequeue_hwpoisoned_huge_page(hpage);
1499 atomic_long_add(1 << compound_trans_order(hpage), 1522 atomic_long_add(1 << compound_order(hpage),
1500 &num_poisoned_pages); 1523 &num_poisoned_pages);
1501 } 1524 }
1502 return ret; 1525 return ret;
1503} 1526}
1504 1527
1505static int __soft_offline_page(struct page *page, int flags);
1506
1507/**
1508 * soft_offline_page - Soft offline a page.
1509 * @page: page to offline
1510 * @flags: flags. Same as memory_failure().
1511 *
1512 * Returns 0 on success, otherwise negated errno.
1513 *
1514 * Soft offline a page, by migration or invalidation,
1515 * without killing anything. This is for the case when
1516 * a page is not corrupted yet (so it's still valid to access),
1517 * but has had a number of corrected errors and is better taken
1518 * out.
1519 *
1520 * The actual policy on when to do that is maintained by
1521 * user space.
1522 *
1523 * This should never impact any application or cause data loss,
1524 * however it might take some time.
1525 *
1526 * This is not a 100% solution for all memory, but tries to be
1527 * ``good enough'' for the majority of memory.
1528 */
1529int soft_offline_page(struct page *page, int flags)
1530{
1531 int ret;
1532 unsigned long pfn = page_to_pfn(page);
1533 struct page *hpage = compound_trans_head(page);
1534
1535 if (PageHWPoison(page)) {
1536 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1537 return -EBUSY;
1538 }
1539 if (!PageHuge(page) && PageTransHuge(hpage)) {
1540 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1541 pr_info("soft offline: %#lx: failed to split THP\n",
1542 pfn);
1543 return -EBUSY;
1544 }
1545 }
1546
1547 ret = get_any_page(page, pfn, flags);
1548 if (ret < 0)
1549 return ret;
1550 if (ret) { /* for in-use pages */
1551 if (PageHuge(page))
1552 ret = soft_offline_huge_page(page, flags);
1553 else
1554 ret = __soft_offline_page(page, flags);
1555 } else { /* for free pages */
1556 if (PageHuge(page)) {
1557 set_page_hwpoison_huge_page(hpage);
1558 dequeue_hwpoisoned_huge_page(hpage);
1559 atomic_long_add(1 << compound_trans_order(hpage),
1560 &num_poisoned_pages);
1561 } else {
1562 SetPageHWPoison(page);
1563 atomic_long_inc(&num_poisoned_pages);
1564 }
1565 }
1566 unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1567 return ret;
1568}
1569
1570static int __soft_offline_page(struct page *page, int flags) 1528static int __soft_offline_page(struct page *page, int flags)
1571{ 1529{
1572 int ret; 1530 int ret;
@@ -1653,3 +1611,67 @@ static int __soft_offline_page(struct page *page, int flags)
1653 } 1611 }
1654 return ret; 1612 return ret;
1655} 1613}
1614
1615/**
1616 * soft_offline_page - Soft offline a page.
1617 * @page: page to offline
1618 * @flags: flags. Same as memory_failure().
1619 *
1620 * Returns 0 on success, otherwise negated errno.
1621 *
1622 * Soft offline a page, by migration or invalidation,
1623 * without killing anything. This is for the case when
1624 * a page is not corrupted yet (so it's still valid to access),
1625 * but has had a number of corrected errors and is better taken
1626 * out.
1627 *
1628 * The actual policy on when to do that is maintained by
1629 * user space.
1630 *
1631 * This should never impact any application or cause data loss,
1632 * however it might take some time.
1633 *
1634 * This is not a 100% solution for all memory, but tries to be
1635 * ``good enough'' for the majority of memory.
1636 */
1637int soft_offline_page(struct page *page, int flags)
1638{
1639 int ret;
1640 unsigned long pfn = page_to_pfn(page);
1641 struct page *hpage = compound_trans_head(page);
1642
1643 if (PageHWPoison(page)) {
1644 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1645 return -EBUSY;
1646 }
1647 if (!PageHuge(page) && PageTransHuge(hpage)) {
1648 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1649 pr_info("soft offline: %#lx: failed to split THP\n",
1650 pfn);
1651 return -EBUSY;
1652 }
1653 }
1654
1655 ret = get_any_page(page, pfn, flags);
1656 if (ret < 0)
1657 goto unset;
1658 if (ret) { /* for in-use pages */
1659 if (PageHuge(page))
1660 ret = soft_offline_huge_page(page, flags);
1661 else
1662 ret = __soft_offline_page(page, flags);
1663 } else { /* for free pages */
1664 if (PageHuge(page)) {
1665 set_page_hwpoison_huge_page(hpage);
1666 dequeue_hwpoisoned_huge_page(hpage);
1667 atomic_long_add(1 << compound_order(hpage),
1668 &num_poisoned_pages);
1669 } else {
1670 SetPageHWPoison(page);
1671 atomic_long_inc(&num_poisoned_pages);
1672 }
1673 }
1674unset:
1675 unset_migratetype_isolate(page, MIGRATE_MOVABLE);
1676 return ret;
1677}
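The memory-failure changes above route hugepage soft offlining through the generic migrate_pages() path and move soft_offline_page() below its helpers. For context only, soft offlining can also be requested from user space with madvise(MADV_SOFT_OFFLINE); the program below is a minimal sketch, not part of this patch. The hand-defined MADV_SOFT_OFFLINE value and the requirement for CAP_SYS_ADMIN plus CONFIG_MEMORY_FAILURE are assumptions about the running kernel and libc.

/*
 * Hedged sketch: ask the kernel to soft-offline one of our own pages via
 * madvise(MADV_SOFT_OFFLINE), which lands in soft_offline_page() above.
 * Needs root (CAP_SYS_ADMIN) and a kernel built with CONFIG_MEMORY_FAILURE.
 */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* value from asm-generic/mman-common.h */
#endif

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;	/* make sure the page is actually present */

	/* The kernel migrates the contents and retires the old page. */
	if (madvise(p, psz, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");
	else
		printf("page at %p soft-offlined\n", (void *)p);

	munmap(p, psz);
	return 0;
}

A failing madvise() with EINVAL typically means the kernel was built without memory-failure support.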
diff --git a/mm/memory.c b/mm/memory.c
index b3c6bf9a398e..2b73dbde2274 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -373,30 +373,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
373#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ 373#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
374 374
375/* 375/*
376 * If a p?d_bad entry is found while walking page tables, report
377 * the error, before resetting entry to p?d_none. Usually (but
378 * very seldom) called out from the p?d_none_or_clear_bad macros.
379 */
380
381void pgd_clear_bad(pgd_t *pgd)
382{
383 pgd_ERROR(*pgd);
384 pgd_clear(pgd);
385}
386
387void pud_clear_bad(pud_t *pud)
388{
389 pud_ERROR(*pud);
390 pud_clear(pud);
391}
392
393void pmd_clear_bad(pmd_t *pmd)
394{
395 pmd_ERROR(*pmd);
396 pmd_clear(pmd);
397}
398
399/*
400 * Note: this doesn't free the actual pages themselves. That 376 * Note: this doesn't free the actual pages themselves. That
401 * has been handled earlier when unmapping all the memory regions. 377 * has been handled earlier when unmapping all the memory regions.
402 */ 378 */
@@ -1505,7 +1481,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
1505 if (pud_none(*pud)) 1481 if (pud_none(*pud))
1506 goto no_page_table; 1482 goto no_page_table;
1507 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { 1483 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1508 BUG_ON(flags & FOLL_GET); 1484 if (flags & FOLL_GET)
1485 goto out;
1509 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 1486 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1510 goto out; 1487 goto out;
1511 } 1488 }
@@ -1516,8 +1493,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
1516 if (pmd_none(*pmd)) 1493 if (pmd_none(*pmd))
1517 goto no_page_table; 1494 goto no_page_table;
1518 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { 1495 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1519 BUG_ON(flags & FOLL_GET);
1520 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1496 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1497 if (flags & FOLL_GET) {
1498 /*
1499 * The refcount on tail pages is not well-defined and
1500 * shouldn't be taken. The caller should handle a NULL
1501 * return when trying to follow tail pages.
1502 */
1503 if (PageHead(page))
1504 get_page(page);
1505 else {
1506 page = NULL;
1507 goto out;
1508 }
1509 }
1521 goto out; 1510 goto out;
1522 } 1511 }
1523 if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 1512 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ca1dd3aa5eee..ed85fe3870e2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -30,6 +30,7 @@
30#include <linux/mm_inline.h> 30#include <linux/mm_inline.h>
31#include <linux/firmware-map.h> 31#include <linux/firmware-map.h>
32#include <linux/stop_machine.h> 32#include <linux/stop_machine.h>
33#include <linux/hugetlb.h>
33 34
34#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
35 36
@@ -51,14 +52,10 @@ DEFINE_MUTEX(mem_hotplug_mutex);
51void lock_memory_hotplug(void) 52void lock_memory_hotplug(void)
52{ 53{
53 mutex_lock(&mem_hotplug_mutex); 54 mutex_lock(&mem_hotplug_mutex);
54
55 /* for exclusive hibernation if CONFIG_HIBERNATION=y */
56 lock_system_sleep();
57} 55}
58 56
59void unlock_memory_hotplug(void) 57void unlock_memory_hotplug(void)
60{ 58{
61 unlock_system_sleep();
62 mutex_unlock(&mem_hotplug_mutex); 59 mutex_unlock(&mem_hotplug_mutex);
63} 60}
64 61
@@ -194,7 +191,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
194 191
195 zone = &pgdat->node_zones[0]; 192 zone = &pgdat->node_zones[0];
196 for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { 193 for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
197 if (zone->wait_table) { 194 if (zone_is_initialized(zone)) {
198 nr_pages = zone->wait_table_hash_nr_entries 195 nr_pages = zone->wait_table_hash_nr_entries
199 * sizeof(wait_queue_head_t); 196 * sizeof(wait_queue_head_t);
200 nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT; 197 nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
@@ -229,8 +226,8 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
229 226
230 zone_span_writelock(zone); 227 zone_span_writelock(zone);
231 228
232 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 229 old_zone_end_pfn = zone_end_pfn(zone);
233 if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) 230 if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
234 zone->zone_start_pfn = start_pfn; 231 zone->zone_start_pfn = start_pfn;
235 232
236 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 233 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -305,7 +302,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
305 goto out_fail; 302 goto out_fail;
306 303
307 /* use start_pfn for z1's start_pfn if z1 is empty */ 304 /* use start_pfn for z1's start_pfn if z1 is empty */
308 if (z1->spanned_pages) 305 if (!zone_is_empty(z1))
309 z1_start_pfn = z1->zone_start_pfn; 306 z1_start_pfn = z1->zone_start_pfn;
310 else 307 else
311 z1_start_pfn = start_pfn; 308 z1_start_pfn = start_pfn;
@@ -347,7 +344,7 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
347 goto out_fail; 344 goto out_fail;
348 345
349 /* use end_pfn for z2's end_pfn if z2 is empty */ 346 /* use end_pfn for z2's end_pfn if z2 is empty */
350 if (z2->spanned_pages) 347 if (!zone_is_empty(z2))
351 z2_end_pfn = zone_end_pfn(z2); 348 z2_end_pfn = zone_end_pfn(z2);
352 else 349 else
353 z2_end_pfn = end_pfn; 350 z2_end_pfn = end_pfn;
@@ -514,8 +511,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone,
514static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, 511static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
515 unsigned long end_pfn) 512 unsigned long end_pfn)
516{ 513{
517 unsigned long zone_start_pfn = zone->zone_start_pfn; 514 unsigned long zone_start_pfn = zone->zone_start_pfn;
518 unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 515 unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
516 unsigned long zone_end_pfn = z;
519 unsigned long pfn; 517 unsigned long pfn;
520 struct mem_section *ms; 518 struct mem_section *ms;
521 int nid = zone_to_nid(zone); 519 int nid = zone_to_nid(zone);
@@ -1069,6 +1067,23 @@ out:
1069 return ret; 1067 return ret;
1070} 1068}
1071 1069
1070static int check_hotplug_memory_range(u64 start, u64 size)
1071{
1072 u64 start_pfn = start >> PAGE_SHIFT;
1073 u64 nr_pages = size >> PAGE_SHIFT;
1074
1075 /* Memory range must be aligned with section */
1076 if ((start_pfn & ~PAGE_SECTION_MASK) ||
1077 (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) {
1078 pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n",
1079 (unsigned long long)start,
1080 (unsigned long long)size);
1081 return -EINVAL;
1082 }
1083
1084 return 0;
1085}
1086
1072/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ 1087/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
1073int __ref add_memory(int nid, u64 start, u64 size) 1088int __ref add_memory(int nid, u64 start, u64 size)
1074{ 1089{
@@ -1078,6 +1093,10 @@ int __ref add_memory(int nid, u64 start, u64 size)
1078 struct resource *res; 1093 struct resource *res;
1079 int ret; 1094 int ret;
1080 1095
1096 ret = check_hotplug_memory_range(start, size);
1097 if (ret)
1098 return ret;
1099
1081 lock_memory_hotplug(); 1100 lock_memory_hotplug();
1082 1101
1083 res = register_memory_resource(start, size); 1102 res = register_memory_resource(start, size);
@@ -1208,10 +1227,12 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
1208} 1227}
1209 1228
1210/* 1229/*
1211 * Scanning pfn is much easier than scanning lru list. 1230 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages
1212 * Scan pfn from start to end and Find LRU page. 1231 * and hugepages). We scan pfns because it's much easier than walking a
 1232 * linked list. This function returns the pfn of the first movable page
 1233 * found, otherwise 0.
1213 */ 1234 */
1214static unsigned long scan_lru_pages(unsigned long start, unsigned long end) 1235static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
1215{ 1236{
1216 unsigned long pfn; 1237 unsigned long pfn;
1217 struct page *page; 1238 struct page *page;
@@ -1220,6 +1241,13 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
1220 page = pfn_to_page(pfn); 1241 page = pfn_to_page(pfn);
1221 if (PageLRU(page)) 1242 if (PageLRU(page))
1222 return pfn; 1243 return pfn;
1244 if (PageHuge(page)) {
1245 if (is_hugepage_active(page))
1246 return pfn;
1247 else
1248 pfn = round_up(pfn + 1,
1249 1 << compound_order(page)) - 1;
1250 }
1223 } 1251 }
1224 } 1252 }
1225 return 0; 1253 return 0;
@@ -1240,6 +1268,19 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1240 if (!pfn_valid(pfn)) 1268 if (!pfn_valid(pfn))
1241 continue; 1269 continue;
1242 page = pfn_to_page(pfn); 1270 page = pfn_to_page(pfn);
1271
1272 if (PageHuge(page)) {
1273 struct page *head = compound_head(page);
1274 pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
1275 if (compound_order(head) > PFN_SECTION_SHIFT) {
1276 ret = -EBUSY;
1277 break;
1278 }
1279 if (isolate_huge_page(page, &source))
1280 move_pages -= 1 << compound_order(head);
1281 continue;
1282 }
1283
1243 if (!get_page_unless_zero(page)) 1284 if (!get_page_unless_zero(page))
1244 continue; 1285 continue;
1245 /* 1286 /*
@@ -1272,7 +1313,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1272 } 1313 }
1273 if (!list_empty(&source)) { 1314 if (!list_empty(&source)) {
1274 if (not_managed) { 1315 if (not_managed) {
1275 putback_lru_pages(&source); 1316 putback_movable_pages(&source);
1276 goto out; 1317 goto out;
1277 } 1318 }
1278 1319
@@ -1283,7 +1324,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
1283 ret = migrate_pages(&source, alloc_migrate_target, 0, 1324 ret = migrate_pages(&source, alloc_migrate_target, 0,
1284 MIGRATE_SYNC, MR_MEMORY_HOTPLUG); 1325 MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
1285 if (ret) 1326 if (ret)
1286 putback_lru_pages(&source); 1327 putback_movable_pages(&source);
1287 } 1328 }
1288out: 1329out:
1289 return ret; 1330 return ret;
@@ -1472,7 +1513,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
1472 struct zone *zone; 1513 struct zone *zone;
1473 struct memory_notify arg; 1514 struct memory_notify arg;
1474 1515
1475 BUG_ON(start_pfn >= end_pfn);
1476 /* at least, alignment against pageblock is necessary */ 1516 /* at least, alignment against pageblock is necessary */
1477 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) 1517 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
1478 return -EINVAL; 1518 return -EINVAL;
@@ -1527,8 +1567,8 @@ repeat:
1527 drain_all_pages(); 1567 drain_all_pages();
1528 } 1568 }
1529 1569
1530 pfn = scan_lru_pages(start_pfn, end_pfn); 1570 pfn = scan_movable_pages(start_pfn, end_pfn);
1531 if (pfn) { /* We have page on LRU */ 1571 if (pfn) { /* We have movable pages */
1532 ret = do_migrate_range(pfn, end_pfn); 1572 ret = do_migrate_range(pfn, end_pfn);
1533 if (!ret) { 1573 if (!ret) {
1534 drain = 1; 1574 drain = 1;
@@ -1547,6 +1587,11 @@ repeat:
1547 yield(); 1587 yield();
1548 /* drain pcp pages, this is synchronous. */ 1588 /* drain pcp pages, this is synchronous. */
1549 drain_all_pages(); 1589 drain_all_pages();
1590 /*
1591 * dissolve free hugepages in the memory block before doing offlining
1592 * actually in order to make hugetlbfs's object counting consistent.
1593 */
1594 dissolve_free_huge_pages(start_pfn, end_pfn);
1550 /* check again */ 1595 /* check again */
1551 offlined_pages = check_pages_isolated(start_pfn, end_pfn); 1596 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
1552 if (offlined_pages < 0) { 1597 if (offlined_pages < 0) {
@@ -1674,9 +1719,8 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
1674 return ret; 1719 return ret;
1675} 1720}
1676 1721
1677static int check_cpu_on_node(void *data) 1722static int check_cpu_on_node(pg_data_t *pgdat)
1678{ 1723{
1679 struct pglist_data *pgdat = data;
1680 int cpu; 1724 int cpu;
1681 1725
1682 for_each_present_cpu(cpu) { 1726 for_each_present_cpu(cpu) {
@@ -1691,10 +1735,9 @@ static int check_cpu_on_node(void *data)
1691 return 0; 1735 return 0;
1692} 1736}
1693 1737
1694static void unmap_cpu_on_node(void *data) 1738static void unmap_cpu_on_node(pg_data_t *pgdat)
1695{ 1739{
1696#ifdef CONFIG_ACPI_NUMA 1740#ifdef CONFIG_ACPI_NUMA
1697 struct pglist_data *pgdat = data;
1698 int cpu; 1741 int cpu;
1699 1742
1700 for_each_possible_cpu(cpu) 1743 for_each_possible_cpu(cpu)
@@ -1703,10 +1746,11 @@ static void unmap_cpu_on_node(void *data)
1703#endif 1746#endif
1704} 1747}
1705 1748
1706static int check_and_unmap_cpu_on_node(void *data) 1749static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
1707{ 1750{
1708 int ret = check_cpu_on_node(data); 1751 int ret;
1709 1752
1753 ret = check_cpu_on_node(pgdat);
1710 if (ret) 1754 if (ret)
1711 return ret; 1755 return ret;
1712 1756
@@ -1715,11 +1759,18 @@ static int check_and_unmap_cpu_on_node(void *data)
1715 * the cpu_to_node() now. 1759 * the cpu_to_node() now.
1716 */ 1760 */
1717 1761
1718 unmap_cpu_on_node(data); 1762 unmap_cpu_on_node(pgdat);
1719 return 0; 1763 return 0;
1720} 1764}
1721 1765
1722/* offline the node if all memory sections of this node are removed */ 1766/**
1767 * try_offline_node
1768 *
1769 * Offline a node if all memory sections and cpus of the node are removed.
1770 *
1771 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1772 * and online/offline operations before this call.
1773 */
1723void try_offline_node(int nid) 1774void try_offline_node(int nid)
1724{ 1775{
1725 pg_data_t *pgdat = NODE_DATA(nid); 1776 pg_data_t *pgdat = NODE_DATA(nid);
@@ -1745,7 +1796,7 @@ void try_offline_node(int nid)
1745 return; 1796 return;
1746 } 1797 }
1747 1798
1748 if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL)) 1799 if (check_and_unmap_cpu_on_node(pgdat))
1749 return; 1800 return;
1750 1801
1751 /* 1802 /*
@@ -1782,10 +1833,19 @@ void try_offline_node(int nid)
1782} 1833}
1783EXPORT_SYMBOL(try_offline_node); 1834EXPORT_SYMBOL(try_offline_node);
1784 1835
1836/**
1837 * remove_memory
1838 *
1839 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
1840 * and online/offline operations before this call, as required by
1841 * try_offline_node().
1842 */
1785void __ref remove_memory(int nid, u64 start, u64 size) 1843void __ref remove_memory(int nid, u64 start, u64 size)
1786{ 1844{
1787 int ret; 1845 int ret;
1788 1846
1847 BUG_ON(check_hotplug_memory_range(start, size));
1848
1789 lock_memory_hotplug(); 1849 lock_memory_hotplug();
1790 1850
1791 /* 1851 /*
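The hotplug hunks above add a section-alignment check for add_memory()/remove_memory() and teach the offline path about hugepages (scan_movable_pages(), isolate_huge_page(), dissolve_free_huge_pages()). Offlining is normally driven through the memory-block sysfs interface; the sketch below only illustrates that interface. The block name "memory32" is an arbitrary example and root privileges are assumed.

/*
 * Hedged sketch: request offlining of one memory block via sysfs, which
 * eventually reaches __offline_pages() for that block's pfn range.
 * "memory32" is an example; real code would enumerate
 * /sys/devices/system/memory/ first.
 */
#include <stdio.h>

int main(void)
{
	const char *state = "/sys/devices/system/memory/memory32/state";
	char buf[64];
	FILE *f;

	f = fopen("/sys/devices/system/memory/block_size_bytes", "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("memory block size: 0x%s", buf);	/* hex string */
		fclose(f);
	}

	f = fopen(state, "w");
	if (!f) {
		perror(state);
		return 1;
	}
	if (fputs("offline", f) == EOF || fflush(f) == EOF)
		perror("offline request");	/* commonly EBUSY when pages are pinned */
	fclose(f);
	return 0;
}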
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4baf12e534d1..04729647f359 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -123,16 +123,19 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES];
123static struct mempolicy *get_task_policy(struct task_struct *p) 123static struct mempolicy *get_task_policy(struct task_struct *p)
124{ 124{
125 struct mempolicy *pol = p->mempolicy; 125 struct mempolicy *pol = p->mempolicy;
126 int node;
127 126
128 if (!pol) { 127 if (!pol) {
129 node = numa_node_id(); 128 int node = numa_node_id();
130 if (node != NUMA_NO_NODE)
131 pol = &preferred_node_policy[node];
132 129
133 /* preferred_node_policy is not initialised early in boot */ 130 if (node != NUMA_NO_NODE) {
134 if (!pol->mode) 131 pol = &preferred_node_policy[node];
135 pol = NULL; 132 /*
133 * preferred_node_policy is not initialised early in
134 * boot
135 */
136 if (!pol->mode)
137 pol = NULL;
138 }
136 } 139 }
137 140
138 return pol; 141 return pol;
@@ -473,8 +476,11 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
473static void migrate_page_add(struct page *page, struct list_head *pagelist, 476static void migrate_page_add(struct page *page, struct list_head *pagelist,
474 unsigned long flags); 477 unsigned long flags);
475 478
476/* Scan through pages checking if pages follow certain conditions. */ 479/*
477static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 480 * Scan through pages checking if pages follow certain conditions,
481 * and move them to the pagelist if they do.
482 */
483static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
478 unsigned long addr, unsigned long end, 484 unsigned long addr, unsigned long end,
479 const nodemask_t *nodes, unsigned long flags, 485 const nodemask_t *nodes, unsigned long flags,
480 void *private) 486 void *private)
@@ -512,7 +518,31 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
512 return addr != end; 518 return addr != end;
513} 519}
514 520
515static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, 521static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
522 pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
523 void *private)
524{
525#ifdef CONFIG_HUGETLB_PAGE
526 int nid;
527 struct page *page;
528
529 spin_lock(&vma->vm_mm->page_table_lock);
530 page = pte_page(huge_ptep_get((pte_t *)pmd));
531 nid = page_to_nid(page);
532 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
533 goto unlock;
534 /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
535 if (flags & (MPOL_MF_MOVE_ALL) ||
536 (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
537 isolate_huge_page(page, private);
538unlock:
539 spin_unlock(&vma->vm_mm->page_table_lock);
540#else
541 BUG();
542#endif
543}
544
545static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
516 unsigned long addr, unsigned long end, 546 unsigned long addr, unsigned long end,
517 const nodemask_t *nodes, unsigned long flags, 547 const nodemask_t *nodes, unsigned long flags,
518 void *private) 548 void *private)
@@ -523,17 +553,24 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
523 pmd = pmd_offset(pud, addr); 553 pmd = pmd_offset(pud, addr);
524 do { 554 do {
525 next = pmd_addr_end(addr, end); 555 next = pmd_addr_end(addr, end);
556 if (!pmd_present(*pmd))
557 continue;
558 if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
559 queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
560 flags, private);
561 continue;
562 }
526 split_huge_page_pmd(vma, addr, pmd); 563 split_huge_page_pmd(vma, addr, pmd);
527 if (pmd_none_or_trans_huge_or_clear_bad(pmd)) 564 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
528 continue; 565 continue;
529 if (check_pte_range(vma, pmd, addr, next, nodes, 566 if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
530 flags, private)) 567 flags, private))
531 return -EIO; 568 return -EIO;
532 } while (pmd++, addr = next, addr != end); 569 } while (pmd++, addr = next, addr != end);
533 return 0; 570 return 0;
534} 571}
535 572
536static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 573static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
537 unsigned long addr, unsigned long end, 574 unsigned long addr, unsigned long end,
538 const nodemask_t *nodes, unsigned long flags, 575 const nodemask_t *nodes, unsigned long flags,
539 void *private) 576 void *private)
@@ -544,16 +581,18 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
544 pud = pud_offset(pgd, addr); 581 pud = pud_offset(pgd, addr);
545 do { 582 do {
546 next = pud_addr_end(addr, end); 583 next = pud_addr_end(addr, end);
584 if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
585 continue;
547 if (pud_none_or_clear_bad(pud)) 586 if (pud_none_or_clear_bad(pud))
548 continue; 587 continue;
549 if (check_pmd_range(vma, pud, addr, next, nodes, 588 if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
550 flags, private)) 589 flags, private))
551 return -EIO; 590 return -EIO;
552 } while (pud++, addr = next, addr != end); 591 } while (pud++, addr = next, addr != end);
553 return 0; 592 return 0;
554} 593}
555 594
556static inline int check_pgd_range(struct vm_area_struct *vma, 595static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
557 unsigned long addr, unsigned long end, 596 unsigned long addr, unsigned long end,
558 const nodemask_t *nodes, unsigned long flags, 597 const nodemask_t *nodes, unsigned long flags,
559 void *private) 598 void *private)
@@ -566,7 +605,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
566 next = pgd_addr_end(addr, end); 605 next = pgd_addr_end(addr, end);
567 if (pgd_none_or_clear_bad(pgd)) 606 if (pgd_none_or_clear_bad(pgd))
568 continue; 607 continue;
569 if (check_pud_range(vma, pgd, addr, next, nodes, 608 if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
570 flags, private)) 609 flags, private))
571 return -EIO; 610 return -EIO;
572 } while (pgd++, addr = next, addr != end); 611 } while (pgd++, addr = next, addr != end);
@@ -604,12 +643,14 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
604#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ 643#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
605 644
606/* 645/*
607 * Check if all pages in a range are on a set of nodes. 646 * Walk through page tables and collect pages to be migrated.
608 * If pagelist != NULL then isolate pages from the LRU and 647 *
609 * put them on the pagelist. 648 * If pages found in a given range are on a set of nodes (determined by
649 * @nodes and @flags), they are isolated and queued to the pagelist
650 * passed via @private.
610 */ 651 */
611static struct vm_area_struct * 652static struct vm_area_struct *
612check_range(struct mm_struct *mm, unsigned long start, unsigned long end, 653queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
613 const nodemask_t *nodes, unsigned long flags, void *private) 654 const nodemask_t *nodes, unsigned long flags, void *private)
614{ 655{
615 int err; 656 int err;
@@ -635,9 +676,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
635 return ERR_PTR(-EFAULT); 676 return ERR_PTR(-EFAULT);
636 } 677 }
637 678
638 if (is_vm_hugetlb_page(vma))
639 goto next;
640
641 if (flags & MPOL_MF_LAZY) { 679 if (flags & MPOL_MF_LAZY) {
642 change_prot_numa(vma, start, endvma); 680 change_prot_numa(vma, start, endvma);
643 goto next; 681 goto next;
@@ -647,7 +685,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
647 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && 685 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
648 vma_migratable(vma))) { 686 vma_migratable(vma))) {
649 687
650 err = check_pgd_range(vma, start, endvma, nodes, 688 err = queue_pages_pgd_range(vma, start, endvma, nodes,
651 flags, private); 689 flags, private);
652 if (err) { 690 if (err) {
653 first = ERR_PTR(err); 691 first = ERR_PTR(err);
@@ -990,7 +1028,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
990 1028
991static struct page *new_node_page(struct page *page, unsigned long node, int **x) 1029static struct page *new_node_page(struct page *page, unsigned long node, int **x)
992{ 1030{
993 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); 1031 if (PageHuge(page))
1032 return alloc_huge_page_node(page_hstate(compound_head(page)),
1033 node);
1034 else
1035 return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
994} 1036}
995 1037
996/* 1038/*
@@ -1013,14 +1055,14 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
1013 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. 1055 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
1014 */ 1056 */
1015 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); 1057 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
1016 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 1058 queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
1017 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 1059 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
1018 1060
1019 if (!list_empty(&pagelist)) { 1061 if (!list_empty(&pagelist)) {
1020 err = migrate_pages(&pagelist, new_node_page, dest, 1062 err = migrate_pages(&pagelist, new_node_page, dest,
1021 MIGRATE_SYNC, MR_SYSCALL); 1063 MIGRATE_SYNC, MR_SYSCALL);
1022 if (err) 1064 if (err)
1023 putback_lru_pages(&pagelist); 1065 putback_movable_pages(&pagelist);
1024 } 1066 }
1025 1067
1026 return err; 1068 return err;
@@ -1154,10 +1196,14 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
1154 break; 1196 break;
1155 vma = vma->vm_next; 1197 vma = vma->vm_next;
1156 } 1198 }
1157
1158 /* 1199 /*
1159 * if !vma, alloc_page_vma() will use task or system default policy 1200 * queue_pages_range() confirms that @page belongs to some vma,
1201 * so vma shouldn't be NULL.
1160 */ 1202 */
1203 BUG_ON(!vma);
1204
1205 if (PageHuge(page))
1206 return alloc_huge_page_noerr(vma, address, 1);
1161 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1207 return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1162} 1208}
1163#else 1209#else
@@ -1249,7 +1295,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1249 if (err) 1295 if (err)
1250 goto mpol_out; 1296 goto mpol_out;
1251 1297
1252 vma = check_range(mm, start, end, nmask, 1298 vma = queue_pages_range(mm, start, end, nmask,
1253 flags | MPOL_MF_INVERT, &pagelist); 1299 flags | MPOL_MF_INVERT, &pagelist);
1254 1300
1255 err = PTR_ERR(vma); /* maybe ... */ 1301 err = PTR_ERR(vma); /* maybe ... */
@@ -1265,7 +1311,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1265 (unsigned long)vma, 1311 (unsigned long)vma,
1266 MIGRATE_SYNC, MR_MEMPOLICY_MBIND); 1312 MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1267 if (nr_failed) 1313 if (nr_failed)
1268 putback_lru_pages(&pagelist); 1314 putback_movable_pages(&pagelist);
1269 } 1315 }
1270 1316
1271 if (nr_failed && (flags & MPOL_MF_STRICT)) 1317 if (nr_failed && (flags & MPOL_MF_STRICT))
@@ -2065,6 +2111,16 @@ retry_cpuset:
2065} 2111}
2066EXPORT_SYMBOL(alloc_pages_current); 2112EXPORT_SYMBOL(alloc_pages_current);
2067 2113
2114int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2115{
2116 struct mempolicy *pol = mpol_dup(vma_policy(src));
2117
2118 if (IS_ERR(pol))
2119 return PTR_ERR(pol);
2120 dst->vm_policy = pol;
2121 return 0;
2122}
2123
2068/* 2124/*
2069 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it 2125 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2070 * rebinds the mempolicy its copying by calling mpol_rebind_policy() 2126 * rebinds the mempolicy its copying by calling mpol_rebind_policy()
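With check_range() renamed to queue_pages_range() and given a hugetlb leg, the apparent intent is that mbind() and the migration syscalls can queue hugetlb pages as well as normal LRU pages. The fragment below is a user-space sketch of the plain entry point only; it assumes libnuma's <numaif.h> (link with -lnuma), a NUMA-capable kernel, and uses node 0 purely as an example target.

/*
 * Hedged sketch: bind an anonymous mapping to node 0 with MPOL_MF_MOVE,
 * which walks the queue_pages_*() helpers above and migrates any
 * misplaced pages.  Node 0 is just an example.
 */
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t len = 16 * psz;
	unsigned long nodemask = 1UL;		/* bit 0 == node 0 */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0, len);			/* fault the pages in first */

	/* MPOL_MF_MOVE asks the kernel to migrate pages off other nodes. */
	if (mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
		  MPOL_MF_MOVE | MPOL_MF_STRICT))
		perror("mbind");

	munmap(p, len);
	return 0;
}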
diff --git a/mm/mempool.c b/mm/mempool.c
index 54990476c049..659aa42bad16 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -73,7 +73,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
73 gfp_t gfp_mask, int node_id) 73 gfp_t gfp_mask, int node_id)
74{ 74{
75 mempool_t *pool; 75 mempool_t *pool;
76 pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id); 76 pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
77 if (!pool) 77 if (!pool)
78 return NULL; 78 return NULL;
79 pool->elements = kmalloc_node(min_nr * sizeof(void *), 79 pool->elements = kmalloc_node(min_nr * sizeof(void *),
diff --git a/mm/migrate.c b/mm/migrate.c
index 6f0c24438bba..b7ded7eafe3a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -100,6 +100,10 @@ void putback_movable_pages(struct list_head *l)
100 struct page *page2; 100 struct page *page2;
101 101
102 list_for_each_entry_safe(page, page2, l, lru) { 102 list_for_each_entry_safe(page, page2, l, lru) {
103 if (unlikely(PageHuge(page))) {
104 putback_active_hugepage(page);
105 continue;
106 }
103 list_del(&page->lru); 107 list_del(&page->lru);
104 dec_zone_page_state(page, NR_ISOLATED_ANON + 108 dec_zone_page_state(page, NR_ISOLATED_ANON +
105 page_is_file_cache(page)); 109 page_is_file_cache(page));
@@ -945,6 +949,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
945 struct page *new_hpage = get_new_page(hpage, private, &result); 949 struct page *new_hpage = get_new_page(hpage, private, &result);
946 struct anon_vma *anon_vma = NULL; 950 struct anon_vma *anon_vma = NULL;
947 951
952 /*
953 * Movability of hugepages depends on the architecture and the hugepage size.
954 * This check is necessary because some callers of hugepage migration,
955 * such as soft offline and memory hot-remove, don't walk through page
956 * tables or check whether the hugepage is pmd-based before
957 * kicking off migration.
958 */
959 if (!hugepage_migration_support(page_hstate(hpage)))
960 return -ENOSYS;
961
948 if (!new_hpage) 962 if (!new_hpage)
949 return -ENOMEM; 963 return -ENOMEM;
950 964
@@ -975,6 +989,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
975 989
976 unlock_page(hpage); 990 unlock_page(hpage);
977out: 991out:
992 if (rc != -EAGAIN)
993 putback_active_hugepage(hpage);
978 put_page(new_hpage); 994 put_page(new_hpage);
979 if (result) { 995 if (result) {
980 if (rc) 996 if (rc)
@@ -1025,7 +1041,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
1025 list_for_each_entry_safe(page, page2, from, lru) { 1041 list_for_each_entry_safe(page, page2, from, lru) {
1026 cond_resched(); 1042 cond_resched();
1027 1043
1028 rc = unmap_and_move(get_new_page, private, 1044 if (PageHuge(page))
1045 rc = unmap_and_move_huge_page(get_new_page,
1046 private, page, pass > 2, mode);
1047 else
1048 rc = unmap_and_move(get_new_page, private,
1029 page, pass > 2, mode); 1049 page, pass > 2, mode);
1030 1050
1031 switch(rc) { 1051 switch(rc) {
@@ -1058,32 +1078,6 @@ out:
1058 return rc; 1078 return rc;
1059} 1079}
1060 1080
1061int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1062 unsigned long private, enum migrate_mode mode)
1063{
1064 int pass, rc;
1065
1066 for (pass = 0; pass < 10; pass++) {
1067 rc = unmap_and_move_huge_page(get_new_page, private,
1068 hpage, pass > 2, mode);
1069 switch (rc) {
1070 case -ENOMEM:
1071 goto out;
1072 case -EAGAIN:
1073 /* try again */
1074 cond_resched();
1075 break;
1076 case MIGRATEPAGE_SUCCESS:
1077 goto out;
1078 default:
1079 rc = -EIO;
1080 goto out;
1081 }
1082 }
1083out:
1084 return rc;
1085}
1086
1087#ifdef CONFIG_NUMA 1081#ifdef CONFIG_NUMA
1088/* 1082/*
1089 * Move a list of individual pages 1083 * Move a list of individual pages
@@ -1108,7 +1102,11 @@ static struct page *new_page_node(struct page *p, unsigned long private,
1108 1102
1109 *result = &pm->status; 1103 *result = &pm->status;
1110 1104
1111 return alloc_pages_exact_node(pm->node, 1105 if (PageHuge(p))
1106 return alloc_huge_page_node(page_hstate(compound_head(p)),
1107 pm->node);
1108 else
1109 return alloc_pages_exact_node(pm->node,
1112 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); 1110 GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0);
1113} 1111}
1114 1112
@@ -1168,6 +1166,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1168 !migrate_all) 1166 !migrate_all)
1169 goto put_and_set; 1167 goto put_and_set;
1170 1168
1169 if (PageHuge(page)) {
1170 isolate_huge_page(page, &pagelist);
1171 goto put_and_set;
1172 }
1173
1171 err = isolate_lru_page(page); 1174 err = isolate_lru_page(page);
1172 if (!err) { 1175 if (!err) {
1173 list_add_tail(&page->lru, &pagelist); 1176 list_add_tail(&page->lru, &pagelist);
@@ -1190,7 +1193,7 @@ set_status:
1190 err = migrate_pages(&pagelist, new_page_node, 1193 err = migrate_pages(&pagelist, new_page_node,
1191 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); 1194 (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL);
1192 if (err) 1195 if (err)
1193 putback_lru_pages(&pagelist); 1196 putback_movable_pages(&pagelist);
1194 } 1197 }
1195 1198
1196 up_read(&mm->mmap_sem); 1199 up_read(&mm->mmap_sem);
@@ -1468,7 +1471,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
1468 if (!populated_zone(zone)) 1471 if (!populated_zone(zone))
1469 continue; 1472 continue;
1470 1473
1471 if (zone->all_unreclaimable) 1474 if (!zone_reclaimable(zone))
1472 continue; 1475 continue;
1473 1476
1474 /* Avoid waking kswapd by allocating pages_to_migrate pages. */ 1477 /* Avoid waking kswapd by allocating pages_to_migrate pages. */
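unmap_and_move_huge_page() is now driven from the common migrate_pages() loop and do_move_page_to_node_array() isolates hugepages, so the same paths serve move_pages(2). The sketch below shows that syscall for a single ordinary page; it assumes libnuma's <numaif.h> (link with -lnuma) and that node 0 exists.

/*
 * Hedged sketch: ask the kernel to move one of our pages to node 0 using
 * move_pages(2), which funnels into migrate_pages() as modified above.
 */
#include <numaif.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *pages[1];
	int nodes[1] = { 0 };		/* example destination node */
	int status[1] = { -1 };

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	p[0] = 1;			/* fault the page in */
	pages[0] = p;

	if (move_pages(0 /* self */, 1, pages, nodes, status, MPOL_MF_MOVE))
		perror("move_pages");
	else
		printf("page now on node %d (negative means error)\n", status[0]);

	munmap(p, psz);
	return 0;
}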
diff --git a/mm/mlock.c b/mm/mlock.c
index 79b7cf7d1bca..d63802663242 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -11,6 +11,7 @@
11#include <linux/swap.h> 11#include <linux/swap.h>
12#include <linux/swapops.h> 12#include <linux/swapops.h>
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/pagevec.h>
14#include <linux/mempolicy.h> 15#include <linux/mempolicy.h>
15#include <linux/syscalls.h> 16#include <linux/syscalls.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
@@ -18,6 +19,8 @@
18#include <linux/rmap.h> 19#include <linux/rmap.h>
19#include <linux/mmzone.h> 20#include <linux/mmzone.h>
20#include <linux/hugetlb.h> 21#include <linux/hugetlb.h>
22#include <linux/memcontrol.h>
23#include <linux/mm_inline.h>
21 24
22#include "internal.h" 25#include "internal.h"
23 26
@@ -87,6 +90,47 @@ void mlock_vma_page(struct page *page)
87 } 90 }
88} 91}
89 92
93/*
94 * Finish munlock after successful page isolation
95 *
96 * Page must be locked. This is a wrapper for try_to_munlock()
97 * and putback_lru_page() with munlock accounting.
98 */
99static void __munlock_isolated_page(struct page *page)
100{
101 int ret = SWAP_AGAIN;
102
103 /*
104 * Optimization: if the page was mapped just once, that's our mapping
105 * and we don't need to check all the other vmas.
106 */
107 if (page_mapcount(page) > 1)
108 ret = try_to_munlock(page);
109
110 /* Did try_to_munlock() succeed or punt? */
111 if (ret != SWAP_MLOCK)
112 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
113
114 putback_lru_page(page);
115}
116
117/*
118 * Accounting for page isolation fail during munlock
119 *
120 * Performs accounting when page isolation fails in munlock. There is nothing
121 * else to do because it means some other task has already removed the page
122 * from the LRU. putback_lru_page() will take care of removing the page from
123 * the unevictable list, if necessary. vmscan [page_referenced()] will move
124 * the page back to the unevictable list if some other vma has it mlocked.
125 */
126static void __munlock_isolation_failed(struct page *page)
127{
128 if (PageUnevictable(page))
129 count_vm_event(UNEVICTABLE_PGSTRANDED);
130 else
131 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
132}
133
90/** 134/**
91 * munlock_vma_page - munlock a vma page 135 * munlock_vma_page - munlock a vma page
92 * @page - page to be unlocked 136 * @page - page to be unlocked
@@ -112,37 +156,10 @@ unsigned int munlock_vma_page(struct page *page)
112 unsigned int nr_pages = hpage_nr_pages(page); 156 unsigned int nr_pages = hpage_nr_pages(page);
113 mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); 157 mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
114 page_mask = nr_pages - 1; 158 page_mask = nr_pages - 1;
115 if (!isolate_lru_page(page)) { 159 if (!isolate_lru_page(page))
116 int ret = SWAP_AGAIN; 160 __munlock_isolated_page(page);
117 161 else
118 /* 162 __munlock_isolation_failed(page);
119 * Optimization: if the page was mapped just once,
120 * that's our mapping and we don't need to check all the
121 * other vmas.
122 */
123 if (page_mapcount(page) > 1)
124 ret = try_to_munlock(page);
125 /*
126 * did try_to_unlock() succeed or punt?
127 */
128 if (ret != SWAP_MLOCK)
129 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
130
131 putback_lru_page(page);
132 } else {
133 /*
134 * Some other task has removed the page from the LRU.
135 * putback_lru_page() will take care of removing the
136 * page from the unevictable list, if necessary.
137 * vmscan [page_referenced()] will move the page back
138 * to the unevictable list if some other vma has it
139 * mlocked.
140 */
141 if (PageUnevictable(page))
142 count_vm_event(UNEVICTABLE_PGSTRANDED);
143 else
144 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
145 }
146 } 163 }
147 164
148 return page_mask; 165 return page_mask;
@@ -210,6 +227,191 @@ static int __mlock_posix_error_return(long retval)
210} 227}
211 228
212/* 229/*
230 * Prepare page for fast batched LRU putback via __putback_lru_fast()
231 *
232 * The fast path is available only for evictable pages with a single mapping.
233 * Then we can bypass the per-cpu pvec and get better performance.
234 * When mapcount > 1 we need try_to_munlock(), which can fail.
235 * When !page_evictable(), we need the full redo logic of putback_lru_page to
236 * avoid leaving an evictable page on the unevictable list.
237 *
238 * On success, @page is added to @pvec and @pgrescued is incremented
239 * if the page was previously unevictable. @page is also unlocked.
240 */
241static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec,
242 int *pgrescued)
243{
244 VM_BUG_ON(PageLRU(page));
245 VM_BUG_ON(!PageLocked(page));
246
247 if (page_mapcount(page) <= 1 && page_evictable(page)) {
248 pagevec_add(pvec, page);
249 if (TestClearPageUnevictable(page))
250 (*pgrescued)++;
251 unlock_page(page);
252 return true;
253 }
254
255 return false;
256}
257
258/*
259 * Putback multiple evictable pages to the LRU
260 *
261 * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of
262 * the pages might have meanwhile become unevictable but that is OK.
263 */
264static void __putback_lru_fast(struct pagevec *pvec, int pgrescued)
265{
266 count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec));
267 /*
268 *__pagevec_lru_add() calls release_pages() so we don't call
269 * put_page() explicitly
270 */
271 __pagevec_lru_add(pvec);
272 count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
273}
274
275/*
276 * Munlock a batch of pages from the same zone
277 *
278 * The work is split into two main phases. The first phase clears the Mlocked flag
279 * and attempts to isolate the pages, all under a single zone lru lock.
280 * The second phase finishes the munlock only for pages where isolation
281 * succeeded.
282 *
283 * Note that the pagevec may be modified during the process.
284 */
285static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
286{
287 int i;
288 int nr = pagevec_count(pvec);
289 int delta_munlocked = -nr;
290 struct pagevec pvec_putback;
291 int pgrescued = 0;
292
293 /* Phase 1: page isolation */
294 spin_lock_irq(&zone->lru_lock);
295 for (i = 0; i < nr; i++) {
296 struct page *page = pvec->pages[i];
297
298 if (TestClearPageMlocked(page)) {
299 struct lruvec *lruvec;
300 int lru;
301
302 if (PageLRU(page)) {
303 lruvec = mem_cgroup_page_lruvec(page, zone);
304 lru = page_lru(page);
305 /*
306 * We already have pin from follow_page_mask()
307 * so we can spare the get_page() here.
308 */
309 ClearPageLRU(page);
310 del_page_from_lru_list(page, lruvec, lru);
311 } else {
312 __munlock_isolation_failed(page);
313 goto skip_munlock;
314 }
315
316 } else {
317skip_munlock:
318 /*
319 * We won't be munlocking this page in the next phase
320 * but we still need to release the follow_page_mask()
321 * pin.
322 */
323 pvec->pages[i] = NULL;
324 put_page(page);
325 delta_munlocked++;
326 }
327 }
328 __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
329 spin_unlock_irq(&zone->lru_lock);
330
331 /* Phase 2: page munlock */
332 pagevec_init(&pvec_putback, 0);
333 for (i = 0; i < nr; i++) {
334 struct page *page = pvec->pages[i];
335
336 if (page) {
337 lock_page(page);
338 if (!__putback_lru_fast_prepare(page, &pvec_putback,
339 &pgrescued)) {
340 /*
341 * Slow path. We don't want to lose the last
342 * pin before unlock_page()
343 */
344 get_page(page); /* for putback_lru_page() */
345 __munlock_isolated_page(page);
346 unlock_page(page);
347 put_page(page); /* from follow_page_mask() */
348 }
349 }
350 }
351
352 /*
353 * Phase 3: page putback for pages that qualified for the fast path
354 * This will also call put_page() to return pin from follow_page_mask()
355 */
356 if (pagevec_count(&pvec_putback))
357 __putback_lru_fast(&pvec_putback, pgrescued);
358}
359
360/*
361 * Fill up pagevec for __munlock_pagevec using pte walk
362 *
363 * The function expects that the struct page corresponding to @start address is
364 * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone.
365 *
366 * The rest of @pvec is filled by subsequent pages within the same pmd and same
367 * zone, as long as the pte's are present and vm_normal_page() succeeds. These
368 * pages also get pinned.
369 *
370 * Returns the address of the next page that should be scanned. This equals
371 * @start + PAGE_SIZE when no page could be added by the pte walk.
372 */
373static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
374 struct vm_area_struct *vma, int zoneid, unsigned long start,
375 unsigned long end)
376{
377 pte_t *pte;
378 spinlock_t *ptl;
379
380 /*
381 * Initialize pte walk starting at the already pinned page where we
382 * are sure that there is a pte.
383 */
384 pte = get_locked_pte(vma->vm_mm, start, &ptl);
385 end = min(end, pmd_addr_end(start, end));
386
387 /* The page next to the pinned page is the first we will try to get */
388 start += PAGE_SIZE;
389 while (start < end) {
390 struct page *page = NULL;
391 pte++;
392 if (pte_present(*pte))
393 page = vm_normal_page(vma, start, *pte);
394 /*
395 * Break if page could not be obtained or the page's node+zone does not
396 * match
397 */
398 if (!page || page_zone_id(page) != zoneid)
399 break;
400
401 get_page(page);
402 /*
403 * Increase the address that will be returned *before* the
404 * eventual break due to pvec becoming full by adding the page
405 */
406 start += PAGE_SIZE;
407 if (pagevec_add(pvec, page) == 0)
408 break;
409 }
410 pte_unmap_unlock(pte, ptl);
411 return start;
412}
413
414/*
213 * munlock_vma_pages_range() - munlock all pages in the vma range.' 415 * munlock_vma_pages_range() - munlock all pages in the vma range.'
214 * @vma - vma containing range to be munlock()ed. 416 * @vma - vma containing range to be munlock()ed.
215 * @start - start address in @vma of the range 417 * @start - start address in @vma of the range
@@ -233,9 +435,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
233 vma->vm_flags &= ~VM_LOCKED; 435 vma->vm_flags &= ~VM_LOCKED;
234 436
235 while (start < end) { 437 while (start < end) {
236 struct page *page; 438 struct page *page = NULL;
237 unsigned int page_mask, page_increm; 439 unsigned int page_mask, page_increm;
440 struct pagevec pvec;
441 struct zone *zone;
442 int zoneid;
238 443
444 pagevec_init(&pvec, 0);
239 /* 445 /*
240 * Although FOLL_DUMP is intended for get_dump_page(), 446 * Although FOLL_DUMP is intended for get_dump_page(),
241 * it just so happens that its special treatment of the 447 * it just so happens that its special treatment of the
@@ -244,21 +450,45 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
244 * has sneaked into the range, we won't oops here: great). 450 * has sneaked into the range, we won't oops here: great).
245 */ 451 */
246 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, 452 page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
247 &page_mask); 453 &page_mask);
454
248 if (page && !IS_ERR(page)) { 455 if (page && !IS_ERR(page)) {
249 lock_page(page); 456 if (PageTransHuge(page)) {
250 lru_add_drain(); 457 lock_page(page);
251 /* 458 /*
252 * Any THP page found by follow_page_mask() may have 459 * Any THP page found by follow_page_mask() may
253 * gotten split before reaching munlock_vma_page(), 460 * have gotten split before reaching
254 * so we need to recompute the page_mask here. 461 * munlock_vma_page(), so we need to recompute
255 */ 462 * the page_mask here.
256 page_mask = munlock_vma_page(page); 463 */
257 unlock_page(page); 464 page_mask = munlock_vma_page(page);
258 put_page(page); 465 unlock_page(page);
466 put_page(page); /* follow_page_mask() */
467 } else {
468 /*
469 * Non-huge pages are handled in batches via
470 * pagevec. The pin from follow_page_mask()
471 * prevents them from collapsing by THP.
472 */
473 pagevec_add(&pvec, page);
474 zone = page_zone(page);
475 zoneid = page_zone_id(page);
476
477 /*
478 * Try to fill the rest of pagevec using fast
479 * pte walk. This will also update start to
480 * the next page to process. Then munlock the
481 * pagevec.
482 */
483 start = __munlock_pagevec_fill(&pvec, vma,
484 zoneid, start, end);
485 __munlock_pagevec(&pvec, zone);
486 goto next;
487 }
259 } 488 }
260 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); 489 page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
261 start += page_increm * PAGE_SIZE; 490 start += page_increm * PAGE_SIZE;
491next:
262 cond_resched(); 492 cond_resched();
263 } 493 }
264} 494}
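The munlock side above is now batched: follow_page_mask() pins one page, __munlock_pagevec_fill() collects its neighbours with a pte walk, and __munlock_pagevec() clears PG_mlocked and isolates the whole batch under a single lru_lock acquisition. Nothing changes for user space except speed; the sketch below just exercises the path with a range larger than one pagevec, assuming RLIMIT_MEMLOCK allows it.

/*
 * Hedged sketch: lock and then unlock a multi-page range; the munlock()
 * call is what ends up in the batched __munlock_pagevec() path above.
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t len = 64 * psz;		/* several pagevecs' worth of pages */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	if (mlock(p, len)) {		/* populates and mlocks every page */
		perror("mlock");
		return 1;
	}
	memset(p, 0, len);

	if (munlock(p, len))		/* clears the mlock state in batches */
		perror("munlock");

	munmap(p, len);
	return 0;
}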
diff --git a/mm/mmap.c b/mm/mmap.c
index f9c97d10b873..9d548512ff8a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1202,7 +1202,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1202 unsigned long *populate) 1202 unsigned long *populate)
1203{ 1203{
1204 struct mm_struct * mm = current->mm; 1204 struct mm_struct * mm = current->mm;
1205 struct inode *inode;
1206 vm_flags_t vm_flags; 1205 vm_flags_t vm_flags;
1207 1206
1208 *populate = 0; 1207 *populate = 0;
@@ -1265,9 +1264,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1265 return -EAGAIN; 1264 return -EAGAIN;
1266 } 1265 }
1267 1266
1268 inode = file ? file_inode(file) : NULL;
1269
1270 if (file) { 1267 if (file) {
1268 struct inode *inode = file_inode(file);
1269
1271 switch (flags & MAP_TYPE) { 1270 switch (flags & MAP_TYPE) {
1272 case MAP_SHARED: 1271 case MAP_SHARED:
1273 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) 1272 if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
@@ -1302,6 +1301,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1302 1301
1303 if (!file->f_op || !file->f_op->mmap) 1302 if (!file->f_op || !file->f_op->mmap)
1304 return -ENODEV; 1303 return -ENODEV;
1304 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1305 return -EINVAL;
1305 break; 1306 break;
1306 1307
1307 default: 1308 default:
@@ -1310,6 +1311,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1310 } else { 1311 } else {
1311 switch (flags & MAP_TYPE) { 1312 switch (flags & MAP_TYPE) {
1312 case MAP_SHARED: 1313 case MAP_SHARED:
1314 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1315 return -EINVAL;
1313 /* 1316 /*
1314 * Ignore pgoff. 1317 * Ignore pgoff.
1315 */ 1318 */
@@ -1476,11 +1479,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1476{ 1479{
1477 struct mm_struct *mm = current->mm; 1480 struct mm_struct *mm = current->mm;
1478 struct vm_area_struct *vma, *prev; 1481 struct vm_area_struct *vma, *prev;
1479 int correct_wcount = 0;
1480 int error; 1482 int error;
1481 struct rb_node **rb_link, *rb_parent; 1483 struct rb_node **rb_link, *rb_parent;
1482 unsigned long charged = 0; 1484 unsigned long charged = 0;
1483 struct inode *inode = file ? file_inode(file) : NULL;
1484 1485
1485 /* Check against address space limit. */ 1486 /* Check against address space limit. */
1486 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { 1487 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
@@ -1544,16 +1545,11 @@ munmap_back:
1544 vma->vm_pgoff = pgoff; 1545 vma->vm_pgoff = pgoff;
1545 INIT_LIST_HEAD(&vma->anon_vma_chain); 1546 INIT_LIST_HEAD(&vma->anon_vma_chain);
1546 1547
1547 error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */
1548
1549 if (file) { 1548 if (file) {
1550 if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
1551 goto free_vma;
1552 if (vm_flags & VM_DENYWRITE) { 1549 if (vm_flags & VM_DENYWRITE) {
1553 error = deny_write_access(file); 1550 error = deny_write_access(file);
1554 if (error) 1551 if (error)
1555 goto free_vma; 1552 goto free_vma;
1556 correct_wcount = 1;
1557 } 1553 }
1558 vma->vm_file = get_file(file); 1554 vma->vm_file = get_file(file);
1559 error = file->f_op->mmap(file, vma); 1555 error = file->f_op->mmap(file, vma);
@@ -1570,11 +1566,8 @@ munmap_back:
1570 WARN_ON_ONCE(addr != vma->vm_start); 1566 WARN_ON_ONCE(addr != vma->vm_start);
1571 1567
1572 addr = vma->vm_start; 1568 addr = vma->vm_start;
1573 pgoff = vma->vm_pgoff;
1574 vm_flags = vma->vm_flags; 1569 vm_flags = vma->vm_flags;
1575 } else if (vm_flags & VM_SHARED) { 1570 } else if (vm_flags & VM_SHARED) {
1576 if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP)))
1577 goto free_vma;
1578 error = shmem_zero_setup(vma); 1571 error = shmem_zero_setup(vma);
1579 if (error) 1572 if (error)
1580 goto free_vma; 1573 goto free_vma;
@@ -1596,11 +1589,10 @@ munmap_back:
1596 } 1589 }
1597 1590
1598 vma_link(mm, vma, prev, rb_link, rb_parent); 1591 vma_link(mm, vma, prev, rb_link, rb_parent);
1599 file = vma->vm_file;
1600
1601 /* Once vma denies write, undo our temporary denial count */ 1592 /* Once vma denies write, undo our temporary denial count */
1602 if (correct_wcount) 1593 if (vm_flags & VM_DENYWRITE)
1603 atomic_inc(&inode->i_writecount); 1594 allow_write_access(file);
1595 file = vma->vm_file;
1604out: 1596out:
1605 perf_event_mmap(vma); 1597 perf_event_mmap(vma);
1606 1598
@@ -1616,11 +1608,20 @@ out:
1616 if (file) 1608 if (file)
1617 uprobe_mmap(vma); 1609 uprobe_mmap(vma);
1618 1610
1611 /*
1612 * A new (or expanded) vma always gets soft-dirty status.
1613 * Otherwise the user-space soft-dirty page tracker would not
1614 * be able to tell the case where a vma area was unmapped
1615 * and then a new one mapped in its place (which must be
1616 * treated as a completely new data area).
1617 */
1618 vma->vm_flags |= VM_SOFTDIRTY;
1619
1619 return addr; 1620 return addr;
1620 1621
1621unmap_and_free_vma: 1622unmap_and_free_vma:
1622 if (correct_wcount) 1623 if (vm_flags & VM_DENYWRITE)
1623 atomic_inc(&inode->i_writecount); 1624 allow_write_access(file);
1624 vma->vm_file = NULL; 1625 vma->vm_file = NULL;
1625 fput(file); 1626 fput(file);
1626 1627
@@ -2380,7 +2381,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
2380static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, 2381static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2381 unsigned long addr, int new_below) 2382 unsigned long addr, int new_below)
2382{ 2383{
2383 struct mempolicy *pol;
2384 struct vm_area_struct *new; 2384 struct vm_area_struct *new;
2385 int err = -ENOMEM; 2385 int err = -ENOMEM;
2386 2386
@@ -2404,12 +2404,9 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2404 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 2404 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
2405 } 2405 }
2406 2406
2407 pol = mpol_dup(vma_policy(vma)); 2407 err = vma_dup_policy(vma, new);
2408 if (IS_ERR(pol)) { 2408 if (err)
2409 err = PTR_ERR(pol);
2410 goto out_free_vma; 2409 goto out_free_vma;
2411 }
2412 vma_set_policy(new, pol);
2413 2410
2414 if (anon_vma_clone(new, vma)) 2411 if (anon_vma_clone(new, vma))
2415 goto out_free_mpol; 2412 goto out_free_mpol;
@@ -2437,7 +2434,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2437 fput(new->vm_file); 2434 fput(new->vm_file);
2438 unlink_anon_vmas(new); 2435 unlink_anon_vmas(new);
2439 out_free_mpol: 2436 out_free_mpol:
2440 mpol_put(pol); 2437 mpol_put(vma_policy(new));
2441 out_free_vma: 2438 out_free_vma:
2442 kmem_cache_free(vm_area_cachep, new); 2439 kmem_cache_free(vm_area_cachep, new);
2443 out_err: 2440 out_err:
@@ -2663,6 +2660,7 @@ out:
2663 mm->total_vm += len >> PAGE_SHIFT; 2660 mm->total_vm += len >> PAGE_SHIFT;
2664 if (flags & VM_LOCKED) 2661 if (flags & VM_LOCKED)
2665 mm->locked_vm += (len >> PAGE_SHIFT); 2662 mm->locked_vm += (len >> PAGE_SHIFT);
2663 vma->vm_flags |= VM_SOFTDIRTY;
2666 return addr; 2664 return addr;
2667} 2665}
2668 2666
@@ -2780,7 +2778,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2780 struct mm_struct *mm = vma->vm_mm; 2778 struct mm_struct *mm = vma->vm_mm;
2781 struct vm_area_struct *new_vma, *prev; 2779 struct vm_area_struct *new_vma, *prev;
2782 struct rb_node **rb_link, *rb_parent; 2780 struct rb_node **rb_link, *rb_parent;
2783 struct mempolicy *pol;
2784 bool faulted_in_anon_vma = true; 2781 bool faulted_in_anon_vma = true;
2785 2782
2786 /* 2783 /*
@@ -2825,10 +2822,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2825 new_vma->vm_start = addr; 2822 new_vma->vm_start = addr;
2826 new_vma->vm_end = addr + len; 2823 new_vma->vm_end = addr + len;
2827 new_vma->vm_pgoff = pgoff; 2824 new_vma->vm_pgoff = pgoff;
2828 pol = mpol_dup(vma_policy(vma)); 2825 if (vma_dup_policy(vma, new_vma))
2829 if (IS_ERR(pol))
2830 goto out_free_vma; 2826 goto out_free_vma;
2831 vma_set_policy(new_vma, pol);
2832 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2827 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2833 if (anon_vma_clone(new_vma, vma)) 2828 if (anon_vma_clone(new_vma, vma))
2834 goto out_free_mempol; 2829 goto out_free_mempol;
@@ -2843,7 +2838,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2843 return new_vma; 2838 return new_vma;
2844 2839
2845 out_free_mempol: 2840 out_free_mempol:
2846 mpol_put(pol); 2841 mpol_put(vma_policy(new_vma));
2847 out_free_vma: 2842 out_free_vma:
2848 kmem_cache_free(vm_area_cachep, new_vma); 2843 kmem_cache_free(vm_area_cachep, new_vma);
2849 return NULL; 2844 return NULL;
@@ -2930,7 +2925,7 @@ int install_special_mapping(struct mm_struct *mm,
2930 vma->vm_start = addr; 2925 vma->vm_start = addr;
2931 vma->vm_end = addr + len; 2926 vma->vm_end = addr + len;
2932 2927
2933 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; 2928 vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
2934 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 2929 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
2935 2930
2936 vma->vm_ops = &special_mapping_vmops; 2931 vma->vm_ops = &special_mapping_vmops;
diff --git a/mm/mremap.c b/mm/mremap.c
index 0843feb66f3d..91b13d6a16d4 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -25,6 +25,7 @@
25#include <asm/uaccess.h> 25#include <asm/uaccess.h>
26#include <asm/cacheflush.h> 26#include <asm/cacheflush.h>
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28#include <asm/pgalloc.h>
28 29
29#include "internal.h" 30#include "internal.h"
30 31
@@ -62,8 +63,10 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
62 return NULL; 63 return NULL;
63 64
64 pmd = pmd_alloc(mm, pud, addr); 65 pmd = pmd_alloc(mm, pud, addr);
65 if (!pmd) 66 if (!pmd) {
67 pud_free(mm, pud);
66 return NULL; 68 return NULL;
69 }
67 70
68 VM_BUG_ON(pmd_trans_huge(*pmd)); 71 VM_BUG_ON(pmd_trans_huge(*pmd));
69 72
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3f0c895c71fe..6c7b0187be8e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,8 +36,11 @@
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/timer.h> 37#include <linux/timer.h>
38#include <linux/sched/rt.h> 38#include <linux/sched/rt.h>
39#include <linux/mm_inline.h>
39#include <trace/events/writeback.h> 40#include <trace/events/writeback.h>
40 41
42#include "internal.h"
43
41/* 44/*
42 * Sleep at most 200ms at a time in balance_dirty_pages(). 45 * Sleep at most 200ms at a time in balance_dirty_pages().
43 */ 46 */
@@ -241,9 +244,6 @@ static unsigned long global_dirtyable_memory(void)
241 if (!vm_highmem_is_dirtyable) 244 if (!vm_highmem_is_dirtyable)
242 x -= highmem_dirtyable_memory(x); 245 x -= highmem_dirtyable_memory(x);
243 246
244 /* Subtract min_free_kbytes */
245 x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10));
246
247 return x + 1; /* Ensure that we never return 0 */ 247 return x + 1; /* Ensure that we never return 0 */
248} 248}
249 249
@@ -585,6 +585,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
585} 585}
586 586
587/* 587/*
588 * setpoint - dirty 3
589 * f(dirty) := 1.0 + (----------------)
590 * limit - setpoint
591 *
592 * it's a 3rd order polynomial that subjects to
593 *
594 * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast
595 * (2) f(setpoint) = 1.0 => the balance point
596 * (3) f(limit) = 0 => the hard limit
597 * (4) df/dx <= 0 => negative feedback control
598 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
599 * => fast response on large errors; small oscillation near setpoint
600 */
601static inline long long pos_ratio_polynom(unsigned long setpoint,
602 unsigned long dirty,
603 unsigned long limit)
604{
605 long long pos_ratio;
606 long x;
607
608 x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
609 limit - setpoint + 1);
610 pos_ratio = x;
611 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
612 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
613 pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
614
615 return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
616}
617
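In plain notation the function above computes f(dirty) = 1 + ((setpoint - dirty) / (limit - setpoint))^3, clamped to [0, 2] and carried in RATELIMIT_CALC_SHIFT fixed point. A throwaway floating-point sketch (the freerun/limit values are invented) confirms the three anchor points listed in the comment:

#include <stdio.h>

/* Floating-point sketch of pos_ratio_polynom(); the kernel's fixed-point
 * shift is left out, only the shape of the curve is shown. */
static double pos_ratio(double setpoint, double dirty, double limit)
{
        double x = (setpoint - dirty) / (limit - setpoint);
        double r = 1.0 + x * x * x;

        if (r < 0.0)
                r = 0.0;
        if (r > 2.0)
                r = 2.0;
        return r;
}

int main(void)
{
        double freerun = 100, limit = 300;
        double setpoint = (freerun + limit) / 2;        /* 200 */

        printf("f(freerun)  = %.2f\n", pos_ratio(setpoint, freerun, limit));  /* 2.00 */
        printf("f(setpoint) = %.2f\n", pos_ratio(setpoint, setpoint, limit)); /* 1.00 */
        printf("f(limit)    = %.2f\n", pos_ratio(setpoint, limit, limit));    /* 0.00 */
        return 0;
}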
618/*
588 * Dirty position control. 619 * Dirty position control.
589 * 620 *
590 * (o) global/bdi setpoints 621 * (o) global/bdi setpoints
@@ -682,26 +713,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
682 /* 713 /*
683 * global setpoint 714 * global setpoint
684 * 715 *
685 * setpoint - dirty 3 716 * See comment for pos_ratio_polynom().
686 * f(dirty) := 1.0 + (----------------) 717 */
687 * limit - setpoint 718 setpoint = (freerun + limit) / 2;
719 pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);
720
721 /*
722 * The strictlimit feature is a tool preventing mistrusted filesystems
723 * from growing a large number of dirty pages before throttling. For
724 * such filesystems balance_dirty_pages always checks bdi counters
725 * against bdi limits. Even if global "nr_dirty" is under "freerun".
726 * This is especially important for fuse which sets bdi->max_ratio to
727 * 1% by default. Without strictlimit feature, fuse writeback may
728 * consume arbitrary amount of RAM because it is accounted in
729 * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
688 * 730 *
689 * it's a 3rd order polynomial that subjects to 731 * Here, in bdi_position_ratio(), we calculate pos_ratio based on
732 * two values: bdi_dirty and bdi_thresh. Let's consider an example:
733 * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
734 * limits are set by default to 10% and 20% (background and throttle).
735 * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
736 * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
737 * about ~6K pages (as the average of background and throttle bdi
738 * limits). The 3rd order polynomial will provide positive feedback if
739 * bdi_dirty is under bdi_setpoint and vice versa.
690 * 740 *
691 * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast 741 * Note, that we cannot use global counters in these calculations
692 * (2) f(setpoint) = 1.0 => the balance point 742 * because we want to throttle process writing to a strictlimit BDI
693 * (3) f(limit) = 0 => the hard limit 743 * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
694 * (4) df/dx <= 0 => negative feedback control 744 * in the example above).
695 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
696 * => fast response on large errors; small oscillation near setpoint
697 */ 745 */
698 setpoint = (freerun + limit) / 2; 746 if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
699 x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, 747 long long bdi_pos_ratio;
700 limit - setpoint + 1); 748 unsigned long bdi_bg_thresh;
701 pos_ratio = x; 749
702 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; 750 if (bdi_dirty < 8)
703 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; 751 return min_t(long long, pos_ratio * 2,
704 pos_ratio += 1 << RATELIMIT_CALC_SHIFT; 752 2 << RATELIMIT_CALC_SHIFT);
753
754 if (bdi_dirty >= bdi_thresh)
755 return 0;
756
757 bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
758 bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
759 bdi_bg_thresh);
760
761 if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
762 return 0;
763
764 bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
765 bdi_thresh);
766
767 /*
768 * Typically, for strictlimit case, bdi_setpoint << setpoint
 769	 * and pos_ratio >> bdi_pos_ratio. In other words, the global
 770	 * state ("dirty") is not the limiting factor and we have to
 771	 * make the decision based on bdi counters. But there is an
772 * important case when global pos_ratio should get precedence:
773 * global limits are exceeded (e.g. due to activities on other
774 * BDIs) while given strictlimit BDI is below limit.
775 *
776 * "pos_ratio * bdi_pos_ratio" would work for the case above,
777 * but it would look too non-natural for the case of all
778 * activity in the system coming from a single strictlimit BDI
779 * with bdi->max_ratio == 100%.
780 *
781 * Note that min() below somewhat changes the dynamics of the
782 * control system. Normally, pos_ratio value can be well over 3
783 * (when globally we are at freerun and bdi is well below bdi
784 * setpoint). Now the maximum pos_ratio in the same situation
785 * is 2. We might want to tweak this if we observe the control
786 * system is too slow to adapt.
787 */
788 return min(pos_ratio, bdi_pos_ratio);
789 }
705 790
706 /* 791 /*
707 * We have computed basic pos_ratio above based on global situation. If 792 * We have computed basic pos_ratio above based on global situation. If
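Working the example from the comment through with assumed 4 KiB pages: 16 GiB is 4,194,304 pages, a 20% throttle limit is ~838,860 pages, so a 1% bdi->max_ratio gives a bdi_thresh of ~8,388 pages (~32 MiB); the 10% background limit scales the same way to ~4,194 pages, putting bdi_setpoint near 6,291 pages. A quick sketch of that arithmetic (the percentages are the defaults the comment assumes, not values read from a live system):

#include <stdio.h>

/* Back-of-the-envelope version of the example in the comment above. */
int main(void)
{
        unsigned long long page_size = 4096;
        unsigned long long ram_pages = 16ULL * 1024 * 1024 * 1024 / page_size;
        unsigned long long dirty_thresh = ram_pages * 20 / 100;  /* global throttle */
        unsigned long long bg_thresh = ram_pages * 10 / 100;     /* global background */
        unsigned long long bdi_thresh = dirty_thresh * 1 / 100;  /* bdi->max_ratio = 1% */
        unsigned long long bdi_bg_thresh = bg_thresh * 1 / 100;
        unsigned long long bdi_setpoint = (bdi_thresh + bdi_bg_thresh) / 2;

        printf("bdi_thresh   = %llu pages (~%llu MiB)\n",
               bdi_thresh, bdi_thresh * page_size >> 20);
        printf("bdi_setpoint = %llu pages\n", bdi_setpoint);
        return 0;
}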
@@ -994,6 +1079,27 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
994 * keep that period small to reduce time lags). 1079 * keep that period small to reduce time lags).
995 */ 1080 */
996 step = 0; 1081 step = 0;
1082
1083 /*
1084 * For strictlimit case, calculations above were based on bdi counters
1085 * and limits (starting from pos_ratio = bdi_position_ratio() and up to
1086 * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
1087 * Hence, to calculate "step" properly, we have to use bdi_dirty as
1088 * "dirty" and bdi_setpoint as "setpoint".
1089 *
 1090	 * We ramp up dirty_ratelimit forcibly if bdi_dirty is low because
1091 * it's possible that bdi_thresh is close to zero due to inactivity
1092 * of backing device (see the implementation of bdi_dirty_limit()).
1093 */
1094 if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
1095 dirty = bdi_dirty;
1096 if (bdi_dirty < 8)
1097 setpoint = bdi_dirty + 1;
1098 else
1099 setpoint = (bdi_thresh +
1100 bdi_dirty_limit(bdi, bg_thresh)) / 2;
1101 }
1102
997 if (dirty < setpoint) { 1103 if (dirty < setpoint) {
998 x = min(bdi->balanced_dirty_ratelimit, 1104 x = min(bdi->balanced_dirty_ratelimit,
999 min(balanced_dirty_ratelimit, task_ratelimit)); 1105 min(balanced_dirty_ratelimit, task_ratelimit));
@@ -1198,6 +1304,56 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
1198 return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; 1304 return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
1199} 1305}
1200 1306
1307static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
1308 unsigned long dirty_thresh,
1309 unsigned long background_thresh,
1310 unsigned long *bdi_dirty,
1311 unsigned long *bdi_thresh,
1312 unsigned long *bdi_bg_thresh)
1313{
1314 unsigned long bdi_reclaimable;
1315
1316 /*
1317 * bdi_thresh is not treated as some limiting factor as
1318 * dirty_thresh, due to reasons
1319 * - in JBOD setup, bdi_thresh can fluctuate a lot
1320 * - in a system with HDD and USB key, the USB key may somehow
1321 * go into state (bdi_dirty >> bdi_thresh) either because
1322 * bdi_dirty starts high, or because bdi_thresh drops low.
1323 * In this case we don't want to hard throttle the USB key
1324 * dirtiers for 100 seconds until bdi_dirty drops under
1325 * bdi_thresh. Instead the auxiliary bdi control line in
1326 * bdi_position_ratio() will let the dirtier task progress
1327 * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
1328 */
1329 *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
1330
1331 if (bdi_bg_thresh)
1332 *bdi_bg_thresh = div_u64((u64)*bdi_thresh *
1333 background_thresh,
1334 dirty_thresh);
1335
1336 /*
1337 * In order to avoid the stacked BDI deadlock we need
1338 * to ensure we accurately count the 'dirty' pages when
1339 * the threshold is low.
1340 *
1341 * Otherwise it would be possible to get thresh+n pages
1342 * reported dirty, even though there are thresh-m pages
1343 * actually dirty; with m+n sitting in the percpu
1344 * deltas.
1345 */
1346 if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
1347 bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
1348 *bdi_dirty = bdi_reclaimable +
1349 bdi_stat_sum(bdi, BDI_WRITEBACK);
1350 } else {
1351 bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
1352 *bdi_dirty = bdi_reclaimable +
1353 bdi_stat(bdi, BDI_WRITEBACK);
1354 }
1355}
1356
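The 2 * bdi_stat_error() test in the helper above matters because BDI statistics are percpu counters: the cheap bdi_stat() read ignores the per-CPU deltas and can lag the true value by up to the batch size times the CPU count, which only becomes significant when bdi_thresh itself is that small. A hedged user-space model of the effect (the batch of 32 and the CPU count are invented for illustration):

#include <stdio.h>

#define NR_CPUS         8
#define PCP_BATCH       32      /* invented; stands in for the percpu batch */

/* Each CPU keeps a private delta and only folds it into the global
 * counter once the delta exceeds the batch, so a cheap read of 'global'
 * can lag the true value by up to NR_CPUS * PCP_BATCH. */
struct pcp_counter {
        long global;
        long delta[NR_CPUS];
};

static void pcp_add(struct pcp_counter *c, int cpu, long v)
{
        c->delta[cpu] += v;
        if (c->delta[cpu] >= PCP_BATCH || c->delta[cpu] <= -PCP_BATCH) {
                c->global += c->delta[cpu];
                c->delta[cpu] = 0;
        }
}

static long pcp_read_fast(struct pcp_counter *c)        /* ~bdi_stat() */
{
        return c->global;
}

static long pcp_read_exact(struct pcp_counter *c)       /* ~bdi_stat_sum() */
{
        long sum = c->global;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                sum += c->delta[cpu];
        return sum;
}

int main(void)
{
        struct pcp_counter c = { 0 };

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                pcp_add(&c, cpu, PCP_BATCH - 1);        /* stays in the deltas */

        printf("fast read  = %ld\n", pcp_read_fast(&c));   /* 0   */
        printf("exact read = %ld\n", pcp_read_exact(&c));  /* 248 */
        return 0;
}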
1201/* 1357/*
1202 * balance_dirty_pages() must be called by processes which are generating dirty 1358 * balance_dirty_pages() must be called by processes which are generating dirty
1203 * data. It looks at the number of dirty pages in the machine and will force 1359 * data. It looks at the number of dirty pages in the machine and will force
@@ -1209,13 +1365,9 @@ static void balance_dirty_pages(struct address_space *mapping,
1209 unsigned long pages_dirtied) 1365 unsigned long pages_dirtied)
1210{ 1366{
1211 unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ 1367 unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
1212 unsigned long bdi_reclaimable;
1213 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ 1368 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
1214 unsigned long bdi_dirty;
1215 unsigned long freerun;
1216 unsigned long background_thresh; 1369 unsigned long background_thresh;
1217 unsigned long dirty_thresh; 1370 unsigned long dirty_thresh;
1218 unsigned long bdi_thresh;
1219 long period; 1371 long period;
1220 long pause; 1372 long pause;
1221 long max_pause; 1373 long max_pause;
@@ -1226,10 +1378,16 @@ static void balance_dirty_pages(struct address_space *mapping,
1226 unsigned long dirty_ratelimit; 1378 unsigned long dirty_ratelimit;
1227 unsigned long pos_ratio; 1379 unsigned long pos_ratio;
1228 struct backing_dev_info *bdi = mapping->backing_dev_info; 1380 struct backing_dev_info *bdi = mapping->backing_dev_info;
1381 bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
1229 unsigned long start_time = jiffies; 1382 unsigned long start_time = jiffies;
1230 1383
1231 for (;;) { 1384 for (;;) {
1232 unsigned long now = jiffies; 1385 unsigned long now = jiffies;
1386 unsigned long uninitialized_var(bdi_thresh);
1387 unsigned long thresh;
1388 unsigned long uninitialized_var(bdi_dirty);
1389 unsigned long dirty;
1390 unsigned long bg_thresh;
1233 1391
1234 /* 1392 /*
1235 * Unstable writes are a feature of certain networked 1393 * Unstable writes are a feature of certain networked
@@ -1243,61 +1401,44 @@ static void balance_dirty_pages(struct address_space *mapping,
1243 1401
1244 global_dirty_limits(&background_thresh, &dirty_thresh); 1402 global_dirty_limits(&background_thresh, &dirty_thresh);
1245 1403
1404 if (unlikely(strictlimit)) {
1405 bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
1406 &bdi_dirty, &bdi_thresh, &bg_thresh);
1407
1408 dirty = bdi_dirty;
1409 thresh = bdi_thresh;
1410 } else {
1411 dirty = nr_dirty;
1412 thresh = dirty_thresh;
1413 bg_thresh = background_thresh;
1414 }
1415
1246 /* 1416 /*
1247 * Throttle it only when the background writeback cannot 1417 * Throttle it only when the background writeback cannot
1248 * catch-up. This avoids (excessively) small writeouts 1418 * catch-up. This avoids (excessively) small writeouts
1249 * when the bdi limits are ramping up. 1419 * when the bdi limits are ramping up in case of !strictlimit.
1420 *
1421 * In strictlimit case make decision based on the bdi counters
1422 * and limits. Small writeouts when the bdi limits are ramping
1423 * up are the price we consciously pay for strictlimit-ing.
1250 */ 1424 */
1251 freerun = dirty_freerun_ceiling(dirty_thresh, 1425 if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
1252 background_thresh);
1253 if (nr_dirty <= freerun) {
1254 current->dirty_paused_when = now; 1426 current->dirty_paused_when = now;
1255 current->nr_dirtied = 0; 1427 current->nr_dirtied = 0;
1256 current->nr_dirtied_pause = 1428 current->nr_dirtied_pause =
1257 dirty_poll_interval(nr_dirty, dirty_thresh); 1429 dirty_poll_interval(dirty, thresh);
1258 break; 1430 break;
1259 } 1431 }
1260 1432
1261 if (unlikely(!writeback_in_progress(bdi))) 1433 if (unlikely(!writeback_in_progress(bdi)))
1262 bdi_start_background_writeback(bdi); 1434 bdi_start_background_writeback(bdi);
1263 1435
1264 /* 1436 if (!strictlimit)
1265 * bdi_thresh is not treated as some limiting factor as 1437 bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
1266 * dirty_thresh, due to reasons 1438 &bdi_dirty, &bdi_thresh, NULL);
1267 * - in JBOD setup, bdi_thresh can fluctuate a lot
1268 * - in a system with HDD and USB key, the USB key may somehow
1269 * go into state (bdi_dirty >> bdi_thresh) either because
1270 * bdi_dirty starts high, or because bdi_thresh drops low.
1271 * In this case we don't want to hard throttle the USB key
1272 * dirtiers for 100 seconds until bdi_dirty drops under
1273 * bdi_thresh. Instead the auxiliary bdi control line in
1274 * bdi_position_ratio() will let the dirtier task progress
1275 * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
1276 */
1277 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
1278
1279 /*
1280 * In order to avoid the stacked BDI deadlock we need
1281 * to ensure we accurately count the 'dirty' pages when
1282 * the threshold is low.
1283 *
1284 * Otherwise it would be possible to get thresh+n pages
1285 * reported dirty, even though there are thresh-m pages
1286 * actually dirty; with m+n sitting in the percpu
1287 * deltas.
1288 */
1289 if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
1290 bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
1291 bdi_dirty = bdi_reclaimable +
1292 bdi_stat_sum(bdi, BDI_WRITEBACK);
1293 } else {
1294 bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
1295 bdi_dirty = bdi_reclaimable +
1296 bdi_stat(bdi, BDI_WRITEBACK);
1297 }
1298 1439
1299 dirty_exceeded = (bdi_dirty > bdi_thresh) && 1440 dirty_exceeded = (bdi_dirty > bdi_thresh) &&
1300 (nr_dirty > dirty_thresh); 1441 ((nr_dirty > dirty_thresh) || strictlimit);
1301 if (dirty_exceeded && !bdi->dirty_exceeded) 1442 if (dirty_exceeded && !bdi->dirty_exceeded)
1302 bdi->dirty_exceeded = 1; 1443 bdi->dirty_exceeded = 1;
1303 1444
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c2b59dbda196..0ee638f76ebe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@
56#include <linux/ftrace_event.h> 56#include <linux/ftrace_event.h>
57#include <linux/memcontrol.h> 57#include <linux/memcontrol.h>
58#include <linux/prefetch.h> 58#include <linux/prefetch.h>
59#include <linux/mm_inline.h>
59#include <linux/migrate.h> 60#include <linux/migrate.h>
60#include <linux/page-debug-flags.h> 61#include <linux/page-debug-flags.h>
61#include <linux/hugetlb.h> 62#include <linux/hugetlb.h>
@@ -488,8 +489,10 @@ __find_buddy_index(unsigned long page_idx, unsigned int order)
488 * (c) a page and its buddy have the same order && 489 * (c) a page and its buddy have the same order &&
489 * (d) a page and its buddy are in the same zone. 490 * (d) a page and its buddy are in the same zone.
490 * 491 *
491 * For recording whether a page is in the buddy system, we set ->_mapcount -2. 492 * For recording whether a page is in the buddy system, we set ->_mapcount
492 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. 493 * PAGE_BUDDY_MAPCOUNT_VALUE.
494 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
495 * serialized by zone->lock.
493 * 496 *
494 * For recording page's order, we use page_private(page). 497 * For recording page's order, we use page_private(page).
495 */ 498 */
@@ -527,8 +530,9 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
527 * as necessary, plus some accounting needed to play nicely with other 530 * as necessary, plus some accounting needed to play nicely with other
528 * parts of the VM system. 531 * parts of the VM system.
529 * At each level, we keep a list of pages, which are heads of continuous 532 * At each level, we keep a list of pages, which are heads of continuous
530 * free pages of length of (1 << order) and marked with _mapcount -2. Page's 533 * free pages of length of (1 << order) and marked with _mapcount
531 * order is recorded in page_private(page) field. 534 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
535 * field.
532 * So when we are allocating or freeing one, we can derive the state of the 536 * So when we are allocating or freeing one, we can derive the state of the
533 * other. That is, if we allocate a small block, and both were 537 * other. That is, if we allocate a small block, and both were
534 * free, the remainder of the region must be split into blocks. 538 * free, the remainder of the region must be split into blocks.
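The buddy bookkeeping described in these comments rests on one identity: at order o, the buddy of the free block at index i is at index i ^ (1 << o), and when both halves are free they merge into the block starting at i & ~(1 << o). A tiny stand-alone check of that arithmetic (indices are arbitrary, no real pages involved):

#include <stdio.h>

/* Buddy index arithmetic only, in the style of __find_buddy_index(). */
static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

int main(void)
{
        unsigned long idx = 12;         /* arbitrary block index */
        unsigned int order = 2;         /* block of 4 pages */
        unsigned long buddy = find_buddy_index(idx, order);

        /* idx & buddy equals idx & ~(1 << order): the merged parent block */
        printf("block %lu, order %u -> buddy %lu, merged parent %lu\n",
               idx, order, buddy, idx & buddy);
        return 0;
}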
@@ -647,7 +651,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
647 int to_free = count; 651 int to_free = count;
648 652
649 spin_lock(&zone->lock); 653 spin_lock(&zone->lock);
650 zone->all_unreclaimable = 0;
651 zone->pages_scanned = 0; 654 zone->pages_scanned = 0;
652 655
653 while (to_free) { 656 while (to_free) {
@@ -696,7 +699,6 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
696 int migratetype) 699 int migratetype)
697{ 700{
698 spin_lock(&zone->lock); 701 spin_lock(&zone->lock);
699 zone->all_unreclaimable = 0;
700 zone->pages_scanned = 0; 702 zone->pages_scanned = 0;
701 703
702 __free_one_page(page, zone, order, migratetype); 704 __free_one_page(page, zone, order, migratetype);
@@ -721,7 +723,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
721 return false; 723 return false;
722 724
723 if (!PageHighMem(page)) { 725 if (!PageHighMem(page)) {
724 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 726 debug_check_no_locks_freed(page_address(page),
727 PAGE_SIZE << order);
725 debug_check_no_obj_freed(page_address(page), 728 debug_check_no_obj_freed(page_address(page),
726 PAGE_SIZE << order); 729 PAGE_SIZE << order);
727 } 730 }
@@ -750,19 +753,19 @@ static void __free_pages_ok(struct page *page, unsigned int order)
750void __init __free_pages_bootmem(struct page *page, unsigned int order) 753void __init __free_pages_bootmem(struct page *page, unsigned int order)
751{ 754{
752 unsigned int nr_pages = 1 << order; 755 unsigned int nr_pages = 1 << order;
756 struct page *p = page;
753 unsigned int loop; 757 unsigned int loop;
754 758
755 prefetchw(page); 759 prefetchw(p);
756 for (loop = 0; loop < nr_pages; loop++) { 760 for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
757 struct page *p = &page[loop]; 761 prefetchw(p + 1);
758
759 if (loop + 1 < nr_pages)
760 prefetchw(p + 1);
761 __ClearPageReserved(p); 762 __ClearPageReserved(p);
762 set_page_count(p, 0); 763 set_page_count(p, 0);
763 } 764 }
765 __ClearPageReserved(p);
766 set_page_count(p, 0);
764 767
765 page_zone(page)->managed_pages += 1 << order; 768 page_zone(page)->managed_pages += nr_pages;
766 set_page_refcounted(page); 769 set_page_refcounted(page);
767 __free_pages(page, order); 770 __free_pages(page, order);
768} 771}
@@ -885,7 +888,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
885 int migratetype) 888 int migratetype)
886{ 889{
887 unsigned int current_order; 890 unsigned int current_order;
888 struct free_area * area; 891 struct free_area *area;
889 struct page *page; 892 struct page *page;
890 893
891 /* Find a page of the appropriate size in the preferred list */ 894 /* Find a page of the appropriate size in the preferred list */
@@ -1007,14 +1010,60 @@ static void change_pageblock_range(struct page *pageblock_page,
1007 } 1010 }
1008} 1011}
1009 1012
1013/*
1014 * If breaking a large block of pages, move all free pages to the preferred
1015 * allocation list. If falling back for a reclaimable kernel allocation, be
1016 * more aggressive about taking ownership of free pages.
1017 *
1018 * On the other hand, never change migration type of MIGRATE_CMA pageblocks
1019 * nor move CMA pages to different free lists. We don't want unmovable pages
1020 * to be allocated from MIGRATE_CMA areas.
1021 *
1022 * Returns the new migratetype of the pageblock (or the same old migratetype
1023 * if it was unchanged).
1024 */
1025static int try_to_steal_freepages(struct zone *zone, struct page *page,
1026 int start_type, int fallback_type)
1027{
1028 int current_order = page_order(page);
1029
1030 if (is_migrate_cma(fallback_type))
1031 return fallback_type;
1032
1033 /* Take ownership for orders >= pageblock_order */
1034 if (current_order >= pageblock_order) {
1035 change_pageblock_range(page, current_order, start_type);
1036 return start_type;
1037 }
1038
1039 if (current_order >= pageblock_order / 2 ||
1040 start_type == MIGRATE_RECLAIMABLE ||
1041 page_group_by_mobility_disabled) {
1042 int pages;
1043
1044 pages = move_freepages_block(zone, page, start_type);
1045
1046 /* Claim the whole block if over half of it is free */
1047 if (pages >= (1 << (pageblock_order-1)) ||
1048 page_group_by_mobility_disabled) {
1049
1050 set_pageblock_migratetype(page, start_type);
1051 return start_type;
1052 }
1053
1054 }
1055
1056 return fallback_type;
1057}
1058
1010/* Remove an element from the buddy allocator from the fallback list */ 1059/* Remove an element from the buddy allocator from the fallback list */
1011static inline struct page * 1060static inline struct page *
1012__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) 1061__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1013{ 1062{
1014 struct free_area * area; 1063 struct free_area *area;
1015 int current_order; 1064 int current_order;
1016 struct page *page; 1065 struct page *page;
1017 int migratetype, i; 1066 int migratetype, new_type, i;
1018 1067
1019 /* Find the largest possible block of pages in the other list */ 1068 /* Find the largest possible block of pages in the other list */
1020 for (current_order = MAX_ORDER-1; current_order >= order; 1069 for (current_order = MAX_ORDER-1; current_order >= order;
@@ -1034,51 +1083,29 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1034 struct page, lru); 1083 struct page, lru);
1035 area->nr_free--; 1084 area->nr_free--;
1036 1085
1037 /* 1086 new_type = try_to_steal_freepages(zone, page,
1038 * If breaking a large block of pages, move all free 1087 start_migratetype,
1039 * pages to the preferred allocation list. If falling 1088 migratetype);
1040 * back for a reclaimable kernel allocation, be more
1041 * aggressive about taking ownership of free pages
1042 *
1043 * On the other hand, never change migration
1044 * type of MIGRATE_CMA pageblocks nor move CMA
1045 * pages on different free lists. We don't
1046 * want unmovable pages to be allocated from
1047 * MIGRATE_CMA areas.
1048 */
1049 if (!is_migrate_cma(migratetype) &&
1050 (current_order >= pageblock_order / 2 ||
1051 start_migratetype == MIGRATE_RECLAIMABLE ||
1052 page_group_by_mobility_disabled)) {
1053 int pages;
1054 pages = move_freepages_block(zone, page,
1055 start_migratetype);
1056
1057 /* Claim the whole block if over half of it is free */
1058 if (pages >= (1 << (pageblock_order-1)) ||
1059 page_group_by_mobility_disabled)
1060 set_pageblock_migratetype(page,
1061 start_migratetype);
1062
1063 migratetype = start_migratetype;
1064 }
1065 1089
1066 /* Remove the page from the freelists */ 1090 /* Remove the page from the freelists */
1067 list_del(&page->lru); 1091 list_del(&page->lru);
1068 rmv_page_order(page); 1092 rmv_page_order(page);
1069 1093
1070 /* Take ownership for orders >= pageblock_order */ 1094 /*
1071 if (current_order >= pageblock_order && 1095 * Borrow the excess buddy pages as well, irrespective
1072 !is_migrate_cma(migratetype)) 1096 * of whether we stole freepages, or took ownership of
1073 change_pageblock_range(page, current_order, 1097 * the pageblock or not.
1074 start_migratetype); 1098 *
1075 1099 * Exception: When borrowing from MIGRATE_CMA, release
1100 * the excess buddy pages to CMA itself.
1101 */
1076 expand(zone, page, order, current_order, area, 1102 expand(zone, page, order, current_order, area,
1077 is_migrate_cma(migratetype) 1103 is_migrate_cma(migratetype)
1078 ? migratetype : start_migratetype); 1104 ? migratetype : start_migratetype);
1079 1105
1080 trace_mm_page_alloc_extfrag(page, order, current_order, 1106 trace_mm_page_alloc_extfrag(page, order,
1081 start_migratetype, migratetype); 1107 current_order, start_migratetype, migratetype,
1108 new_type == start_migratetype);
1082 1109
1083 return page; 1110 return page;
1084 } 1111 }
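The policy factored out into try_to_steal_freepages() above can be restated as a small pure decision function: never re-type CMA, always take whole blocks at or above pageblock_order, and otherwise only attempt to claim the pageblock for large-ish fallbacks, reclaimable allocations, or when mobility grouping is disabled. The real function additionally re-types the block only if move_freepages_block() finds over half of it free, which this sketch leaves out; the pageblock order of 9 below is a stand-in:

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_ORDER 9       /* stand-in for pageblock_order */

enum mt { MOVABLE, RECLAIMABLE, UNMOVABLE, CMA };

/* Decision half of try_to_steal_freepages(): should the allocator try to
 * claim the whole pageblock for start_type when falling back? */
static bool should_try_steal(int current_order, enum mt start_type,
                             enum mt fallback_type, bool mobility_disabled)
{
        if (fallback_type == CMA)               /* never re-type CMA blocks */
                return false;
        if (current_order >= PAGEBLOCK_ORDER)   /* whole block(s) anyway */
                return true;
        return current_order >= PAGEBLOCK_ORDER / 2 ||
               start_type == RECLAIMABLE ||
               mobility_disabled;
}

int main(void)
{
        printf("order 5, MOVABLE from UNMOVABLE: %d\n",
               should_try_steal(5, MOVABLE, UNMOVABLE, false));     /* 1 */
        printf("order 3, MOVABLE from UNMOVABLE: %d\n",
               should_try_steal(3, MOVABLE, UNMOVABLE, false));     /* 0 */
        printf("order 3, RECLAIMABLE fallback  : %d\n",
               should_try_steal(3, RECLAIMABLE, UNMOVABLE, false)); /* 1 */
        return 0;
}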
@@ -1281,7 +1308,7 @@ void mark_free_pages(struct zone *zone)
1281 int order, t; 1308 int order, t;
1282 struct list_head *curr; 1309 struct list_head *curr;
1283 1310
1284 if (!zone->spanned_pages) 1311 if (zone_is_empty(zone))
1285 return; 1312 return;
1286 1313
1287 spin_lock_irqsave(&zone->lock, flags); 1314 spin_lock_irqsave(&zone->lock, flags);
@@ -1526,6 +1553,7 @@ again:
1526 get_pageblock_migratetype(page)); 1553 get_pageblock_migratetype(page));
1527 } 1554 }
1528 1555
1556 __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
1529 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1557 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1530 zone_statistics(preferred_zone, zone, gfp_flags); 1558 zone_statistics(preferred_zone, zone, gfp_flags);
1531 local_irq_restore(flags); 1559 local_irq_restore(flags);
@@ -1792,6 +1820,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
1792 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1820 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1793} 1821}
1794 1822
1823static bool zone_local(struct zone *local_zone, struct zone *zone)
1824{
1825 return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE;
1826}
1827
1795static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1828static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1796{ 1829{
1797 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); 1830 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
@@ -1829,6 +1862,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
1829{ 1862{
1830} 1863}
1831 1864
1865static bool zone_local(struct zone *local_zone, struct zone *zone)
1866{
1867 return true;
1868}
1869
1832static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 1870static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1833{ 1871{
1834 return true; 1872 return true;
@@ -1860,16 +1898,41 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1860zonelist_scan: 1898zonelist_scan:
1861 /* 1899 /*
1862 * Scan zonelist, looking for a zone with enough free. 1900 * Scan zonelist, looking for a zone with enough free.
1863 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1901 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
1864 */ 1902 */
1865 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1903 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1866 high_zoneidx, nodemask) { 1904 high_zoneidx, nodemask) {
1905 unsigned long mark;
1906
1867 if (IS_ENABLED(CONFIG_NUMA) && zlc_active && 1907 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1868 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1908 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1869 continue; 1909 continue;
1870 if ((alloc_flags & ALLOC_CPUSET) && 1910 if ((alloc_flags & ALLOC_CPUSET) &&
1871 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1911 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1872 continue; 1912 continue;
1913 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1914 if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS))
1915 goto try_this_zone;
1916 /*
1917 * Distribute pages in proportion to the individual
1918 * zone size to ensure fair page aging. The zone a
1919 * page was allocated in should have no effect on the
1920 * time the page has in memory before being reclaimed.
1921 *
1922 * When zone_reclaim_mode is enabled, try to stay in
1923 * local zones in the fastpath. If that fails, the
1924 * slowpath is entered, which will do another pass
1925 * starting with the local zones, but ultimately fall
1926 * back to remote zones that do not partake in the
1927 * fairness round-robin cycle of this zonelist.
1928 */
1929 if (alloc_flags & ALLOC_WMARK_LOW) {
1930 if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
1931 continue;
1932 if (zone_reclaim_mode &&
1933 !zone_local(preferred_zone, zone))
1934 continue;
1935 }
1873 /* 1936 /*
1874 * When allocating a page cache page for writing, we 1937 * When allocating a page cache page for writing, we
1875 * want to get it from a zone that is within its dirty 1938 * want to get it from a zone that is within its dirty
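The fairness pass above can be modelled with a toy allocator: every zone carries an allocation batch of high_wmark - low_wmark pages (NR_ALLOC_BATCH), the fast path skips zones whose batch has run out, and the slow path (prepare_slowpath(), later in this patch) refills the local batches, so over time each zone serves allocations roughly in proportion to its size. A rough sketch with invented watermarks:

#include <stdio.h>

struct zone {
        const char *name;
        long high_wmark;
        long low_wmark;
        long alloc_batch;       /* NR_ALLOC_BATCH stand-in */
};

static void reset_batch(struct zone *z)         /* roughly prepare_slowpath() */
{
        z->alloc_batch = z->high_wmark - z->low_wmark;
}

/* Pick the first zone that still has batch left; NULL means the fast
 * path failed and the batches need refilling. */
static struct zone *pick_zone(struct zone *zones, int nr, int order)
{
        for (int i = 0; i < nr; i++)
                if (zones[i].alloc_batch > 0) {
                        zones[i].alloc_batch -= 1L << order;
                        return &zones[i];
                }
        return NULL;
}

int main(void)
{
        struct zone zones[] = {
                { "Normal", 3000, 2000, 0 },
                { "DMA32",  1500, 1000, 0 },
        };
        long counts[2] = { 0, 0 };

        for (int i = 0; i < 2; i++)
                reset_batch(&zones[i]);

        for (int n = 0; n < 2000; n++) {
                struct zone *z = pick_zone(zones, 2, 0);

                if (!z) {                       /* slow path: refill batches */
                        for (int i = 0; i < 2; i++)
                                reset_batch(&zones[i]);
                        z = pick_zone(zones, 2, 0);
                }
                counts[z - zones]++;
        }
        /* prints "Normal: 1500 pages, DMA32: 500 pages" with these numbers */
        printf("Normal: %ld pages, DMA32: %ld pages\n", counts[0], counts[1]);
        return 0;
}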
@@ -1900,16 +1963,11 @@ zonelist_scan:
1900 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) 1963 (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
1901 goto this_zone_full; 1964 goto this_zone_full;
1902 1965
1903 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); 1966 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1904 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1967 if (!zone_watermark_ok(zone, order, mark,
1905 unsigned long mark; 1968 classzone_idx, alloc_flags)) {
1906 int ret; 1969 int ret;
1907 1970
1908 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1909 if (zone_watermark_ok(zone, order, mark,
1910 classzone_idx, alloc_flags))
1911 goto try_this_zone;
1912
1913 if (IS_ENABLED(CONFIG_NUMA) && 1971 if (IS_ENABLED(CONFIG_NUMA) &&
1914 !did_zlc_setup && nr_online_nodes > 1) { 1972 !did_zlc_setup && nr_online_nodes > 1) {
1915 /* 1973 /*
@@ -2321,16 +2379,30 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
2321 return page; 2379 return page;
2322} 2380}
2323 2381
2324static inline 2382static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
2325void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 2383 struct zonelist *zonelist,
2326 enum zone_type high_zoneidx, 2384 enum zone_type high_zoneidx,
2327 enum zone_type classzone_idx) 2385 struct zone *preferred_zone)
2328{ 2386{
2329 struct zoneref *z; 2387 struct zoneref *z;
2330 struct zone *zone; 2388 struct zone *zone;
2331 2389
2332 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2390 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
2333 wakeup_kswapd(zone, order, classzone_idx); 2391 if (!(gfp_mask & __GFP_NO_KSWAPD))
2392 wakeup_kswapd(zone, order, zone_idx(preferred_zone));
2393 /*
2394 * Only reset the batches of zones that were actually
2395 * considered in the fast path, we don't want to
2396 * thrash fairness information for zones that are not
2397 * actually part of this zonelist's round-robin cycle.
2398 */
2399 if (zone_reclaim_mode && !zone_local(preferred_zone, zone))
2400 continue;
2401 mod_zone_page_state(zone, NR_ALLOC_BATCH,
2402 high_wmark_pages(zone) -
2403 low_wmark_pages(zone) -
2404 zone_page_state(zone, NR_ALLOC_BATCH));
2405 }
2334} 2406}
2335 2407
2336static inline int 2408static inline int
@@ -2426,9 +2498,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2426 goto nopage; 2498 goto nopage;
2427 2499
2428restart: 2500restart:
2429 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2501 prepare_slowpath(gfp_mask, order, zonelist,
2430 wake_all_kswapd(order, zonelist, high_zoneidx, 2502 high_zoneidx, preferred_zone);
2431 zone_idx(preferred_zone));
2432 2503
2433 /* 2504 /*
2434 * OK, we're below the kswapd watermark and have kicked background 2505 * OK, we're below the kswapd watermark and have kicked background
@@ -3095,7 +3166,7 @@ void show_free_areas(unsigned int filter)
3095 K(zone_page_state(zone, NR_FREE_CMA_PAGES)), 3166 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
3096 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 3167 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
3097 zone->pages_scanned, 3168 zone->pages_scanned,
3098 (zone->all_unreclaimable ? "yes" : "no") 3169 (!zone_reclaimable(zone) ? "yes" : "no")
3099 ); 3170 );
3100 printk("lowmem_reserve[]:"); 3171 printk("lowmem_reserve[]:");
3101 for (i = 0; i < MAX_NR_ZONES; i++) 3172 for (i = 0; i < MAX_NR_ZONES; i++)
@@ -3104,7 +3175,7 @@ void show_free_areas(unsigned int filter)
3104 } 3175 }
3105 3176
3106 for_each_populated_zone(zone) { 3177 for_each_populated_zone(zone) {
3107 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3178 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3108 unsigned char types[MAX_ORDER]; 3179 unsigned char types[MAX_ORDER];
3109 3180
3110 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3181 if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -3416,11 +3487,11 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
3416static int default_zonelist_order(void) 3487static int default_zonelist_order(void)
3417{ 3488{
3418 int nid, zone_type; 3489 int nid, zone_type;
3419 unsigned long low_kmem_size,total_size; 3490 unsigned long low_kmem_size, total_size;
3420 struct zone *z; 3491 struct zone *z;
3421 int average_size; 3492 int average_size;
3422 /* 3493 /*
3423 * ZONE_DMA and ZONE_DMA32 can be very small area in the system. 3494 * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
3424 * If they are really small and used heavily, the system can fall 3495 * If they are really small and used heavily, the system can fall
3425 * into OOM very easily. 3496 * into OOM very easily.
3426 * This function detect ZONE_DMA/DMA32 size and configures zone order. 3497 * This function detect ZONE_DMA/DMA32 size and configures zone order.
@@ -3452,9 +3523,9 @@ static int default_zonelist_order(void)
3452 return ZONELIST_ORDER_NODE; 3523 return ZONELIST_ORDER_NODE;
3453 /* 3524 /*
3454 * look into each node's config. 3525 * look into each node's config.
3455 * If there is a node whose DMA/DMA32 memory is very big area on 3526 * If there is a node whose DMA/DMA32 memory is very big area on
3456 * local memory, NODE_ORDER may be suitable. 3527 * local memory, NODE_ORDER may be suitable.
3457 */ 3528 */
3458 average_size = total_size / 3529 average_size = total_size /
3459 (nodes_weight(node_states[N_MEMORY]) + 1); 3530 (nodes_weight(node_states[N_MEMORY]) + 1);
3460 for_each_online_node(nid) { 3531 for_each_online_node(nid) {
@@ -4180,7 +4251,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
4180 if (!zone->wait_table) 4251 if (!zone->wait_table)
4181 return -ENOMEM; 4252 return -ENOMEM;
4182 4253
4183 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 4254 for (i = 0; i < zone->wait_table_hash_nr_entries; ++i)
4184 init_waitqueue_head(zone->wait_table + i); 4255 init_waitqueue_head(zone->wait_table + i);
4185 4256
4186 return 0; 4257 return 0;
@@ -4237,7 +4308,7 @@ int __meminit init_currently_empty_zone(struct zone *zone,
4237int __meminit __early_pfn_to_nid(unsigned long pfn) 4308int __meminit __early_pfn_to_nid(unsigned long pfn)
4238{ 4309{
4239 unsigned long start_pfn, end_pfn; 4310 unsigned long start_pfn, end_pfn;
4240 int i, nid; 4311 int nid;
4241 /* 4312 /*
4242 * NOTE: The following SMP-unsafe globals are only used early in boot 4313 * NOTE: The following SMP-unsafe globals are only used early in boot
4243 * when the kernel is running single-threaded. 4314 * when the kernel is running single-threaded.
@@ -4248,15 +4319,14 @@ int __meminit __early_pfn_to_nid(unsigned long pfn)
4248 if (last_start_pfn <= pfn && pfn < last_end_pfn) 4319 if (last_start_pfn <= pfn && pfn < last_end_pfn)
4249 return last_nid; 4320 return last_nid;
4250 4321
4251 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4322 nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
4252 if (start_pfn <= pfn && pfn < end_pfn) { 4323 if (nid != -1) {
4253 last_start_pfn = start_pfn; 4324 last_start_pfn = start_pfn;
4254 last_end_pfn = end_pfn; 4325 last_end_pfn = end_pfn;
4255 last_nid = nid; 4326 last_nid = nid;
4256 return nid; 4327 }
4257 } 4328
4258 /* This is a memory hole */ 4329 return nid;
4259 return -1;
4260} 4330}
4261#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 4331#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
4262 4332
@@ -4586,7 +4656,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
4586#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4656#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4587 4657
4588/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4658/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4589void __init set_pageblock_order(void) 4659void __paginginit set_pageblock_order(void)
4590{ 4660{
4591 unsigned int order; 4661 unsigned int order;
4592 4662
@@ -4614,7 +4684,7 @@ void __init set_pageblock_order(void)
4614 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4684 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4615 * the kernel config 4685 * the kernel config
4616 */ 4686 */
4617void __init set_pageblock_order(void) 4687void __paginginit set_pageblock_order(void)
4618{ 4688{
4619} 4689}
4620 4690
@@ -4728,8 +4798,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4728 spin_lock_init(&zone->lru_lock); 4798 spin_lock_init(&zone->lru_lock);
4729 zone_seqlock_init(zone); 4799 zone_seqlock_init(zone);
4730 zone->zone_pgdat = pgdat; 4800 zone->zone_pgdat = pgdat;
4731
4732 zone_pcp_init(zone); 4801 zone_pcp_init(zone);
4802
4803 /* For bootup, initialized properly in watermark setup */
4804 mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
4805
4733 lruvec_init(&zone->lruvec); 4806 lruvec_init(&zone->lruvec);
4734 if (!size) 4807 if (!size)
4735 continue; 4808 continue;
@@ -4930,7 +5003,7 @@ static unsigned long __init early_calculate_totalpages(void)
4930 if (pages) 5003 if (pages)
4931 node_set_state(nid, N_MEMORY); 5004 node_set_state(nid, N_MEMORY);
4932 } 5005 }
4933 return totalpages; 5006 return totalpages;
4934} 5007}
4935 5008
4936/* 5009/*
@@ -5047,7 +5120,7 @@ restart:
5047 /* 5120 /*
5048 * Some kernelcore has been met, update counts and 5121 * Some kernelcore has been met, update counts and
5049 * break if the kernelcore for this node has been 5122 * break if the kernelcore for this node has been
5050 * satisified 5123 * satisfied
5051 */ 5124 */
5052 required_kernelcore -= min(required_kernelcore, 5125 required_kernelcore -= min(required_kernelcore,
5053 size_pages); 5126 size_pages);
@@ -5061,7 +5134,7 @@ restart:
5061 * If there is still required_kernelcore, we do another pass with one 5134 * If there is still required_kernelcore, we do another pass with one
5062 * less node in the count. This will push zone_movable_pfn[nid] further 5135 * less node in the count. This will push zone_movable_pfn[nid] further
5063 * along on the nodes that still have memory until kernelcore is 5136 * along on the nodes that still have memory until kernelcore is
5064 * satisified 5137 * satisfied
5065 */ 5138 */
5066 usable_nodes--; 5139 usable_nodes--;
5067 if (usable_nodes && required_kernelcore > usable_nodes) 5140 if (usable_nodes && required_kernelcore > usable_nodes)
@@ -5286,8 +5359,10 @@ void __init mem_init_print_info(const char *str)
5286 * 3) .rodata.* may be embedded into .text or .data sections. 5359 * 3) .rodata.* may be embedded into .text or .data sections.
5287 */ 5360 */
5288#define adj_init_size(start, end, size, pos, adj) \ 5361#define adj_init_size(start, end, size, pos, adj) \
5289 if (start <= pos && pos < end && size > adj) \ 5362 do { \
5290 size -= adj; 5363 if (start <= pos && pos < end && size > adj) \
5364 size -= adj; \
5365 } while (0)
5291 5366
5292 adj_init_size(__init_begin, __init_end, init_data_size, 5367 adj_init_size(__init_begin, __init_end, init_data_size,
5293 _sinittext, init_code_size); 5368 _sinittext, init_code_size);
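The do { } while (0) wrapper added to adj_init_size() is the usual cure for macros whose body is a bare if: without it, an else following the macro invocation silently binds to the macro's own if. A minimal illustration with a generic stand-in macro (not the kernel one; the unsafe variant drops the trailing semicolon so the failure compiles at all):

#include <stdio.h>

/* Unsafe form: expands to a bare if, so a following else binds to it. */
#define ADJ_UNSAFE(size, adj)                   \
        if ((size) > (adj))                     \
                (size) -= (adj)

/* Safe form: the whole body is a single statement. */
#define ADJ_SAFE(size, adj)                     \
        do {                                    \
                if ((size) > (adj))             \
                        (size) -= (adj);        \
        } while (0)

int main(void)
{
        int a = 10, b = 10;

        if (0)
                ADJ_UNSAFE(a, 3);       /* the macro's if swallows this else... */
        else
                a = -1;                 /* ...so this never runs: a stays 10 */

        if (0)
                ADJ_SAFE(b, 3);
        else
                b = -1;                 /* runs as intended: b becomes -1 */

        printf("unsafe: %d, safe: %d\n", a, b);     /* unsafe: 10, safe: -1 */
        return 0;
}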
@@ -5361,7 +5436,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
5361 * This is only okay since the processor is dead and cannot 5436 * This is only okay since the processor is dead and cannot
5362 * race with what we are doing. 5437 * race with what we are doing.
5363 */ 5438 */
5364 refresh_cpu_vm_stats(cpu); 5439 cpu_vm_stats_fold(cpu);
5365 } 5440 }
5366 return NOTIFY_OK; 5441 return NOTIFY_OK;
5367} 5442}
@@ -5498,6 +5573,11 @@ static void __setup_per_zone_wmarks(void)
5498 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5573 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5499 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5574 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5500 5575
5576 __mod_zone_page_state(zone, NR_ALLOC_BATCH,
5577 high_wmark_pages(zone) -
5578 low_wmark_pages(zone) -
5579 zone_page_state(zone, NR_ALLOC_BATCH));
5580
5501 setup_zone_migrate_reserve(zone); 5581 setup_zone_migrate_reserve(zone);
5502 spin_unlock_irqrestore(&zone->lock, flags); 5582 spin_unlock_irqrestore(&zone->lock, flags);
5503 } 5583 }
@@ -5570,7 +5650,7 @@ static void __meminit setup_per_zone_inactive_ratio(void)
5570 * we want it large (64MB max). But it is not linear, because network 5650 * we want it large (64MB max). But it is not linear, because network
5571 * bandwidth does not increase linearly with machine size. We use 5651 * bandwidth does not increase linearly with machine size. We use
5572 * 5652 *
5573 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 5653 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
5574 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 5654 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
5575 * 5655 *
5576 * which yields 5656 * which yields
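As a concrete data point for the formula above: with 16 GiB of lowmem, lowmem_kbytes is 16,777,216 and sqrt(16,777,216 * 16) = 16,384, i.e. min_free_kbytes comes out at 16 MiB before the clamping the surrounding code applies. A quick check (link with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
        double lowmem_kbytes = 16.0 * 1024 * 1024;      /* 16 GiB of lowmem */
        double min_free = sqrt(lowmem_kbytes * 16);     /* = 4 * sqrt(lowmem_kbytes) */

        printf("min_free_kbytes ~= %.0f kB (~%.0f MiB)\n",
               min_free, min_free / 1024);
        return 0;
}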
@@ -5614,11 +5694,11 @@ int __meminit init_per_zone_wmark_min(void)
5614module_init(init_per_zone_wmark_min) 5694module_init(init_per_zone_wmark_min)
5615 5695
5616/* 5696/*
5617 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 5697 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
5618 * that we can call two helper functions whenever min_free_kbytes 5698 * that we can call two helper functions whenever min_free_kbytes
5619 * changes. 5699 * changes.
5620 */ 5700 */
5621int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 5701int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
5622 void __user *buffer, size_t *length, loff_t *ppos) 5702 void __user *buffer, size_t *length, loff_t *ppos)
5623{ 5703{
5624 proc_dointvec(table, write, buffer, length, ppos); 5704 proc_dointvec(table, write, buffer, length, ppos);
@@ -5682,8 +5762,8 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
5682 5762
5683/* 5763/*
5684 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 5764 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
5685 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist 5765 * cpu. It is the fraction of total pages in each zone that a hot per cpu
5686 * can have before it gets flushed back to buddy allocator. 5766 * pagelist can have before it gets flushed back to buddy allocator.
5687 */ 5767 */
5688int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 5768int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5689 void __user *buffer, size_t *length, loff_t *ppos) 5769 void __user *buffer, size_t *length, loff_t *ppos)
@@ -5745,9 +5825,10 @@ void *__init alloc_large_system_hash(const char *tablename,
5745 if (!numentries) { 5825 if (!numentries) {
5746 /* round applicable memory size up to nearest megabyte */ 5826 /* round applicable memory size up to nearest megabyte */
5747 numentries = nr_kernel_pages; 5827 numentries = nr_kernel_pages;
5748 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 5828
5749 numentries >>= 20 - PAGE_SHIFT; 5829 /* It isn't necessary when PAGE_SIZE >= 1MB */
5750 numentries <<= 20 - PAGE_SHIFT; 5830 if (PAGE_SHIFT < 20)
5831 numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
5751 5832
5752 /* limit to 1 bucket per 2^scale bytes of low memory */ 5833 /* limit to 1 bucket per 2^scale bytes of low memory */
5753 if (scale > PAGE_SHIFT) 5834 if (scale > PAGE_SHIFT)
@@ -5900,7 +5981,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5900 * This function checks whether pageblock includes unmovable pages or not. 5981 * This function checks whether pageblock includes unmovable pages or not.
5901 * If @count is not zero, it is okay to include less @count unmovable pages 5982 * If @count is not zero, it is okay to include less @count unmovable pages
5902 * 5983 *
5903 * PageLRU check wihtout isolation or lru_lock could race so that 5984 * PageLRU check without isolation or lru_lock could race so that
5904 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5985 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5905 * expect this function should be exact. 5986 * expect this function should be exact.
5906 */ 5987 */
@@ -5928,6 +6009,17 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
5928 continue; 6009 continue;
5929 6010
5930 page = pfn_to_page(check); 6011 page = pfn_to_page(check);
6012
6013 /*
6014 * Hugepages are not in LRU lists, but they're movable.
 6015	 * We need not scan over tail pages because we don't
6016 * handle each tail page individually in migration.
6017 */
6018 if (PageHuge(page)) {
6019 iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
6020 continue;
6021 }
6022
5931 /* 6023 /*
5932 * We can't use page_count without pin a page 6024 * We can't use page_count without pin a page
5933 * because another CPU can free compound page. 6025 * because another CPU can free compound page.
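The round_up() in the hugepage branch above simply jumps the scan to the hugepage's last tail page so the loop's own iter++ resumes right after it: for a 2 MiB hugepage (compound order 9, 512 pages) whose head is found at iter = 0, round_up(0 + 1, 512) - 1 = 511 and the next page examined is 512. The arithmetic, spelled out (round_up here is the plain divide-and-multiply form, not the kernel macro):

#include <stdio.h>

#define round_up(x, y)  ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
        unsigned long iter = 0;         /* head page of the hugepage */
        unsigned int order = 9;         /* 2 MiB / 4 KiB = 512 pages */

        iter = round_up(iter + 1, 1UL << order) - 1;
        printf("skip to %lu, next iteration scans %lu\n", iter, iter + 1);
        return 0;
}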
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 0cee10ffb98d..d1473b2e9481 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -6,6 +6,7 @@
6#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
7#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include <linux/hugetlb.h>
9#include "internal.h" 10#include "internal.h"
10 11
11int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) 12int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
@@ -252,6 +253,19 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
252{ 253{
253 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; 254 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
254 255
256 /*
257 * TODO: allocate a destination hugepage from a nearest neighbor node,
 258	 * in accordance with the memory policy of the user process if possible. For
259 * now as a simple work-around, we use the next node for destination.
260 */
261 if (PageHuge(page)) {
262 nodemask_t src = nodemask_of_node(page_to_nid(page));
263 nodemask_t dst;
264 nodes_complement(dst, src);
265 return alloc_huge_page_node(page_hstate(compound_head(page)),
266 next_node(page_to_nid(page), dst));
267 }
268
255 if (PageHighMem(page)) 269 if (PageHighMem(page))
256 gfp_mask |= __GFP_HIGHMEM; 270 gfp_mask |= __GFP_HIGHMEM;
257 271
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index e1a6e4fab016..3929a40bd6c0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -10,6 +10,30 @@
10#include <asm/tlb.h> 10#include <asm/tlb.h>
11#include <asm-generic/pgtable.h> 11#include <asm-generic/pgtable.h>
12 12
13/*
14 * If a p?d_bad entry is found while walking page tables, report
15 * the error, before resetting entry to p?d_none. Usually (but
16 * very seldom) called out from the p?d_none_or_clear_bad macros.
17 */
18
19void pgd_clear_bad(pgd_t *pgd)
20{
21 pgd_ERROR(*pgd);
22 pgd_clear(pgd);
23}
24
25void pud_clear_bad(pud_t *pud)
26{
27 pud_ERROR(*pud);
28 pud_clear(pud);
29}
30
31void pmd_clear_bad(pmd_t *pmd)
32{
33 pmd_ERROR(*pmd);
34 pmd_clear(pmd);
35}
36
13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS 37#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
14/* 38/*
15 * Only sets the access flags (dirty, accessed), as well as write 39 * Only sets the access flags (dirty, accessed), as well as write
diff --git a/mm/readahead.c b/mm/readahead.c
index 829a77c62834..e4ed04149785 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -371,10 +371,10 @@ static int try_context_readahead(struct address_space *mapping,
371 size = count_history_pages(mapping, ra, offset, max); 371 size = count_history_pages(mapping, ra, offset, max);
372 372
373 /* 373 /*
374 * no history pages: 374 * not enough history pages:
375 * it could be a random read 375 * it could be a random read
376 */ 376 */
377 if (!size) 377 if (size <= req_size)
378 return 0; 378 return 0;
379 379
380 /* 380 /*
@@ -385,8 +385,8 @@ static int try_context_readahead(struct address_space *mapping,
385 size *= 2; 385 size *= 2;
386 386
387 ra->start = offset; 387 ra->start = offset;
388 ra->size = get_init_ra_size(size + req_size, max); 388 ra->size = min(size + req_size, max);
389 ra->async_size = ra->size; 389 ra->async_size = 1;
390 390
391 return 1; 391 return 1;
392} 392}
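With this change the history check becomes size <= req_size instead of !size, and the readahead window is sized as min(size + req_size, max) with a one-page async tail. A toy version of just that sizing decision (it leaves out the whole-file-read doubling the real function also performs):

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/* Toy sizing decision after this patch; 'size' is the number of history
 * pages found in the page cache before the current offset. */
static unsigned long context_readahead_size(unsigned long size,
                                            unsigned long req_size,
                                            unsigned long max)
{
        if (size <= req_size)           /* not enough history: random read */
                return 0;
        return min_ul(size + req_size, max);    /* async_size would be 1 */
}

int main(void)
{
        printf("10 history, 16 requested -> %lu pages\n",
               context_readahead_size(10, 16, 64));     /* 0: treat as random */
        printf("40 history, 16 requested -> %lu pages\n",
               context_readahead_size(40, 16, 64));     /* 56 */
        return 0;
}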
diff --git a/mm/shmem.c b/mm/shmem.c
index 526149846d0a..8297623fcaed 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1205,7 +1205,7 @@ repeat:
1205 gfp & GFP_RECLAIM_MASK); 1205 gfp & GFP_RECLAIM_MASK);
1206 if (error) 1206 if (error)
1207 goto decused; 1207 goto decused;
1208 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); 1208 error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
1209 if (!error) { 1209 if (!error) {
1210 error = shmem_add_to_page_cache(page, mapping, index, 1210 error = shmem_add_to_page_cache(page, mapping, index,
1211 gfp, NULL); 1211 gfp, NULL);
@@ -2819,6 +2819,10 @@ int __init shmem_init(void)
2819{ 2819{
2820 int error; 2820 int error;
2821 2821
2822 /* If rootfs called this, don't re-init */
2823 if (shmem_inode_cachep)
2824 return 0;
2825
2822 error = bdi_init(&shmem_backing_dev_info); 2826 error = bdi_init(&shmem_backing_dev_info);
2823 if (error) 2827 if (error)
2824 goto out4; 2828 goto out4;
diff --git a/mm/slub.c b/mm/slub.c
index e3ba1f2cf60c..51df8272cfaf 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4420,7 +4420,7 @@ static ssize_t order_store(struct kmem_cache *s,
4420 unsigned long order; 4420 unsigned long order;
4421 int err; 4421 int err;
4422 4422
4423 err = strict_strtoul(buf, 10, &order); 4423 err = kstrtoul(buf, 10, &order);
4424 if (err) 4424 if (err)
4425 return err; 4425 return err;
4426 4426
@@ -4448,7 +4448,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
4448 unsigned long min; 4448 unsigned long min;
4449 int err; 4449 int err;
4450 4450
4451 err = strict_strtoul(buf, 10, &min); 4451 err = kstrtoul(buf, 10, &min);
4452 if (err) 4452 if (err)
4453 return err; 4453 return err;
4454 4454
@@ -4468,7 +4468,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4468 unsigned long objects; 4468 unsigned long objects;
4469 int err; 4469 int err;
4470 4470
4471 err = strict_strtoul(buf, 10, &objects); 4471 err = kstrtoul(buf, 10, &objects);
4472 if (err) 4472 if (err)
4473 return err; 4473 return err;
4474 if (objects && !kmem_cache_has_cpu_partial(s)) 4474 if (objects && !kmem_cache_has_cpu_partial(s))
@@ -4784,7 +4784,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
4784 unsigned long ratio; 4784 unsigned long ratio;
4785 int err; 4785 int err;
4786 4786
4787 err = strict_strtoul(buf, 10, &ratio); 4787 err = kstrtoul(buf, 10, &ratio);
4788 if (err) 4788 if (err)
4789 return err; 4789 return err;
4790 4790
diff --git a/mm/sparse.c b/mm/sparse.c
index 308d50331bc3..4ac1d7ef548f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -339,13 +339,14 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
339} 339}
340#endif /* CONFIG_MEMORY_HOTREMOVE */ 340#endif /* CONFIG_MEMORY_HOTREMOVE */
341 341
342static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, 342static void __init sparse_early_usemaps_alloc_node(void *data,
343 unsigned long pnum_begin, 343 unsigned long pnum_begin,
344 unsigned long pnum_end, 344 unsigned long pnum_end,
345 unsigned long usemap_count, int nodeid) 345 unsigned long usemap_count, int nodeid)
346{ 346{
347 void *usemap; 347 void *usemap;
348 unsigned long pnum; 348 unsigned long pnum;
349 unsigned long **usemap_map = (unsigned long **)data;
349 int size = usemap_size(); 350 int size = usemap_size();
350 351
351 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 352 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
@@ -430,11 +431,12 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
430#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 431#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
431 432
432#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 433#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
433static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, 434static void __init sparse_early_mem_maps_alloc_node(void *data,
434 unsigned long pnum_begin, 435 unsigned long pnum_begin,
435 unsigned long pnum_end, 436 unsigned long pnum_end,
436 unsigned long map_count, int nodeid) 437 unsigned long map_count, int nodeid)
437{ 438{
439 struct page **map_map = (struct page **)data;
438 sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, 440 sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end,
439 map_count, nodeid); 441 map_count, nodeid);
440} 442}
@@ -460,6 +462,55 @@ void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
460{ 462{
461} 463}
462 464
465/**
466 * alloc_usemap_and_memmap - memory allocation for pageblock flags and vmemmap
467 * @data: usemap_map for pageblock flags or map_map for vmemmap
468 */
469static void __init alloc_usemap_and_memmap(void (*alloc_func)
470 (void *, unsigned long, unsigned long,
471 unsigned long, int), void *data)
472{
473 unsigned long pnum;
474 unsigned long map_count;
475 int nodeid_begin = 0;
476 unsigned long pnum_begin = 0;
477
478 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
479 struct mem_section *ms;
480
481 if (!present_section_nr(pnum))
482 continue;
483 ms = __nr_to_section(pnum);
484 nodeid_begin = sparse_early_nid(ms);
485 pnum_begin = pnum;
486 break;
487 }
488 map_count = 1;
489 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
490 struct mem_section *ms;
491 int nodeid;
492
493 if (!present_section_nr(pnum))
494 continue;
495 ms = __nr_to_section(pnum);
496 nodeid = sparse_early_nid(ms);
497 if (nodeid == nodeid_begin) {
498 map_count++;
499 continue;
500 }
501 /* ok, we need to take care of pnum_begin to pnum - 1 */
502 alloc_func(data, pnum_begin, pnum,
503 map_count, nodeid_begin);
504 /* new start, update count etc*/
505 nodeid_begin = nodeid;
506 pnum_begin = pnum;
507 map_count = 1;
508 }
509 /* ok, last chunk */
510 alloc_func(data, pnum_begin, NR_MEM_SECTIONS,
511 map_count, nodeid_begin);
512}
513
463/* 514/*
464 * Allocate the accumulated non-linear sections, allocate a mem_map 515 * Allocate the accumulated non-linear sections, allocate a mem_map
465 * for each and record the physical to section mapping. 516 * for each and record the physical to section mapping.
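The new helper replaces two nearly identical loops in sparse_init(): walk the memory sections, batch up consecutive sections that live on the same node, and hand each batch to an allocation callback. The same grouping pattern, reduced to a runnable userspace sketch with made-up data (the present_section_nr() filtering is omitted for brevity):

        #include <stdio.h>

        /* Callback invoked once per run of items that share a node id. */
        static void alloc_for_node(void *data, unsigned long begin,
                                   unsigned long end, unsigned long count, int node)
        {
                printf("node %d: sections [%lu, %lu), count %lu\n",
                       node, begin, end, count);
        }

        int main(void)
        {
                int node_of[] = { 0, 0, 1, 1, 1, 2 };   /* per-section node id */
                unsigned long n = sizeof(node_of) / sizeof(node_of[0]);
                unsigned long begin = 0, count = 1;

                for (unsigned long i = 1; i < n; i++) {
                        if (node_of[i] == node_of[begin]) {
                                count++;
                                continue;
                        }
                        alloc_for_node(NULL, begin, i, count, node_of[begin]);
                        begin = i;              /* new run starts here */
                        count = 1;
                }
                alloc_for_node(NULL, begin, n, count, node_of[begin]); /* last chunk */
                return 0;
        }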
@@ -471,11 +522,7 @@ void __init sparse_init(void)
471 unsigned long *usemap; 522 unsigned long *usemap;
472 unsigned long **usemap_map; 523 unsigned long **usemap_map;
473 int size; 524 int size;
474 int nodeid_begin = 0;
475 unsigned long pnum_begin = 0;
476 unsigned long usemap_count;
477#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 525#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
478 unsigned long map_count;
479 int size2; 526 int size2;
480 struct page **map_map; 527 struct page **map_map;
481#endif 528#endif
@@ -501,82 +548,16 @@ void __init sparse_init(void)
501 usemap_map = alloc_bootmem(size); 548 usemap_map = alloc_bootmem(size);
502 if (!usemap_map) 549 if (!usemap_map)
503 panic("can not allocate usemap_map\n"); 550 panic("can not allocate usemap_map\n");
504 551 alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node,
505 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 552 (void *)usemap_map);
506 struct mem_section *ms;
507
508 if (!present_section_nr(pnum))
509 continue;
510 ms = __nr_to_section(pnum);
511 nodeid_begin = sparse_early_nid(ms);
512 pnum_begin = pnum;
513 break;
514 }
515 usemap_count = 1;
516 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
517 struct mem_section *ms;
518 int nodeid;
519
520 if (!present_section_nr(pnum))
521 continue;
522 ms = __nr_to_section(pnum);
523 nodeid = sparse_early_nid(ms);
524 if (nodeid == nodeid_begin) {
525 usemap_count++;
526 continue;
527 }
528 /* ok, we need to take cake of from pnum_begin to pnum - 1*/
529 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum,
530 usemap_count, nodeid_begin);
531 /* new start, update count etc*/
532 nodeid_begin = nodeid;
533 pnum_begin = pnum;
534 usemap_count = 1;
535 }
536 /* ok, last chunk */
537 sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS,
538 usemap_count, nodeid_begin);
539 553
540#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER 554#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER
541 size2 = sizeof(struct page *) * NR_MEM_SECTIONS; 555 size2 = sizeof(struct page *) * NR_MEM_SECTIONS;
542 map_map = alloc_bootmem(size2); 556 map_map = alloc_bootmem(size2);
543 if (!map_map) 557 if (!map_map)
544 panic("can not allocate map_map\n"); 558 panic("can not allocate map_map\n");
545 559 alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node,
546 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 560 (void *)map_map);
547 struct mem_section *ms;
548
549 if (!present_section_nr(pnum))
550 continue;
551 ms = __nr_to_section(pnum);
552 nodeid_begin = sparse_early_nid(ms);
553 pnum_begin = pnum;
554 break;
555 }
556 map_count = 1;
557 for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) {
558 struct mem_section *ms;
559 int nodeid;
560
561 if (!present_section_nr(pnum))
562 continue;
563 ms = __nr_to_section(pnum);
564 nodeid = sparse_early_nid(ms);
565 if (nodeid == nodeid_begin) {
566 map_count++;
567 continue;
568 }
569 /* ok, we need to take cake of from pnum_begin to pnum - 1*/
570 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum,
571 map_count, nodeid_begin);
572 /* new start, update count etc*/
573 nodeid_begin = nodeid;
574 pnum_begin = pnum;
575 map_count = 1;
576 }
577 /* ok, last chunk */
578 sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS,
579 map_count, nodeid_begin);
580#endif 561#endif
581 562
582 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 563 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
diff --git a/mm/swap.c b/mm/swap.c
index 62b78a6e224f..c899502d3e36 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,6 +31,7 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/uio.h> 33#include <linux/uio.h>
34#include <linux/hugetlb.h>
34 35
35#include "internal.h" 36#include "internal.h"
36 37
@@ -81,6 +82,19 @@ static void __put_compound_page(struct page *page)
81 82
82static void put_compound_page(struct page *page) 83static void put_compound_page(struct page *page)
83{ 84{
85 /*
86 * hugetlbfs pages cannot be split from under us. If this is a
87 * hugetlbfs page, check refcount on head page and release the page if
88 * the refcount becomes zero.
89 */
90 if (PageHuge(page)) {
91 page = compound_head(page);
92 if (put_page_testzero(page))
93 __put_compound_page(page);
94
95 return;
96 }
97
84 if (unlikely(PageTail(page))) { 98 if (unlikely(PageTail(page))) {
85 /* __split_huge_page_refcount can run under us */ 99 /* __split_huge_page_refcount can run under us */
86 struct page *page_head = compound_trans_head(page); 100 struct page *page_head = compound_trans_head(page);
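The new PageHuge() branch works because hugetlbfs pages are never split by __split_huge_page_refcount(), so a tail-page reference can simply be redirected to the head page without the compound-lock dance that THP tails require. For context, the caller that dispatches here looks roughly like the following in this era's mm/swap.c (paraphrased for orientation, not copied from the patch):

        void put_page(struct page *page)
        {
                if (unlikely(PageCompound(page)))
                        put_compound_page(page);        /* compound/huge pages */
                else if (put_page_testzero(page))
                        __put_single_page(page);        /* last ref on a normal page */
        }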
@@ -184,38 +198,51 @@ bool __get_page_tail(struct page *page)
184 * proper PT lock that already serializes against 198 * proper PT lock that already serializes against
185 * split_huge_page(). 199 * split_huge_page().
186 */ 200 */
187 unsigned long flags;
188 bool got = false; 201 bool got = false;
189 struct page *page_head = compound_trans_head(page); 202 struct page *page_head;
190 203
191 if (likely(page != page_head && get_page_unless_zero(page_head))) { 204 /*
205 * If this is a hugetlbfs page it cannot be split under us. Simply
206 * increment refcount for the head page.
207 */
208 if (PageHuge(page)) {
209 page_head = compound_head(page);
210 atomic_inc(&page_head->_count);
211 got = true;
212 } else {
213 unsigned long flags;
214
215 page_head = compound_trans_head(page);
216 if (likely(page != page_head &&
217 get_page_unless_zero(page_head))) {
218
219 /* Ref to put_compound_page() comment. */
220 if (PageSlab(page_head)) {
221 if (likely(PageTail(page))) {
222 __get_page_tail_foll(page, false);
223 return true;
224 } else {
225 put_page(page_head);
226 return false;
227 }
228 }
192 229
193 /* Ref to put_compound_page() comment. */ 230 /*
194 if (PageSlab(page_head)) { 231 * page_head wasn't a dangling pointer but it
232 * may not be a head page anymore by the time
233 * we obtain the lock. That is ok as long as it
234 * can't be freed from under us.
235 */
236 flags = compound_lock_irqsave(page_head);
237 /* here __split_huge_page_refcount won't run anymore */
195 if (likely(PageTail(page))) { 238 if (likely(PageTail(page))) {
196 __get_page_tail_foll(page, false); 239 __get_page_tail_foll(page, false);
197 return true; 240 got = true;
198 } else {
199 put_page(page_head);
200 return false;
201 } 241 }
242 compound_unlock_irqrestore(page_head, flags);
243 if (unlikely(!got))
244 put_page(page_head);
202 } 245 }
203
204 /*
205 * page_head wasn't a dangling pointer but it
206 * may not be a head page anymore by the time
207 * we obtain the lock. That is ok as long as it
208 * can't be freed from under us.
209 */
210 flags = compound_lock_irqsave(page_head);
211 /* here __split_huge_page_refcount won't run anymore */
212 if (likely(PageTail(page))) {
213 __get_page_tail_foll(page, false);
214 got = true;
215 }
216 compound_unlock_irqrestore(page_head, flags);
217 if (unlikely(!got))
218 put_page(page_head);
219 } 246 }
220 return got; 247 return got;
221} 248}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index f24ab0dff554..e6f15f8ca2af 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -122,7 +122,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
122{ 122{
123 int error; 123 int error;
124 124
125 error = radix_tree_preload(gfp_mask); 125 error = radix_tree_maybe_preload(gfp_mask);
126 if (!error) { 126 if (!error) {
127 error = __add_to_swap_cache(page, entry); 127 error = __add_to_swap_cache(page, entry);
128 radix_tree_preload_end(); 128 radix_tree_preload_end();
@@ -328,7 +328,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
328 /* 328 /*
329 * call radix_tree_preload() while we can wait. 329 * call radix_tree_preload() while we can wait.
330 */ 330 */
331 err = radix_tree_preload(gfp_mask & GFP_KERNEL); 331 err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
332 if (err) 332 if (err)
333 break; 333 break;
334 334
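radix_tree_maybe_preload() is introduced alongside these call sites to handle gfp masks that may not allow sleeping: it only performs the preload when blocking is permitted, and otherwise just disables preemption so the matching radix_tree_preload_end() stays balanced. The expected shape of the helper, inferred from its name and these call sites (the authoritative body lives in lib/radix-tree.c):

        int radix_tree_maybe_preload(gfp_t gfp_mask)
        {
                if (gfp_mask & __GFP_WAIT)      /* caller may sleep: really preload */
                        return radix_tree_preload(gfp_mask);
                /* atomic caller: skip preloading but keep preload_end() balanced */
                preempt_disable();
                return 0;
        }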
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6cf2e60983b7..3963fc24fcc1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -175,14 +175,296 @@ static void discard_swap_cluster(struct swap_info_struct *si,
175 } 175 }
176} 176}
177 177
178static int wait_for_discard(void *word) 178#define SWAPFILE_CLUSTER 256
179#define LATENCY_LIMIT 256
180
181static inline void cluster_set_flag(struct swap_cluster_info *info,
182 unsigned int flag)
179{ 183{
180 schedule(); 184 info->flags = flag;
181 return 0;
182} 185}
183 186
184#define SWAPFILE_CLUSTER 256 187static inline unsigned int cluster_count(struct swap_cluster_info *info)
185#define LATENCY_LIMIT 256 188{
189 return info->data;
190}
191
192static inline void cluster_set_count(struct swap_cluster_info *info,
193 unsigned int c)
194{
195 info->data = c;
196}
197
198static inline void cluster_set_count_flag(struct swap_cluster_info *info,
199 unsigned int c, unsigned int f)
200{
201 info->flags = f;
202 info->data = c;
203}
204
205static inline unsigned int cluster_next(struct swap_cluster_info *info)
206{
207 return info->data;
208}
209
210static inline void cluster_set_next(struct swap_cluster_info *info,
211 unsigned int n)
212{
213 info->data = n;
214}
215
216static inline void cluster_set_next_flag(struct swap_cluster_info *info,
217 unsigned int n, unsigned int f)
218{
219 info->flags = f;
220 info->data = n;
221}
222
223static inline bool cluster_is_free(struct swap_cluster_info *info)
224{
225 return info->flags & CLUSTER_FLAG_FREE;
226}
227
228static inline bool cluster_is_null(struct swap_cluster_info *info)
229{
230 return info->flags & CLUSTER_FLAG_NEXT_NULL;
231}
232
233static inline void cluster_set_null(struct swap_cluster_info *info)
234{
235 info->flags = CLUSTER_FLAG_NEXT_NULL;
236 info->data = 0;
237}
238
239/* Add a cluster to discard list and schedule it to do discard */
240static void swap_cluster_schedule_discard(struct swap_info_struct *si,
241 unsigned int idx)
242{
243 /*
244 * If scan_swap_map() can't find a free cluster, it will check
245 * si->swap_map directly. To make sure the cluster being discarded isn't
246 * taken by scan_swap_map(), mark the swap entries bad (occupied). They
247 * will be cleared after the discard completes.
248 */
249 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
250 SWAP_MAP_BAD, SWAPFILE_CLUSTER);
251
252 if (cluster_is_null(&si->discard_cluster_head)) {
253 cluster_set_next_flag(&si->discard_cluster_head,
254 idx, 0);
255 cluster_set_next_flag(&si->discard_cluster_tail,
256 idx, 0);
257 } else {
258 unsigned int tail = cluster_next(&si->discard_cluster_tail);
259 cluster_set_next(&si->cluster_info[tail], idx);
260 cluster_set_next_flag(&si->discard_cluster_tail,
261 idx, 0);
262 }
263
264 schedule_work(&si->discard_work);
265}
266
267/*
268 * Do the scheduled discards. After a cluster discard is finished, the cluster
269 * will be added to the free cluster list. The caller should hold si->lock.
270 */
271static void swap_do_scheduled_discard(struct swap_info_struct *si)
272{
273 struct swap_cluster_info *info;
274 unsigned int idx;
275
276 info = si->cluster_info;
277
278 while (!cluster_is_null(&si->discard_cluster_head)) {
279 idx = cluster_next(&si->discard_cluster_head);
280
281 cluster_set_next_flag(&si->discard_cluster_head,
282 cluster_next(&info[idx]), 0);
283 if (cluster_next(&si->discard_cluster_tail) == idx) {
284 cluster_set_null(&si->discard_cluster_head);
285 cluster_set_null(&si->discard_cluster_tail);
286 }
287 spin_unlock(&si->lock);
288
289 discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
290 SWAPFILE_CLUSTER);
291
292 spin_lock(&si->lock);
293 cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE);
294 if (cluster_is_null(&si->free_cluster_head)) {
295 cluster_set_next_flag(&si->free_cluster_head,
296 idx, 0);
297 cluster_set_next_flag(&si->free_cluster_tail,
298 idx, 0);
299 } else {
300 unsigned int tail;
301
302 tail = cluster_next(&si->free_cluster_tail);
303 cluster_set_next(&info[tail], idx);
304 cluster_set_next_flag(&si->free_cluster_tail,
305 idx, 0);
306 }
307 memset(si->swap_map + idx * SWAPFILE_CLUSTER,
308 0, SWAPFILE_CLUSTER);
309 }
310}
311
312static void swap_discard_work(struct work_struct *work)
313{
314 struct swap_info_struct *si;
315
316 si = container_of(work, struct swap_info_struct, discard_work);
317
318 spin_lock(&si->lock);
319 swap_do_scheduled_discard(si);
320 spin_unlock(&si->lock);
321}
322
323/*
324 * The cluster corresponding to page_nr will be used. The cluster will be
325 * removed from the free cluster list and its usage counter will be increased.
326 */
327static void inc_cluster_info_page(struct swap_info_struct *p,
328 struct swap_cluster_info *cluster_info, unsigned long page_nr)
329{
330 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
331
332 if (!cluster_info)
333 return;
334 if (cluster_is_free(&cluster_info[idx])) {
335 VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx);
336 cluster_set_next_flag(&p->free_cluster_head,
337 cluster_next(&cluster_info[idx]), 0);
338 if (cluster_next(&p->free_cluster_tail) == idx) {
339 cluster_set_null(&p->free_cluster_tail);
340 cluster_set_null(&p->free_cluster_head);
341 }
342 cluster_set_count_flag(&cluster_info[idx], 0, 0);
343 }
344
345 VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
346 cluster_set_count(&cluster_info[idx],
347 cluster_count(&cluster_info[idx]) + 1);
348}
349
350/*
351 * The cluster corresponding to page_nr decreases its usage count by one. If the
352 * usage counter becomes 0, which means no page in the cluster is in use, we can
353 * optionally discard the cluster and add it to the free cluster list.
354 */
355static void dec_cluster_info_page(struct swap_info_struct *p,
356 struct swap_cluster_info *cluster_info, unsigned long page_nr)
357{
358 unsigned long idx = page_nr / SWAPFILE_CLUSTER;
359
360 if (!cluster_info)
361 return;
362
363 VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
364 cluster_set_count(&cluster_info[idx],
365 cluster_count(&cluster_info[idx]) - 1);
366
367 if (cluster_count(&cluster_info[idx]) == 0) {
368 /*
369 * If the swap is discardable, prepare discard the cluster
370 * instead of free it immediately. The cluster will be freed
371 * after discard.
372 */
373 if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
374 (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
375 swap_cluster_schedule_discard(p, idx);
376 return;
377 }
378
379 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
380 if (cluster_is_null(&p->free_cluster_head)) {
381 cluster_set_next_flag(&p->free_cluster_head, idx, 0);
382 cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
383 } else {
384 unsigned int tail = cluster_next(&p->free_cluster_tail);
385 cluster_set_next(&cluster_info[tail], idx);
386 cluster_set_next_flag(&p->free_cluster_tail, idx, 0);
387 }
388 }
389}
390
391/*
392 * It's possible for scan_swap_map() to use a free cluster from the middle of the
393 * free cluster list. Detect that case and drop the per-cpu cluster to avoid list corruption.
394 */
395static bool
396scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
397 unsigned long offset)
398{
399 struct percpu_cluster *percpu_cluster;
400 bool conflict;
401
402 offset /= SWAPFILE_CLUSTER;
403 conflict = !cluster_is_null(&si->free_cluster_head) &&
404 offset != cluster_next(&si->free_cluster_head) &&
405 cluster_is_free(&si->cluster_info[offset]);
406
407 if (!conflict)
408 return false;
409
410 percpu_cluster = this_cpu_ptr(si->percpu_cluster);
411 cluster_set_null(&percpu_cluster->index);
412 return true;
413}
414
415/*
416 * Try to get a swap entry from the current cpu's swap entry pool (a cluster). This
417 * might involve allocating a new cluster for the current CPU too.
418 */
419static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
420 unsigned long *offset, unsigned long *scan_base)
421{
422 struct percpu_cluster *cluster;
423 bool found_free;
424 unsigned long tmp;
425
426new_cluster:
427 cluster = this_cpu_ptr(si->percpu_cluster);
428 if (cluster_is_null(&cluster->index)) {
429 if (!cluster_is_null(&si->free_cluster_head)) {
430 cluster->index = si->free_cluster_head;
431 cluster->next = cluster_next(&cluster->index) *
432 SWAPFILE_CLUSTER;
433 } else if (!cluster_is_null(&si->discard_cluster_head)) {
434 /*
435 * we don't have a free cluster, but some clusters are still
436 * being discarded; do the discard now and reclaim them
437 */
438 swap_do_scheduled_discard(si);
439 *scan_base = *offset = si->cluster_next;
440 goto new_cluster;
441 } else
442 return;
443 }
444
445 found_free = false;
446
447 /*
448 * Other CPUs can use our cluster if they can't find a free cluster,
449 * so check if there is still a free entry in the cluster
450 */
451 tmp = cluster->next;
452 while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) *
453 SWAPFILE_CLUSTER) {
454 if (!si->swap_map[tmp]) {
455 found_free = true;
456 break;
457 }
458 tmp++;
459 }
460 if (!found_free) {
461 cluster_set_null(&cluster->index);
462 goto new_cluster;
463 }
464 cluster->next = tmp + 1;
465 *offset = tmp;
466 *scan_base = tmp;
467}
186 468
187static unsigned long scan_swap_map(struct swap_info_struct *si, 469static unsigned long scan_swap_map(struct swap_info_struct *si,
188 unsigned char usage) 470 unsigned char usage)
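All of the cluster_*() accessors above multiplex one small per-cluster descriptor: data carries either the number of in-use slots in the cluster or, while the cluster sits on the free/discard list, the index of the next cluster; flags records which interpretation applies. The layout this assumes (from include/linux/swap.h in the same series, shown here as a sketch rather than quoted):

        struct swap_cluster_info {
                unsigned int data:24;   /* in-use count, or next free/discard index */
                unsigned int flags:8;   /* CLUSTER_FLAG_* below */
        };
        #define CLUSTER_FLAG_FREE       1  /* cluster is on the free list */
        #define CLUSTER_FLAG_NEXT_NULL  2  /* head/tail marker: no next cluster */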
@@ -191,7 +473,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
191 unsigned long scan_base; 473 unsigned long scan_base;
192 unsigned long last_in_cluster = 0; 474 unsigned long last_in_cluster = 0;
193 int latency_ration = LATENCY_LIMIT; 475 int latency_ration = LATENCY_LIMIT;
194 int found_free_cluster = 0;
195 476
196 /* 477 /*
197 * We try to cluster swap pages by allocating them sequentially 478 * We try to cluster swap pages by allocating them sequentially
@@ -207,24 +488,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
207 si->flags += SWP_SCANNING; 488 si->flags += SWP_SCANNING;
208 scan_base = offset = si->cluster_next; 489 scan_base = offset = si->cluster_next;
209 490
491 /* SSD algorithm */
492 if (si->cluster_info) {
493 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
494 goto checks;
495 }
496
210 if (unlikely(!si->cluster_nr--)) { 497 if (unlikely(!si->cluster_nr--)) {
211 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { 498 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
212 si->cluster_nr = SWAPFILE_CLUSTER - 1; 499 si->cluster_nr = SWAPFILE_CLUSTER - 1;
213 goto checks; 500 goto checks;
214 } 501 }
215 if (si->flags & SWP_PAGE_DISCARD) { 502
216 /*
217 * Start range check on racing allocations, in case
218 * they overlap the cluster we eventually decide on
219 * (we scan without swap_lock to allow preemption).
220 * It's hardly conceivable that cluster_nr could be
221 * wrapped during our scan, but don't depend on it.
222 */
223 if (si->lowest_alloc)
224 goto checks;
225 si->lowest_alloc = si->max;
226 si->highest_alloc = 0;
227 }
228 spin_unlock(&si->lock); 503 spin_unlock(&si->lock);
229 504
230 /* 505 /*
@@ -248,7 +523,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
248 offset -= SWAPFILE_CLUSTER - 1; 523 offset -= SWAPFILE_CLUSTER - 1;
249 si->cluster_next = offset; 524 si->cluster_next = offset;
250 si->cluster_nr = SWAPFILE_CLUSTER - 1; 525 si->cluster_nr = SWAPFILE_CLUSTER - 1;
251 found_free_cluster = 1;
252 goto checks; 526 goto checks;
253 } 527 }
254 if (unlikely(--latency_ration < 0)) { 528 if (unlikely(--latency_ration < 0)) {
@@ -269,7 +543,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
269 offset -= SWAPFILE_CLUSTER - 1; 543 offset -= SWAPFILE_CLUSTER - 1;
270 si->cluster_next = offset; 544 si->cluster_next = offset;
271 si->cluster_nr = SWAPFILE_CLUSTER - 1; 545 si->cluster_nr = SWAPFILE_CLUSTER - 1;
272 found_free_cluster = 1;
273 goto checks; 546 goto checks;
274 } 547 }
275 if (unlikely(--latency_ration < 0)) { 548 if (unlikely(--latency_ration < 0)) {
@@ -281,10 +554,13 @@ static unsigned long scan_swap_map(struct swap_info_struct *si,
281 offset = scan_base; 554 offset = scan_base;
282 spin_lock(&si->lock); 555 spin_lock(&si->lock);
283 si->cluster_nr = SWAPFILE_CLUSTER - 1; 556 si->cluster_nr = SWAPFILE_CLUSTER - 1;
284 si->lowest_alloc = 0;
285 } 557 }
286 558
287checks: 559checks:
560 if (si->cluster_info) {
561 while (scan_swap_map_ssd_cluster_conflict(si, offset))
562 scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
563 }
288 if (!(si->flags & SWP_WRITEOK)) 564 if (!(si->flags & SWP_WRITEOK))
289 goto no_page; 565 goto no_page;
290 if (!si->highest_bit) 566 if (!si->highest_bit)
@@ -317,62 +593,10 @@ checks:
317 si->highest_bit = 0; 593 si->highest_bit = 0;
318 } 594 }
319 si->swap_map[offset] = usage; 595 si->swap_map[offset] = usage;
596 inc_cluster_info_page(si, si->cluster_info, offset);
320 si->cluster_next = offset + 1; 597 si->cluster_next = offset + 1;
321 si->flags -= SWP_SCANNING; 598 si->flags -= SWP_SCANNING;
322 599
323 if (si->lowest_alloc) {
324 /*
325 * Only set when SWP_PAGE_DISCARD, and there's a scan
326 * for a free cluster in progress or just completed.
327 */
328 if (found_free_cluster) {
329 /*
330 * To optimize wear-levelling, discard the
331 * old data of the cluster, taking care not to
332 * discard any of its pages that have already
333 * been allocated by racing tasks (offset has
334 * already stepped over any at the beginning).
335 */
336 if (offset < si->highest_alloc &&
337 si->lowest_alloc <= last_in_cluster)
338 last_in_cluster = si->lowest_alloc - 1;
339 si->flags |= SWP_DISCARDING;
340 spin_unlock(&si->lock);
341
342 if (offset < last_in_cluster)
343 discard_swap_cluster(si, offset,
344 last_in_cluster - offset + 1);
345
346 spin_lock(&si->lock);
347 si->lowest_alloc = 0;
348 si->flags &= ~SWP_DISCARDING;
349
350 smp_mb(); /* wake_up_bit advises this */
351 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
352
353 } else if (si->flags & SWP_DISCARDING) {
354 /*
355 * Delay using pages allocated by racing tasks
356 * until the whole discard has been issued. We
357 * could defer that delay until swap_writepage,
358 * but it's easier to keep this self-contained.
359 */
360 spin_unlock(&si->lock);
361 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
362 wait_for_discard, TASK_UNINTERRUPTIBLE);
363 spin_lock(&si->lock);
364 } else {
365 /*
366 * Note pages allocated by racing tasks while
367 * scan for a free cluster is in progress, so
368 * that its final discard can exclude them.
369 */
370 if (offset < si->lowest_alloc)
371 si->lowest_alloc = offset;
372 if (offset > si->highest_alloc)
373 si->highest_alloc = offset;
374 }
375 }
376 return offset; 600 return offset;
377 601
378scan: 602scan:
@@ -527,16 +751,16 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry)
527 return p; 751 return p;
528 752
529bad_free: 753bad_free:
530 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); 754 pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val);
531 goto out; 755 goto out;
532bad_offset: 756bad_offset:
533 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); 757 pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val);
534 goto out; 758 goto out;
535bad_device: 759bad_device:
536 printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); 760 pr_err("swap_free: %s%08lx\n", Unused_file, entry.val);
537 goto out; 761 goto out;
538bad_nofile: 762bad_nofile:
539 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 763 pr_err("swap_free: %s%08lx\n", Bad_file, entry.val);
540out: 764out:
541 return NULL; 765 return NULL;
542} 766}
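The swap_info_get() error paths switch from open-coded printk(KERN_ERR ...) to pr_err(), which is only a convenience wrapper; approximately:

        /* From include/linux/printk.h (approximate; pr_fmt() defaults to empty). */
        #define pr_err(fmt, ...)  printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
        #define pr_warn(fmt, ...) printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)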
@@ -600,6 +824,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
600 824
601 /* free if no reference */ 825 /* free if no reference */
602 if (!usage) { 826 if (!usage) {
827 dec_cluster_info_page(p, p->cluster_info, offset);
603 if (offset < p->lowest_bit) 828 if (offset < p->lowest_bit)
604 p->lowest_bit = offset; 829 p->lowest_bit = offset;
605 if (offset > p->highest_bit) 830 if (offset > p->highest_bit)
@@ -1107,7 +1332,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1107 else 1332 else
1108 continue; 1333 continue;
1109 } 1334 }
1110 count = si->swap_map[i]; 1335 count = ACCESS_ONCE(si->swap_map[i]);
1111 if (count && swap_count(count) != SWAP_MAP_BAD) 1336 if (count && swap_count(count) != SWAP_MAP_BAD)
1112 break; 1337 break;
1113 } 1338 }
@@ -1127,7 +1352,11 @@ int try_to_unuse(unsigned int type, bool frontswap,
1127{ 1352{
1128 struct swap_info_struct *si = swap_info[type]; 1353 struct swap_info_struct *si = swap_info[type];
1129 struct mm_struct *start_mm; 1354 struct mm_struct *start_mm;
1130 unsigned char *swap_map; 1355 volatile unsigned char *swap_map; /* swap_map is accessed without
1356 * locking. Mark it as volatile
1357 * to prevent compiler doing
1358 * something odd.
1359 */
1131 unsigned char swcount; 1360 unsigned char swcount;
1132 struct page *page; 1361 struct page *page;
1133 swp_entry_t entry; 1362 swp_entry_t entry;
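Both hunks above harden lockless readers of swap_map[]: find_next_to_unuse() reads each count through ACCESS_ONCE() and try_to_unuse() types its local pointer volatile, so the compiler cannot cache or silently re-read the value around the checks. For reference, ACCESS_ONCE() is just a volatile cast:

        /* include/linux/compiler.h in this era */
        #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))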
@@ -1178,7 +1407,15 @@ int try_to_unuse(unsigned int type, bool frontswap,
1178 * reused since sys_swapoff() already disabled 1407 * reused since sys_swapoff() already disabled
1179 * allocation from here, or alloc_page() failed. 1408 * allocation from here, or alloc_page() failed.
1180 */ 1409 */
1181 if (!*swap_map) 1410 swcount = *swap_map;
1411 /*
1412 * We don't hold the lock here, so the swap entry could be
1413 * SWAP_MAP_BAD (when the cluster is being discarded).
1414 * Instead of failing out, we can just skip the swap
1415 * entry because swapoff will wait for the discard to
1416 * finish anyway.
1417 */
1418 if (!swcount || swcount == SWAP_MAP_BAD)
1182 continue; 1419 continue;
1183 retval = -ENOMEM; 1420 retval = -ENOMEM;
1184 break; 1421 break;
@@ -1524,7 +1761,8 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1524} 1761}
1525 1762
1526static void _enable_swap_info(struct swap_info_struct *p, int prio, 1763static void _enable_swap_info(struct swap_info_struct *p, int prio,
1527 unsigned char *swap_map) 1764 unsigned char *swap_map,
1765 struct swap_cluster_info *cluster_info)
1528{ 1766{
1529 int i, prev; 1767 int i, prev;
1530 1768
@@ -1533,6 +1771,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1533 else 1771 else
1534 p->prio = --least_priority; 1772 p->prio = --least_priority;
1535 p->swap_map = swap_map; 1773 p->swap_map = swap_map;
1774 p->cluster_info = cluster_info;
1536 p->flags |= SWP_WRITEOK; 1775 p->flags |= SWP_WRITEOK;
1537 atomic_long_add(p->pages, &nr_swap_pages); 1776 atomic_long_add(p->pages, &nr_swap_pages);
1538 total_swap_pages += p->pages; 1777 total_swap_pages += p->pages;
@@ -1553,12 +1792,13 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
1553 1792
1554static void enable_swap_info(struct swap_info_struct *p, int prio, 1793static void enable_swap_info(struct swap_info_struct *p, int prio,
1555 unsigned char *swap_map, 1794 unsigned char *swap_map,
1795 struct swap_cluster_info *cluster_info,
1556 unsigned long *frontswap_map) 1796 unsigned long *frontswap_map)
1557{ 1797{
1558 frontswap_init(p->type, frontswap_map); 1798 frontswap_init(p->type, frontswap_map);
1559 spin_lock(&swap_lock); 1799 spin_lock(&swap_lock);
1560 spin_lock(&p->lock); 1800 spin_lock(&p->lock);
1561 _enable_swap_info(p, prio, swap_map); 1801 _enable_swap_info(p, prio, swap_map, cluster_info);
1562 spin_unlock(&p->lock); 1802 spin_unlock(&p->lock);
1563 spin_unlock(&swap_lock); 1803 spin_unlock(&swap_lock);
1564} 1804}
@@ -1567,7 +1807,7 @@ static void reinsert_swap_info(struct swap_info_struct *p)
1567{ 1807{
1568 spin_lock(&swap_lock); 1808 spin_lock(&swap_lock);
1569 spin_lock(&p->lock); 1809 spin_lock(&p->lock);
1570 _enable_swap_info(p, p->prio, p->swap_map); 1810 _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
1571 spin_unlock(&p->lock); 1811 spin_unlock(&p->lock);
1572 spin_unlock(&swap_lock); 1812 spin_unlock(&swap_lock);
1573} 1813}
@@ -1576,6 +1816,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1576{ 1816{
1577 struct swap_info_struct *p = NULL; 1817 struct swap_info_struct *p = NULL;
1578 unsigned char *swap_map; 1818 unsigned char *swap_map;
1819 struct swap_cluster_info *cluster_info;
1579 unsigned long *frontswap_map; 1820 unsigned long *frontswap_map;
1580 struct file *swap_file, *victim; 1821 struct file *swap_file, *victim;
1581 struct address_space *mapping; 1822 struct address_space *mapping;
@@ -1651,6 +1892,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1651 goto out_dput; 1892 goto out_dput;
1652 } 1893 }
1653 1894
1895 flush_work(&p->discard_work);
1896
1654 destroy_swap_extents(p); 1897 destroy_swap_extents(p);
1655 if (p->flags & SWP_CONTINUED) 1898 if (p->flags & SWP_CONTINUED)
1656 free_swap_count_continuations(p); 1899 free_swap_count_continuations(p);
@@ -1675,6 +1918,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1675 p->max = 0; 1918 p->max = 0;
1676 swap_map = p->swap_map; 1919 swap_map = p->swap_map;
1677 p->swap_map = NULL; 1920 p->swap_map = NULL;
1921 cluster_info = p->cluster_info;
1922 p->cluster_info = NULL;
1678 p->flags = 0; 1923 p->flags = 0;
1679 frontswap_map = frontswap_map_get(p); 1924 frontswap_map = frontswap_map_get(p);
1680 frontswap_map_set(p, NULL); 1925 frontswap_map_set(p, NULL);
@@ -1682,7 +1927,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1682 spin_unlock(&swap_lock); 1927 spin_unlock(&swap_lock);
1683 frontswap_invalidate_area(type); 1928 frontswap_invalidate_area(type);
1684 mutex_unlock(&swapon_mutex); 1929 mutex_unlock(&swapon_mutex);
1930 free_percpu(p->percpu_cluster);
1931 p->percpu_cluster = NULL;
1685 vfree(swap_map); 1932 vfree(swap_map);
1933 vfree(cluster_info);
1686 vfree(frontswap_map); 1934 vfree(frontswap_map);
1687 /* Destroy swap account informatin */ 1935 /* Destroy swap account informatin */
1688 swap_cgroup_swapoff(type); 1936 swap_cgroup_swapoff(type);
@@ -1926,9 +2174,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1926 int i; 2174 int i;
1927 unsigned long maxpages; 2175 unsigned long maxpages;
1928 unsigned long swapfilepages; 2176 unsigned long swapfilepages;
2177 unsigned long last_page;
1929 2178
1930 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 2179 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1931 printk(KERN_ERR "Unable to find swap-space signature\n"); 2180 pr_err("Unable to find swap-space signature\n");
1932 return 0; 2181 return 0;
1933 } 2182 }
1934 2183
@@ -1942,9 +2191,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1942 } 2191 }
1943 /* Check the swap header's sub-version */ 2192 /* Check the swap header's sub-version */
1944 if (swap_header->info.version != 1) { 2193 if (swap_header->info.version != 1) {
1945 printk(KERN_WARNING 2194 pr_warn("Unable to handle swap header version %d\n",
1946 "Unable to handle swap header version %d\n", 2195 swap_header->info.version);
1947 swap_header->info.version);
1948 return 0; 2196 return 0;
1949 } 2197 }
1950 2198
@@ -1968,8 +2216,14 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1968 */ 2216 */
1969 maxpages = swp_offset(pte_to_swp_entry( 2217 maxpages = swp_offset(pte_to_swp_entry(
1970 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 2218 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1971 if (maxpages > swap_header->info.last_page) { 2219 last_page = swap_header->info.last_page;
1972 maxpages = swap_header->info.last_page + 1; 2220 if (last_page > maxpages) {
2221 pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
2222 maxpages << (PAGE_SHIFT - 10),
2223 last_page << (PAGE_SHIFT - 10));
2224 }
2225 if (maxpages > last_page) {
2226 maxpages = last_page + 1;
1973 /* p->max is an unsigned int: don't overflow it */ 2227 /* p->max is an unsigned int: don't overflow it */
1974 if ((unsigned int)maxpages == 0) 2228 if ((unsigned int)maxpages == 0)
1975 maxpages = UINT_MAX; 2229 maxpages = UINT_MAX;
@@ -1980,8 +2234,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1980 return 0; 2234 return 0;
1981 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 2235 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1982 if (swapfilepages && maxpages > swapfilepages) { 2236 if (swapfilepages && maxpages > swapfilepages) {
1983 printk(KERN_WARNING 2237 pr_warn("Swap area shorter than signature indicates\n");
1984 "Swap area shorter than signature indicates\n");
1985 return 0; 2238 return 0;
1986 } 2239 }
1987 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 2240 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
@@ -1995,15 +2248,23 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1995static int setup_swap_map_and_extents(struct swap_info_struct *p, 2248static int setup_swap_map_and_extents(struct swap_info_struct *p,
1996 union swap_header *swap_header, 2249 union swap_header *swap_header,
1997 unsigned char *swap_map, 2250 unsigned char *swap_map,
2251 struct swap_cluster_info *cluster_info,
1998 unsigned long maxpages, 2252 unsigned long maxpages,
1999 sector_t *span) 2253 sector_t *span)
2000{ 2254{
2001 int i; 2255 int i;
2002 unsigned int nr_good_pages; 2256 unsigned int nr_good_pages;
2003 int nr_extents; 2257 int nr_extents;
2258 unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
2259 unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER;
2004 2260
2005 nr_good_pages = maxpages - 1; /* omit header page */ 2261 nr_good_pages = maxpages - 1; /* omit header page */
2006 2262
2263 cluster_set_null(&p->free_cluster_head);
2264 cluster_set_null(&p->free_cluster_tail);
2265 cluster_set_null(&p->discard_cluster_head);
2266 cluster_set_null(&p->discard_cluster_tail);
2267
2007 for (i = 0; i < swap_header->info.nr_badpages; i++) { 2268 for (i = 0; i < swap_header->info.nr_badpages; i++) {
2008 unsigned int page_nr = swap_header->info.badpages[i]; 2269 unsigned int page_nr = swap_header->info.badpages[i];
2009 if (page_nr == 0 || page_nr > swap_header->info.last_page) 2270 if (page_nr == 0 || page_nr > swap_header->info.last_page)
@@ -2011,11 +2272,25 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
2011 if (page_nr < maxpages) { 2272 if (page_nr < maxpages) {
2012 swap_map[page_nr] = SWAP_MAP_BAD; 2273 swap_map[page_nr] = SWAP_MAP_BAD;
2013 nr_good_pages--; 2274 nr_good_pages--;
2275 /*
2276 * Haven't marked the cluster free yet, no list
2277 * operation involved
2278 */
2279 inc_cluster_info_page(p, cluster_info, page_nr);
2014 } 2280 }
2015 } 2281 }
2016 2282
2283 /* Haven't marked the cluster free yet, no list operation involved */
2284 for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
2285 inc_cluster_info_page(p, cluster_info, i);
2286
2017 if (nr_good_pages) { 2287 if (nr_good_pages) {
2018 swap_map[0] = SWAP_MAP_BAD; 2288 swap_map[0] = SWAP_MAP_BAD;
2289 /*
2290 * Haven't marked the cluster free yet, no list
2291 * operation involved
2292 */
2293 inc_cluster_info_page(p, cluster_info, 0);
2019 p->max = maxpages; 2294 p->max = maxpages;
2020 p->pages = nr_good_pages; 2295 p->pages = nr_good_pages;
2021 nr_extents = setup_swap_extents(p, span); 2296 nr_extents = setup_swap_extents(p, span);
@@ -2024,10 +2299,34 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
2024 nr_good_pages = p->pages; 2299 nr_good_pages = p->pages;
2025 } 2300 }
2026 if (!nr_good_pages) { 2301 if (!nr_good_pages) {
2027 printk(KERN_WARNING "Empty swap-file\n"); 2302 pr_warn("Empty swap-file\n");
2028 return -EINVAL; 2303 return -EINVAL;
2029 } 2304 }
2030 2305
2306 if (!cluster_info)
2307 return nr_extents;
2308
2309 for (i = 0; i < nr_clusters; i++) {
2310 if (!cluster_count(&cluster_info[idx])) {
2311 cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
2312 if (cluster_is_null(&p->free_cluster_head)) {
2313 cluster_set_next_flag(&p->free_cluster_head,
2314 idx, 0);
2315 cluster_set_next_flag(&p->free_cluster_tail,
2316 idx, 0);
2317 } else {
2318 unsigned int tail;
2319
2320 tail = cluster_next(&p->free_cluster_tail);
2321 cluster_set_next(&cluster_info[tail], idx);
2322 cluster_set_next_flag(&p->free_cluster_tail,
2323 idx, 0);
2324 }
2325 }
2326 idx++;
2327 if (idx == nr_clusters)
2328 idx = 0;
2329 }
2031 return nr_extents; 2330 return nr_extents;
2032} 2331}
2033 2332
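The loop above threads every still-empty cluster onto the free list, starting at the cluster that contains cluster_next so allocation resumes near the randomly chosen start and wraps around. The underlying idea of a linked list stored as indices into an array, as a runnable userspace sketch with made-up counts:

        #include <stdio.h>

        #define NCLUSTERS 8
        #define NIL       -1

        int main(void)
        {
                int count[NCLUSTERS] = { 0, 3, 0, 0, 2, 0, 0, 0 }; /* in-use counts */
                int next[NCLUSTERS];                               /* "pointer" = index */
                int head = NIL, tail = NIL;

                for (int idx = 0; idx < NCLUSTERS; idx++) {
                        if (count[idx])                 /* cluster has used slots: skip */
                                continue;
                        next[idx] = NIL;
                        if (head == NIL) {
                                head = tail = idx;      /* first free cluster */
                        } else {
                                next[tail] = idx;       /* append at the tail */
                                tail = idx;
                        }
                }
                for (int i = head; i != NIL; i = next[i])
                        printf("free cluster %d\n", i);
                return 0;
        }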
@@ -2059,6 +2358,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2059 sector_t span; 2358 sector_t span;
2060 unsigned long maxpages; 2359 unsigned long maxpages;
2061 unsigned char *swap_map = NULL; 2360 unsigned char *swap_map = NULL;
2361 struct swap_cluster_info *cluster_info = NULL;
2062 unsigned long *frontswap_map = NULL; 2362 unsigned long *frontswap_map = NULL;
2063 struct page *page = NULL; 2363 struct page *page = NULL;
2064 struct inode *inode = NULL; 2364 struct inode *inode = NULL;
@@ -2073,6 +2373,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2073 if (IS_ERR(p)) 2373 if (IS_ERR(p))
2074 return PTR_ERR(p); 2374 return PTR_ERR(p);
2075 2375
2376 INIT_WORK(&p->discard_work, swap_discard_work);
2377
2076 name = getname(specialfile); 2378 name = getname(specialfile);
2077 if (IS_ERR(name)) { 2379 if (IS_ERR(name)) {
2078 error = PTR_ERR(name); 2380 error = PTR_ERR(name);
@@ -2132,13 +2434,38 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2132 error = -ENOMEM; 2434 error = -ENOMEM;
2133 goto bad_swap; 2435 goto bad_swap;
2134 } 2436 }
2437 if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2438 p->flags |= SWP_SOLIDSTATE;
2439 /*
2440 * select a random position to start with, to help SSD wear
2441 * leveling
2442 */
2443 p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
2444
2445 cluster_info = vzalloc(DIV_ROUND_UP(maxpages,
2446 SWAPFILE_CLUSTER) * sizeof(*cluster_info));
2447 if (!cluster_info) {
2448 error = -ENOMEM;
2449 goto bad_swap;
2450 }
2451 p->percpu_cluster = alloc_percpu(struct percpu_cluster);
2452 if (!p->percpu_cluster) {
2453 error = -ENOMEM;
2454 goto bad_swap;
2455 }
2456 for_each_possible_cpu(i) {
2457 struct percpu_cluster *cluster;
2458 cluster = per_cpu_ptr(p->percpu_cluster, i);
2459 cluster_set_null(&cluster->index);
2460 }
2461 }
2135 2462
2136 error = swap_cgroup_swapon(p->type, maxpages); 2463 error = swap_cgroup_swapon(p->type, maxpages);
2137 if (error) 2464 if (error)
2138 goto bad_swap; 2465 goto bad_swap;
2139 2466
2140 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, 2467 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2141 maxpages, &span); 2468 cluster_info, maxpages, &span);
2142 if (unlikely(nr_extents < 0)) { 2469 if (unlikely(nr_extents < 0)) {
2143 error = nr_extents; 2470 error = nr_extents;
2144 goto bad_swap; 2471 goto bad_swap;
@@ -2147,41 +2474,33 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2147 if (frontswap_enabled) 2474 if (frontswap_enabled)
2148 frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); 2475 frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long));
2149 2476
2150 if (p->bdev) { 2477 if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
2151 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2478 /*
2152 p->flags |= SWP_SOLIDSTATE; 2479 * When discard is enabled for swap with no particular
2153 p->cluster_next = 1 + (prandom_u32() % p->highest_bit); 2480 * policy flagged, we set all swap discard flags here in
2154 } 2481 * order to sustain backward compatibility with older
2155 2482 * swapon(8) releases.
2156 if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { 2483 */
2157 /* 2484 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
2158 * When discard is enabled for swap with no particular 2485 SWP_PAGE_DISCARD);
2159 * policy flagged, we set all swap discard flags here in
2160 * order to sustain backward compatibility with older
2161 * swapon(8) releases.
2162 */
2163 p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
2164 SWP_PAGE_DISCARD);
2165 2486
2166 /* 2487 /*
2167 * By flagging sys_swapon, a sysadmin can tell us to 2488 * By flagging sys_swapon, a sysadmin can tell us to
2168 * either do single-time area discards only, or to just 2489 * either do single-time area discards only, or to just
2169 * perform discards for released swap page-clusters. 2490 * perform discards for released swap page-clusters.
2170 * Now it's time to adjust the p->flags accordingly. 2491 * Now it's time to adjust the p->flags accordingly.
2171 */ 2492 */
2172 if (swap_flags & SWAP_FLAG_DISCARD_ONCE) 2493 if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
2173 p->flags &= ~SWP_PAGE_DISCARD; 2494 p->flags &= ~SWP_PAGE_DISCARD;
2174 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) 2495 else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
2175 p->flags &= ~SWP_AREA_DISCARD; 2496 p->flags &= ~SWP_AREA_DISCARD;
2176 2497
2177 /* issue a swapon-time discard if it's still required */ 2498 /* issue a swapon-time discard if it's still required */
2178 if (p->flags & SWP_AREA_DISCARD) { 2499 if (p->flags & SWP_AREA_DISCARD) {
2179 int err = discard_swap(p); 2500 int err = discard_swap(p);
2180 if (unlikely(err)) 2501 if (unlikely(err))
2181 printk(KERN_ERR 2502 pr_err("swapon: discard_swap(%p): %d\n",
2182 "swapon: discard_swap(%p): %d\n", 2503 p, err);
2183 p, err);
2184 }
2185 } 2504 }
2186 } 2505 }
2187 2506
@@ -2190,9 +2509,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2190 if (swap_flags & SWAP_FLAG_PREFER) 2509 if (swap_flags & SWAP_FLAG_PREFER)
2191 prio = 2510 prio =
2192 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2511 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2193 enable_swap_info(p, prio, swap_map, frontswap_map); 2512 enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);
2194 2513
2195 printk(KERN_INFO "Adding %uk swap on %s. " 2514 pr_info("Adding %uk swap on %s. "
2196 "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", 2515 "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
2197 p->pages<<(PAGE_SHIFT-10), name->name, p->prio, 2516 p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
2198 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2517 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
@@ -2211,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2211 error = 0; 2530 error = 0;
2212 goto out; 2531 goto out;
2213bad_swap: 2532bad_swap:
2533 free_percpu(p->percpu_cluster);
2534 p->percpu_cluster = NULL;
2214 if (inode && S_ISBLK(inode->i_mode) && p->bdev) { 2535 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2215 set_blocksize(p->bdev, p->old_block_size); 2536 set_blocksize(p->bdev, p->old_block_size);
2216 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); 2537 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
@@ -2222,6 +2543,7 @@ bad_swap:
2222 p->flags = 0; 2543 p->flags = 0;
2223 spin_unlock(&swap_lock); 2544 spin_unlock(&swap_lock);
2224 vfree(swap_map); 2545 vfree(swap_map);
2546 vfree(cluster_info);
2225 if (swap_file) { 2547 if (swap_file) {
2226 if (inode && S_ISREG(inode->i_mode)) { 2548 if (inode && S_ISREG(inode->i_mode)) {
2227 mutex_unlock(&inode->i_mutex); 2549 mutex_unlock(&inode->i_mutex);
@@ -2291,6 +2613,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
2291 goto unlock_out; 2613 goto unlock_out;
2292 2614
2293 count = p->swap_map[offset]; 2615 count = p->swap_map[offset];
2616
2617 /*
2618 * swapin_readahead() doesn't check if a swap entry is valid, so the
2619 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
2620 */
2621 if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
2622 err = -ENOENT;
2623 goto unlock_out;
2624 }
2625
2294 has_cache = count & SWAP_HAS_CACHE; 2626 has_cache = count & SWAP_HAS_CACHE;
2295 count &= ~SWAP_HAS_CACHE; 2627 count &= ~SWAP_HAS_CACHE;
2296 err = 0; 2628 err = 0;
@@ -2326,7 +2658,7 @@ out:
2326 return err; 2658 return err;
2327 2659
2328bad_file: 2660bad_file:
2329 printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); 2661 pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
2330 goto out; 2662 goto out;
2331} 2663}
2332 2664
diff --git a/mm/util.c b/mm/util.c
index 7441c41d00f6..eaf63fc2c92f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -388,15 +388,12 @@ struct address_space *page_mapping(struct page *page)
388 struct address_space *mapping = page->mapping; 388 struct address_space *mapping = page->mapping;
389 389
390 VM_BUG_ON(PageSlab(page)); 390 VM_BUG_ON(PageSlab(page));
391#ifdef CONFIG_SWAP
392 if (unlikely(PageSwapCache(page))) { 391 if (unlikely(PageSwapCache(page))) {
393 swp_entry_t entry; 392 swp_entry_t entry;
394 393
395 entry.val = page_private(page); 394 entry.val = page_private(page);
396 mapping = swap_address_space(entry); 395 mapping = swap_address_space(entry);
397 } else 396 } else if ((unsigned long)mapping & PAGE_MAPPING_ANON)
398#endif
399 if ((unsigned long)mapping & PAGE_MAPPING_ANON)
400 mapping = NULL; 397 mapping = NULL;
401 return mapping; 398 return mapping;
402} 399}
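Dropping the CONFIG_SWAP ifdef is safe presumably because PageSwapCache() evaluates to a constant 0 without CONFIG_SWAP, letting the compiler discard the swap branch. The anon test still works because anonymous pages store an anon_vma pointer in page->mapping with the low bit set; the convention being tested (see include/linux/mm.h, shown as a sketch):

        #define PAGE_MAPPING_ANON 1
        /*
         * For an anonymous page, page->mapping holds
         *     (void *)anon_vma + PAGE_MAPPING_ANON,
         * so testing the low bit distinguishes anon pages from file pages, and
         * such a "mapping" must never be dereferenced as an address_space.
         */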
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 13a54953a273..107454312d5e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -752,7 +752,6 @@ struct vmap_block_queue {
752struct vmap_block { 752struct vmap_block {
753 spinlock_t lock; 753 spinlock_t lock;
754 struct vmap_area *va; 754 struct vmap_area *va;
755 struct vmap_block_queue *vbq;
756 unsigned long free, dirty; 755 unsigned long free, dirty;
757 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 756 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
758 struct list_head free_list; 757 struct list_head free_list;
@@ -830,7 +829,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
830 radix_tree_preload_end(); 829 radix_tree_preload_end();
831 830
832 vbq = &get_cpu_var(vmap_block_queue); 831 vbq = &get_cpu_var(vmap_block_queue);
833 vb->vbq = vbq;
834 spin_lock(&vbq->lock); 832 spin_lock(&vbq->lock);
835 list_add_rcu(&vb->free_list, &vbq->free); 833 list_add_rcu(&vb->free_list, &vbq->free);
836 spin_unlock(&vbq->lock); 834 spin_unlock(&vbq->lock);
@@ -1018,15 +1016,16 @@ void vm_unmap_aliases(void)
1018 1016
1019 rcu_read_lock(); 1017 rcu_read_lock();
1020 list_for_each_entry_rcu(vb, &vbq->free, free_list) { 1018 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1021 int i; 1019 int i, j;
1022 1020
1023 spin_lock(&vb->lock); 1021 spin_lock(&vb->lock);
1024 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); 1022 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
1025 while (i < VMAP_BBMAP_BITS) { 1023 if (i < VMAP_BBMAP_BITS) {
1026 unsigned long s, e; 1024 unsigned long s, e;
1027 int j; 1025
1028 j = find_next_zero_bit(vb->dirty_map, 1026 j = find_last_bit(vb->dirty_map,
1029 VMAP_BBMAP_BITS, i); 1027 VMAP_BBMAP_BITS);
1028 j = j + 1; /* need exclusive index */
1030 1029
1031 s = vb->va->va_start + (i << PAGE_SHIFT); 1030 s = vb->va->va_start + (i << PAGE_SHIFT);
1032 e = vb->va->va_start + (j << PAGE_SHIFT); 1031 e = vb->va->va_start + (j << PAGE_SHIFT);
@@ -1036,10 +1035,6 @@ void vm_unmap_aliases(void)
1036 start = s; 1035 start = s;
1037 if (e > end) 1036 if (e > end)
1038 end = e; 1037 end = e;
1039
1040 i = j;
1041 i = find_next_bit(vb->dirty_map,
1042 VMAP_BBMAP_BITS, i);
1043 } 1038 }
1044 spin_unlock(&vb->lock); 1039 spin_unlock(&vb->lock);
1045 } 1040 }
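Instead of walking every dirty sub-range of a vmap block, the new code computes one conservative range from the first to the last dirty bit: it may cover clean pages in between, but it needs only two bitmap scans (find_first_bit()/find_last_bit()). The same idea as a runnable userspace sketch with a toy bitmap:

        #include <stdio.h>

        int main(void)
        {
                unsigned long dirty = 0;
                int nbits = 16;

                dirty |= 1UL << 3;      /* pages 3 and 9 are dirty */
                dirty |= 1UL << 9;

                int first = -1, last = -1;
                for (int i = 0; i < nbits; i++)
                        if (dirty & (1UL << i)) {
                                if (first < 0)
                                        first = i;
                                last = i;
                        }
                if (first >= 0)         /* one conservative flush range */
                        printf("flush pages [%d, %d)\n", first, last + 1);
                return 0;
        }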
@@ -1263,7 +1258,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
1263int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) 1258int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
1264{ 1259{
1265 unsigned long addr = (unsigned long)area->addr; 1260 unsigned long addr = (unsigned long)area->addr;
1266 unsigned long end = addr + area->size - PAGE_SIZE; 1261 unsigned long end = addr + get_vm_area_size(area);
1267 int err; 1262 int err;
1268 1263
1269 err = vmap_page_range(addr, end, prot, *pages); 1264 err = vmap_page_range(addr, end, prot, *pages);
@@ -1558,7 +1553,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1558 unsigned int nr_pages, array_size, i; 1553 unsigned int nr_pages, array_size, i;
1559 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 1554 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
1560 1555
1561 nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; 1556 nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
1562 array_size = (nr_pages * sizeof(struct page *)); 1557 array_size = (nr_pages * sizeof(struct page *));
1563 1558
1564 area->nr_pages = nr_pages; 1559 area->nr_pages = nr_pages;
@@ -1990,7 +1985,7 @@ long vread(char *buf, char *addr, unsigned long count)
1990 1985
1991 vm = va->vm; 1986 vm = va->vm;
1992 vaddr = (char *) vm->addr; 1987 vaddr = (char *) vm->addr;
1993 if (addr >= vaddr + vm->size - PAGE_SIZE) 1988 if (addr >= vaddr + get_vm_area_size(vm))
1994 continue; 1989 continue;
1995 while (addr < vaddr) { 1990 while (addr < vaddr) {
1996 if (count == 0) 1991 if (count == 0)
@@ -2000,7 +1995,7 @@ long vread(char *buf, char *addr, unsigned long count)
2000 addr++; 1995 addr++;
2001 count--; 1996 count--;
2002 } 1997 }
2003 n = vaddr + vm->size - PAGE_SIZE - addr; 1998 n = vaddr + get_vm_area_size(vm) - addr;
2004 if (n > count) 1999 if (n > count)
2005 n = count; 2000 n = count;
2006 if (!(vm->flags & VM_IOREMAP)) 2001 if (!(vm->flags & VM_IOREMAP))
@@ -2072,7 +2067,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
2072 2067
2073 vm = va->vm; 2068 vm = va->vm;
2074 vaddr = (char *) vm->addr; 2069 vaddr = (char *) vm->addr;
2075 if (addr >= vaddr + vm->size - PAGE_SIZE) 2070 if (addr >= vaddr + get_vm_area_size(vm))
2076 continue; 2071 continue;
2077 while (addr < vaddr) { 2072 while (addr < vaddr) {
2078 if (count == 0) 2073 if (count == 0)
@@ -2081,7 +2076,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
2081 addr++; 2076 addr++;
2082 count--; 2077 count--;
2083 } 2078 }
2084 n = vaddr + vm->size - PAGE_SIZE - addr; 2079 n = vaddr + get_vm_area_size(vm) - addr;
2085 if (n > count) 2080 if (n > count)
2086 n = count; 2081 n = count;
2087 if (!(vm->flags & VM_IOREMAP)) { 2082 if (!(vm->flags & VM_IOREMAP)) {
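map_vm_area(), __vmalloc_area_node(), vread() and vwrite() all open-coded "area->size - PAGE_SIZE" to skip the guard page; they now share a helper. Its presumed definition (from include/linux/vmalloc.h, shown as a sketch):

        static inline size_t get_vm_area_size(const struct vm_struct *area)
        {
                /* the size field includes the trailing guard page */
                return area->size - PAGE_SIZE;
        }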
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e36454220614..beb35778c69f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -146,6 +146,25 @@ static bool global_reclaim(struct scan_control *sc)
146} 146}
147#endif 147#endif
148 148
149unsigned long zone_reclaimable_pages(struct zone *zone)
150{
151 int nr;
152
153 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
154 zone_page_state(zone, NR_INACTIVE_FILE);
155
156 if (get_nr_swap_pages() > 0)
157 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
158 zone_page_state(zone, NR_INACTIVE_ANON);
159
160 return nr;
161}
162
163bool zone_reclaimable(struct zone *zone)
164{
165 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
166}
167
149static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) 168static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
150{ 169{
151 if (!mem_cgroup_disabled()) 170 if (!mem_cgroup_disabled())
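The helpers above let callers test reclaimability directly instead of consulting the zone->all_unreclaimable flag: a zone counts as reclaimable while fewer than six times its potentially reclaimable pages (file LRUs always, anon LRUs only while swap space remains) have been scanned. A runnable toy version of the check with made-up numbers:

        #include <stdbool.h>
        #include <stdio.h>

        /* Same heuristic as zone_reclaimable() in the hunk above. */
        static bool zone_reclaimable(unsigned long pages_scanned,
                                     unsigned long reclaimable_pages)
        {
                return pages_scanned < reclaimable_pages * 6;
        }

        int main(void)
        {
                /* e.g. 10000 reclaimable pages: give up only after ~60000 scans */
                printf("%d\n", zone_reclaimable(59000, 10000)); /* 1: keep trying */
                printf("%d\n", zone_reclaimable(60000, 10000)); /* 0: treat as unreclaimable */
                return 0;
        }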
@@ -579,7 +598,7 @@ int remove_mapping(struct address_space *mapping, struct page *page)
579 */ 598 */
580void putback_lru_page(struct page *page) 599void putback_lru_page(struct page *page)
581{ 600{
582 int lru; 601 bool is_unevictable;
583 int was_unevictable = PageUnevictable(page); 602 int was_unevictable = PageUnevictable(page);
584 603
585 VM_BUG_ON(PageLRU(page)); 604 VM_BUG_ON(PageLRU(page));
@@ -594,14 +613,14 @@ redo:
594 * unevictable page on [in]active list. 613 * unevictable page on [in]active list.
595 * We know how to handle that. 614 * We know how to handle that.
596 */ 615 */
597 lru = page_lru_base_type(page); 616 is_unevictable = false;
598 lru_cache_add(page); 617 lru_cache_add(page);
599 } else { 618 } else {
600 /* 619 /*
601 * Put unevictable pages directly on zone's unevictable 620 * Put unevictable pages directly on zone's unevictable
602 * list. 621 * list.
603 */ 622 */
604 lru = LRU_UNEVICTABLE; 623 is_unevictable = true;
605 add_page_to_unevictable_list(page); 624 add_page_to_unevictable_list(page);
606 /* 625 /*
607 * When racing with an mlock or AS_UNEVICTABLE clearing 626 * When racing with an mlock or AS_UNEVICTABLE clearing
@@ -621,7 +640,7 @@ redo:
621 * page is on unevictable list, it never be freed. To avoid that, 640 * page is on unevictable list, it never be freed. To avoid that,
622 * check after we added it to the list, again. 641 * check after we added it to the list, again.
623 */ 642 */
624 if (lru == LRU_UNEVICTABLE && page_evictable(page)) { 643 if (is_unevictable && page_evictable(page)) {
625 if (!isolate_lru_page(page)) { 644 if (!isolate_lru_page(page)) {
626 put_page(page); 645 put_page(page);
627 goto redo; 646 goto redo;
@@ -632,9 +651,9 @@ redo:
632 */ 651 */
633 } 652 }
634 653
635 if (was_unevictable && lru != LRU_UNEVICTABLE) 654 if (was_unevictable && !is_unevictable)
636 count_vm_event(UNEVICTABLE_PGRESCUED); 655 count_vm_event(UNEVICTABLE_PGRESCUED);
637 else if (!was_unevictable && lru == LRU_UNEVICTABLE) 656 else if (!was_unevictable && is_unevictable)
638 count_vm_event(UNEVICTABLE_PGCULLED); 657 count_vm_event(UNEVICTABLE_PGCULLED);
639 658
640 put_page(page); /* drop ref from isolate */ 659 put_page(page); /* drop ref from isolate */
@@ -1823,7 +1842,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1823 * latencies, so it's better to scan a minimum amount there as 1842 * latencies, so it's better to scan a minimum amount there as
1824 * well. 1843 * well.
1825 */ 1844 */
1826 if (current_is_kswapd() && zone->all_unreclaimable) 1845 if (current_is_kswapd() && !zone_reclaimable(zone))
1827 force_scan = true; 1846 force_scan = true;
1828 if (!global_reclaim(sc)) 1847 if (!global_reclaim(sc))
1829 force_scan = true; 1848 force_scan = true;
@@ -2278,8 +2297,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2278 if (global_reclaim(sc)) { 2297 if (global_reclaim(sc)) {
2279 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2298 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2280 continue; 2299 continue;
2281 if (zone->all_unreclaimable && 2300 if (sc->priority != DEF_PRIORITY &&
2282 sc->priority != DEF_PRIORITY) 2301 !zone_reclaimable(zone))
2283 continue; /* Let kswapd poll it */ 2302 continue; /* Let kswapd poll it */
2284 if (IS_ENABLED(CONFIG_COMPACTION)) { 2303 if (IS_ENABLED(CONFIG_COMPACTION)) {
2285 /* 2304 /*
@@ -2317,11 +2336,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2317 return aborted_reclaim; 2336 return aborted_reclaim;
2318} 2337}
2319 2338
2320static bool zone_reclaimable(struct zone *zone)
2321{
2322 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
2323}
2324
2325/* All zones in zonelist are unreclaimable? */ 2339/* All zones in zonelist are unreclaimable? */
2326static bool all_unreclaimable(struct zonelist *zonelist, 2340static bool all_unreclaimable(struct zonelist *zonelist,
2327 struct scan_control *sc) 2341 struct scan_control *sc)
@@ -2335,7 +2349,7 @@ static bool all_unreclaimable(struct zonelist *zonelist,
2335 continue; 2349 continue;
2336 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2350 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2337 continue; 2351 continue;
2338 if (!zone->all_unreclaimable) 2352 if (zone_reclaimable(zone))
2339 return false; 2353 return false;
2340 } 2354 }
2341 2355
@@ -2750,7 +2764,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
2750 * DEF_PRIORITY. Effectively, it considers them balanced so 2764 * DEF_PRIORITY. Effectively, it considers them balanced so
2751 * they must be considered balanced here as well! 2765 * they must be considered balanced here as well!
2752 */ 2766 */
2753 if (zone->all_unreclaimable) { 2767 if (!zone_reclaimable(zone)) {
2754 balanced_pages += zone->managed_pages; 2768 balanced_pages += zone->managed_pages;
2755 continue; 2769 continue;
2756 } 2770 }
@@ -2811,7 +2825,6 @@ static bool kswapd_shrink_zone(struct zone *zone,
2811 unsigned long lru_pages, 2825 unsigned long lru_pages,
2812 unsigned long *nr_attempted) 2826 unsigned long *nr_attempted)
2813{ 2827{
2814 unsigned long nr_slab;
2815 int testorder = sc->order; 2828 int testorder = sc->order;
2816 unsigned long balance_gap; 2829 unsigned long balance_gap;
2817 struct reclaim_state *reclaim_state = current->reclaim_state; 2830 struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -2858,15 +2871,12 @@ static bool kswapd_shrink_zone(struct zone *zone,
2858 node_set(zone_to_nid(zone), shrink.nodes_to_scan); 2871 node_set(zone_to_nid(zone), shrink.nodes_to_scan);
2859 2872
2860 reclaim_state->reclaimed_slab = 0; 2873 reclaim_state->reclaimed_slab = 0;
2861 nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); 2874 shrink_slab(&shrink, sc->nr_scanned, lru_pages);
2862 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2875 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2863 2876
2864 /* Account for the number of pages attempted to reclaim */ 2877 /* Account for the number of pages attempted to reclaim */
2865 *nr_attempted += sc->nr_to_reclaim; 2878 *nr_attempted += sc->nr_to_reclaim;
2866 2879
2867 if (nr_slab == 0 && !zone_reclaimable(zone))
2868 zone->all_unreclaimable = 1;
2869
2870 zone_clear_flag(zone, ZONE_WRITEBACK); 2880 zone_clear_flag(zone, ZONE_WRITEBACK);
2871 2881
2872 /* 2882 /*
@@ -2875,7 +2885,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
2875 * BDIs but as pressure is relieved, speculatively avoid congestion 2885 * BDIs but as pressure is relieved, speculatively avoid congestion
2876 * waits. 2886 * waits.
2877 */ 2887 */
2878 if (!zone->all_unreclaimable && 2888 if (zone_reclaimable(zone) &&
2879 zone_balanced(zone, testorder, 0, classzone_idx)) { 2889 zone_balanced(zone, testorder, 0, classzone_idx)) {
2880 zone_clear_flag(zone, ZONE_CONGESTED); 2890 zone_clear_flag(zone, ZONE_CONGESTED);
2881 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); 2891 zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
@@ -2941,8 +2951,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2941 if (!populated_zone(zone)) 2951 if (!populated_zone(zone))
2942 continue; 2952 continue;
2943 2953
2944 if (zone->all_unreclaimable && 2954 if (sc.priority != DEF_PRIORITY &&
2945 sc.priority != DEF_PRIORITY) 2955 !zone_reclaimable(zone))
2946 continue; 2956 continue;
2947 2957
2948 /* 2958 /*
@@ -3020,8 +3030,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
3020 if (!populated_zone(zone)) 3030 if (!populated_zone(zone))
3021 continue; 3031 continue;
3022 3032
3023 if (zone->all_unreclaimable && 3033 if (sc.priority != DEF_PRIORITY &&
3024 sc.priority != DEF_PRIORITY) 3034 !zone_reclaimable(zone))
3025 continue; 3035 continue;
3026 3036
3027 sc.nr_scanned = 0; 3037 sc.nr_scanned = 0;
@@ -3277,7 +3287,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
3277 } 3287 }
3278 if (!waitqueue_active(&pgdat->kswapd_wait)) 3288 if (!waitqueue_active(&pgdat->kswapd_wait))
3279 return; 3289 return;
3280 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) 3290 if (zone_balanced(zone, order, 0, 0))
3281 return; 3291 return;
3282 3292
3283 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); 3293 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
@@ -3305,20 +3315,6 @@ unsigned long global_reclaimable_pages(void)
3305 return nr; 3315 return nr;
3306} 3316}
3307 3317
3308unsigned long zone_reclaimable_pages(struct zone *zone)
3309{
3310 int nr;
3311
3312 nr = zone_page_state(zone, NR_ACTIVE_FILE) +
3313 zone_page_state(zone, NR_INACTIVE_FILE);
3314
3315 if (get_nr_swap_pages() > 0)
3316 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
3317 zone_page_state(zone, NR_INACTIVE_ANON);
3318
3319 return nr;
3320}
3321
3322#ifdef CONFIG_HIBERNATION 3318#ifdef CONFIG_HIBERNATION
3323/* 3319/*
3324 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of 3320 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
@@ -3615,7 +3611,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3615 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) 3611 zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
3616 return ZONE_RECLAIM_FULL; 3612 return ZONE_RECLAIM_FULL;
3617 3613
3618 if (zone->all_unreclaimable) 3614 if (!zone_reclaimable(zone))
3619 return ZONE_RECLAIM_FULL; 3615 return ZONE_RECLAIM_FULL;
3620 3616
3621 /* 3617 /*
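The two helpers removed from vmscan.c above are not simply deleted: the vmstat.c hunk below starts including <linux/mm_inline.h> and "internal.h" and keeps calling zone_reclaimable(), so their bodies evidently move to a shared location. As a reading aid, a minimal sketch of the relocated helpers, reconstructed from the removed lines (the exact new home and linkage are an assumption, not shown in this diff):

/*
 * Sketch only: bodies copied from the lines removed above. Assumes the
 * usual mm headers (linux/mm.h, linux/swap.h) and prototypes in the
 * shared "internal.h".
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
	int nr;

	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
	     zone_page_state(zone, NR_INACTIVE_FILE);

	if (get_nr_swap_pages() > 0)
		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
		      zone_page_state(zone, NR_INACTIVE_ANON);

	return nr;
}

bool zone_reclaimable(struct zone *zone)
{
	/* Treat a zone as unreclaimable once six times its reclaimable
	 * pages have been scanned without freeing anything. */
	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
}

With these helpers shared, every zone->all_unreclaimable test above collapses into a direct !zone_reclaimable(zone) check, which is why the nr_slab bookkeeping that only existed to set the flag in kswapd_shrink_zone() is dropped.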
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c2ef4458fa..9bb314577911 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -19,6 +19,9 @@
19#include <linux/math64.h> 19#include <linux/math64.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
21#include <linux/compaction.h> 21#include <linux/compaction.h>
22#include <linux/mm_inline.h>
23
24#include "internal.h"
22 25
23#ifdef CONFIG_VM_EVENT_COUNTERS 26#ifdef CONFIG_VM_EVENT_COUNTERS
24DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 27DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -414,12 +417,17 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
414EXPORT_SYMBOL(dec_zone_page_state); 417EXPORT_SYMBOL(dec_zone_page_state);
415#endif 418#endif
416 419
420static inline void fold_diff(int *diff)
421{
422 int i;
423
424 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
425 if (diff[i])
426 atomic_long_add(diff[i], &vm_stat[i]);
427}
428
417/* 429/*
418 * Update the zone counters for one cpu. 430 * Update the zone counters for the current cpu.
419 *
420 * The cpu specified must be either the current cpu or a processor that
421 * is not online. If it is the current cpu then the execution thread must
422 * be pinned to the current cpu.
423 * 431 *
424 * Note that refresh_cpu_vm_stats strives to only access 432 * Note that refresh_cpu_vm_stats strives to only access
425 * node local memory. The per cpu pagesets on remote zones are placed 433 * node local memory. The per cpu pagesets on remote zones are placed
@@ -432,33 +440,29 @@ EXPORT_SYMBOL(dec_zone_page_state);
432 * with the global counters. These could cause remote node cache line 440 * with the global counters. These could cause remote node cache line
433 * bouncing and will have to be only done when necessary. 441 * bouncing and will have to be only done when necessary.
434 */ 442 */
435void refresh_cpu_vm_stats(int cpu) 443static void refresh_cpu_vm_stats(void)
436{ 444{
437 struct zone *zone; 445 struct zone *zone;
438 int i; 446 int i;
439 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; 447 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
440 448
441 for_each_populated_zone(zone) { 449 for_each_populated_zone(zone) {
442 struct per_cpu_pageset *p; 450 struct per_cpu_pageset __percpu *p = zone->pageset;
443 451
444 p = per_cpu_ptr(zone->pageset, cpu); 452 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
453 int v;
445 454
446 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 455 v = this_cpu_xchg(p->vm_stat_diff[i], 0);
447 if (p->vm_stat_diff[i]) { 456 if (v) {
448 unsigned long flags;
449 int v;
450 457
451 local_irq_save(flags);
452 v = p->vm_stat_diff[i];
453 p->vm_stat_diff[i] = 0;
454 local_irq_restore(flags);
455 atomic_long_add(v, &zone->vm_stat[i]); 458 atomic_long_add(v, &zone->vm_stat[i]);
456 global_diff[i] += v; 459 global_diff[i] += v;
457#ifdef CONFIG_NUMA 460#ifdef CONFIG_NUMA
458 /* 3 seconds idle till flush */ 461 /* 3 seconds idle till flush */
459 p->expire = 3; 462 __this_cpu_write(p->expire, 3);
460#endif 463#endif
461 } 464 }
465 }
462 cond_resched(); 466 cond_resched();
463#ifdef CONFIG_NUMA 467#ifdef CONFIG_NUMA
464 /* 468 /*
@@ -468,29 +472,57 @@ void refresh_cpu_vm_stats(int cpu)
468 * Check if there are pages remaining in this pageset 472 * Check if there are pages remaining in this pageset
469 * if not then there is nothing to expire. 473 * if not then there is nothing to expire.
470 */ 474 */
471 if (!p->expire || !p->pcp.count) 475 if (!__this_cpu_read(p->expire) ||
476 !__this_cpu_read(p->pcp.count))
472 continue; 477 continue;
473 478
474 /* 479 /*
475 * We never drain zones local to this processor. 480 * We never drain zones local to this processor.
476 */ 481 */
477 if (zone_to_nid(zone) == numa_node_id()) { 482 if (zone_to_nid(zone) == numa_node_id()) {
478 p->expire = 0; 483 __this_cpu_write(p->expire, 0);
479 continue; 484 continue;
480 } 485 }
481 486
482 p->expire--; 487
483 if (p->expire) 488 if (__this_cpu_dec_return(p->expire))
484 continue; 489 continue;
485 490
486 if (p->pcp.count) 491 if (__this_cpu_read(p->pcp.count))
487 drain_zone_pages(zone, &p->pcp); 492 drain_zone_pages(zone, __this_cpu_ptr(&p->pcp));
488#endif 493#endif
489 } 494 }
495 fold_diff(global_diff);
496}
490 497
491 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 498/*
492 if (global_diff[i]) 499 * Fold the data for an offline cpu into the global array.
493 atomic_long_add(global_diff[i], &vm_stat[i]); 500 * There cannot be any access by the offline cpu and therefore
501 * synchronization is simplified.
502 */
503void cpu_vm_stats_fold(int cpu)
504{
505 struct zone *zone;
506 int i;
507 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
508
509 for_each_populated_zone(zone) {
510 struct per_cpu_pageset *p;
511
512 p = per_cpu_ptr(zone->pageset, cpu);
513
514 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
515 if (p->vm_stat_diff[i]) {
516 int v;
517
518 v = p->vm_stat_diff[i];
519 p->vm_stat_diff[i] = 0;
520 atomic_long_add(v, &zone->vm_stat[i]);
521 global_diff[i] += v;
522 }
523 }
524
525 fold_diff(global_diff);
494} 526}
495 527
496/* 528/*
@@ -703,6 +735,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
703const char * const vmstat_text[] = { 735const char * const vmstat_text[] = {
704 /* Zoned VM counters */ 736 /* Zoned VM counters */
705 "nr_free_pages", 737 "nr_free_pages",
738 "nr_alloc_batch",
706 "nr_inactive_anon", 739 "nr_inactive_anon",
707 "nr_active_anon", 740 "nr_active_anon",
708 "nr_inactive_file", 741 "nr_inactive_file",
@@ -817,6 +850,12 @@ const char * const vmstat_text[] = {
817 "thp_zero_page_alloc", 850 "thp_zero_page_alloc",
818 "thp_zero_page_alloc_failed", 851 "thp_zero_page_alloc_failed",
819#endif 852#endif
853#ifdef CONFIG_SMP
854 "nr_tlb_remote_flush",
855 "nr_tlb_remote_flush_received",
856#endif
857 "nr_tlb_local_flush_all",
858 "nr_tlb_local_flush_one",
820 859
821#endif /* CONFIG_VM_EVENTS_COUNTERS */ 860#endif /* CONFIG_VM_EVENTS_COUNTERS */
822}; 861};
@@ -1052,7 +1091,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
1052 "\n all_unreclaimable: %u" 1091 "\n all_unreclaimable: %u"
1053 "\n start_pfn: %lu" 1092 "\n start_pfn: %lu"
1054 "\n inactive_ratio: %u", 1093 "\n inactive_ratio: %u",
1055 zone->all_unreclaimable, 1094 !zone_reclaimable(zone),
1056 zone->zone_start_pfn, 1095 zone->zone_start_pfn,
1057 zone->inactive_ratio); 1096 zone->inactive_ratio);
1058 seq_putc(m, '\n'); 1097 seq_putc(m, '\n');
@@ -1177,7 +1216,7 @@ int sysctl_stat_interval __read_mostly = HZ;
1177 1216
1178static void vmstat_update(struct work_struct *w) 1217static void vmstat_update(struct work_struct *w)
1179{ 1218{
1180 refresh_cpu_vm_stats(smp_processor_id()); 1219 refresh_cpu_vm_stats();
1181 schedule_delayed_work(&__get_cpu_var(vmstat_work), 1220 schedule_delayed_work(&__get_cpu_var(vmstat_work),
1182 round_jiffies_relative(sysctl_stat_interval)); 1221 round_jiffies_relative(sysctl_stat_interval));
1183} 1222}
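Two folding paths now exist in vmstat.c: refresh_cpu_vm_stats() runs on the live CPU, where the per-cpu diffs may still be updated concurrently, so each diff is grabbed and cleared in one step with this_cpu_xchg() before being accumulated and handed to fold_diff(); cpu_vm_stats_fold() handles a CPU that is already offline and can read and zero the diffs with plain accesses. A compact user-space analogue of the grab-and-fold pattern (illustrative only; the names and the C11 atomics are stand-ins for the kernel's per-cpu ops and vm_stat[]):

#include <stdatomic.h>
#include <stdio.h>

#define NR_ITEMS 3
#define NR_CPUS  4

static atomic_long global_stat[NR_ITEMS];        /* plays the role of vm_stat[] */
static _Atomic int cpu_diff[NR_CPUS][NR_ITEMS];  /* plays the role of vm_stat_diff[] */

/* Live-CPU path: grab-and-clear each diff atomically, then fold once. */
static void refresh_cpu_stats(int cpu)
{
	long pending[NR_ITEMS] = { 0 };
	int i;

	for (i = 0; i < NR_ITEMS; i++) {
		int v = atomic_exchange(&cpu_diff[cpu][i], 0);  /* like this_cpu_xchg() */
		if (v)
			pending[i] += v;
	}
	for (i = 0; i < NR_ITEMS; i++)                          /* like fold_diff() */
		if (pending[i])
			atomic_fetch_add(&global_stat[i], pending[i]);
}

int main(void)
{
	atomic_fetch_add(&cpu_diff[1][0], 5);    /* simulate per-cpu counter drift */
	atomic_fetch_add(&cpu_diff[1][2], -2);
	refresh_cpu_stats(1);
	printf("item0=%ld item2=%ld\n",
	       (long)atomic_load(&global_stat[0]),
	       (long)atomic_load(&global_stat[2]));
	return 0;
}

The offline-CPU variant would be the same loop minus the atomic exchange, which is exactly the difference between the two kernel functions above.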
diff --git a/mm/zbud.c b/mm/zbud.c
index ad1e781284fd..9451361e6aa7 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -16,7 +16,7 @@
16 * 16 *
17 * zbud works by storing compressed pages, or "zpages", together in pairs in a 17 * zbud works by storing compressed pages, or "zpages", together in pairs in a
18 * single memory page called a "zbud page". The first buddy is "left 18 * single memory page called a "zbud page". The first buddy is "left
19 * justifed" at the beginning of the zbud page, and the last buddy is "right 19 * justified" at the beginning of the zbud page, and the last buddy is "right
20 * justified" at the end of the zbud page. The benefit is that if either 20 * justified" at the end of the zbud page. The benefit is that if either
21 * buddy is freed, the freed buddy space, coalesced with whatever slack space 21 * buddy is freed, the freed buddy space, coalesced with whatever slack space
22 * that existed between the buddies, results in the largest possible free region 22 * that existed between the buddies, results in the largest possible free region
@@ -243,7 +243,7 @@ void zbud_destroy_pool(struct zbud_pool *pool)
243 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used 243 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
244 * as zbud pool pages. 244 * as zbud pool pages.
245 * 245 *
246 * Return: 0 if success and handle is set, otherwise -EINVAL is the size or 246 * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate 247 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
248 * a new page. 248 * a new page.
249 */ 249 */
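The zswap hunk that follows replaces the old rb_first()/rb_erase() loop, which rebalanced the tree after every removal, with a post-order walk that simply frees each entry, since the whole tree is being discarded anyway. A user-space sketch of why post-order is the right order for bulk teardown (illustrative only; a plain binary tree, not the kernel rbtree API):

#include <stdlib.h>

struct node {
	struct node *left, *right;
	void *payload;
};

/* Free children before their parent so no freed node is dereferenced. */
static void free_tree_postorder(struct node *n)
{
	if (!n)
		return;
	free_tree_postorder(n->left);
	free_tree_postorder(n->right);
	free(n->payload);
	free(n);
}

rbtree_postorder_for_each_entry_safe() gives the same visiting order over an rbtree while remembering the next node before the loop body runs (the "safe" part), so each entry can be freed in place without re-walking or rebalancing the tree between removals.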
diff --git a/mm/zswap.c b/mm/zswap.c
index deda2b671e12..841e35f1db22 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -409,7 +409,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
409 struct page **retpage) 409 struct page **retpage)
410{ 410{
411 struct page *found_page, *new_page = NULL; 411 struct page *found_page, *new_page = NULL;
412 struct address_space *swapper_space = &swapper_spaces[swp_type(entry)]; 412 struct address_space *swapper_space = swap_address_space(entry);
413 int err; 413 int err;
414 414
415 *retpage = NULL; 415 *retpage = NULL;
@@ -790,26 +790,14 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
790static void zswap_frontswap_invalidate_area(unsigned type) 790static void zswap_frontswap_invalidate_area(unsigned type)
791{ 791{
792 struct zswap_tree *tree = zswap_trees[type]; 792 struct zswap_tree *tree = zswap_trees[type];
793 struct rb_node *node; 793 struct zswap_entry *entry, *n;
794 struct zswap_entry *entry;
795 794
796 if (!tree) 795 if (!tree)
797 return; 796 return;
798 797
799 /* walk the tree and free everything */ 798 /* walk the tree and free everything */
800 spin_lock(&tree->lock); 799 spin_lock(&tree->lock);
801 /* 800 rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) {
802 * TODO: Even though this code should not be executed because
803 * the try_to_unuse() in swapoff should have emptied the tree,
804 * it is very wasteful to rebalance the tree after every
805 * removal when we are freeing the whole tree.
806 *
807 * If post-order traversal code is ever added to the rbtree
808 * implementation, it should be used here.
809 */
810 while ((node = rb_first(&tree->rbroot))) {
811 entry = rb_entry(node, struct zswap_entry, rbnode);
812 rb_erase(&entry->rbnode, &tree->rbroot);
813 zbud_free(tree->pool, entry->handle); 801 zbud_free(tree->pool, entry->handle);
814 zswap_entry_cache_free(entry); 802 zswap_entry_cache_free(entry);
815 atomic_dec(&zswap_stored_pages); 803 atomic_dec(&zswap_stored_pages);