about summary refs log tree commit diff stats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/bootmem.c 5
-rw-r--r-- mm/compaction.c 77
-rw-r--r-- mm/filemap.c 20
-rw-r--r-- mm/huge_memory.c 125
-rw-r--r-- mm/hugetlb.c 184
-rw-r--r-- mm/ksm.c 34
-rw-r--r-- mm/memcontrol.c 473
-rw-r--r-- mm/memory-failure.c 2
-rw-r--r-- mm/memory.c 53
-rw-r--r-- mm/mempolicy.c 62
-rw-r--r-- mm/migrate.c 36
-rw-r--r-- mm/mincore.c 2
-rw-r--r-- mm/mmap.c 51
-rw-r--r-- mm/mmu_context.c 2
-rw-r--r-- mm/mprotect.c 2
-rw-r--r-- mm/oom_kill.c 166
-rw-r--r-- mm/page-writeback.c 1
-rw-r--r-- mm/page_alloc.c 58
-rw-r--r-- mm/pagewalk.c 2
-rw-r--r-- mm/pgtable-generic.c 5
-rw-r--r-- mm/rmap.c 70
-rw-r--r-- mm/shmem.c 88
-rw-r--r-- mm/slab.c 13
-rw-r--r-- mm/slub.c 40
-rw-r--r-- mm/sparse.c 30
-rw-r--r-- mm/swap.c 4
-rw-r--r-- mm/swap_state.c 24
-rw-r--r-- mm/swapfile.c 58
-rw-r--r-- mm/util.c 41
-rw-r--r-- mm/vmscan.c 151
30 files changed, 1108 insertions, 771 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 668e94df8cf2..0131170c9d54 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -766,14 +766,13 @@ void * __init alloc_bootmem_section(unsigned long size,
766 unsigned long section_nr) 766 unsigned long section_nr)
767{ 767{
768 bootmem_data_t *bdata; 768 bootmem_data_t *bdata;
769 unsigned long pfn, goal, limit; 769 unsigned long pfn, goal;
770 770
771 pfn = section_nr_to_pfn(section_nr); 771 pfn = section_nr_to_pfn(section_nr);
772 goal = pfn << PAGE_SHIFT; 772 goal = pfn << PAGE_SHIFT;
773 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
774 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 773 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
775 774
776 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 775 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
777} 776}
778#endif 777#endif
779 778
diff --git a/mm/compaction.c b/mm/compaction.c
index d9ebebe1a2aa..74a8c825ff28 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,7 +35,7 @@ struct compact_control {
35 unsigned long migrate_pfn; /* isolate_migratepages search base */ 35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */ 36 bool sync; /* Synchronous migration */
37 37
38 unsigned int order; /* order a direct compactor needs */ 38 int order; /* order a direct compactor needs */
39 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 39 int migratetype; /* MOVABLE, RECLAIMABLE etc */
40 struct zone *zone; 40 struct zone *zone;
41}; 41};
@@ -675,49 +675,71 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
675 675
676 676
677/* Compact all zones within a node */ 677/* Compact all zones within a node */
678static int compact_node(int nid) 678static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
679{ 679{
680 int zoneid; 680 int zoneid;
681 pg_data_t *pgdat;
682 struct zone *zone; 681 struct zone *zone;
683 682
684 if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
685 return -EINVAL;
686 pgdat = NODE_DATA(nid);
687
688 /* Flush pending updates to the LRU lists */
689 lru_add_drain_all();
690
691 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { 683 for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
692 struct compact_control cc = {
693 .nr_freepages = 0,
694 .nr_migratepages = 0,
695 .order = -1,
696 .sync = true,
697 };
698 684
699 zone = &pgdat->node_zones[zoneid]; 685 zone = &pgdat->node_zones[zoneid];
700 if (!populated_zone(zone)) 686 if (!populated_zone(zone))
701 continue; 687 continue;
702 688
703 cc.zone = zone; 689 cc->nr_freepages = 0;
704 INIT_LIST_HEAD(&cc.freepages); 690 cc->nr_migratepages = 0;
705 INIT_LIST_HEAD(&cc.migratepages); 691 cc->zone = zone;
706 692 INIT_LIST_HEAD(&cc->freepages);
707 compact_zone(zone, &cc); 693 INIT_LIST_HEAD(&cc->migratepages);
694
695 if (cc->order == -1 || !compaction_deferred(zone, cc->order))
696 compact_zone(zone, cc);
697
698 if (cc->order > 0) {
699 int ok = zone_watermark_ok(zone, cc->order,
700 low_wmark_pages(zone), 0, 0);
701 if (ok && cc->order > zone->compact_order_failed)
702 zone->compact_order_failed = cc->order + 1;
703 /* Currently async compaction is never deferred. */
704 else if (!ok && cc->sync)
705 defer_compaction(zone, cc->order);
706 }
708 707
709 VM_BUG_ON(!list_empty(&cc.freepages)); 708 VM_BUG_ON(!list_empty(&cc->freepages));
710 VM_BUG_ON(!list_empty(&cc.migratepages)); 709 VM_BUG_ON(!list_empty(&cc->migratepages));
711 } 710 }
712 711
713 return 0; 712 return 0;
714} 713}
715 714
715int compact_pgdat(pg_data_t *pgdat, int order)
716{
717 struct compact_control cc = {
718 .order = order,
719 .sync = false,
720 };
721
722 return __compact_pgdat(pgdat, &cc);
723}
724
725static int compact_node(int nid)
726{
727 struct compact_control cc = {
728 .order = -1,
729 .sync = true,
730 };
731
732 return __compact_pgdat(NODE_DATA(nid), &cc);
733}
734
716/* Compact all nodes in the system */ 735/* Compact all nodes in the system */
717static int compact_nodes(void) 736static int compact_nodes(void)
718{ 737{
719 int nid; 738 int nid;
720 739
740 /* Flush pending updates to the LRU lists */
741 lru_add_drain_all();
742
721 for_each_online_node(nid) 743 for_each_online_node(nid)
722 compact_node(nid); 744 compact_node(nid);
723 745
@@ -750,7 +772,14 @@ ssize_t sysfs_compact_node(struct device *dev,
750 struct device_attribute *attr, 772 struct device_attribute *attr,
751 const char *buf, size_t count) 773 const char *buf, size_t count)
752{ 774{
753 compact_node(dev->id); 775 int nid = dev->id;
776
777 if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
778 /* Flush pending updates to the LRU lists */
779 lru_add_drain_all();
780
781 compact_node(nid);
782 }
754 783
755 return count; 784 return count;
756} 785}
diff --git a/mm/filemap.c b/mm/filemap.c
index 2f8165075a5a..843042045dc9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,9 +101,8 @@
101 * ->inode->i_lock (zap_pte_range->set_page_dirty) 101 * ->inode->i_lock (zap_pte_range->set_page_dirty)
102 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 102 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
103 * 103 *
104 * (code doesn't rely on that order, so you could switch it around) 104 * ->i_mmap_mutex
105 * ->tasklist_lock (memory_failure, collect_procs_ao) 105 * ->tasklist_lock (memory_failure, collect_procs_ao)
106 * ->i_mmap_mutex
107 */ 106 */
108 107
109/* 108/*
@@ -500,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp)
500 struct page *page; 499 struct page *page;
501 500
502 if (cpuset_do_page_mem_spread()) { 501 if (cpuset_do_page_mem_spread()) {
503 get_mems_allowed(); 502 unsigned int cpuset_mems_cookie;
504 n = cpuset_mem_spread_node(); 503 do {
505 page = alloc_pages_exact_node(n, gfp, 0); 504 cpuset_mems_cookie = get_mems_allowed();
506 put_mems_allowed(); 505 n = cpuset_mem_spread_node();
506 page = alloc_pages_exact_node(n, gfp, 0);
507 } while (!put_mems_allowed(cpuset_mems_cookie) && !page);
508
507 return page; 509 return page;
508 } 510 }
509 return alloc_pages(gfp, 0); 511 return alloc_pages(gfp, 0);
@@ -2341,7 +2343,9 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
2341 struct page *page; 2343 struct page *page;
2342 gfp_t gfp_notmask = 0; 2344 gfp_t gfp_notmask = 0;
2343 2345
2344 gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE; 2346 gfp_mask = mapping_gfp_mask(mapping);
2347 if (mapping_cap_account_dirty(mapping))
2348 gfp_mask |= __GFP_WRITE;
2345 if (flags & AOP_FLAG_NOFS) 2349 if (flags & AOP_FLAG_NOFS)
2346 gfp_notmask = __GFP_FS; 2350 gfp_notmask = __GFP_FS;
2347repeat: 2351repeat:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8f7fc394f636..f0e5306eeb55 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1031,32 +1031,23 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1031{ 1031{
1032 int ret = 0; 1032 int ret = 0;
1033 1033
1034 spin_lock(&tlb->mm->page_table_lock); 1034 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1035 if (likely(pmd_trans_huge(*pmd))) { 1035 struct page *page;
1036 if (unlikely(pmd_trans_splitting(*pmd))) { 1036 pgtable_t pgtable;
1037 spin_unlock(&tlb->mm->page_table_lock); 1037 pgtable = get_pmd_huge_pte(tlb->mm);
1038 wait_split_huge_page(vma->anon_vma, 1038 page = pmd_page(*pmd);
1039 pmd); 1039 pmd_clear(pmd);
1040 } else { 1040 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1041 struct page *page; 1041 page_remove_rmap(page);
1042 pgtable_t pgtable; 1042 VM_BUG_ON(page_mapcount(page) < 0);
1043 pgtable = get_pmd_huge_pte(tlb->mm); 1043 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1044 page = pmd_page(*pmd); 1044 VM_BUG_ON(!PageHead(page));
1045 pmd_clear(pmd); 1045 tlb->mm->nr_ptes--;
1046 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1047 page_remove_rmap(page);
1048 VM_BUG_ON(page_mapcount(page) < 0);
1049 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1050 VM_BUG_ON(!PageHead(page));
1051 tlb->mm->nr_ptes--;
1052 spin_unlock(&tlb->mm->page_table_lock);
1053 tlb_remove_page(tlb, page);
1054 pte_free(tlb->mm, pgtable);
1055 ret = 1;
1056 }
1057 } else
1058 spin_unlock(&tlb->mm->page_table_lock); 1046 spin_unlock(&tlb->mm->page_table_lock);
1059 1047 tlb_remove_page(tlb, page);
1048 pte_free(tlb->mm, pgtable);
1049 ret = 1;
1050 }
1060 return ret; 1051 return ret;
1061} 1052}
1062 1053
@@ -1066,21 +1057,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1066{ 1057{
1067 int ret = 0; 1058 int ret = 0;
1068 1059
1069 spin_lock(&vma->vm_mm->page_table_lock); 1060 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1070 if (likely(pmd_trans_huge(*pmd))) { 1061 /*
1071 ret = !pmd_trans_splitting(*pmd); 1062 * All logical pages in the range are present
1072 spin_unlock(&vma->vm_mm->page_table_lock); 1063 * if backed by a huge page.
1073 if (unlikely(!ret)) 1064 */
1074 wait_split_huge_page(vma->anon_vma, pmd);
1075 else {
1076 /*
1077 * All logical pages in the range are present
1078 * if backed by a huge page.
1079 */
1080 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1081 }
1082 } else
1083 spin_unlock(&vma->vm_mm->page_table_lock); 1065 spin_unlock(&vma->vm_mm->page_table_lock);
1066 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1067 ret = 1;
1068 }
1084 1069
1085 return ret; 1070 return ret;
1086} 1071}
@@ -1110,20 +1095,11 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
1110 goto out; 1095 goto out;
1111 } 1096 }
1112 1097
1113 spin_lock(&mm->page_table_lock); 1098 ret = __pmd_trans_huge_lock(old_pmd, vma);
1114 if (likely(pmd_trans_huge(*old_pmd))) { 1099 if (ret == 1) {
1115 if (pmd_trans_splitting(*old_pmd)) { 1100 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1116 spin_unlock(&mm->page_table_lock); 1101 VM_BUG_ON(!pmd_none(*new_pmd));
1117 wait_split_huge_page(vma->anon_vma, old_pmd); 1102 set_pmd_at(mm, new_addr, new_pmd, pmd);
1118 ret = -1;
1119 } else {
1120 pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
1121 VM_BUG_ON(!pmd_none(*new_pmd));
1122 set_pmd_at(mm, new_addr, new_pmd, pmd);
1123 spin_unlock(&mm->page_table_lock);
1124 ret = 1;
1125 }
1126 } else {
1127 spin_unlock(&mm->page_table_lock); 1103 spin_unlock(&mm->page_table_lock);
1128 } 1104 }
1129out: 1105out:
@@ -1136,24 +1112,41 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1136 struct mm_struct *mm = vma->vm_mm; 1112 struct mm_struct *mm = vma->vm_mm;
1137 int ret = 0; 1113 int ret = 0;
1138 1114
1139 spin_lock(&mm->page_table_lock); 1115 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1116 pmd_t entry;
1117 entry = pmdp_get_and_clear(mm, addr, pmd);
1118 entry = pmd_modify(entry, newprot);
1119 set_pmd_at(mm, addr, pmd, entry);
1120 spin_unlock(&vma->vm_mm->page_table_lock);
1121 ret = 1;
1122 }
1123
1124 return ret;
1125}
1126
1127/*
1128 * Returns 1 if a given pmd maps a stable (not under splitting) thp.
1129 * Returns -1 if it maps a thp under splitting. Returns 0 otherwise.
1130 *
1131 * Note that if it returns 1, this routine returns without unlocking page
1132 * table locks. So callers must unlock them.
1133 */
1134int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
1135{
1136 spin_lock(&vma->vm_mm->page_table_lock);
1140 if (likely(pmd_trans_huge(*pmd))) { 1137 if (likely(pmd_trans_huge(*pmd))) {
1141 if (unlikely(pmd_trans_splitting(*pmd))) { 1138 if (unlikely(pmd_trans_splitting(*pmd))) {
1142 spin_unlock(&mm->page_table_lock); 1139 spin_unlock(&vma->vm_mm->page_table_lock);
1143 wait_split_huge_page(vma->anon_vma, pmd); 1140 wait_split_huge_page(vma->anon_vma, pmd);
1141 return -1;
1144 } else { 1142 } else {
1145 pmd_t entry; 1143 /* Thp mapped by 'pmd' is stable, so we can
1146 1144 * handle it as it is. */
1147 entry = pmdp_get_and_clear(mm, addr, pmd); 1145 return 1;
1148 entry = pmd_modify(entry, newprot);
1149 set_pmd_at(mm, addr, pmd, entry);
1150 spin_unlock(&vma->vm_mm->page_table_lock);
1151 ret = 1;
1152 } 1146 }
1153 } else 1147 }
1154 spin_unlock(&vma->vm_mm->page_table_lock); 1148 spin_unlock(&vma->vm_mm->page_table_lock);
1155 1149 return 0;
1156 return ret;
1157} 1150}
1158 1151
1159pmd_t *page_check_address_pmd(struct page *page, 1152pmd_t *page_check_address_pmd(struct page *page,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a876871f6be5..afa057a1d3fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size;
53 */ 53 */
54static DEFINE_SPINLOCK(hugetlb_lock); 54static DEFINE_SPINLOCK(hugetlb_lock);
55 55
56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
57{
58 bool free = (spool->count == 0) && (spool->used_hpages == 0);
59
60 spin_unlock(&spool->lock);
61
62 /* If no pages are used, and no other handles to the subpool
63 * remain, free the subpool */
64 if (free)
65 kfree(spool);
66}
67
68struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
69{
70 struct hugepage_subpool *spool;
71
72 spool = kmalloc(sizeof(*spool), GFP_KERNEL);
73 if (!spool)
74 return NULL;
75
76 spin_lock_init(&spool->lock);
77 spool->count = 1;
78 spool->max_hpages = nr_blocks;
79 spool->used_hpages = 0;
80
81 return spool;
82}
83
84void hugepage_put_subpool(struct hugepage_subpool *spool)
85{
86 spin_lock(&spool->lock);
87 BUG_ON(!spool->count);
88 spool->count--;
89 unlock_or_release_subpool(spool);
90}
91
92static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
93 long delta)
94{
95 int ret = 0;
96
97 if (!spool)
98 return 0;
99
100 spin_lock(&spool->lock);
101 if ((spool->used_hpages + delta) <= spool->max_hpages) {
102 spool->used_hpages += delta;
103 } else {
104 ret = -ENOMEM;
105 }
106 spin_unlock(&spool->lock);
107
108 return ret;
109}
110
111static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
112 long delta)
113{
114 if (!spool)
115 return;
116
117 spin_lock(&spool->lock);
118 spool->used_hpages -= delta;
119 /* If hugetlbfs_put_super couldn't free spool due to
120 * an outstanding quota reference, free it now. */
121 unlock_or_release_subpool(spool);
122}
123
124static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
125{
126 return HUGETLBFS_SB(inode->i_sb)->spool;
127}
128
129static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
130{
131 return subpool_inode(vma->vm_file->f_dentry->d_inode);
132}
133
56/* 134/*
57 * Region tracking -- allows tracking of reservations and instantiated pages 135 * Region tracking -- allows tracking of reservations and instantiated pages
58 * across the pages in a mapping. 136 * across the pages in a mapping.
@@ -454,14 +532,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
454 struct vm_area_struct *vma, 532 struct vm_area_struct *vma,
455 unsigned long address, int avoid_reserve) 533 unsigned long address, int avoid_reserve)
456{ 534{
457 struct page *page = NULL; 535 struct page *page;
458 struct mempolicy *mpol; 536 struct mempolicy *mpol;
459 nodemask_t *nodemask; 537 nodemask_t *nodemask;
460 struct zonelist *zonelist; 538 struct zonelist *zonelist;
461 struct zone *zone; 539 struct zone *zone;
462 struct zoneref *z; 540 struct zoneref *z;
541 unsigned int cpuset_mems_cookie;
463 542
464 get_mems_allowed(); 543retry_cpuset:
544 cpuset_mems_cookie = get_mems_allowed();
465 zonelist = huge_zonelist(vma, address, 545 zonelist = huge_zonelist(vma, address,
466 htlb_alloc_mask, &mpol, &nodemask); 546 htlb_alloc_mask, &mpol, &nodemask);
467 /* 547 /*
@@ -488,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
488 } 568 }
489 } 569 }
490 } 570 }
491err: 571
492 mpol_cond_put(mpol); 572 mpol_cond_put(mpol);
493 put_mems_allowed(); 573 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
574 goto retry_cpuset;
494 return page; 575 return page;
576
577err:
578 mpol_cond_put(mpol);
579 return NULL;
495} 580}
496 581
497static void update_and_free_page(struct hstate *h, struct page *page) 582static void update_and_free_page(struct hstate *h, struct page *page)
@@ -533,9 +618,9 @@ static void free_huge_page(struct page *page)
533 */ 618 */
534 struct hstate *h = page_hstate(page); 619 struct hstate *h = page_hstate(page);
535 int nid = page_to_nid(page); 620 int nid = page_to_nid(page);
536 struct address_space *mapping; 621 struct hugepage_subpool *spool =
622 (struct hugepage_subpool *)page_private(page);
537 623
538 mapping = (struct address_space *) page_private(page);
539 set_page_private(page, 0); 624 set_page_private(page, 0);
540 page->mapping = NULL; 625 page->mapping = NULL;
541 BUG_ON(page_count(page)); 626 BUG_ON(page_count(page));
@@ -551,8 +636,7 @@ static void free_huge_page(struct page *page)
551 enqueue_huge_page(h, page); 636 enqueue_huge_page(h, page);
552 } 637 }
553 spin_unlock(&hugetlb_lock); 638 spin_unlock(&hugetlb_lock);
554 if (mapping) 639 hugepage_subpool_put_pages(spool, 1);
555 hugetlb_put_quota(mapping, 1);
556} 640}
557 641
558static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 642static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -852,6 +936,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
852 struct page *page, *tmp; 936 struct page *page, *tmp;
853 int ret, i; 937 int ret, i;
854 int needed, allocated; 938 int needed, allocated;
939 bool alloc_ok = true;
855 940
856 needed = (h->resv_huge_pages + delta) - h->free_huge_pages; 941 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
857 if (needed <= 0) { 942 if (needed <= 0) {
@@ -867,17 +952,13 @@ retry:
867 spin_unlock(&hugetlb_lock); 952 spin_unlock(&hugetlb_lock);
868 for (i = 0; i < needed; i++) { 953 for (i = 0; i < needed; i++) {
869 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 954 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
870 if (!page) 955 if (!page) {
871 /* 956 alloc_ok = false;
872 * We were not able to allocate enough pages to 957 break;
873 * satisfy the entire reservation so we free what 958 }
874 * we've allocated so far.
875 */
876 goto free;
877
878 list_add(&page->lru, &surplus_list); 959 list_add(&page->lru, &surplus_list);
879 } 960 }
880 allocated += needed; 961 allocated += i;
881 962
882 /* 963 /*
883 * After retaking hugetlb_lock, we need to recalculate 'needed' 964 * After retaking hugetlb_lock, we need to recalculate 'needed'
@@ -886,9 +967,16 @@ retry:
886 spin_lock(&hugetlb_lock); 967 spin_lock(&hugetlb_lock);
887 needed = (h->resv_huge_pages + delta) - 968 needed = (h->resv_huge_pages + delta) -
888 (h->free_huge_pages + allocated); 969 (h->free_huge_pages + allocated);
889 if (needed > 0) 970 if (needed > 0) {
890 goto retry; 971 if (alloc_ok)
891 972 goto retry;
973 /*
974 * We were not able to allocate enough pages to
975 * satisfy the entire reservation so we free what
976 * we've allocated so far.
977 */
978 goto free;
979 }
892 /* 980 /*
893 * The surplus_list now contains _at_least_ the number of extra pages 981 * The surplus_list now contains _at_least_ the number of extra pages
894 * needed to accommodate the reservation. Add the appropriate number 982 * needed to accommodate the reservation. Add the appropriate number
@@ -914,10 +1002,10 @@ retry:
914 VM_BUG_ON(page_count(page)); 1002 VM_BUG_ON(page_count(page));
915 enqueue_huge_page(h, page); 1003 enqueue_huge_page(h, page);
916 } 1004 }
1005free:
917 spin_unlock(&hugetlb_lock); 1006 spin_unlock(&hugetlb_lock);
918 1007
919 /* Free unnecessary surplus pages to the buddy allocator */ 1008 /* Free unnecessary surplus pages to the buddy allocator */
920free:
921 if (!list_empty(&surplus_list)) { 1009 if (!list_empty(&surplus_list)) {
922 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1010 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
923 list_del(&page->lru); 1011 list_del(&page->lru);
@@ -966,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h,
966/* 1054/*
967 * Determine if the huge page at addr within the vma has an associated 1055 * Determine if the huge page at addr within the vma has an associated
968 * reservation. Where it does not we will need to logically increase 1056 * reservation. Where it does not we will need to logically increase
969 * reservation and actually increase quota before an allocation can occur. 1057 * reservation and actually increase subpool usage before an allocation
970 * Where any new reservation would be required the reservation change is 1058 * can occur. Where any new reservation would be required the
971 * prepared, but not committed. Once the page has been quota'd allocated 1059 * reservation change is prepared, but not committed. Once the page
972 * an instantiated the change should be committed via vma_commit_reservation. 1060 * has been allocated from the subpool and instantiated the change should
973 * No action is required on failure. 1061 * be committed via vma_commit_reservation. No action is required on
1062 * failure.
974 */ 1063 */
975static long vma_needs_reservation(struct hstate *h, 1064static long vma_needs_reservation(struct hstate *h,
976 struct vm_area_struct *vma, unsigned long addr) 1065 struct vm_area_struct *vma, unsigned long addr)
@@ -1019,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h,
1019static struct page *alloc_huge_page(struct vm_area_struct *vma, 1108static struct page *alloc_huge_page(struct vm_area_struct *vma,
1020 unsigned long addr, int avoid_reserve) 1109 unsigned long addr, int avoid_reserve)
1021{ 1110{
1111 struct hugepage_subpool *spool = subpool_vma(vma);
1022 struct hstate *h = hstate_vma(vma); 1112 struct hstate *h = hstate_vma(vma);
1023 struct page *page; 1113 struct page *page;
1024 struct address_space *mapping = vma->vm_file->f_mapping;
1025 struct inode *inode = mapping->host;
1026 long chg; 1114 long chg;
1027 1115
1028 /* 1116 /*
1029 * Processes that did not create the mapping will have no reserves and 1117 * Processes that did not create the mapping will have no
1030 * will not have accounted against quota. Check that the quota can be 1118 * reserves and will not have accounted against subpool
1031 * made before satisfying the allocation 1119 * limit. Check that the subpool limit can be made before
1032 * MAP_NORESERVE mappings may also need pages and quota allocated 1120 * satisfying the allocation MAP_NORESERVE mappings may also
1033 * if no reserve mapping overlaps. 1121 * need pages and subpool limit allocated if no reserve
1122 * mapping overlaps.
1034 */ 1123 */
1035 chg = vma_needs_reservation(h, vma, addr); 1124 chg = vma_needs_reservation(h, vma, addr);
1036 if (chg < 0) 1125 if (chg < 0)
1037 return ERR_PTR(-VM_FAULT_OOM); 1126 return ERR_PTR(-VM_FAULT_OOM);
1038 if (chg) 1127 if (chg)
1039 if (hugetlb_get_quota(inode->i_mapping, chg)) 1128 if (hugepage_subpool_get_pages(spool, chg))
1040 return ERR_PTR(-VM_FAULT_SIGBUS); 1129 return ERR_PTR(-VM_FAULT_SIGBUS);
1041 1130
1042 spin_lock(&hugetlb_lock); 1131 spin_lock(&hugetlb_lock);
@@ -1046,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1046 if (!page) { 1135 if (!page) {
1047 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1136 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1048 if (!page) { 1137 if (!page) {
1049 hugetlb_put_quota(inode->i_mapping, chg); 1138 hugepage_subpool_put_pages(spool, chg);
1050 return ERR_PTR(-VM_FAULT_SIGBUS); 1139 return ERR_PTR(-VM_FAULT_SIGBUS);
1051 } 1140 }
1052 } 1141 }
1053 1142
1054 set_page_private(page, (unsigned long) mapping); 1143 set_page_private(page, (unsigned long)spool);
1055 1144
1056 vma_commit_reservation(h, vma, addr); 1145 vma_commit_reservation(h, vma, addr);
1057 1146
@@ -2072,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2072{ 2161{
2073 struct hstate *h = hstate_vma(vma); 2162 struct hstate *h = hstate_vma(vma);
2074 struct resv_map *reservations = vma_resv_map(vma); 2163 struct resv_map *reservations = vma_resv_map(vma);
2164 struct hugepage_subpool *spool = subpool_vma(vma);
2075 unsigned long reserve; 2165 unsigned long reserve;
2076 unsigned long start; 2166 unsigned long start;
2077 unsigned long end; 2167 unsigned long end;
@@ -2087,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2087 2177
2088 if (reserve) { 2178 if (reserve) {
2089 hugetlb_acct_memory(h, -reserve); 2179 hugetlb_acct_memory(h, -reserve);
2090 hugetlb_put_quota(vma->vm_file->f_mapping, reserve); 2180 hugepage_subpool_put_pages(spool, reserve);
2091 } 2181 }
2092 } 2182 }
2093} 2183}
@@ -2276,6 +2366,10 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2276 if (pte_dirty(pte)) 2366 if (pte_dirty(pte))
2277 set_page_dirty(page); 2367 set_page_dirty(page);
2278 list_add(&page->lru, &page_list); 2368 list_add(&page->lru, &page_list);
2369
2370 /* Bail out after unmapping reference page if supplied */
2371 if (ref_page)
2372 break;
2279 } 2373 }
2280 flush_tlb_range(vma, start, end); 2374 flush_tlb_range(vma, start, end);
2281 spin_unlock(&mm->page_table_lock); 2375 spin_unlock(&mm->page_table_lock);
@@ -2316,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2316 */ 2410 */
2317 address = address & huge_page_mask(h); 2411 address = address & huge_page_mask(h);
2318 pgoff = vma_hugecache_offset(h, vma, address); 2412 pgoff = vma_hugecache_offset(h, vma, address);
2319 mapping = (struct address_space *)page_private(page); 2413 mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
2320 2414
2321 /* 2415 /*
2322 * Take the mapping lock for the duration of the table walk. As 2416 * Take the mapping lock for the duration of the table walk. As
@@ -2869,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode,
2869{ 2963{
2870 long ret, chg; 2964 long ret, chg;
2871 struct hstate *h = hstate_inode(inode); 2965 struct hstate *h = hstate_inode(inode);
2966 struct hugepage_subpool *spool = subpool_inode(inode);
2872 2967
2873 /* 2968 /*
2874 * Only apply hugepage reservation if asked. At fault time, an 2969 * Only apply hugepage reservation if asked. At fault time, an
2875 * attempt will be made for VM_NORESERVE to allocate a page 2970 * attempt will be made for VM_NORESERVE to allocate a page
2876 * and filesystem quota without using reserves 2971 * without using reserves
2877 */ 2972 */
2878 if (vm_flags & VM_NORESERVE) 2973 if (vm_flags & VM_NORESERVE)
2879 return 0; 2974 return 0;
@@ -2900,17 +2995,17 @@ int hugetlb_reserve_pages(struct inode *inode,
2900 if (chg < 0) 2995 if (chg < 0)
2901 return chg; 2996 return chg;
2902 2997
2903 /* There must be enough filesystem quota for the mapping */ 2998 /* There must be enough pages in the subpool for the mapping */
2904 if (hugetlb_get_quota(inode->i_mapping, chg)) 2999 if (hugepage_subpool_get_pages(spool, chg))
2905 return -ENOSPC; 3000 return -ENOSPC;
2906 3001
2907 /* 3002 /*
2908 * Check enough hugepages are available for the reservation. 3003 * Check enough hugepages are available for the reservation.
2909 * Hand back the quota if there are not 3004 * Hand the pages back to the subpool if there are not
2910 */ 3005 */
2911 ret = hugetlb_acct_memory(h, chg); 3006 ret = hugetlb_acct_memory(h, chg);
2912 if (ret < 0) { 3007 if (ret < 0) {
2913 hugetlb_put_quota(inode->i_mapping, chg); 3008 hugepage_subpool_put_pages(spool, chg);
2914 return ret; 3009 return ret;
2915 } 3010 }
2916 3011
@@ -2934,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2934{ 3029{
2935 struct hstate *h = hstate_inode(inode); 3030 struct hstate *h = hstate_inode(inode);
2936 long chg = region_truncate(&inode->i_mapping->private_list, offset); 3031 long chg = region_truncate(&inode->i_mapping->private_list, offset);
3032 struct hugepage_subpool *spool = subpool_inode(inode);
2937 3033
2938 spin_lock(&inode->i_lock); 3034 spin_lock(&inode->i_lock);
2939 inode->i_blocks -= (blocks_per_huge_page(h) * freed); 3035 inode->i_blocks -= (blocks_per_huge_page(h) * freed);
2940 spin_unlock(&inode->i_lock); 3036 spin_unlock(&inode->i_lock);
2941 3037
2942 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 3038 hugepage_subpool_put_pages(spool, (chg - freed));
2943 hugetlb_acct_memory(h, -(chg - freed)); 3039 hugetlb_acct_memory(h, -(chg - freed));
2944} 3040}
2945 3041
diff --git a/mm/ksm.c b/mm/ksm.c
index a6d3fb7e6c10..47c885368890 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -374,6 +374,20 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
374 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; 374 return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
375} 375}
376 376
377static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
378 unsigned long addr)
379{
380 struct vm_area_struct *vma;
381 if (ksm_test_exit(mm))
382 return NULL;
383 vma = find_vma(mm, addr);
384 if (!vma || vma->vm_start > addr)
385 return NULL;
386 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
387 return NULL;
388 return vma;
389}
390
377static void break_cow(struct rmap_item *rmap_item) 391static void break_cow(struct rmap_item *rmap_item)
378{ 392{
379 struct mm_struct *mm = rmap_item->mm; 393 struct mm_struct *mm = rmap_item->mm;
@@ -387,15 +401,9 @@ static void break_cow(struct rmap_item *rmap_item)
387 put_anon_vma(rmap_item->anon_vma); 401 put_anon_vma(rmap_item->anon_vma);
388 402
389 down_read(&mm->mmap_sem); 403 down_read(&mm->mmap_sem);
390 if (ksm_test_exit(mm)) 404 vma = find_mergeable_vma(mm, addr);
391 goto out; 405 if (vma)
392 vma = find_vma(mm, addr); 406 break_ksm(vma, addr);
393 if (!vma || vma->vm_start > addr)
394 goto out;
395 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
396 goto out;
397 break_ksm(vma, addr);
398out:
399 up_read(&mm->mmap_sem); 407 up_read(&mm->mmap_sem);
400} 408}
401 409
@@ -421,12 +429,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
421 struct page *page; 429 struct page *page;
422 430
423 down_read(&mm->mmap_sem); 431 down_read(&mm->mmap_sem);
424 if (ksm_test_exit(mm)) 432 vma = find_mergeable_vma(mm, addr);
425 goto out; 433 if (!vma)
426 vma = find_vma(mm, addr);
427 if (!vma || vma->vm_start > addr)
428 goto out;
429 if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
430 goto out; 434 goto out;
431 435
432 page = follow_page(vma, addr, FOLL_GET); 436 page = follow_page(vma, addr, FOLL_GET);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 26c6f4ec20f4..b2ee6df0e9bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,6 @@ enum mem_cgroup_stat_index {
89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ 91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
92 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
93 MEM_CGROUP_STAT_NSTATS, 92 MEM_CGROUP_STAT_NSTATS,
94}; 93};
95 94
@@ -135,7 +134,7 @@ struct mem_cgroup_reclaim_iter {
135 */ 134 */
136struct mem_cgroup_per_zone { 135struct mem_cgroup_per_zone {
137 struct lruvec lruvec; 136 struct lruvec lruvec;
138 unsigned long count[NR_LRU_LISTS]; 137 unsigned long lru_size[NR_LRU_LISTS];
139 138
140 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
141 140
@@ -144,11 +143,9 @@ struct mem_cgroup_per_zone {
144 unsigned long long usage_in_excess;/* Set to the value by which */ 143 unsigned long long usage_in_excess;/* Set to the value by which */
145 /* the soft limit is exceeded*/ 144 /* the soft limit is exceeded*/
146 bool on_tree; 145 bool on_tree;
147 struct mem_cgroup *mem; /* Back pointer, we cannot */ 146 struct mem_cgroup *memcg; /* Back pointer, we cannot */
148 /* use container_of */ 147 /* use container_of */
149}; 148};
150/* Macro for accessing counter */
151#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
152 149
153struct mem_cgroup_per_node { 150struct mem_cgroup_per_node {
154 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 151 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
@@ -300,6 +297,12 @@ struct mem_cgroup {
300 */ 297 */
301 unsigned long move_charge_at_immigrate; 298 unsigned long move_charge_at_immigrate;
302 /* 299 /*
300 * set > 0 if pages under this cgroup are moving to other cgroup.
301 */
302 atomic_t moving_account;
303 /* taken only while moving_account > 0 */
304 spinlock_t move_lock;
305 /*
303 * percpu counter. 306 * percpu counter.
304 */ 307 */
305 struct mem_cgroup_stat_cpu *stat; 308 struct mem_cgroup_stat_cpu *stat;
@@ -612,9 +615,9 @@ retry:
612 * we will to add it back at the end of reclaim to its correct 615 * we will to add it back at the end of reclaim to its correct
613 * position in the tree. 616 * position in the tree.
614 */ 617 */
615 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 618 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
616 if (!res_counter_soft_limit_excess(&mz->mem->res) || 619 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
617 !css_tryget(&mz->mem->css)) 620 !css_tryget(&mz->memcg->css))
618 goto retry; 621 goto retry;
619done: 622done:
620 return mz; 623 return mz;
@@ -692,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
692} 695}
693 696
694static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 697static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
695 bool file, int nr_pages) 698 bool anon, int nr_pages)
696{ 699{
697 preempt_disable(); 700 preempt_disable();
698 701
699 if (file) 702 /*
700 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 703 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
704 * counted as CACHE even if it's on ANON LRU.
705 */
706 if (anon)
707 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
701 nr_pages); 708 nr_pages);
702 else 709 else
703 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 710 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
704 nr_pages); 711 nr_pages);
705 712
706 /* pagein of a big page is an event. So, ignore page size */ 713 /* pagein of a big page is an event. So, ignore page size */
@@ -721,14 +728,14 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
721 unsigned int lru_mask) 728 unsigned int lru_mask)
722{ 729{
723 struct mem_cgroup_per_zone *mz; 730 struct mem_cgroup_per_zone *mz;
724 enum lru_list l; 731 enum lru_list lru;
725 unsigned long ret = 0; 732 unsigned long ret = 0;
726 733
727 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 734 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
728 735
729 for_each_lru(l) { 736 for_each_lru(lru) {
730 if (BIT(l) & lru_mask) 737 if (BIT(lru) & lru_mask)
731 ret += MEM_CGROUP_ZSTAT(mz, l); 738 ret += mz->lru_size[lru];
732 } 739 }
733 return ret; 740 return ret;
734} 741}
@@ -1077,7 +1084,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1077 1084
1078 mz = page_cgroup_zoneinfo(memcg, page); 1085 mz = page_cgroup_zoneinfo(memcg, page);
1079 /* compound_order() is stabilized through lru_lock */ 1086 /* compound_order() is stabilized through lru_lock */
1080 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 1087 mz->lru_size[lru] += 1 << compound_order(page);
1081 return &mz->lruvec; 1088 return &mz->lruvec;
1082} 1089}
1083 1090
@@ -1105,8 +1112,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru)
1105 VM_BUG_ON(!memcg); 1112 VM_BUG_ON(!memcg);
1106 mz = page_cgroup_zoneinfo(memcg, page); 1113 mz = page_cgroup_zoneinfo(memcg, page);
1107 /* huge page split is done under lru_lock. so, we have no races. */ 1114 /* huge page split is done under lru_lock. so, we have no races. */
1108 VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); 1115 VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
1109 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); 1116 mz->lru_size[lru] -= 1 << compound_order(page);
1110} 1117}
1111 1118
1112void mem_cgroup_lru_del(struct page *page) 1119void mem_cgroup_lru_del(struct page *page)
@@ -1285,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1285 return memcg->swappiness; 1292 return memcg->swappiness;
1286} 1293}
1287 1294
1288static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1295/*
1289{ 1296 * memcg->moving_account is used for checking possibility that some thread is
1290 int cpu; 1297 * calling move_account(). When a thread on CPU-A starts moving pages under
1298 * a memcg, other threads should check memcg->moving_account under
1299 * rcu_read_lock(), like this:
1300 *
1301 * CPU-A CPU-B
1302 * rcu_read_lock()
1303 * memcg->moving_account+1 if (memcg->mocing_account)
1304 * take heavy locks.
1305 * synchronize_rcu() update something.
1306 * rcu_read_unlock()
1307 * start move here.
1308 */
1291 1309
1292 get_online_cpus(); 1310/* for quick checking without looking up memcg */
1293 spin_lock(&memcg->pcp_counter_lock); 1311atomic_t memcg_moving __read_mostly;
1294 for_each_online_cpu(cpu)
1295 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1296 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1297 spin_unlock(&memcg->pcp_counter_lock);
1298 put_online_cpus();
1299 1312
1313static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1314{
1315 atomic_inc(&memcg_moving);
1316 atomic_inc(&memcg->moving_account);
1300 synchronize_rcu(); 1317 synchronize_rcu();
1301} 1318}
1302 1319
1303static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1320static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1304{ 1321{
1305 int cpu; 1322 /*
1306 1323 * Now, mem_cgroup_clear_mc() may call this function with NULL.
1307 if (!memcg) 1324 * We check NULL in callee rather than caller.
1308 return; 1325 */
1309 get_online_cpus(); 1326 if (memcg) {
1310 spin_lock(&memcg->pcp_counter_lock); 1327 atomic_dec(&memcg_moving);
1311 for_each_online_cpu(cpu) 1328 atomic_dec(&memcg->moving_account);
1312 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1329 }
1313 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1314 spin_unlock(&memcg->pcp_counter_lock);
1315 put_online_cpus();
1316} 1330}
1331
1317/* 1332/*
1318 * 2 routines for checking "mem" is under move_account() or not. 1333 * 2 routines for checking "mem" is under move_account() or not.
1319 * 1334 *
1320 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used 1335 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
1321 * for avoiding race in accounting. If true, 1336 * is used for avoiding races in accounting. If true,
1322 * pc->mem_cgroup may be overwritten. 1337 * pc->mem_cgroup may be overwritten.
1323 * 1338 *
1324 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or 1339 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
@@ -1326,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1326 * waiting at hith-memory prressure caused by "move". 1341 * waiting at hith-memory prressure caused by "move".
1327 */ 1342 */
1328 1343
1329static bool mem_cgroup_stealed(struct mem_cgroup *memcg) 1344static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1330{ 1345{
1331 VM_BUG_ON(!rcu_read_lock_held()); 1346 VM_BUG_ON(!rcu_read_lock_held());
1332 return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1347 return atomic_read(&memcg->moving_account) > 0;
1333} 1348}
1334 1349
1335static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1350static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
@@ -1370,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1370 return false; 1385 return false;
1371} 1386}
1372 1387
1388/*
1389 * Take this lock when
1390 * - a code tries to modify page's memcg while it's USED.
1391 * - a code tries to modify page state accounting in a memcg.
1392 * see mem_cgroup_stolen(), too.
1393 */
1394static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1395 unsigned long *flags)
1396{
1397 spin_lock_irqsave(&memcg->move_lock, *flags);
1398}
1399
1400static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1401 unsigned long *flags)
1402{
1403 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1404}
1405
1373/** 1406/**
1374 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1407 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1375 * @memcg: The memory cgroup that went over limit 1408 * @memcg: The memory cgroup that went over limit
@@ -1393,7 +1426,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1393 if (!memcg || !p) 1426 if (!memcg || !p)
1394 return; 1427 return;
1395 1428
1396
1397 rcu_read_lock(); 1429 rcu_read_lock();
1398 1430
1399 mem_cgrp = memcg->css.cgroup; 1431 mem_cgrp = memcg->css.cgroup;
@@ -1772,22 +1804,22 @@ static DEFINE_SPINLOCK(memcg_oom_lock);
1772static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1804static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1773 1805
1774struct oom_wait_info { 1806struct oom_wait_info {
1775 struct mem_cgroup *mem; 1807 struct mem_cgroup *memcg;
1776 wait_queue_t wait; 1808 wait_queue_t wait;
1777}; 1809};
1778 1810
1779static int memcg_oom_wake_function(wait_queue_t *wait, 1811static int memcg_oom_wake_function(wait_queue_t *wait,
1780 unsigned mode, int sync, void *arg) 1812 unsigned mode, int sync, void *arg)
1781{ 1813{
1782 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, 1814 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1783 *oom_wait_memcg; 1815 struct mem_cgroup *oom_wait_memcg;
1784 struct oom_wait_info *oom_wait_info; 1816 struct oom_wait_info *oom_wait_info;
1785 1817
1786 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1818 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1787 oom_wait_memcg = oom_wait_info->mem; 1819 oom_wait_memcg = oom_wait_info->memcg;
1788 1820
1789 /* 1821 /*
1790 * Both of oom_wait_info->mem and wake_mem are stable under us. 1822 * Both of oom_wait_info->memcg and wake_memcg are stable under us.
1791 * Then we can use css_is_ancestor without taking care of RCU. 1823 * Then we can use css_is_ancestor without taking care of RCU.
1792 */ 1824 */
1793 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 1825 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
@@ -1811,12 +1843,12 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
1811/* 1843/*
1812 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1844 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1813 */ 1845 */
1814bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) 1846bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1815{ 1847{
1816 struct oom_wait_info owait; 1848 struct oom_wait_info owait;
1817 bool locked, need_to_kill; 1849 bool locked, need_to_kill;
1818 1850
1819 owait.mem = memcg; 1851 owait.memcg = memcg;
1820 owait.wait.flags = 0; 1852 owait.wait.flags = 0;
1821 owait.wait.func = memcg_oom_wake_function; 1853 owait.wait.func = memcg_oom_wake_function;
1822 owait.wait.private = current; 1854 owait.wait.private = current;
@@ -1841,7 +1873,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1841 1873
1842 if (need_to_kill) { 1874 if (need_to_kill) {
1843 finish_wait(&memcg_oom_waitq, &owait.wait); 1875 finish_wait(&memcg_oom_waitq, &owait.wait);
1844 mem_cgroup_out_of_memory(memcg, mask); 1876 mem_cgroup_out_of_memory(memcg, mask, order);
1845 } else { 1877 } else {
1846 schedule(); 1878 schedule();
1847 finish_wait(&memcg_oom_waitq, &owait.wait); 1879 finish_wait(&memcg_oom_waitq, &owait.wait);
@@ -1881,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1881 * by flags. 1913 * by flags.
1882 * 1914 *
1883 * Considering "move", this is an only case we see a race. To make the race 1915 * Considering "move", this is an only case we see a race. To make the race
1884 * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are 1916 * small, we check mm->moving_account and detect there are possibility of race
1885 * possibility of race condition. If there is, we take a lock. 1917 * If there is, we take a lock.
1886 */ 1918 */
1887 1919
1920void __mem_cgroup_begin_update_page_stat(struct page *page,
1921 bool *locked, unsigned long *flags)
1922{
1923 struct mem_cgroup *memcg;
1924 struct page_cgroup *pc;
1925
1926 pc = lookup_page_cgroup(page);
1927again:
1928 memcg = pc->mem_cgroup;
1929 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1930 return;
1931 /*
1932 * If this memory cgroup is not under account moving, we don't
1933 * need to take move_lock_page_cgroup(). Because we already hold
1934 * rcu_read_lock(), any calls to move_account will be delayed until
1935 * rcu_read_unlock() if mem_cgroup_stolen() == true.
1936 */
1937 if (!mem_cgroup_stolen(memcg))
1938 return;
1939
1940 move_lock_mem_cgroup(memcg, flags);
1941 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
1942 move_unlock_mem_cgroup(memcg, flags);
1943 goto again;
1944 }
1945 *locked = true;
1946}
1947
1948void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1949{
1950 struct page_cgroup *pc = lookup_page_cgroup(page);
1951
1952 /*
1953 * It's guaranteed that pc->mem_cgroup never changes while
1954 * lock is held because a routine modifies pc->mem_cgroup
1955 * should take move_lock_page_cgroup().
1956 */
1957 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1958}
1959
1888void mem_cgroup_update_page_stat(struct page *page, 1960void mem_cgroup_update_page_stat(struct page *page,
1889 enum mem_cgroup_page_stat_item idx, int val) 1961 enum mem_cgroup_page_stat_item idx, int val)
1890{ 1962{
1891 struct mem_cgroup *memcg; 1963 struct mem_cgroup *memcg;
1892 struct page_cgroup *pc = lookup_page_cgroup(page); 1964 struct page_cgroup *pc = lookup_page_cgroup(page);
1893 bool need_unlock = false;
1894 unsigned long uninitialized_var(flags); 1965 unsigned long uninitialized_var(flags);
1895 1966
1896 if (mem_cgroup_disabled()) 1967 if (mem_cgroup_disabled())
1897 return; 1968 return;
1898 1969
1899 rcu_read_lock();
1900 memcg = pc->mem_cgroup; 1970 memcg = pc->mem_cgroup;
1901 if (unlikely(!memcg || !PageCgroupUsed(pc))) 1971 if (unlikely(!memcg || !PageCgroupUsed(pc)))
1902 goto out; 1972 return;
1903 /* pc->mem_cgroup is unstable ? */
1904 if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
1905 /* take a lock against to access pc->mem_cgroup */
1906 move_lock_page_cgroup(pc, &flags);
1907 need_unlock = true;
1908 memcg = pc->mem_cgroup;
1909 if (!memcg || !PageCgroupUsed(pc))
1910 goto out;
1911 }
1912 1973
1913 switch (idx) { 1974 switch (idx) {
1914 case MEMCG_NR_FILE_MAPPED: 1975 case MEMCG_NR_FILE_MAPPED:
1915 if (val > 0)
1916 SetPageCgroupFileMapped(pc);
1917 else if (!page_mapped(page))
1918 ClearPageCgroupFileMapped(pc);
1919 idx = MEM_CGROUP_STAT_FILE_MAPPED; 1976 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1920 break; 1977 break;
1921 default: 1978 default:
@@ -1923,14 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page,
1923 } 1980 }
1924 1981
1925 this_cpu_add(memcg->stat->count[idx], val); 1982 this_cpu_add(memcg->stat->count[idx], val);
1926
1927out:
1928 if (unlikely(need_unlock))
1929 move_unlock_page_cgroup(pc, &flags);
1930 rcu_read_unlock();
1931 return;
1932} 1983}
1933EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1934 1984
1935/* 1985/*
1936 * size of first charge trial. "32" comes from vmscan.c's magic value. 1986 * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -2101,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2101 per_cpu(memcg->stat->events[i], cpu) = 0; 2151 per_cpu(memcg->stat->events[i], cpu) = 0;
2102 memcg->nocpu_base.events[i] += x; 2152 memcg->nocpu_base.events[i] += x;
2103 } 2153 }
2104 /* need to clear ON_MOVE value, works as a kind of lock. */
2105 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2106 spin_unlock(&memcg->pcp_counter_lock);
2107}
2108
2109static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
2110{
2111 int idx = MEM_CGROUP_ON_MOVE;
2112
2113 spin_lock(&memcg->pcp_counter_lock);
2114 per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
2115 spin_unlock(&memcg->pcp_counter_lock); 2154 spin_unlock(&memcg->pcp_counter_lock);
2116} 2155}
2117 2156
@@ -2123,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2123 struct memcg_stock_pcp *stock; 2162 struct memcg_stock_pcp *stock;
2124 struct mem_cgroup *iter; 2163 struct mem_cgroup *iter;
2125 2164
2126 if ((action == CPU_ONLINE)) { 2165 if (action == CPU_ONLINE)
2127 for_each_mem_cgroup(iter)
2128 synchronize_mem_cgroup_on_move(iter, cpu);
2129 return NOTIFY_OK; 2166 return NOTIFY_OK;
2130 }
2131 2167
2132 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) 2168 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
2133 return NOTIFY_OK; 2169 return NOTIFY_OK;
@@ -2212,7 +2248,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2212 if (!oom_check) 2248 if (!oom_check)
2213 return CHARGE_NOMEM; 2249 return CHARGE_NOMEM;
2214 /* check OOM */ 2250 /* check OOM */
2215 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) 2251 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
2216 return CHARGE_OOM_DIE; 2252 return CHARGE_OOM_DIE;
2217 2253
2218 return CHARGE_RETRY; 2254 return CHARGE_RETRY;
@@ -2446,6 +2482,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2446{ 2482{
2447 struct zone *uninitialized_var(zone); 2483 struct zone *uninitialized_var(zone);
2448 bool was_on_lru = false; 2484 bool was_on_lru = false;
2485 bool anon;
2449 2486
2450 lock_page_cgroup(pc); 2487 lock_page_cgroup(pc);
2451 if (unlikely(PageCgroupUsed(pc))) { 2488 if (unlikely(PageCgroupUsed(pc))) {
@@ -2481,19 +2518,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2481 * See mem_cgroup_add_lru_list(), etc. 2518 * See mem_cgroup_add_lru_list(), etc.
2482 */ 2519 */
2483 smp_wmb(); 2520 smp_wmb();
2484 switch (ctype) { 2521 SetPageCgroupUsed(pc);
2485 case MEM_CGROUP_CHARGE_TYPE_CACHE:
2486 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
2487 SetPageCgroupCache(pc);
2488 SetPageCgroupUsed(pc);
2489 break;
2490 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2491 ClearPageCgroupCache(pc);
2492 SetPageCgroupUsed(pc);
2493 break;
2494 default:
2495 break;
2496 }
2497 2522
2498 if (lrucare) { 2523 if (lrucare) {
2499 if (was_on_lru) { 2524 if (was_on_lru) {
@@ -2504,7 +2529,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2504 spin_unlock_irq(&zone->lru_lock); 2529 spin_unlock_irq(&zone->lru_lock);
2505 } 2530 }
2506 2531
2507 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); 2532 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
2533 anon = true;
2534 else
2535 anon = false;
2536
2537 mem_cgroup_charge_statistics(memcg, anon, nr_pages);
2508 unlock_page_cgroup(pc); 2538 unlock_page_cgroup(pc);
2509 2539
2510 /* 2540 /*
@@ -2517,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2517 2547
2518#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2548#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2519 2549
2520#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2550#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION))
2521 (1 << PCG_MIGRATION))
2522/* 2551/*
2523 * Because tail pages are not marked as "used", set it. We're under 2552 * Because tail pages are not marked as "used", set it. We're under
2524 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2553 * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2569,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page,
2569{ 2598{
2570 unsigned long flags; 2599 unsigned long flags;
2571 int ret; 2600 int ret;
2601 bool anon = PageAnon(page);
2572 2602
2573 VM_BUG_ON(from == to); 2603 VM_BUG_ON(from == to);
2574 VM_BUG_ON(PageLRU(page)); 2604 VM_BUG_ON(PageLRU(page));
@@ -2588,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page,
2588 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2618 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2589 goto unlock; 2619 goto unlock;
2590 2620
2591 move_lock_page_cgroup(pc, &flags); 2621 move_lock_mem_cgroup(from, &flags);
2592 2622
2593 if (PageCgroupFileMapped(pc)) { 2623 if (!anon && page_mapped(page)) {
2594 /* Update mapped_file data for mem_cgroup */ 2624 /* Update mapped_file data for mem_cgroup */
2595 preempt_disable(); 2625 preempt_disable();
2596 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2626 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2597 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2627 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2598 preempt_enable(); 2628 preempt_enable();
2599 } 2629 }
2600 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); 2630 mem_cgroup_charge_statistics(from, anon, -nr_pages);
2601 if (uncharge) 2631 if (uncharge)
2602 /* This is not "cancel", but cancel_charge does all we need. */ 2632 /* This is not "cancel", but cancel_charge does all we need. */
2603 __mem_cgroup_cancel_charge(from, nr_pages); 2633 __mem_cgroup_cancel_charge(from, nr_pages);
2604 2634
2605 /* caller should have done css_get */ 2635 /* caller should have done css_get */
2606 pc->mem_cgroup = to; 2636 pc->mem_cgroup = to;
2607 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); 2637 mem_cgroup_charge_statistics(to, anon, nr_pages);
2608 /* 2638 /*
2609 * We charges against "to" which may not have any tasks. Then, "to" 2639 * We charges against "to" which may not have any tasks. Then, "to"
2610 * can be under rmdir(). But in current implementation, caller of 2640 * can be under rmdir(). But in current implementation, caller of
@@ -2612,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page,
2612 * guaranteed that "to" is never removed. So, we don't check rmdir 2642 * guaranteed that "to" is never removed. So, we don't check rmdir
2613 * status here. 2643 * status here.
2614 */ 2644 */
2615 move_unlock_page_cgroup(pc, &flags); 2645 move_unlock_mem_cgroup(from, &flags);
2616 ret = 0; 2646 ret = 0;
2617unlock: 2647unlock:
2618 unlock_page_cgroup(pc); 2648 unlock_page_cgroup(pc);
@@ -2914,7 +2944,6 @@ direct_uncharge:
2914 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); 2944 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
2915 if (unlikely(batch->memcg != memcg)) 2945 if (unlikely(batch->memcg != memcg))
2916 memcg_oom_recover(memcg); 2946 memcg_oom_recover(memcg);
2917 return;
2918} 2947}
2919 2948
2920/* 2949/*
@@ -2926,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2926 struct mem_cgroup *memcg = NULL; 2955 struct mem_cgroup *memcg = NULL;
2927 unsigned int nr_pages = 1; 2956 unsigned int nr_pages = 1;
2928 struct page_cgroup *pc; 2957 struct page_cgroup *pc;
2958 bool anon;
2929 2959
2930 if (mem_cgroup_disabled()) 2960 if (mem_cgroup_disabled())
2931 return NULL; 2961 return NULL;
@@ -2951,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2951 if (!PageCgroupUsed(pc)) 2981 if (!PageCgroupUsed(pc))
2952 goto unlock_out; 2982 goto unlock_out;
2953 2983
2984 anon = PageAnon(page);
2985
2954 switch (ctype) { 2986 switch (ctype) {
2955 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2987 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2988 /*
2989 * Generally PageAnon tells if it's the anon statistics to be
2990 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
2991 * used before page reached the stage of being marked PageAnon.
2992 */
2993 anon = true;
2994 /* fallthrough */
2956 case MEM_CGROUP_CHARGE_TYPE_DROP: 2995 case MEM_CGROUP_CHARGE_TYPE_DROP:
2957 /* See mem_cgroup_prepare_migration() */ 2996 /* See mem_cgroup_prepare_migration() */
2958 if (page_mapped(page) || PageCgroupMigration(pc)) 2997 if (page_mapped(page) || PageCgroupMigration(pc))
@@ -2969,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2969 break; 3008 break;
2970 } 3009 }
2971 3010
2972 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); 3011 mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
2973 3012
2974 ClearPageCgroupUsed(pc); 3013 ClearPageCgroupUsed(pc);
2975 /* 3014 /*
@@ -3276,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3276{ 3315{
3277 struct page *used, *unused; 3316 struct page *used, *unused;
3278 struct page_cgroup *pc; 3317 struct page_cgroup *pc;
3318 bool anon;
3279 3319
3280 if (!memcg) 3320 if (!memcg)
3281 return; 3321 return;
@@ -3297,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3297 lock_page_cgroup(pc); 3337 lock_page_cgroup(pc);
3298 ClearPageCgroupMigration(pc); 3338 ClearPageCgroupMigration(pc);
3299 unlock_page_cgroup(pc); 3339 unlock_page_cgroup(pc);
3300 3340 anon = PageAnon(used);
3301 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 3341 __mem_cgroup_uncharge_common(unused,
3342 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
3343 : MEM_CGROUP_CHARGE_TYPE_CACHE);
3302 3344
3303 /* 3345 /*
3304 * If a page is a file cache, radix-tree replacement is very atomic 3346 * If a page is a file cache, radix-tree replacement is very atomic
@@ -3308,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3308 * and USED bit check in mem_cgroup_uncharge_page() will do enough 3350 * and USED bit check in mem_cgroup_uncharge_page() will do enough
3309 * check. (see prepare_charge() also) 3351 * check. (see prepare_charge() also)
3310 */ 3352 */
3311 if (PageAnon(used)) 3353 if (anon)
3312 mem_cgroup_uncharge_page(used); 3354 mem_cgroup_uncharge_page(used);
3313 /* 3355 /*
3314 * At migration, we may charge account against cgroup which has no 3356 * At migration, we may charge account against cgroup which has no
@@ -3338,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3338 /* fix accounting on old pages */ 3380 /* fix accounting on old pages */
3339 lock_page_cgroup(pc); 3381 lock_page_cgroup(pc);
3340 memcg = pc->mem_cgroup; 3382 memcg = pc->mem_cgroup;
3341 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); 3383 mem_cgroup_charge_statistics(memcg, false, -1);
3342 ClearPageCgroupUsed(pc); 3384 ClearPageCgroupUsed(pc);
3343 unlock_page_cgroup(pc); 3385 unlock_page_cgroup(pc);
3344 3386
@@ -3549,7 +3591,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3549 break; 3591 break;
3550 3592
3551 nr_scanned = 0; 3593 nr_scanned = 0;
3552 reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, 3594 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
3553 gfp_mask, &nr_scanned); 3595 gfp_mask, &nr_scanned);
3554 nr_reclaimed += reclaimed; 3596 nr_reclaimed += reclaimed;
3555 *total_scanned += nr_scanned; 3597 *total_scanned += nr_scanned;
@@ -3576,13 +3618,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3576 next_mz = 3618 next_mz =
3577 __mem_cgroup_largest_soft_limit_node(mctz); 3619 __mem_cgroup_largest_soft_limit_node(mctz);
3578 if (next_mz == mz) 3620 if (next_mz == mz)
3579 css_put(&next_mz->mem->css); 3621 css_put(&next_mz->memcg->css);
3580 else /* next_mz == NULL or other memcg */ 3622 else /* next_mz == NULL or other memcg */
3581 break; 3623 break;
3582 } while (1); 3624 } while (1);
3583 } 3625 }
3584 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 3626 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
3585 excess = res_counter_soft_limit_excess(&mz->mem->res); 3627 excess = res_counter_soft_limit_excess(&mz->memcg->res);
3586 /* 3628 /*
3587 * One school of thought says that we should not add 3629 * One school of thought says that we should not add
3588 * back the node to the tree if reclaim returns 0. 3630 * back the node to the tree if reclaim returns 0.
@@ -3592,9 +3634,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3592 * term TODO. 3634 * term TODO.
3593 */ 3635 */
3594 /* If excess == 0, no tree ops */ 3636 /* If excess == 0, no tree ops */
3595 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 3637 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
3596 spin_unlock(&mctz->lock); 3638 spin_unlock(&mctz->lock);
3597 css_put(&mz->mem->css); 3639 css_put(&mz->memcg->css);
3598 loop++; 3640 loop++;
3599 /* 3641 /*
3600 * Could not reclaim anything and there are no more 3642 * Could not reclaim anything and there are no more
@@ -3607,7 +3649,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3607 break; 3649 break;
3608 } while (!nr_reclaimed); 3650 } while (!nr_reclaimed);
3609 if (next_mz) 3651 if (next_mz)
3610 css_put(&next_mz->mem->css); 3652 css_put(&next_mz->memcg->css);
3611 return nr_reclaimed; 3653 return nr_reclaimed;
3612} 3654}
3613 3655
@@ -3629,7 +3671,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3629 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3671 mz = mem_cgroup_zoneinfo(memcg, node, zid);
3630 list = &mz->lruvec.lists[lru]; 3672 list = &mz->lruvec.lists[lru];
3631 3673
3632 loop = MEM_CGROUP_ZSTAT(mz, lru); 3674 loop = mz->lru_size[lru];
3633 /* give some margin against EBUSY etc...*/ 3675 /* give some margin against EBUSY etc...*/
3634 loop += 256; 3676 loop += 256;
3635 busy = NULL; 3677 busy = NULL;
@@ -3703,10 +3745,10 @@ move_account:
3703 mem_cgroup_start_move(memcg); 3745 mem_cgroup_start_move(memcg);
3704 for_each_node_state(node, N_HIGH_MEMORY) { 3746 for_each_node_state(node, N_HIGH_MEMORY) {
3705 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3747 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3706 enum lru_list l; 3748 enum lru_list lru;
3707 for_each_lru(l) { 3749 for_each_lru(lru) {
3708 ret = mem_cgroup_force_empty_list(memcg, 3750 ret = mem_cgroup_force_empty_list(memcg,
3709 node, zid, l); 3751 node, zid, lru);
3710 if (ret) 3752 if (ret)
3711 break; 3753 break;
3712 } 3754 }
@@ -3860,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3860 break; 3902 break;
3861 default: 3903 default:
3862 BUG(); 3904 BUG();
3863 break;
3864 } 3905 }
3865 return val; 3906 return val;
3866} 3907}
@@ -3939,7 +3980,6 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3939out: 3980out:
3940 *mem_limit = min_limit; 3981 *mem_limit = min_limit;
3941 *memsw_limit = min_memsw_limit; 3982 *memsw_limit = min_memsw_limit;
3942 return;
3943} 3983}
3944 3984
3945static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3985static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -4098,38 +4138,38 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4098 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4138 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4099 unsigned long node_nr; 4139 unsigned long node_nr;
4100 struct cgroup *cont = m->private; 4140 struct cgroup *cont = m->private;
4101 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4141 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4102 4142
4103 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); 4143 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
4104 seq_printf(m, "total=%lu", total_nr); 4144 seq_printf(m, "total=%lu", total_nr);
4105 for_each_node_state(nid, N_HIGH_MEMORY) { 4145 for_each_node_state(nid, N_HIGH_MEMORY) {
4106 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); 4146 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
4107 seq_printf(m, " N%d=%lu", nid, node_nr); 4147 seq_printf(m, " N%d=%lu", nid, node_nr);
4108 } 4148 }
4109 seq_putc(m, '\n'); 4149 seq_putc(m, '\n');
4110 4150
4111 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); 4151 file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
4112 seq_printf(m, "file=%lu", file_nr); 4152 seq_printf(m, "file=%lu", file_nr);
4113 for_each_node_state(nid, N_HIGH_MEMORY) { 4153 for_each_node_state(nid, N_HIGH_MEMORY) {
4114 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4154 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4115 LRU_ALL_FILE); 4155 LRU_ALL_FILE);
4116 seq_printf(m, " N%d=%lu", nid, node_nr); 4156 seq_printf(m, " N%d=%lu", nid, node_nr);
4117 } 4157 }
4118 seq_putc(m, '\n'); 4158 seq_putc(m, '\n');
4119 4159
4120 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); 4160 anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
4121 seq_printf(m, "anon=%lu", anon_nr); 4161 seq_printf(m, "anon=%lu", anon_nr);
4122 for_each_node_state(nid, N_HIGH_MEMORY) { 4162 for_each_node_state(nid, N_HIGH_MEMORY) {
4123 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4163 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4124 LRU_ALL_ANON); 4164 LRU_ALL_ANON);
4125 seq_printf(m, " N%d=%lu", nid, node_nr); 4165 seq_printf(m, " N%d=%lu", nid, node_nr);
4126 } 4166 }
4127 seq_putc(m, '\n'); 4167 seq_putc(m, '\n');
4128 4168
4129 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); 4169 unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4130 seq_printf(m, "unevictable=%lu", unevictable_nr); 4170 seq_printf(m, "unevictable=%lu", unevictable_nr);
4131 for_each_node_state(nid, N_HIGH_MEMORY) { 4171 for_each_node_state(nid, N_HIGH_MEMORY) {
4132 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4172 node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4133 BIT(LRU_UNEVICTABLE)); 4173 BIT(LRU_UNEVICTABLE));
4134 seq_printf(m, " N%d=%lu", nid, node_nr); 4174 seq_printf(m, " N%d=%lu", nid, node_nr);
4135 } 4175 }
@@ -4141,12 +4181,12 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4141static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4181static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4142 struct cgroup_map_cb *cb) 4182 struct cgroup_map_cb *cb)
4143{ 4183{
4144 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4184 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4145 struct mcs_total_stat mystat; 4185 struct mcs_total_stat mystat;
4146 int i; 4186 int i;
4147 4187
4148 memset(&mystat, 0, sizeof(mystat)); 4188 memset(&mystat, 0, sizeof(mystat));
4149 mem_cgroup_get_local_stat(mem_cont, &mystat); 4189 mem_cgroup_get_local_stat(memcg, &mystat);
4150 4190
4151 4191
4152 for (i = 0; i < NR_MCS_STAT; i++) { 4192 for (i = 0; i < NR_MCS_STAT; i++) {
@@ -4158,14 +4198,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4158 /* Hierarchical information */ 4198 /* Hierarchical information */
4159 { 4199 {
4160 unsigned long long limit, memsw_limit; 4200 unsigned long long limit, memsw_limit;
4161 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 4201 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
4162 cb->fill(cb, "hierarchical_memory_limit", limit); 4202 cb->fill(cb, "hierarchical_memory_limit", limit);
4163 if (do_swap_account) 4203 if (do_swap_account)
4164 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4204 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
4165 } 4205 }
4166 4206
4167 memset(&mystat, 0, sizeof(mystat)); 4207 memset(&mystat, 0, sizeof(mystat));
4168 mem_cgroup_get_total_stat(mem_cont, &mystat); 4208 mem_cgroup_get_total_stat(memcg, &mystat);
4169 for (i = 0; i < NR_MCS_STAT; i++) { 4209 for (i = 0; i < NR_MCS_STAT; i++) {
4170 if (i == MCS_SWAP && !do_swap_account) 4210 if (i == MCS_SWAP && !do_swap_account)
4171 continue; 4211 continue;
@@ -4181,7 +4221,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4181 4221
4182 for_each_online_node(nid) 4222 for_each_online_node(nid)
4183 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4223 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4184 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 4224 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
4185 4225
4186 recent_rotated[0] += 4226 recent_rotated[0] +=
4187 mz->reclaim_stat.recent_rotated[0]; 4227 mz->reclaim_stat.recent_rotated[0];
@@ -4426,12 +4466,6 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4426 else 4466 else
4427 BUG(); 4467 BUG();
4428 4468
4429 /*
4430 * Something went wrong if we trying to unregister a threshold
4431 * if we don't have thresholds
4432 */
4433 BUG_ON(!thresholds);
4434
4435 if (!thresholds->primary) 4469 if (!thresholds->primary)
4436 goto unlock; 4470 goto unlock;
4437 4471
@@ -4736,7 +4770,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4736{ 4770{
4737 struct mem_cgroup_per_node *pn; 4771 struct mem_cgroup_per_node *pn;
4738 struct mem_cgroup_per_zone *mz; 4772 struct mem_cgroup_per_zone *mz;
4739 enum lru_list l; 4773 enum lru_list lru;
4740 int zone, tmp = node; 4774 int zone, tmp = node;
4741 /* 4775 /*
4742 * This routine is called against possible nodes. 4776 * This routine is called against possible nodes.
@@ -4754,11 +4788,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4754 4788
4755 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4789 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4756 mz = &pn->zoneinfo[zone]; 4790 mz = &pn->zoneinfo[zone];
4757 for_each_lru(l) 4791 for_each_lru(lru)
4758 INIT_LIST_HEAD(&mz->lruvec.lists[l]); 4792 INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
4759 mz->usage_in_excess = 0; 4793 mz->usage_in_excess = 0;
4760 mz->on_tree = false; 4794 mz->on_tree = false;
4761 mz->mem = memcg; 4795 mz->memcg = memcg;
4762 } 4796 }
4763 memcg->info.nodeinfo[node] = pn; 4797 memcg->info.nodeinfo[node] = pn;
4764 return 0; 4798 return 0;
@@ -4771,29 +4805,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4771 4805
4772static struct mem_cgroup *mem_cgroup_alloc(void) 4806static struct mem_cgroup *mem_cgroup_alloc(void)
4773{ 4807{
4774 struct mem_cgroup *mem; 4808 struct mem_cgroup *memcg;
4775 int size = sizeof(struct mem_cgroup); 4809 int size = sizeof(struct mem_cgroup);
4776 4810
4777 /* Can be very big if MAX_NUMNODES is very big */ 4811 /* Can be very big if MAX_NUMNODES is very big */
4778 if (size < PAGE_SIZE) 4812 if (size < PAGE_SIZE)
4779 mem = kzalloc(size, GFP_KERNEL); 4813 memcg = kzalloc(size, GFP_KERNEL);
4780 else 4814 else
4781 mem = vzalloc(size); 4815 memcg = vzalloc(size);
4782 4816
4783 if (!mem) 4817 if (!memcg)
4784 return NULL; 4818 return NULL;
4785 4819
4786 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4820 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4787 if (!mem->stat) 4821 if (!memcg->stat)
4788 goto out_free; 4822 goto out_free;
4789 spin_lock_init(&mem->pcp_counter_lock); 4823 spin_lock_init(&memcg->pcp_counter_lock);
4790 return mem; 4824 return memcg;
4791 4825
4792out_free: 4826out_free:
4793 if (size < PAGE_SIZE) 4827 if (size < PAGE_SIZE)
4794 kfree(mem); 4828 kfree(memcg);
4795 else 4829 else
4796 vfree(mem); 4830 vfree(memcg);
4797 return NULL; 4831 return NULL;
4798} 4832}
4799 4833
@@ -4981,6 +5015,7 @@ mem_cgroup_create(struct cgroup *cont)
4981 atomic_set(&memcg->refcnt, 1); 5015 atomic_set(&memcg->refcnt, 1);
4982 memcg->move_charge_at_immigrate = 0; 5016 memcg->move_charge_at_immigrate = 0;
4983 mutex_init(&memcg->thresholds_lock); 5017 mutex_init(&memcg->thresholds_lock);
5018 spin_lock_init(&memcg->move_lock);
4984 return &memcg->css; 5019 return &memcg->css;
4985free_out: 5020free_out:
4986 __mem_cgroup_free(memcg); 5021 __mem_cgroup_free(memcg);
@@ -5075,7 +5110,7 @@ one_by_one:
5075} 5110}
5076 5111
5077/** 5112/**
5078 * is_target_pte_for_mc - check a pte whether it is valid for move charge 5113 * get_mctgt_type - get target type of moving charge
5079 * @vma: the vma the pte to be checked belongs 5114 * @vma: the vma the pte to be checked belongs
5080 * @addr: the address corresponding to the pte to be checked 5115 * @addr: the address corresponding to the pte to be checked
5081 * @ptent: the pte to be checked 5116 * @ptent: the pte to be checked
@@ -5098,7 +5133,7 @@ union mc_target {
5098}; 5133};
5099 5134
5100enum mc_target_type { 5135enum mc_target_type {
5101 MC_TARGET_NONE, /* not used */ 5136 MC_TARGET_NONE = 0,
5102 MC_TARGET_PAGE, 5137 MC_TARGET_PAGE,
5103 MC_TARGET_SWAP, 5138 MC_TARGET_SWAP,
5104}; 5139};
@@ -5179,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5179 return page; 5214 return page;
5180} 5215}
5181 5216
5182static int is_target_pte_for_mc(struct vm_area_struct *vma, 5217static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5183 unsigned long addr, pte_t ptent, union mc_target *target) 5218 unsigned long addr, pte_t ptent, union mc_target *target)
5184{ 5219{
5185 struct page *page = NULL; 5220 struct page *page = NULL;
5186 struct page_cgroup *pc; 5221 struct page_cgroup *pc;
5187 int ret = 0; 5222 enum mc_target_type ret = MC_TARGET_NONE;
5188 swp_entry_t ent = { .val = 0 }; 5223 swp_entry_t ent = { .val = 0 };
5189 5224
5190 if (pte_present(ptent)) 5225 if (pte_present(ptent))
@@ -5195,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
5195 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5230 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5196 5231
5197 if (!page && !ent.val) 5232 if (!page && !ent.val)
5198 return 0; 5233 return ret;
5199 if (page) { 5234 if (page) {
5200 pc = lookup_page_cgroup(page); 5235 pc = lookup_page_cgroup(page);
5201 /* 5236 /*
@@ -5221,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,
5221 return ret; 5256 return ret;
5222} 5257}
5223 5258
5259#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5260/*
5261 * We don't consider swapping or file mapped pages because THP does not
5262 * support them for now.
5263 * Caller should make sure that pmd_trans_huge(pmd) is true.
5264 */
5265static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5266 unsigned long addr, pmd_t pmd, union mc_target *target)
5267{
5268 struct page *page = NULL;
5269 struct page_cgroup *pc;
5270 enum mc_target_type ret = MC_TARGET_NONE;
5271
5272 page = pmd_page(pmd);
5273 VM_BUG_ON(!page || !PageHead(page));
5274 if (!move_anon())
5275 return ret;
5276 pc = lookup_page_cgroup(page);
5277 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5278 ret = MC_TARGET_PAGE;
5279 if (target) {
5280 get_page(page);
5281 target->page = page;
5282 }
5283 }
5284 return ret;
5285}
5286#else
5287static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5288 unsigned long addr, pmd_t pmd, union mc_target *target)
5289{
5290 return MC_TARGET_NONE;
5291}
5292#endif
5293
5224static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5294static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5225 unsigned long addr, unsigned long end, 5295 unsigned long addr, unsigned long end,
5226 struct mm_walk *walk) 5296 struct mm_walk *walk)
@@ -5229,11 +5299,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5229 pte_t *pte; 5299 pte_t *pte;
5230 spinlock_t *ptl; 5300 spinlock_t *ptl;
5231 5301
5232 split_huge_page_pmd(walk->mm, pmd); 5302 if (pmd_trans_huge_lock(pmd, vma) == 1) {
5303 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5304 mc.precharge += HPAGE_PMD_NR;
5305 spin_unlock(&vma->vm_mm->page_table_lock);
5306 return 0;
5307 }
5233 5308
5234 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5309 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5235 for (; addr != end; pte++, addr += PAGE_SIZE) 5310 for (; addr != end; pte++, addr += PAGE_SIZE)
5236 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 5311 if (get_mctgt_type(vma, addr, *pte, NULL))
5237 mc.precharge++; /* increment precharge temporarily */ 5312 mc.precharge++; /* increment precharge temporarily */
5238 pte_unmap_unlock(pte - 1, ptl); 5313 pte_unmap_unlock(pte - 1, ptl);
5239 cond_resched(); 5314 cond_resched();
@@ -5388,23 +5463,55 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5388 struct vm_area_struct *vma = walk->private; 5463 struct vm_area_struct *vma = walk->private;
5389 pte_t *pte; 5464 pte_t *pte;
5390 spinlock_t *ptl; 5465 spinlock_t *ptl;
5466 enum mc_target_type target_type;
5467 union mc_target target;
5468 struct page *page;
5469 struct page_cgroup *pc;
5470
5471 /*
5472 * We don't take compound_lock() here but no race with splitting thp
5473 * happens because:
5474 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
5475 * under splitting, which means there's no concurrent thp split,
5476 * - if another thread runs into split_huge_page() just after we
5477 * entered this if-block, the thread must wait for page table lock
5478 * to be unlocked in __split_huge_page_splitting(), where the main
5479 * part of thp split is not executed yet.
5480 */
5481 if (pmd_trans_huge_lock(pmd, vma) == 1) {
5482 if (!mc.precharge) {
5483 spin_unlock(&vma->vm_mm->page_table_lock);
5484 return 0;
5485 }
5486 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5487 if (target_type == MC_TARGET_PAGE) {
5488 page = target.page;
5489 if (!isolate_lru_page(page)) {
5490 pc = lookup_page_cgroup(page);
5491 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
5492 pc, mc.from, mc.to,
5493 false)) {
5494 mc.precharge -= HPAGE_PMD_NR;
5495 mc.moved_charge += HPAGE_PMD_NR;
5496 }
5497 putback_lru_page(page);
5498 }
5499 put_page(page);
5500 }
5501 spin_unlock(&vma->vm_mm->page_table_lock);
5502 return 0;
5503 }
5391 5504
5392 split_huge_page_pmd(walk->mm, pmd);
5393retry: 5505retry:
5394 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5506 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5395 for (; addr != end; addr += PAGE_SIZE) { 5507 for (; addr != end; addr += PAGE_SIZE) {
5396 pte_t ptent = *(pte++); 5508 pte_t ptent = *(pte++);
5397 union mc_target target;
5398 int type;
5399 struct page *page;
5400 struct page_cgroup *pc;
5401 swp_entry_t ent; 5509 swp_entry_t ent;
5402 5510
5403 if (!mc.precharge) 5511 if (!mc.precharge)
5404 break; 5512 break;
5405 5513
5406 type = is_target_pte_for_mc(vma, addr, ptent, &target); 5514 switch (get_mctgt_type(vma, addr, ptent, &target)) {
5407 switch (type) {
5408 case MC_TARGET_PAGE: 5515 case MC_TARGET_PAGE:
5409 page = target.page; 5516 page = target.page;
5410 if (isolate_lru_page(page)) 5517 if (isolate_lru_page(page))
@@ -5417,7 +5524,7 @@ retry:
5417 mc.moved_charge++; 5524 mc.moved_charge++;
5418 } 5525 }
5419 putback_lru_page(page); 5526 putback_lru_page(page);
5420put: /* is_target_pte_for_mc() gets the page */ 5527put: /* get_mctgt_type() gets the page */
5421 put_page(page); 5528 put_page(page);
5422 break; 5529 break;
5423 case MC_TARGET_SWAP: 5530 case MC_TARGET_SWAP:
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 56080ea36140..c22076ffdd44 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1063,7 +1063,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1063 * The check (unnecessarily) ignores LRU pages being isolated and 1063 * The check (unnecessarily) ignores LRU pages being isolated and
1064 * walked by the page reclaim code, however that's not a big loss. 1064 * walked by the page reclaim code, however that's not a big loss.
1065 */ 1065 */
1066 if (!PageHuge(p) && !PageTransCompound(p)) { 1066 if (!PageHuge(p) && !PageTransTail(p)) {
1067 if (!PageLRU(p)) 1067 if (!PageLRU(p))
1068 shake_page(p, 0); 1068 shake_page(p, 0);
1069 if (!PageLRU(p)) { 1069 if (!PageLRU(p)) {
diff --git a/mm/memory.c b/mm/memory.c
index 8438c157e4d9..3416b6e018d6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -125,17 +125,17 @@ core_initcall(init_zero_pfn);
125 125
126#if defined(SPLIT_RSS_COUNTING) 126#if defined(SPLIT_RSS_COUNTING)
127 127
128static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) 128void sync_mm_rss(struct mm_struct *mm)
129{ 129{
130 int i; 130 int i;
131 131
132 for (i = 0; i < NR_MM_COUNTERS; i++) { 132 for (i = 0; i < NR_MM_COUNTERS; i++) {
133 if (task->rss_stat.count[i]) { 133 if (current->rss_stat.count[i]) {
134 add_mm_counter(mm, i, task->rss_stat.count[i]); 134 add_mm_counter(mm, i, current->rss_stat.count[i]);
135 task->rss_stat.count[i] = 0; 135 current->rss_stat.count[i] = 0;
136 } 136 }
137 } 137 }
138 task->rss_stat.events = 0; 138 current->rss_stat.events = 0;
139} 139}
140 140
141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) 141static void add_mm_counter_fast(struct mm_struct *mm, int member, int val)
@@ -157,30 +157,7 @@ static void check_sync_rss_stat(struct task_struct *task)
157 if (unlikely(task != current)) 157 if (unlikely(task != current))
158 return; 158 return;
159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) 159 if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH))
160 __sync_task_rss_stat(task, task->mm); 160 sync_mm_rss(task->mm);
161}
162
163unsigned long get_mm_counter(struct mm_struct *mm, int member)
164{
165 long val = 0;
166
167 /*
168 * Don't use task->mm here...for avoiding to use task_get_mm()..
169 * The caller must guarantee task->mm is not invalid.
170 */
171 val = atomic_long_read(&mm->rss_stat.count[member]);
172 /*
173 * counter is updated in asynchronous manner and may go to minus.
174 * But it's never be expected number for users.
175 */
176 if (val < 0)
177 return 0;
178 return (unsigned long)val;
179}
180
181void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182{
183 __sync_task_rss_stat(task, mm);
184} 161}
185#else /* SPLIT_RSS_COUNTING */ 162#else /* SPLIT_RSS_COUNTING */
186 163
@@ -661,7 +638,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
661 int i; 638 int i;
662 639
663 if (current->mm == mm) 640 if (current->mm == mm)
664 sync_mm_rss(current, mm); 641 sync_mm_rss(mm);
665 for (i = 0; i < NR_MM_COUNTERS; i++) 642 for (i = 0; i < NR_MM_COUNTERS; i++)
666 if (rss[i]) 643 if (rss[i])
667 add_mm_counter(mm, i, rss[i]); 644 add_mm_counter(mm, i, rss[i]);
@@ -1247,16 +1224,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1247 do { 1224 do {
1248 next = pmd_addr_end(addr, end); 1225 next = pmd_addr_end(addr, end);
1249 if (pmd_trans_huge(*pmd)) { 1226 if (pmd_trans_huge(*pmd)) {
1250 if (next-addr != HPAGE_PMD_SIZE) { 1227 if (next - addr != HPAGE_PMD_SIZE) {
1251 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1228 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1252 split_huge_page_pmd(vma->vm_mm, pmd); 1229 split_huge_page_pmd(vma->vm_mm, pmd);
1253 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1230 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1254 continue; 1231 goto next;
1255 /* fall through */ 1232 /* fall through */
1256 } 1233 }
1257 if (pmd_none_or_clear_bad(pmd)) 1234 /*
1258 continue; 1235 * Here there can be other concurrent MADV_DONTNEED or
1236 * trans huge page faults running, and if the pmd is
1237 * none or trans huge it can change under us. This is
1238 * because MADV_DONTNEED holds the mmap_sem in read
1239 * mode.
1240 */
1241 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
1242 goto next;
1259 next = zap_pte_range(tlb, vma, pmd, addr, next, details); 1243 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1244next:
1260 cond_resched(); 1245 cond_resched();
1261 } while (pmd++, addr = next, addr != end); 1246 } while (pmd++, addr = next, addr != end);
1262 1247
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 47296fee23db..cfb6c8678754 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
512 do { 512 do {
513 next = pmd_addr_end(addr, end); 513 next = pmd_addr_end(addr, end);
514 split_huge_page_pmd(vma->vm_mm, pmd); 514 split_huge_page_pmd(vma->vm_mm, pmd);
515 if (pmd_none_or_clear_bad(pmd)) 515 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
516 continue; 516 continue;
517 if (check_pte_range(vma, pmd, addr, next, nodes, 517 if (check_pte_range(vma, pmd, addr, next, nodes,
518 flags, private)) 518 flags, private))
@@ -1323,12 +1323,9 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1323 err = -ESRCH; 1323 err = -ESRCH;
1324 goto out; 1324 goto out;
1325 } 1325 }
1326 mm = get_task_mm(task); 1326 get_task_struct(task);
1327 rcu_read_unlock();
1328 1327
1329 err = -EINVAL; 1328 err = -EINVAL;
1330 if (!mm)
1331 goto out;
1332 1329
1333 /* 1330 /*
1334 * Check if this process has the right to modify the specified 1331 * Check if this process has the right to modify the specified
@@ -1336,14 +1333,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1336 * capabilities, superuser privileges or the same 1333 * capabilities, superuser privileges or the same
1337 * userid as the target process. 1334 * userid as the target process.
1338 */ 1335 */
1339 rcu_read_lock();
1340 tcred = __task_cred(task); 1336 tcred = __task_cred(task);
1341 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1337 if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1342 cred->uid != tcred->suid && cred->uid != tcred->uid && 1338 cred->uid != tcred->suid && cred->uid != tcred->uid &&
1343 !capable(CAP_SYS_NICE)) { 1339 !capable(CAP_SYS_NICE)) {
1344 rcu_read_unlock(); 1340 rcu_read_unlock();
1345 err = -EPERM; 1341 err = -EPERM;
1346 goto out; 1342 goto out_put;
1347 } 1343 }
1348 rcu_read_unlock(); 1344 rcu_read_unlock();
1349 1345
@@ -1351,26 +1347,36 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1351 /* Is the user allowed to access the target nodes? */ 1347 /* Is the user allowed to access the target nodes? */
1352 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { 1348 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1353 err = -EPERM; 1349 err = -EPERM;
1354 goto out; 1350 goto out_put;
1355 } 1351 }
1356 1352
1357 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { 1353 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1358 err = -EINVAL; 1354 err = -EINVAL;
1359 goto out; 1355 goto out_put;
1360 } 1356 }
1361 1357
1362 err = security_task_movememory(task); 1358 err = security_task_movememory(task);
1363 if (err) 1359 if (err)
1364 goto out; 1360 goto out_put;
1365 1361
1366 err = do_migrate_pages(mm, old, new, 1362 mm = get_task_mm(task);
1367 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1363 put_task_struct(task);
1368out:
1369 if (mm) 1364 if (mm)
1370 mmput(mm); 1365 err = do_migrate_pages(mm, old, new,
1366 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1367 else
1368 err = -EINVAL;
1369
1370 mmput(mm);
1371out:
1371 NODEMASK_SCRATCH_FREE(scratch); 1372 NODEMASK_SCRATCH_FREE(scratch);
1372 1373
1373 return err; 1374 return err;
1375
1376out_put:
1377 put_task_struct(task);
1378 goto out;
1379
1374} 1380}
1375 1381
1376 1382
@@ -1844,18 +1850,24 @@ struct page *
1844alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, 1850alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1845 unsigned long addr, int node) 1851 unsigned long addr, int node)
1846{ 1852{
1847 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1853 struct mempolicy *pol;
1848 struct zonelist *zl; 1854 struct zonelist *zl;
1849 struct page *page; 1855 struct page *page;
1856 unsigned int cpuset_mems_cookie;
1857
1858retry_cpuset:
1859 pol = get_vma_policy(current, vma, addr);
1860 cpuset_mems_cookie = get_mems_allowed();
1850 1861
1851 get_mems_allowed();
1852 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1862 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1853 unsigned nid; 1863 unsigned nid;
1854 1864
1855 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); 1865 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1856 mpol_cond_put(pol); 1866 mpol_cond_put(pol);
1857 page = alloc_page_interleave(gfp, order, nid); 1867 page = alloc_page_interleave(gfp, order, nid);
1858 put_mems_allowed(); 1868 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1869 goto retry_cpuset;
1870
1859 return page; 1871 return page;
1860 } 1872 }
1861 zl = policy_zonelist(gfp, pol, node); 1873 zl = policy_zonelist(gfp, pol, node);
@@ -1866,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1866 struct page *page = __alloc_pages_nodemask(gfp, order, 1878 struct page *page = __alloc_pages_nodemask(gfp, order,
1867 zl, policy_nodemask(gfp, pol)); 1879 zl, policy_nodemask(gfp, pol));
1868 __mpol_put(pol); 1880 __mpol_put(pol);
1869 put_mems_allowed(); 1881 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1882 goto retry_cpuset;
1870 return page; 1883 return page;
1871 } 1884 }
1872 /* 1885 /*
@@ -1874,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1874 */ 1887 */
1875 page = __alloc_pages_nodemask(gfp, order, zl, 1888 page = __alloc_pages_nodemask(gfp, order, zl,
1876 policy_nodemask(gfp, pol)); 1889 policy_nodemask(gfp, pol));
1877 put_mems_allowed(); 1890 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1891 goto retry_cpuset;
1878 return page; 1892 return page;
1879} 1893}
1880 1894
@@ -1901,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1901{ 1915{
1902 struct mempolicy *pol = current->mempolicy; 1916 struct mempolicy *pol = current->mempolicy;
1903 struct page *page; 1917 struct page *page;
1918 unsigned int cpuset_mems_cookie;
1904 1919
1905 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1920 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1906 pol = &default_policy; 1921 pol = &default_policy;
1907 1922
1908 get_mems_allowed(); 1923retry_cpuset:
1924 cpuset_mems_cookie = get_mems_allowed();
1925
1909 /* 1926 /*
1910 * No reference counting needed for current->mempolicy 1927 * No reference counting needed for current->mempolicy
1911 * nor system default_policy 1928 * nor system default_policy
@@ -1916,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1916 page = __alloc_pages_nodemask(gfp, order, 1933 page = __alloc_pages_nodemask(gfp, order,
1917 policy_zonelist(gfp, pol, numa_node_id()), 1934 policy_zonelist(gfp, pol, numa_node_id()),
1918 policy_nodemask(gfp, pol)); 1935 policy_nodemask(gfp, pol));
1919 put_mems_allowed(); 1936
1937 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1938 goto retry_cpuset;
1939
1920 return page; 1940 return page;
1921} 1941}
1922EXPORT_SYMBOL(alloc_pages_current); 1942EXPORT_SYMBOL(alloc_pages_current);
diff --git a/mm/migrate.c b/mm/migrate.c
index 1503b6b54ecb..51c08a0c6f68 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1174,20 +1174,17 @@ set_status:
1174 * Migrate an array of page address onto an array of nodes and fill 1174 * Migrate an array of page address onto an array of nodes and fill
1175 * the corresponding array of status. 1175 * the corresponding array of status.
1176 */ 1176 */
1177static int do_pages_move(struct mm_struct *mm, struct task_struct *task, 1177static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
1178 unsigned long nr_pages, 1178 unsigned long nr_pages,
1179 const void __user * __user *pages, 1179 const void __user * __user *pages,
1180 const int __user *nodes, 1180 const int __user *nodes,
1181 int __user *status, int flags) 1181 int __user *status, int flags)
1182{ 1182{
1183 struct page_to_node *pm; 1183 struct page_to_node *pm;
1184 nodemask_t task_nodes;
1185 unsigned long chunk_nr_pages; 1184 unsigned long chunk_nr_pages;
1186 unsigned long chunk_start; 1185 unsigned long chunk_start;
1187 int err; 1186 int err;
1188 1187
1189 task_nodes = cpuset_mems_allowed(task);
1190
1191 err = -ENOMEM; 1188 err = -ENOMEM;
1192 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); 1189 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
1193 if (!pm) 1190 if (!pm)
@@ -1349,6 +1346,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1349 struct task_struct *task; 1346 struct task_struct *task;
1350 struct mm_struct *mm; 1347 struct mm_struct *mm;
1351 int err; 1348 int err;
1349 nodemask_t task_nodes;
1352 1350
1353 /* Check flags */ 1351 /* Check flags */
1354 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) 1352 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -1364,11 +1362,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1364 rcu_read_unlock(); 1362 rcu_read_unlock();
1365 return -ESRCH; 1363 return -ESRCH;
1366 } 1364 }
1367 mm = get_task_mm(task); 1365 get_task_struct(task);
1368 rcu_read_unlock();
1369
1370 if (!mm)
1371 return -EINVAL;
1372 1366
1373 /* 1367 /*
1374 * Check if this process has the right to modify the specified 1368 * Check if this process has the right to modify the specified
@@ -1376,7 +1370,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1376 * capabilities, superuser privileges or the same 1370 * capabilities, superuser privileges or the same
1377 * userid as the target process. 1371 * userid as the target process.
1378 */ 1372 */
1379 rcu_read_lock();
1380 tcred = __task_cred(task); 1373 tcred = __task_cred(task);
1381 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1374 if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1382 cred->uid != tcred->suid && cred->uid != tcred->uid && 1375 cred->uid != tcred->suid && cred->uid != tcred->uid &&
@@ -1391,16 +1384,25 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1391 if (err) 1384 if (err)
1392 goto out; 1385 goto out;
1393 1386
1394 if (nodes) { 1387 task_nodes = cpuset_mems_allowed(task);
1395 err = do_pages_move(mm, task, nr_pages, pages, nodes, status, 1388 mm = get_task_mm(task);
1396 flags); 1389 put_task_struct(task);
1397 } else { 1390
1398 err = do_pages_stat(mm, nr_pages, pages, status); 1391 if (mm) {
1399 } 1392 if (nodes)
1393 err = do_pages_move(mm, task_nodes, nr_pages, pages,
1394 nodes, status, flags);
1395 else
1396 err = do_pages_stat(mm, nr_pages, pages, status);
1397 } else
1398 err = -EINVAL;
1400 1399
1401out:
1402 mmput(mm); 1400 mmput(mm);
1403 return err; 1401 return err;
1402
1403out:
1404 put_task_struct(task);
1405 return err;
1404} 1406}
1405 1407
1406/* 1408/*
diff --git a/mm/mincore.c b/mm/mincore.c
index 636a86876ff2..936b4cee8cb1 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
164 } 164 }
165 /* fall through */ 165 /* fall through */
166 } 166 }
167 if (pmd_none_or_clear_bad(pmd)) 167 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
168 mincore_unmapped_range(vma, addr, next, vec); 168 mincore_unmapped_range(vma, addr, next, vec);
169 else 169 else
170 mincore_pte_range(vma, pmd, addr, next, vec); 170 mincore_pte_range(vma, pmd, addr, next, vec);
diff --git a/mm/mmap.c b/mm/mmap.c
index 6f3766b57803..a7bf6a31c9f6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -451,9 +451,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
451} 451}
452 452
453/* 453/*
454 * Helper for vma_adjust in the split_vma insert case: 454 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
455 * insert vm structure into list and rbtree and anon_vma, 455 * mm's list and rbtree. It has already been inserted into the prio_tree.
456 * but it has already been inserted into prio_tree earlier.
457 */ 456 */
458static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 457static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
459{ 458{
@@ -1112,9 +1111,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1112 * A dummy user value is used because we are not locking 1111 * A dummy user value is used because we are not locking
1113 * memory so no accounting is necessary 1112 * memory so no accounting is necessary
1114 */ 1113 */
1115 len = ALIGN(len, huge_page_size(&default_hstate)); 1114 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
1116 file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, 1115 VM_NORESERVE, &user,
1117 &user, HUGETLB_ANONHUGE_INODE); 1116 HUGETLB_ANONHUGE_INODE);
1118 if (IS_ERR(file)) 1117 if (IS_ERR(file))
1119 return PTR_ERR(file); 1118 return PTR_ERR(file);
1120 } 1119 }
@@ -1439,10 +1438,8 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
1439 /* 1438 /*
1440 * Is this a new hole at the lowest possible address? 1439 * Is this a new hole at the lowest possible address?
1441 */ 1440 */
1442 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { 1441 if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache)
1443 mm->free_area_cache = addr; 1442 mm->free_area_cache = addr;
1444 mm->cached_hole_size = ~0UL;
1445 }
1446} 1443}
1447 1444
1448/* 1445/*
@@ -1457,7 +1454,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1457{ 1454{
1458 struct vm_area_struct *vma; 1455 struct vm_area_struct *vma;
1459 struct mm_struct *mm = current->mm; 1456 struct mm_struct *mm = current->mm;
1460 unsigned long addr = addr0; 1457 unsigned long addr = addr0, start_addr;
1461 1458
1462 /* requested length too big for entire address space */ 1459 /* requested length too big for entire address space */
1463 if (len > TASK_SIZE) 1460 if (len > TASK_SIZE)
@@ -1481,22 +1478,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1481 mm->free_area_cache = mm->mmap_base; 1478 mm->free_area_cache = mm->mmap_base;
1482 } 1479 }
1483 1480
1481try_again:
1484 /* either no address requested or can't fit in requested address hole */ 1482 /* either no address requested or can't fit in requested address hole */
1485 addr = mm->free_area_cache; 1483 start_addr = addr = mm->free_area_cache;
1486
1487 /* make sure it can fit in the remaining address space */
1488 if (addr > len) {
1489 vma = find_vma(mm, addr-len);
1490 if (!vma || addr <= vma->vm_start)
1491 /* remember the address as a hint for next time */
1492 return (mm->free_area_cache = addr-len);
1493 }
1494
1495 if (mm->mmap_base < len)
1496 goto bottomup;
1497 1484
1498 addr = mm->mmap_base-len; 1485 if (addr < len)
1486 goto fail;
1499 1487
1488 addr -= len;
1500 do { 1489 do {
1501 /* 1490 /*
1502 * Lookup failure means no vma is above this address, 1491 * Lookup failure means no vma is above this address,
@@ -1516,7 +1505,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1516 addr = vma->vm_start-len; 1505 addr = vma->vm_start-len;
1517 } while (len < vma->vm_start); 1506 } while (len < vma->vm_start);
1518 1507
1519bottomup: 1508fail:
1509 /*
1510 * if hint left us with no space for the requested
1511 * mapping then try again:
1512 *
1513 * Note: this is different with the case of bottomup
1514 * which does the fully line-search, but we use find_vma
1515 * here that causes some holes skipped.
1516 */
1517 if (start_addr != mm->mmap_base) {
1518 mm->free_area_cache = mm->mmap_base;
1519 mm->cached_hole_size = 0;
1520 goto try_again;
1521 }
1522
1520 /* 1523 /*
1521 * A failed mmap() very likely causes application failure, 1524 * A failed mmap() very likely causes application failure,
1522 * so fall back to the bottom-up function here. This scenario 1525 * so fall back to the bottom-up function here. This scenario
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index cf332bc0080a..3dcfaf4ed355 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm)
53 struct task_struct *tsk = current; 53 struct task_struct *tsk = current;
54 54
55 task_lock(tsk); 55 task_lock(tsk);
56 sync_mm_rss(tsk, mm); 56 sync_mm_rss(mm);
57 tsk->mm = NULL; 57 tsk->mm = NULL;
58 /* active_mm is still 'mm' */ 58 /* active_mm is still 'mm' */
59 enter_lazy_tlb(mm, tsk); 59 enter_lazy_tlb(mm, tsk);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 142ef4a1f480..a40992610ab6 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -60,7 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
60 ptent = pte_mkwrite(ptent); 60 ptent = pte_mkwrite(ptent);
61 61
62 ptep_modify_prot_commit(mm, addr, pte, ptent); 62 ptep_modify_prot_commit(mm, addr, pte, ptent);
63 } else if (PAGE_MIGRATION && !pte_file(oldpte)) { 63 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
64 swp_entry_t entry = pte_to_swp_entry(oldpte); 64 swp_entry_t entry = pte_to_swp_entry(oldpte);
65 65
66 if (is_write_migration_entry(entry)) { 66 if (is_write_migration_entry(entry)) {
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2958fd8e7c9a..4198e000f41a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -34,6 +34,7 @@
34#include <linux/ptrace.h> 34#include <linux/ptrace.h>
35#include <linux/freezer.h> 35#include <linux/freezer.h>
36#include <linux/ftrace.h> 36#include <linux/ftrace.h>
37#include <linux/ratelimit.h>
37 38
38#define CREATE_TRACE_POINTS 39#define CREATE_TRACE_POINTS
39#include <trace/events/oom.h> 40#include <trace/events/oom.h>
@@ -309,7 +310,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
309 */ 310 */
310static struct task_struct *select_bad_process(unsigned int *ppoints, 311static struct task_struct *select_bad_process(unsigned int *ppoints,
311 unsigned long totalpages, struct mem_cgroup *memcg, 312 unsigned long totalpages, struct mem_cgroup *memcg,
312 const nodemask_t *nodemask) 313 const nodemask_t *nodemask, bool force_kill)
313{ 314{
314 struct task_struct *g, *p; 315 struct task_struct *g, *p;
315 struct task_struct *chosen = NULL; 316 struct task_struct *chosen = NULL;
@@ -335,7 +336,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
335 if (test_tsk_thread_flag(p, TIF_MEMDIE)) { 336 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
336 if (unlikely(frozen(p))) 337 if (unlikely(frozen(p)))
337 __thaw_task(p); 338 __thaw_task(p);
338 return ERR_PTR(-1UL); 339 if (!force_kill)
340 return ERR_PTR(-1UL);
339 } 341 }
340 if (!p->mm) 342 if (!p->mm)
341 continue; 343 continue;
@@ -353,7 +355,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
353 if (p == current) { 355 if (p == current) {
354 chosen = p; 356 chosen = p;
355 *ppoints = 1000; 357 *ppoints = 1000;
356 } else { 358 } else if (!force_kill) {
357 /* 359 /*
358 * If this task is not being ptraced on exit, 360 * If this task is not being ptraced on exit,
359 * then wait for it to finish before killing 361 * then wait for it to finish before killing
@@ -434,66 +436,18 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
434} 436}
435 437
436#define K(x) ((x) << (PAGE_SHIFT-10)) 438#define K(x) ((x) << (PAGE_SHIFT-10))
437static int oom_kill_task(struct task_struct *p) 439static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
438{ 440 unsigned int points, unsigned long totalpages,
439 struct task_struct *q; 441 struct mem_cgroup *memcg, nodemask_t *nodemask,
440 struct mm_struct *mm; 442 const char *message)
441
442 p = find_lock_task_mm(p);
443 if (!p)
444 return 1;
445
446 /* mm cannot be safely dereferenced after task_unlock(p) */
447 mm = p->mm;
448
449 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
450 task_pid_nr(p), p->comm, K(p->mm->total_vm),
451 K(get_mm_counter(p->mm, MM_ANONPAGES)),
452 K(get_mm_counter(p->mm, MM_FILEPAGES)));
453 task_unlock(p);
454
455 /*
456 * Kill all user processes sharing p->mm in other thread groups, if any.
457 * They don't get access to memory reserves or a higher scheduler
458 * priority, though, to avoid depletion of all memory or task
459 * starvation. This prevents mm->mmap_sem livelock when an oom killed
460 * task cannot exit because it requires the semaphore and its contended
461 * by another thread trying to allocate memory itself. That thread will
462 * now get access to memory reserves since it has a pending fatal
463 * signal.
464 */
465 for_each_process(q)
466 if (q->mm == mm && !same_thread_group(q, p) &&
467 !(q->flags & PF_KTHREAD)) {
468 if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
469 continue;
470
471 task_lock(q); /* Protect ->comm from prctl() */
472 pr_err("Kill process %d (%s) sharing same memory\n",
473 task_pid_nr(q), q->comm);
474 task_unlock(q);
475 force_sig(SIGKILL, q);
476 }
477
478 set_tsk_thread_flag(p, TIF_MEMDIE);
479 force_sig(SIGKILL, p);
480
481 return 0;
482}
483#undef K
484
485static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
486 unsigned int points, unsigned long totalpages,
487 struct mem_cgroup *memcg, nodemask_t *nodemask,
488 const char *message)
489{ 443{
490 struct task_struct *victim = p; 444 struct task_struct *victim = p;
491 struct task_struct *child; 445 struct task_struct *child;
492 struct task_struct *t = p; 446 struct task_struct *t = p;
447 struct mm_struct *mm;
493 unsigned int victim_points = 0; 448 unsigned int victim_points = 0;
494 449 static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
495 if (printk_ratelimit()) 450 DEFAULT_RATELIMIT_BURST);
496 dump_header(p, gfp_mask, order, memcg, nodemask);
497 451
498 /* 452 /*
499 * If the task is already exiting, don't alarm the sysadmin or kill 453 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -501,9 +455,12 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
501 */ 455 */
502 if (p->flags & PF_EXITING) { 456 if (p->flags & PF_EXITING) {
503 set_tsk_thread_flag(p, TIF_MEMDIE); 457 set_tsk_thread_flag(p, TIF_MEMDIE);
504 return 0; 458 return;
505 } 459 }
506 460
461 if (__ratelimit(&oom_rs))
462 dump_header(p, gfp_mask, order, memcg, nodemask);
463
507 task_lock(p); 464 task_lock(p);
508 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", 465 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
509 message, task_pid_nr(p), p->comm, points); 466 message, task_pid_nr(p), p->comm, points);
@@ -533,8 +490,44 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
533 } 490 }
534 } while_each_thread(p, t); 491 } while_each_thread(p, t);
535 492
536 return oom_kill_task(victim); 493 victim = find_lock_task_mm(victim);
494 if (!victim)
495 return;
496
497 /* mm cannot safely be dereferenced after task_unlock(victim) */
498 mm = victim->mm;
499 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
500 task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
501 K(get_mm_counter(victim->mm, MM_ANONPAGES)),
502 K(get_mm_counter(victim->mm, MM_FILEPAGES)));
503 task_unlock(victim);
504
505 /*
506 * Kill all user processes sharing victim->mm in other thread groups, if
507 * any. They don't get access to memory reserves, though, to avoid
508 * depletion of all memory. This prevents mm->mmap_sem livelock when an
509 * oom killed thread cannot exit because it requires the semaphore and
510 * its contended by another thread trying to allocate memory itself.
511 * That thread will now get access to memory reserves since it has a
512 * pending fatal signal.
513 */
514 for_each_process(p)
515 if (p->mm == mm && !same_thread_group(p, victim) &&
516 !(p->flags & PF_KTHREAD)) {
517 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
518 continue;
519
520 task_lock(p); /* Protect ->comm from prctl() */
521 pr_err("Kill process %d (%s) sharing same memory\n",
522 task_pid_nr(p), p->comm);
523 task_unlock(p);
524 force_sig(SIGKILL, p);
525 }
526
527 set_tsk_thread_flag(victim, TIF_MEMDIE);
528 force_sig(SIGKILL, victim);
537} 529}
530#undef K
538 531
539/* 532/*
540 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 533 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
@@ -561,7 +554,8 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
561} 554}
562 555
563#ifdef CONFIG_CGROUP_MEM_RES_CTLR 556#ifdef CONFIG_CGROUP_MEM_RES_CTLR
564void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) 557void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
558 int order)
565{ 559{
566 unsigned long limit; 560 unsigned long limit;
567 unsigned int points = 0; 561 unsigned int points = 0;
@@ -577,18 +571,13 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask)
577 return; 571 return;
578 } 572 }
579 573
580 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); 574 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
581 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; 575 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
582 read_lock(&tasklist_lock); 576 read_lock(&tasklist_lock);
583retry: 577 p = select_bad_process(&points, limit, memcg, NULL, false);
584 p = select_bad_process(&points, limit, memcg, NULL); 578 if (p && PTR_ERR(p) != -1UL)
585 if (!p || PTR_ERR(p) == -1UL) 579 oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
586 goto out; 580 "Memory cgroup out of memory");
587
588 if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL,
589 "Memory cgroup out of memory"))
590 goto retry;
591out:
592 read_unlock(&tasklist_lock); 581 read_unlock(&tasklist_lock);
593} 582}
594#endif 583#endif
@@ -700,6 +689,7 @@ static void clear_system_oom(void)
700 * @gfp_mask: memory allocation flags 689 * @gfp_mask: memory allocation flags
701 * @order: amount of memory being requested as a power of 2 690 * @order: amount of memory being requested as a power of 2
702 * @nodemask: nodemask passed to page allocator 691 * @nodemask: nodemask passed to page allocator
692 * @force_kill: true if a task must be killed, even if others are exiting
703 * 693 *
704 * If we run out of memory, we have the choice between either 694 * If we run out of memory, we have the choice between either
705 * killing a random task (bad), letting the system crash (worse) 695 * killing a random task (bad), letting the system crash (worse)
@@ -707,7 +697,7 @@ static void clear_system_oom(void)
707 * don't have to be perfect here, we just have to be good. 697 * don't have to be perfect here, we just have to be good.
708 */ 698 */
709void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 699void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
710 int order, nodemask_t *nodemask) 700 int order, nodemask_t *nodemask, bool force_kill)
711{ 701{
712 const nodemask_t *mpol_mask; 702 const nodemask_t *mpol_mask;
713 struct task_struct *p; 703 struct task_struct *p;
@@ -745,33 +735,25 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
745 if (sysctl_oom_kill_allocating_task && 735 if (sysctl_oom_kill_allocating_task &&
746 !oom_unkillable_task(current, NULL, nodemask) && 736 !oom_unkillable_task(current, NULL, nodemask) &&
747 current->mm) { 737 current->mm) {
748 /* 738 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
749 * oom_kill_process() needs tasklist_lock held. If it returns 739 nodemask,
750 * non-zero, current could not be killed so we must fallback to 740 "Out of memory (oom_kill_allocating_task)");
751 * the tasklist scan.
752 */
753 if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
754 NULL, nodemask,
755 "Out of memory (oom_kill_allocating_task)"))
756 goto out;
757 }
758
759retry:
760 p = select_bad_process(&points, totalpages, NULL, mpol_mask);
761 if (PTR_ERR(p) == -1UL)
762 goto out; 741 goto out;
742 }
763 743
744 p = select_bad_process(&points, totalpages, NULL, mpol_mask,
745 force_kill);
764 /* Found nothing?!?! Either we hang forever, or we panic. */ 746 /* Found nothing?!?! Either we hang forever, or we panic. */
765 if (!p) { 747 if (!p) {
766 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 748 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
767 read_unlock(&tasklist_lock); 749 read_unlock(&tasklist_lock);
768 panic("Out of memory and no killable processes...\n"); 750 panic("Out of memory and no killable processes...\n");
769 } 751 }
770 752 if (PTR_ERR(p) != -1UL) {
771 if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, 753 oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
772 nodemask, "Out of memory")) 754 nodemask, "Out of memory");
773 goto retry; 755 killed = 1;
774 killed = 1; 756 }
775out: 757out:
776 read_unlock(&tasklist_lock); 758 read_unlock(&tasklist_lock);
777 759
@@ -792,7 +774,7 @@ out:
792void pagefault_out_of_memory(void) 774void pagefault_out_of_memory(void)
793{ 775{
794 if (try_set_system_oom()) { 776 if (try_set_system_oom()) {
795 out_of_memory(NULL, 0, 0, NULL); 777 out_of_memory(NULL, 0, 0, NULL, false);
796 clear_system_oom(); 778 clear_system_oom();
797 } 779 }
798 if (!test_thread_flag(TIF_MEMDIE)) 780 if (!test_thread_flag(TIF_MEMDIE))
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 363ba7082ef5..3fc261705b1e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1472,6 +1472,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
1472 1472
1473 for ( ; ; ) { 1473 for ( ; ; ) {
1474 global_dirty_limits(&background_thresh, &dirty_thresh); 1474 global_dirty_limits(&background_thresh, &dirty_thresh);
1475 dirty_thresh = hard_dirty_limit(dirty_thresh);
1475 1476
1476 /* 1477 /*
1477 * Boost the allowable dirty threshold a bit for page 1478 * Boost the allowable dirty threshold a bit for page
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a13ded1938f0..caea788628e4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1968,7 +1968,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1968 goto out; 1968 goto out;
1969 } 1969 }
1970 /* Exhausted what can be done so it's blamo time */ 1970 /* Exhausted what can be done so it's blamo time */
1971 out_of_memory(zonelist, gfp_mask, order, nodemask); 1971 out_of_memory(zonelist, gfp_mask, order, nodemask, false);
1972 1972
1973out: 1973out:
1974 clear_zonelist_oom(zonelist, gfp_mask); 1974 clear_zonelist_oom(zonelist, gfp_mask);
@@ -1990,7 +1990,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1990 if (!order) 1990 if (!order)
1991 return NULL; 1991 return NULL;
1992 1992
1993 if (compaction_deferred(preferred_zone)) { 1993 if (compaction_deferred(preferred_zone, order)) {
1994 *deferred_compaction = true; 1994 *deferred_compaction = true;
1995 return NULL; 1995 return NULL;
1996 } 1996 }
@@ -2012,6 +2012,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2012 if (page) { 2012 if (page) {
2013 preferred_zone->compact_considered = 0; 2013 preferred_zone->compact_considered = 0;
2014 preferred_zone->compact_defer_shift = 0; 2014 preferred_zone->compact_defer_shift = 0;
2015 if (order >= preferred_zone->compact_order_failed)
2016 preferred_zone->compact_order_failed = order + 1;
2015 count_vm_event(COMPACTSUCCESS); 2017 count_vm_event(COMPACTSUCCESS);
2016 return page; 2018 return page;
2017 } 2019 }
@@ -2028,7 +2030,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2028 * defer if the failure was a sync compaction failure. 2030 * defer if the failure was a sync compaction failure.
2029 */ 2031 */
2030 if (sync_migration) 2032 if (sync_migration)
2031 defer_compaction(preferred_zone); 2033 defer_compaction(preferred_zone, order);
2032 2034
2033 cond_resched(); 2035 cond_resched();
2034 } 2036 }
@@ -2378,8 +2380,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2378{ 2380{
2379 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 2381 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
2380 struct zone *preferred_zone; 2382 struct zone *preferred_zone;
2381 struct page *page; 2383 struct page *page = NULL;
2382 int migratetype = allocflags_to_migratetype(gfp_mask); 2384 int migratetype = allocflags_to_migratetype(gfp_mask);
2385 unsigned int cpuset_mems_cookie;
2383 2386
2384 gfp_mask &= gfp_allowed_mask; 2387 gfp_mask &= gfp_allowed_mask;
2385 2388
@@ -2398,15 +2401,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2398 if (unlikely(!zonelist->_zonerefs->zone)) 2401 if (unlikely(!zonelist->_zonerefs->zone))
2399 return NULL; 2402 return NULL;
2400 2403
2401 get_mems_allowed(); 2404retry_cpuset:
2405 cpuset_mems_cookie = get_mems_allowed();
2406
2402 /* The preferred zone is used for statistics later */ 2407 /* The preferred zone is used for statistics later */
2403 first_zones_zonelist(zonelist, high_zoneidx, 2408 first_zones_zonelist(zonelist, high_zoneidx,
2404 nodemask ? : &cpuset_current_mems_allowed, 2409 nodemask ? : &cpuset_current_mems_allowed,
2405 &preferred_zone); 2410 &preferred_zone);
2406 if (!preferred_zone) { 2411 if (!preferred_zone)
2407 put_mems_allowed(); 2412 goto out;
2408 return NULL;
2409 }
2410 2413
2411 /* First allocation attempt */ 2414 /* First allocation attempt */
2412 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2415 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2416,9 +2419,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2416 page = __alloc_pages_slowpath(gfp_mask, order, 2419 page = __alloc_pages_slowpath(gfp_mask, order,
2417 zonelist, high_zoneidx, nodemask, 2420 zonelist, high_zoneidx, nodemask,
2418 preferred_zone, migratetype); 2421 preferred_zone, migratetype);
2419 put_mems_allowed();
2420 2422
2421 trace_mm_page_alloc(page, order, gfp_mask, migratetype); 2423 trace_mm_page_alloc(page, order, gfp_mask, migratetype);
2424
2425out:
2426 /*
2427 * When updating a task's mems_allowed, it is possible to race with
2428 * parallel threads in such a way that an allocation can fail while
2429 * the mask is being updated. If a page allocation is about to fail,
2430 * check if the cpuset changed during allocation and if so, retry.
2431 */
2432 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
2433 goto retry_cpuset;
2434
2422 return page; 2435 return page;
2423} 2436}
2424EXPORT_SYMBOL(__alloc_pages_nodemask); 2437EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2632,13 +2645,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2632bool skip_free_areas_node(unsigned int flags, int nid) 2645bool skip_free_areas_node(unsigned int flags, int nid)
2633{ 2646{
2634 bool ret = false; 2647 bool ret = false;
2648 unsigned int cpuset_mems_cookie;
2635 2649
2636 if (!(flags & SHOW_MEM_FILTER_NODES)) 2650 if (!(flags & SHOW_MEM_FILTER_NODES))
2637 goto out; 2651 goto out;
2638 2652
2639 get_mems_allowed(); 2653 do {
2640 ret = !node_isset(nid, cpuset_current_mems_allowed); 2654 cpuset_mems_cookie = get_mems_allowed();
2641 put_mems_allowed(); 2655 ret = !node_isset(nid, cpuset_current_mems_allowed);
2656 } while (!put_mems_allowed(cpuset_mems_cookie));
2642out: 2657out:
2643 return ret; 2658 return ret;
2644} 2659}
@@ -3925,18 +3940,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
3925 } 3940 }
3926} 3941}
3927 3942
3928int __init add_from_early_node_map(struct range *range, int az,
3929 int nr_range, int nid)
3930{
3931 unsigned long start_pfn, end_pfn;
3932 int i;
3933
3934 /* need to go over early_node_map to find out good range for node */
3935 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
3936 nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
3937 return nr_range;
3938}
3939
3940/** 3943/**
3941 * sparse_memory_present_with_active_regions - Call memory_present for each active range 3944 * sparse_memory_present_with_active_regions - Call memory_present for each active range
3942 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 3945 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -4521,7 +4524,7 @@ static unsigned long __init early_calculate_totalpages(void)
4521 * memory. When they don't, some nodes will have more kernelcore than 4524 * memory. When they don't, some nodes will have more kernelcore than
4522 * others 4525 * others
4523 */ 4526 */
4524static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 4527static void __init find_zone_movable_pfns_for_nodes(void)
4525{ 4528{
4526 int i, nid; 4529 int i, nid;
4527 unsigned long usable_startpfn; 4530 unsigned long usable_startpfn;
@@ -4713,7 +4716,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4713 4716
4714 /* Find the PFNs that ZONE_MOVABLE begins at in each node */ 4717 /* Find the PFNs that ZONE_MOVABLE begins at in each node */
4715 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); 4718 memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
4716 find_zone_movable_pfns_for_nodes(zone_movable_pfn); 4719 find_zone_movable_pfns_for_nodes();
4717 4720
4718 /* Print out the zone ranges */ 4721 /* Print out the zone ranges */
4719 printk("Zone PFN ranges:\n"); 4722 printk("Zone PFN ranges:\n");
@@ -4823,6 +4826,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
4823 int cpu = (unsigned long)hcpu; 4826 int cpu = (unsigned long)hcpu;
4824 4827
4825 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 4828 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
4829 lru_add_drain_cpu(cpu);
4826 drain_pages(cpu); 4830 drain_pages(cpu);
4827 4831
4828 /* 4832 /*
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2f5cf10ff660..aa9701e12714 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -59,7 +59,7 @@ again:
59 continue; 59 continue;
60 60
61 split_huge_page_pmd(walk->mm, pmd); 61 split_huge_page_pmd(walk->mm, pmd);
62 if (pmd_none_or_clear_bad(pmd)) 62 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
63 goto again; 63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk); 64 err = walk_pte_range(pmd, addr, next, walk);
65 if (err) 65 if (err)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index eb663fb533e0..5a74fea182f1 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -70,10 +70,11 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
70 unsigned long address, pmd_t *pmdp) 70 unsigned long address, pmd_t *pmdp)
71{ 71{
72 int young; 72 int young;
73#ifndef CONFIG_TRANSPARENT_HUGEPAGE 73#ifdef CONFIG_TRANSPARENT_HUGEPAGE
74 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
75#else
74 BUG(); 76 BUG();
75#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 77#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
76 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
77 young = pmdp_test_and_clear_young(vma, address, pmdp); 78 young = pmdp_test_and_clear_young(vma, address, pmdp);
78 if (young) 79 if (young)
79 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 80 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
diff --git a/mm/rmap.c b/mm/rmap.c
index c8454e06b6c8..5b5ad584ffb7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -120,6 +120,21 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
120 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 120 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
121} 121}
122 122
123static void anon_vma_chain_link(struct vm_area_struct *vma,
124 struct anon_vma_chain *avc,
125 struct anon_vma *anon_vma)
126{
127 avc->vma = vma;
128 avc->anon_vma = anon_vma;
129 list_add(&avc->same_vma, &vma->anon_vma_chain);
130
131 /*
132 * It's critical to add new vmas to the tail of the anon_vma,
133 * see comment in huge_memory.c:__split_huge_page().
134 */
135 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
136}
137
123/** 138/**
124 * anon_vma_prepare - attach an anon_vma to a memory region 139 * anon_vma_prepare - attach an anon_vma to a memory region
125 * @vma: the memory region in question 140 * @vma: the memory region in question
@@ -175,10 +190,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
175 spin_lock(&mm->page_table_lock); 190 spin_lock(&mm->page_table_lock);
176 if (likely(!vma->anon_vma)) { 191 if (likely(!vma->anon_vma)) {
177 vma->anon_vma = anon_vma; 192 vma->anon_vma = anon_vma;
178 avc->anon_vma = anon_vma; 193 anon_vma_chain_link(vma, avc, anon_vma);
179 avc->vma = vma;
180 list_add(&avc->same_vma, &vma->anon_vma_chain);
181 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
182 allocated = NULL; 194 allocated = NULL;
183 avc = NULL; 195 avc = NULL;
184 } 196 }
@@ -224,21 +236,6 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
224 mutex_unlock(&root->mutex); 236 mutex_unlock(&root->mutex);
225} 237}
226 238
227static void anon_vma_chain_link(struct vm_area_struct *vma,
228 struct anon_vma_chain *avc,
229 struct anon_vma *anon_vma)
230{
231 avc->vma = vma;
232 avc->anon_vma = anon_vma;
233 list_add(&avc->same_vma, &vma->anon_vma_chain);
234
235 /*
236 * It's critical to add new vmas to the tail of the anon_vma,
237 * see comment in huge_memory.c:__split_huge_page().
238 */
239 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
240}
241
242/* 239/*
243 * Attach the anon_vmas from src to dst. 240 * Attach the anon_vmas from src to dst.
244 * Returns 0 on success, -ENOMEM on failure. 241 * Returns 0 on success, -ENOMEM on failure.
@@ -1151,10 +1148,15 @@ void page_add_new_anon_rmap(struct page *page,
1151 */ 1148 */
1152void page_add_file_rmap(struct page *page) 1149void page_add_file_rmap(struct page *page)
1153{ 1150{
1151 bool locked;
1152 unsigned long flags;
1153
1154 mem_cgroup_begin_update_page_stat(page, &locked, &flags);
1154 if (atomic_inc_and_test(&page->_mapcount)) { 1155 if (atomic_inc_and_test(&page->_mapcount)) {
1155 __inc_zone_page_state(page, NR_FILE_MAPPED); 1156 __inc_zone_page_state(page, NR_FILE_MAPPED);
1156 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); 1157 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
1157 } 1158 }
1159 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1158} 1160}
1159 1161
1160/** 1162/**
@@ -1165,9 +1167,21 @@ void page_add_file_rmap(struct page *page)
1165 */ 1167 */
1166void page_remove_rmap(struct page *page) 1168void page_remove_rmap(struct page *page)
1167{ 1169{
1170 bool anon = PageAnon(page);
1171 bool locked;
1172 unsigned long flags;
1173
1174 /*
1175 * The anon case has no mem_cgroup page_stat to update; but may
1176 * uncharge_page() below, where the lock ordering can deadlock if
1177 * we hold the lock against page_stat move: so avoid it on anon.
1178 */
1179 if (!anon)
1180 mem_cgroup_begin_update_page_stat(page, &locked, &flags);
1181
1168 /* page still mapped by someone else? */ 1182 /* page still mapped by someone else? */
1169 if (!atomic_add_negative(-1, &page->_mapcount)) 1183 if (!atomic_add_negative(-1, &page->_mapcount))
1170 return; 1184 goto out;
1171 1185
1172 /* 1186 /*
1173 * Now that the last pte has gone, s390 must transfer dirty 1187 * Now that the last pte has gone, s390 must transfer dirty
@@ -1176,7 +1190,7 @@ void page_remove_rmap(struct page *page)
1176 * not if it's in swapcache - there might be another pte slot 1190 * not if it's in swapcache - there might be another pte slot
1177 * containing the swap entry, but page not yet written to swap. 1191 * containing the swap entry, but page not yet written to swap.
1178 */ 1192 */
1179 if ((!PageAnon(page) || PageSwapCache(page)) && 1193 if ((!anon || PageSwapCache(page)) &&
1180 page_test_and_clear_dirty(page_to_pfn(page), 1)) 1194 page_test_and_clear_dirty(page_to_pfn(page), 1))
1181 set_page_dirty(page); 1195 set_page_dirty(page);
1182 /* 1196 /*
@@ -1184,8 +1198,8 @@ void page_remove_rmap(struct page *page)
1184 * and not charged by memcg for now. 1198 * and not charged by memcg for now.
1185 */ 1199 */
1186 if (unlikely(PageHuge(page))) 1200 if (unlikely(PageHuge(page)))
1187 return; 1201 goto out;
1188 if (PageAnon(page)) { 1202 if (anon) {
1189 mem_cgroup_uncharge_page(page); 1203 mem_cgroup_uncharge_page(page);
1190 if (!PageTransHuge(page)) 1204 if (!PageTransHuge(page))
1191 __dec_zone_page_state(page, NR_ANON_PAGES); 1205 __dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1205,6 +1219,9 @@ void page_remove_rmap(struct page *page)
1205 * Leaving it set also helps swapoff to reinstate ptes 1219 * Leaving it set also helps swapoff to reinstate ptes
1206 * faster for those pages still in swapcache. 1220 * faster for those pages still in swapcache.
1207 */ 1221 */
1222out:
1223 if (!anon)
1224 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1208} 1225}
1209 1226
1210/* 1227/*
@@ -1282,7 +1299,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1282 } 1299 }
1283 dec_mm_counter(mm, MM_ANONPAGES); 1300 dec_mm_counter(mm, MM_ANONPAGES);
1284 inc_mm_counter(mm, MM_SWAPENTS); 1301 inc_mm_counter(mm, MM_SWAPENTS);
1285 } else if (PAGE_MIGRATION) { 1302 } else if (IS_ENABLED(CONFIG_MIGRATION)) {
1286 /* 1303 /*
1287 * Store the pfn of the page in a special migration 1304 * Store the pfn of the page in a special migration
1288 * pte. do_swap_page() will wait until the migration 1305 * pte. do_swap_page() will wait until the migration
@@ -1293,7 +1310,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1293 } 1310 }
1294 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 1311 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
1295 BUG_ON(pte_file(*pte)); 1312 BUG_ON(pte_file(*pte));
1296 } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { 1313 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
1314 (TTU_ACTION(flags) == TTU_MIGRATION)) {
1297 /* Establish migration entry for a file page */ 1315 /* Establish migration entry for a file page */
1298 swp_entry_t entry; 1316 swp_entry_t entry;
1299 entry = make_migration_entry(page, pte_write(pteval)); 1317 entry = make_migration_entry(page, pte_write(pteval));
@@ -1499,7 +1517,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1499 * locking requirements of exec(), migration skips 1517 * locking requirements of exec(), migration skips
1500 * temporary VMAs until after exec() completes. 1518 * temporary VMAs until after exec() completes.
1501 */ 1519 */
1502 if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && 1520 if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
1503 is_vma_temporary_stack(vma)) 1521 is_vma_temporary_stack(vma))
1504 continue; 1522 continue;
1505 1523
diff --git a/mm/shmem.c b/mm/shmem.c
index 7a45ad004cfd..f99ff3e50bd6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1178,6 +1178,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1178static const struct inode_operations shmem_symlink_inode_operations; 1178static const struct inode_operations shmem_symlink_inode_operations;
1179static const struct inode_operations shmem_short_symlink_operations; 1179static const struct inode_operations shmem_short_symlink_operations;
1180 1180
1181#ifdef CONFIG_TMPFS_XATTR
1182static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
1183#else
1184#define shmem_initxattrs NULL
1185#endif
1186
1181static int 1187static int
1182shmem_write_begin(struct file *file, struct address_space *mapping, 1188shmem_write_begin(struct file *file, struct address_space *mapping,
1183 loff_t pos, unsigned len, unsigned flags, 1189 loff_t pos, unsigned len, unsigned flags,
@@ -1490,7 +1496,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
1490 if (inode) { 1496 if (inode) {
1491 error = security_inode_init_security(inode, dir, 1497 error = security_inode_init_security(inode, dir,
1492 &dentry->d_name, 1498 &dentry->d_name,
1493 NULL, NULL); 1499 shmem_initxattrs, NULL);
1494 if (error) { 1500 if (error) {
1495 if (error != -EOPNOTSUPP) { 1501 if (error != -EOPNOTSUPP) {
1496 iput(inode); 1502 iput(inode);
@@ -1630,7 +1636,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1630 return -ENOSPC; 1636 return -ENOSPC;
1631 1637
1632 error = security_inode_init_security(inode, dir, &dentry->d_name, 1638 error = security_inode_init_security(inode, dir, &dentry->d_name,
1633 NULL, NULL); 1639 shmem_initxattrs, NULL);
1634 if (error) { 1640 if (error) {
1635 if (error != -EOPNOTSUPP) { 1641 if (error != -EOPNOTSUPP) {
1636 iput(inode); 1642 iput(inode);
@@ -1704,6 +1710,66 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
1704 * filesystem level, though. 1710 * filesystem level, though.
1705 */ 1711 */
1706 1712
1713/*
1714 * Allocate new xattr and copy in the value; but leave the name to callers.
1715 */
1716static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size)
1717{
1718 struct shmem_xattr *new_xattr;
1719 size_t len;
1720
1721 /* wrap around? */
1722 len = sizeof(*new_xattr) + size;
1723 if (len <= sizeof(*new_xattr))
1724 return NULL;
1725
1726 new_xattr = kmalloc(len, GFP_KERNEL);
1727 if (!new_xattr)
1728 return NULL;
1729
1730 new_xattr->size = size;
1731 memcpy(new_xattr->value, value, size);
1732 return new_xattr;
1733}
1734
1735/*
1736 * Callback for security_inode_init_security() for acquiring xattrs.
1737 */
1738static int shmem_initxattrs(struct inode *inode,
1739 const struct xattr *xattr_array,
1740 void *fs_info)
1741{
1742 struct shmem_inode_info *info = SHMEM_I(inode);
1743 const struct xattr *xattr;
1744 struct shmem_xattr *new_xattr;
1745 size_t len;
1746
1747 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
1748 new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len);
1749 if (!new_xattr)
1750 return -ENOMEM;
1751
1752 len = strlen(xattr->name) + 1;
1753 new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
1754 GFP_KERNEL);
1755 if (!new_xattr->name) {
1756 kfree(new_xattr);
1757 return -ENOMEM;
1758 }
1759
1760 memcpy(new_xattr->name, XATTR_SECURITY_PREFIX,
1761 XATTR_SECURITY_PREFIX_LEN);
1762 memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN,
1763 xattr->name, len);
1764
1765 spin_lock(&info->lock);
1766 list_add(&new_xattr->list, &info->xattr_list);
1767 spin_unlock(&info->lock);
1768 }
1769
1770 return 0;
1771}
1772
1707static int shmem_xattr_get(struct dentry *dentry, const char *name, 1773static int shmem_xattr_get(struct dentry *dentry, const char *name,
1708 void *buffer, size_t size) 1774 void *buffer, size_t size)
1709{ 1775{
@@ -1731,24 +1797,17 @@ static int shmem_xattr_get(struct dentry *dentry, const char *name,
1731 return ret; 1797 return ret;
1732} 1798}
1733 1799
1734static int shmem_xattr_set(struct dentry *dentry, const char *name, 1800static int shmem_xattr_set(struct inode *inode, const char *name,
1735 const void *value, size_t size, int flags) 1801 const void *value, size_t size, int flags)
1736{ 1802{
1737 struct inode *inode = dentry->d_inode;
1738 struct shmem_inode_info *info = SHMEM_I(inode); 1803 struct shmem_inode_info *info = SHMEM_I(inode);
1739 struct shmem_xattr *xattr; 1804 struct shmem_xattr *xattr;
1740 struct shmem_xattr *new_xattr = NULL; 1805 struct shmem_xattr *new_xattr = NULL;
1741 size_t len;
1742 int err = 0; 1806 int err = 0;
1743 1807
1744 /* value == NULL means remove */ 1808 /* value == NULL means remove */
1745 if (value) { 1809 if (value) {
1746 /* wrap around? */ 1810 new_xattr = shmem_xattr_alloc(value, size);
1747 len = sizeof(*new_xattr) + size;
1748 if (len <= sizeof(*new_xattr))
1749 return -ENOMEM;
1750
1751 new_xattr = kmalloc(len, GFP_KERNEL);
1752 if (!new_xattr) 1811 if (!new_xattr)
1753 return -ENOMEM; 1812 return -ENOMEM;
1754 1813
@@ -1757,9 +1816,6 @@ static int shmem_xattr_set(struct dentry *dentry, const char *name,
1757 kfree(new_xattr); 1816 kfree(new_xattr);
1758 return -ENOMEM; 1817 return -ENOMEM;
1759 } 1818 }
1760
1761 new_xattr->size = size;
1762 memcpy(new_xattr->value, value, size);
1763 } 1819 }
1764 1820
1765 spin_lock(&info->lock); 1821 spin_lock(&info->lock);
@@ -1858,7 +1914,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name,
1858 if (size == 0) 1914 if (size == 0)
1859 value = ""; /* empty EA, do not remove */ 1915 value = ""; /* empty EA, do not remove */
1860 1916
1861 return shmem_xattr_set(dentry, name, value, size, flags); 1917 return shmem_xattr_set(dentry->d_inode, name, value, size, flags);
1862 1918
1863} 1919}
1864 1920
@@ -1878,7 +1934,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name)
1878 if (err) 1934 if (err)
1879 return err; 1935 return err;
1880 1936
1881 return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE); 1937 return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE);
1882} 1938}
1883 1939
1884static bool xattr_is_trusted(const char *name) 1940static bool xattr_is_trusted(const char *name)
diff --git a/mm/slab.c b/mm/slab.c
index f0bd7857ab3b..29c8716eb7a9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3284,12 +3284,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3284 if (in_interrupt() || (flags & __GFP_THISNODE)) 3284 if (in_interrupt() || (flags & __GFP_THISNODE))
3285 return NULL; 3285 return NULL;
3286 nid_alloc = nid_here = numa_mem_id(); 3286 nid_alloc = nid_here = numa_mem_id();
3287 get_mems_allowed();
3288 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3287 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3289 nid_alloc = cpuset_slab_spread_node(); 3288 nid_alloc = cpuset_slab_spread_node();
3290 else if (current->mempolicy) 3289 else if (current->mempolicy)
3291 nid_alloc = slab_node(current->mempolicy); 3290 nid_alloc = slab_node(current->mempolicy);
3292 put_mems_allowed();
3293 if (nid_alloc != nid_here) 3291 if (nid_alloc != nid_here)
3294 return ____cache_alloc_node(cachep, flags, nid_alloc); 3292 return ____cache_alloc_node(cachep, flags, nid_alloc);
3295 return NULL; 3293 return NULL;
@@ -3312,14 +3310,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3312 enum zone_type high_zoneidx = gfp_zone(flags); 3310 enum zone_type high_zoneidx = gfp_zone(flags);
3313 void *obj = NULL; 3311 void *obj = NULL;
3314 int nid; 3312 int nid;
3313 unsigned int cpuset_mems_cookie;
3315 3314
3316 if (flags & __GFP_THISNODE) 3315 if (flags & __GFP_THISNODE)
3317 return NULL; 3316 return NULL;
3318 3317
3319 get_mems_allowed();
3320 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3321 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3318 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3322 3319
3320retry_cpuset:
3321 cpuset_mems_cookie = get_mems_allowed();
3322 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3323
3323retry: 3324retry:
3324 /* 3325 /*
3325 * Look through allowed nodes for objects available 3326 * Look through allowed nodes for objects available
@@ -3372,7 +3373,9 @@ retry:
3372 } 3373 }
3373 } 3374 }
3374 } 3375 }
3375 put_mems_allowed(); 3376
3377 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj))
3378 goto retry_cpuset;
3376 return obj; 3379 return obj;
3377} 3380}
3378 3381
diff --git a/mm/slub.c b/mm/slub.c
index 4907563ef7ff..f4a6229848fd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1581,6 +1581,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1581 struct zone *zone; 1581 struct zone *zone;
1582 enum zone_type high_zoneidx = gfp_zone(flags); 1582 enum zone_type high_zoneidx = gfp_zone(flags);
1583 void *object; 1583 void *object;
1584 unsigned int cpuset_mems_cookie;
1584 1585
1585 /* 1586 /*
1586 * The defrag ratio allows a configuration of the tradeoffs between 1587 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1604,23 +1605,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1604 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1605 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1605 return NULL; 1606 return NULL;
1606 1607
1607 get_mems_allowed(); 1608 do {
1608 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1609 cpuset_mems_cookie = get_mems_allowed();
1609 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1610 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1610 struct kmem_cache_node *n; 1611 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1611 1612 struct kmem_cache_node *n;
1612 n = get_node(s, zone_to_nid(zone)); 1613
1613 1614 n = get_node(s, zone_to_nid(zone));
1614 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1615
1615 n->nr_partial > s->min_partial) { 1616 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1616 object = get_partial_node(s, n, c); 1617 n->nr_partial > s->min_partial) {
1617 if (object) { 1618 object = get_partial_node(s, n, c);
1618 put_mems_allowed(); 1619 if (object) {
1619 return object; 1620 /*
1621 * Return the object even if
1622 * put_mems_allowed indicated that
1623 * the cpuset mems_allowed was
1624 * updated in parallel. It's a
1625 * harmless race between the alloc
1626 * and the cpuset update.
1627 */
1628 put_mems_allowed(cpuset_mems_cookie);
1629 return object;
1630 }
1620 } 1631 }
1621 } 1632 }
1622 } 1633 } while (!put_mems_allowed(cpuset_mems_cookie));
1623 put_mems_allowed();
1624#endif 1634#endif
1625 return NULL; 1635 return NULL;
1626} 1636}
diff --git a/mm/sparse.c b/mm/sparse.c
index 61d7cde23111..a8bc7d364deb 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
353 353
354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
355 usemap_count); 355 usemap_count);
356 if (usemap) { 356 if (!usemap) {
357 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 357 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count);
358 if (!present_section_nr(pnum)) 358 if (!usemap) {
359 continue; 359 printk(KERN_WARNING "%s: allocation failed\n", __func__);
360 usemap_map[pnum] = usemap; 360 return;
361 usemap += size;
362 } 361 }
363 return;
364 } 362 }
365 363
366 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); 364 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
367 if (usemap) { 365 if (!present_section_nr(pnum))
368 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 366 continue;
369 if (!present_section_nr(pnum)) 367 usemap_map[pnum] = usemap;
370 continue; 368 usemap += size;
371 usemap_map[pnum] = usemap; 369 check_usemap_section_nr(nodeid, usemap_map[pnum]);
372 usemap += size;
373 check_usemap_section_nr(nodeid, usemap_map[pnum]);
374 }
375 return;
376 } 370 }
377
378 printk(KERN_WARNING "%s: allocation failed\n", __func__);
379} 371}
380 372
381#ifndef CONFIG_SPARSEMEM_VMEMMAP 373#ifndef CONFIG_SPARSEMEM_VMEMMAP
diff --git a/mm/swap.c b/mm/swap.c
index 14380e9fbe33..5c13f1338972 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -496,7 +496,7 @@ static void lru_deactivate_fn(struct page *page, void *arg)
496 * Either "cpu" is the current CPU, and preemption has already been 496 * Either "cpu" is the current CPU, and preemption has already been
497 * disabled; or "cpu" is being hot-unplugged, and is already dead. 497 * disabled; or "cpu" is being hot-unplugged, and is already dead.
498 */ 498 */
499static void drain_cpu_pagevecs(int cpu) 499void lru_add_drain_cpu(int cpu)
500{ 500{
501 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); 501 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
502 struct pagevec *pvec; 502 struct pagevec *pvec;
@@ -553,7 +553,7 @@ void deactivate_page(struct page *page)
553 553
554void lru_add_drain(void) 554void lru_add_drain(void)
555{ 555{
556 drain_cpu_pagevecs(get_cpu()); 556 lru_add_drain_cpu(get_cpu());
557 put_cpu(); 557 put_cpu();
558} 558}
559 559
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ea6b32d61873..9d3dd3763cf7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -372,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
372struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, 372struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
373 struct vm_area_struct *vma, unsigned long addr) 373 struct vm_area_struct *vma, unsigned long addr)
374{ 374{
375 int nr_pages;
376 struct page *page; 375 struct page *page;
377 unsigned long offset; 376 unsigned long offset = swp_offset(entry);
378 unsigned long end_offset; 377 unsigned long start_offset, end_offset;
378 unsigned long mask = (1UL << page_cluster) - 1;
379 379
380 /* 380 /* Read a page_cluster sized and aligned cluster around offset. */
381 * Get starting offset for readaround, and number of pages to read. 381 start_offset = offset & ~mask;
382 * Adjust starting address by readbehind (for NUMA interleave case)? 382 end_offset = offset | mask;
383 * No, it's very unlikely that swap layout would follow vma layout, 383 if (!start_offset) /* First page is swap header. */
384 * more likely that neighbouring swap pages came from the same node: 384 start_offset++;
385 * so use the same "addr" to choose the same node for each swap read. 385
386 */ 386 for (offset = start_offset; offset <= end_offset ; offset++) {
387 nr_pages = valid_swaphandles(entry, &offset);
388 for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
389 /* Ok, do the async read-ahead now */ 387 /* Ok, do the async read-ahead now */
390 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
391 gfp_mask, vma, addr); 389 gfp_mask, vma, addr);
392 if (!page) 390 if (!page)
393 break; 391 continue;
394 page_cache_release(page); 392 page_cache_release(page);
395 } 393 }
396 lru_add_drain(); /* Push any new pages onto the LRU now */ 394 lru_add_drain(); /* Push any new pages onto the LRU now */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6bf67ab6e469..dae42f380d6e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
932 pmd = pmd_offset(pud, addr); 932 pmd = pmd_offset(pud, addr);
933 do { 933 do {
934 next = pmd_addr_end(addr, end); 934 next = pmd_addr_end(addr, end);
935 if (unlikely(pmd_trans_huge(*pmd))) 935 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
936 continue;
937 if (pmd_none_or_clear_bad(pmd))
938 continue; 936 continue;
939 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 937 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
940 if (ret) 938 if (ret)
@@ -2107,7 +2105,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2107 p->flags |= SWP_SOLIDSTATE; 2105 p->flags |= SWP_SOLIDSTATE;
2108 p->cluster_next = 1 + (random32() % p->highest_bit); 2106 p->cluster_next = 1 + (random32() % p->highest_bit);
2109 } 2107 }
2110 if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD)) 2108 if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0)
2111 p->flags |= SWP_DISCARDABLE; 2109 p->flags |= SWP_DISCARDABLE;
2112 } 2110 }
2113 2111
@@ -2292,58 +2290,6 @@ int swapcache_prepare(swp_entry_t entry)
2292} 2290}
2293 2291
2294/* 2292/*
2295 * swap_lock prevents swap_map being freed. Don't grab an extra
2296 * reference on the swaphandle, it doesn't matter if it becomes unused.
2297 */
2298int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
2299{
2300 struct swap_info_struct *si;
2301 int our_page_cluster = page_cluster;
2302 pgoff_t target, toff;
2303 pgoff_t base, end;
2304 int nr_pages = 0;
2305
2306 if (!our_page_cluster) /* no readahead */
2307 return 0;
2308
2309 si = swap_info[swp_type(entry)];
2310 target = swp_offset(entry);
2311 base = (target >> our_page_cluster) << our_page_cluster;
2312 end = base + (1 << our_page_cluster);
2313 if (!base) /* first page is swap header */
2314 base++;
2315
2316 spin_lock(&swap_lock);
2317 if (end > si->max) /* don't go beyond end of map */
2318 end = si->max;
2319
2320 /* Count contiguous allocated slots above our target */
2321 for (toff = target; ++toff < end; nr_pages++) {
2322 /* Don't read in free or bad pages */
2323 if (!si->swap_map[toff])
2324 break;
2325 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2326 break;
2327 }
2328 /* Count contiguous allocated slots below our target */
2329 for (toff = target; --toff >= base; nr_pages++) {
2330 /* Don't read in free or bad pages */
2331 if (!si->swap_map[toff])
2332 break;
2333 if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD)
2334 break;
2335 }
2336 spin_unlock(&swap_lock);
2337
2338 /*
2339 * Indicate starting offset, and return number of pages to get:
2340 * if only 1, say 0, since there's then no readahead to be done.
2341 */
2342 *offset = ++toff;
2343 return nr_pages? ++nr_pages: 0;
2344}
2345
2346/*
2347 * add_swap_count_continuation - called when a swap count is duplicated 2293 * add_swap_count_continuation - called when a swap count is duplicated
2348 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 2294 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
2349 * page of the original vmalloc'ed swap_map, to hold the continuation count 2295 * page of the original vmalloc'ed swap_map, to hold the continuation count
diff --git a/mm/util.c b/mm/util.c
index 136ac4f322b8..ae962b31de88 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -239,6 +239,47 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
239 next->vm_prev = vma; 239 next->vm_prev = vma;
240} 240}
241 241
242/* Check if the vma is being used as a stack by this task */
243static int vm_is_stack_for_task(struct task_struct *t,
244 struct vm_area_struct *vma)
245{
246 return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
247}
248
249/*
250 * Check if the vma is being used as a stack.
251 * If is_group is non-zero, check in the entire thread group or else
252 * just check in the current task. Returns the pid of the task that
253 * the vma is stack for.
254 */
255pid_t vm_is_stack(struct task_struct *task,
256 struct vm_area_struct *vma, int in_group)
257{
258 pid_t ret = 0;
259
260 if (vm_is_stack_for_task(task, vma))
261 return task->pid;
262
263 if (in_group) {
264 struct task_struct *t;
265 rcu_read_lock();
266 if (!pid_alive(task))
267 goto done;
268
269 t = task;
270 do {
271 if (vm_is_stack_for_task(t, vma)) {
272 ret = t->pid;
273 goto done;
274 }
275 } while_each_thread(task, t);
276done:
277 rcu_read_unlock();
278 }
279
280 return ret;
281}
282
242#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 283#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
243void arch_pick_mmap_layout(struct mm_struct *mm) 284void arch_pick_mmap_layout(struct mm_struct *mm)
244{ 285{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c52b23552659..49f15ef0a99a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1138,7 +1138,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1138 * @mz: The mem_cgroup_zone to pull pages from. 1138 * @mz: The mem_cgroup_zone to pull pages from.
1139 * @dst: The temp list to put pages on to. 1139 * @dst: The temp list to put pages on to.
1140 * @nr_scanned: The number of pages that were scanned. 1140 * @nr_scanned: The number of pages that were scanned.
1141 * @order: The caller's attempted allocation order 1141 * @sc: The scan_control struct for this reclaim session
1142 * @mode: One of the LRU isolation modes 1142 * @mode: One of the LRU isolation modes
1143 * @active: True [1] if isolating active pages 1143 * @active: True [1] if isolating active pages
1144 * @file: True [1] if isolating file [!anon] pages 1144 * @file: True [1] if isolating file [!anon] pages
@@ -1147,8 +1147,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1147 */ 1147 */
1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1149 struct mem_cgroup_zone *mz, struct list_head *dst, 1149 struct mem_cgroup_zone *mz, struct list_head *dst,
1150 unsigned long *nr_scanned, int order, isolate_mode_t mode, 1150 unsigned long *nr_scanned, struct scan_control *sc,
1151 int active, int file) 1151 isolate_mode_t mode, int active, int file)
1152{ 1152{
1153 struct lruvec *lruvec; 1153 struct lruvec *lruvec;
1154 struct list_head *src; 1154 struct list_head *src;
@@ -1194,7 +1194,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1194 BUG(); 1194 BUG();
1195 } 1195 }
1196 1196
1197 if (!order) 1197 if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
1198 continue; 1198 continue;
1199 1199
1200 /* 1200 /*
@@ -1208,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1208 */ 1208 */
1209 zone_id = page_zone_id(page); 1209 zone_id = page_zone_id(page);
1210 page_pfn = page_to_pfn(page); 1210 page_pfn = page_to_pfn(page);
1211 pfn = page_pfn & ~((1 << order) - 1); 1211 pfn = page_pfn & ~((1 << sc->order) - 1);
1212 end_pfn = pfn + (1 << order); 1212 end_pfn = pfn + (1 << sc->order);
1213 for (; pfn < end_pfn; pfn++) { 1213 for (; pfn < end_pfn; pfn++) {
1214 struct page *cursor_page; 1214 struct page *cursor_page;
1215 1215
@@ -1275,7 +1275,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1275 1275
1276 *nr_scanned = scan; 1276 *nr_scanned = scan;
1277 1277
1278 trace_mm_vmscan_lru_isolate(order, 1278 trace_mm_vmscan_lru_isolate(sc->order,
1279 nr_to_scan, scan, 1279 nr_to_scan, scan,
1280 nr_taken, 1280 nr_taken,
1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, 1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
@@ -1413,7 +1413,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1413 unsigned long *nr_anon, 1413 unsigned long *nr_anon,
1414 unsigned long *nr_file) 1414 unsigned long *nr_file)
1415{ 1415{
1416 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1417 struct zone *zone = mz->zone; 1416 struct zone *zone = mz->zone;
1418 unsigned int count[NR_LRU_LISTS] = { 0, }; 1417 unsigned int count[NR_LRU_LISTS] = { 0, };
1419 unsigned long nr_active = 0; 1418 unsigned long nr_active = 0;
@@ -1434,6 +1433,7 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1434 count[lru] += numpages; 1433 count[lru] += numpages;
1435 } 1434 }
1436 1435
1436 preempt_disable();
1437 __count_vm_events(PGDEACTIVATE, nr_active); 1437 __count_vm_events(PGDEACTIVATE, nr_active);
1438 1438
1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE, 1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
@@ -1448,8 +1448,9 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1450 1450
1451 reclaim_stat->recent_scanned[0] += *nr_anon; 1451 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1452 reclaim_stat->recent_scanned[1] += *nr_file; 1452 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1453 preempt_enable();
1453} 1454}
1454 1455
1455/* 1456/*
@@ -1509,8 +1510,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1509 unsigned long nr_file; 1510 unsigned long nr_file;
1510 unsigned long nr_dirty = 0; 1511 unsigned long nr_dirty = 0;
1511 unsigned long nr_writeback = 0; 1512 unsigned long nr_writeback = 0;
1512 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; 1513 isolate_mode_t isolate_mode = ISOLATE_INACTIVE;
1513 struct zone *zone = mz->zone; 1514 struct zone *zone = mz->zone;
1515 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1514 1516
1515 while (unlikely(too_many_isolated(zone, file, sc))) { 1517 while (unlikely(too_many_isolated(zone, file, sc))) {
1516 congestion_wait(BLK_RW_ASYNC, HZ/10); 1518 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1522,20 +1524,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1522 1524
1523 set_reclaim_mode(priority, sc, false); 1525 set_reclaim_mode(priority, sc, false);
1524 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) 1526 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1525 reclaim_mode |= ISOLATE_ACTIVE; 1527 isolate_mode |= ISOLATE_ACTIVE;
1526 1528
1527 lru_add_drain(); 1529 lru_add_drain();
1528 1530
1529 if (!sc->may_unmap) 1531 if (!sc->may_unmap)
1530 reclaim_mode |= ISOLATE_UNMAPPED; 1532 isolate_mode |= ISOLATE_UNMAPPED;
1531 if (!sc->may_writepage) 1533 if (!sc->may_writepage)
1532 reclaim_mode |= ISOLATE_CLEAN; 1534 isolate_mode |= ISOLATE_CLEAN;
1533 1535
1534 spin_lock_irq(&zone->lru_lock); 1536 spin_lock_irq(&zone->lru_lock);
1535 1537
1536 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, 1538 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned,
1537 &nr_scanned, sc->order, 1539 sc, isolate_mode, 0, file);
1538 reclaim_mode, 0, file);
1539 if (global_reclaim(sc)) { 1540 if (global_reclaim(sc)) {
1540 zone->pages_scanned += nr_scanned; 1541 zone->pages_scanned += nr_scanned;
1541 if (current_is_kswapd()) 1542 if (current_is_kswapd())
@@ -1545,19 +1546,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1545 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1546 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1546 nr_scanned); 1547 nr_scanned);
1547 } 1548 }
1549 spin_unlock_irq(&zone->lru_lock);
1548 1550
1549 if (nr_taken == 0) { 1551 if (nr_taken == 0)
1550 spin_unlock_irq(&zone->lru_lock);
1551 return 0; 1552 return 0;
1552 }
1553 1553
1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); 1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file);
1555 1555
1556 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
1557 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1558
1559 spin_unlock_irq(&zone->lru_lock);
1560
1561 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, 1556 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1562 &nr_dirty, &nr_writeback); 1557 &nr_dirty, &nr_writeback);
1563 1558
@@ -1570,6 +1565,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1570 1565
1571 spin_lock_irq(&zone->lru_lock); 1566 spin_lock_irq(&zone->lru_lock);
1572 1567
1568 reclaim_stat->recent_scanned[0] += nr_anon;
1569 reclaim_stat->recent_scanned[1] += nr_file;
1570
1573 if (current_is_kswapd()) 1571 if (current_is_kswapd())
1574 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1572 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1575 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1573 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
@@ -1643,18 +1641,6 @@ static void move_active_pages_to_lru(struct zone *zone,
1643 unsigned long pgmoved = 0; 1641 unsigned long pgmoved = 0;
1644 struct page *page; 1642 struct page *page;
1645 1643
1646 if (buffer_heads_over_limit) {
1647 spin_unlock_irq(&zone->lru_lock);
1648 list_for_each_entry(page, list, lru) {
1649 if (page_has_private(page) && trylock_page(page)) {
1650 if (page_has_private(page))
1651 try_to_release_page(page, 0);
1652 unlock_page(page);
1653 }
1654 }
1655 spin_lock_irq(&zone->lru_lock);
1656 }
1657
1658 while (!list_empty(list)) { 1644 while (!list_empty(list)) {
1659 struct lruvec *lruvec; 1645 struct lruvec *lruvec;
1660 1646
@@ -1699,21 +1685,22 @@ static void shrink_active_list(unsigned long nr_to_scan,
1699 struct page *page; 1685 struct page *page;
1700 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1686 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz);
1701 unsigned long nr_rotated = 0; 1687 unsigned long nr_rotated = 0;
1702 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; 1688 isolate_mode_t isolate_mode = ISOLATE_ACTIVE;
1703 struct zone *zone = mz->zone; 1689 struct zone *zone = mz->zone;
1704 1690
1705 lru_add_drain(); 1691 lru_add_drain();
1706 1692
1693 reset_reclaim_mode(sc);
1694
1707 if (!sc->may_unmap) 1695 if (!sc->may_unmap)
1708 reclaim_mode |= ISOLATE_UNMAPPED; 1696 isolate_mode |= ISOLATE_UNMAPPED;
1709 if (!sc->may_writepage) 1697 if (!sc->may_writepage)
1710 reclaim_mode |= ISOLATE_CLEAN; 1698 isolate_mode |= ISOLATE_CLEAN;
1711 1699
1712 spin_lock_irq(&zone->lru_lock); 1700 spin_lock_irq(&zone->lru_lock);
1713 1701
1714 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, 1702 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc,
1715 &nr_scanned, sc->order, 1703 isolate_mode, 1, file);
1716 reclaim_mode, 1, file);
1717 if (global_reclaim(sc)) 1704 if (global_reclaim(sc))
1718 zone->pages_scanned += nr_scanned; 1705 zone->pages_scanned += nr_scanned;
1719 1706
@@ -1737,6 +1724,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
1737 continue; 1724 continue;
1738 } 1725 }
1739 1726
1727 if (unlikely(buffer_heads_over_limit)) {
1728 if (page_has_private(page) && trylock_page(page)) {
1729 if (page_has_private(page))
1730 try_to_release_page(page, 0);
1731 unlock_page(page);
1732 }
1733 }
1734
1740 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { 1735 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) {
1741 nr_rotated += hpage_nr_pages(page); 1736 nr_rotated += hpage_nr_pages(page);
1742 /* 1737 /*
@@ -2112,7 +2107,12 @@ restart:
2112 * with multiple processes reclaiming pages, the total 2107 * with multiple processes reclaiming pages, the total
2113 * freeing target can get unreasonably large. 2108 * freeing target can get unreasonably large.
2114 */ 2109 */
2115 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 2110 if (nr_reclaimed >= nr_to_reclaim)
2111 nr_to_reclaim = 0;
2112 else
2113 nr_to_reclaim -= nr_reclaimed;
2114
2115 if (!nr_to_reclaim && priority < DEF_PRIORITY)
2116 break; 2116 break;
2117 } 2117 }
2118 blk_finish_plug(&plug); 2118 blk_finish_plug(&plug);
@@ -2195,7 +2195,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2195 * If compaction is deferred, reclaim up to a point where 2195 * If compaction is deferred, reclaim up to a point where
2196 * compaction will have a chance of success when re-enabled 2196 * compaction will have a chance of success when re-enabled
2197 */ 2197 */
2198 if (compaction_deferred(zone)) 2198 if (compaction_deferred(zone, sc->order))
2199 return watermark_ok; 2199 return watermark_ok;
2200 2200
2201 /* If compaction is not ready to start, keep reclaiming */ 2201 /* If compaction is not ready to start, keep reclaiming */
@@ -2235,6 +2235,14 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2235 unsigned long nr_soft_scanned; 2235 unsigned long nr_soft_scanned;
2236 bool aborted_reclaim = false; 2236 bool aborted_reclaim = false;
2237 2237
2238 /*
2239 * If the number of buffer_heads in the machine exceeds the maximum
2240 * allowed level, force direct reclaim to scan the highmem zone as
2241 * highmem pages could be pinning lowmem pages storing buffer_heads
2242 */
2243 if (buffer_heads_over_limit)
2244 sc->gfp_mask |= __GFP_HIGHMEM;
2245
2238 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2246 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2239 gfp_zone(sc->gfp_mask), sc->nodemask) { 2247 gfp_zone(sc->gfp_mask), sc->nodemask) {
2240 if (!populated_zone(zone)) 2248 if (!populated_zone(zone))
@@ -2255,8 +2263,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2255 * Even though compaction is invoked for any 2263 * Even though compaction is invoked for any
2256 * non-zero order, only frequent costly order 2264 * non-zero order, only frequent costly order
2257 * reclamation is disruptive enough to become a 2265 * reclamation is disruptive enough to become a
2258 * noticable problem, like transparent huge page 2266 * noticeable problem, like transparent huge
2259 * allocations. 2267 * page allocations.
2260 */ 2268 */
2261 if (compaction_ready(zone, sc)) { 2269 if (compaction_ready(zone, sc)) {
2262 aborted_reclaim = true; 2270 aborted_reclaim = true;
@@ -2337,7 +2345,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2337 unsigned long writeback_threshold; 2345 unsigned long writeback_threshold;
2338 bool aborted_reclaim; 2346 bool aborted_reclaim;
2339 2347
2340 get_mems_allowed();
2341 delayacct_freepages_start(); 2348 delayacct_freepages_start();
2342 2349
2343 if (global_reclaim(sc)) 2350 if (global_reclaim(sc))
@@ -2401,7 +2408,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2401 2408
2402out: 2409out:
2403 delayacct_freepages_end(); 2410 delayacct_freepages_end();
2404 put_mems_allowed();
2405 2411
2406 if (sc->nr_reclaimed) 2412 if (sc->nr_reclaimed)
2407 return sc->nr_reclaimed; 2413 return sc->nr_reclaimed;
@@ -2724,6 +2730,17 @@ loop_again:
2724 */ 2730 */
2725 age_active_anon(zone, &sc, priority); 2731 age_active_anon(zone, &sc, priority);
2726 2732
2733 /*
2734 * If the number of buffer_heads in the machine
2735 * exceeds the maximum allowed level and this node
2736 * has a highmem zone, force kswapd to reclaim from
2737 * it to relieve lowmem pressure.
2738 */
2739 if (buffer_heads_over_limit && is_highmem_idx(i)) {
2740 end_zone = i;
2741 break;
2742 }
2743
2727 if (!zone_watermark_ok_safe(zone, order, 2744 if (!zone_watermark_ok_safe(zone, order,
2728 high_wmark_pages(zone), 0, 0)) { 2745 high_wmark_pages(zone), 0, 0)) {
2729 end_zone = i; 2746 end_zone = i;
@@ -2753,7 +2770,7 @@ loop_again:
2753 */ 2770 */
2754 for (i = 0; i <= end_zone; i++) { 2771 for (i = 0; i <= end_zone; i++) {
2755 struct zone *zone = pgdat->node_zones + i; 2772 struct zone *zone = pgdat->node_zones + i;
2756 int nr_slab; 2773 int nr_slab, testorder;
2757 unsigned long balance_gap; 2774 unsigned long balance_gap;
2758 2775
2759 if (!populated_zone(zone)) 2776 if (!populated_zone(zone))
@@ -2786,7 +2803,21 @@ loop_again:
2786 (zone->present_pages + 2803 (zone->present_pages +
2787 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / 2804 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2788 KSWAPD_ZONE_BALANCE_GAP_RATIO); 2805 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2789 if (!zone_watermark_ok_safe(zone, order, 2806 /*
2807 * Kswapd reclaims only single pages with compaction
2808 * enabled. Trying too hard to reclaim until contiguous
2809 * free pages have become available can hurt performance
2810 * by evicting too much useful data from memory.
2811 * Do not reclaim more than needed for compaction.
2812 */
2813 testorder = order;
2814 if (COMPACTION_BUILD && order &&
2815 compaction_suitable(zone, order) !=
2816 COMPACT_SKIPPED)
2817 testorder = 0;
2818
2819 if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
2820 !zone_watermark_ok_safe(zone, order,
2790 high_wmark_pages(zone) + balance_gap, 2821 high_wmark_pages(zone) + balance_gap,
2791 end_zone, 0)) { 2822 end_zone, 0)) {
2792 shrink_zone(priority, zone, &sc); 2823 shrink_zone(priority, zone, &sc);
@@ -2815,7 +2846,7 @@ loop_again:
2815 continue; 2846 continue;
2816 } 2847 }
2817 2848
2818 if (!zone_watermark_ok_safe(zone, order, 2849 if (!zone_watermark_ok_safe(zone, testorder,
2819 high_wmark_pages(zone), end_zone, 0)) { 2850 high_wmark_pages(zone), end_zone, 0)) {
2820 all_zones_ok = 0; 2851 all_zones_ok = 0;
2821 /* 2852 /*
@@ -2903,6 +2934,8 @@ out:
2903 * and it is potentially going to sleep here. 2934 * and it is potentially going to sleep here.
2904 */ 2935 */
2905 if (order) { 2936 if (order) {
2937 int zones_need_compaction = 1;
2938
2906 for (i = 0; i <= end_zone; i++) { 2939 for (i = 0; i <= end_zone; i++) {
2907 struct zone *zone = pgdat->node_zones + i; 2940 struct zone *zone = pgdat->node_zones + i;
2908 2941
@@ -2912,6 +2945,10 @@ out:
2912 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2945 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2913 continue; 2946 continue;
2914 2947
2948 /* Would compaction fail due to lack of free memory? */
2949 if (compaction_suitable(zone, order) == COMPACT_SKIPPED)
2950 goto loop_again;
2951
2915 /* Confirm the zone is balanced for order-0 */ 2952 /* Confirm the zone is balanced for order-0 */
2916 if (!zone_watermark_ok(zone, 0, 2953 if (!zone_watermark_ok(zone, 0,
2917 high_wmark_pages(zone), 0, 0)) { 2954 high_wmark_pages(zone), 0, 0)) {
@@ -2919,11 +2956,17 @@ out:
2919 goto loop_again; 2956 goto loop_again;
2920 } 2957 }
2921 2958
2959 /* Check if the memory needs to be defragmented. */
2960 if (zone_watermark_ok(zone, order,
2961 low_wmark_pages(zone), *classzone_idx, 0))
2962 zones_need_compaction = 0;
2963
2922 /* If balanced, clear the congested flag */ 2964 /* If balanced, clear the congested flag */
2923 zone_clear_flag(zone, ZONE_CONGESTED); 2965 zone_clear_flag(zone, ZONE_CONGESTED);
2924 if (i <= *classzone_idx)
2925 balanced += zone->present_pages;
2926 } 2966 }
2967
2968 if (zones_need_compaction)
2969 compact_pgdat(pgdat, order);
2927 } 2970 }
2928 2971
2929 /* 2972 /*