author     Linus Torvalds <torvalds@linux-foundation.org>  2012-03-22 12:04:48 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2012-03-22 12:04:48 -0400
commit     95211279c5ad00a317c98221d7e4365e02f20836 (patch)
tree       2ddc8625378d2915b8c96392f3cf6663b705ed55 /mm
parent     5375871d432ae9fc581014ac117b96aaee3cd0c7 (diff)
parent     12724850e8064f64b6223d26d78c0597c742c65a (diff)
Merge branch 'akpm' (Andrew's patch-bomb)
Merge first batch of patches from Andrew Morton:
"A few misc things and all the MM queue"
* emailed from Andrew Morton <akpm@linux-foundation.org>: (92 commits)
memcg: avoid THP split in task migration
thp: add HPAGE_PMD_* definitions for !CONFIG_TRANSPARENT_HUGEPAGE
memcg: clean up existing move charge code
mm/memcontrol.c: remove unnecessary 'break' in mem_cgroup_read()
mm/memcontrol.c: remove redundant BUG_ON() in mem_cgroup_usage_unregister_event()
mm/memcontrol.c: s/stealed/stolen/
memcg: fix performance of mem_cgroup_begin_update_page_stat()
memcg: remove PCG_FILE_MAPPED
memcg: use new logic for page stat accounting
memcg: remove PCG_MOVE_LOCK flag from page_cgroup
memcg: simplify move_account() check
memcg: remove EXPORT_SYMBOL(mem_cgroup_update_page_stat)
memcg: kill dead prev_priority stubs
memcg: remove PCG_CACHE page_cgroup flag
memcg: let css_get_next() rely upon rcu_read_lock()
cgroup: revert ss_id_lock to spinlock
idr: make idr_get_next() good for rcu_read_lock()
memcg: remove unnecessary thp check in page stat accounting
memcg: remove redundant returns
memcg: enum lru_list lru
...
Diffstat (limited to 'mm')
-rw-r--r--  mm/bootmem.c         |   5
-rw-r--r--  mm/compaction.c      |  77
-rw-r--r--  mm/filemap.c         |  20
-rw-r--r--  mm/huge_memory.c     | 125
-rw-r--r--  mm/hugetlb.c         | 184
-rw-r--r--  mm/ksm.c             |  34
-rw-r--r--  mm/memcontrol.c      | 473
-rw-r--r--  mm/memory-failure.c  |   2
-rw-r--r--  mm/memory.c          |  53
-rw-r--r--  mm/mempolicy.c       |  62
-rw-r--r--  mm/migrate.c         |  36
-rw-r--r--  mm/mincore.c         |   2
-rw-r--r--  mm/mmap.c            |  51
-rw-r--r--  mm/mmu_context.c     |   2
-rw-r--r--  mm/mprotect.c        |   2
-rw-r--r--  mm/oom_kill.c        | 166
-rw-r--r--  mm/page-writeback.c  |   1
-rw-r--r--  mm/page_alloc.c      |  58
-rw-r--r--  mm/pagewalk.c        |   2
-rw-r--r--  mm/pgtable-generic.c |   5
-rw-r--r--  mm/rmap.c            |  70
-rw-r--r--  mm/shmem.c           |  88
-rw-r--r--  mm/slab.c            |  13
-rw-r--r--  mm/slub.c            |  40
-rw-r--r--  mm/sparse.c          |  30
-rw-r--r--  mm/swap.c            |   4
-rw-r--r--  mm/swap_state.c      |  24
-rw-r--r--  mm/swapfile.c        |  58
-rw-r--r--  mm/util.c            |  41
-rw-r--r--  mm/vmscan.c          | 151
30 files changed, 1108 insertions, 771 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 668e94df8cf2..0131170c9d54 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -766,14 +766,13 @@ void * __init alloc_bootmem_section(unsigned long size, | |||
766 | unsigned long section_nr) | 766 | unsigned long section_nr) |
767 | { | 767 | { |
768 | bootmem_data_t *bdata; | 768 | bootmem_data_t *bdata; |
769 | unsigned long pfn, goal, limit; | 769 | unsigned long pfn, goal; |
770 | 770 | ||
771 | pfn = section_nr_to_pfn(section_nr); | 771 | pfn = section_nr_to_pfn(section_nr); |
772 | goal = pfn << PAGE_SHIFT; | 772 | goal = pfn << PAGE_SHIFT; |
773 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; | ||
774 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; | 773 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; |
775 | 774 | ||
776 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); | 775 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0); |
777 | } | 776 | } |
778 | #endif | 777 | #endif |
779 | 778 | ||
diff --git a/mm/compaction.c b/mm/compaction.c
index d9ebebe1a2aa..74a8c825ff28 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,7 +35,7 @@ struct compact_control { | |||
35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 35 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
36 | bool sync; /* Synchronous migration */ | 36 | bool sync; /* Synchronous migration */ |
37 | 37 | ||
38 | unsigned int order; /* order a direct compactor needs */ | 38 | int order; /* order a direct compactor needs */ |
39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 39 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
40 | struct zone *zone; | 40 | struct zone *zone; |
41 | }; | 41 | }; |
@@ -675,49 +675,71 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
675 | 675 | ||
676 | 676 | ||
677 | /* Compact all zones within a node */ | 677 | /* Compact all zones within a node */ |
678 | static int compact_node(int nid) | 678 | static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) |
679 | { | 679 | { |
680 | int zoneid; | 680 | int zoneid; |
681 | pg_data_t *pgdat; | ||
682 | struct zone *zone; | 681 | struct zone *zone; |
683 | 682 | ||
684 | if (nid < 0 || nid >= nr_node_ids || !node_online(nid)) | ||
685 | return -EINVAL; | ||
686 | pgdat = NODE_DATA(nid); | ||
687 | |||
688 | /* Flush pending updates to the LRU lists */ | ||
689 | lru_add_drain_all(); | ||
690 | |||
691 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { | 683 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { |
692 | struct compact_control cc = { | ||
693 | .nr_freepages = 0, | ||
694 | .nr_migratepages = 0, | ||
695 | .order = -1, | ||
696 | .sync = true, | ||
697 | }; | ||
698 | 684 | ||
699 | zone = &pgdat->node_zones[zoneid]; | 685 | zone = &pgdat->node_zones[zoneid]; |
700 | if (!populated_zone(zone)) | 686 | if (!populated_zone(zone)) |
701 | continue; | 687 | continue; |
702 | 688 | ||
703 | cc.zone = zone; | 689 | cc->nr_freepages = 0; |
704 | INIT_LIST_HEAD(&cc.freepages); | 690 | cc->nr_migratepages = 0; |
705 | INIT_LIST_HEAD(&cc.migratepages); | 691 | cc->zone = zone; |
706 | 692 | INIT_LIST_HEAD(&cc->freepages); | |
707 | compact_zone(zone, &cc); | 693 | INIT_LIST_HEAD(&cc->migratepages); |
694 | |||
695 | if (cc->order == -1 || !compaction_deferred(zone, cc->order)) | ||
696 | compact_zone(zone, cc); | ||
697 | |||
698 | if (cc->order > 0) { | ||
699 | int ok = zone_watermark_ok(zone, cc->order, | ||
700 | low_wmark_pages(zone), 0, 0); | ||
701 | if (ok && cc->order > zone->compact_order_failed) | ||
702 | zone->compact_order_failed = cc->order + 1; | ||
703 | /* Currently async compaction is never deferred. */ | ||
704 | else if (!ok && cc->sync) | ||
705 | defer_compaction(zone, cc->order); | ||
706 | } | ||
708 | 707 | ||
709 | VM_BUG_ON(!list_empty(&cc.freepages)); | 708 | VM_BUG_ON(!list_empty(&cc->freepages)); |
710 | VM_BUG_ON(!list_empty(&cc.migratepages)); | 709 | VM_BUG_ON(!list_empty(&cc->migratepages)); |
711 | } | 710 | } |
712 | 711 | ||
713 | return 0; | 712 | return 0; |
714 | } | 713 | } |
715 | 714 | ||
715 | int compact_pgdat(pg_data_t *pgdat, int order) | ||
716 | { | ||
717 | struct compact_control cc = { | ||
718 | .order = order, | ||
719 | .sync = false, | ||
720 | }; | ||
721 | |||
722 | return __compact_pgdat(pgdat, &cc); | ||
723 | } | ||
724 | |||
725 | static int compact_node(int nid) | ||
726 | { | ||
727 | struct compact_control cc = { | ||
728 | .order = -1, | ||
729 | .sync = true, | ||
730 | }; | ||
731 | |||
732 | return __compact_pgdat(NODE_DATA(nid), &cc); | ||
733 | } | ||
734 | |||
716 | /* Compact all nodes in the system */ | 735 | /* Compact all nodes in the system */ |
717 | static int compact_nodes(void) | 736 | static int compact_nodes(void) |
718 | { | 737 | { |
719 | int nid; | 738 | int nid; |
720 | 739 | ||
740 | /* Flush pending updates to the LRU lists */ | ||
741 | lru_add_drain_all(); | ||
742 | |||
721 | for_each_online_node(nid) | 743 | for_each_online_node(nid) |
722 | compact_node(nid); | 744 | compact_node(nid); |
723 | 745 | ||
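The hunk above splits compact_node() into a shared __compact_pgdat() worker plus two thin entry points that only differ in the compact_control they preset: compact_pgdat() with a caller-supplied order and sync=false, compact_node() with order=-1 and sync=true. Below is a minimal user-space sketch of that "one worker, preset wrappers" shape; the names and printf stand-ins are illustrative, not the kernel API.

```c
/*
 * Illustrative sketch (not kernel code): one worker that takes a
 * caller-provided control struct, plus thin wrappers that only differ
 * in how they preset that struct, mirroring __compact_pgdat() above.
 */
#include <stdbool.h>
#include <stdio.h>

struct compact_ctl {
	int order;   /* -1 means "compact everything", like the sysfs path */
	bool sync;   /* synchronous (sysfs/proc) vs. async entry */
};

static int do_compact_node(int nid, struct compact_ctl *cc)
{
	/* stand-in for walking the node's zones and calling compact_zone() */
	printf("node %d: order=%d sync=%d\n", nid, cc->order, cc->sync);
	return 0;
}

/* async entry: targeted at one allocation order */
static int compact_node_async(int nid, int order)
{
	struct compact_ctl cc = { .order = order, .sync = false };
	return do_compact_node(nid, &cc);
}

/* sysfs/proc-style entry: compact everything, synchronously */
static int compact_node_full(int nid)
{
	struct compact_ctl cc = { .order = -1, .sync = true };
	return do_compact_node(nid, &cc);
}

int main(void)
{
	compact_node_async(0, 3);
	compact_node_full(0);
	return 0;
}
```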
@@ -750,7 +772,14 @@ ssize_t sysfs_compact_node(struct device *dev, | |||
750 | struct device_attribute *attr, | 772 | struct device_attribute *attr, |
751 | const char *buf, size_t count) | 773 | const char *buf, size_t count) |
752 | { | 774 | { |
753 | compact_node(dev->id); | 775 | int nid = dev->id; |
776 | |||
777 | if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { | ||
778 | /* Flush pending updates to the LRU lists */ | ||
779 | lru_add_drain_all(); | ||
780 | |||
781 | compact_node(nid); | ||
782 | } | ||
754 | 783 | ||
755 | return count; | 784 | return count; |
756 | } | 785 | } |
diff --git a/mm/filemap.c b/mm/filemap.c
index 2f8165075a5a..843042045dc9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -101,9 +101,8 @@ | |||
101 | * ->inode->i_lock (zap_pte_range->set_page_dirty) | 101 | * ->inode->i_lock (zap_pte_range->set_page_dirty) |
102 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) | 102 | * ->private_lock (zap_pte_range->__set_page_dirty_buffers) |
103 | * | 103 | * |
104 | * (code doesn't rely on that order, so you could switch it around) | 104 | * ->i_mmap_mutex |
105 | * ->tasklist_lock (memory_failure, collect_procs_ao) | 105 | * ->tasklist_lock (memory_failure, collect_procs_ao) |
106 | * ->i_mmap_mutex | ||
107 | */ | 106 | */ |
108 | 107 | ||
109 | /* | 108 | /* |
@@ -500,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp) | |||
500 | struct page *page; | 499 | struct page *page; |
501 | 500 | ||
502 | if (cpuset_do_page_mem_spread()) { | 501 | if (cpuset_do_page_mem_spread()) { |
503 | get_mems_allowed(); | 502 | unsigned int cpuset_mems_cookie; |
504 | n = cpuset_mem_spread_node(); | 503 | do { |
505 | page = alloc_pages_exact_node(n, gfp, 0); | 504 | cpuset_mems_cookie = get_mems_allowed(); |
506 | put_mems_allowed(); | 505 | n = cpuset_mem_spread_node(); |
506 | page = alloc_pages_exact_node(n, gfp, 0); | ||
507 | } while (!put_mems_allowed(cpuset_mems_cookie) && !page); | ||
508 | |||
507 | return page; | 509 | return page; |
508 | } | 510 | } |
509 | return alloc_pages(gfp, 0); | 511 | return alloc_pages(gfp, 0); |
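The __page_cache_alloc() hunk above converts the plain get_mems_allowed()/put_mems_allowed() pair into a cookie-based retry loop: snapshot a cookie, attempt the allocation, and only redo it if the cpuset's allowed-mems changed underneath and the allocation failed. The same cookie/retry pattern reappears below in dequeue_huge_page_vma() (retry_cpuset). Here is a minimal user-space model of the pattern; the generation counter and the failing "allocation" are stand-ins, not the kernel API.

```c
/*
 * Minimal model (not kernel code) of the cookie/retry pattern: take a
 * snapshot counter, do the work, and redo it only if the counter moved
 * *and* the work failed, so the common case pays almost nothing.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

static atomic_uint mems_generation;        /* bumped by the "writer" side */

static unsigned int read_mems_cookie(void)
{
	return atomic_load_explicit(&mems_generation, memory_order_acquire);
}

/* true if the snapshot is still valid (no concurrent mask change) */
static bool mems_cookie_valid(unsigned int cookie)
{
	return atomic_load_explicit(&mems_generation,
				    memory_order_acquire) == cookie;
}

static void *try_alloc_from_allowed_nodes(void)
{
	/* stand-in for cpuset_mem_spread_node() + alloc_pages_exact_node() */
	return NULL;                       /* pretend the attempt failed */
}

void *alloc_with_retry(void)
{
	void *page;
	unsigned int cookie;

	do {
		cookie = read_mems_cookie();
		page = try_alloc_from_allowed_nodes();
		/* retry only if the mask moved under us and we got nothing */
	} while (!mems_cookie_valid(cookie) && !page);

	return page;
}
```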
@@ -2341,7 +2343,9 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, | |||
2341 | struct page *page; | 2343 | struct page *page; |
2342 | gfp_t gfp_notmask = 0; | 2344 | gfp_t gfp_notmask = 0; |
2343 | 2345 | ||
2344 | gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE; | 2346 | gfp_mask = mapping_gfp_mask(mapping); |
2347 | if (mapping_cap_account_dirty(mapping)) | ||
2348 | gfp_mask |= __GFP_WRITE; | ||
2345 | if (flags & AOP_FLAG_NOFS) | 2349 | if (flags & AOP_FLAG_NOFS) |
2346 | gfp_notmask = __GFP_FS; | 2350 | gfp_notmask = __GFP_FS; |
2347 | repeat: | 2351 | repeat: |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8f7fc394f636..f0e5306eeb55 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1031,32 +1031,23 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1031 | { | 1031 | { |
1032 | int ret = 0; | 1032 | int ret = 0; |
1033 | 1033 | ||
1034 | spin_lock(&tlb->mm->page_table_lock); | 1034 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1035 | if (likely(pmd_trans_huge(*pmd))) { | 1035 | struct page *page; |
1036 | if (unlikely(pmd_trans_splitting(*pmd))) { | 1036 | pgtable_t pgtable; |
1037 | spin_unlock(&tlb->mm->page_table_lock); | 1037 | pgtable = get_pmd_huge_pte(tlb->mm); |
1038 | wait_split_huge_page(vma->anon_vma, | 1038 | page = pmd_page(*pmd); |
1039 | pmd); | 1039 | pmd_clear(pmd); |
1040 | } else { | 1040 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
1041 | struct page *page; | 1041 | page_remove_rmap(page); |
1042 | pgtable_t pgtable; | 1042 | VM_BUG_ON(page_mapcount(page) < 0); |
1043 | pgtable = get_pmd_huge_pte(tlb->mm); | 1043 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); |
1044 | page = pmd_page(*pmd); | 1044 | VM_BUG_ON(!PageHead(page)); |
1045 | pmd_clear(pmd); | 1045 | tlb->mm->nr_ptes--; |
1046 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | ||
1047 | page_remove_rmap(page); | ||
1048 | VM_BUG_ON(page_mapcount(page) < 0); | ||
1049 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | ||
1050 | VM_BUG_ON(!PageHead(page)); | ||
1051 | tlb->mm->nr_ptes--; | ||
1052 | spin_unlock(&tlb->mm->page_table_lock); | ||
1053 | tlb_remove_page(tlb, page); | ||
1054 | pte_free(tlb->mm, pgtable); | ||
1055 | ret = 1; | ||
1056 | } | ||
1057 | } else | ||
1058 | spin_unlock(&tlb->mm->page_table_lock); | 1046 | spin_unlock(&tlb->mm->page_table_lock); |
1059 | 1047 | tlb_remove_page(tlb, page); | |
1048 | pte_free(tlb->mm, pgtable); | ||
1049 | ret = 1; | ||
1050 | } | ||
1060 | return ret; | 1051 | return ret; |
1061 | } | 1052 | } |
1062 | 1053 | ||
@@ -1066,21 +1057,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1066 | { | 1057 | { |
1067 | int ret = 0; | 1058 | int ret = 0; |
1068 | 1059 | ||
1069 | spin_lock(&vma->vm_mm->page_table_lock); | 1060 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1070 | if (likely(pmd_trans_huge(*pmd))) { | 1061 | /* |
1071 | ret = !pmd_trans_splitting(*pmd); | 1062 | * All logical pages in the range are present |
1072 | spin_unlock(&vma->vm_mm->page_table_lock); | 1063 | * if backed by a huge page. |
1073 | if (unlikely(!ret)) | 1064 | */ |
1074 | wait_split_huge_page(vma->anon_vma, pmd); | ||
1075 | else { | ||
1076 | /* | ||
1077 | * All logical pages in the range are present | ||
1078 | * if backed by a huge page. | ||
1079 | */ | ||
1080 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | ||
1081 | } | ||
1082 | } else | ||
1083 | spin_unlock(&vma->vm_mm->page_table_lock); | 1065 | spin_unlock(&vma->vm_mm->page_table_lock); |
1066 | memset(vec, 1, (end - addr) >> PAGE_SHIFT); | ||
1067 | ret = 1; | ||
1068 | } | ||
1084 | 1069 | ||
1085 | return ret; | 1070 | return ret; |
1086 | } | 1071 | } |
@@ -1110,20 +1095,11 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | |||
1110 | goto out; | 1095 | goto out; |
1111 | } | 1096 | } |
1112 | 1097 | ||
1113 | spin_lock(&mm->page_table_lock); | 1098 | ret = __pmd_trans_huge_lock(old_pmd, vma); |
1114 | if (likely(pmd_trans_huge(*old_pmd))) { | 1099 | if (ret == 1) { |
1115 | if (pmd_trans_splitting(*old_pmd)) { | 1100 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); |
1116 | spin_unlock(&mm->page_table_lock); | 1101 | VM_BUG_ON(!pmd_none(*new_pmd)); |
1117 | wait_split_huge_page(vma->anon_vma, old_pmd); | 1102 | set_pmd_at(mm, new_addr, new_pmd, pmd); |
1118 | ret = -1; | ||
1119 | } else { | ||
1120 | pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); | ||
1121 | VM_BUG_ON(!pmd_none(*new_pmd)); | ||
1122 | set_pmd_at(mm, new_addr, new_pmd, pmd); | ||
1123 | spin_unlock(&mm->page_table_lock); | ||
1124 | ret = 1; | ||
1125 | } | ||
1126 | } else { | ||
1127 | spin_unlock(&mm->page_table_lock); | 1103 | spin_unlock(&mm->page_table_lock); |
1128 | } | 1104 | } |
1129 | out: | 1105 | out: |
@@ -1136,24 +1112,41 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1136 | struct mm_struct *mm = vma->vm_mm; | 1112 | struct mm_struct *mm = vma->vm_mm; |
1137 | int ret = 0; | 1113 | int ret = 0; |
1138 | 1114 | ||
1139 | spin_lock(&mm->page_table_lock); | 1115 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1116 | pmd_t entry; | ||
1117 | entry = pmdp_get_and_clear(mm, addr, pmd); | ||
1118 | entry = pmd_modify(entry, newprot); | ||
1119 | set_pmd_at(mm, addr, pmd, entry); | ||
1120 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
1121 | ret = 1; | ||
1122 | } | ||
1123 | |||
1124 | return ret; | ||
1125 | } | ||
1126 | |||
1127 | /* | ||
1128 | * Returns 1 if a given pmd maps a stable (not under splitting) thp. | ||
1129 | * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. | ||
1130 | * | ||
1131 | * Note that if it returns 1, this routine returns without unlocking page | ||
1132 | * table locks. So callers must unlock them. | ||
1133 | */ | ||
1134 | int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) | ||
1135 | { | ||
1136 | spin_lock(&vma->vm_mm->page_table_lock); | ||
1140 | if (likely(pmd_trans_huge(*pmd))) { | 1137 | if (likely(pmd_trans_huge(*pmd))) { |
1141 | if (unlikely(pmd_trans_splitting(*pmd))) { | 1138 | if (unlikely(pmd_trans_splitting(*pmd))) { |
1142 | spin_unlock(&mm->page_table_lock); | 1139 | spin_unlock(&vma->vm_mm->page_table_lock); |
1143 | wait_split_huge_page(vma->anon_vma, pmd); | 1140 | wait_split_huge_page(vma->anon_vma, pmd); |
1141 | return -1; | ||
1144 | } else { | 1142 | } else { |
1145 | pmd_t entry; | 1143 | /* Thp mapped by 'pmd' is stable, so we can |
1146 | 1144 | * handle it as it is. */ | |
1147 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1145 | return 1; |
1148 | entry = pmd_modify(entry, newprot); | ||
1149 | set_pmd_at(mm, addr, pmd, entry); | ||
1150 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
1151 | ret = 1; | ||
1152 | } | 1146 | } |
1153 | } else | 1147 | } |
1154 | spin_unlock(&vma->vm_mm->page_table_lock); | 1148 | spin_unlock(&vma->vm_mm->page_table_lock); |
1155 | 1149 | return 0; | |
1156 | return ret; | ||
1157 | } | 1150 | } |
1158 | 1151 | ||
1159 | pmd_t *page_check_address_pmd(struct page *page, | 1152 | pmd_t *page_check_address_pmd(struct page *page, |
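The four converted call sites above (zap_huge_pmd, mincore_huge_pmd, move_huge_pmd, change_huge_pmd) now share one contract: __pmd_trans_huge_lock() returns 1 with page_table_lock still held for a stable THP, -1 after waiting out a split with the lock dropped, and 0 for the non-huge case with the lock dropped. The following is a self-contained model of that caller contract, with a pthread mutex standing in for the page table lock; it is an illustration, not mm code.

```c
/*
 * Illustration (not kernel code) of the tri-state locking contract of
 * __pmd_trans_huge_lock():
 *   1  -> stable huge mapping, lock is HELD, caller must unlock
 *   0  -> not huge, lock already released, caller uses the fallback path
 *  -1  -> was splitting, helper waited and unlocked, caller must not touch it
 */
#include <pthread.h>
#include <stdio.h>

struct entry {
	pthread_mutex_t lock;
	int huge;       /* models pmd_trans_huge() */
	int splitting;  /* models pmd_trans_splitting() */
};

static int trans_huge_lock(struct entry *e)
{
	pthread_mutex_lock(&e->lock);
	if (e->huge) {
		if (e->splitting) {
			pthread_mutex_unlock(&e->lock);
			/* the real code waits for the split to finish here */
			return -1;
		}
		return 1;          /* stable: return with the lock held */
	}
	pthread_mutex_unlock(&e->lock);
	return 0;
}

static void caller(struct entry *e)
{
	if (trans_huge_lock(e) == 1) {
		/* operate on the huge entry under the lock */
		printf("handled as huge\n");
		pthread_mutex_unlock(&e->lock);   /* caller's duty */
	} else {
		/* fall back to the regular (non-huge) path */
		printf("fall back\n");
	}
}

int main(void)
{
	struct entry e = { .lock = PTHREAD_MUTEX_INITIALIZER,
			   .huge = 1, .splitting = 0 };
	caller(&e);
	return 0;
}
```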
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a876871f6be5..afa057a1d3fe 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -53,6 +53,84 @@ static unsigned long __initdata default_hstate_size; | |||
53 | */ | 53 | */ |
54 | static DEFINE_SPINLOCK(hugetlb_lock); | 54 | static DEFINE_SPINLOCK(hugetlb_lock); |
55 | 55 | ||
56 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) | ||
57 | { | ||
58 | bool free = (spool->count == 0) && (spool->used_hpages == 0); | ||
59 | |||
60 | spin_unlock(&spool->lock); | ||
61 | |||
62 | /* If no pages are used, and no other handles to the subpool | ||
63 | * remain, free the subpool the subpool remain */ | ||
64 | if (free) | ||
65 | kfree(spool); | ||
66 | } | ||
67 | |||
68 | struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) | ||
69 | { | ||
70 | struct hugepage_subpool *spool; | ||
71 | |||
72 | spool = kmalloc(sizeof(*spool), GFP_KERNEL); | ||
73 | if (!spool) | ||
74 | return NULL; | ||
75 | |||
76 | spin_lock_init(&spool->lock); | ||
77 | spool->count = 1; | ||
78 | spool->max_hpages = nr_blocks; | ||
79 | spool->used_hpages = 0; | ||
80 | |||
81 | return spool; | ||
82 | } | ||
83 | |||
84 | void hugepage_put_subpool(struct hugepage_subpool *spool) | ||
85 | { | ||
86 | spin_lock(&spool->lock); | ||
87 | BUG_ON(!spool->count); | ||
88 | spool->count--; | ||
89 | unlock_or_release_subpool(spool); | ||
90 | } | ||
91 | |||
92 | static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, | ||
93 | long delta) | ||
94 | { | ||
95 | int ret = 0; | ||
96 | |||
97 | if (!spool) | ||
98 | return 0; | ||
99 | |||
100 | spin_lock(&spool->lock); | ||
101 | if ((spool->used_hpages + delta) <= spool->max_hpages) { | ||
102 | spool->used_hpages += delta; | ||
103 | } else { | ||
104 | ret = -ENOMEM; | ||
105 | } | ||
106 | spin_unlock(&spool->lock); | ||
107 | |||
108 | return ret; | ||
109 | } | ||
110 | |||
111 | static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, | ||
112 | long delta) | ||
113 | { | ||
114 | if (!spool) | ||
115 | return; | ||
116 | |||
117 | spin_lock(&spool->lock); | ||
118 | spool->used_hpages -= delta; | ||
119 | /* If hugetlbfs_put_super couldn't free spool due to | ||
120 | * an outstanding quota reference, free it now. */ | ||
121 | unlock_or_release_subpool(spool); | ||
122 | } | ||
123 | |||
124 | static inline struct hugepage_subpool *subpool_inode(struct inode *inode) | ||
125 | { | ||
126 | return HUGETLBFS_SB(inode->i_sb)->spool; | ||
127 | } | ||
128 | |||
129 | static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) | ||
130 | { | ||
131 | return subpool_inode(vma->vm_file->f_dentry->d_inode); | ||
132 | } | ||
133 | |||
56 | /* | 134 | /* |
57 | * Region tracking -- allows tracking of reservations and instantiated pages | 135 | * Region tracking -- allows tracking of reservations and instantiated pages |
58 | * across the pages in a mapping. | 136 | * across the pages in a mapping. |
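The block above introduces the hugepage_subpool that replaces the old per-mapping quota calls: a refcounted pool with a hard page limit, where hugepage_subpool_get_pages() charges against max_hpages, hugepage_subpool_put_pages() uncharges, and the pool is freed only once both the handle count and used_hpages reach zero. A compact user-space model of that lifetime and accounting rule follows, with a pthread mutex standing in for the spinlock; the names are illustrative.

```c
/*
 * User-space model (not kernel code) of the subpool accounting: a
 * counted pool with a page limit; it is freed only when the last handle
 * is dropped *and* all charged pages have been returned.
 */
#include <errno.h>
#include <pthread.h>
#include <stdlib.h>

struct subpool {
	pthread_mutex_t lock;
	long count;        /* handles, like spool->count */
	long max_pages;    /* like spool->max_hpages */
	long used_pages;   /* like spool->used_hpages */
};

static void unlock_or_release(struct subpool *sp)
{
	int free_it = (sp->count == 0) && (sp->used_pages == 0);

	pthread_mutex_unlock(&sp->lock);   /* never free a held lock */
	if (free_it)
		free(sp);
}

struct subpool *subpool_new(long max_pages)
{
	struct subpool *sp = calloc(1, sizeof(*sp));

	if (!sp)
		return NULL;
	pthread_mutex_init(&sp->lock, NULL);
	sp->count = 1;
	sp->max_pages = max_pages;
	return sp;
}

void subpool_put(struct subpool *sp)
{
	pthread_mutex_lock(&sp->lock);
	sp->count--;
	unlock_or_release(sp);
}

int subpool_get_pages(struct subpool *sp, long delta)
{
	int ret = 0;

	pthread_mutex_lock(&sp->lock);
	if (sp->used_pages + delta <= sp->max_pages)
		sp->used_pages += delta;
	else
		ret = -ENOMEM;
	pthread_mutex_unlock(&sp->lock);
	return ret;
}

void subpool_put_pages(struct subpool *sp, long delta)
{
	pthread_mutex_lock(&sp->lock);
	sp->used_pages -= delta;
	unlock_or_release(sp);             /* may free if last handle is gone */
}

int main(void)
{
	struct subpool *sp = subpool_new(8);

	subpool_get_pages(sp, 8);          /* charge up to the limit */
	subpool_put(sp);                   /* drop the handle: pool survives */
	subpool_put_pages(sp, 8);          /* last pages returned: pool freed */
	return 0;
}
```

Freeing from the put_pages path as well as the put-handle path is what lets the superblock drop its handle while pages are still outstanding, as the "If hugetlbfs_put_super couldn't free spool" comment above notes.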
@@ -454,14 +532,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
454 | struct vm_area_struct *vma, | 532 | struct vm_area_struct *vma, |
455 | unsigned long address, int avoid_reserve) | 533 | unsigned long address, int avoid_reserve) |
456 | { | 534 | { |
457 | struct page *page = NULL; | 535 | struct page *page; |
458 | struct mempolicy *mpol; | 536 | struct mempolicy *mpol; |
459 | nodemask_t *nodemask; | 537 | nodemask_t *nodemask; |
460 | struct zonelist *zonelist; | 538 | struct zonelist *zonelist; |
461 | struct zone *zone; | 539 | struct zone *zone; |
462 | struct zoneref *z; | 540 | struct zoneref *z; |
541 | unsigned int cpuset_mems_cookie; | ||
463 | 542 | ||
464 | get_mems_allowed(); | 543 | retry_cpuset: |
544 | cpuset_mems_cookie = get_mems_allowed(); | ||
465 | zonelist = huge_zonelist(vma, address, | 545 | zonelist = huge_zonelist(vma, address, |
466 | htlb_alloc_mask, &mpol, &nodemask); | 546 | htlb_alloc_mask, &mpol, &nodemask); |
467 | /* | 547 | /* |
@@ -488,10 +568,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, | |||
488 | } | 568 | } |
489 | } | 569 | } |
490 | } | 570 | } |
491 | err: | 571 | |
492 | mpol_cond_put(mpol); | 572 | mpol_cond_put(mpol); |
493 | put_mems_allowed(); | 573 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
574 | goto retry_cpuset; | ||
494 | return page; | 575 | return page; |
576 | |||
577 | err: | ||
578 | mpol_cond_put(mpol); | ||
579 | return NULL; | ||
495 | } | 580 | } |
496 | 581 | ||
497 | static void update_and_free_page(struct hstate *h, struct page *page) | 582 | static void update_and_free_page(struct hstate *h, struct page *page) |
@@ -533,9 +618,9 @@ static void free_huge_page(struct page *page) | |||
533 | */ | 618 | */ |
534 | struct hstate *h = page_hstate(page); | 619 | struct hstate *h = page_hstate(page); |
535 | int nid = page_to_nid(page); | 620 | int nid = page_to_nid(page); |
536 | struct address_space *mapping; | 621 | struct hugepage_subpool *spool = |
622 | (struct hugepage_subpool *)page_private(page); | ||
537 | 623 | ||
538 | mapping = (struct address_space *) page_private(page); | ||
539 | set_page_private(page, 0); | 624 | set_page_private(page, 0); |
540 | page->mapping = NULL; | 625 | page->mapping = NULL; |
541 | BUG_ON(page_count(page)); | 626 | BUG_ON(page_count(page)); |
@@ -551,8 +636,7 @@ static void free_huge_page(struct page *page) | |||
551 | enqueue_huge_page(h, page); | 636 | enqueue_huge_page(h, page); |
552 | } | 637 | } |
553 | spin_unlock(&hugetlb_lock); | 638 | spin_unlock(&hugetlb_lock); |
554 | if (mapping) | 639 | hugepage_subpool_put_pages(spool, 1); |
555 | hugetlb_put_quota(mapping, 1); | ||
556 | } | 640 | } |
557 | 641 | ||
558 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 642 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
@@ -852,6 +936,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) | |||
852 | struct page *page, *tmp; | 936 | struct page *page, *tmp; |
853 | int ret, i; | 937 | int ret, i; |
854 | int needed, allocated; | 938 | int needed, allocated; |
939 | bool alloc_ok = true; | ||
855 | 940 | ||
856 | needed = (h->resv_huge_pages + delta) - h->free_huge_pages; | 941 | needed = (h->resv_huge_pages + delta) - h->free_huge_pages; |
857 | if (needed <= 0) { | 942 | if (needed <= 0) { |
@@ -867,17 +952,13 @@ retry: | |||
867 | spin_unlock(&hugetlb_lock); | 952 | spin_unlock(&hugetlb_lock); |
868 | for (i = 0; i < needed; i++) { | 953 | for (i = 0; i < needed; i++) { |
869 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 954 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
870 | if (!page) | 955 | if (!page) { |
871 | /* | 956 | alloc_ok = false; |
872 | * We were not able to allocate enough pages to | 957 | break; |
873 | * satisfy the entire reservation so we free what | 958 | } |
874 | * we've allocated so far. | ||
875 | */ | ||
876 | goto free; | ||
877 | |||
878 | list_add(&page->lru, &surplus_list); | 959 | list_add(&page->lru, &surplus_list); |
879 | } | 960 | } |
880 | allocated += needed; | 961 | allocated += i; |
881 | 962 | ||
882 | /* | 963 | /* |
883 | * After retaking hugetlb_lock, we need to recalculate 'needed' | 964 | * After retaking hugetlb_lock, we need to recalculate 'needed' |
@@ -886,9 +967,16 @@ retry: | |||
886 | spin_lock(&hugetlb_lock); | 967 | spin_lock(&hugetlb_lock); |
887 | needed = (h->resv_huge_pages + delta) - | 968 | needed = (h->resv_huge_pages + delta) - |
888 | (h->free_huge_pages + allocated); | 969 | (h->free_huge_pages + allocated); |
889 | if (needed > 0) | 970 | if (needed > 0) { |
890 | goto retry; | 971 | if (alloc_ok) |
891 | 972 | goto retry; | |
973 | /* | ||
974 | * We were not able to allocate enough pages to | ||
975 | * satisfy the entire reservation so we free what | ||
976 | * we've allocated so far. | ||
977 | */ | ||
978 | goto free; | ||
979 | } | ||
892 | /* | 980 | /* |
893 | * The surplus_list now contains _at_least_ the number of extra pages | 981 | * The surplus_list now contains _at_least_ the number of extra pages |
894 | * needed to accommodate the reservation. Add the appropriate number | 982 | * needed to accommodate the reservation. Add the appropriate number |
@@ -914,10 +1002,10 @@ retry: | |||
914 | VM_BUG_ON(page_count(page)); | 1002 | VM_BUG_ON(page_count(page)); |
915 | enqueue_huge_page(h, page); | 1003 | enqueue_huge_page(h, page); |
916 | } | 1004 | } |
1005 | free: | ||
917 | spin_unlock(&hugetlb_lock); | 1006 | spin_unlock(&hugetlb_lock); |
918 | 1007 | ||
919 | /* Free unnecessary surplus pages to the buddy allocator */ | 1008 | /* Free unnecessary surplus pages to the buddy allocator */ |
920 | free: | ||
921 | if (!list_empty(&surplus_list)) { | 1009 | if (!list_empty(&surplus_list)) { |
922 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 1010 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
923 | list_del(&page->lru); | 1011 | list_del(&page->lru); |
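The gather_surplus_pages() hunks above rework the over-allocation loop: instead of freeing everything on the first failed allocation, it records how many pages actually arrived (allocated += i), recomputes the shortfall after retaking the lock, and only gives up (freeing the stash) if it is still short and the allocator had already failed. A toy, self-contained version of that control flow, with made-up numbers and stubs:

```c
/*
 * Sketch (not kernel code) of the reworked surplus loop: allocate up to
 * 'needed' items, account only what actually arrived, recompute the
 * shortfall, and retry only while the allocator keeps succeeding.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static long reserved = 4, delta = 8, free_now = 2;  /* toy accounting */
static long budget = 5;                 /* allocator succeeds 5 times */

static void *alloc_one(void)
{
	return budget-- > 0 ? malloc(1) : NULL;
}

static long gather_surplus(void)
{
	long needed, allocated = 0, i;
	bool alloc_ok = true;

retry:
	needed = (reserved + delta) - (free_now + allocated);
	if (needed <= 0)
		return allocated;

	for (i = 0; i < needed; i++) {
		void *p = alloc_one();
		if (!p) {
			alloc_ok = false;   /* remember the failure ... */
			break;              /* ... but keep what we did get */
		}
		free(p);                    /* stand-in for list_add() */
	}
	allocated += i;                     /* count only real successes */

	needed = (reserved + delta) - (free_now + allocated);
	if (needed > 0 && alloc_ok)
		goto retry;                 /* still short, allocator healthy */
	return allocated;                   /* short and failing: give up */
}

int main(void)
{
	printf("got %ld surplus pages\n", gather_surplus());
	return 0;
}
```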
@@ -966,11 +1054,12 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
966 | /* | 1054 | /* |
967 | * Determine if the huge page at addr within the vma has an associated | 1055 | * Determine if the huge page at addr within the vma has an associated |
968 | * reservation. Where it does not we will need to logically increase | 1056 | * reservation. Where it does not we will need to logically increase |
969 | * reservation and actually increase quota before an allocation can occur. | 1057 | * reservation and actually increase subpool usage before an allocation |
970 | * Where any new reservation would be required the reservation change is | 1058 | * can occur. Where any new reservation would be required the |
971 | * prepared, but not committed. Once the page has been quota'd allocated | 1059 | * reservation change is prepared, but not committed. Once the page |
972 | * an instantiated the change should be committed via vma_commit_reservation. | 1060 | * has been allocated from the subpool and instantiated the change should |
973 | * No action is required on failure. | 1061 | * be committed via vma_commit_reservation. No action is required on |
1062 | * failure. | ||
974 | */ | 1063 | */ |
975 | static long vma_needs_reservation(struct hstate *h, | 1064 | static long vma_needs_reservation(struct hstate *h, |
976 | struct vm_area_struct *vma, unsigned long addr) | 1065 | struct vm_area_struct *vma, unsigned long addr) |
@@ -1019,24 +1108,24 @@ static void vma_commit_reservation(struct hstate *h, | |||
1019 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 1108 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
1020 | unsigned long addr, int avoid_reserve) | 1109 | unsigned long addr, int avoid_reserve) |
1021 | { | 1110 | { |
1111 | struct hugepage_subpool *spool = subpool_vma(vma); | ||
1022 | struct hstate *h = hstate_vma(vma); | 1112 | struct hstate *h = hstate_vma(vma); |
1023 | struct page *page; | 1113 | struct page *page; |
1024 | struct address_space *mapping = vma->vm_file->f_mapping; | ||
1025 | struct inode *inode = mapping->host; | ||
1026 | long chg; | 1114 | long chg; |
1027 | 1115 | ||
1028 | /* | 1116 | /* |
1029 | * Processes that did not create the mapping will have no reserves and | 1117 | * Processes that did not create the mapping will have no |
1030 | * will not have accounted against quota. Check that the quota can be | 1118 | * reserves and will not have accounted against subpool |
1031 | * made before satisfying the allocation | 1119 | * limit. Check that the subpool limit can be made before |
1032 | * MAP_NORESERVE mappings may also need pages and quota allocated | 1120 | * satisfying the allocation MAP_NORESERVE mappings may also |
1033 | * if no reserve mapping overlaps. | 1121 | * need pages and subpool limit allocated allocated if no reserve |
1122 | * mapping overlaps. | ||
1034 | */ | 1123 | */ |
1035 | chg = vma_needs_reservation(h, vma, addr); | 1124 | chg = vma_needs_reservation(h, vma, addr); |
1036 | if (chg < 0) | 1125 | if (chg < 0) |
1037 | return ERR_PTR(-VM_FAULT_OOM); | 1126 | return ERR_PTR(-VM_FAULT_OOM); |
1038 | if (chg) | 1127 | if (chg) |
1039 | if (hugetlb_get_quota(inode->i_mapping, chg)) | 1128 | if (hugepage_subpool_get_pages(spool, chg)) |
1040 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1129 | return ERR_PTR(-VM_FAULT_SIGBUS); |
1041 | 1130 | ||
1042 | spin_lock(&hugetlb_lock); | 1131 | spin_lock(&hugetlb_lock); |
@@ -1046,12 +1135,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1046 | if (!page) { | 1135 | if (!page) { |
1047 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1136 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
1048 | if (!page) { | 1137 | if (!page) { |
1049 | hugetlb_put_quota(inode->i_mapping, chg); | 1138 | hugepage_subpool_put_pages(spool, chg); |
1050 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1139 | return ERR_PTR(-VM_FAULT_SIGBUS); |
1051 | } | 1140 | } |
1052 | } | 1141 | } |
1053 | 1142 | ||
1054 | set_page_private(page, (unsigned long) mapping); | 1143 | set_page_private(page, (unsigned long)spool); |
1055 | 1144 | ||
1056 | vma_commit_reservation(h, vma, addr); | 1145 | vma_commit_reservation(h, vma, addr); |
1057 | 1146 | ||
@@ -2072,6 +2161,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
2072 | { | 2161 | { |
2073 | struct hstate *h = hstate_vma(vma); | 2162 | struct hstate *h = hstate_vma(vma); |
2074 | struct resv_map *reservations = vma_resv_map(vma); | 2163 | struct resv_map *reservations = vma_resv_map(vma); |
2164 | struct hugepage_subpool *spool = subpool_vma(vma); | ||
2075 | unsigned long reserve; | 2165 | unsigned long reserve; |
2076 | unsigned long start; | 2166 | unsigned long start; |
2077 | unsigned long end; | 2167 | unsigned long end; |
@@ -2087,7 +2177,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
2087 | 2177 | ||
2088 | if (reserve) { | 2178 | if (reserve) { |
2089 | hugetlb_acct_memory(h, -reserve); | 2179 | hugetlb_acct_memory(h, -reserve); |
2090 | hugetlb_put_quota(vma->vm_file->f_mapping, reserve); | 2180 | hugepage_subpool_put_pages(spool, reserve); |
2091 | } | 2181 | } |
2092 | } | 2182 | } |
2093 | } | 2183 | } |
@@ -2276,6 +2366,10 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2276 | if (pte_dirty(pte)) | 2366 | if (pte_dirty(pte)) |
2277 | set_page_dirty(page); | 2367 | set_page_dirty(page); |
2278 | list_add(&page->lru, &page_list); | 2368 | list_add(&page->lru, &page_list); |
2369 | |||
2370 | /* Bail out after unmapping reference page if supplied */ | ||
2371 | if (ref_page) | ||
2372 | break; | ||
2279 | } | 2373 | } |
2280 | flush_tlb_range(vma, start, end); | 2374 | flush_tlb_range(vma, start, end); |
2281 | spin_unlock(&mm->page_table_lock); | 2375 | spin_unlock(&mm->page_table_lock); |
@@ -2316,7 +2410,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2316 | */ | 2410 | */ |
2317 | address = address & huge_page_mask(h); | 2411 | address = address & huge_page_mask(h); |
2318 | pgoff = vma_hugecache_offset(h, vma, address); | 2412 | pgoff = vma_hugecache_offset(h, vma, address); |
2319 | mapping = (struct address_space *)page_private(page); | 2413 | mapping = vma->vm_file->f_dentry->d_inode->i_mapping; |
2320 | 2414 | ||
2321 | /* | 2415 | /* |
2322 | * Take the mapping lock for the duration of the table walk. As | 2416 | * Take the mapping lock for the duration of the table walk. As |
@@ -2869,11 +2963,12 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2869 | { | 2963 | { |
2870 | long ret, chg; | 2964 | long ret, chg; |
2871 | struct hstate *h = hstate_inode(inode); | 2965 | struct hstate *h = hstate_inode(inode); |
2966 | struct hugepage_subpool *spool = subpool_inode(inode); | ||
2872 | 2967 | ||
2873 | /* | 2968 | /* |
2874 | * Only apply hugepage reservation if asked. At fault time, an | 2969 | * Only apply hugepage reservation if asked. At fault time, an |
2875 | * attempt will be made for VM_NORESERVE to allocate a page | 2970 | * attempt will be made for VM_NORESERVE to allocate a page |
2876 | * and filesystem quota without using reserves | 2971 | * without using reserves |
2877 | */ | 2972 | */ |
2878 | if (vm_flags & VM_NORESERVE) | 2973 | if (vm_flags & VM_NORESERVE) |
2879 | return 0; | 2974 | return 0; |
@@ -2900,17 +2995,17 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2900 | if (chg < 0) | 2995 | if (chg < 0) |
2901 | return chg; | 2996 | return chg; |
2902 | 2997 | ||
2903 | /* There must be enough filesystem quota for the mapping */ | 2998 | /* There must be enough pages in the subpool for the mapping */ |
2904 | if (hugetlb_get_quota(inode->i_mapping, chg)) | 2999 | if (hugepage_subpool_get_pages(spool, chg)) |
2905 | return -ENOSPC; | 3000 | return -ENOSPC; |
2906 | 3001 | ||
2907 | /* | 3002 | /* |
2908 | * Check enough hugepages are available for the reservation. | 3003 | * Check enough hugepages are available for the reservation. |
2909 | * Hand back the quota if there are not | 3004 | * Hand the pages back to the subpool if there are not |
2910 | */ | 3005 | */ |
2911 | ret = hugetlb_acct_memory(h, chg); | 3006 | ret = hugetlb_acct_memory(h, chg); |
2912 | if (ret < 0) { | 3007 | if (ret < 0) { |
2913 | hugetlb_put_quota(inode->i_mapping, chg); | 3008 | hugepage_subpool_put_pages(spool, chg); |
2914 | return ret; | 3009 | return ret; |
2915 | } | 3010 | } |
2916 | 3011 | ||
@@ -2934,12 +3029,13 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
2934 | { | 3029 | { |
2935 | struct hstate *h = hstate_inode(inode); | 3030 | struct hstate *h = hstate_inode(inode); |
2936 | long chg = region_truncate(&inode->i_mapping->private_list, offset); | 3031 | long chg = region_truncate(&inode->i_mapping->private_list, offset); |
3032 | struct hugepage_subpool *spool = subpool_inode(inode); | ||
2937 | 3033 | ||
2938 | spin_lock(&inode->i_lock); | 3034 | spin_lock(&inode->i_lock); |
2939 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); | 3035 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); |
2940 | spin_unlock(&inode->i_lock); | 3036 | spin_unlock(&inode->i_lock); |
2941 | 3037 | ||
2942 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); | 3038 | hugepage_subpool_put_pages(spool, (chg - freed)); |
2943 | hugetlb_acct_memory(h, -(chg - freed)); | 3039 | hugetlb_acct_memory(h, -(chg - freed)); |
2944 | } | 3040 | } |
2945 | 3041 | ||
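hugetlb_reserve_pages() above now charges the subpool before the global hugepage accounting and hands the subpool pages back if the second step fails; hugetlb_unreserve_pages() releases both in the same pair. A minimal stand-alone sketch of that charge-then-rollback ordering, using toy counters rather than the kernel API:

```c
/*
 * Stand-alone sketch (not the kernel API): charge the per-mount subpool
 * first, then the global pool, and roll the subpool charge back if the
 * global step refuses.
 */
#include <errno.h>
#include <stdio.h>

static long sub_used, sub_max = 100;       /* toy per-mount subpool */
static long global_used, global_max = 50;  /* toy global hugepage pool */

static int sub_get(long d)
{
	if (sub_used + d > sub_max)
		return -ENOSPC;
	sub_used += d;
	return 0;
}

static void sub_put(long d)
{
	sub_used -= d;
}

static int global_get(long d)
{
	if (global_used + d > global_max)
		return -ENOMEM;
	global_used += d;
	return 0;
}

static int reserve(long chg)
{
	int ret;

	if (sub_get(chg))                  /* subpool first */
		return -ENOSPC;
	ret = global_get(chg);             /* then the global accounting */
	if (ret < 0) {
		sub_put(chg);              /* roll the subpool charge back */
		return ret;
	}
	return 0;
}

int main(void)
{
	printf("reserve(40) -> %d\n", reserve(40));  /* fits in both pools */
	printf("reserve(40) -> %d\n", reserve(40));  /* global pool refuses */
	printf("subpool used after rollback: %ld\n", sub_used);
	return 0;
}
```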
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -374,6 +374,20 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
374 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; | 374 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; |
375 | } | 375 | } |
376 | 376 | ||
377 | static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, | ||
378 | unsigned long addr) | ||
379 | { | ||
380 | struct vm_area_struct *vma; | ||
381 | if (ksm_test_exit(mm)) | ||
382 | return NULL; | ||
383 | vma = find_vma(mm, addr); | ||
384 | if (!vma || vma->vm_start > addr) | ||
385 | return NULL; | ||
386 | if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) | ||
387 | return NULL; | ||
388 | return vma; | ||
389 | } | ||
390 | |||
377 | static void break_cow(struct rmap_item *rmap_item) | 391 | static void break_cow(struct rmap_item *rmap_item) |
378 | { | 392 | { |
379 | struct mm_struct *mm = rmap_item->mm; | 393 | struct mm_struct *mm = rmap_item->mm; |
@@ -387,15 +401,9 @@ static void break_cow(struct rmap_item *rmap_item) | |||
387 | put_anon_vma(rmap_item->anon_vma); | 401 | put_anon_vma(rmap_item->anon_vma); |
388 | 402 | ||
389 | down_read(&mm->mmap_sem); | 403 | down_read(&mm->mmap_sem); |
390 | if (ksm_test_exit(mm)) | 404 | vma = find_mergeable_vma(mm, addr); |
391 | goto out; | 405 | if (vma) |
392 | vma = find_vma(mm, addr); | 406 | break_ksm(vma, addr); |
393 | if (!vma || vma->vm_start > addr) | ||
394 | goto out; | ||
395 | if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) | ||
396 | goto out; | ||
397 | break_ksm(vma, addr); | ||
398 | out: | ||
399 | up_read(&mm->mmap_sem); | 407 | up_read(&mm->mmap_sem); |
400 | } | 408 | } |
401 | 409 | ||
@@ -421,12 +429,8 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) | |||
421 | struct page *page; | 429 | struct page *page; |
422 | 430 | ||
423 | down_read(&mm->mmap_sem); | 431 | down_read(&mm->mmap_sem); |
424 | if (ksm_test_exit(mm)) | 432 | vma = find_mergeable_vma(mm, addr); |
425 | goto out; | 433 | if (!vma) |
426 | vma = find_vma(mm, addr); | ||
427 | if (!vma || vma->vm_start > addr) | ||
428 | goto out; | ||
429 | if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) | ||
430 | goto out; | 434 | goto out; |
431 | 435 | ||
432 | page = follow_page(vma, addr, FOLL_GET); | 436 | page = follow_page(vma, addr, FOLL_GET); |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 26c6f4ec20f4..b2ee6df0e9bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,6 @@ enum mem_cgroup_stat_index { | |||
89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
90 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 90 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
91 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ | 91 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ |
92 | MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ | ||
93 | MEM_CGROUP_STAT_NSTATS, | 92 | MEM_CGROUP_STAT_NSTATS, |
94 | }; | 93 | }; |
95 | 94 | ||
@@ -135,7 +134,7 @@ struct mem_cgroup_reclaim_iter { | |||
135 | */ | 134 | */ |
136 | struct mem_cgroup_per_zone { | 135 | struct mem_cgroup_per_zone { |
137 | struct lruvec lruvec; | 136 | struct lruvec lruvec; |
138 | unsigned long count[NR_LRU_LISTS]; | 137 | unsigned long lru_size[NR_LRU_LISTS]; |
139 | 138 | ||
140 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | 139 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; |
141 | 140 | ||
@@ -144,11 +143,9 @@ struct mem_cgroup_per_zone { | |||
144 | unsigned long long usage_in_excess;/* Set to the value by which */ | 143 | unsigned long long usage_in_excess;/* Set to the value by which */ |
145 | /* the soft limit is exceeded*/ | 144 | /* the soft limit is exceeded*/ |
146 | bool on_tree; | 145 | bool on_tree; |
147 | struct mem_cgroup *mem; /* Back pointer, we cannot */ | 146 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ |
148 | /* use container_of */ | 147 | /* use container_of */ |
149 | }; | 148 | }; |
150 | /* Macro for accessing counter */ | ||
151 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | ||
152 | 149 | ||
153 | struct mem_cgroup_per_node { | 150 | struct mem_cgroup_per_node { |
154 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 151 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
@@ -300,6 +297,12 @@ struct mem_cgroup { | |||
300 | */ | 297 | */ |
301 | unsigned long move_charge_at_immigrate; | 298 | unsigned long move_charge_at_immigrate; |
302 | /* | 299 | /* |
300 | * set > 0 if pages under this cgroup are moving to other cgroup. | ||
301 | */ | ||
302 | atomic_t moving_account; | ||
303 | /* taken only while moving_account > 0 */ | ||
304 | spinlock_t move_lock; | ||
305 | /* | ||
303 | * percpu counter. | 306 | * percpu counter. |
304 | */ | 307 | */ |
305 | struct mem_cgroup_stat_cpu *stat; | 308 | struct mem_cgroup_stat_cpu *stat; |
@@ -612,9 +615,9 @@ retry: | |||
612 | * we will to add it back at the end of reclaim to its correct | 615 | * we will to add it back at the end of reclaim to its correct |
613 | * position in the tree. | 616 | * position in the tree. |
614 | */ | 617 | */ |
615 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | 618 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); |
616 | if (!res_counter_soft_limit_excess(&mz->mem->res) || | 619 | if (!res_counter_soft_limit_excess(&mz->memcg->res) || |
617 | !css_tryget(&mz->mem->css)) | 620 | !css_tryget(&mz->memcg->css)) |
618 | goto retry; | 621 | goto retry; |
619 | done: | 622 | done: |
620 | return mz; | 623 | return mz; |
@@ -692,15 +695,19 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | |||
692 | } | 695 | } |
693 | 696 | ||
694 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | 697 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, |
695 | bool file, int nr_pages) | 698 | bool anon, int nr_pages) |
696 | { | 699 | { |
697 | preempt_disable(); | 700 | preempt_disable(); |
698 | 701 | ||
699 | if (file) | 702 | /* |
700 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], | 703 | * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is |
704 | * counted as CACHE even if it's on ANON LRU. | ||
705 | */ | ||
706 | if (anon) | ||
707 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], | ||
701 | nr_pages); | 708 | nr_pages); |
702 | else | 709 | else |
703 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], | 710 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], |
704 | nr_pages); | 711 | nr_pages); |
705 | 712 | ||
706 | /* pagein of a big page is an event. So, ignore page size */ | 713 | /* pagein of a big page is an event. So, ignore page size */ |
@@ -721,14 +728,14 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, | |||
721 | unsigned int lru_mask) | 728 | unsigned int lru_mask) |
722 | { | 729 | { |
723 | struct mem_cgroup_per_zone *mz; | 730 | struct mem_cgroup_per_zone *mz; |
724 | enum lru_list l; | 731 | enum lru_list lru; |
725 | unsigned long ret = 0; | 732 | unsigned long ret = 0; |
726 | 733 | ||
727 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); | 734 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
728 | 735 | ||
729 | for_each_lru(l) { | 736 | for_each_lru(lru) { |
730 | if (BIT(l) & lru_mask) | 737 | if (BIT(lru) & lru_mask) |
731 | ret += MEM_CGROUP_ZSTAT(mz, l); | 738 | ret += mz->lru_size[lru]; |
732 | } | 739 | } |
733 | return ret; | 740 | return ret; |
734 | } | 741 | } |
@@ -1077,7 +1084,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, | |||
1077 | 1084 | ||
1078 | mz = page_cgroup_zoneinfo(memcg, page); | 1085 | mz = page_cgroup_zoneinfo(memcg, page); |
1079 | /* compound_order() is stabilized through lru_lock */ | 1086 | /* compound_order() is stabilized through lru_lock */ |
1080 | MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); | 1087 | mz->lru_size[lru] += 1 << compound_order(page); |
1081 | return &mz->lruvec; | 1088 | return &mz->lruvec; |
1082 | } | 1089 | } |
1083 | 1090 | ||
@@ -1105,8 +1112,8 @@ void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) | |||
1105 | VM_BUG_ON(!memcg); | 1112 | VM_BUG_ON(!memcg); |
1106 | mz = page_cgroup_zoneinfo(memcg, page); | 1113 | mz = page_cgroup_zoneinfo(memcg, page); |
1107 | /* huge page split is done under lru_lock. so, we have no races. */ | 1114 | /* huge page split is done under lru_lock. so, we have no races. */ |
1108 | VM_BUG_ON(MEM_CGROUP_ZSTAT(mz, lru) < (1 << compound_order(page))); | 1115 | VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page))); |
1109 | MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); | 1116 | mz->lru_size[lru] -= 1 << compound_order(page); |
1110 | } | 1117 | } |
1111 | 1118 | ||
1112 | void mem_cgroup_lru_del(struct page *page) | 1119 | void mem_cgroup_lru_del(struct page *page) |
@@ -1285,40 +1292,48 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) | |||
1285 | return memcg->swappiness; | 1292 | return memcg->swappiness; |
1286 | } | 1293 | } |
1287 | 1294 | ||
1288 | static void mem_cgroup_start_move(struct mem_cgroup *memcg) | 1295 | /* |
1289 | { | 1296 | * memcg->moving_account is used for checking possibility that some thread is |
1290 | int cpu; | 1297 | * calling move_account(). When a thread on CPU-A starts moving pages under |
1298 | * a memcg, other threads should check memcg->moving_account under | ||
1299 | * rcu_read_lock(), like this: | ||
1300 | * | ||
1301 | * CPU-A CPU-B | ||
1302 | * rcu_read_lock() | ||
1303 | * memcg->moving_account+1 if (memcg->mocing_account) | ||
1304 | * take heavy locks. | ||
1305 | * synchronize_rcu() update something. | ||
1306 | * rcu_read_unlock() | ||
1307 | * start move here. | ||
1308 | */ | ||
1291 | 1309 | ||
1292 | get_online_cpus(); | 1310 | /* for quick checking without looking up memcg */ |
1293 | spin_lock(&memcg->pcp_counter_lock); | 1311 | atomic_t memcg_moving __read_mostly; |
1294 | for_each_online_cpu(cpu) | ||
1295 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; | ||
1296 | memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; | ||
1297 | spin_unlock(&memcg->pcp_counter_lock); | ||
1298 | put_online_cpus(); | ||
1299 | 1312 | ||
1313 | static void mem_cgroup_start_move(struct mem_cgroup *memcg) | ||
1314 | { | ||
1315 | atomic_inc(&memcg_moving); | ||
1316 | atomic_inc(&memcg->moving_account); | ||
1300 | synchronize_rcu(); | 1317 | synchronize_rcu(); |
1301 | } | 1318 | } |
1302 | 1319 | ||
1303 | static void mem_cgroup_end_move(struct mem_cgroup *memcg) | 1320 | static void mem_cgroup_end_move(struct mem_cgroup *memcg) |
1304 | { | 1321 | { |
1305 | int cpu; | 1322 | /* |
1306 | 1323 | * Now, mem_cgroup_clear_mc() may call this function with NULL. | |
1307 | if (!memcg) | 1324 | * We check NULL in callee rather than caller. |
1308 | return; | 1325 | */ |
1309 | get_online_cpus(); | 1326 | if (memcg) { |
1310 | spin_lock(&memcg->pcp_counter_lock); | 1327 | atomic_dec(&memcg_moving); |
1311 | for_each_online_cpu(cpu) | 1328 | atomic_dec(&memcg->moving_account); |
1312 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; | 1329 | } |
1313 | memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; | ||
1314 | spin_unlock(&memcg->pcp_counter_lock); | ||
1315 | put_online_cpus(); | ||
1316 | } | 1330 | } |
1331 | |||
1317 | /* | 1332 | /* |
1318 | * 2 routines for checking "mem" is under move_account() or not. | 1333 | * 2 routines for checking "mem" is under move_account() or not. |
1319 | * | 1334 | * |
1320 | * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used | 1335 | * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This |
1321 | * for avoiding race in accounting. If true, | 1336 | * is used for avoiding races in accounting. If true, |
1322 | * pc->mem_cgroup may be overwritten. | 1337 | * pc->mem_cgroup may be overwritten. |
1323 | * | 1338 | * |
1324 | * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or | 1339 | * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or |
@@ -1326,10 +1341,10 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) | |||
1326 | * waiting at hith-memory prressure caused by "move". | 1341 | * waiting at hith-memory prressure caused by "move". |
1327 | */ | 1342 | */ |
1328 | 1343 | ||
1329 | static bool mem_cgroup_stealed(struct mem_cgroup *memcg) | 1344 | static bool mem_cgroup_stolen(struct mem_cgroup *memcg) |
1330 | { | 1345 | { |
1331 | VM_BUG_ON(!rcu_read_lock_held()); | 1346 | VM_BUG_ON(!rcu_read_lock_held()); |
1332 | return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; | 1347 | return atomic_read(&memcg->moving_account) > 0; |
1333 | } | 1348 | } |
1334 | 1349 | ||
1335 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) | 1350 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) |
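The CPU-A/CPU-B comment above describes the new protocol: a mover bumps memcg->moving_account (and the global memcg_moving) and waits an RCU grace period, while statistic updaters peek at the counter under rcu_read_lock() and only take the heavy move_lock when a move might be in flight. The model below captures the accounting shape only: the grace period is reduced to a comment and the "heavy lock" is a plain mutex, so it illustrates the structure, not the memory-ordering guarantees.

```c
/*
 * Model (not kernel code) of the moving_account handshake: the writer
 * raises a flag before moving, readers only pay for the lock while the
 * flag is raised.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int moving_account;          /* like memcg->moving_account */
static pthread_mutex_t move_lock = PTHREAD_MUTEX_INITIALIZER;

/* writer side: mem_cgroup_start_move()/mem_cgroup_end_move() */
static void start_move(void)
{
	atomic_fetch_add(&moving_account, 1);
	/* synchronize_rcu() goes here in the real code: wait until every
	 * reader that might have seen the old value has finished. */
}

static void end_move(void)
{
	atomic_fetch_sub(&moving_account, 1);
}

/* reader side: only take the heavy lock while a move is in flight */
static void update_page_stat(void)
{
	bool locked = false;

	if (atomic_load(&moving_account) > 0) {   /* mem_cgroup_stolen() */
		pthread_mutex_lock(&move_lock);
		locked = true;
	}

	/* ... update the per-page statistic ... */

	if (locked)
		pthread_mutex_unlock(&move_lock);
}

int main(void)
{
	update_page_stat();     /* fast path: no move in flight */
	start_move();
	update_page_stat();     /* slow path: serialized against the move */
	end_move();
	return 0;
}
```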
@@ -1370,6 +1385,24 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) | |||
1370 | return false; | 1385 | return false; |
1371 | } | 1386 | } |
1372 | 1387 | ||
1388 | /* | ||
1389 | * Take this lock when | ||
1390 | * - a code tries to modify page's memcg while it's USED. | ||
1391 | * - a code tries to modify page state accounting in a memcg. | ||
1392 | * see mem_cgroup_stolen(), too. | ||
1393 | */ | ||
1394 | static void move_lock_mem_cgroup(struct mem_cgroup *memcg, | ||
1395 | unsigned long *flags) | ||
1396 | { | ||
1397 | spin_lock_irqsave(&memcg->move_lock, *flags); | ||
1398 | } | ||
1399 | |||
1400 | static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, | ||
1401 | unsigned long *flags) | ||
1402 | { | ||
1403 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | ||
1404 | } | ||
1405 | |||
1373 | /** | 1406 | /** |
1374 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. | 1407 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. |
1375 | * @memcg: The memory cgroup that went over limit | 1408 | * @memcg: The memory cgroup that went over limit |
@@ -1393,7 +1426,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1393 | if (!memcg || !p) | 1426 | if (!memcg || !p) |
1394 | return; | 1427 | return; |
1395 | 1428 | ||
1396 | |||
1397 | rcu_read_lock(); | 1429 | rcu_read_lock(); |
1398 | 1430 | ||
1399 | mem_cgrp = memcg->css.cgroup; | 1431 | mem_cgrp = memcg->css.cgroup; |
@@ -1772,22 +1804,22 @@ static DEFINE_SPINLOCK(memcg_oom_lock); | |||
1772 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1804 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
1773 | 1805 | ||
1774 | struct oom_wait_info { | 1806 | struct oom_wait_info { |
1775 | struct mem_cgroup *mem; | 1807 | struct mem_cgroup *memcg; |
1776 | wait_queue_t wait; | 1808 | wait_queue_t wait; |
1777 | }; | 1809 | }; |
1778 | 1810 | ||
1779 | static int memcg_oom_wake_function(wait_queue_t *wait, | 1811 | static int memcg_oom_wake_function(wait_queue_t *wait, |
1780 | unsigned mode, int sync, void *arg) | 1812 | unsigned mode, int sync, void *arg) |
1781 | { | 1813 | { |
1782 | struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, | 1814 | struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; |
1783 | *oom_wait_memcg; | 1815 | struct mem_cgroup *oom_wait_memcg; |
1784 | struct oom_wait_info *oom_wait_info; | 1816 | struct oom_wait_info *oom_wait_info; |
1785 | 1817 | ||
1786 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | 1818 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); |
1787 | oom_wait_memcg = oom_wait_info->mem; | 1819 | oom_wait_memcg = oom_wait_info->memcg; |
1788 | 1820 | ||
1789 | /* | 1821 | /* |
1790 | * Both of oom_wait_info->mem and wake_mem are stable under us. | 1822 | * Both of oom_wait_info->memcg and wake_memcg are stable under us. |
1791 | * Then we can use css_is_ancestor without taking care of RCU. | 1823 | * Then we can use css_is_ancestor without taking care of RCU. |
1792 | */ | 1824 | */ |
1793 | if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) | 1825 | if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) |
@@ -1811,12 +1843,12 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) | |||
1811 | /* | 1843 | /* |
1812 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 1844 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. |
1813 | */ | 1845 | */ |
1814 | bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) | 1846 | bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) |
1815 | { | 1847 | { |
1816 | struct oom_wait_info owait; | 1848 | struct oom_wait_info owait; |
1817 | bool locked, need_to_kill; | 1849 | bool locked, need_to_kill; |
1818 | 1850 | ||
1819 | owait.mem = memcg; | 1851 | owait.memcg = memcg; |
1820 | owait.wait.flags = 0; | 1852 | owait.wait.flags = 0; |
1821 | owait.wait.func = memcg_oom_wake_function; | 1853 | owait.wait.func = memcg_oom_wake_function; |
1822 | owait.wait.private = current; | 1854 | owait.wait.private = current; |
@@ -1841,7 +1873,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) | |||
1841 | 1873 | ||
1842 | if (need_to_kill) { | 1874 | if (need_to_kill) { |
1843 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1875 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1844 | mem_cgroup_out_of_memory(memcg, mask); | 1876 | mem_cgroup_out_of_memory(memcg, mask, order); |
1845 | } else { | 1877 | } else { |
1846 | schedule(); | 1878 | schedule(); |
1847 | finish_wait(&memcg_oom_waitq, &owait.wait); | 1879 | finish_wait(&memcg_oom_waitq, &owait.wait); |
@@ -1881,41 +1913,66 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) | |||
1881 | * by flags. | 1913 | * by flags. |
1882 | * | 1914 | * |
1883 | * Considering "move", this is an only case we see a race. To make the race | 1915 | * Considering "move", this is an only case we see a race. To make the race |
1884 | * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are | 1916 | * small, we check mm->moving_account and detect there are possibility of race |
1885 | * possibility of race condition. If there is, we take a lock. | 1917 | * If there is, we take a lock. |
1886 | */ | 1918 | */ |
1887 | 1919 | ||
1920 | void __mem_cgroup_begin_update_page_stat(struct page *page, | ||
1921 | bool *locked, unsigned long *flags) | ||
1922 | { | ||
1923 | struct mem_cgroup *memcg; | ||
1924 | struct page_cgroup *pc; | ||
1925 | |||
1926 | pc = lookup_page_cgroup(page); | ||
1927 | again: | ||
1928 | memcg = pc->mem_cgroup; | ||
1929 | if (unlikely(!memcg || !PageCgroupUsed(pc))) | ||
1930 | return; | ||
1931 | /* | ||
1932 | * If this memory cgroup is not under account moving, we don't | ||
1933 | * need to take move_lock_page_cgroup(). Because we already hold | ||
1934 | * rcu_read_lock(), any calls to move_account will be delayed until | ||
1935 | * rcu_read_unlock() if mem_cgroup_stolen() == true. | ||
1936 | */ | ||
1937 | if (!mem_cgroup_stolen(memcg)) | ||
1938 | return; | ||
1939 | |||
1940 | move_lock_mem_cgroup(memcg, flags); | ||
1941 | if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { | ||
1942 | move_unlock_mem_cgroup(memcg, flags); | ||
1943 | goto again; | ||
1944 | } | ||
1945 | *locked = true; | ||
1946 | } | ||
1947 | |||
1948 | void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) | ||
1949 | { | ||
1950 | struct page_cgroup *pc = lookup_page_cgroup(page); | ||
1951 | |||
1952 | /* | ||
1953 | * It's guaranteed that pc->mem_cgroup never changes while | ||
1954 | * lock is held because a routine modifies pc->mem_cgroup | ||
1955 | * should take move_lock_page_cgroup(). | ||
1956 | */ | ||
1957 | move_unlock_mem_cgroup(pc->mem_cgroup, flags); | ||
1958 | } | ||
1959 | |||
1888 | void mem_cgroup_update_page_stat(struct page *page, | 1960 | void mem_cgroup_update_page_stat(struct page *page, |
1889 | enum mem_cgroup_page_stat_item idx, int val) | 1961 | enum mem_cgroup_page_stat_item idx, int val) |
1890 | { | 1962 | { |
1891 | struct mem_cgroup *memcg; | 1963 | struct mem_cgroup *memcg; |
1892 | struct page_cgroup *pc = lookup_page_cgroup(page); | 1964 | struct page_cgroup *pc = lookup_page_cgroup(page); |
1893 | bool need_unlock = false; | ||
1894 | unsigned long uninitialized_var(flags); | 1965 | unsigned long uninitialized_var(flags); |
1895 | 1966 | ||
1896 | if (mem_cgroup_disabled()) | 1967 | if (mem_cgroup_disabled()) |
1897 | return; | 1968 | return; |
1898 | 1969 | ||
1899 | rcu_read_lock(); | ||
1900 | memcg = pc->mem_cgroup; | 1970 | memcg = pc->mem_cgroup; |
1901 | if (unlikely(!memcg || !PageCgroupUsed(pc))) | 1971 | if (unlikely(!memcg || !PageCgroupUsed(pc))) |
1902 | goto out; | 1972 | return; |
1903 | /* pc->mem_cgroup is unstable ? */ | ||
1904 | if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) { | ||
1905 | /* take a lock against to access pc->mem_cgroup */ | ||
1906 | move_lock_page_cgroup(pc, &flags); | ||
1907 | need_unlock = true; | ||
1908 | memcg = pc->mem_cgroup; | ||
1909 | if (!memcg || !PageCgroupUsed(pc)) | ||
1910 | goto out; | ||
1911 | } | ||
1912 | 1973 | ||
1913 | switch (idx) { | 1974 | switch (idx) { |
1914 | case MEMCG_NR_FILE_MAPPED: | 1975 | case MEMCG_NR_FILE_MAPPED: |
1915 | if (val > 0) | ||
1916 | SetPageCgroupFileMapped(pc); | ||
1917 | else if (!page_mapped(page)) | ||
1918 | ClearPageCgroupFileMapped(pc); | ||
1919 | idx = MEM_CGROUP_STAT_FILE_MAPPED; | 1976 | idx = MEM_CGROUP_STAT_FILE_MAPPED; |
1920 | break; | 1977 | break; |
1921 | default: | 1978 | default: |
@@ -1923,14 +1980,7 @@ void mem_cgroup_update_page_stat(struct page *page, | |||
1923 | } | 1980 | } |
1924 | 1981 | ||
1925 | this_cpu_add(memcg->stat->count[idx], val); | 1982 | this_cpu_add(memcg->stat->count[idx], val); |
1926 | |||
1927 | out: | ||
1928 | if (unlikely(need_unlock)) | ||
1929 | move_unlock_page_cgroup(pc, &flags); | ||
1930 | rcu_read_unlock(); | ||
1931 | return; | ||
1932 | } | 1983 | } |
1933 | EXPORT_SYMBOL(mem_cgroup_update_page_stat); | ||
1934 | 1984 | ||
1935 | /* | 1985 | /* |
1936 | * size of first charge trial. "32" comes from vmscan.c's magic value. | 1986 | * size of first charge trial. "32" comes from vmscan.c's magic value. |
@@ -2101,17 +2151,6 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) | |||
2101 | per_cpu(memcg->stat->events[i], cpu) = 0; | 2151 | per_cpu(memcg->stat->events[i], cpu) = 0; |
2102 | memcg->nocpu_base.events[i] += x; | 2152 | memcg->nocpu_base.events[i] += x; |
2103 | } | 2153 | } |
2104 | /* need to clear ON_MOVE value, works as a kind of lock. */ | ||
2105 | per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; | ||
2106 | spin_unlock(&memcg->pcp_counter_lock); | ||
2107 | } | ||
2108 | |||
2109 | static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu) | ||
2110 | { | ||
2111 | int idx = MEM_CGROUP_ON_MOVE; | ||
2112 | |||
2113 | spin_lock(&memcg->pcp_counter_lock); | ||
2114 | per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx]; | ||
2115 | spin_unlock(&memcg->pcp_counter_lock); | 2154 | spin_unlock(&memcg->pcp_counter_lock); |
2116 | } | 2155 | } |
2117 | 2156 | ||
@@ -2123,11 +2162,8 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, | |||
2123 | struct memcg_stock_pcp *stock; | 2162 | struct memcg_stock_pcp *stock; |
2124 | struct mem_cgroup *iter; | 2163 | struct mem_cgroup *iter; |
2125 | 2164 | ||
2126 | if ((action == CPU_ONLINE)) { | 2165 | if (action == CPU_ONLINE) |
2127 | for_each_mem_cgroup(iter) | ||
2128 | synchronize_mem_cgroup_on_move(iter, cpu); | ||
2129 | return NOTIFY_OK; | 2166 | return NOTIFY_OK; |
2130 | } | ||
2131 | 2167 | ||
2132 | if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) | 2168 | if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) |
2133 | return NOTIFY_OK; | 2169 | return NOTIFY_OK; |
@@ -2212,7 +2248,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2212 | if (!oom_check) | 2248 | if (!oom_check) |
2213 | return CHARGE_NOMEM; | 2249 | return CHARGE_NOMEM; |
2214 | /* check OOM */ | 2250 | /* check OOM */ |
2215 | if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) | 2251 | if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize))) |
2216 | return CHARGE_OOM_DIE; | 2252 | return CHARGE_OOM_DIE; |
2217 | 2253 | ||
2218 | return CHARGE_RETRY; | 2254 | return CHARGE_RETRY; |
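mem_cgroup_handle_oom() now receives the order of the failed charge, presumably so the memcg OOM path can report the real allocation size instead of a bare page. Since csize is the charge size in bytes in this function, the conversion is a plain get_order() call; a small illustration (values assume 4K pages):

	int order = get_order(csize);	/* bytes -> page order */
	/* get_order(PAGE_SIZE) == 0, get_order(HPAGE_PMD_SIZE) == 9 on x86-64 */

so a failed THP charge shows up as an order-9 OOM rather than an order-0 one.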
@@ -2446,6 +2482,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2446 | { | 2482 | { |
2447 | struct zone *uninitialized_var(zone); | 2483 | struct zone *uninitialized_var(zone); |
2448 | bool was_on_lru = false; | 2484 | bool was_on_lru = false; |
2485 | bool anon; | ||
2449 | 2486 | ||
2450 | lock_page_cgroup(pc); | 2487 | lock_page_cgroup(pc); |
2451 | if (unlikely(PageCgroupUsed(pc))) { | 2488 | if (unlikely(PageCgroupUsed(pc))) { |
@@ -2481,19 +2518,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2481 | * See mem_cgroup_add_lru_list(), etc. | 2518 | * See mem_cgroup_add_lru_list(), etc. |
2482 | */ | 2519 | */ |
2483 | smp_wmb(); | 2520 | smp_wmb(); |
2484 | switch (ctype) { | 2521 | SetPageCgroupUsed(pc); |
2485 | case MEM_CGROUP_CHARGE_TYPE_CACHE: | ||
2486 | case MEM_CGROUP_CHARGE_TYPE_SHMEM: | ||
2487 | SetPageCgroupCache(pc); | ||
2488 | SetPageCgroupUsed(pc); | ||
2489 | break; | ||
2490 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | ||
2491 | ClearPageCgroupCache(pc); | ||
2492 | SetPageCgroupUsed(pc); | ||
2493 | break; | ||
2494 | default: | ||
2495 | break; | ||
2496 | } | ||
2497 | 2522 | ||
2498 | if (lrucare) { | 2523 | if (lrucare) { |
2499 | if (was_on_lru) { | 2524 | if (was_on_lru) { |
@@ -2504,7 +2529,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2504 | spin_unlock_irq(&zone->lru_lock); | 2529 | spin_unlock_irq(&zone->lru_lock); |
2505 | } | 2530 | } |
2506 | 2531 | ||
2507 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); | 2532 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) |
2533 | anon = true; | ||
2534 | else | ||
2535 | anon = false; | ||
2536 | |||
2537 | mem_cgroup_charge_statistics(memcg, anon, nr_pages); | ||
2508 | unlock_page_cgroup(pc); | 2538 | unlock_page_cgroup(pc); |
2509 | 2539 | ||
2510 | /* | 2540 | /* |
@@ -2517,8 +2547,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2517 | 2547 | ||
2518 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 2548 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2519 | 2549 | ||
2520 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ | 2550 | #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) |
2521 | (1 << PCG_MIGRATION)) | ||
2522 | /* | 2551 | /* |
2523 | * Because tail pages are not marked as "used", set it. We're under | 2552 | * Because tail pages are not marked as "used", set it. We're under |
2524 | * zone->lru_lock, 'splitting on pmd' and compound_lock. | 2553 | * zone->lru_lock, 'splitting on pmd' and compound_lock. |
@@ -2569,6 +2598,7 @@ static int mem_cgroup_move_account(struct page *page, | |||
2569 | { | 2598 | { |
2570 | unsigned long flags; | 2599 | unsigned long flags; |
2571 | int ret; | 2600 | int ret; |
2601 | bool anon = PageAnon(page); | ||
2572 | 2602 | ||
2573 | VM_BUG_ON(from == to); | 2603 | VM_BUG_ON(from == to); |
2574 | VM_BUG_ON(PageLRU(page)); | 2604 | VM_BUG_ON(PageLRU(page)); |
@@ -2588,23 +2618,23 @@ static int mem_cgroup_move_account(struct page *page, | |||
2588 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) | 2618 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) |
2589 | goto unlock; | 2619 | goto unlock; |
2590 | 2620 | ||
2591 | move_lock_page_cgroup(pc, &flags); | 2621 | move_lock_mem_cgroup(from, &flags); |
2592 | 2622 | ||
2593 | if (PageCgroupFileMapped(pc)) { | 2623 | if (!anon && page_mapped(page)) { |
2594 | /* Update mapped_file data for mem_cgroup */ | 2624 | /* Update mapped_file data for mem_cgroup */ |
2595 | preempt_disable(); | 2625 | preempt_disable(); |
2596 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 2626 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
2597 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 2627 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
2598 | preempt_enable(); | 2628 | preempt_enable(); |
2599 | } | 2629 | } |
2600 | mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); | 2630 | mem_cgroup_charge_statistics(from, anon, -nr_pages); |
2601 | if (uncharge) | 2631 | if (uncharge) |
2602 | /* This is not "cancel", but cancel_charge does all we need. */ | 2632 | /* This is not "cancel", but cancel_charge does all we need. */ |
2603 | __mem_cgroup_cancel_charge(from, nr_pages); | 2633 | __mem_cgroup_cancel_charge(from, nr_pages); |
2604 | 2634 | ||
2605 | /* caller should have done css_get */ | 2635 | /* caller should have done css_get */ |
2606 | pc->mem_cgroup = to; | 2636 | pc->mem_cgroup = to; |
2607 | mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); | 2637 | mem_cgroup_charge_statistics(to, anon, nr_pages); |
2608 | /* | 2638 | /* |
2609 | * We charge against "to", which may not have any tasks. Then, "to" | 2639 | * We charge against "to", which may not have any tasks. Then, "to" |
2610 | * can be under rmdir(). But in current implementation, caller of | 2640 | * can be under rmdir(). But in current implementation, caller of |
@@ -2612,7 +2642,7 @@ static int mem_cgroup_move_account(struct page *page, | |||
2612 | * guaranteed that "to" is never removed. So, we don't check rmdir | 2642 | * guaranteed that "to" is never removed. So, we don't check rmdir |
2613 | * status here. | 2643 | * status here. |
2614 | */ | 2644 | */ |
2615 | move_unlock_page_cgroup(pc, &flags); | 2645 | move_unlock_mem_cgroup(from, &flags); |
2616 | ret = 0; | 2646 | ret = 0; |
2617 | unlock: | 2647 | unlock: |
2618 | unlock_page_cgroup(pc); | 2648 | unlock_page_cgroup(pc); |
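mem_cgroup_move_account() now serializes against the page-stat updaters above through a single per-memcg lock rather than the old per-page_cgroup PCG_MOVE_LOCK bit; the lock itself appears later in this diff, where mem_cgroup_create() gains a spin_lock_init(&memcg->move_lock). A sketch of what the helpers are assumed to look like, given that callers pass a flags word by reference:

	/* assumed shape of the helpers around the new memcg->move_lock */
	static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
					 unsigned long *flags)
	{
		spin_lock_irqsave(&memcg->move_lock, *flags);
	}

	static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
					   unsigned long *flags)
	{
		spin_unlock_irqrestore(&memcg->move_lock, *flags);
	}

Locking only "from" is sufficient because pc->mem_cgroup is only changed under that same memcg's move lock.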
@@ -2914,7 +2944,6 @@ direct_uncharge: | |||
2914 | res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); | 2944 | res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); |
2915 | if (unlikely(batch->memcg != memcg)) | 2945 | if (unlikely(batch->memcg != memcg)) |
2916 | memcg_oom_recover(memcg); | 2946 | memcg_oom_recover(memcg); |
2917 | return; | ||
2918 | } | 2947 | } |
2919 | 2948 | ||
2920 | /* | 2949 | /* |
@@ -2926,6 +2955,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2926 | struct mem_cgroup *memcg = NULL; | 2955 | struct mem_cgroup *memcg = NULL; |
2927 | unsigned int nr_pages = 1; | 2956 | unsigned int nr_pages = 1; |
2928 | struct page_cgroup *pc; | 2957 | struct page_cgroup *pc; |
2958 | bool anon; | ||
2929 | 2959 | ||
2930 | if (mem_cgroup_disabled()) | 2960 | if (mem_cgroup_disabled()) |
2931 | return NULL; | 2961 | return NULL; |
@@ -2951,8 +2981,17 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2951 | if (!PageCgroupUsed(pc)) | 2981 | if (!PageCgroupUsed(pc)) |
2952 | goto unlock_out; | 2982 | goto unlock_out; |
2953 | 2983 | ||
2984 | anon = PageAnon(page); | ||
2985 | |||
2954 | switch (ctype) { | 2986 | switch (ctype) { |
2955 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 2987 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: |
2988 | /* | ||
2989 | * Generally PageAnon tells whether the anon statistics should be ||
2990 | * updated; but sometimes, e.g., mem_cgroup_uncharge_page() is ||
2991 | * called before the page reaches the stage of being marked PageAnon. ||
2992 | */ | ||
2993 | anon = true; | ||
2994 | /* fallthrough */ | ||
2956 | case MEM_CGROUP_CHARGE_TYPE_DROP: | 2995 | case MEM_CGROUP_CHARGE_TYPE_DROP: |
2957 | /* See mem_cgroup_prepare_migration() */ | 2996 | /* See mem_cgroup_prepare_migration() */ |
2958 | if (page_mapped(page) || PageCgroupMigration(pc)) | 2997 | if (page_mapped(page) || PageCgroupMigration(pc)) |
@@ -2969,7 +3008,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2969 | break; | 3008 | break; |
2970 | } | 3009 | } |
2971 | 3010 | ||
2972 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); | 3011 | mem_cgroup_charge_statistics(memcg, anon, -nr_pages); |
2973 | 3012 | ||
2974 | ClearPageCgroupUsed(pc); | 3013 | ClearPageCgroupUsed(pc); |
2975 | /* | 3014 | /* |
@@ -3276,6 +3315,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3276 | { | 3315 | { |
3277 | struct page *used, *unused; | 3316 | struct page *used, *unused; |
3278 | struct page_cgroup *pc; | 3317 | struct page_cgroup *pc; |
3318 | bool anon; | ||
3279 | 3319 | ||
3280 | if (!memcg) | 3320 | if (!memcg) |
3281 | return; | 3321 | return; |
@@ -3297,8 +3337,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3297 | lock_page_cgroup(pc); | 3337 | lock_page_cgroup(pc); |
3298 | ClearPageCgroupMigration(pc); | 3338 | ClearPageCgroupMigration(pc); |
3299 | unlock_page_cgroup(pc); | 3339 | unlock_page_cgroup(pc); |
3300 | 3340 | anon = PageAnon(used); | |
3301 | __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); | 3341 | __mem_cgroup_uncharge_common(unused, |
3342 | anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED | ||
3343 | : MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
3302 | 3344 | ||
3303 | /* | 3345 | /* |
3304 | * If a page is a file cache, radix-tree replacement is very atomic | 3346 | * If a page is a file cache, radix-tree replacement is very atomic |
@@ -3308,7 +3350,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3308 | * and USED bit check in mem_cgroup_uncharge_page() will do enough | 3350 | * and USED bit check in mem_cgroup_uncharge_page() will do enough |
3309 | * check. (see prepare_charge() also) | 3351 | * check. (see prepare_charge() also) |
3310 | */ | 3352 | */ |
3311 | if (PageAnon(used)) | 3353 | if (anon) |
3312 | mem_cgroup_uncharge_page(used); | 3354 | mem_cgroup_uncharge_page(used); |
3313 | /* | 3355 | /* |
3314 | * At migration, we may charge account against cgroup which has no | 3356 | * At migration, we may charge account against cgroup which has no |
@@ -3338,7 +3380,7 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
3338 | /* fix accounting on old pages */ | 3380 | /* fix accounting on old pages */ |
3339 | lock_page_cgroup(pc); | 3381 | lock_page_cgroup(pc); |
3340 | memcg = pc->mem_cgroup; | 3382 | memcg = pc->mem_cgroup; |
3341 | mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1); | 3383 | mem_cgroup_charge_statistics(memcg, false, -1); |
3342 | ClearPageCgroupUsed(pc); | 3384 | ClearPageCgroupUsed(pc); |
3343 | unlock_page_cgroup(pc); | 3385 | unlock_page_cgroup(pc); |
3344 | 3386 | ||
@@ -3549,7 +3591,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3549 | break; | 3591 | break; |
3550 | 3592 | ||
3551 | nr_scanned = 0; | 3593 | nr_scanned = 0; |
3552 | reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, | 3594 | reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, |
3553 | gfp_mask, &nr_scanned); | 3595 | gfp_mask, &nr_scanned); |
3554 | nr_reclaimed += reclaimed; | 3596 | nr_reclaimed += reclaimed; |
3555 | *total_scanned += nr_scanned; | 3597 | *total_scanned += nr_scanned; |
@@ -3576,13 +3618,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3576 | next_mz = | 3618 | next_mz = |
3577 | __mem_cgroup_largest_soft_limit_node(mctz); | 3619 | __mem_cgroup_largest_soft_limit_node(mctz); |
3578 | if (next_mz == mz) | 3620 | if (next_mz == mz) |
3579 | css_put(&next_mz->mem->css); | 3621 | css_put(&next_mz->memcg->css); |
3580 | else /* next_mz == NULL or other memcg */ | 3622 | else /* next_mz == NULL or other memcg */ |
3581 | break; | 3623 | break; |
3582 | } while (1); | 3624 | } while (1); |
3583 | } | 3625 | } |
3584 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | 3626 | __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); |
3585 | excess = res_counter_soft_limit_excess(&mz->mem->res); | 3627 | excess = res_counter_soft_limit_excess(&mz->memcg->res); |
3586 | /* | 3628 | /* |
3587 | * One school of thought says that we should not add | 3629 | * One school of thought says that we should not add |
3588 | * back the node to the tree if reclaim returns 0. | 3630 | * back the node to the tree if reclaim returns 0. |
@@ -3592,9 +3634,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3592 | * term TODO. | 3634 | * term TODO. |
3593 | */ | 3635 | */ |
3594 | /* If excess == 0, no tree ops */ | 3636 | /* If excess == 0, no tree ops */ |
3595 | __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); | 3637 | __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); |
3596 | spin_unlock(&mctz->lock); | 3638 | spin_unlock(&mctz->lock); |
3597 | css_put(&mz->mem->css); | 3639 | css_put(&mz->memcg->css); |
3598 | loop++; | 3640 | loop++; |
3599 | /* | 3641 | /* |
3600 | * Could not reclaim anything and there are no more | 3642 | * Could not reclaim anything and there are no more |
@@ -3607,7 +3649,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3607 | break; | 3649 | break; |
3608 | } while (!nr_reclaimed); | 3650 | } while (!nr_reclaimed); |
3609 | if (next_mz) | 3651 | if (next_mz) |
3610 | css_put(&next_mz->mem->css); | 3652 | css_put(&next_mz->memcg->css); |
3611 | return nr_reclaimed; | 3653 | return nr_reclaimed; |
3612 | } | 3654 | } |
3613 | 3655 | ||
@@ -3629,7 +3671,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3629 | mz = mem_cgroup_zoneinfo(memcg, node, zid); | 3671 | mz = mem_cgroup_zoneinfo(memcg, node, zid); |
3630 | list = &mz->lruvec.lists[lru]; | 3672 | list = &mz->lruvec.lists[lru]; |
3631 | 3673 | ||
3632 | loop = MEM_CGROUP_ZSTAT(mz, lru); | 3674 | loop = mz->lru_size[lru]; |
3633 | /* give some margin against EBUSY etc...*/ | 3675 | /* give some margin against EBUSY etc...*/ |
3634 | loop += 256; | 3676 | loop += 256; |
3635 | busy = NULL; | 3677 | busy = NULL; |
@@ -3703,10 +3745,10 @@ move_account: | |||
3703 | mem_cgroup_start_move(memcg); | 3745 | mem_cgroup_start_move(memcg); |
3704 | for_each_node_state(node, N_HIGH_MEMORY) { | 3746 | for_each_node_state(node, N_HIGH_MEMORY) { |
3705 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 3747 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
3706 | enum lru_list l; | 3748 | enum lru_list lru; |
3707 | for_each_lru(l) { | 3749 | for_each_lru(lru) { |
3708 | ret = mem_cgroup_force_empty_list(memcg, | 3750 | ret = mem_cgroup_force_empty_list(memcg, |
3709 | node, zid, l); | 3751 | node, zid, lru); |
3710 | if (ret) | 3752 | if (ret) |
3711 | break; | 3753 | break; |
3712 | } | 3754 | } |
@@ -3860,7 +3902,6 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
3860 | break; | 3902 | break; |
3861 | default: | 3903 | default: |
3862 | BUG(); | 3904 | BUG(); |
3863 | break; | ||
3864 | } | 3905 | } |
3865 | return val; | 3906 | return val; |
3866 | } | 3907 | } |
@@ -3939,7 +3980,6 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, | |||
3939 | out: | 3980 | out: |
3940 | *mem_limit = min_limit; | 3981 | *mem_limit = min_limit; |
3941 | *memsw_limit = min_memsw_limit; | 3982 | *memsw_limit = min_memsw_limit; |
3942 | return; | ||
3943 | } | 3983 | } |
3944 | 3984 | ||
3945 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 3985 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
@@ -4098,38 +4138,38 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) | |||
4098 | unsigned long total_nr, file_nr, anon_nr, unevictable_nr; | 4138 | unsigned long total_nr, file_nr, anon_nr, unevictable_nr; |
4099 | unsigned long node_nr; | 4139 | unsigned long node_nr; |
4100 | struct cgroup *cont = m->private; | 4140 | struct cgroup *cont = m->private; |
4101 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | 4141 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4102 | 4142 | ||
4103 | total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); | 4143 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); |
4104 | seq_printf(m, "total=%lu", total_nr); | 4144 | seq_printf(m, "total=%lu", total_nr); |
4105 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4145 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4106 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); | 4146 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); |
4107 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4147 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4108 | } | 4148 | } |
4109 | seq_putc(m, '\n'); | 4149 | seq_putc(m, '\n'); |
4110 | 4150 | ||
4111 | file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); | 4151 | file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); |
4112 | seq_printf(m, "file=%lu", file_nr); | 4152 | seq_printf(m, "file=%lu", file_nr); |
4113 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4153 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4114 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, | 4154 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4115 | LRU_ALL_FILE); | 4155 | LRU_ALL_FILE); |
4116 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4156 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4117 | } | 4157 | } |
4118 | seq_putc(m, '\n'); | 4158 | seq_putc(m, '\n'); |
4119 | 4159 | ||
4120 | anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); | 4160 | anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); |
4121 | seq_printf(m, "anon=%lu", anon_nr); | 4161 | seq_printf(m, "anon=%lu", anon_nr); |
4122 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4162 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4123 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, | 4163 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4124 | LRU_ALL_ANON); | 4164 | LRU_ALL_ANON); |
4125 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4165 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4126 | } | 4166 | } |
4127 | seq_putc(m, '\n'); | 4167 | seq_putc(m, '\n'); |
4128 | 4168 | ||
4129 | unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); | 4169 | unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); |
4130 | seq_printf(m, "unevictable=%lu", unevictable_nr); | 4170 | seq_printf(m, "unevictable=%lu", unevictable_nr); |
4131 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4171 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4132 | node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, | 4172 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4133 | BIT(LRU_UNEVICTABLE)); | 4173 | BIT(LRU_UNEVICTABLE)); |
4134 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4174 | seq_printf(m, " N%d=%lu", nid, node_nr); |
4135 | } | 4175 | } |
@@ -4141,12 +4181,12 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg) | |||
4141 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 4181 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, |
4142 | struct cgroup_map_cb *cb) | 4182 | struct cgroup_map_cb *cb) |
4143 | { | 4183 | { |
4144 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | 4184 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4145 | struct mcs_total_stat mystat; | 4185 | struct mcs_total_stat mystat; |
4146 | int i; | 4186 | int i; |
4147 | 4187 | ||
4148 | memset(&mystat, 0, sizeof(mystat)); | 4188 | memset(&mystat, 0, sizeof(mystat)); |
4149 | mem_cgroup_get_local_stat(mem_cont, &mystat); | 4189 | mem_cgroup_get_local_stat(memcg, &mystat); |
4150 | 4190 | ||
4151 | 4191 | ||
4152 | for (i = 0; i < NR_MCS_STAT; i++) { | 4192 | for (i = 0; i < NR_MCS_STAT; i++) { |
@@ -4158,14 +4198,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4158 | /* Hierarchical information */ | 4198 | /* Hierarchical information */ |
4159 | { | 4199 | { |
4160 | unsigned long long limit, memsw_limit; | 4200 | unsigned long long limit, memsw_limit; |
4161 | memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); | 4201 | memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); |
4162 | cb->fill(cb, "hierarchical_memory_limit", limit); | 4202 | cb->fill(cb, "hierarchical_memory_limit", limit); |
4163 | if (do_swap_account) | 4203 | if (do_swap_account) |
4164 | cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); | 4204 | cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); |
4165 | } | 4205 | } |
4166 | 4206 | ||
4167 | memset(&mystat, 0, sizeof(mystat)); | 4207 | memset(&mystat, 0, sizeof(mystat)); |
4168 | mem_cgroup_get_total_stat(mem_cont, &mystat); | 4208 | mem_cgroup_get_total_stat(memcg, &mystat); |
4169 | for (i = 0; i < NR_MCS_STAT; i++) { | 4209 | for (i = 0; i < NR_MCS_STAT; i++) { |
4170 | if (i == MCS_SWAP && !do_swap_account) | 4210 | if (i == MCS_SWAP && !do_swap_account) |
4171 | continue; | 4211 | continue; |
@@ -4181,7 +4221,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4181 | 4221 | ||
4182 | for_each_online_node(nid) | 4222 | for_each_online_node(nid) |
4183 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 4223 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
4184 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 4224 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
4185 | 4225 | ||
4186 | recent_rotated[0] += | 4226 | recent_rotated[0] += |
4187 | mz->reclaim_stat.recent_rotated[0]; | 4227 | mz->reclaim_stat.recent_rotated[0]; |
@@ -4426,12 +4466,6 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, | |||
4426 | else | 4466 | else |
4427 | BUG(); | 4467 | BUG(); |
4428 | 4468 | ||
4429 | /* | ||
4430 | * Something went wrong if we trying to unregister a threshold | ||
4431 | * if we don't have thresholds | ||
4432 | */ | ||
4433 | BUG_ON(!thresholds); | ||
4434 | |||
4435 | if (!thresholds->primary) | 4469 | if (!thresholds->primary) |
4436 | goto unlock; | 4470 | goto unlock; |
4437 | 4471 | ||
@@ -4736,7 +4770,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
4736 | { | 4770 | { |
4737 | struct mem_cgroup_per_node *pn; | 4771 | struct mem_cgroup_per_node *pn; |
4738 | struct mem_cgroup_per_zone *mz; | 4772 | struct mem_cgroup_per_zone *mz; |
4739 | enum lru_list l; | 4773 | enum lru_list lru; |
4740 | int zone, tmp = node; | 4774 | int zone, tmp = node; |
4741 | /* | 4775 | /* |
4742 | * This routine is called against possible nodes. | 4776 | * This routine is called against possible nodes. |
@@ -4754,11 +4788,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
4754 | 4788 | ||
4755 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 4789 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
4756 | mz = &pn->zoneinfo[zone]; | 4790 | mz = &pn->zoneinfo[zone]; |
4757 | for_each_lru(l) | 4791 | for_each_lru(lru) |
4758 | INIT_LIST_HEAD(&mz->lruvec.lists[l]); | 4792 | INIT_LIST_HEAD(&mz->lruvec.lists[lru]); |
4759 | mz->usage_in_excess = 0; | 4793 | mz->usage_in_excess = 0; |
4760 | mz->on_tree = false; | 4794 | mz->on_tree = false; |
4761 | mz->mem = memcg; | 4795 | mz->memcg = memcg; |
4762 | } | 4796 | } |
4763 | memcg->info.nodeinfo[node] = pn; | 4797 | memcg->info.nodeinfo[node] = pn; |
4764 | return 0; | 4798 | return 0; |
@@ -4771,29 +4805,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | |||
4771 | 4805 | ||
4772 | static struct mem_cgroup *mem_cgroup_alloc(void) | 4806 | static struct mem_cgroup *mem_cgroup_alloc(void) |
4773 | { | 4807 | { |
4774 | struct mem_cgroup *mem; | 4808 | struct mem_cgroup *memcg; |
4775 | int size = sizeof(struct mem_cgroup); | 4809 | int size = sizeof(struct mem_cgroup); |
4776 | 4810 | ||
4777 | /* Can be very big if MAX_NUMNODES is very big */ | 4811 | /* Can be very big if MAX_NUMNODES is very big */ |
4778 | if (size < PAGE_SIZE) | 4812 | if (size < PAGE_SIZE) |
4779 | mem = kzalloc(size, GFP_KERNEL); | 4813 | memcg = kzalloc(size, GFP_KERNEL); |
4780 | else | 4814 | else |
4781 | mem = vzalloc(size); | 4815 | memcg = vzalloc(size); |
4782 | 4816 | ||
4783 | if (!mem) | 4817 | if (!memcg) |
4784 | return NULL; | 4818 | return NULL; |
4785 | 4819 | ||
4786 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | 4820 | memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); |
4787 | if (!mem->stat) | 4821 | if (!memcg->stat) |
4788 | goto out_free; | 4822 | goto out_free; |
4789 | spin_lock_init(&mem->pcp_counter_lock); | 4823 | spin_lock_init(&memcg->pcp_counter_lock); |
4790 | return mem; | 4824 | return memcg; |
4791 | 4825 | ||
4792 | out_free: | 4826 | out_free: |
4793 | if (size < PAGE_SIZE) | 4827 | if (size < PAGE_SIZE) |
4794 | kfree(mem); | 4828 | kfree(memcg); |
4795 | else | 4829 | else |
4796 | vfree(mem); | 4830 | vfree(memcg); |
4797 | return NULL; | 4831 | return NULL; |
4798 | } | 4832 | } |
4799 | 4833 | ||
@@ -4981,6 +5015,7 @@ mem_cgroup_create(struct cgroup *cont) | |||
4981 | atomic_set(&memcg->refcnt, 1); | 5015 | atomic_set(&memcg->refcnt, 1); |
4982 | memcg->move_charge_at_immigrate = 0; | 5016 | memcg->move_charge_at_immigrate = 0; |
4983 | mutex_init(&memcg->thresholds_lock); | 5017 | mutex_init(&memcg->thresholds_lock); |
5018 | spin_lock_init(&memcg->move_lock); | ||
4984 | return &memcg->css; | 5019 | return &memcg->css; |
4985 | free_out: | 5020 | free_out: |
4986 | __mem_cgroup_free(memcg); | 5021 | __mem_cgroup_free(memcg); |
@@ -5075,7 +5110,7 @@ one_by_one: | |||
5075 | } | 5110 | } |
5076 | 5111 | ||
5077 | /** | 5112 | /** |
5078 | * is_target_pte_for_mc - check a pte whether it is valid for move charge | 5113 | * get_mctgt_type - get target type of moving charge |
5079 | * @vma: the vma the pte to be checked belongs | 5114 | * @vma: the vma the pte to be checked belongs |
5080 | * @addr: the address corresponding to the pte to be checked | 5115 | * @addr: the address corresponding to the pte to be checked |
5081 | * @ptent: the pte to be checked | 5116 | * @ptent: the pte to be checked |
@@ -5098,7 +5133,7 @@ union mc_target { | |||
5098 | }; | 5133 | }; |
5099 | 5134 | ||
5100 | enum mc_target_type { | 5135 | enum mc_target_type { |
5101 | MC_TARGET_NONE, /* not used */ | 5136 | MC_TARGET_NONE = 0, |
5102 | MC_TARGET_PAGE, | 5137 | MC_TARGET_PAGE, |
5103 | MC_TARGET_SWAP, | 5138 | MC_TARGET_SWAP, |
5104 | }; | 5139 | }; |
@@ -5179,12 +5214,12 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
5179 | return page; | 5214 | return page; |
5180 | } | 5215 | } |
5181 | 5216 | ||
5182 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | 5217 | static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, |
5183 | unsigned long addr, pte_t ptent, union mc_target *target) | 5218 | unsigned long addr, pte_t ptent, union mc_target *target) |
5184 | { | 5219 | { |
5185 | struct page *page = NULL; | 5220 | struct page *page = NULL; |
5186 | struct page_cgroup *pc; | 5221 | struct page_cgroup *pc; |
5187 | int ret = 0; | 5222 | enum mc_target_type ret = MC_TARGET_NONE; |
5188 | swp_entry_t ent = { .val = 0 }; | 5223 | swp_entry_t ent = { .val = 0 }; |
5189 | 5224 | ||
5190 | if (pte_present(ptent)) | 5225 | if (pte_present(ptent)) |
@@ -5195,7 +5230,7 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
5195 | page = mc_handle_file_pte(vma, addr, ptent, &ent); | 5230 | page = mc_handle_file_pte(vma, addr, ptent, &ent); |
5196 | 5231 | ||
5197 | if (!page && !ent.val) | 5232 | if (!page && !ent.val) |
5198 | return 0; | 5233 | return ret; |
5199 | if (page) { | 5234 | if (page) { |
5200 | pc = lookup_page_cgroup(page); | 5235 | pc = lookup_page_cgroup(page); |
5201 | /* | 5236 | /* |
@@ -5221,6 +5256,41 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
5221 | return ret; | 5256 | return ret; |
5222 | } | 5257 | } |
5223 | 5258 | ||
5259 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
5260 | /* | ||
5261 | * We don't consider swapping or file mapped pages because THP does not | ||
5262 | * support them for now. | ||
5263 | * Caller should make sure that pmd_trans_huge(pmd) is true. | ||
5264 | */ | ||
5265 | static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | ||
5266 | unsigned long addr, pmd_t pmd, union mc_target *target) | ||
5267 | { | ||
5268 | struct page *page = NULL; | ||
5269 | struct page_cgroup *pc; | ||
5270 | enum mc_target_type ret = MC_TARGET_NONE; | ||
5271 | |||
5272 | page = pmd_page(pmd); | ||
5273 | VM_BUG_ON(!page || !PageHead(page)); | ||
5274 | if (!move_anon()) | ||
5275 | return ret; | ||
5276 | pc = lookup_page_cgroup(page); | ||
5277 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | ||
5278 | ret = MC_TARGET_PAGE; | ||
5279 | if (target) { | ||
5280 | get_page(page); | ||
5281 | target->page = page; | ||
5282 | } | ||
5283 | } | ||
5284 | return ret; | ||
5285 | } | ||
5286 | #else | ||
5287 | static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | ||
5288 | unsigned long addr, pmd_t pmd, union mc_target *target) | ||
5289 | { | ||
5290 | return MC_TARGET_NONE; | ||
5291 | } | ||
5292 | #endif | ||
5293 | |||
5224 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | 5294 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, |
5225 | unsigned long addr, unsigned long end, | 5295 | unsigned long addr, unsigned long end, |
5226 | struct mm_walk *walk) | 5296 | struct mm_walk *walk) |
@@ -5229,11 +5299,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
5229 | pte_t *pte; | 5299 | pte_t *pte; |
5230 | spinlock_t *ptl; | 5300 | spinlock_t *ptl; |
5231 | 5301 | ||
5232 | split_huge_page_pmd(walk->mm, pmd); | 5302 | if (pmd_trans_huge_lock(pmd, vma) == 1) { |
5303 | if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) | ||
5304 | mc.precharge += HPAGE_PMD_NR; | ||
5305 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
5306 | return 0; | ||
5307 | } | ||
5233 | 5308 | ||
5234 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 5309 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
5235 | for (; addr != end; pte++, addr += PAGE_SIZE) | 5310 | for (; addr != end; pte++, addr += PAGE_SIZE) |
5236 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | 5311 | if (get_mctgt_type(vma, addr, *pte, NULL)) |
5237 | mc.precharge++; /* increment precharge temporarily */ | 5312 | mc.precharge++; /* increment precharge temporarily */ |
5238 | pte_unmap_unlock(pte - 1, ptl); | 5313 | pte_unmap_unlock(pte - 1, ptl); |
5239 | cond_resched(); | 5314 | cond_resched(); |
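Instead of splitting a huge pmd up front with split_huge_page_pmd(), the precharge walker above (and the charge-moving walker in the next hunk) handles a transparent huge pmd in place. The assumed contract is that pmd_trans_huge_lock() returns 1 with page_table_lock held when the pmd is a stable trans-huge entry, so the whole HPAGE_PMD_NR range can be accounted at once; schematically:

	if (pmd_trans_huge_lock(pmd, vma) == 1) {
		/* stable huge pmd; page_table_lock is held */
		/* ... account or move the whole HPAGE_PMD_NR-page range ... */
		spin_unlock(&vma->vm_mm->page_table_lock);
		return 0;
	}
	/* not huge (or mid-split): fall back to the pte-by-pte walk */

This is what lets task migration of a memcg avoid splitting THPs just to walk their ptes.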
@@ -5388,23 +5463,55 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
5388 | struct vm_area_struct *vma = walk->private; | 5463 | struct vm_area_struct *vma = walk->private; |
5389 | pte_t *pte; | 5464 | pte_t *pte; |
5390 | spinlock_t *ptl; | 5465 | spinlock_t *ptl; |
5466 | enum mc_target_type target_type; | ||
5467 | union mc_target target; | ||
5468 | struct page *page; | ||
5469 | struct page_cgroup *pc; | ||
5470 | |||
5471 | /* | ||
5472 | * We don't take compound_lock() here but no race with splitting thp | ||
5473 | * happens because: | ||
5474 | * - if pmd_trans_huge_lock() returns 1, the relevant thp is not | ||
5475 | * under splitting, which means there's no concurrent thp split, | ||
5476 | * - if another thread runs into split_huge_page() just after we | ||
5477 | * entered this if-block, the thread must wait for page table lock | ||
5478 | * to be unlocked in __split_huge_page_splitting(), where the main | ||
5479 | * part of thp split is not executed yet. | ||
5480 | */ | ||
5481 | if (pmd_trans_huge_lock(pmd, vma) == 1) { | ||
5482 | if (!mc.precharge) { | ||
5483 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
5484 | return 0; | ||
5485 | } | ||
5486 | target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); | ||
5487 | if (target_type == MC_TARGET_PAGE) { | ||
5488 | page = target.page; | ||
5489 | if (!isolate_lru_page(page)) { | ||
5490 | pc = lookup_page_cgroup(page); | ||
5491 | if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, | ||
5492 | pc, mc.from, mc.to, | ||
5493 | false)) { | ||
5494 | mc.precharge -= HPAGE_PMD_NR; | ||
5495 | mc.moved_charge += HPAGE_PMD_NR; | ||
5496 | } | ||
5497 | putback_lru_page(page); | ||
5498 | } | ||
5499 | put_page(page); | ||
5500 | } | ||
5501 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
5502 | return 0; | ||
5503 | } | ||
5391 | 5504 | ||
5392 | split_huge_page_pmd(walk->mm, pmd); | ||
5393 | retry: | 5505 | retry: |
5394 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 5506 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
5395 | for (; addr != end; addr += PAGE_SIZE) { | 5507 | for (; addr != end; addr += PAGE_SIZE) { |
5396 | pte_t ptent = *(pte++); | 5508 | pte_t ptent = *(pte++); |
5397 | union mc_target target; | ||
5398 | int type; | ||
5399 | struct page *page; | ||
5400 | struct page_cgroup *pc; | ||
5401 | swp_entry_t ent; | 5509 | swp_entry_t ent; |
5402 | 5510 | ||
5403 | if (!mc.precharge) | 5511 | if (!mc.precharge) |
5404 | break; | 5512 | break; |
5405 | 5513 | ||
5406 | type = is_target_pte_for_mc(vma, addr, ptent, &target); | 5514 | switch (get_mctgt_type(vma, addr, ptent, &target)) { |
5407 | switch (type) { | ||
5408 | case MC_TARGET_PAGE: | 5515 | case MC_TARGET_PAGE: |
5409 | page = target.page; | 5516 | page = target.page; |
5410 | if (isolate_lru_page(page)) | 5517 | if (isolate_lru_page(page)) |
@@ -5417,7 +5524,7 @@ retry: | |||
5417 | mc.moved_charge++; | 5524 | mc.moved_charge++; |
5418 | } | 5525 | } |
5419 | putback_lru_page(page); | 5526 | putback_lru_page(page); |
5420 | put: /* is_target_pte_for_mc() gets the page */ | 5527 | put: /* get_mctgt_type() gets the page */ |
5421 | put_page(page); | 5528 | put_page(page); |
5422 | break; | 5529 | break; |
5423 | case MC_TARGET_SWAP: | 5530 | case MC_TARGET_SWAP: |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 56080ea36140..c22076ffdd44 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -1063,7 +1063,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) | |||
1063 | * The check (unnecessarily) ignores LRU pages being isolated and | 1063 | * The check (unnecessarily) ignores LRU pages being isolated and |
1064 | * walked by the page reclaim code, however that's not a big loss. | 1064 | * walked by the page reclaim code, however that's not a big loss. |
1065 | */ | 1065 | */ |
1066 | if (!PageHuge(p) && !PageTransCompound(p)) { | 1066 | if (!PageHuge(p) && !PageTransTail(p)) { |
1067 | if (!PageLRU(p)) | 1067 | if (!PageLRU(p)) |
1068 | shake_page(p, 0); | 1068 | shake_page(p, 0); |
1069 | if (!PageLRU(p)) { | 1069 | if (!PageLRU(p)) { |
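The memory-failure check is narrowed from PageTransCompound() to PageTransTail(), so only THP tail pages are excluded from the shake_page()/PageLRU handling while head pages go through it like any other page. For illustration, the assumed definitions with CONFIG_TRANSPARENT_HUGEPAGE enabled are:

	/* assumed definitions, for illustration only */
	static inline int PageTransCompound(struct page *page)
	{
		return PageCompound(page);	/* head or tail of a THP */
	}

	static inline int PageTransTail(struct page *page)
	{
		return PageTail(page);		/* tail pages only */
	}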
diff --git a/mm/memory.c b/mm/memory.c index 8438c157e4d9..3416b6e018d6 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -125,17 +125,17 @@ core_initcall(init_zero_pfn); | |||
125 | 125 | ||
126 | #if defined(SPLIT_RSS_COUNTING) | 126 | #if defined(SPLIT_RSS_COUNTING) |
127 | 127 | ||
128 | static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) | 128 | void sync_mm_rss(struct mm_struct *mm) |
129 | { | 129 | { |
130 | int i; | 130 | int i; |
131 | 131 | ||
132 | for (i = 0; i < NR_MM_COUNTERS; i++) { | 132 | for (i = 0; i < NR_MM_COUNTERS; i++) { |
133 | if (task->rss_stat.count[i]) { | 133 | if (current->rss_stat.count[i]) { |
134 | add_mm_counter(mm, i, task->rss_stat.count[i]); | 134 | add_mm_counter(mm, i, current->rss_stat.count[i]); |
135 | task->rss_stat.count[i] = 0; | 135 | current->rss_stat.count[i] = 0; |
136 | } | 136 | } |
137 | } | 137 | } |
138 | task->rss_stat.events = 0; | 138 | current->rss_stat.events = 0; |
139 | } | 139 | } |
140 | 140 | ||
141 | static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) | 141 | static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) |
@@ -157,30 +157,7 @@ static void check_sync_rss_stat(struct task_struct *task) | |||
157 | if (unlikely(task != current)) | 157 | if (unlikely(task != current)) |
158 | return; | 158 | return; |
159 | if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) | 159 | if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) |
160 | __sync_task_rss_stat(task, task->mm); | 160 | sync_mm_rss(task->mm); |
161 | } | ||
162 | |||
163 | unsigned long get_mm_counter(struct mm_struct *mm, int member) | ||
164 | { | ||
165 | long val = 0; | ||
166 | |||
167 | /* | ||
168 | * Don't use task->mm here...for avoiding to use task_get_mm().. | ||
169 | * The caller must guarantee task->mm is not invalid. | ||
170 | */ | ||
171 | val = atomic_long_read(&mm->rss_stat.count[member]); | ||
172 | /* | ||
173 | * counter is updated in asynchronous manner and may go to minus. | ||
174 | * But it's never be expected number for users. | ||
175 | */ | ||
176 | if (val < 0) | ||
177 | return 0; | ||
178 | return (unsigned long)val; | ||
179 | } | ||
180 | |||
181 | void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) | ||
182 | { | ||
183 | __sync_task_rss_stat(task, mm); | ||
184 | } | 161 | } |
185 | #else /* SPLIT_RSS_COUNTING */ | 162 | #else /* SPLIT_RSS_COUNTING */ |
186 | 163 | ||
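With the split RSS counters, sync_mm_rss() now only ever folds current's private deltas into the mm, so the task argument is gone, and the separately removed get_mm_counter() presumably moves into a header as an inline. A sketch of what that reader side still has to do (an assumption about where it lands; the new home is not shown in this diff):

	/* assumed inline replacement for the removed get_mm_counter() */
	static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
	{
		long val = atomic_long_read(&mm->rss_stat.count[member]);

		/* deltas are folded in lazily, so the sum can dip below zero */
		if (val < 0)
			return 0;
		return (unsigned long)val;
	}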
@@ -661,7 +638,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) | |||
661 | int i; | 638 | int i; |
662 | 639 | ||
663 | if (current->mm == mm) | 640 | if (current->mm == mm) |
664 | sync_mm_rss(current, mm); | 641 | sync_mm_rss(mm); |
665 | for (i = 0; i < NR_MM_COUNTERS; i++) | 642 | for (i = 0; i < NR_MM_COUNTERS; i++) |
666 | if (rss[i]) | 643 | if (rss[i]) |
667 | add_mm_counter(mm, i, rss[i]); | 644 | add_mm_counter(mm, i, rss[i]); |
@@ -1247,16 +1224,24 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1247 | do { | 1224 | do { |
1248 | next = pmd_addr_end(addr, end); | 1225 | next = pmd_addr_end(addr, end); |
1249 | if (pmd_trans_huge(*pmd)) { | 1226 | if (pmd_trans_huge(*pmd)) { |
1250 | if (next-addr != HPAGE_PMD_SIZE) { | 1227 | if (next - addr != HPAGE_PMD_SIZE) { |
1251 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | 1228 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); |
1252 | split_huge_page_pmd(vma->vm_mm, pmd); | 1229 | split_huge_page_pmd(vma->vm_mm, pmd); |
1253 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) | 1230 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
1254 | continue; | 1231 | goto next; |
1255 | /* fall through */ | 1232 | /* fall through */ |
1256 | } | 1233 | } |
1257 | if (pmd_none_or_clear_bad(pmd)) | 1234 | /* |
1258 | continue; | 1235 | * Here there can be other concurrent MADV_DONTNEED or |
1236 | * trans huge page faults running, and if the pmd is | ||
1237 | * none or trans huge it can change under us. This is | ||
1238 | * because MADV_DONTNEED holds the mmap_sem in read | ||
1239 | * mode. | ||
1240 | */ | ||
1241 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | ||
1242 | goto next; | ||
1259 | next = zap_pte_range(tlb, vma, pmd, addr, next, details); | 1243 | next = zap_pte_range(tlb, vma, pmd, addr, next, details); |
1244 | next: | ||
1260 | cond_resched(); | 1245 | cond_resched(); |
1261 | } while (pmd++, addr = next, addr != end); | 1246 | } while (pmd++, addr = next, addr != end); |
1262 | 1247 | ||
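zap_pmd_range() now tests the pmd with pmd_none_or_trans_huge_or_clear_bad() (as do the mempolicy and mincore walkers below), because MADV_DONTNEED runs with mmap_sem held only for reading, so a concurrent huge page fault or another MADV_DONTNEED can switch the pmd between none and trans-huge under the walker, and the old pmd_none_or_clear_bad() would have treated that as a corrupt pmd. A simplified sketch of what such a helper needs to do (not the exact asm-generic implementation):

	static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
	{
		pmd_t pmdval = *pmd;

		barrier();	/* test a single snapshot of the pmd value */
		if (pmd_none(pmdval) || pmd_trans_huge(pmdval))
			return 1;	/* nothing for a pte walker to do here */
		if (unlikely(pmd_bad(pmdval))) {
			pmd_clear_bad(pmd);
			return 1;
		}
		return 0;
	}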
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 47296fee23db..cfb6c8678754 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -512,7 +512,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
512 | do { | 512 | do { |
513 | next = pmd_addr_end(addr, end); | 513 | next = pmd_addr_end(addr, end); |
514 | split_huge_page_pmd(vma->vm_mm, pmd); | 514 | split_huge_page_pmd(vma->vm_mm, pmd); |
515 | if (pmd_none_or_clear_bad(pmd)) | 515 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
516 | continue; | 516 | continue; |
517 | if (check_pte_range(vma, pmd, addr, next, nodes, | 517 | if (check_pte_range(vma, pmd, addr, next, nodes, |
518 | flags, private)) | 518 | flags, private)) |
@@ -1323,12 +1323,9 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1323 | err = -ESRCH; | 1323 | err = -ESRCH; |
1324 | goto out; | 1324 | goto out; |
1325 | } | 1325 | } |
1326 | mm = get_task_mm(task); | 1326 | get_task_struct(task); |
1327 | rcu_read_unlock(); | ||
1328 | 1327 | ||
1329 | err = -EINVAL; | 1328 | err = -EINVAL; |
1330 | if (!mm) | ||
1331 | goto out; | ||
1332 | 1329 | ||
1333 | /* | 1330 | /* |
1334 | * Check if this process has the right to modify the specified | 1331 | * Check if this process has the right to modify the specified |
@@ -1336,14 +1333,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1336 | * capabilities, superuser privileges or the same | 1333 | * capabilities, superuser privileges or the same |
1337 | * userid as the target process. | 1334 | * userid as the target process. |
1338 | */ | 1335 | */ |
1339 | rcu_read_lock(); | ||
1340 | tcred = __task_cred(task); | 1336 | tcred = __task_cred(task); |
1341 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && | 1337 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && |
1342 | cred->uid != tcred->suid && cred->uid != tcred->uid && | 1338 | cred->uid != tcred->suid && cred->uid != tcred->uid && |
1343 | !capable(CAP_SYS_NICE)) { | 1339 | !capable(CAP_SYS_NICE)) { |
1344 | rcu_read_unlock(); | 1340 | rcu_read_unlock(); |
1345 | err = -EPERM; | 1341 | err = -EPERM; |
1346 | goto out; | 1342 | goto out_put; |
1347 | } | 1343 | } |
1348 | rcu_read_unlock(); | 1344 | rcu_read_unlock(); |
1349 | 1345 | ||
@@ -1351,26 +1347,36 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
1351 | /* Is the user allowed to access the target nodes? */ | 1347 | /* Is the user allowed to access the target nodes? */ |
1352 | if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { | 1348 | if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { |
1353 | err = -EPERM; | 1349 | err = -EPERM; |
1354 | goto out; | 1350 | goto out_put; |
1355 | } | 1351 | } |
1356 | 1352 | ||
1357 | if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { | 1353 | if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { |
1358 | err = -EINVAL; | 1354 | err = -EINVAL; |
1359 | goto out; | 1355 | goto out_put; |
1360 | } | 1356 | } |
1361 | 1357 | ||
1362 | err = security_task_movememory(task); | 1358 | err = security_task_movememory(task); |
1363 | if (err) | 1359 | if (err) |
1364 | goto out; | 1360 | goto out_put; |
1365 | 1361 | ||
1366 | err = do_migrate_pages(mm, old, new, | 1362 | mm = get_task_mm(task); |
1367 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); | 1363 | put_task_struct(task); |
1368 | out: | ||
1369 | if (mm) | 1364 | if (mm) |
1370 | mmput(mm); | 1365 | err = do_migrate_pages(mm, old, new, |
1366 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); | ||
1367 | else | ||
1368 | err = -EINVAL; | ||
1369 | |||
1370 | mmput(mm); | ||
1371 | out: | ||
1371 | NODEMASK_SCRATCH_FREE(scratch); | 1372 | NODEMASK_SCRATCH_FREE(scratch); |
1372 | 1373 | ||
1373 | return err; | 1374 | return err; |
1375 | |||
1376 | out_put: | ||
1377 | put_task_struct(task); | ||
1378 | goto out; | ||
1379 | |||
1374 | } | 1380 | } |
1375 | 1381 | ||
1376 | 1382 | ||
@@ -1844,18 +1850,24 @@ struct page * | |||
1844 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | 1850 | alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, |
1845 | unsigned long addr, int node) | 1851 | unsigned long addr, int node) |
1846 | { | 1852 | { |
1847 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1853 | struct mempolicy *pol; |
1848 | struct zonelist *zl; | 1854 | struct zonelist *zl; |
1849 | struct page *page; | 1855 | struct page *page; |
1856 | unsigned int cpuset_mems_cookie; | ||
1857 | |||
1858 | retry_cpuset: | ||
1859 | pol = get_vma_policy(current, vma, addr); | ||
1860 | cpuset_mems_cookie = get_mems_allowed(); | ||
1850 | 1861 | ||
1851 | get_mems_allowed(); | ||
1852 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 1862 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
1853 | unsigned nid; | 1863 | unsigned nid; |
1854 | 1864 | ||
1855 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); | 1865 | nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); |
1856 | mpol_cond_put(pol); | 1866 | mpol_cond_put(pol); |
1857 | page = alloc_page_interleave(gfp, order, nid); | 1867 | page = alloc_page_interleave(gfp, order, nid); |
1858 | put_mems_allowed(); | 1868 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1869 | goto retry_cpuset; | ||
1870 | |||
1859 | return page; | 1871 | return page; |
1860 | } | 1872 | } |
1861 | zl = policy_zonelist(gfp, pol, node); | 1873 | zl = policy_zonelist(gfp, pol, node); |
@@ -1866,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | |||
1866 | struct page *page = __alloc_pages_nodemask(gfp, order, | 1878 | struct page *page = __alloc_pages_nodemask(gfp, order, |
1867 | zl, policy_nodemask(gfp, pol)); | 1879 | zl, policy_nodemask(gfp, pol)); |
1868 | __mpol_put(pol); | 1880 | __mpol_put(pol); |
1869 | put_mems_allowed(); | 1881 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1882 | goto retry_cpuset; | ||
1870 | return page; | 1883 | return page; |
1871 | } | 1884 | } |
1872 | /* | 1885 | /* |
@@ -1874,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | |||
1874 | */ | 1887 | */ |
1875 | page = __alloc_pages_nodemask(gfp, order, zl, | 1888 | page = __alloc_pages_nodemask(gfp, order, zl, |
1876 | policy_nodemask(gfp, pol)); | 1889 | policy_nodemask(gfp, pol)); |
1877 | put_mems_allowed(); | 1890 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
1891 | goto retry_cpuset; | ||
1878 | return page; | 1892 | return page; |
1879 | } | 1893 | } |
1880 | 1894 | ||
@@ -1901,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1901 | { | 1915 | { |
1902 | struct mempolicy *pol = current->mempolicy; | 1916 | struct mempolicy *pol = current->mempolicy; |
1903 | struct page *page; | 1917 | struct page *page; |
1918 | unsigned int cpuset_mems_cookie; | ||
1904 | 1919 | ||
1905 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 1920 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
1906 | pol = &default_policy; | 1921 | pol = &default_policy; |
1907 | 1922 | ||
1908 | get_mems_allowed(); | 1923 | retry_cpuset: |
1924 | cpuset_mems_cookie = get_mems_allowed(); | ||
1925 | |||
1909 | /* | 1926 | /* |
1910 | * No reference counting needed for current->mempolicy | 1927 | * No reference counting needed for current->mempolicy |
1911 | * nor system default_policy | 1928 | * nor system default_policy |
@@ -1916,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
1916 | page = __alloc_pages_nodemask(gfp, order, | 1933 | page = __alloc_pages_nodemask(gfp, order, |
1917 | policy_zonelist(gfp, pol, numa_node_id()), | 1934 | policy_zonelist(gfp, pol, numa_node_id()), |
1918 | policy_nodemask(gfp, pol)); | 1935 | policy_nodemask(gfp, pol)); |
1919 | put_mems_allowed(); | 1936 | |
1937 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
1938 | goto retry_cpuset; | ||
1939 | |||
1920 | return page; | 1940 | return page; |
1921 | } | 1941 | } |
1922 | EXPORT_SYMBOL(alloc_pages_current); | 1942 | EXPORT_SYMBOL(alloc_pages_current); |
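Both alloc_pages_vma() and alloc_pages_current() replace the old get_mems_allowed()/put_mems_allowed() critical section with a cookie-and-retry scheme: the cookie is assumed to be a seqcount-style snapshot, and put_mems_allowed() to return false when the cpuset's mems_allowed changed underneath, so an allocation that failed only because the nodemask moved is retried instead of spuriously failing. Condensed into a hypothetical helper:

	/* hypothetical condensed form of the retry loops above */
	static struct page *alloc_pages_cpuset_retry(gfp_t gfp, unsigned int order,
						     struct zonelist *zl,
						     nodemask_t *nodemask)
	{
		unsigned int cpuset_mems_cookie;
		struct page *page;

		do {
			cpuset_mems_cookie = get_mems_allowed();
			page = __alloc_pages_nodemask(gfp, order, zl, nodemask);
			/* retry only if we failed and the cpuset changed meanwhile */
		} while (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page));

		return page;
	}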
diff --git a/mm/migrate.c b/mm/migrate.c index 1503b6b54ecb..51c08a0c6f68 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -1174,20 +1174,17 @@ set_status: | |||
1174 | * Migrate an array of page address onto an array of nodes and fill | 1174 | * Migrate an array of page address onto an array of nodes and fill |
1175 | * the corresponding array of status. | 1175 | * the corresponding array of status. |
1176 | */ | 1176 | */ |
1177 | static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | 1177 | static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, |
1178 | unsigned long nr_pages, | 1178 | unsigned long nr_pages, |
1179 | const void __user * __user *pages, | 1179 | const void __user * __user *pages, |
1180 | const int __user *nodes, | 1180 | const int __user *nodes, |
1181 | int __user *status, int flags) | 1181 | int __user *status, int flags) |
1182 | { | 1182 | { |
1183 | struct page_to_node *pm; | 1183 | struct page_to_node *pm; |
1184 | nodemask_t task_nodes; | ||
1185 | unsigned long chunk_nr_pages; | 1184 | unsigned long chunk_nr_pages; |
1186 | unsigned long chunk_start; | 1185 | unsigned long chunk_start; |
1187 | int err; | 1186 | int err; |
1188 | 1187 | ||
1189 | task_nodes = cpuset_mems_allowed(task); | ||
1190 | |||
1191 | err = -ENOMEM; | 1188 | err = -ENOMEM; |
1192 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); | 1189 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); |
1193 | if (!pm) | 1190 | if (!pm) |
@@ -1349,6 +1346,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1349 | struct task_struct *task; | 1346 | struct task_struct *task; |
1350 | struct mm_struct *mm; | 1347 | struct mm_struct *mm; |
1351 | int err; | 1348 | int err; |
1349 | nodemask_t task_nodes; | ||
1352 | 1350 | ||
1353 | /* Check flags */ | 1351 | /* Check flags */ |
1354 | if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) | 1352 | if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) |
@@ -1364,11 +1362,7 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1364 | rcu_read_unlock(); | 1362 | rcu_read_unlock(); |
1365 | return -ESRCH; | 1363 | return -ESRCH; |
1366 | } | 1364 | } |
1367 | mm = get_task_mm(task); | 1365 | get_task_struct(task); |
1368 | rcu_read_unlock(); | ||
1369 | |||
1370 | if (!mm) | ||
1371 | return -EINVAL; | ||
1372 | 1366 | ||
1373 | /* | 1367 | /* |
1374 | * Check if this process has the right to modify the specified | 1368 | * Check if this process has the right to modify the specified |
@@ -1376,7 +1370,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1376 | * capabilities, superuser privileges or the same | 1370 | * capabilities, superuser privileges or the same |
1377 | * userid as the target process. | 1371 | * userid as the target process. |
1378 | */ | 1372 | */ |
1379 | rcu_read_lock(); | ||
1380 | tcred = __task_cred(task); | 1373 | tcred = __task_cred(task); |
1381 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && | 1374 | if (cred->euid != tcred->suid && cred->euid != tcred->uid && |
1382 | cred->uid != tcred->suid && cred->uid != tcred->uid && | 1375 | cred->uid != tcred->suid && cred->uid != tcred->uid && |
@@ -1391,16 +1384,25 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, | |||
1391 | if (err) | 1384 | if (err) |
1392 | goto out; | 1385 | goto out; |
1393 | 1386 | ||
1394 | if (nodes) { | 1387 | task_nodes = cpuset_mems_allowed(task); |
1395 | err = do_pages_move(mm, task, nr_pages, pages, nodes, status, | 1388 | mm = get_task_mm(task); |
1396 | flags); | 1389 | put_task_struct(task); |
1397 | } else { | 1390 | |
1398 | err = do_pages_stat(mm, nr_pages, pages, status); | 1391 | if (mm) { |
1399 | } | 1392 | if (nodes) |
1393 | err = do_pages_move(mm, task_nodes, nr_pages, pages, | ||
1394 | nodes, status, flags); | ||
1395 | else | ||
1396 | err = do_pages_stat(mm, nr_pages, pages, status); | ||
1397 | } else | ||
1398 | err = -EINVAL; | ||
1400 | 1399 | ||
1401 | out: | ||
1402 | mmput(mm); | 1400 | mmput(mm); |
1403 | return err; | 1401 | return err; |
1402 | |||
1403 | out: | ||
1404 | put_task_struct(task); | ||
1405 | return err; | ||
1404 | } | 1406 | } |
1405 | 1407 | ||
1406 | /* | 1408 | /* |
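sys_migrate_pages() above and sys_move_pages() here share the same restructuring: pin the target task with get_task_struct() while still under RCU, do the credential check, drop RCU, and only then take the mm reference once the permission and security checks have passed. Condensed (a sketch of the flow, not the exact syscall bodies):

	rcu_read_lock();
	task = pid ? find_task_by_vpid(pid) : current;
	if (!task) {
		rcu_read_unlock();
		return -ESRCH;
	}
	get_task_struct(task);		/* keep task_struct valid past RCU */

	tcred = __task_cred(task);
	/* ... uid/capability checks, bail out via put_task_struct() on -EPERM ... */
	rcu_read_unlock();

	err = security_task_movememory(task);
	/* ... bail out via put_task_struct() on error ... */

	mm = get_task_mm(task);		/* NULL if the task has no mm anymore */
	put_task_struct(task);
	if (!mm)
		return -EINVAL;
	/* ... do_migrate_pages() or do_pages_move()/do_pages_stat() ... */
	mmput(mm);

Holding a task reference, rather than an mm reference taken inside the RCU section, is what makes cpuset_mems_allowed(task) and the late get_task_mm() safe after rcu_read_unlock().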
diff --git a/mm/mincore.c b/mm/mincore.c index 636a86876ff2..936b4cee8cb1 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -164,7 +164,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
164 | } | 164 | } |
165 | /* fall through */ | 165 | /* fall through */ |
166 | } | 166 | } |
167 | if (pmd_none_or_clear_bad(pmd)) | 167 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
168 | mincore_unmapped_range(vma, addr, next, vec); | 168 | mincore_unmapped_range(vma, addr, next, vec); |
169 | else | 169 | else |
170 | mincore_pte_range(vma, pmd, addr, next, vec); | 170 | mincore_pte_range(vma, pmd, addr, next, vec); |
@@ -451,9 +451,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
451 | } | 451 | } |
452 | 452 | ||
453 | /* | 453 | /* |
454 | * Helper for vma_adjust in the split_vma insert case: | 454 | * Helper for vma_adjust() in the split_vma insert case: insert a vma into the |
455 | * insert vm structure into list and rbtree and anon_vma, | 455 | * mm's list and rbtree. It has already been inserted into the prio_tree. |
456 | * but it has already been inserted into prio_tree earlier. | ||
457 | */ | 456 | */ |
458 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) | 457 | static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) |
459 | { | 458 | { |
@@ -1112,9 +1111,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1112 | * A dummy user value is used because we are not locking | 1111 | * A dummy user value is used because we are not locking |
1113 | * memory so no accounting is necessary | 1112 | * memory so no accounting is necessary |
1114 | */ | 1113 | */ |
1115 | len = ALIGN(len, huge_page_size(&default_hstate)); | 1114 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, |
1116 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, | 1115 | VM_NORESERVE, &user, |
1117 | &user, HUGETLB_ANONHUGE_INODE); | 1116 | HUGETLB_ANONHUGE_INODE); |
1118 | if (IS_ERR(file)) | 1117 | if (IS_ERR(file)) |
1119 | return PTR_ERR(file); | 1118 | return PTR_ERR(file); |
1120 | } | 1119 | } |
@@ -1439,10 +1438,8 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr) | |||
1439 | /* | 1438 | /* |
1440 | * Is this a new hole at the lowest possible address? | 1439 | * Is this a new hole at the lowest possible address? |
1441 | */ | 1440 | */ |
1442 | if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) { | 1441 | if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) |
1443 | mm->free_area_cache = addr; | 1442 | mm->free_area_cache = addr; |
1444 | mm->cached_hole_size = ~0UL; | ||
1445 | } | ||
1446 | } | 1443 | } |
1447 | 1444 | ||
1448 | /* | 1445 | /* |
@@ -1457,7 +1454,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1457 | { | 1454 | { |
1458 | struct vm_area_struct *vma; | 1455 | struct vm_area_struct *vma; |
1459 | struct mm_struct *mm = current->mm; | 1456 | struct mm_struct *mm = current->mm; |
1460 | unsigned long addr = addr0; | 1457 | unsigned long addr = addr0, start_addr; |
1461 | 1458 | ||
1462 | /* requested length too big for entire address space */ | 1459 | /* requested length too big for entire address space */ |
1463 | if (len > TASK_SIZE) | 1460 | if (len > TASK_SIZE) |
@@ -1481,22 +1478,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1481 | mm->free_area_cache = mm->mmap_base; | 1478 | mm->free_area_cache = mm->mmap_base; |
1482 | } | 1479 | } |
1483 | 1480 | ||
1481 | try_again: | ||
1484 | /* either no address requested or can't fit in requested address hole */ | 1482 | /* either no address requested or can't fit in requested address hole */ |
1485 | addr = mm->free_area_cache; | 1483 | start_addr = addr = mm->free_area_cache; |
1486 | |||
1487 | /* make sure it can fit in the remaining address space */ | ||
1488 | if (addr > len) { | ||
1489 | vma = find_vma(mm, addr-len); | ||
1490 | if (!vma || addr <= vma->vm_start) | ||
1491 | /* remember the address as a hint for next time */ | ||
1492 | return (mm->free_area_cache = addr-len); | ||
1493 | } | ||
1494 | |||
1495 | if (mm->mmap_base < len) | ||
1496 | goto bottomup; | ||
1497 | 1484 | ||
1498 | addr = mm->mmap_base-len; | 1485 | if (addr < len) |
1486 | goto fail; | ||
1499 | 1487 | ||
1488 | addr -= len; | ||
1500 | do { | 1489 | do { |
1501 | /* | 1490 | /* |
1502 | * Lookup failure means no vma is above this address, | 1491 | * Lookup failure means no vma is above this address, |
@@ -1516,7 +1505,21 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, | |||
1516 | addr = vma->vm_start-len; | 1505 | addr = vma->vm_start-len; |
1517 | } while (len < vma->vm_start); | 1506 | } while (len < vma->vm_start); |
1518 | 1507 | ||
1519 | bottomup: | 1508 | fail: |
1509 | /* | ||
1510 | * if hint left us with no space for the requested | ||
1511 | * mapping then try again: | ||
1512 | * | ||
1513 | * Note: this differs from the bottom-up case, which does a | ||
1514 | * full linear search; here we use find_vma(), which can skip | ||
1515 | * over some holes. | ||
1516 | */ | ||
1517 | if (start_addr != mm->mmap_base) { | ||
1518 | mm->free_area_cache = mm->mmap_base; | ||
1519 | mm->cached_hole_size = 0; | ||
1520 | goto try_again; | ||
1521 | } | ||
1522 | |||
1520 | /* | 1523 | /* |
1521 | * A failed mmap() very likely causes application failure, | 1524 | * A failed mmap() very likely causes application failure, |
1522 | * so fall back to the bottom-up function here. This scenario | 1525 | * so fall back to the bottom-up function here. This scenario |
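The rewritten top-down search above starts from the cached free_area_cache hint, and when the hint leaves no room it resets the cache to mmap_base and retries the full range once before falling back to the bottom-up allocator. Below is a compressed userspace sketch of that hint-plus-one-retry shape; the interval walk itself is stubbed out and every name is illustrative, not the kernel's.

```c
#include <stdio.h>

#define MMAP_BASE   0x7f0000000000UL
#define SEARCH_FAIL 0UL

/* Stand-in for the find_vma() walk: pretend only a top-of-range hint works. */
static unsigned long search_down_from(unsigned long hint, unsigned long len)
{
	return hint >= MMAP_BASE ? hint - len : SEARCH_FAIL;
}

static unsigned long cached_hint = 0x500000000000UL;	/* stale, too low */

static unsigned long get_area_topdown(unsigned long len)
{
	unsigned long start_addr, addr;

try_again:
	start_addr = addr = cached_hint;
	addr = search_down_from(addr, len);
	if (addr != SEARCH_FAIL)
		return cached_hint = addr;	/* remember for next time */

	/*
	 * The hint gave us nothing.  If we were not already searching
	 * from the top of the range, reset the cache and retry once.
	 */
	if (start_addr != MMAP_BASE) {
		cached_hint = MMAP_BASE;
		goto try_again;
	}
	return SEARCH_FAIL;	/* caller would fall back to bottom-up */
}

int main(void)
{
	printf("%#lx\n", get_area_topdown(0x1000));	/* succeeds on the retry */
	return 0;
}
```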
diff --git a/mm/mmu_context.c b/mm/mmu_context.c index cf332bc0080a..3dcfaf4ed355 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c | |||
@@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm) | |||
53 | struct task_struct *tsk = current; | 53 | struct task_struct *tsk = current; |
54 | 54 | ||
55 | task_lock(tsk); | 55 | task_lock(tsk); |
56 | sync_mm_rss(tsk, mm); | 56 | sync_mm_rss(mm); |
57 | tsk->mm = NULL; | 57 | tsk->mm = NULL; |
58 | /* active_mm is still 'mm' */ | 58 | /* active_mm is still 'mm' */ |
59 | enter_lazy_tlb(mm, tsk); | 59 | enter_lazy_tlb(mm, tsk); |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 142ef4a1f480..a40992610ab6 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -60,7 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
60 | ptent = pte_mkwrite(ptent); | 60 | ptent = pte_mkwrite(ptent); |
61 | 61 | ||
62 | ptep_modify_prot_commit(mm, addr, pte, ptent); | 62 | ptep_modify_prot_commit(mm, addr, pte, ptent); |
63 | } else if (PAGE_MIGRATION && !pte_file(oldpte)) { | 63 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { |
64 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 64 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
65 | 65 | ||
66 | if (is_write_migration_entry(entry)) { | 66 | if (is_write_migration_entry(entry)) { |
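This hunk, and the rmap.c hunks further down, replace the old PAGE_MIGRATION constant with IS_ENABLED(CONFIG_MIGRATION): the migration branch stays visible to the compiler in every configuration and is simply discarded as dead code when migration is off. A minimal model of the idea using an ordinary 0/1 macro; the kernel's real IS_ENABLED() is more elaborate (it also copes with undefined and module-only config symbols).

```c
#include <stdio.h>

/* Imagine this constant comes from the build configuration: 0 or 1. */
#define CONFIG_MIGRATION_MODEL 0

static void handle_migration_entry(long entry)
{
	printf("handling migration entry %ld\n", entry);
}

static void change_pte_model(long pte)
{
	/*
	 * Unlike "#ifdef CONFIG_MIGRATION", this branch is always parsed
	 * and type-checked; with the flag at 0 the compiler drops it, so
	 * bit-rot in rarely built configurations is still caught.
	 */
	if (CONFIG_MIGRATION_MODEL && pte < 0)
		handle_migration_entry(pte);
	else
		printf("ordinary pte %ld\n", pte);
}

int main(void)
{
	change_pte_model(42);
	change_pte_model(-7);
	return 0;
}
```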
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2958fd8e7c9a..4198e000f41a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/ptrace.h> | 34 | #include <linux/ptrace.h> |
35 | #include <linux/freezer.h> | 35 | #include <linux/freezer.h> |
36 | #include <linux/ftrace.h> | 36 | #include <linux/ftrace.h> |
37 | #include <linux/ratelimit.h> | ||
37 | 38 | ||
38 | #define CREATE_TRACE_POINTS | 39 | #define CREATE_TRACE_POINTS |
39 | #include <trace/events/oom.h> | 40 | #include <trace/events/oom.h> |
@@ -309,7 +310,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
309 | */ | 310 | */ |
310 | static struct task_struct *select_bad_process(unsigned int *ppoints, | 311 | static struct task_struct *select_bad_process(unsigned int *ppoints, |
311 | unsigned long totalpages, struct mem_cgroup *memcg, | 312 | unsigned long totalpages, struct mem_cgroup *memcg, |
312 | const nodemask_t *nodemask) | 313 | const nodemask_t *nodemask, bool force_kill) |
313 | { | 314 | { |
314 | struct task_struct *g, *p; | 315 | struct task_struct *g, *p; |
315 | struct task_struct *chosen = NULL; | 316 | struct task_struct *chosen = NULL; |
@@ -335,7 +336,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
335 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) { | 336 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) { |
336 | if (unlikely(frozen(p))) | 337 | if (unlikely(frozen(p))) |
337 | __thaw_task(p); | 338 | __thaw_task(p); |
338 | return ERR_PTR(-1UL); | 339 | if (!force_kill) |
340 | return ERR_PTR(-1UL); | ||
339 | } | 341 | } |
340 | if (!p->mm) | 342 | if (!p->mm) |
341 | continue; | 343 | continue; |
@@ -353,7 +355,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
353 | if (p == current) { | 355 | if (p == current) { |
354 | chosen = p; | 356 | chosen = p; |
355 | *ppoints = 1000; | 357 | *ppoints = 1000; |
356 | } else { | 358 | } else if (!force_kill) { |
357 | /* | 359 | /* |
358 | * If this task is not being ptraced on exit, | 360 | * If this task is not being ptraced on exit, |
359 | * then wait for it to finish before killing | 361 | * then wait for it to finish before killing |
@@ -434,66 +436,18 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
434 | } | 436 | } |
435 | 437 | ||
436 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 438 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
437 | static int oom_kill_task(struct task_struct *p) | 439 | static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, |
438 | { | 440 | unsigned int points, unsigned long totalpages, |
439 | struct task_struct *q; | 441 | struct mem_cgroup *memcg, nodemask_t *nodemask, |
440 | struct mm_struct *mm; | 442 | const char *message) |
441 | |||
442 | p = find_lock_task_mm(p); | ||
443 | if (!p) | ||
444 | return 1; | ||
445 | |||
446 | /* mm cannot be safely dereferenced after task_unlock(p) */ | ||
447 | mm = p->mm; | ||
448 | |||
449 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | ||
450 | task_pid_nr(p), p->comm, K(p->mm->total_vm), | ||
451 | K(get_mm_counter(p->mm, MM_ANONPAGES)), | ||
452 | K(get_mm_counter(p->mm, MM_FILEPAGES))); | ||
453 | task_unlock(p); | ||
454 | |||
455 | /* | ||
456 | * Kill all user processes sharing p->mm in other thread groups, if any. | ||
457 | * They don't get access to memory reserves or a higher scheduler | ||
458 | * priority, though, to avoid depletion of all memory or task | ||
459 | * starvation. This prevents mm->mmap_sem livelock when an oom killed | ||
460 | * task cannot exit because it requires the semaphore and its contended | ||
461 | * by another thread trying to allocate memory itself. That thread will | ||
462 | * now get access to memory reserves since it has a pending fatal | ||
463 | * signal. | ||
464 | */ | ||
465 | for_each_process(q) | ||
466 | if (q->mm == mm && !same_thread_group(q, p) && | ||
467 | !(q->flags & PF_KTHREAD)) { | ||
468 | if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
469 | continue; | ||
470 | |||
471 | task_lock(q); /* Protect ->comm from prctl() */ | ||
472 | pr_err("Kill process %d (%s) sharing same memory\n", | ||
473 | task_pid_nr(q), q->comm); | ||
474 | task_unlock(q); | ||
475 | force_sig(SIGKILL, q); | ||
476 | } | ||
477 | |||
478 | set_tsk_thread_flag(p, TIF_MEMDIE); | ||
479 | force_sig(SIGKILL, p); | ||
480 | |||
481 | return 0; | ||
482 | } | ||
483 | #undef K | ||
484 | |||
485 | static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | ||
486 | unsigned int points, unsigned long totalpages, | ||
487 | struct mem_cgroup *memcg, nodemask_t *nodemask, | ||
488 | const char *message) | ||
489 | { | 443 | { |
490 | struct task_struct *victim = p; | 444 | struct task_struct *victim = p; |
491 | struct task_struct *child; | 445 | struct task_struct *child; |
492 | struct task_struct *t = p; | 446 | struct task_struct *t = p; |
447 | struct mm_struct *mm; | ||
493 | unsigned int victim_points = 0; | 448 | unsigned int victim_points = 0; |
494 | 449 | static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, | |
495 | if (printk_ratelimit()) | 450 | DEFAULT_RATELIMIT_BURST); |
496 | dump_header(p, gfp_mask, order, memcg, nodemask); | ||
497 | 451 | ||
498 | /* | 452 | /* |
499 | * If the task is already exiting, don't alarm the sysadmin or kill | 453 | * If the task is already exiting, don't alarm the sysadmin or kill |
@@ -501,9 +455,12 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
501 | */ | 455 | */ |
502 | if (p->flags & PF_EXITING) { | 456 | if (p->flags & PF_EXITING) { |
503 | set_tsk_thread_flag(p, TIF_MEMDIE); | 457 | set_tsk_thread_flag(p, TIF_MEMDIE); |
504 | return 0; | 458 | return; |
505 | } | 459 | } |
506 | 460 | ||
461 | if (__ratelimit(&oom_rs)) | ||
462 | dump_header(p, gfp_mask, order, memcg, nodemask); | ||
463 | |||
507 | task_lock(p); | 464 | task_lock(p); |
508 | pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", | 465 | pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", |
509 | message, task_pid_nr(p), p->comm, points); | 466 | message, task_pid_nr(p), p->comm, points); |
@@ -533,8 +490,44 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
533 | } | 490 | } |
534 | } while_each_thread(p, t); | 491 | } while_each_thread(p, t); |
535 | 492 | ||
536 | return oom_kill_task(victim); | 493 | victim = find_lock_task_mm(victim); |
494 | if (!victim) | ||
495 | return; | ||
496 | |||
497 | /* mm cannot safely be dereferenced after task_unlock(victim) */ | ||
498 | mm = victim->mm; | ||
499 | pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | ||
500 | task_pid_nr(victim), victim->comm, K(victim->mm->total_vm), | ||
501 | K(get_mm_counter(victim->mm, MM_ANONPAGES)), | ||
502 | K(get_mm_counter(victim->mm, MM_FILEPAGES))); | ||
503 | task_unlock(victim); | ||
504 | |||
505 | /* | ||
506 | * Kill all user processes sharing victim->mm in other thread groups, if | ||
507 | * any. They don't get access to memory reserves, though, to avoid | ||
508 | * depletion of all memory. This prevents mm->mmap_sem livelock when an | ||
509 | * oom killed thread cannot exit because it requires the semaphore and | ||
510 | * its contended by another thread trying to allocate memory itself. | ||
511 | * That thread will now get access to memory reserves since it has a | ||
512 | * pending fatal signal. | ||
513 | */ | ||
514 | for_each_process(p) | ||
515 | if (p->mm == mm && !same_thread_group(p, victim) && | ||
516 | !(p->flags & PF_KTHREAD)) { | ||
517 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
518 | continue; | ||
519 | |||
520 | task_lock(p); /* Protect ->comm from prctl() */ | ||
521 | pr_err("Kill process %d (%s) sharing same memory\n", | ||
522 | task_pid_nr(p), p->comm); | ||
523 | task_unlock(p); | ||
524 | force_sig(SIGKILL, p); | ||
525 | } | ||
526 | |||
527 | set_tsk_thread_flag(victim, TIF_MEMDIE); | ||
528 | force_sig(SIGKILL, victim); | ||
537 | } | 529 | } |
530 | #undef K | ||
538 | 531 | ||
539 | /* | 532 | /* |
540 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. | 533 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. |
@@ -561,7 +554,8 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | |||
561 | } | 554 | } |
562 | 555 | ||
563 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 556 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
564 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) | 557 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, |
558 | int order) | ||
565 | { | 559 | { |
566 | unsigned long limit; | 560 | unsigned long limit; |
567 | unsigned int points = 0; | 561 | unsigned int points = 0; |
@@ -577,18 +571,13 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask) | |||
577 | return; | 571 | return; |
578 | } | 572 | } |
579 | 573 | ||
580 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); | 574 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); |
581 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; | 575 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; |
582 | read_lock(&tasklist_lock); | 576 | read_lock(&tasklist_lock); |
583 | retry: | 577 | p = select_bad_process(&points, limit, memcg, NULL, false); |
584 | p = select_bad_process(&points, limit, memcg, NULL); | 578 | if (p && PTR_ERR(p) != -1UL) |
585 | if (!p || PTR_ERR(p) == -1UL) | 579 | oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL, |
586 | goto out; | 580 | "Memory cgroup out of memory"); |
587 | |||
588 | if (oom_kill_process(p, gfp_mask, 0, points, limit, memcg, NULL, | ||
589 | "Memory cgroup out of memory")) | ||
590 | goto retry; | ||
591 | out: | ||
592 | read_unlock(&tasklist_lock); | 581 | read_unlock(&tasklist_lock); |
593 | } | 582 | } |
594 | #endif | 583 | #endif |
@@ -700,6 +689,7 @@ static void clear_system_oom(void) | |||
700 | * @gfp_mask: memory allocation flags | 689 | * @gfp_mask: memory allocation flags |
701 | * @order: amount of memory being requested as a power of 2 | 690 | * @order: amount of memory being requested as a power of 2 |
702 | * @nodemask: nodemask passed to page allocator | 691 | * @nodemask: nodemask passed to page allocator |
692 | * @force_kill: true if a task must be killed, even if others are exiting | ||
703 | * | 693 | * |
704 | * If we run out of memory, we have the choice between either | 694 | * If we run out of memory, we have the choice between either |
705 | * killing a random task (bad), letting the system crash (worse) | 695 | * killing a random task (bad), letting the system crash (worse) |
@@ -707,7 +697,7 @@ static void clear_system_oom(void) | |||
707 | * don't have to be perfect here, we just have to be good. | 697 | * don't have to be perfect here, we just have to be good. |
708 | */ | 698 | */ |
709 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | 699 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
710 | int order, nodemask_t *nodemask) | 700 | int order, nodemask_t *nodemask, bool force_kill) |
711 | { | 701 | { |
712 | const nodemask_t *mpol_mask; | 702 | const nodemask_t *mpol_mask; |
713 | struct task_struct *p; | 703 | struct task_struct *p; |
@@ -745,33 +735,25 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
745 | if (sysctl_oom_kill_allocating_task && | 735 | if (sysctl_oom_kill_allocating_task && |
746 | !oom_unkillable_task(current, NULL, nodemask) && | 736 | !oom_unkillable_task(current, NULL, nodemask) && |
747 | current->mm) { | 737 | current->mm) { |
748 | /* | 738 | oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, |
749 | * oom_kill_process() needs tasklist_lock held. If it returns | 739 | nodemask, |
750 | * non-zero, current could not be killed so we must fallback to | 740 | "Out of memory (oom_kill_allocating_task)"); |
751 | * the tasklist scan. | ||
752 | */ | ||
753 | if (!oom_kill_process(current, gfp_mask, order, 0, totalpages, | ||
754 | NULL, nodemask, | ||
755 | "Out of memory (oom_kill_allocating_task)")) | ||
756 | goto out; | ||
757 | } | ||
758 | |||
759 | retry: | ||
760 | p = select_bad_process(&points, totalpages, NULL, mpol_mask); | ||
761 | if (PTR_ERR(p) == -1UL) | ||
762 | goto out; | 741 | goto out; |
742 | } | ||
763 | 743 | ||
744 | p = select_bad_process(&points, totalpages, NULL, mpol_mask, | ||
745 | force_kill); | ||
764 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 746 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
765 | if (!p) { | 747 | if (!p) { |
766 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); | 748 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); |
767 | read_unlock(&tasklist_lock); | 749 | read_unlock(&tasklist_lock); |
768 | panic("Out of memory and no killable processes...\n"); | 750 | panic("Out of memory and no killable processes...\n"); |
769 | } | 751 | } |
770 | 752 | if (PTR_ERR(p) != -1UL) { | |
771 | if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, | 753 | oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, |
772 | nodemask, "Out of memory")) | 754 | nodemask, "Out of memory"); |
773 | goto retry; | 755 | killed = 1; |
774 | killed = 1; | 756 | } |
775 | out: | 757 | out: |
776 | read_unlock(&tasklist_lock); | 758 | read_unlock(&tasklist_lock); |
777 | 759 | ||
@@ -792,7 +774,7 @@ out: | |||
792 | void pagefault_out_of_memory(void) | 774 | void pagefault_out_of_memory(void) |
793 | { | 775 | { |
794 | if (try_set_system_oom()) { | 776 | if (try_set_system_oom()) { |
795 | out_of_memory(NULL, 0, 0, NULL); | 777 | out_of_memory(NULL, 0, 0, NULL, false); |
796 | clear_system_oom(); | 778 | clear_system_oom(); |
797 | } | 779 | } |
798 | if (!test_thread_flag(TIF_MEMDIE)) | 780 | if (!test_thread_flag(TIF_MEMDIE)) |
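The oom_kill.c changes above swap printk_ratelimit() for a dedicated DEFINE_RATELIMIT_STATE/__ratelimit() pair, so header dumps are throttled by their own interval and burst rather than by the global printk limit, and the dump now runs after the PF_EXITING fast path. Below is a toy userspace counterpart of a per-call-site rate limit (interval in seconds plus a burst count); it is only an illustration, not the kernel's ratelimit implementation.

```c
#include <stdio.h>
#include <time.h>

struct ratelimit_model {
	time_t	interval;	/* seconds per window */
	int	burst;		/* messages allowed per window */
	time_t	window_start;
	int	emitted;
};

#define DEFINE_RATELIMIT_MODEL(name, i, b) \
	struct ratelimit_model name = { .interval = (i), .burst = (b) }

/* Returns 1 if the caller may emit its message in the current window. */
static int ratelimit_ok(struct ratelimit_model *rs)
{
	time_t now = time(NULL);

	if (now - rs->window_start >= rs->interval) {
		rs->window_start = now;
		rs->emitted = 0;
	}
	if (rs->emitted >= rs->burst)
		return 0;
	rs->emitted++;
	return 1;
}

static void dump_header_model(int i)
{
	/* Per-site state: throttled independently of any other caller. */
	static DEFINE_RATELIMIT_MODEL(oom_rs, 5, 2);

	if (ratelimit_ok(&oom_rs))
		printf("header dump %d\n", i);
}

int main(void)
{
	for (int i = 0; i < 10; i++)
		dump_header_model(i);	/* only the first two print */
	return 0;
}
```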
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 363ba7082ef5..3fc261705b1e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1472,6 +1472,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
1472 | 1472 | ||
1473 | for ( ; ; ) { | 1473 | for ( ; ; ) { |
1474 | global_dirty_limits(&background_thresh, &dirty_thresh); | 1474 | global_dirty_limits(&background_thresh, &dirty_thresh); |
1475 | dirty_thresh = hard_dirty_limit(dirty_thresh); | ||
1475 | 1476 | ||
1476 | /* | 1477 | /* |
1477 | * Boost the allowable dirty threshold a bit for page | 1478 | * Boost the allowable dirty threshold a bit for page |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a13ded1938f0..caea788628e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -1968,7 +1968,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
1968 | goto out; | 1968 | goto out; |
1969 | } | 1969 | } |
1970 | /* Exhausted what can be done so it's blamo time */ | 1970 | /* Exhausted what can be done so it's blamo time */ |
1971 | out_of_memory(zonelist, gfp_mask, order, nodemask); | 1971 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); |
1972 | 1972 | ||
1973 | out: | 1973 | out: |
1974 | clear_zonelist_oom(zonelist, gfp_mask); | 1974 | clear_zonelist_oom(zonelist, gfp_mask); |
@@ -1990,7 +1990,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
1990 | if (!order) | 1990 | if (!order) |
1991 | return NULL; | 1991 | return NULL; |
1992 | 1992 | ||
1993 | if (compaction_deferred(preferred_zone)) { | 1993 | if (compaction_deferred(preferred_zone, order)) { |
1994 | *deferred_compaction = true; | 1994 | *deferred_compaction = true; |
1995 | return NULL; | 1995 | return NULL; |
1996 | } | 1996 | } |
@@ -2012,6 +2012,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2012 | if (page) { | 2012 | if (page) { |
2013 | preferred_zone->compact_considered = 0; | 2013 | preferred_zone->compact_considered = 0; |
2014 | preferred_zone->compact_defer_shift = 0; | 2014 | preferred_zone->compact_defer_shift = 0; |
2015 | if (order >= preferred_zone->compact_order_failed) | ||
2016 | preferred_zone->compact_order_failed = order + 1; | ||
2015 | count_vm_event(COMPACTSUCCESS); | 2017 | count_vm_event(COMPACTSUCCESS); |
2016 | return page; | 2018 | return page; |
2017 | } | 2019 | } |
@@ -2028,7 +2030,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2028 | * defer if the failure was a sync compaction failure. | 2030 | * defer if the failure was a sync compaction failure. |
2029 | */ | 2031 | */ |
2030 | if (sync_migration) | 2032 | if (sync_migration) |
2031 | defer_compaction(preferred_zone); | 2033 | defer_compaction(preferred_zone, order); |
2032 | 2034 | ||
2033 | cond_resched(); | 2035 | cond_resched(); |
2034 | } | 2036 | } |
@@ -2378,8 +2380,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2378 | { | 2380 | { |
2379 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 2381 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
2380 | struct zone *preferred_zone; | 2382 | struct zone *preferred_zone; |
2381 | struct page *page; | 2383 | struct page *page = NULL; |
2382 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2384 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2385 | unsigned int cpuset_mems_cookie; | ||
2383 | 2386 | ||
2384 | gfp_mask &= gfp_allowed_mask; | 2387 | gfp_mask &= gfp_allowed_mask; |
2385 | 2388 | ||
@@ -2398,15 +2401,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2398 | if (unlikely(!zonelist->_zonerefs->zone)) | 2401 | if (unlikely(!zonelist->_zonerefs->zone)) |
2399 | return NULL; | 2402 | return NULL; |
2400 | 2403 | ||
2401 | get_mems_allowed(); | 2404 | retry_cpuset: |
2405 | cpuset_mems_cookie = get_mems_allowed(); | ||
2406 | |||
2402 | /* The preferred zone is used for statistics later */ | 2407 | /* The preferred zone is used for statistics later */ |
2403 | first_zones_zonelist(zonelist, high_zoneidx, | 2408 | first_zones_zonelist(zonelist, high_zoneidx, |
2404 | nodemask ? : &cpuset_current_mems_allowed, | 2409 | nodemask ? : &cpuset_current_mems_allowed, |
2405 | &preferred_zone); | 2410 | &preferred_zone); |
2406 | if (!preferred_zone) { | 2411 | if (!preferred_zone) |
2407 | put_mems_allowed(); | 2412 | goto out; |
2408 | return NULL; | ||
2409 | } | ||
2410 | 2413 | ||
2411 | /* First allocation attempt */ | 2414 | /* First allocation attempt */ |
2412 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2415 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
@@ -2416,9 +2419,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2416 | page = __alloc_pages_slowpath(gfp_mask, order, | 2419 | page = __alloc_pages_slowpath(gfp_mask, order, |
2417 | zonelist, high_zoneidx, nodemask, | 2420 | zonelist, high_zoneidx, nodemask, |
2418 | preferred_zone, migratetype); | 2421 | preferred_zone, migratetype); |
2419 | put_mems_allowed(); | ||
2420 | 2422 | ||
2421 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2423 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
2424 | |||
2425 | out: | ||
2426 | /* | ||
2427 | * When updating a task's mems_allowed, it is possible to race with | ||
2428 | * parallel threads in such a way that an allocation can fail while | ||
2429 | * the mask is being updated. If a page allocation is about to fail, | ||
2430 | * check if the cpuset changed during allocation and if so, retry. | ||
2431 | */ | ||
2432 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | ||
2433 | goto retry_cpuset; | ||
2434 | |||
2422 | return page; | 2435 | return page; |
2423 | } | 2436 | } |
2424 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2437 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
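The retry_cpuset/cpuset_mems_cookie change above (and the matching loops in skip_free_areas_node() and in the slab/slub hunks further down) follows a seqcount-style optimistic read: sample a cookie before consulting mems_allowed, and retry only when the allocation failed and the cookie proves a concurrent update. A self-contained model of that loop; a plain counter stands in for the cpuset sequence count and every name is illustrative.

```c
#include <stdio.h>

/* Bumped around every "mems_allowed" update by the (imaginary) updater. */
static unsigned int mems_seq = 2;

static unsigned int get_mems_cookie(void) { return mems_seq; }

/* Returns 1 when nothing changed since the cookie was taken. */
static int put_mems_cookie(unsigned int cookie) { return mems_seq == cookie; }

/* Stand-in allocator: fails once while an "update" lands underneath it. */
static void *try_alloc(int *attempt)
{
	static int obj;

	if ((*attempt)++ == 0) {
		mems_seq += 2;		/* concurrent cpuset update... */
		return NULL;		/* ...made this attempt fail   */
	}
	return &obj;
}

static void *alloc_pages_model(void)
{
	unsigned int cookie;
	void *page = NULL;
	int attempt = 0;

retry_cpuset:
	cookie = get_mems_cookie();
	page = try_alloc(&attempt);
	/*
	 * Only retry when the allocation failed *and* the allowed-node
	 * mask changed underneath us; a successful allocation is kept
	 * even if it raced with the update.
	 */
	if (!put_mems_cookie(cookie) && !page)
		goto retry_cpuset;
	return page;
}

int main(void)
{
	printf("page = %p\n", alloc_pages_model());
	return 0;
}
```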
@@ -2632,13 +2645,15 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
2632 | bool skip_free_areas_node(unsigned int flags, int nid) | 2645 | bool skip_free_areas_node(unsigned int flags, int nid) |
2633 | { | 2646 | { |
2634 | bool ret = false; | 2647 | bool ret = false; |
2648 | unsigned int cpuset_mems_cookie; | ||
2635 | 2649 | ||
2636 | if (!(flags & SHOW_MEM_FILTER_NODES)) | 2650 | if (!(flags & SHOW_MEM_FILTER_NODES)) |
2637 | goto out; | 2651 | goto out; |
2638 | 2652 | ||
2639 | get_mems_allowed(); | 2653 | do { |
2640 | ret = !node_isset(nid, cpuset_current_mems_allowed); | 2654 | cpuset_mems_cookie = get_mems_allowed(); |
2641 | put_mems_allowed(); | 2655 | ret = !node_isset(nid, cpuset_current_mems_allowed); |
2656 | } while (!put_mems_allowed(cpuset_mems_cookie)); | ||
2642 | out: | 2657 | out: |
2643 | return ret; | 2658 | return ret; |
2644 | } | 2659 | } |
@@ -3925,18 +3940,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | |||
3925 | } | 3940 | } |
3926 | } | 3941 | } |
3927 | 3942 | ||
3928 | int __init add_from_early_node_map(struct range *range, int az, | ||
3929 | int nr_range, int nid) | ||
3930 | { | ||
3931 | unsigned long start_pfn, end_pfn; | ||
3932 | int i; | ||
3933 | |||
3934 | /* need to go over early_node_map to find out good range for node */ | ||
3935 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) | ||
3936 | nr_range = add_range(range, az, nr_range, start_pfn, end_pfn); | ||
3937 | return nr_range; | ||
3938 | } | ||
3939 | |||
3940 | /** | 3943 | /** |
3941 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | 3944 | * sparse_memory_present_with_active_regions - Call memory_present for each active range |
3942 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. | 3945 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. |
@@ -4521,7 +4524,7 @@ static unsigned long __init early_calculate_totalpages(void) | |||
4521 | * memory. When they don't, some nodes will have more kernelcore than | 4524 | * memory. When they don't, some nodes will have more kernelcore than |
4522 | * others | 4525 | * others |
4523 | */ | 4526 | */ |
4524 | static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | 4527 | static void __init find_zone_movable_pfns_for_nodes(void) |
4525 | { | 4528 | { |
4526 | int i, nid; | 4529 | int i, nid; |
4527 | unsigned long usable_startpfn; | 4530 | unsigned long usable_startpfn; |
@@ -4713,7 +4716,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4713 | 4716 | ||
4714 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ | 4717 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ |
4715 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); | 4718 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); |
4716 | find_zone_movable_pfns_for_nodes(zone_movable_pfn); | 4719 | find_zone_movable_pfns_for_nodes(); |
4717 | 4720 | ||
4718 | /* Print out the zone ranges */ | 4721 | /* Print out the zone ranges */ |
4719 | printk("Zone PFN ranges:\n"); | 4722 | printk("Zone PFN ranges:\n"); |
@@ -4823,6 +4826,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, | |||
4823 | int cpu = (unsigned long)hcpu; | 4826 | int cpu = (unsigned long)hcpu; |
4824 | 4827 | ||
4825 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | 4828 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
4829 | lru_add_drain_cpu(cpu); | ||
4826 | drain_pages(cpu); | 4830 | drain_pages(cpu); |
4827 | 4831 | ||
4828 | /* | 4832 | /* |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2f5cf10ff660..aa9701e12714 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -59,7 +59,7 @@ again: | |||
59 | continue; | 59 | continue; |
60 | 60 | ||
61 | split_huge_page_pmd(walk->mm, pmd); | 61 | split_huge_page_pmd(walk->mm, pmd); |
62 | if (pmd_none_or_clear_bad(pmd)) | 62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
63 | goto again; | 63 | goto again; |
64 | err = walk_pte_range(pmd, addr, next, walk); | 64 | err = walk_pte_range(pmd, addr, next, walk); |
65 | if (err) | 65 | if (err) |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index eb663fb533e0..5a74fea182f1 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -70,10 +70,11 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, | |||
70 | unsigned long address, pmd_t *pmdp) | 70 | unsigned long address, pmd_t *pmdp) |
71 | { | 71 | { |
72 | int young; | 72 | int young; |
73 | #ifndef CONFIG_TRANSPARENT_HUGEPAGE | 73 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
74 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
75 | #else | ||
74 | BUG(); | 76 | BUG(); |
75 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 77 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
76 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
77 | young = pmdp_test_and_clear_young(vma, address, pmdp); | 78 | young = pmdp_test_and_clear_young(vma, address, pmdp); |
78 | if (young) | 79 | if (young) |
79 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | 80 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -120,6 +120,21 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | |||
120 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); | 120 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); |
121 | } | 121 | } |
122 | 122 | ||
123 | static void anon_vma_chain_link(struct vm_area_struct *vma, | ||
124 | struct anon_vma_chain *avc, | ||
125 | struct anon_vma *anon_vma) | ||
126 | { | ||
127 | avc->vma = vma; | ||
128 | avc->anon_vma = anon_vma; | ||
129 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
130 | |||
131 | /* | ||
132 | * It's critical to add new vmas to the tail of the anon_vma, | ||
133 | * see comment in huge_memory.c:__split_huge_page(). | ||
134 | */ | ||
135 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
136 | } | ||
137 | |||
123 | /** | 138 | /** |
124 | * anon_vma_prepare - attach an anon_vma to a memory region | 139 | * anon_vma_prepare - attach an anon_vma to a memory region |
125 | * @vma: the memory region in question | 140 | * @vma: the memory region in question |
@@ -175,10 +190,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
175 | spin_lock(&mm->page_table_lock); | 190 | spin_lock(&mm->page_table_lock); |
176 | if (likely(!vma->anon_vma)) { | 191 | if (likely(!vma->anon_vma)) { |
177 | vma->anon_vma = anon_vma; | 192 | vma->anon_vma = anon_vma; |
178 | avc->anon_vma = anon_vma; | 193 | anon_vma_chain_link(vma, avc, anon_vma); |
179 | avc->vma = vma; | ||
180 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
181 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
182 | allocated = NULL; | 194 | allocated = NULL; |
183 | avc = NULL; | 195 | avc = NULL; |
184 | } | 196 | } |
@@ -224,21 +236,6 @@ static inline void unlock_anon_vma_root(struct anon_vma *root) | |||
224 | mutex_unlock(&root->mutex); | 236 | mutex_unlock(&root->mutex); |
225 | } | 237 | } |
226 | 238 | ||
227 | static void anon_vma_chain_link(struct vm_area_struct *vma, | ||
228 | struct anon_vma_chain *avc, | ||
229 | struct anon_vma *anon_vma) | ||
230 | { | ||
231 | avc->vma = vma; | ||
232 | avc->anon_vma = anon_vma; | ||
233 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
234 | |||
235 | /* | ||
236 | * It's critical to add new vmas to the tail of the anon_vma, | ||
237 | * see comment in huge_memory.c:__split_huge_page(). | ||
238 | */ | ||
239 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
240 | } | ||
241 | |||
242 | /* | 239 | /* |
243 | * Attach the anon_vmas from src to dst. | 240 | * Attach the anon_vmas from src to dst. |
244 | * Returns 0 on success, -ENOMEM on failure. | 241 | * Returns 0 on success, -ENOMEM on failure. |
@@ -1151,10 +1148,15 @@ void page_add_new_anon_rmap(struct page *page, | |||
1151 | */ | 1148 | */ |
1152 | void page_add_file_rmap(struct page *page) | 1149 | void page_add_file_rmap(struct page *page) |
1153 | { | 1150 | { |
1151 | bool locked; | ||
1152 | unsigned long flags; | ||
1153 | |||
1154 | mem_cgroup_begin_update_page_stat(page, &locked, &flags); | ||
1154 | if (atomic_inc_and_test(&page->_mapcount)) { | 1155 | if (atomic_inc_and_test(&page->_mapcount)) { |
1155 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 1156 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
1156 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); | 1157 | mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); |
1157 | } | 1158 | } |
1159 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | ||
1158 | } | 1160 | } |
1159 | 1161 | ||
1160 | /** | 1162 | /** |
@@ -1165,9 +1167,21 @@ void page_add_file_rmap(struct page *page) | |||
1165 | */ | 1167 | */ |
1166 | void page_remove_rmap(struct page *page) | 1168 | void page_remove_rmap(struct page *page) |
1167 | { | 1169 | { |
1170 | bool anon = PageAnon(page); | ||
1171 | bool locked; | ||
1172 | unsigned long flags; | ||
1173 | |||
1174 | /* | ||
1175 | * The anon case has no mem_cgroup page_stat to update; but may | ||
1176 | * uncharge_page() below, where the lock ordering can deadlock if | ||
1177 | * we hold the lock against page_stat move: so avoid it on anon. | ||
1178 | */ | ||
1179 | if (!anon) | ||
1180 | mem_cgroup_begin_update_page_stat(page, &locked, &flags); | ||
1181 | |||
1168 | /* page still mapped by someone else? */ | 1182 | /* page still mapped by someone else? */ |
1169 | if (!atomic_add_negative(-1, &page->_mapcount)) | 1183 | if (!atomic_add_negative(-1, &page->_mapcount)) |
1170 | return; | 1184 | goto out; |
1171 | 1185 | ||
1172 | /* | 1186 | /* |
1173 | * Now that the last pte has gone, s390 must transfer dirty | 1187 | * Now that the last pte has gone, s390 must transfer dirty |
@@ -1176,7 +1190,7 @@ void page_remove_rmap(struct page *page) | |||
1176 | * not if it's in swapcache - there might be another pte slot | 1190 | * not if it's in swapcache - there might be another pte slot |
1177 | * containing the swap entry, but page not yet written to swap. | 1191 | * containing the swap entry, but page not yet written to swap. |
1178 | */ | 1192 | */ |
1179 | if ((!PageAnon(page) || PageSwapCache(page)) && | 1193 | if ((!anon || PageSwapCache(page)) && |
1180 | page_test_and_clear_dirty(page_to_pfn(page), 1)) | 1194 | page_test_and_clear_dirty(page_to_pfn(page), 1)) |
1181 | set_page_dirty(page); | 1195 | set_page_dirty(page); |
1182 | /* | 1196 | /* |
@@ -1184,8 +1198,8 @@ void page_remove_rmap(struct page *page) | |||
1184 | * and not charged by memcg for now. | 1198 | * and not charged by memcg for now. |
1185 | */ | 1199 | */ |
1186 | if (unlikely(PageHuge(page))) | 1200 | if (unlikely(PageHuge(page))) |
1187 | return; | 1201 | goto out; |
1188 | if (PageAnon(page)) { | 1202 | if (anon) { |
1189 | mem_cgroup_uncharge_page(page); | 1203 | mem_cgroup_uncharge_page(page); |
1190 | if (!PageTransHuge(page)) | 1204 | if (!PageTransHuge(page)) |
1191 | __dec_zone_page_state(page, NR_ANON_PAGES); | 1205 | __dec_zone_page_state(page, NR_ANON_PAGES); |
@@ -1205,6 +1219,9 @@ void page_remove_rmap(struct page *page) | |||
1205 | * Leaving it set also helps swapoff to reinstate ptes | 1219 | * Leaving it set also helps swapoff to reinstate ptes |
1206 | * faster for those pages still in swapcache. | 1220 | * faster for those pages still in swapcache. |
1207 | */ | 1221 | */ |
1222 | out: | ||
1223 | if (!anon) | ||
1224 | mem_cgroup_end_update_page_stat(page, &locked, &flags); | ||
1208 | } | 1225 | } |
1209 | 1226 | ||
1210 | /* | 1227 | /* |
@@ -1282,7 +1299,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1282 | } | 1299 | } |
1283 | dec_mm_counter(mm, MM_ANONPAGES); | 1300 | dec_mm_counter(mm, MM_ANONPAGES); |
1284 | inc_mm_counter(mm, MM_SWAPENTS); | 1301 | inc_mm_counter(mm, MM_SWAPENTS); |
1285 | } else if (PAGE_MIGRATION) { | 1302 | } else if (IS_ENABLED(CONFIG_MIGRATION)) { |
1286 | /* | 1303 | /* |
1287 | * Store the pfn of the page in a special migration | 1304 | * Store the pfn of the page in a special migration |
1288 | * pte. do_swap_page() will wait until the migration | 1305 | * pte. do_swap_page() will wait until the migration |
@@ -1293,7 +1310,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1293 | } | 1310 | } |
1294 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 1311 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
1295 | BUG_ON(pte_file(*pte)); | 1312 | BUG_ON(pte_file(*pte)); |
1296 | } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { | 1313 | } else if (IS_ENABLED(CONFIG_MIGRATION) && |
1314 | (TTU_ACTION(flags) == TTU_MIGRATION)) { | ||
1297 | /* Establish migration entry for a file page */ | 1315 | /* Establish migration entry for a file page */ |
1298 | swp_entry_t entry; | 1316 | swp_entry_t entry; |
1299 | entry = make_migration_entry(page, pte_write(pteval)); | 1317 | entry = make_migration_entry(page, pte_write(pteval)); |
@@ -1499,7 +1517,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1499 | * locking requirements of exec(), migration skips | 1517 | * locking requirements of exec(), migration skips |
1500 | * temporary VMAs until after exec() completes. | 1518 | * temporary VMAs until after exec() completes. |
1501 | */ | 1519 | */ |
1502 | if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && | 1520 | if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) && |
1503 | is_vma_temporary_stack(vma)) | 1521 | is_vma_temporary_stack(vma)) |
1504 | continue; | 1522 | continue; |
1505 | 1523 | ||
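In the rmap.c hunks above, page_add_file_rmap() and page_remove_rmap() now bracket their statistics updates with mem_cgroup_begin/end_update_page_stat(), and page_remove_rmap() turns its early returns into goto out so the closing call runs on every exit, but only when the opening call actually ran (it is skipped for anon pages). A compact model of that conditional bracket-and-goto-out discipline; the "lock" is just a depth counter and the helpers are hypothetical.

```c
#include <stdio.h>

static int stat_lock_depth;	/* begin calls that still lack a matching end */

static void begin_update_stat(void) { stat_lock_depth++; }
static void end_update_stat(void)   { stat_lock_depth--; }

/* Model of page_remove_rmap(): every exit funnels through "out". */
static void remove_rmap_model(int anon, int still_mapped, int huge)
{
	/* Anon pages skip the bracket entirely (lock-ordering concern). */
	if (!anon)
		begin_update_stat();

	if (still_mapped)
		goto out;	/* was a bare "return" before the patch */

	if (huge)
		goto out;	/* was a bare "return" before the patch */

	/* ... NR_FILE_MAPPED / NR_ANON_PAGES updates would go here ... */

out:
	if (!anon)
		end_update_stat();
}

int main(void)
{
	remove_rmap_model(0, 1, 0);
	remove_rmap_model(0, 0, 1);
	remove_rmap_model(1, 0, 0);
	remove_rmap_model(0, 0, 0);
	printf("unbalanced begin/end calls: %d\n", stat_lock_depth);	/* 0 */
	return 0;
}
```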
diff --git a/mm/shmem.c b/mm/shmem.c index 7a45ad004cfd..f99ff3e50bd6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1178,6 +1178,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode | |||
1178 | static const struct inode_operations shmem_symlink_inode_operations; | 1178 | static const struct inode_operations shmem_symlink_inode_operations; |
1179 | static const struct inode_operations shmem_short_symlink_operations; | 1179 | static const struct inode_operations shmem_short_symlink_operations; |
1180 | 1180 | ||
1181 | #ifdef CONFIG_TMPFS_XATTR | ||
1182 | static int shmem_initxattrs(struct inode *, const struct xattr *, void *); | ||
1183 | #else | ||
1184 | #define shmem_initxattrs NULL | ||
1185 | #endif | ||
1186 | |||
1181 | static int | 1187 | static int |
1182 | shmem_write_begin(struct file *file, struct address_space *mapping, | 1188 | shmem_write_begin(struct file *file, struct address_space *mapping, |
1183 | loff_t pos, unsigned len, unsigned flags, | 1189 | loff_t pos, unsigned len, unsigned flags, |
@@ -1490,7 +1496,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) | |||
1490 | if (inode) { | 1496 | if (inode) { |
1491 | error = security_inode_init_security(inode, dir, | 1497 | error = security_inode_init_security(inode, dir, |
1492 | &dentry->d_name, | 1498 | &dentry->d_name, |
1493 | NULL, NULL); | 1499 | shmem_initxattrs, NULL); |
1494 | if (error) { | 1500 | if (error) { |
1495 | if (error != -EOPNOTSUPP) { | 1501 | if (error != -EOPNOTSUPP) { |
1496 | iput(inode); | 1502 | iput(inode); |
@@ -1630,7 +1636,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
1630 | return -ENOSPC; | 1636 | return -ENOSPC; |
1631 | 1637 | ||
1632 | error = security_inode_init_security(inode, dir, &dentry->d_name, | 1638 | error = security_inode_init_security(inode, dir, &dentry->d_name, |
1633 | NULL, NULL); | 1639 | shmem_initxattrs, NULL); |
1634 | if (error) { | 1640 | if (error) { |
1635 | if (error != -EOPNOTSUPP) { | 1641 | if (error != -EOPNOTSUPP) { |
1636 | iput(inode); | 1642 | iput(inode); |
@@ -1704,6 +1710,66 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co | |||
1704 | * filesystem level, though. | 1710 | * filesystem level, though. |
1705 | */ | 1711 | */ |
1706 | 1712 | ||
1713 | /* | ||
1714 | * Allocate new xattr and copy in the value; but leave the name to callers. | ||
1715 | */ | ||
1716 | static struct shmem_xattr *shmem_xattr_alloc(const void *value, size_t size) | ||
1717 | { | ||
1718 | struct shmem_xattr *new_xattr; | ||
1719 | size_t len; | ||
1720 | |||
1721 | /* wrap around? */ | ||
1722 | len = sizeof(*new_xattr) + size; | ||
1723 | if (len <= sizeof(*new_xattr)) | ||
1724 | return NULL; | ||
1725 | |||
1726 | new_xattr = kmalloc(len, GFP_KERNEL); | ||
1727 | if (!new_xattr) | ||
1728 | return NULL; | ||
1729 | |||
1730 | new_xattr->size = size; | ||
1731 | memcpy(new_xattr->value, value, size); | ||
1732 | return new_xattr; | ||
1733 | } | ||
1734 | |||
1735 | /* | ||
1736 | * Callback for security_inode_init_security() for acquiring xattrs. | ||
1737 | */ | ||
1738 | static int shmem_initxattrs(struct inode *inode, | ||
1739 | const struct xattr *xattr_array, | ||
1740 | void *fs_info) | ||
1741 | { | ||
1742 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
1743 | const struct xattr *xattr; | ||
1744 | struct shmem_xattr *new_xattr; | ||
1745 | size_t len; | ||
1746 | |||
1747 | for (xattr = xattr_array; xattr->name != NULL; xattr++) { | ||
1748 | new_xattr = shmem_xattr_alloc(xattr->value, xattr->value_len); | ||
1749 | if (!new_xattr) | ||
1750 | return -ENOMEM; | ||
1751 | |||
1752 | len = strlen(xattr->name) + 1; | ||
1753 | new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, | ||
1754 | GFP_KERNEL); | ||
1755 | if (!new_xattr->name) { | ||
1756 | kfree(new_xattr); | ||
1757 | return -ENOMEM; | ||
1758 | } | ||
1759 | |||
1760 | memcpy(new_xattr->name, XATTR_SECURITY_PREFIX, | ||
1761 | XATTR_SECURITY_PREFIX_LEN); | ||
1762 | memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, | ||
1763 | xattr->name, len); | ||
1764 | |||
1765 | spin_lock(&info->lock); | ||
1766 | list_add(&new_xattr->list, &info->xattr_list); | ||
1767 | spin_unlock(&info->lock); | ||
1768 | } | ||
1769 | |||
1770 | return 0; | ||
1771 | } | ||
1772 | |||
1707 | static int shmem_xattr_get(struct dentry *dentry, const char *name, | 1773 | static int shmem_xattr_get(struct dentry *dentry, const char *name, |
1708 | void *buffer, size_t size) | 1774 | void *buffer, size_t size) |
1709 | { | 1775 | { |
@@ -1731,24 +1797,17 @@ static int shmem_xattr_get(struct dentry *dentry, const char *name, | |||
1731 | return ret; | 1797 | return ret; |
1732 | } | 1798 | } |
1733 | 1799 | ||
1734 | static int shmem_xattr_set(struct dentry *dentry, const char *name, | 1800 | static int shmem_xattr_set(struct inode *inode, const char *name, |
1735 | const void *value, size_t size, int flags) | 1801 | const void *value, size_t size, int flags) |
1736 | { | 1802 | { |
1737 | struct inode *inode = dentry->d_inode; | ||
1738 | struct shmem_inode_info *info = SHMEM_I(inode); | 1803 | struct shmem_inode_info *info = SHMEM_I(inode); |
1739 | struct shmem_xattr *xattr; | 1804 | struct shmem_xattr *xattr; |
1740 | struct shmem_xattr *new_xattr = NULL; | 1805 | struct shmem_xattr *new_xattr = NULL; |
1741 | size_t len; | ||
1742 | int err = 0; | 1806 | int err = 0; |
1743 | 1807 | ||
1744 | /* value == NULL means remove */ | 1808 | /* value == NULL means remove */ |
1745 | if (value) { | 1809 | if (value) { |
1746 | /* wrap around? */ | 1810 | new_xattr = shmem_xattr_alloc(value, size); |
1747 | len = sizeof(*new_xattr) + size; | ||
1748 | if (len <= sizeof(*new_xattr)) | ||
1749 | return -ENOMEM; | ||
1750 | |||
1751 | new_xattr = kmalloc(len, GFP_KERNEL); | ||
1752 | if (!new_xattr) | 1811 | if (!new_xattr) |
1753 | return -ENOMEM; | 1812 | return -ENOMEM; |
1754 | 1813 | ||
@@ -1757,9 +1816,6 @@ static int shmem_xattr_set(struct dentry *dentry, const char *name, | |||
1757 | kfree(new_xattr); | 1816 | kfree(new_xattr); |
1758 | return -ENOMEM; | 1817 | return -ENOMEM; |
1759 | } | 1818 | } |
1760 | |||
1761 | new_xattr->size = size; | ||
1762 | memcpy(new_xattr->value, value, size); | ||
1763 | } | 1819 | } |
1764 | 1820 | ||
1765 | spin_lock(&info->lock); | 1821 | spin_lock(&info->lock); |
@@ -1858,7 +1914,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name, | |||
1858 | if (size == 0) | 1914 | if (size == 0) |
1859 | value = ""; /* empty EA, do not remove */ | 1915 | value = ""; /* empty EA, do not remove */ |
1860 | 1916 | ||
1861 | return shmem_xattr_set(dentry, name, value, size, flags); | 1917 | return shmem_xattr_set(dentry->d_inode, name, value, size, flags); |
1862 | 1918 | ||
1863 | } | 1919 | } |
1864 | 1920 | ||
@@ -1878,7 +1934,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name) | |||
1878 | if (err) | 1934 | if (err) |
1879 | return err; | 1935 | return err; |
1880 | 1936 | ||
1881 | return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE); | 1937 | return shmem_xattr_set(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); |
1882 | } | 1938 | } |
1883 | 1939 | ||
1884 | static bool xattr_is_trusted(const char *name) | 1940 | static bool xattr_is_trusted(const char *name) |
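The shmem.c hunks above factor the value allocation into shmem_xattr_alloc(), keeping its "wrap around?" guard: the header-plus-payload length is computed in size_t and rejected unless it is strictly larger than the header, which catches both a zero-length payload and arithmetic wrap-around. The check in isolation (the struct layout below is invented for the demo):

```c
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct xattr_model {
	size_t	size;
	char	value[];	/* payload copied in right after the header */
};

static struct xattr_model *xattr_alloc_model(const void *value, size_t size)
{
	size_t len = sizeof(struct xattr_model) + size;

	/* wrap around (or empty payload)?  then len <= header size */
	if (len <= sizeof(struct xattr_model))
		return NULL;

	struct xattr_model *x = malloc(len);
	if (!x)
		return NULL;
	x->size = size;
	memcpy(x->value, value, size);
	return x;
}

int main(void)
{
	/* A huge size wraps the addition and is rejected before any copy. */
	printf("normal:   %p\n", (void *)xattr_alloc_model("abc", 3));
	printf("overflow: %p\n", (void *)xattr_alloc_model("", SIZE_MAX));
	return 0;
}
```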
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -3284,12 +3284,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3284 | if (in_interrupt() || (flags & __GFP_THISNODE)) | 3284 | if (in_interrupt() || (flags & __GFP_THISNODE)) |
3285 | return NULL; | 3285 | return NULL; |
3286 | nid_alloc = nid_here = numa_mem_id(); | 3286 | nid_alloc = nid_here = numa_mem_id(); |
3287 | get_mems_allowed(); | ||
3288 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3287 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
3289 | nid_alloc = cpuset_slab_spread_node(); | 3288 | nid_alloc = cpuset_slab_spread_node(); |
3290 | else if (current->mempolicy) | 3289 | else if (current->mempolicy) |
3291 | nid_alloc = slab_node(current->mempolicy); | 3290 | nid_alloc = slab_node(current->mempolicy); |
3292 | put_mems_allowed(); | ||
3293 | if (nid_alloc != nid_here) | 3291 | if (nid_alloc != nid_here) |
3294 | return ____cache_alloc_node(cachep, flags, nid_alloc); | 3292 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
3295 | return NULL; | 3293 | return NULL; |
@@ -3312,14 +3310,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3312 | enum zone_type high_zoneidx = gfp_zone(flags); | 3310 | enum zone_type high_zoneidx = gfp_zone(flags); |
3313 | void *obj = NULL; | 3311 | void *obj = NULL; |
3314 | int nid; | 3312 | int nid; |
3313 | unsigned int cpuset_mems_cookie; | ||
3315 | 3314 | ||
3316 | if (flags & __GFP_THISNODE) | 3315 | if (flags & __GFP_THISNODE) |
3317 | return NULL; | 3316 | return NULL; |
3318 | 3317 | ||
3319 | get_mems_allowed(); | ||
3320 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | ||
3321 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | 3318 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
3322 | 3319 | ||
3320 | retry_cpuset: | ||
3321 | cpuset_mems_cookie = get_mems_allowed(); | ||
3322 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | ||
3323 | |||
3323 | retry: | 3324 | retry: |
3324 | /* | 3325 | /* |
3325 | * Look through allowed nodes for objects available | 3326 | * Look through allowed nodes for objects available |
@@ -3372,7 +3373,9 @@ retry: | |||
3372 | } | 3373 | } |
3373 | } | 3374 | } |
3374 | } | 3375 | } |
3375 | put_mems_allowed(); | 3376 | |
3377 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) | ||
3378 | goto retry_cpuset; | ||
3376 | return obj; | 3379 | return obj; |
3377 | } | 3380 | } |
3378 | 3381 | ||
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -1581,6 +1581,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1581 | struct zone *zone; | 1581 | struct zone *zone; |
1582 | enum zone_type high_zoneidx = gfp_zone(flags); | 1582 | enum zone_type high_zoneidx = gfp_zone(flags); |
1583 | void *object; | 1583 | void *object; |
1584 | unsigned int cpuset_mems_cookie; | ||
1584 | 1585 | ||
1585 | /* | 1586 | /* |
1586 | * The defrag ratio allows a configuration of the tradeoffs between | 1587 | * The defrag ratio allows a configuration of the tradeoffs between |
@@ -1604,23 +1605,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1604 | get_cycles() % 1024 > s->remote_node_defrag_ratio) | 1605 | get_cycles() % 1024 > s->remote_node_defrag_ratio) |
1605 | return NULL; | 1606 | return NULL; |
1606 | 1607 | ||
1607 | get_mems_allowed(); | 1608 | do { |
1608 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | 1609 | cpuset_mems_cookie = get_mems_allowed(); |
1609 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1610 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); |
1610 | struct kmem_cache_node *n; | 1611 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1611 | 1612 | struct kmem_cache_node *n; | |
1612 | n = get_node(s, zone_to_nid(zone)); | 1613 | |
1613 | 1614 | n = get_node(s, zone_to_nid(zone)); | |
1614 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1615 | |
1615 | n->nr_partial > s->min_partial) { | 1616 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
1616 | object = get_partial_node(s, n, c); | 1617 | n->nr_partial > s->min_partial) { |
1617 | if (object) { | 1618 | object = get_partial_node(s, n, c); |
1618 | put_mems_allowed(); | 1619 | if (object) { |
1619 | return object; | 1620 | /* |
1621 | * Return the object even if | ||
1622 | * put_mems_allowed indicated that | ||
1623 | * the cpuset mems_allowed was | ||
1624 | * updated in parallel. It's a | ||
1625 | * harmless race between the alloc | ||
1626 | * and the cpuset update. | ||
1627 | */ | ||
1628 | put_mems_allowed(cpuset_mems_cookie); | ||
1629 | return object; | ||
1630 | } | ||
1620 | } | 1631 | } |
1621 | } | 1632 | } |
1622 | } | 1633 | } while (!put_mems_allowed(cpuset_mems_cookie)); |
1623 | put_mems_allowed(); | ||
1624 | #endif | 1634 | #endif |
1625 | return NULL; | 1635 | return NULL; |
1626 | } | 1636 | } |
diff --git a/mm/sparse.c b/mm/sparse.c index 61d7cde23111..a8bc7d364deb 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -353,29 +353,21 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, | |||
353 | 353 | ||
354 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), | 354 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), |
355 | usemap_count); | 355 | usemap_count); |
356 | if (usemap) { | 356 | if (!usemap) { |
357 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 357 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); |
358 | if (!present_section_nr(pnum)) | 358 | if (!usemap) { |
359 | continue; | 359 | printk(KERN_WARNING "%s: allocation failed\n", __func__); |
360 | usemap_map[pnum] = usemap; | 360 | return; |
361 | usemap += size; | ||
362 | } | 361 | } |
363 | return; | ||
364 | } | 362 | } |
365 | 363 | ||
366 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); | 364 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
367 | if (usemap) { | 365 | if (!present_section_nr(pnum)) |
368 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 366 | continue; |
369 | if (!present_section_nr(pnum)) | 367 | usemap_map[pnum] = usemap; |
370 | continue; | 368 | usemap += size; |
371 | usemap_map[pnum] = usemap; | 369 | check_usemap_section_nr(nodeid, usemap_map[pnum]); |
372 | usemap += size; | ||
373 | check_usemap_section_nr(nodeid, usemap_map[pnum]); | ||
374 | } | ||
375 | return; | ||
376 | } | 370 | } |
377 | |||
378 | printk(KERN_WARNING "%s: allocation failed\n", __func__); | ||
379 | } | 371 | } |
380 | 372 | ||
381 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 373 | #ifndef CONFIG_SPARSEMEM_VMEMMAP |
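The sparse.c rework above collapses two copies of the usemap_map[] fill loop into one: the fallback alloc_bootmem_node() call is folded inline and the function bails out early when both allocations fail. The same shape in miniature; both allocators below are stand-ins, not the bootmem API.

```c
#include <stdio.h>
#include <stdlib.h>

static void *alloc_preferred(size_t n) { (void)n; return NULL; }	/* pretend it fails */
static void *alloc_fallback(size_t n)  { return malloc(n); }

/* One fill loop instead of two copies guarded by nested ifs. */
static int fill_map(char **map, int n)
{
	char *buf = alloc_preferred(n);

	if (!buf) {
		buf = alloc_fallback(n);
		if (!buf) {
			fprintf(stderr, "allocation failed\n");
			return -1;	/* early bail-out, no duplicated loop */
		}
	}

	for (int i = 0; i < n; i++)
		map[i] = buf + i;	/* single copy of the fill loop */
	return 0;
}

int main(void)
{
	char *map[4];

	printf("fill_map: %d\n", fill_map(map, 4));
	return 0;
}
```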
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -496,7 +496,7 @@ static void lru_deactivate_fn(struct page *page, void *arg) | |||
496 | * Either "cpu" is the current CPU, and preemption has already been | 496 | * Either "cpu" is the current CPU, and preemption has already been |
497 | * disabled; or "cpu" is being hot-unplugged, and is already dead. | 497 | * disabled; or "cpu" is being hot-unplugged, and is already dead. |
498 | */ | 498 | */ |
499 | static void drain_cpu_pagevecs(int cpu) | 499 | void lru_add_drain_cpu(int cpu) |
500 | { | 500 | { |
501 | struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); | 501 | struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); |
502 | struct pagevec *pvec; | 502 | struct pagevec *pvec; |
@@ -553,7 +553,7 @@ void deactivate_page(struct page *page) | |||
553 | 553 | ||
554 | void lru_add_drain(void) | 554 | void lru_add_drain(void) |
555 | { | 555 | { |
556 | drain_cpu_pagevecs(get_cpu()); | 556 | lru_add_drain_cpu(get_cpu()); |
557 | put_cpu(); | 557 | put_cpu(); |
558 | } | 558 | } |
559 | 559 | ||
diff --git a/mm/swap_state.c b/mm/swap_state.c index ea6b32d61873..9d3dd3763cf7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -372,25 +372,23 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
372 | struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | 372 | struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, |
373 | struct vm_area_struct *vma, unsigned long addr) | 373 | struct vm_area_struct *vma, unsigned long addr) |
374 | { | 374 | { |
375 | int nr_pages; | ||
376 | struct page *page; | 375 | struct page *page; |
377 | unsigned long offset; | 376 | unsigned long offset = swp_offset(entry); |
378 | unsigned long end_offset; | 377 | unsigned long start_offset, end_offset; |
378 | unsigned long mask = (1UL << page_cluster) - 1; | ||
379 | 379 | ||
380 | /* | 380 | /* Read a page_cluster sized and aligned cluster around offset. */ |
381 | * Get starting offset for readaround, and number of pages to read. | 381 | start_offset = offset & ~mask; |
382 | * Adjust starting address by readbehind (for NUMA interleave case)? | 382 | end_offset = offset | mask; |
383 | * No, it's very unlikely that swap layout would follow vma layout, | 383 | if (!start_offset) /* First page is swap header. */ |
384 | * more likely that neighbouring swap pages came from the same node: | 384 | start_offset++; |
385 | * so use the same "addr" to choose the same node for each swap read. | 385 | |
386 | */ | 386 | for (offset = start_offset; offset <= end_offset ; offset++) { |
387 | nr_pages = valid_swaphandles(entry, &offset); | ||
388 | for (end_offset = offset + nr_pages; offset < end_offset; offset++) { | ||
389 | /* Ok, do the async read-ahead now */ | 387 | /* Ok, do the async read-ahead now */ |
390 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), | 388 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), |
391 | gfp_mask, vma, addr); | 389 | gfp_mask, vma, addr); |
392 | if (!page) | 390 | if (!page) |
393 | break; | 391 | continue; |
394 | page_cache_release(page); | 392 | page_cache_release(page); |
395 | } | 393 | } |
396 | lru_add_drain(); /* Push any new pages onto the LRU now */ | 394 | lru_add_drain(); /* Push any new pages onto the LRU now */ |
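The swapin_readahead() rewrite above derives the readahead window with bit arithmetic alone: with mask = (1 << page_cluster) - 1, start_offset = offset & ~mask and end_offset = offset | mask span the aligned cluster containing the faulting offset, and slot 0 is skipped because it holds the swap header. The arithmetic in isolation:

```c
#include <stdio.h>

int main(void)
{
	unsigned long page_cluster = 3;			/* 2^3 = 8-slot window */
	unsigned long mask = (1UL << page_cluster) - 1;
	unsigned long offsets[] = { 21, 5, 0 };

	for (int i = 0; i < 3; i++) {
		unsigned long offset = offsets[i];
		unsigned long start = offset & ~mask;	/* round down to cluster */
		unsigned long end = offset | mask;	/* last slot in cluster  */

		if (!start)		/* slot 0 is the swap header, skip it */
			start++;
		printf("offset %2lu -> read slots [%lu, %lu]\n", offset, start, end);
	}
	return 0;
}
```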
diff --git a/mm/swapfile.c b/mm/swapfile.c index 6bf67ab6e469..dae42f380d6e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -932,9 +932,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
932 | pmd = pmd_offset(pud, addr); | 932 | pmd = pmd_offset(pud, addr); |
933 | do { | 933 | do { |
934 | next = pmd_addr_end(addr, end); | 934 | next = pmd_addr_end(addr, end); |
935 | if (unlikely(pmd_trans_huge(*pmd))) | 935 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
936 | continue; | ||
937 | if (pmd_none_or_clear_bad(pmd)) | ||
938 | continue; | 936 | continue; |
939 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); | 937 | ret = unuse_pte_range(vma, pmd, addr, next, entry, page); |
940 | if (ret) | 938 | if (ret) |
@@ -2107,7 +2105,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2107 | p->flags |= SWP_SOLIDSTATE; | 2105 | p->flags |= SWP_SOLIDSTATE; |
2108 | p->cluster_next = 1 + (random32() % p->highest_bit); | 2106 | p->cluster_next = 1 + (random32() % p->highest_bit); |
2109 | } | 2107 | } |
2110 | if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD)) | 2108 | if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) |
2111 | p->flags |= SWP_DISCARDABLE; | 2109 | p->flags |= SWP_DISCARDABLE; |
2112 | } | 2110 | } |
2113 | 2111 | ||
@@ -2292,58 +2290,6 @@ int swapcache_prepare(swp_entry_t entry) | |||
2292 | } | 2290 | } |
2293 | 2291 | ||
2294 | /* | 2292 | /* |
2295 | * swap_lock prevents swap_map being freed. Don't grab an extra | ||
2296 | * reference on the swaphandle, it doesn't matter if it becomes unused. | ||
2297 | */ | ||
2298 | int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | ||
2299 | { | ||
2300 | struct swap_info_struct *si; | ||
2301 | int our_page_cluster = page_cluster; | ||
2302 | pgoff_t target, toff; | ||
2303 | pgoff_t base, end; | ||
2304 | int nr_pages = 0; | ||
2305 | |||
2306 | if (!our_page_cluster) /* no readahead */ | ||
2307 | return 0; | ||
2308 | |||
2309 | si = swap_info[swp_type(entry)]; | ||
2310 | target = swp_offset(entry); | ||
2311 | base = (target >> our_page_cluster) << our_page_cluster; | ||
2312 | end = base + (1 << our_page_cluster); | ||
2313 | if (!base) /* first page is swap header */ | ||
2314 | base++; | ||
2315 | |||
2316 | spin_lock(&swap_lock); | ||
2317 | if (end > si->max) /* don't go beyond end of map */ | ||
2318 | end = si->max; | ||
2319 | |||
2320 | /* Count contiguous allocated slots above our target */ | ||
2321 | for (toff = target; ++toff < end; nr_pages++) { | ||
2322 | /* Don't read in free or bad pages */ | ||
2323 | if (!si->swap_map[toff]) | ||
2324 | break; | ||
2325 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) | ||
2326 | break; | ||
2327 | } | ||
2328 | /* Count contiguous allocated slots below our target */ | ||
2329 | for (toff = target; --toff >= base; nr_pages++) { | ||
2330 | /* Don't read in free or bad pages */ | ||
2331 | if (!si->swap_map[toff]) | ||
2332 | break; | ||
2333 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) | ||
2334 | break; | ||
2335 | } | ||
2336 | spin_unlock(&swap_lock); | ||
2337 | |||
2338 | /* | ||
2339 | * Indicate starting offset, and return number of pages to get: | ||
2340 | * if only 1, say 0, since there's then no readahead to be done. | ||
2341 | */ | ||
2342 | *offset = ++toff; | ||
2343 | return nr_pages? ++nr_pages: 0; | ||
2344 | } | ||
2345 | |||
2346 | /* | ||
2347 | * add_swap_count_continuation - called when a swap count is duplicated | 2293 | * add_swap_count_continuation - called when a swap count is duplicated |
2348 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's | 2294 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's |
2349 | * page of the original vmalloc'ed swap_map, to hold the continuation count | 2295 | * page of the original vmalloc'ed swap_map, to hold the continuation count |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -239,6 +239,47 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | |||
239 | next->vm_prev = vma; | 239 | next->vm_prev = vma; |
240 | } | 240 | } |
241 | 241 | ||
242 | /* Check if the vma is being used as a stack by this task */ | ||
243 | static int vm_is_stack_for_task(struct task_struct *t, | ||
244 | struct vm_area_struct *vma) | ||
245 | { | ||
246 | return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); | ||
247 | } | ||
248 | |||
249 | /* | ||
250 | * Check if the vma is being used as a stack. | ||
251 | * If is_group is non-zero, check in the entire thread group or else | ||
252 | * just check in the current task. Returns the pid of the task that | ||
253 | * the vma is stack for. | ||
254 | */ | ||
255 | pid_t vm_is_stack(struct task_struct *task, | ||
256 | struct vm_area_struct *vma, int in_group) | ||
257 | { | ||
258 | pid_t ret = 0; | ||
259 | |||
260 | if (vm_is_stack_for_task(task, vma)) | ||
261 | return task->pid; | ||
262 | |||
263 | if (in_group) { | ||
264 | struct task_struct *t; | ||
265 | rcu_read_lock(); | ||
266 | if (!pid_alive(task)) | ||
267 | goto done; | ||
268 | |||
269 | t = task; | ||
270 | do { | ||
271 | if (vm_is_stack_for_task(t, vma)) { | ||
272 | ret = t->pid; | ||
273 | goto done; | ||
274 | } | ||
275 | } while_each_thread(task, t); | ||
276 | done: | ||
277 | rcu_read_unlock(); | ||
278 | } | ||
279 | |||
280 | return ret; | ||
281 | } | ||
282 | |||
242 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) | 283 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) |
243 | void arch_pick_mmap_layout(struct mm_struct *mm) | 284 | void arch_pick_mmap_layout(struct mm_struct *mm) |
244 | { | 285 | { |
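The new vm_is_stack() helper above reports which task (or, with in_group set, which thread in the group) keeps its stack pointer inside a given vma. A hedged sketch of how a caller might use it; the function name print_stack_owner and the pr_info message are illustrative, not taken from this patch:

/* Hypothetical caller: label a vma as a stack if some thread in the
 * group has its stack pointer (KSTK_ESP) inside it. */
static void print_stack_owner(struct task_struct *task,
                              struct vm_area_struct *vma)
{
        pid_t tid = vm_is_stack(task, vma, 1);  /* 1 = search the whole thread group */

        if (tid)
                pr_info("vma %lx-%lx is the stack of tid %d\n",
                        vma->vm_start, vma->vm_end, tid);
}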
diff --git a/mm/vmscan.c b/mm/vmscan.c index c52b23552659..49f15ef0a99a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1138,7 +1138,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) | |||
1138 | * @mz: The mem_cgroup_zone to pull pages from. | 1138 | * @mz: The mem_cgroup_zone to pull pages from. |
1139 | * @dst: The temp list to put pages on to. | 1139 | * @dst: The temp list to put pages on to. |
1140 | * @nr_scanned: The number of pages that were scanned. | 1140 | * @nr_scanned: The number of pages that were scanned. |
1141 | * @order: The caller's attempted allocation order | 1141 | * @sc: The scan_control struct for this reclaim session |
1142 | * @mode: One of the LRU isolation modes | 1142 | * @mode: One of the LRU isolation modes |
1143 | * @active: True [1] if isolating active pages | 1143 | * @active: True [1] if isolating active pages |
1144 | * @file: True [1] if isolating file [!anon] pages | 1144 | * @file: True [1] if isolating file [!anon] pages |
@@ -1147,8 +1147,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) | |||
1147 | */ | 1147 | */ |
1148 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | 1148 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
1149 | struct mem_cgroup_zone *mz, struct list_head *dst, | 1149 | struct mem_cgroup_zone *mz, struct list_head *dst, |
1150 | unsigned long *nr_scanned, int order, isolate_mode_t mode, | 1150 | unsigned long *nr_scanned, struct scan_control *sc, |
1151 | int active, int file) | 1151 | isolate_mode_t mode, int active, int file) |
1152 | { | 1152 | { |
1153 | struct lruvec *lruvec; | 1153 | struct lruvec *lruvec; |
1154 | struct list_head *src; | 1154 | struct list_head *src; |
@@ -1194,7 +1194,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1194 | BUG(); | 1194 | BUG(); |
1195 | } | 1195 | } |
1196 | 1196 | ||
1197 | if (!order) | 1197 | if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)) |
1198 | continue; | 1198 | continue; |
1199 | 1199 | ||
1200 | /* | 1200 | /* |
@@ -1208,8 +1208,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1208 | */ | 1208 | */ |
1209 | zone_id = page_zone_id(page); | 1209 | zone_id = page_zone_id(page); |
1210 | page_pfn = page_to_pfn(page); | 1210 | page_pfn = page_to_pfn(page); |
1211 | pfn = page_pfn & ~((1 << order) - 1); | 1211 | pfn = page_pfn & ~((1 << sc->order) - 1); |
1212 | end_pfn = pfn + (1 << order); | 1212 | end_pfn = pfn + (1 << sc->order); |
1213 | for (; pfn < end_pfn; pfn++) { | 1213 | for (; pfn < end_pfn; pfn++) { |
1214 | struct page *cursor_page; | 1214 | struct page *cursor_page; |
1215 | 1215 | ||
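The lumpy-reclaim path above now takes the request order from sc->order; the surrounding arithmetic rounds the isolated page's pfn down to its naturally aligned 2^order block and scans to the block's end. A standalone sketch of that rounding with example numbers:

#include <stdio.h>

/* Userspace sketch of the block arithmetic used above; the pfn and
 * order values are examples only. */
int main(void)
{
        unsigned long page_pfn = 1029;
        int order = 2;                                  /* 2^2 = 4 pages per block */
        unsigned long pfn = page_pfn & ~((1UL << order) - 1);
        unsigned long end_pfn = pfn + (1UL << order);

        printf("pfn %lu at order %d -> scan block [%lu, %lu)\n",
               page_pfn, order, pfn, end_pfn);          /* prints [1028, 1032) */
        return 0;
}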
@@ -1275,7 +1275,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1275 | 1275 | ||
1276 | *nr_scanned = scan; | 1276 | *nr_scanned = scan; |
1277 | 1277 | ||
1278 | trace_mm_vmscan_lru_isolate(order, | 1278 | trace_mm_vmscan_lru_isolate(sc->order, |
1279 | nr_to_scan, scan, | 1279 | nr_to_scan, scan, |
1280 | nr_taken, | 1280 | nr_taken, |
1281 | nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, | 1281 | nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, |
@@ -1413,7 +1413,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz, | |||
1413 | unsigned long *nr_anon, | 1413 | unsigned long *nr_anon, |
1414 | unsigned long *nr_file) | 1414 | unsigned long *nr_file) |
1415 | { | 1415 | { |
1416 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | ||
1417 | struct zone *zone = mz->zone; | 1416 | struct zone *zone = mz->zone; |
1418 | unsigned int count[NR_LRU_LISTS] = { 0, }; | 1417 | unsigned int count[NR_LRU_LISTS] = { 0, }; |
1419 | unsigned long nr_active = 0; | 1418 | unsigned long nr_active = 0; |
@@ -1434,6 +1433,7 @@ update_isolated_counts(struct mem_cgroup_zone *mz, | |||
1434 | count[lru] += numpages; | 1433 | count[lru] += numpages; |
1435 | } | 1434 | } |
1436 | 1435 | ||
1436 | preempt_disable(); | ||
1437 | __count_vm_events(PGDEACTIVATE, nr_active); | 1437 | __count_vm_events(PGDEACTIVATE, nr_active); |
1438 | 1438 | ||
1439 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, | 1439 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, |
@@ -1448,8 +1448,9 @@ update_isolated_counts(struct mem_cgroup_zone *mz, | |||
1448 | *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; | 1448 | *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; |
1449 | *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; | 1449 | *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; |
1450 | 1450 | ||
1451 | reclaim_stat->recent_scanned[0] += *nr_anon; | 1451 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); |
1452 | reclaim_stat->recent_scanned[1] += *nr_file; | 1452 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); |
1453 | preempt_enable(); | ||
1453 | } | 1454 | } |
1454 | 1455 | ||
1455 | /* | 1456 | /* |
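update_isolated_counts() now runs after zone->lru_lock has been dropped, so the double-underscore vmstat helpers, which touch per-cpu counters, are bracketed by preempt_disable()/preempt_enable() instead of relying on the irq-disabled lock. A hedged sketch of that pattern as a standalone helper; the function name account_isolated is hypothetical:

/* Hypothetical helper showing the pattern above: __mod_zone_page_state()
 * is the non-preempt-safe variant, so pin the task to this CPU for the
 * duration of the batched counter updates. */
static void account_isolated(struct zone *zone,
                             unsigned long nr_anon, unsigned long nr_file)
{
        preempt_disable();
        __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
        __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
        preempt_enable();
}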
@@ -1509,8 +1510,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1509 | unsigned long nr_file; | 1510 | unsigned long nr_file; |
1510 | unsigned long nr_dirty = 0; | 1511 | unsigned long nr_dirty = 0; |
1511 | unsigned long nr_writeback = 0; | 1512 | unsigned long nr_writeback = 0; |
1512 | isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; | 1513 | isolate_mode_t isolate_mode = ISOLATE_INACTIVE; |
1513 | struct zone *zone = mz->zone; | 1514 | struct zone *zone = mz->zone; |
1515 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | ||
1514 | 1516 | ||
1515 | while (unlikely(too_many_isolated(zone, file, sc))) { | 1517 | while (unlikely(too_many_isolated(zone, file, sc))) { |
1516 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1518 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -1522,20 +1524,19 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1522 | 1524 | ||
1523 | set_reclaim_mode(priority, sc, false); | 1525 | set_reclaim_mode(priority, sc, false); |
1524 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | 1526 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) |
1525 | reclaim_mode |= ISOLATE_ACTIVE; | 1527 | isolate_mode |= ISOLATE_ACTIVE; |
1526 | 1528 | ||
1527 | lru_add_drain(); | 1529 | lru_add_drain(); |
1528 | 1530 | ||
1529 | if (!sc->may_unmap) | 1531 | if (!sc->may_unmap) |
1530 | reclaim_mode |= ISOLATE_UNMAPPED; | 1532 | isolate_mode |= ISOLATE_UNMAPPED; |
1531 | if (!sc->may_writepage) | 1533 | if (!sc->may_writepage) |
1532 | reclaim_mode |= ISOLATE_CLEAN; | 1534 | isolate_mode |= ISOLATE_CLEAN; |
1533 | 1535 | ||
1534 | spin_lock_irq(&zone->lru_lock); | 1536 | spin_lock_irq(&zone->lru_lock); |
1535 | 1537 | ||
1536 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, | 1538 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, |
1537 | &nr_scanned, sc->order, | 1539 | sc, isolate_mode, 0, file); |
1538 | reclaim_mode, 0, file); | ||
1539 | if (global_reclaim(sc)) { | 1540 | if (global_reclaim(sc)) { |
1540 | zone->pages_scanned += nr_scanned; | 1541 | zone->pages_scanned += nr_scanned; |
1541 | if (current_is_kswapd()) | 1542 | if (current_is_kswapd()) |
@@ -1545,19 +1546,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1545 | __count_zone_vm_events(PGSCAN_DIRECT, zone, | 1546 | __count_zone_vm_events(PGSCAN_DIRECT, zone, |
1546 | nr_scanned); | 1547 | nr_scanned); |
1547 | } | 1548 | } |
1549 | spin_unlock_irq(&zone->lru_lock); | ||
1548 | 1550 | ||
1549 | if (nr_taken == 0) { | 1551 | if (nr_taken == 0) |
1550 | spin_unlock_irq(&zone->lru_lock); | ||
1551 | return 0; | 1552 | return 0; |
1552 | } | ||
1553 | 1553 | ||
1554 | update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); | 1554 | update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); |
1555 | 1555 | ||
1556 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); | ||
1557 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); | ||
1558 | |||
1559 | spin_unlock_irq(&zone->lru_lock); | ||
1560 | |||
1561 | nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, | 1556 | nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, |
1562 | &nr_dirty, &nr_writeback); | 1557 | &nr_dirty, &nr_writeback); |
1563 | 1558 | ||
@@ -1570,6 +1565,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1570 | 1565 | ||
1571 | spin_lock_irq(&zone->lru_lock); | 1566 | spin_lock_irq(&zone->lru_lock); |
1572 | 1567 | ||
1568 | reclaim_stat->recent_scanned[0] += nr_anon; | ||
1569 | reclaim_stat->recent_scanned[1] += nr_file; | ||
1570 | |||
1573 | if (current_is_kswapd()) | 1571 | if (current_is_kswapd()) |
1574 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); | 1572 | __count_vm_events(KSWAPD_STEAL, nr_reclaimed); |
1575 | __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); | 1573 | __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); |
@@ -1643,18 +1641,6 @@ static void move_active_pages_to_lru(struct zone *zone, | |||
1643 | unsigned long pgmoved = 0; | 1641 | unsigned long pgmoved = 0; |
1644 | struct page *page; | 1642 | struct page *page; |
1645 | 1643 | ||
1646 | if (buffer_heads_over_limit) { | ||
1647 | spin_unlock_irq(&zone->lru_lock); | ||
1648 | list_for_each_entry(page, list, lru) { | ||
1649 | if (page_has_private(page) && trylock_page(page)) { | ||
1650 | if (page_has_private(page)) | ||
1651 | try_to_release_page(page, 0); | ||
1652 | unlock_page(page); | ||
1653 | } | ||
1654 | } | ||
1655 | spin_lock_irq(&zone->lru_lock); | ||
1656 | } | ||
1657 | |||
1658 | while (!list_empty(list)) { | 1644 | while (!list_empty(list)) { |
1659 | struct lruvec *lruvec; | 1645 | struct lruvec *lruvec; |
1660 | 1646 | ||
@@ -1699,21 +1685,22 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1699 | struct page *page; | 1685 | struct page *page; |
1700 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); | 1686 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); |
1701 | unsigned long nr_rotated = 0; | 1687 | unsigned long nr_rotated = 0; |
1702 | isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; | 1688 | isolate_mode_t isolate_mode = ISOLATE_ACTIVE; |
1703 | struct zone *zone = mz->zone; | 1689 | struct zone *zone = mz->zone; |
1704 | 1690 | ||
1705 | lru_add_drain(); | 1691 | lru_add_drain(); |
1706 | 1692 | ||
1693 | reset_reclaim_mode(sc); | ||
1694 | |||
1707 | if (!sc->may_unmap) | 1695 | if (!sc->may_unmap) |
1708 | reclaim_mode |= ISOLATE_UNMAPPED; | 1696 | isolate_mode |= ISOLATE_UNMAPPED; |
1709 | if (!sc->may_writepage) | 1697 | if (!sc->may_writepage) |
1710 | reclaim_mode |= ISOLATE_CLEAN; | 1698 | isolate_mode |= ISOLATE_CLEAN; |
1711 | 1699 | ||
1712 | spin_lock_irq(&zone->lru_lock); | 1700 | spin_lock_irq(&zone->lru_lock); |
1713 | 1701 | ||
1714 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, | 1702 | nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, |
1715 | &nr_scanned, sc->order, | 1703 | isolate_mode, 1, file); |
1716 | reclaim_mode, 1, file); | ||
1717 | if (global_reclaim(sc)) | 1704 | if (global_reclaim(sc)) |
1718 | zone->pages_scanned += nr_scanned; | 1705 | zone->pages_scanned += nr_scanned; |
1719 | 1706 | ||
@@ -1737,6 +1724,14 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1737 | continue; | 1724 | continue; |
1738 | } | 1725 | } |
1739 | 1726 | ||
1727 | if (unlikely(buffer_heads_over_limit)) { | ||
1728 | if (page_has_private(page) && trylock_page(page)) { | ||
1729 | if (page_has_private(page)) | ||
1730 | try_to_release_page(page, 0); | ||
1731 | unlock_page(page); | ||
1732 | } | ||
1733 | } | ||
1734 | |||
1740 | if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { | 1735 | if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { |
1741 | nr_rotated += hpage_nr_pages(page); | 1736 | nr_rotated += hpage_nr_pages(page); |
1742 | /* | 1737 | /* |
@@ -2112,7 +2107,12 @@ restart: | |||
2112 | * with multiple processes reclaiming pages, the total | 2107 | * with multiple processes reclaiming pages, the total |
2113 | * freeing target can get unreasonably large. | 2108 | * freeing target can get unreasonably large. |
2114 | */ | 2109 | */ |
2115 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) | 2110 | if (nr_reclaimed >= nr_to_reclaim) |
2111 | nr_to_reclaim = 0; | ||
2112 | else | ||
2113 | nr_to_reclaim -= nr_reclaimed; | ||
2114 | |||
2115 | if (!nr_to_reclaim && priority < DEF_PRIORITY) | ||
2116 | break; | 2116 | break; |
2117 | } | 2117 | } |
2118 | blk_finish_plug(&plug); | 2118 | blk_finish_plug(&plug); |
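The reclaim-target check above becomes a running remainder: each pass subtracts what it reclaimed, saturating at zero, and the loop only breaks early, at priority below DEF_PRIORITY, once the remainder reaches zero. An illustrative standalone sketch of that saturating subtraction with invented per-pass counts; it is not the exact kernel loop:

#include <stdio.h>

/* Illustrative only: nr_to_reclaim as a running remainder that several
 * passes can whittle down, instead of requiring one pass to meet the
 * whole target. */
int main(void)
{
        unsigned long nr_to_reclaim = 32;
        unsigned long reclaimed_per_pass[] = { 10, 8, 20 };

        for (int i = 0; i < 3; i++) {
                unsigned long nr_reclaimed = reclaimed_per_pass[i];

                if (nr_reclaimed >= nr_to_reclaim)
                        nr_to_reclaim = 0;
                else
                        nr_to_reclaim -= nr_reclaimed;

                printf("pass %d: remaining target %lu\n", i, nr_to_reclaim);
                if (!nr_to_reclaim)
                        break;                  /* target met */
        }
        return 0;
}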
@@ -2195,7 +2195,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) | |||
2195 | * If compaction is deferred, reclaim up to a point where | 2195 | * If compaction is deferred, reclaim up to a point where |
2196 | * compaction will have a chance of success when re-enabled | 2196 | * compaction will have a chance of success when re-enabled |
2197 | */ | 2197 | */ |
2198 | if (compaction_deferred(zone)) | 2198 | if (compaction_deferred(zone, sc->order)) |
2199 | return watermark_ok; | 2199 | return watermark_ok; |
2200 | 2200 | ||
2201 | /* If compaction is not ready to start, keep reclaiming */ | 2201 | /* If compaction is not ready to start, keep reclaiming */ |
@@ -2235,6 +2235,14 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, | |||
2235 | unsigned long nr_soft_scanned; | 2235 | unsigned long nr_soft_scanned; |
2236 | bool aborted_reclaim = false; | 2236 | bool aborted_reclaim = false; |
2237 | 2237 | ||
2238 | /* | ||
2239 | * If the number of buffer_heads in the machine exceeds the maximum | ||
2240 | * allowed level, force direct reclaim to scan the highmem zone as | ||
2241 | * highmem pages could be pinning lowmem pages storing buffer_heads | ||
2242 | */ | ||
2243 | if (buffer_heads_over_limit) | ||
2244 | sc->gfp_mask |= __GFP_HIGHMEM; | ||
2245 | |||
2238 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 2246 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2239 | gfp_zone(sc->gfp_mask), sc->nodemask) { | 2247 | gfp_zone(sc->gfp_mask), sc->nodemask) { |
2240 | if (!populated_zone(zone)) | 2248 | if (!populated_zone(zone)) |
@@ -2255,8 +2263,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, | |||
2255 | * Even though compaction is invoked for any | 2263 | * Even though compaction is invoked for any |
2256 | * non-zero order, only frequent costly order | 2264 | * non-zero order, only frequent costly order |
2257 | * reclamation is disruptive enough to become a | 2265 | * reclamation is disruptive enough to become a |
2258 | * noticable problem, like transparent huge page | 2266 | * noticeable problem, like transparent huge |
2259 | * allocations. | 2267 | * page allocations. |
2260 | */ | 2268 | */ |
2261 | if (compaction_ready(zone, sc)) { | 2269 | if (compaction_ready(zone, sc)) { |
2262 | aborted_reclaim = true; | 2270 | aborted_reclaim = true; |
@@ -2337,7 +2345,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2337 | unsigned long writeback_threshold; | 2345 | unsigned long writeback_threshold; |
2338 | bool aborted_reclaim; | 2346 | bool aborted_reclaim; |
2339 | 2347 | ||
2340 | get_mems_allowed(); | ||
2341 | delayacct_freepages_start(); | 2348 | delayacct_freepages_start(); |
2342 | 2349 | ||
2343 | if (global_reclaim(sc)) | 2350 | if (global_reclaim(sc)) |
@@ -2401,7 +2408,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2401 | 2408 | ||
2402 | out: | 2409 | out: |
2403 | delayacct_freepages_end(); | 2410 | delayacct_freepages_end(); |
2404 | put_mems_allowed(); | ||
2405 | 2411 | ||
2406 | if (sc->nr_reclaimed) | 2412 | if (sc->nr_reclaimed) |
2407 | return sc->nr_reclaimed; | 2413 | return sc->nr_reclaimed; |
@@ -2724,6 +2730,17 @@ loop_again: | |||
2724 | */ | 2730 | */ |
2725 | age_active_anon(zone, &sc, priority); | 2731 | age_active_anon(zone, &sc, priority); |
2726 | 2732 | ||
2733 | /* | ||
2734 | * If the number of buffer_heads in the machine | ||
2735 | * exceeds the maximum allowed level and this node | ||
2736 | * has a highmem zone, force kswapd to reclaim from | ||
2737 | * it to relieve lowmem pressure. | ||
2738 | */ | ||
2739 | if (buffer_heads_over_limit && is_highmem_idx(i)) { | ||
2740 | end_zone = i; | ||
2741 | break; | ||
2742 | } | ||
2743 | |||
2727 | if (!zone_watermark_ok_safe(zone, order, | 2744 | if (!zone_watermark_ok_safe(zone, order, |
2728 | high_wmark_pages(zone), 0, 0)) { | 2745 | high_wmark_pages(zone), 0, 0)) { |
2729 | end_zone = i; | 2746 | end_zone = i; |
@@ -2753,7 +2770,7 @@ loop_again: | |||
2753 | */ | 2770 | */ |
2754 | for (i = 0; i <= end_zone; i++) { | 2771 | for (i = 0; i <= end_zone; i++) { |
2755 | struct zone *zone = pgdat->node_zones + i; | 2772 | struct zone *zone = pgdat->node_zones + i; |
2756 | int nr_slab; | 2773 | int nr_slab, testorder; |
2757 | unsigned long balance_gap; | 2774 | unsigned long balance_gap; |
2758 | 2775 | ||
2759 | if (!populated_zone(zone)) | 2776 | if (!populated_zone(zone)) |
@@ -2786,7 +2803,21 @@ loop_again: | |||
2786 | (zone->present_pages + | 2803 | (zone->present_pages + |
2787 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / | 2804 | KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / |
2788 | KSWAPD_ZONE_BALANCE_GAP_RATIO); | 2805 | KSWAPD_ZONE_BALANCE_GAP_RATIO); |
2789 | if (!zone_watermark_ok_safe(zone, order, | 2806 | /* |
2807 | * Kswapd reclaims only single pages with compaction | ||
2808 | * enabled. Trying too hard to reclaim until contiguous | ||
2809 | * free pages have become available can hurt performance | ||
2810 | * by evicting too much useful data from memory. | ||
2811 | * Do not reclaim more than needed for compaction. | ||
2812 | */ | ||
2813 | testorder = order; | ||
2814 | if (COMPACTION_BUILD && order && | ||
2815 | compaction_suitable(zone, order) != | ||
2816 | COMPACT_SKIPPED) | ||
2817 | testorder = 0; | ||
2818 | |||
2819 | if ((buffer_heads_over_limit && is_highmem_idx(i)) || | ||
2820 | !zone_watermark_ok_safe(zone, order, | ||
2790 | high_wmark_pages(zone) + balance_gap, | 2821 | high_wmark_pages(zone) + balance_gap, |
2791 | end_zone, 0)) { | 2822 | end_zone, 0)) { |
2792 | shrink_zone(priority, zone, &sc); | 2823 | shrink_zone(priority, zone, &sc); |
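The testorder logic added above lets kswapd stop reclaiming for the full request order as soon as compaction could take over: if compaction is built in, the order is non-zero, and compaction_suitable() does not return COMPACT_SKIPPED, only the order-0 watermark is checked. A hedged sketch of that decision pulled out into a helper; the name effective_balance_order is hypothetical:

/* Hypothetical helper restating the decision above: keep reclaiming at
 * the requested order only while compaction lacks the free memory to
 * run; otherwise balancing at order 0 is enough and compaction will
 * assemble the high-order pages. */
static int effective_balance_order(struct zone *zone, int order)
{
        if (COMPACTION_BUILD && order &&
            compaction_suitable(zone, order) != COMPACT_SKIPPED)
                return 0;
        return order;
}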
@@ -2815,7 +2846,7 @@ loop_again: | |||
2815 | continue; | 2846 | continue; |
2816 | } | 2847 | } |
2817 | 2848 | ||
2818 | if (!zone_watermark_ok_safe(zone, order, | 2849 | if (!zone_watermark_ok_safe(zone, testorder, |
2819 | high_wmark_pages(zone), end_zone, 0)) { | 2850 | high_wmark_pages(zone), end_zone, 0)) { |
2820 | all_zones_ok = 0; | 2851 | all_zones_ok = 0; |
2821 | /* | 2852 | /* |
@@ -2903,6 +2934,8 @@ out: | |||
2903 | * and it is potentially going to sleep here. | 2934 | * and it is potentially going to sleep here. |
2904 | */ | 2935 | */ |
2905 | if (order) { | 2936 | if (order) { |
2937 | int zones_need_compaction = 1; | ||
2938 | |||
2906 | for (i = 0; i <= end_zone; i++) { | 2939 | for (i = 0; i <= end_zone; i++) { |
2907 | struct zone *zone = pgdat->node_zones + i; | 2940 | struct zone *zone = pgdat->node_zones + i; |
2908 | 2941 | ||
@@ -2912,6 +2945,10 @@ out: | |||
2912 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 2945 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2913 | continue; | 2946 | continue; |
2914 | 2947 | ||
2948 | /* Would compaction fail due to lack of free memory? */ | ||
2949 | if (compaction_suitable(zone, order) == COMPACT_SKIPPED) | ||
2950 | goto loop_again; | ||
2951 | |||
2915 | /* Confirm the zone is balanced for order-0 */ | 2952 | /* Confirm the zone is balanced for order-0 */ |
2916 | if (!zone_watermark_ok(zone, 0, | 2953 | if (!zone_watermark_ok(zone, 0, |
2917 | high_wmark_pages(zone), 0, 0)) { | 2954 | high_wmark_pages(zone), 0, 0)) { |
@@ -2919,11 +2956,17 @@ out: | |||
2919 | goto loop_again; | 2956 | goto loop_again; |
2920 | } | 2957 | } |
2921 | 2958 | ||
2959 | /* Check if the memory needs to be defragmented. */ | ||
2960 | if (zone_watermark_ok(zone, order, | ||
2961 | low_wmark_pages(zone), *classzone_idx, 0)) | ||
2962 | zones_need_compaction = 0; | ||
2963 | |||
2922 | /* If balanced, clear the congested flag */ | 2964 | /* If balanced, clear the congested flag */ |
2923 | zone_clear_flag(zone, ZONE_CONGESTED); | 2965 | zone_clear_flag(zone, ZONE_CONGESTED); |
2924 | if (i <= *classzone_idx) | ||
2925 | balanced += zone->present_pages; | ||
2926 | } | 2966 | } |
2967 | |||
2968 | if (zones_need_compaction) | ||
2969 | compact_pgdat(pgdat, order); | ||
2927 | } | 2970 | } |
2928 | 2971 | ||
2929 | /* | 2972 | /* |