author     Ingo Molnar <mingo@kernel.org>    2012-12-08 09:25:06 -0500
committer  Ingo Molnar <mingo@kernel.org>    2012-12-08 09:25:06 -0500
commit     f0b9abfb044649bc452fb2fb975ff2fd599cc6a3 (patch)
tree       7800081c5cb16a4dfee1e57a70f3be90f7b50d9a /mm
parent     adc1ef1e37358d3c17d1a74a58b2e104fc0bda15 (diff)
parent     1b3c393cd43f22ead8a6a2f839efc6df8ebd7465 (diff)
Merge branch 'linus' into perf/core
Conflicts:
	tools/perf/Makefile
	tools/perf/builtin-test.c
	tools/perf/perf.h
	tools/perf/tests/parse-events.c
	tools/perf/util/evsel.h

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm')
-rw-r--r--  mm/bootmem.c          10
-rw-r--r--  mm/compaction.c       10
-rw-r--r--  mm/highmem.c           2
-rw-r--r--  mm/memblock.c         24
-rw-r--r--  mm/memcontrol.c       67
-rw-r--r--  mm/memory-failure.c    8
-rw-r--r--  mm/memory.c           10
-rw-r--r--  mm/memory_hotplug.c    7
-rw-r--r--  mm/mempolicy.c        22
-rw-r--r--  mm/mmap.c              2
-rw-r--r--  mm/mmu_notifier.c     26
-rw-r--r--  mm/mmzone.c            6
-rw-r--r--  mm/nobootmem.c         3
-rw-r--r--  mm/page_alloc.c       83
-rw-r--r--  mm/rmap.c             20
-rw-r--r--  mm/shmem.c            44
-rw-r--r--  mm/sparse.c           10
-rw-r--r--  mm/swapfile.c          4
-rw-r--r--  mm/vmscan.c          107
19 files changed, 245 insertions, 220 deletions
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 434be4ae7a04..f468185b3b28 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -198,8 +198,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
198 int order = ilog2(BITS_PER_LONG); 198 int order = ilog2(BITS_PER_LONG);
199 199
200 __free_pages_bootmem(pfn_to_page(start), order); 200 __free_pages_bootmem(pfn_to_page(start), order);
201 fixup_zone_present_pages(page_to_nid(pfn_to_page(start)),
202 start, start + BITS_PER_LONG);
203 count += BITS_PER_LONG; 201 count += BITS_PER_LONG;
204 start += BITS_PER_LONG; 202 start += BITS_PER_LONG;
205 } else { 203 } else {
@@ -210,9 +208,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
210 if (vec & 1) { 208 if (vec & 1) {
211 page = pfn_to_page(start + off); 209 page = pfn_to_page(start + off);
212 __free_pages_bootmem(page, 0); 210 __free_pages_bootmem(page, 0);
213 fixup_zone_present_pages(
214 page_to_nid(page),
215 start + off, start + off + 1);
216 count++; 211 count++;
217 } 212 }
218 vec >>= 1; 213 vec >>= 1;
@@ -226,11 +221,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
226 pages = bdata->node_low_pfn - bdata->node_min_pfn; 221 pages = bdata->node_low_pfn - bdata->node_min_pfn;
227 pages = bootmem_bootmap_pages(pages); 222 pages = bootmem_bootmap_pages(pages);
228 count += pages; 223 count += pages;
229 while (pages--) { 224 while (pages--)
230 fixup_zone_present_pages(page_to_nid(page),
231 page_to_pfn(page), page_to_pfn(page) + 1);
232 __free_pages_bootmem(page++, 0); 225 __free_pages_bootmem(page++, 0);
233 }
234 226
235 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); 227 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
236 228
diff --git a/mm/compaction.c b/mm/compaction.c
index 9eef55838fca..694eaabaaebd 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -713,7 +713,15 @@ static void isolate_freepages(struct zone *zone,
713 713
714 /* Found a block suitable for isolating free pages from */ 714 /* Found a block suitable for isolating free pages from */
715 isolated = 0; 715 isolated = 0;
716 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); 716
717 /*
718 * As pfn may not start aligned, pfn+pageblock_nr_page
719 * may cross a MAX_ORDER_NR_PAGES boundary and miss
720 * a pfn_valid check. Ensure isolate_freepages_block()
721 * only scans within a pageblock
722 */
723 end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
724 end_pfn = min(end_pfn, zone_end_pfn);
717 isolated = isolate_freepages_block(cc, pfn, end_pfn, 725 isolated = isolate_freepages_block(cc, pfn, end_pfn,
718 freelist, false); 726 freelist, false);
719 nr_freepages += isolated; 727 nr_freepages += isolated;
diff --git a/mm/highmem.c b/mm/highmem.c
index d517cd16a6eb..2da13a5c50e2 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -98,7 +98,7 @@ struct page *kmap_to_page(void *vaddr)
98{ 98{
99 unsigned long addr = (unsigned long)vaddr; 99 unsigned long addr = (unsigned long)vaddr;
100 100
101 if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) { 101 if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
102 int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; 102 int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
103 return pte_page(pkmap_page_table[i]); 103 return pte_page(pkmap_page_table[i]);
104 } 104 }
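
The kmap_to_page() change above is an off-by-one fix: the pkmap area holds LAST_PKMAP pages, so valid pkmap_page_table indices are 0..LAST_PKMAP-1 and PKMAP_ADDR(LAST_PKMAP) is already the first address past the area. A small sketch with assumed constants (PKMAP_BASE and LAST_PKMAP below are illustration values, not any particular architecture's):

#include <stdio.h>

#define PAGE_SHIFT      12
#define PKMAP_BASE      0xff800000UL    /* illustration value only */
#define LAST_PKMAP      1024            /* illustration value only */
#define PKMAP_ADDR(nr)  (PKMAP_BASE + ((unsigned long)(nr) << PAGE_SHIFT))

int main(void)
{
        /* First address past the pkmap area. */
        unsigned long addr = PKMAP_ADDR(LAST_PKMAP);
        int old_hit = (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP));
        int new_hit = (addr >= PKMAP_ADDR(0) && addr <  PKMAP_ADDR(LAST_PKMAP));
        int i = (int)((addr - PKMAP_ADDR(0)) >> PAGE_SHIFT);

        /* The old test accepts this address and would index entry 1024
         * of a 1024-entry pkmap_page_table. */
        printf("old_hit=%d new_hit=%d index=%d (valid indices: 0..%d)\n",
               old_hit, new_hit, i, LAST_PKMAP - 1);
        return 0;
}
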
diff --git a/mm/memblock.c b/mm/memblock.c
index 931eef145af5..625905523c2a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -930,6 +930,30 @@ int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t si
930 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; 930 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
931} 931}
932 932
933void __init_memblock memblock_trim_memory(phys_addr_t align)
934{
935 int i;
936 phys_addr_t start, end, orig_start, orig_end;
937 struct memblock_type *mem = &memblock.memory;
938
939 for (i = 0; i < mem->cnt; i++) {
940 orig_start = mem->regions[i].base;
941 orig_end = mem->regions[i].base + mem->regions[i].size;
942 start = round_up(orig_start, align);
943 end = round_down(orig_end, align);
944
945 if (start == orig_start && end == orig_end)
946 continue;
947
948 if (start < end) {
949 mem->regions[i].base = start;
950 mem->regions[i].size = end - start;
951 } else {
952 memblock_remove_region(mem, i);
953 i--;
954 }
955 }
956}
933 957
934void __init_memblock memblock_set_current_limit(phys_addr_t limit) 958void __init_memblock memblock_set_current_limit(phys_addr_t limit)
935{ 959{
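
memblock_trim_memory(), added above, rounds each memory region's start up and its end down to the requested alignment, and removes regions that become empty; callers pass whatever granularity they need (for example PAGE_SIZE) during early boot. A user-space sketch of the trimming arithmetic, with round_up/round_down as local stand-ins and a made-up region:

#include <stdio.h>

typedef unsigned long long phys_addr_t;

/* Local stand-ins for the kernel helpers; the alignment must be a power of two. */
#define round_down(x, a)        ((x) & ~((phys_addr_t)(a) - 1))
#define round_up(x, a)          round_down((x) + (a) - 1, a)

int main(void)
{
        phys_addr_t align = 4096;                       /* e.g. PAGE_SIZE */
        phys_addr_t orig_start = 0x100000800ULL;        /* made-up, unaligned region */
        phys_addr_t orig_end   = 0x100003900ULL;

        phys_addr_t start = round_up(orig_start, align);
        phys_addr_t end   = round_down(orig_end, align);

        if (start < end)
                printf("trimmed region: [%#llx, %#llx)\n", start, end);
        else
                printf("region shorter than one aligned unit: removed\n");
        return 0;
}
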
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7acf43bf04a2..dd39ba000b31 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1055,12 +1055,24 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1055 struct mem_cgroup *memcg) 1055 struct mem_cgroup *memcg)
1056{ 1056{
1057 struct mem_cgroup_per_zone *mz; 1057 struct mem_cgroup_per_zone *mz;
1058 struct lruvec *lruvec;
1058 1059
1059 if (mem_cgroup_disabled()) 1060 if (mem_cgroup_disabled()) {
1060 return &zone->lruvec; 1061 lruvec = &zone->lruvec;
1062 goto out;
1063 }
1061 1064
1062 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); 1065 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1063 return &mz->lruvec; 1066 lruvec = &mz->lruvec;
1067out:
1068 /*
1069 * Since a node can be onlined after the mem_cgroup was created,
1070 * we have to be prepared to initialize lruvec->zone here;
1071 * and if offlined then reonlined, we need to reinitialize it.
1072 */
1073 if (unlikely(lruvec->zone != zone))
1074 lruvec->zone = zone;
1075 return lruvec;
1064} 1076}
1065 1077
1066/* 1078/*
@@ -1087,9 +1099,12 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1087 struct mem_cgroup_per_zone *mz; 1099 struct mem_cgroup_per_zone *mz;
1088 struct mem_cgroup *memcg; 1100 struct mem_cgroup *memcg;
1089 struct page_cgroup *pc; 1101 struct page_cgroup *pc;
1102 struct lruvec *lruvec;
1090 1103
1091 if (mem_cgroup_disabled()) 1104 if (mem_cgroup_disabled()) {
1092 return &zone->lruvec; 1105 lruvec = &zone->lruvec;
1106 goto out;
1107 }
1093 1108
1094 pc = lookup_page_cgroup(page); 1109 pc = lookup_page_cgroup(page);
1095 memcg = pc->mem_cgroup; 1110 memcg = pc->mem_cgroup;
@@ -1107,7 +1122,16 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1107 pc->mem_cgroup = memcg = root_mem_cgroup; 1122 pc->mem_cgroup = memcg = root_mem_cgroup;
1108 1123
1109 mz = page_cgroup_zoneinfo(memcg, page); 1124 mz = page_cgroup_zoneinfo(memcg, page);
1110 return &mz->lruvec; 1125 lruvec = &mz->lruvec;
1126out:
1127 /*
1128 * Since a node can be onlined after the mem_cgroup was created,
1129 * we have to be prepared to initialize lruvec->zone here;
1130 * and if offlined then reonlined, we need to reinitialize it.
1131 */
1132 if (unlikely(lruvec->zone != zone))
1133 lruvec->zone = zone;
1134 return lruvec;
1111} 1135}
1112 1136
1113/** 1137/**
@@ -1452,17 +1476,26 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1452static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1476static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1453{ 1477{
1454 u64 limit; 1478 u64 limit;
1455 u64 memsw;
1456 1479
1457 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1480 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1458 limit += total_swap_pages << PAGE_SHIFT;
1459 1481
1460 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1461 /* 1482 /*
1462 * If memsw is finite and limits the amount of swap space available 1483 * Do not consider swap space if we cannot swap due to swappiness
1463 * to this memcg, return that limit.
1464 */ 1484 */
1465 return min(limit, memsw); 1485 if (mem_cgroup_swappiness(memcg)) {
1486 u64 memsw;
1487
1488 limit += total_swap_pages << PAGE_SHIFT;
1489 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1490
1491 /*
1492 * If memsw is finite and limits the amount of swap space
1493 * available to this memcg, return that limit.
1494 */
1495 limit = min(limit, memsw);
1496 }
1497
1498 return limit;
1466} 1499}
1467 1500
1468void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1501void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
@@ -3688,17 +3721,17 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3688static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3721static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3689 int node, int zid, enum lru_list lru) 3722 int node, int zid, enum lru_list lru)
3690{ 3723{
3691 struct mem_cgroup_per_zone *mz; 3724 struct lruvec *lruvec;
3692 unsigned long flags, loop; 3725 unsigned long flags, loop;
3693 struct list_head *list; 3726 struct list_head *list;
3694 struct page *busy; 3727 struct page *busy;
3695 struct zone *zone; 3728 struct zone *zone;
3696 3729
3697 zone = &NODE_DATA(node)->node_zones[zid]; 3730 zone = &NODE_DATA(node)->node_zones[zid];
3698 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3731 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3699 list = &mz->lruvec.lists[lru]; 3732 list = &lruvec->lists[lru];
3700 3733
3701 loop = mz->lru_size[lru]; 3734 loop = mem_cgroup_get_lru_size(lruvec, lru);
3702 /* give some margin against EBUSY etc...*/ 3735 /* give some margin against EBUSY etc...*/
3703 loop += 256; 3736 loop += 256;
3704 busy = NULL; 3737 busy = NULL;
@@ -4736,7 +4769,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4736 4769
4737 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4770 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4738 mz = &pn->zoneinfo[zone]; 4771 mz = &pn->zoneinfo[zone];
4739 lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]); 4772 lruvec_init(&mz->lruvec);
4740 mz->usage_in_excess = 0; 4773 mz->usage_in_excess = 0;
4741 mz->on_tree = false; 4774 mz->on_tree = false;
4742 mz->memcg = memcg; 4775 mz->memcg = memcg;
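
On the mem_cgroup_get_limit() change: swap space is now counted toward the group's maximum possible usage (used by the memcg OOM killer) only when the group can actually swap, i.e. when its swappiness is non-zero. A rough user-space restatement of the new logic with made-up limits, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumed 4 KiB pages */

/* Made-up configuration for illustration. */
static unsigned long long res_limit        = 1ULL << 30;   /* memory limit: 1 GiB */
static unsigned long long memsw_limit      = 3ULL << 30;   /* mem+swap limit: 3 GiB */
static unsigned long long total_swap_pages = 1ULL << 19;   /* 2 GiB of swap */

static unsigned long long get_limit(int swappiness)
{
        unsigned long long limit = res_limit;

        if (swappiness) {
                limit += total_swap_pages << PAGE_SHIFT;
                if (memsw_limit < limit)
                        limit = memsw_limit;
        }
        return limit;
}

int main(void)
{
        printf("swappiness=60: limit = %llu MiB\n", get_limit(60) >> 20); /* 3072 */
        printf("swappiness=0:  limit = %llu MiB\n", get_limit(0)  >> 20); /* 1024 */
        return 0;
}

So a group with swappiness 0 is sized against its memory limit alone, instead of memory plus swap it will never use.
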
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6c5899b9034a..8b20278be6a6 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1476,9 +1476,17 @@ int soft_offline_page(struct page *page, int flags)
1476{ 1476{
1477 int ret; 1477 int ret;
1478 unsigned long pfn = page_to_pfn(page); 1478 unsigned long pfn = page_to_pfn(page);
1479 struct page *hpage = compound_trans_head(page);
1479 1480
1480 if (PageHuge(page)) 1481 if (PageHuge(page))
1481 return soft_offline_huge_page(page, flags); 1482 return soft_offline_huge_page(page, flags);
1483 if (PageTransHuge(hpage)) {
1484 if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
1485 pr_info("soft offline: %#lx: failed to split THP\n",
1486 pfn);
1487 return -EBUSY;
1488 }
1489 }
1482 1490
1483 ret = get_any_page(page, pfn, flags); 1491 ret = get_any_page(page, pfn, flags);
1484 if (ret < 0) 1492 if (ret < 0)
diff --git a/mm/memory.c b/mm/memory.c
index fb135ba4aba9..221fc9ffcab1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2527,9 +2527,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2527 int ret = 0; 2527 int ret = 0;
2528 int page_mkwrite = 0; 2528 int page_mkwrite = 0;
2529 struct page *dirty_page = NULL; 2529 struct page *dirty_page = NULL;
2530 unsigned long mmun_start; /* For mmu_notifiers */ 2530 unsigned long mmun_start = 0; /* For mmu_notifiers */
2531 unsigned long mmun_end; /* For mmu_notifiers */ 2531 unsigned long mmun_end = 0; /* For mmu_notifiers */
2532 bool mmun_called = false; /* For mmu_notifiers */
2533 2532
2534 old_page = vm_normal_page(vma, address, orig_pte); 2533 old_page = vm_normal_page(vma, address, orig_pte);
2535 if (!old_page) { 2534 if (!old_page) {
@@ -2708,8 +2707,7 @@ gotten:
2708 goto oom_free_new; 2707 goto oom_free_new;
2709 2708
2710 mmun_start = address & PAGE_MASK; 2709 mmun_start = address & PAGE_MASK;
2711 mmun_end = (address & PAGE_MASK) + PAGE_SIZE; 2710 mmun_end = mmun_start + PAGE_SIZE;
2712 mmun_called = true;
2713 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); 2711 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2714 2712
2715 /* 2713 /*
@@ -2778,7 +2776,7 @@ gotten:
2778 page_cache_release(new_page); 2776 page_cache_release(new_page);
2779unlock: 2777unlock:
2780 pte_unmap_unlock(page_table, ptl); 2778 pte_unmap_unlock(page_table, ptl);
2781 if (mmun_called) 2779 if (mmun_end > mmun_start)
2782 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 2780 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2783 if (old_page) { 2781 if (old_page) {
2784 /* 2782 /*
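
The do_wp_page() change above drops the separate mmun_called flag: the range is initialized to an empty [0, 0) and a non-empty range (mmun_end > mmun_start) now signals that mmu_notifier_invalidate_range_start() was called. A minimal sketch of that idiom; range_start/range_end are hypothetical stand-ins for the notifier calls:

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-ins for mmu_notifier_invalidate_range_start()/_end(). */
static void range_start(unsigned long s, unsigned long e) { printf("start [%#lx, %#lx)\n", s, e); }
static void range_end(unsigned long s, unsigned long e)   { printf("end   [%#lx, %#lx)\n", s, e); }

static void wp_fault(bool copies_page, unsigned long address)
{
        unsigned long mmun_start = 0;   /* empty range == "range_start not called" */
        unsigned long mmun_end = 0;

        if (copies_page) {
                mmun_start = address & ~0xfffUL;
                mmun_end = mmun_start + 0x1000;
                range_start(mmun_start, mmun_end);
                /* ... copy the page and replace the pte ... */
        }

        /* Common exit path: a non-empty range replaces the old mmun_called flag. */
        if (mmun_end > mmun_start)
                range_end(mmun_start, mmun_end);
}

int main(void)
{
        wp_fault(true,  0x7f0000001234UL);      /* prints start/end for the page */
        wp_fault(false, 0x7f0000001234UL);      /* no notifier calls at all */
        return 0;
}
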
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 56b758ae57d2..e4eeacae2b91 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,7 +106,6 @@ static void get_page_bootmem(unsigned long info, struct page *page,
106void __ref put_page_bootmem(struct page *page) 106void __ref put_page_bootmem(struct page *page)
107{ 107{
108 unsigned long type; 108 unsigned long type;
109 struct zone *zone;
110 109
111 type = (unsigned long) page->lru.next; 110 type = (unsigned long) page->lru.next;
112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 111 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -117,12 +116,6 @@ void __ref put_page_bootmem(struct page *page)
117 set_page_private(page, 0); 116 set_page_private(page, 0);
118 INIT_LIST_HEAD(&page->lru); 117 INIT_LIST_HEAD(&page->lru);
119 __free_pages_bootmem(page, 0); 118 __free_pages_bootmem(page, 0);
120
121 zone = page_zone(page);
122 zone_span_writelock(zone);
123 zone->present_pages++;
124 zone_span_writeunlock(zone);
125 totalram_pages++;
126 } 119 }
127 120
128} 121}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d04a8a54c294..4ea600da8940 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2037,28 +2037,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
2037 return new; 2037 return new;
2038} 2038}
2039 2039
2040/*
2041 * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
2042 * eliminate the * MPOL_F_* flags that require conditional ref and
2043 * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
2044 * after return. Use the returned value.
2045 *
2046 * Allows use of a mempolicy for, e.g., multiple allocations with a single
2047 * policy lookup, even if the policy needs/has extra ref on lookup.
2048 * shmem_readahead needs this.
2049 */
2050struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
2051 struct mempolicy *frompol)
2052{
2053 if (!mpol_needs_cond_ref(frompol))
2054 return frompol;
2055
2056 *tompol = *frompol;
2057 tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
2058 __mpol_put(frompol);
2059 return tompol;
2060}
2061
2062/* Slow path of a mempolicy comparison */ 2040/* Slow path of a mempolicy comparison */
2063bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) 2041bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2064{ 2042{
diff --git a/mm/mmap.c b/mm/mmap.c
index 2d942353d681..9a796c41e7d9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -334,8 +334,10 @@ void validate_mm(struct mm_struct *mm)
334 struct vm_area_struct *vma = mm->mmap; 334 struct vm_area_struct *vma = mm->mmap;
335 while (vma) { 335 while (vma) {
336 struct anon_vma_chain *avc; 336 struct anon_vma_chain *avc;
337 vma_lock_anon_vma(vma);
337 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 338 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
338 anon_vma_interval_tree_verify(avc); 339 anon_vma_interval_tree_verify(avc);
340 vma_unlock_anon_vma(vma);
339 vma = vma->vm_next; 341 vma = vma->vm_next;
340 i++; 342 i++;
341 } 343 }
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 479a1e751a73..8a5ac8c686b0 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -196,28 +196,28 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
196 BUG_ON(atomic_read(&mm->mm_users) <= 0); 196 BUG_ON(atomic_read(&mm->mm_users) <= 0);
197 197
198 /* 198 /*
199 * Verify that mmu_notifier_init() already run and the global srcu is 199 * Verify that mmu_notifier_init() already run and the global srcu is
200 * initialized. 200 * initialized.
201 */ 201 */
202 BUG_ON(!srcu.per_cpu_ref); 202 BUG_ON(!srcu.per_cpu_ref);
203 203
204 ret = -ENOMEM;
205 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
206 if (unlikely(!mmu_notifier_mm))
207 goto out;
208
204 if (take_mmap_sem) 209 if (take_mmap_sem)
205 down_write(&mm->mmap_sem); 210 down_write(&mm->mmap_sem);
206 ret = mm_take_all_locks(mm); 211 ret = mm_take_all_locks(mm);
207 if (unlikely(ret)) 212 if (unlikely(ret))
208 goto out; 213 goto out_clean;
209 214
210 if (!mm_has_notifiers(mm)) { 215 if (!mm_has_notifiers(mm)) {
211 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm),
212 GFP_KERNEL);
213 if (unlikely(!mmu_notifier_mm)) {
214 ret = -ENOMEM;
215 goto out_of_mem;
216 }
217 INIT_HLIST_HEAD(&mmu_notifier_mm->list); 216 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
218 spin_lock_init(&mmu_notifier_mm->lock); 217 spin_lock_init(&mmu_notifier_mm->lock);
219 218
220 mm->mmu_notifier_mm = mmu_notifier_mm; 219 mm->mmu_notifier_mm = mmu_notifier_mm;
220 mmu_notifier_mm = NULL;
221 } 221 }
222 atomic_inc(&mm->mm_count); 222 atomic_inc(&mm->mm_count);
223 223
@@ -233,12 +233,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
233 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); 233 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
234 spin_unlock(&mm->mmu_notifier_mm->lock); 234 spin_unlock(&mm->mmu_notifier_mm->lock);
235 235
236out_of_mem:
237 mm_drop_all_locks(mm); 236 mm_drop_all_locks(mm);
238out: 237out_clean:
239 if (take_mmap_sem) 238 if (take_mmap_sem)
240 up_write(&mm->mmap_sem); 239 up_write(&mm->mmap_sem);
241 240 kfree(mmu_notifier_mm);
241out:
242 BUG_ON(atomic_read(&mm->mm_users) <= 0); 242 BUG_ON(atomic_read(&mm->mm_users) <= 0);
243 return ret; 243 return ret;
244} 244}
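
The do_mmu_notifier_register() rework allocates mmu_notifier_mm with GFP_KERNEL before mmap_sem and mm_take_all_locks() are taken, installs it only if the mm does not already have one, and frees a leftover allocation on the common exit path. A small pthread sketch of the same allocate-outside-the-lock shape; the names below are hypothetical, not kernel APIs:

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

struct notifier_state { int dummy; };

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
static struct notifier_state *installed;       /* protected by big_lock */

static int register_notifier(void)
{
        /* Allocate before taking any lock, as the reworked code now does. */
        struct notifier_state *new_state = malloc(sizeof(*new_state));

        if (!new_state)
                return -1;                      /* -ENOMEM with nothing held */

        pthread_mutex_lock(&big_lock);
        if (!installed) {
                installed = new_state;
                new_state = NULL;               /* ownership transferred */
        }
        pthread_mutex_unlock(&big_lock);

        free(new_state);                        /* no-op when the allocation was installed */
        return 0;
}

int main(void)
{
        register_notifier();
        register_notifier();    /* second caller frees its unused allocation */
        printf("installed at %p\n", (void *)installed);
        return 0;
}
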
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 3cef80f6ac79..4596d81b89b1 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,7 +87,7 @@ int memmap_valid_within(unsigned long pfn,
87} 87}
88#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 88#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
89 89
90void lruvec_init(struct lruvec *lruvec, struct zone *zone) 90void lruvec_init(struct lruvec *lruvec)
91{ 91{
92 enum lru_list lru; 92 enum lru_list lru;
93 93
@@ -95,8 +95,4 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone)
95 95
96 for_each_lru(lru) 96 for_each_lru(lru)
97 INIT_LIST_HEAD(&lruvec->lists[lru]); 97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98
99#ifdef CONFIG_MEMCG
100 lruvec->zone = zone;
101#endif
102} 98}
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 714d5d650470..bd82f6b31411 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -116,8 +116,6 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
116 return 0; 116 return 0;
117 117
118 __free_pages_memory(start_pfn, end_pfn); 118 __free_pages_memory(start_pfn, end_pfn);
119 fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT),
120 start_pfn, end_pfn);
121 119
122 return end_pfn - start_pfn; 120 return end_pfn - start_pfn;
123} 121}
@@ -128,7 +126,6 @@ unsigned long __init free_low_memory_core_early(int nodeid)
128 phys_addr_t start, end, size; 126 phys_addr_t start, end, size;
129 u64 i; 127 u64 i;
130 128
131 reset_zone_present_pages();
132 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) 129 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
133 count += __free_memory_core(start, end); 130 count += __free_memory_core(start, end);
134 131
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bb90971182bd..a8f2c87792c3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1405,7 +1405,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1405 1405
1406 mt = get_pageblock_migratetype(page); 1406 mt = get_pageblock_migratetype(page);
1407 if (unlikely(mt != MIGRATE_ISOLATE)) 1407 if (unlikely(mt != MIGRATE_ISOLATE))
1408 __mod_zone_freepage_state(zone, -(1UL << order), mt); 1408 __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
1409 1409
1410 if (alloc_order != order) 1410 if (alloc_order != order)
1411 expand(zone, page, alloc_order, order, 1411 expand(zone, page, alloc_order, order,
@@ -1422,7 +1422,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1422 } 1422 }
1423 } 1423 }
1424 1424
1425 return 1UL << order; 1425 return 1UL << alloc_order;
1426} 1426}
1427 1427
1428/* 1428/*
@@ -1809,10 +1809,10 @@ static void __paginginit init_zone_allows_reclaim(int nid)
1809 int i; 1809 int i;
1810 1810
1811 for_each_online_node(i) 1811 for_each_online_node(i)
1812 if (node_distance(nid, i) <= RECLAIM_DISTANCE) { 1812 if (node_distance(nid, i) <= RECLAIM_DISTANCE)
1813 node_set(i, NODE_DATA(nid)->reclaim_nodes); 1813 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1814 else
1814 zone_reclaim_mode = 1; 1815 zone_reclaim_mode = 1;
1815 }
1816} 1816}
1817 1817
1818#else /* CONFIG_NUMA */ 1818#else /* CONFIG_NUMA */
@@ -2378,6 +2378,15 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2378 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); 2378 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2379} 2379}
2380 2380
2381/* Returns true if the allocation is likely for THP */
2382static bool is_thp_alloc(gfp_t gfp_mask, unsigned int order)
2383{
2384 if (order == pageblock_order &&
2385 (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
2386 return true;
2387 return false;
2388}
2389
2381static inline struct page * 2390static inline struct page *
2382__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2391__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2383 struct zonelist *zonelist, enum zone_type high_zoneidx, 2392 struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -2416,7 +2425,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2416 goto nopage; 2425 goto nopage;
2417 2426
2418restart: 2427restart:
2419 wake_all_kswapd(order, zonelist, high_zoneidx, 2428 /* The decision whether to wake kswapd for THP is made later */
2429 if (!is_thp_alloc(gfp_mask, order))
2430 wake_all_kswapd(order, zonelist, high_zoneidx,
2420 zone_idx(preferred_zone)); 2431 zone_idx(preferred_zone));
2421 2432
2422 /* 2433 /*
@@ -2487,15 +2498,21 @@ rebalance:
2487 goto got_pg; 2498 goto got_pg;
2488 sync_migration = true; 2499 sync_migration = true;
2489 2500
2490 /* 2501 if (is_thp_alloc(gfp_mask, order)) {
2491 * If compaction is deferred for high-order allocations, it is because 2502 /*
2492 * sync compaction recently failed. In this is the case and the caller 2503 * If compaction is deferred for high-order allocations, it is
2493 * requested a movable allocation that does not heavily disrupt the 2504 * because sync compaction recently failed. If this is the case
2494 * system then fail the allocation instead of entering direct reclaim. 2505 * and the caller requested a movable allocation that does not
2495 */ 2506 * heavily disrupt the system then fail the allocation instead
2496 if ((deferred_compaction || contended_compaction) && 2507 * of entering direct reclaim.
2497 (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE) 2508 */
2498 goto nopage; 2509 if (deferred_compaction || contended_compaction)
2510 goto nopage;
2511
2512 /* If process is willing to reclaim/compact then wake kswapd */
2513 wake_all_kswapd(order, zonelist, high_zoneidx,
2514 zone_idx(preferred_zone));
2515 }
2499 2516
2500 /* Try direct reclaim and then allocating */ 2517 /* Try direct reclaim and then allocating */
2501 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2518 page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -4505,7 +4522,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4505 zone->zone_pgdat = pgdat; 4522 zone->zone_pgdat = pgdat;
4506 4523
4507 zone_pcp_init(zone); 4524 zone_pcp_init(zone);
4508 lruvec_init(&zone->lruvec, zone); 4525 lruvec_init(&zone->lruvec);
4509 if (!size) 4526 if (!size)
4510 continue; 4527 continue;
4511 4528
@@ -5825,7 +5842,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5825 ret = start_isolate_page_range(pfn_max_align_down(start), 5842 ret = start_isolate_page_range(pfn_max_align_down(start),
5826 pfn_max_align_up(end), migratetype); 5843 pfn_max_align_up(end), migratetype);
5827 if (ret) 5844 if (ret)
5828 goto done; 5845 return ret;
5829 5846
5830 ret = __alloc_contig_migrate_range(&cc, start, end); 5847 ret = __alloc_contig_migrate_range(&cc, start, end);
5831 if (ret) 5848 if (ret)
@@ -6098,37 +6115,3 @@ void dump_page(struct page *page)
6098 dump_page_flags(page->flags); 6115 dump_page_flags(page->flags);
6099 mem_cgroup_print_bad_page(page); 6116 mem_cgroup_print_bad_page(page);
6100} 6117}
6101
6102/* reset zone->present_pages */
6103void reset_zone_present_pages(void)
6104{
6105 struct zone *z;
6106 int i, nid;
6107
6108 for_each_node_state(nid, N_HIGH_MEMORY) {
6109 for (i = 0; i < MAX_NR_ZONES; i++) {
6110 z = NODE_DATA(nid)->node_zones + i;
6111 z->present_pages = 0;
6112 }
6113 }
6114}
6115
6116/* calculate zone's present pages in buddy system */
6117void fixup_zone_present_pages(int nid, unsigned long start_pfn,
6118 unsigned long end_pfn)
6119{
6120 struct zone *z;
6121 unsigned long zone_start_pfn, zone_end_pfn;
6122 int i;
6123
6124 for (i = 0; i < MAX_NR_ZONES; i++) {
6125 z = NODE_DATA(nid)->node_zones + i;
6126 zone_start_pfn = z->zone_start_pfn;
6127 zone_end_pfn = zone_start_pfn + z->spanned_pages;
6128
6129 /* if the two regions intersect */
6130 if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn))
6131 z->present_pages += min(end_pfn, zone_end_pfn) -
6132 max(start_pfn, zone_start_pfn);
6133 }
6134}
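
On the capture_free_page() fix near the top of this file's diff: when a buddy of order 'order' is captured to satisfy a smaller request of 'alloc_order', expand() hands the unused tail back to the free lists, so both the NR_FREE_PAGES adjustment and the return value must use alloc_order. A toy calculation with assumed orders:

#include <stdio.h>

int main(void)
{
        unsigned int order = 5;         /* order of the captured buddy page */
        unsigned int alloc_order = 3;   /* order the caller actually asked for */

        unsigned long captured = 1UL << order;
        unsigned long kept     = 1UL << alloc_order;
        unsigned long returned = captured - kept;   /* handed back by expand() */

        /* NR_FREE_PAGES must drop by 'kept' (8), not 'captured' (32), and the
         * function reports 'kept' pages to its caller. */
        printf("captured=%lu kept=%lu returned to free lists=%lu\n",
               captured, kept, returned);
        return 0;
}
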
diff --git a/mm/rmap.c b/mm/rmap.c
index 7df7984d476c..2ee1ef0f317b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
56#include <linux/mmu_notifier.h> 56#include <linux/mmu_notifier.h>
57#include <linux/migrate.h> 57#include <linux/migrate.h>
58#include <linux/hugetlb.h> 58#include <linux/hugetlb.h>
59#include <linux/backing-dev.h>
59 60
60#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
61 62
@@ -926,11 +927,8 @@ int page_mkclean(struct page *page)
926 927
927 if (page_mapped(page)) { 928 if (page_mapped(page)) {
928 struct address_space *mapping = page_mapping(page); 929 struct address_space *mapping = page_mapping(page);
929 if (mapping) { 930 if (mapping)
930 ret = page_mkclean_file(mapping, page); 931 ret = page_mkclean_file(mapping, page);
931 if (page_test_and_clear_dirty(page_to_pfn(page), 1))
932 ret = 1;
933 }
934 } 932 }
935 933
936 return ret; 934 return ret;
@@ -1116,6 +1114,7 @@ void page_add_file_rmap(struct page *page)
1116 */ 1114 */
1117void page_remove_rmap(struct page *page) 1115void page_remove_rmap(struct page *page)
1118{ 1116{
1117 struct address_space *mapping = page_mapping(page);
1119 bool anon = PageAnon(page); 1118 bool anon = PageAnon(page);
1120 bool locked; 1119 bool locked;
1121 unsigned long flags; 1120 unsigned long flags;
@@ -1138,8 +1137,19 @@ void page_remove_rmap(struct page *page)
1138 * this if the page is anon, so about to be freed; but perhaps 1137 * this if the page is anon, so about to be freed; but perhaps
1139 * not if it's in swapcache - there might be another pte slot 1138 * not if it's in swapcache - there might be another pte slot
1140 * containing the swap entry, but page not yet written to swap. 1139 * containing the swap entry, but page not yet written to swap.
1140 *
1141 * And we can skip it on file pages, so long as the filesystem
1142 * participates in dirty tracking; but need to catch shm and tmpfs
1143 * and ramfs pages which have been modified since creation by read
1144 * fault.
1145 *
1146 * Note that mapping must be decided above, before decrementing
1147 * mapcount (which luckily provides a barrier): once page is unmapped,
1148 * it could be truncated and page->mapping reset to NULL at any moment.
1149 * Note also that we are relying on page_mapping(page) to set mapping
1150 * to &swapper_space when PageSwapCache(page).
1141 */ 1151 */
1142 if ((!anon || PageSwapCache(page)) && 1152 if (mapping && !mapping_cap_account_dirty(mapping) &&
1143 page_test_and_clear_dirty(page_to_pfn(page), 1)) 1153 page_test_and_clear_dirty(page_to_pfn(page), 1))
1144 set_page_dirty(page); 1154 set_page_dirty(page);
1145 /* 1155 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index 67afba5117f2..50c5b8f3a359 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -643,7 +643,7 @@ static void shmem_evict_inode(struct inode *inode)
643 kfree(info->symlink); 643 kfree(info->symlink);
644 644
645 simple_xattrs_free(&info->xattrs); 645 simple_xattrs_free(&info->xattrs);
646 BUG_ON(inode->i_blocks); 646 WARN_ON(inode->i_blocks);
647 shmem_free_inode(inode->i_sb); 647 shmem_free_inode(inode->i_sb);
648 clear_inode(inode); 648 clear_inode(inode);
649} 649}
@@ -910,25 +910,29 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
910static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, 910static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
911 struct shmem_inode_info *info, pgoff_t index) 911 struct shmem_inode_info *info, pgoff_t index)
912{ 912{
913 struct mempolicy mpol, *spol;
914 struct vm_area_struct pvma; 913 struct vm_area_struct pvma;
915 914 struct page *page;
916 spol = mpol_cond_copy(&mpol,
917 mpol_shared_policy_lookup(&info->policy, index));
918 915
919 /* Create a pseudo vma that just contains the policy */ 916 /* Create a pseudo vma that just contains the policy */
920 pvma.vm_start = 0; 917 pvma.vm_start = 0;
921 /* Bias interleave by inode number to distribute better across nodes */ 918 /* Bias interleave by inode number to distribute better across nodes */
922 pvma.vm_pgoff = index + info->vfs_inode.i_ino; 919 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
923 pvma.vm_ops = NULL; 920 pvma.vm_ops = NULL;
924 pvma.vm_policy = spol; 921 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
925 return swapin_readahead(swap, gfp, &pvma, 0); 922
923 page = swapin_readahead(swap, gfp, &pvma, 0);
924
925 /* Drop reference taken by mpol_shared_policy_lookup() */
926 mpol_cond_put(pvma.vm_policy);
927
928 return page;
926} 929}
927 930
928static struct page *shmem_alloc_page(gfp_t gfp, 931static struct page *shmem_alloc_page(gfp_t gfp,
929 struct shmem_inode_info *info, pgoff_t index) 932 struct shmem_inode_info *info, pgoff_t index)
930{ 933{
931 struct vm_area_struct pvma; 934 struct vm_area_struct pvma;
935 struct page *page;
932 936
933 /* Create a pseudo vma that just contains the policy */ 937 /* Create a pseudo vma that just contains the policy */
934 pvma.vm_start = 0; 938 pvma.vm_start = 0;
@@ -937,10 +941,12 @@ static struct page *shmem_alloc_page(gfp_t gfp,
937 pvma.vm_ops = NULL; 941 pvma.vm_ops = NULL;
938 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 942 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
939 943
940 /* 944 page = alloc_page_vma(gfp, &pvma, 0);
941 * alloc_page_vma() will drop the shared policy reference 945
942 */ 946 /* Drop reference taken by mpol_shared_policy_lookup() */
943 return alloc_page_vma(gfp, &pvma, 0); 947 mpol_cond_put(pvma.vm_policy);
948
949 return page;
944} 950}
945#else /* !CONFIG_NUMA */ 951#else /* !CONFIG_NUMA */
946#ifdef CONFIG_TMPFS 952#ifdef CONFIG_TMPFS
@@ -1145,8 +1151,20 @@ repeat:
1145 if (!error) { 1151 if (!error) {
1146 error = shmem_add_to_page_cache(page, mapping, index, 1152 error = shmem_add_to_page_cache(page, mapping, index,
1147 gfp, swp_to_radix_entry(swap)); 1153 gfp, swp_to_radix_entry(swap));
1148 /* We already confirmed swap, and make no allocation */ 1154 /*
1149 VM_BUG_ON(error); 1155 * We already confirmed swap under page lock, and make
1156 * no memory allocation here, so usually no possibility
1157 * of error; but free_swap_and_cache() only trylocks a
1158 * page, so it is just possible that the entry has been
1159 * truncated or holepunched since swap was confirmed.
1160 * shmem_undo_range() will have done some of the
1161 * unaccounting, now delete_from_swap_cache() will do
1162 * the rest (including mem_cgroup_uncharge_swapcache).
1163 * Reset swap.val? No, leave it so "failed" goes back to
1164 * "repeat": reading a hole and writing should succeed.
1165 */
1166 if (error)
1167 delete_from_swap_cache(page);
1150 } 1168 }
1151 if (error) 1169 if (error)
1152 goto failed; 1170 goto failed;
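
shmem_swapin() and shmem_alloc_page() now keep the policy returned by mpol_shared_policy_lookup() and drop it explicitly with mpol_cond_put() after the allocation, instead of relying on the removed __mpol_cond_copy() helper. A small sketch of the conditional-reference pattern; the policy type and helpers below are hypothetical stand-ins, not the kernel's mempolicy API:

#include <stdio.h>

/* Hypothetical policy object with a conditional reference count. */
struct policy {
        int shared;     /* only shared policies are reference counted */
        int refcount;
};

static struct policy *policy_lookup(struct policy *p)
{
        if (p->shared)
                p->refcount++;  /* lookup takes a reference on shared policies */
        return p;
}

static void policy_cond_put(struct policy *p)
{
        if (p && p->shared)
                p->refcount--;  /* drop only what the lookup took */
}

int main(void)
{
        struct policy shared_pol = { 1, 1 };
        struct policy *p = policy_lookup(&shared_pol);

        /* ... use p for one allocation, e.g. as pvma.vm_policy ... */

        policy_cond_put(p);     /* balance the lookup on every return path */
        printf("refcount back to %d\n", shared_pol.refcount);
        return 0;
}
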
diff --git a/mm/sparse.c b/mm/sparse.c
index fac95f2888f2..a83de2f72b30 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -617,7 +617,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
617{ 617{
618 return; /* XXX: Not implemented yet */ 618 return; /* XXX: Not implemented yet */
619} 619}
620static void free_map_bootmem(struct page *page, unsigned long nr_pages) 620static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
621{ 621{
622} 622}
623#else 623#else
@@ -658,10 +658,11 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
658 get_order(sizeof(struct page) * nr_pages)); 658 get_order(sizeof(struct page) * nr_pages));
659} 659}
660 660
661static void free_map_bootmem(struct page *page, unsigned long nr_pages) 661static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
662{ 662{
663 unsigned long maps_section_nr, removing_section_nr, i; 663 unsigned long maps_section_nr, removing_section_nr, i;
664 unsigned long magic; 664 unsigned long magic;
665 struct page *page = virt_to_page(memmap);
665 666
666 for (i = 0; i < nr_pages; i++, page++) { 667 for (i = 0; i < nr_pages; i++, page++) {
667 magic = (unsigned long) page->lru.next; 668 magic = (unsigned long) page->lru.next;
@@ -710,13 +711,10 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
710 */ 711 */
711 712
712 if (memmap) { 713 if (memmap) {
713 struct page *memmap_page;
714 memmap_page = virt_to_page(memmap);
715
716 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) 714 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
717 >> PAGE_SHIFT; 715 >> PAGE_SHIFT;
718 716
719 free_map_bootmem(memmap_page, nr_pages); 717 free_map_bootmem(memmap, nr_pages);
720 } 718 }
721} 719}
722 720
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 71cd288b2001..f91a25547ffe 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1494,9 +1494,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1494 BUG_ON(!current->mm); 1494 BUG_ON(!current->mm);
1495 1495
1496 pathname = getname(specialfile); 1496 pathname = getname(specialfile);
1497 err = PTR_ERR(pathname);
1498 if (IS_ERR(pathname)) 1497 if (IS_ERR(pathname))
1499 goto out; 1498 return PTR_ERR(pathname);
1500 1499
1501 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0); 1500 victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
1502 err = PTR_ERR(victim); 1501 err = PTR_ERR(victim);
@@ -1608,6 +1607,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1608out_dput: 1607out_dput:
1609 filp_close(victim, NULL); 1608 filp_close(victim, NULL);
1610out: 1609out:
1610 putname(pathname);
1611 return err; 1611 return err;
1612} 1612}
1613 1613
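
The swapoff change plugs a leak of the name returned by getname(): the lookup-failure case now returns PTR_ERR() directly before anything is acquired, and a single putname() on the common exit label covers every later path. A compact sketch of that acquire-at-top, release-at-one-exit shape, with get_name/put_name as hypothetical stand-ins for getname()/putname():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

static char *get_name(const char *user) { return strdup(user); }   /* stand-in for getname() */
static void  put_name(char *name)       { free(name); }            /* stand-in for putname() */

static int do_swapoff_sketch(const char *specialfile)
{
        char *pathname = get_name(specialfile);
        int err;

        if (!pathname)
                return -ENOMEM;         /* nothing acquired yet: plain return, as in the fix */

        err = access(pathname, R_OK | W_OK);    /* stands in for opening the swap file */
        if (err)
                goto out;

        /* ... find the swap_info entry and tear the swap area down ... */
        err = 0;
out:
        put_name(pathname);             /* single release covers every later exit path */
        return err;
}

int main(void)
{
        printf("do_swapoff_sketch -> %d\n", do_swapoff_sketch("/no/such/swapfile"));
        return 0;
}
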
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2624edcfb420..124bbfe5cc52 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1760,28 +1760,6 @@ static bool in_reclaim_compaction(struct scan_control *sc)
1760 return false; 1760 return false;
1761} 1761}
1762 1762
1763#ifdef CONFIG_COMPACTION
1764/*
1765 * If compaction is deferred for sc->order then scale the number of pages
1766 * reclaimed based on the number of consecutive allocation failures
1767 */
1768static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
1769 struct lruvec *lruvec, struct scan_control *sc)
1770{
1771 struct zone *zone = lruvec_zone(lruvec);
1772
1773 if (zone->compact_order_failed <= sc->order)
1774 pages_for_compaction <<= zone->compact_defer_shift;
1775 return pages_for_compaction;
1776}
1777#else
1778static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
1779 struct lruvec *lruvec, struct scan_control *sc)
1780{
1781 return pages_for_compaction;
1782}
1783#endif
1784
1785/* 1763/*
1786 * Reclaim/compaction is used for high-order allocation requests. It reclaims 1764 * Reclaim/compaction is used for high-order allocation requests. It reclaims
1787 * order-0 pages before compacting the zone. should_continue_reclaim() returns 1765 * order-0 pages before compacting the zone. should_continue_reclaim() returns
@@ -1829,9 +1807,6 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
1829 * inactive lists are large enough, continue reclaiming 1807 * inactive lists are large enough, continue reclaiming
1830 */ 1808 */
1831 pages_for_compaction = (2UL << sc->order); 1809 pages_for_compaction = (2UL << sc->order);
1832
1833 pages_for_compaction = scale_for_compaction(pages_for_compaction,
1834 lruvec, sc);
1835 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE); 1810 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
1836 if (nr_swap_pages > 0) 1811 if (nr_swap_pages > 0)
1837 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON); 1812 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
@@ -2232,9 +2207,12 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2232 * Throttle direct reclaimers if backing storage is backed by the network 2207 * Throttle direct reclaimers if backing storage is backed by the network
2233 * and the PFMEMALLOC reserve for the preferred node is getting dangerously 2208 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2234 * depleted. kswapd will continue to make progress and wake the processes 2209 * depleted. kswapd will continue to make progress and wake the processes
2235 * when the low watermark is reached 2210 * when the low watermark is reached.
2211 *
2212 * Returns true if a fatal signal was delivered during throttling. If this
2213 * happens, the page allocator should not consider triggering the OOM killer.
2236 */ 2214 */
2237static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, 2215static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2238 nodemask_t *nodemask) 2216 nodemask_t *nodemask)
2239{ 2217{
2240 struct zone *zone; 2218 struct zone *zone;
@@ -2249,13 +2227,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2249 * processes to block on log_wait_commit(). 2227 * processes to block on log_wait_commit().
2250 */ 2228 */
2251 if (current->flags & PF_KTHREAD) 2229 if (current->flags & PF_KTHREAD)
2252 return; 2230 goto out;
2231
2232 /*
2233 * If a fatal signal is pending, this process should not throttle.
2234 * It should return quickly so it can exit and free its memory
2235 */
2236 if (fatal_signal_pending(current))
2237 goto out;
2253 2238
2254 /* Check if the pfmemalloc reserves are ok */ 2239 /* Check if the pfmemalloc reserves are ok */
2255 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); 2240 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2256 pgdat = zone->zone_pgdat; 2241 pgdat = zone->zone_pgdat;
2257 if (pfmemalloc_watermark_ok(pgdat)) 2242 if (pfmemalloc_watermark_ok(pgdat))
2258 return; 2243 goto out;
2259 2244
2260 /* Account for the throttling */ 2245 /* Account for the throttling */
2261 count_vm_event(PGSCAN_DIRECT_THROTTLE); 2246 count_vm_event(PGSCAN_DIRECT_THROTTLE);
@@ -2271,12 +2256,20 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2271 if (!(gfp_mask & __GFP_FS)) { 2256 if (!(gfp_mask & __GFP_FS)) {
2272 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, 2257 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2273 pfmemalloc_watermark_ok(pgdat), HZ); 2258 pfmemalloc_watermark_ok(pgdat), HZ);
2274 return; 2259
2260 goto check_pending;
2275 } 2261 }
2276 2262
2277 /* Throttle until kswapd wakes the process */ 2263 /* Throttle until kswapd wakes the process */
2278 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, 2264 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2279 pfmemalloc_watermark_ok(pgdat)); 2265 pfmemalloc_watermark_ok(pgdat));
2266
2267check_pending:
2268 if (fatal_signal_pending(current))
2269 return true;
2270
2271out:
2272 return false;
2280} 2273}
2281 2274
2282unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2275unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
@@ -2298,13 +2291,12 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2298 .gfp_mask = sc.gfp_mask, 2291 .gfp_mask = sc.gfp_mask,
2299 }; 2292 };
2300 2293
2301 throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
2302
2303 /* 2294 /*
2304 * Do not enter reclaim if fatal signal is pending. 1 is returned so 2295 * Do not enter reclaim if fatal signal was delivered while throttled.
2305 * that the page allocator does not consider triggering OOM 2296 * 1 is returned so that the page allocator does not OOM kill at this
2297 * point.
2306 */ 2298 */
2307 if (fatal_signal_pending(current)) 2299 if (throttle_direct_reclaim(gfp_mask, zonelist, nodemask))
2308 return 1; 2300 return 1;
2309 2301
2310 trace_mm_vmscan_direct_reclaim_begin(order, 2302 trace_mm_vmscan_direct_reclaim_begin(order,
@@ -2422,6 +2414,19 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
2422 } while (memcg); 2414 } while (memcg);
2423} 2415}
2424 2416
2417static bool zone_balanced(struct zone *zone, int order,
2418 unsigned long balance_gap, int classzone_idx)
2419{
2420 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
2421 balance_gap, classzone_idx, 0))
2422 return false;
2423
2424 if (COMPACTION_BUILD && order && !compaction_suitable(zone, order))
2425 return false;
2426
2427 return true;
2428}
2429
2425/* 2430/*
2426 * pgdat_balanced is used when checking if a node is balanced for high-order 2431 * pgdat_balanced is used when checking if a node is balanced for high-order
2427 * allocations. Only zones that meet watermarks and are in a zone allowed 2432 * allocations. Only zones that meet watermarks and are in a zone allowed
@@ -2500,8 +2505,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2500 continue; 2505 continue;
2501 } 2506 }
2502 2507
2503 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 2508 if (!zone_balanced(zone, order, 0, i))
2504 i, 0))
2505 all_zones_ok = false; 2509 all_zones_ok = false;
2506 else 2510 else
2507 balanced += zone->present_pages; 2511 balanced += zone->present_pages;
@@ -2610,8 +2614,7 @@ loop_again:
2610 break; 2614 break;
2611 } 2615 }
2612 2616
2613 if (!zone_watermark_ok_safe(zone, order, 2617 if (!zone_balanced(zone, order, 0, 0)) {
2614 high_wmark_pages(zone), 0, 0)) {
2615 end_zone = i; 2618 end_zone = i;
2616 break; 2619 break;
2617 } else { 2620 } else {
@@ -2687,9 +2690,8 @@ loop_again:
2687 testorder = 0; 2690 testorder = 0;
2688 2691
2689 if ((buffer_heads_over_limit && is_highmem_idx(i)) || 2692 if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
2690 !zone_watermark_ok_safe(zone, testorder, 2693 !zone_balanced(zone, testorder,
2691 high_wmark_pages(zone) + balance_gap, 2694 balance_gap, end_zone)) {
2692 end_zone, 0)) {
2693 shrink_zone(zone, &sc); 2695 shrink_zone(zone, &sc);
2694 2696
2695 reclaim_state->reclaimed_slab = 0; 2697 reclaim_state->reclaimed_slab = 0;
@@ -2716,8 +2718,7 @@ loop_again:
2716 continue; 2718 continue;
2717 } 2719 }
2718 2720
2719 if (!zone_watermark_ok_safe(zone, testorder, 2721 if (!zone_balanced(zone, testorder, 0, end_zone)) {
2720 high_wmark_pages(zone), end_zone, 0)) {
2721 all_zones_ok = 0; 2722 all_zones_ok = 0;
2722 /* 2723 /*
2723 * We are still under min water mark. This 2724 * We are still under min water mark. This
@@ -2822,22 +2823,6 @@ out:
2822 if (!populated_zone(zone)) 2823 if (!populated_zone(zone))
2823 continue; 2824 continue;
2824 2825
2825 if (zone->all_unreclaimable &&
2826 sc.priority != DEF_PRIORITY)
2827 continue;
2828
2829 /* Would compaction fail due to lack of free memory? */
2830 if (COMPACTION_BUILD &&
2831 compaction_suitable(zone, order) == COMPACT_SKIPPED)
2832 goto loop_again;
2833
2834 /* Confirm the zone is balanced for order-0 */
2835 if (!zone_watermark_ok(zone, 0,
2836 high_wmark_pages(zone), 0, 0)) {
2837 order = sc.order = 0;
2838 goto loop_again;
2839 }
2840
2841 /* Check if the memory needs to be defragmented. */ 2826 /* Check if the memory needs to be defragmented. */
2842 if (zone_watermark_ok(zone, order, 2827 if (zone_watermark_ok(zone, order,
2843 low_wmark_pages(zone), *classzone_idx, 0)) 2828 low_wmark_pages(zone), *classzone_idx, 0))
@@ -3017,6 +3002,8 @@ static int kswapd(void *p)
3017 &balanced_classzone_idx); 3002 &balanced_classzone_idx);
3018 } 3003 }
3019 } 3004 }
3005
3006 current->reclaim_state = NULL;
3020 return 0; 3007 return 0;
3021} 3008}
3022 3009
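
The new zone_balanced() helper folds the high-watermark test and, for order > 0, a compaction_suitable() check into a single predicate, replacing three open-coded zone_watermark_ok_safe() calls in the kswapd balancing code. A user-space sketch of combining the two conditions; both predicates below are crude stand-ins with invented thresholds, not the kernel implementations:

#include <stdio.h>
#include <stdbool.h>

static bool watermark_ok(unsigned long free_pages, unsigned long high_wmark,
                         unsigned long balance_gap)
{
        return free_pages >= high_wmark + balance_gap;
}

static bool compaction_suitable(unsigned long free_pages, int order)
{
        /* crude illustration: compaction wants some slack beyond the request itself */
        return free_pages >= (2UL << order);
}

/* One predicate replacing the three open-coded checks in kswapd's balance loop. */
static bool zone_balanced(unsigned long free_pages, int order,
                          unsigned long high_wmark, unsigned long balance_gap)
{
        if (!watermark_ok(free_pages, high_wmark, balance_gap))
                return false;
        if (order && !compaction_suitable(free_pages, order))
                return false;
        return true;
}

int main(void)
{
        printf("order-0, plenty free:    %d\n", zone_balanced(4096, 0, 1024, 0));
        printf("order-10, little slack:  %d\n", zone_balanced(1100, 10, 1024, 0));
        return 0;
}
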