author	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-11 21:05:37 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-11 21:05:37 -0500
commit	608ff1a210ab0e8b969399039bf8e18693605910 (patch)
tree	faea7bb1764461c73d0953089bd5439d91733a03 /mm
parent	414a6750e59b0b687034764c464e9ddecac0f7a6 (diff)
parent	74d42d8fe146e870c52bde3b1c692f86cc8ff844 (diff)
Merge branch 'akpm' (Andrew's patchbomb)
Merge misc updates from Andrew Morton:
 "About half of most of MM.  Going very early this time due to
  uncertainty over the coreautounifiednumasched things.  I'll send the
  other half of most of MM tomorrow.  The rest of MM awaits a slab
  merge from Pekka."

* emailed patches from Andrew Morton: (71 commits)
  memory_hotplug: ensure every online node has NORMAL memory
  memory_hotplug: handle empty zone when online_movable/online_kernel
  mm, memory-hotplug: dynamic configure movable memory and portion memory
  drivers/base/node.c: cleanup node_state_attr[]
  bootmem: fix wrong call parameter for free_bootmem()
  avr32, kconfig: remove HAVE_ARCH_BOOTMEM
  mm: cma: remove watermark hacks
  mm: cma: skip watermarks check for already isolated blocks in split_free_page()
  mm, oom: fix race when specifying a thread as the oom origin
  mm, oom: change type of oom_score_adj to short
  mm: cleanup register_node()
  mm, mempolicy: remove duplicate code
  mm/vmscan.c: try_to_freeze() returns boolean
  mm: introduce putback_movable_pages()
  virtio_balloon: introduce migration primitives to balloon pages
  mm: introduce compaction and migration for ballooned pages
  mm: introduce a common interface for balloon pages mobility
  mm: redefine address_space.assoc_mapping
  mm: adjust address_space_operations.migratepage() return code
  arch/sparc/kernel/sys_sparc_64.c: s/COLOUR/COLOR/
  ...
Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig                 15
-rw-r--r--   mm/Makefile                 3
-rw-r--r--   mm/balloon_compaction.c   302
-rw-r--r--   mm/bootmem.c               20
-rw-r--r--   mm/compaction.c            27
-rw-r--r--   mm/dmapool.c               24
-rw-r--r--   mm/highmem.c               29
-rw-r--r--   mm/huge_memory.c          174
-rw-r--r--   mm/hugetlb.c                4
-rw-r--r--   mm/internal.h               5
-rw-r--r--   mm/ksm.c                   21
-rw-r--r--   mm/memcontrol.c             4
-rw-r--r--   mm/memory-failure.c        28
-rw-r--r--   mm/memory.c                 8
-rw-r--r--   mm/memory_hotplug.c       332
-rw-r--r--   mm/mempolicy.c             21
-rw-r--r--   mm/migrate.c               99
-rw-r--r--   mm/mmap.c                 513
-rw-r--r--   mm/oom_kill.c              86
-rw-r--r--   mm/page-writeback.c        11
-rw-r--r--   mm/page_alloc.c           176
-rw-r--r--   mm/page_cgroup.c            3
-rw-r--r--   mm/page_isolation.c        27
-rw-r--r--   mm/rmap.c                  56
-rw-r--r--   mm/slub.c                   4
-rw-r--r--   mm/sparse.c                25
-rw-r--r--   mm/swapfile.c              31
-rw-r--r--   mm/vmalloc.c                4
-rw-r--r--   mm/vmscan.c                26
29 files changed, 1493 insertions, 585 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a3f8dddaaab3..e6651c5de14f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -188,6 +188,21 @@ config SPLIT_PTLOCK_CPUS
188 default "4" 188 default "4"
189 189
190# 190#
191# support for memory balloon compaction
192config BALLOON_COMPACTION
193 bool "Allow for balloon memory compaction/migration"
194 def_bool y
195 depends on COMPACTION && VIRTIO_BALLOON
196 help
197 Memory fragmentation introduced by ballooning might significantly
198 reduce the number of 2MB contiguous memory blocks that can be
199 used within a guest, thus imposing performance penalties associated
200 with the reduced number of transparent huge pages that could be used
201 by the guest workload. Allowing compaction & migration of memory
202 pages enlisted as part of memory balloon devices avoids the
203 aforementioned scenario and helps improve memory defragmentation.
204
205#
191# support for memory compaction 206# support for memory compaction
192config COMPACTION 207config COMPACTION
193 bool "Allow for memory compaction" 208 bool "Allow for memory compaction"
diff --git a/mm/Makefile b/mm/Makefile
index 6b025f80af34..3a4628751f89 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 util.o mmzone.o vmstat.o backing-dev.o \ 17 util.o mmzone.o vmstat.o backing-dev.o \
18 mm_init.o mmu_context.o percpu.o slab_common.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o interval_tree.o $(mmu-y) 19 compaction.o balloon_compaction.o \
20 interval_tree.o $(mmu-y)
20 21
21obj-y += init-mm.o 22obj-y += init-mm.o
22 23
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
new file mode 100644
index 000000000000..07dbc8ec46cf
--- /dev/null
+++ b/mm/balloon_compaction.c
@@ -0,0 +1,302 @@
1/*
2 * mm/balloon_compaction.c
3 *
4 * Common interface for making balloon pages movable by compaction.
5 *
6 * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini <aquini@redhat.com>
7 */
8#include <linux/mm.h>
9#include <linux/slab.h>
10#include <linux/export.h>
11#include <linux/balloon_compaction.h>
12
13/*
14 * balloon_devinfo_alloc - allocates a balloon device information descriptor.
15 * @balloon_dev_descriptor: pointer to reference the balloon device which
16 * this struct balloon_dev_info will be servicing.
17 *
18 * Driver must call it to properly allocate and initialize an instance of
19 * struct balloon_dev_info which will be used to reference a balloon device
20 * as well as to keep track of the balloon device page list.
21 */
22struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
23{
24 struct balloon_dev_info *b_dev_info;
25 b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
26 if (!b_dev_info)
27 return ERR_PTR(-ENOMEM);
28
29 b_dev_info->balloon_device = balloon_dev_descriptor;
30 b_dev_info->mapping = NULL;
31 b_dev_info->isolated_pages = 0;
32 spin_lock_init(&b_dev_info->pages_lock);
33 INIT_LIST_HEAD(&b_dev_info->pages);
34
35 return b_dev_info;
36}
37EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
38
39/*
40 * balloon_page_enqueue - allocates a new page and inserts it into the balloon
41 * page list.
42 * @b_dev_info: balloon device descriptor where we will insert a new page
43 *
44 * Driver must call it to properly allocate a new enlisted balloon page
45 * before definitively removing it from the guest system.
46 * This function returns the page address for the recently enqueued page or
47 * NULL if we fail to allocate a new page this turn.
48 */
49struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
50{
51 unsigned long flags;
52 struct page *page = alloc_page(balloon_mapping_gfp_mask() |
53 __GFP_NOMEMALLOC | __GFP_NORETRY);
54 if (!page)
55 return NULL;
56
57 /*
58 * Block others from accessing the 'page' when we get around to
59 * establishing additional references. We should be the only one
60 * holding a reference to the 'page' at this point.
61 */
62 BUG_ON(!trylock_page(page));
63 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
64 balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
65 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
66 unlock_page(page);
67 return page;
68}
69EXPORT_SYMBOL_GPL(balloon_page_enqueue);
70
71/*
72 * balloon_page_dequeue - removes a page from balloon's page list and returns
73 * its address to allow the driver to release the page.
74 * @b_dev_info: balloon device descriptor where we will grab a page from.
75 *
76 * Driver must call it to properly de-allocate a previously enlisted balloon page
77 * before definitively releasing it back to the guest system.
78 * This function returns the page address for the recently dequeued page or
79 * NULL if the balloon's page list is temporarily empty due to
80 * compaction-isolated pages.
81 */
82struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
83{
84 struct page *page, *tmp;
85 unsigned long flags;
86 bool dequeued_page;
87
88 dequeued_page = false;
89 list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
90 /*
91 * Block others from accessing the 'page' while we get around to
92 * establishing additional references and preparing the 'page'
93 * to be released by the balloon driver.
94 */
95 if (trylock_page(page)) {
96 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
97 /*
98 * Raise the page refcount here to prevent any wrong
99 * attempt to isolate this page, in case of colliding
100 * with balloon_page_isolate() just after we release
101 * the page lock.
102 *
103 * balloon_page_free() will take care of dropping
104 * this extra refcount later.
105 */
106 get_page(page);
107 balloon_page_delete(page);
108 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
109 unlock_page(page);
110 dequeued_page = true;
111 break;
112 }
113 }
114
115 if (!dequeued_page) {
116 /*
117 * If we are unable to dequeue a balloon page because the page
118 * list is empty and there are no isolated pages, then something
119 * went wrong and some balloon pages are lost.
120 * BUG() here, otherwise the balloon driver may get stuck in
121 * an infinite loop while attempting to release all its pages.
122 */
123 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
124 if (unlikely(list_empty(&b_dev_info->pages) &&
125 !b_dev_info->isolated_pages))
126 BUG();
127 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
128 page = NULL;
129 }
130 return page;
131}
132EXPORT_SYMBOL_GPL(balloon_page_dequeue);
133
134#ifdef CONFIG_BALLOON_COMPACTION
135/*
136 * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
137 * @b_dev_info: holds the balloon device information descriptor.
138 * @a_ops: balloon_mapping address_space_operations descriptor.
139 *
140 * Driver must call it to properly allocate and initialize an instance of
141 * struct address_space which will be used as the special page->mapping for
142 * balloon device enlisted page instances.
143 */
144struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
145 const struct address_space_operations *a_ops)
146{
147 struct address_space *mapping;
148
149 mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
150 if (!mapping)
151 return ERR_PTR(-ENOMEM);
152
153 /*
154 * Give a clean 'zeroed' status to all elements of this special
155 * balloon page->mapping struct address_space instance.
156 */
157 address_space_init_once(mapping);
158
159 /*
160 * Set mapping->flags appropriately, to allow balloon pages
161 * ->mapping identification.
162 */
163 mapping_set_balloon(mapping);
164 mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
165
166 /* balloon's page->mapping->a_ops callback descriptor */
167 mapping->a_ops = a_ops;
168
169 /*
170 * Establish a pointer reference back to the balloon device descriptor
171 * this particular page->mapping will be servicing.
172 * This is used by compaction / migration procedures to identify and
173 * access the balloon device pageset while isolating / migrating pages.
174 *
175 * As some balloon drivers can register multiple balloon devices
176 * for a single guest, this also helps compaction / migration to
177 * properly deal with multiple balloon pagesets, when required.
178 */
179 mapping->private_data = b_dev_info;
180 b_dev_info->mapping = mapping;
181
182 return mapping;
183}
184EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
185
186static inline void __isolate_balloon_page(struct page *page)
187{
188 struct balloon_dev_info *b_dev_info = page->mapping->private_data;
189 unsigned long flags;
190 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
191 list_del(&page->lru);
192 b_dev_info->isolated_pages++;
193 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
194}
195
196static inline void __putback_balloon_page(struct page *page)
197{
198 struct balloon_dev_info *b_dev_info = page->mapping->private_data;
199 unsigned long flags;
200 spin_lock_irqsave(&b_dev_info->pages_lock, flags);
201 list_add(&page->lru, &b_dev_info->pages);
202 b_dev_info->isolated_pages--;
203 spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
204}
205
206static inline int __migrate_balloon_page(struct address_space *mapping,
207 struct page *newpage, struct page *page, enum migrate_mode mode)
208{
209 return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
210}
211
212/* __isolate_lru_page() counterpart for a ballooned page */
213bool balloon_page_isolate(struct page *page)
214{
215 /*
216 * Avoid burning cycles with pages that are yet under __free_pages(),
217 * or just got freed under us.
218 *
219 * In case we 'win' a race for a balloon page being freed under us and
220 * raise its refcount, preventing __free_pages() from doing its job,
221 * the put_page() at the end of this block will take care of
222 * releasing this page, thus avoiding a nasty leak.
223 */
224 if (likely(get_page_unless_zero(page))) {
225 /*
226 * As balloon pages are not isolated from LRU lists, concurrent
227 * compaction threads can race against page migration functions
228 * as well as race against the balloon driver releasing a page.
229 *
230 * In order to avoid having an already isolated balloon page
231 * being (wrongly) re-isolated while it is under migration,
232 * or to avoid attempting to isolate pages being released by
233 * the balloon driver, let's be sure we have the page lock
234 * before proceeding with the balloon page isolation steps.
235 */
236 if (likely(trylock_page(page))) {
237 /*
238 * A ballooned page, by default, has just one refcount.
239 * Prevent concurrent compaction threads from isolating
240 * an already isolated balloon page by refcount check.
241 */
242 if (__is_movable_balloon_page(page) &&
243 page_count(page) == 2) {
244 __isolate_balloon_page(page);
245 unlock_page(page);
246 return true;
247 }
248 unlock_page(page);
249 }
250 put_page(page);
251 }
252 return false;
253}
254
255/* putback_lru_page() counterpart for a ballooned page */
256void balloon_page_putback(struct page *page)
257{
258 /*
259 * 'lock_page()' stabilizes the page and prevents races against
260 * concurrent isolation threads attempting to re-isolate it.
261 */
262 lock_page(page);
263
264 if (__is_movable_balloon_page(page)) {
265 __putback_balloon_page(page);
266 /* drop the extra ref count taken for page isolation */
267 put_page(page);
268 } else {
269 WARN_ON(1);
270 dump_page(page);
271 }
272 unlock_page(page);
273}
274
275/* move_to_new_page() counterpart for a ballooned page */
276int balloon_page_migrate(struct page *newpage,
277 struct page *page, enum migrate_mode mode)
278{
279 struct address_space *mapping;
280 int rc = -EAGAIN;
281
282 /*
283 * Block others from accessing the 'newpage' when we get around to
284 * establishing additional references. We should be the only one
285 * holding a reference to the 'newpage' at this point.
286 */
287 BUG_ON(!trylock_page(newpage));
288
289 if (WARN_ON(!__is_movable_balloon_page(page))) {
290 dump_page(page);
291 unlock_page(newpage);
292 return rc;
293 }
294
295 mapping = page->mapping;
296 if (mapping)
297 rc = __migrate_balloon_page(mapping, newpage, page, mode);
298
299 unlock_page(newpage);
300 return rc;
301}
302#endif /* CONFIG_BALLOON_COMPACTION */
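
The new file above provides only the MM half of the interface; the balloon driver (virtio_balloon elsewhere in this series) supplies the address_space_operations and drives enqueue/dequeue. Below is a minimal sketch of that wiring. It is illustrative only: the my_* names are invented here, and the MIGRATEPAGE_BALLOON_SUCCESS return value is assumed from the "mm: adjust address_space_operations.migratepage() return code" patch in the same series.

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/balloon_compaction.h>

static int my_balloon_migratepage(struct address_space *mapping,
				  struct page *newpage, struct page *page,
				  enum migrate_mode mode)
{
	/*
	 * Driver-specific work goes here: tell the host about newpage,
	 * forget the old page, and move the balloon bookkeeping from
	 * page to newpage.
	 */
	return MIGRATEPAGE_BALLOON_SUCCESS;	/* assumed return code */
}

static const struct address_space_operations my_balloon_aops = {
	.migratepage = my_balloon_migratepage,
};

static struct balloon_dev_info *my_balloon_setup(void *my_dev)
{
	struct balloon_dev_info *b_dev_info;
	struct address_space *mapping;

	b_dev_info = balloon_devinfo_alloc(my_dev);
	if (IS_ERR(b_dev_info))
		return b_dev_info;

	mapping = balloon_mapping_alloc(b_dev_info, &my_balloon_aops);
	if (IS_ERR(mapping)) {
		kfree(b_dev_info);
		return ERR_CAST(mapping);
	}
	return b_dev_info;
}

/* Inflate one page: take it away from the guest. */
static void my_balloon_inflate_one(struct balloon_dev_info *b_dev_info)
{
	struct page *page = balloon_page_enqueue(b_dev_info);

	if (!page)
		return;
	/* report page_to_pfn(page) to the hypervisor here */
}

/* Deflate one page: hand it back to the guest. */
static void my_balloon_deflate_one(struct balloon_dev_info *b_dev_info)
{
	struct page *page = balloon_page_dequeue(b_dev_info);

	if (!page)
		return;
	/* tell the hypervisor, then free; balloon_page_free() also drops
	 * the extra reference taken by balloon_page_dequeue() */
	balloon_page_free(page);
}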
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..ecc45958ac0c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
147 147
148/* 148/*
149 * free_bootmem_late - free bootmem pages directly to page allocator 149 * free_bootmem_late - free bootmem pages directly to page allocator
150 * @addr: starting address of the range 150 * @addr: starting physical address of the range
151 * @size: size of the range in bytes 151 * @size: size of the range in bytes
152 * 152 *
153 * This is only useful when the bootmem allocator has already been torn 153 * This is only useful when the bootmem allocator has already been torn
154 * down, but we are still initializing the system. Pages are given directly 154 * down, but we are still initializing the system. Pages are given directly
155 * to the page allocator, no bootmem metadata is updated because it is gone. 155 * to the page allocator, no bootmem metadata is updated because it is gone.
156 */ 156 */
157void __init free_bootmem_late(unsigned long addr, unsigned long size) 157void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
158{ 158{
159 unsigned long cursor, end; 159 unsigned long cursor, end;
160 160
161 kmemleak_free_part(__va(addr), size); 161 kmemleak_free_part(__va(physaddr), size);
162 162
163 cursor = PFN_UP(addr); 163 cursor = PFN_UP(physaddr);
164 end = PFN_DOWN(addr + size); 164 end = PFN_DOWN(physaddr + size);
165 165
166 for (; cursor < end; cursor++) { 166 for (; cursor < end; cursor++) {
167 __free_pages_bootmem(pfn_to_page(cursor), 0); 167 __free_pages_bootmem(pfn_to_page(cursor), 0);
@@ -377,21 +377,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
377 377
378/** 378/**
379 * free_bootmem - mark a page range as usable 379 * free_bootmem - mark a page range as usable
380 * @addr: starting address of the range 380 * @addr: starting physical address of the range
381 * @size: size of the range in bytes 381 * @size: size of the range in bytes
382 * 382 *
383 * Partial pages will be considered reserved and left as they are. 383 * Partial pages will be considered reserved and left as they are.
384 * 384 *
385 * The range must be contiguous but may span node boundaries. 385 * The range must be contiguous but may span node boundaries.
386 */ 386 */
387void __init free_bootmem(unsigned long addr, unsigned long size) 387void __init free_bootmem(unsigned long physaddr, unsigned long size)
388{ 388{
389 unsigned long start, end; 389 unsigned long start, end;
390 390
391 kmemleak_free_part(__va(addr), size); 391 kmemleak_free_part(__va(physaddr), size);
392 392
393 start = PFN_UP(addr); 393 start = PFN_UP(physaddr);
394 end = PFN_DOWN(addr + size); 394 end = PFN_DOWN(physaddr + size);
395 395
396 mark_bootmem(start, end, 0, 0); 396 mark_bootmem(start, end, 0, 0);
397} 397}
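
The bootmem.c hunks above only rename the parameter and clarify the kernel-doc: these helpers take a physical address. A tiny illustration of a correct call, with hypothetical names, matching the confusion that the "bootmem: fix wrong call parameter for free_bootmem()" patch in this series addresses:

#include <linux/bootmem.h>

/* Illustrative only: hand a bootmem-reserved buffer back to the allocator.
 * Note the __pa(): free_bootmem() wants a physical address, not a kernel
 * virtual one. */
static void __init example_release_buf(void *virt, unsigned long size)
{
	free_bootmem(__pa(virt), size);
}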
diff --git a/mm/compaction.c b/mm/compaction.c
index 694eaabaaebd..d24dd2d7bad4 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -14,6 +14,7 @@
14#include <linux/backing-dev.h> 14#include <linux/backing-dev.h>
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include <linux/balloon_compaction.h>
17#include "internal.h" 18#include "internal.h"
18 19
19#if defined CONFIG_COMPACTION || defined CONFIG_CMA 20#if defined CONFIG_COMPACTION || defined CONFIG_CMA
@@ -565,9 +566,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
565 goto next_pageblock; 566 goto next_pageblock;
566 } 567 }
567 568
568 /* Check may be lockless but that's ok as we recheck later */ 569 /*
569 if (!PageLRU(page)) 570 * Check may be lockless but that's ok as we recheck later.
571 * It's possible to migrate LRU pages and balloon pages
572 * Skip any other type of page
573 */
574 if (!PageLRU(page)) {
575 if (unlikely(balloon_page_movable(page))) {
576 if (locked && balloon_page_isolate(page)) {
577 /* Successfully isolated */
578 cc->finished_update_migrate = true;
579 list_add(&page->lru, migratelist);
580 cc->nr_migratepages++;
581 nr_isolated++;
582 goto check_compact_cluster;
583 }
584 }
570 continue; 585 continue;
586 }
571 587
572 /* 588 /*
573 * PageLRU is set. lru_lock normally excludes isolation 589 * PageLRU is set. lru_lock normally excludes isolation
@@ -621,6 +637,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
621 cc->nr_migratepages++; 637 cc->nr_migratepages++;
622 nr_isolated++; 638 nr_isolated++;
623 639
640check_compact_cluster:
624 /* Avoid isolating too much */ 641 /* Avoid isolating too much */
625 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { 642 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
626 ++low_pfn; 643 ++low_pfn;
@@ -986,7 +1003,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
986 switch (isolate_migratepages(zone, cc)) { 1003 switch (isolate_migratepages(zone, cc)) {
987 case ISOLATE_ABORT: 1004 case ISOLATE_ABORT:
988 ret = COMPACT_PARTIAL; 1005 ret = COMPACT_PARTIAL;
989 putback_lru_pages(&cc->migratepages); 1006 putback_movable_pages(&cc->migratepages);
990 cc->nr_migratepages = 0; 1007 cc->nr_migratepages = 0;
991 goto out; 1008 goto out;
992 case ISOLATE_NONE: 1009 case ISOLATE_NONE:
@@ -1009,9 +1026,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
1009 trace_mm_compaction_migratepages(nr_migrate - nr_remaining, 1026 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
1010 nr_remaining); 1027 nr_remaining);
1011 1028
1012 /* Release LRU pages not migrated */ 1029 /* Release isolated pages not migrated */
1013 if (err) { 1030 if (err) {
1014 putback_lru_pages(&cc->migratepages); 1031 putback_movable_pages(&cc->migratepages);
1015 cc->nr_migratepages = 0; 1032 cc->nr_migratepages = 0;
1016 if (err == -ENOMEM) { 1033 if (err == -ENOMEM) {
1017 ret = COMPACT_PARTIAL; 1034 ret = COMPACT_PARTIAL;
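
isolate_migratepages_range() above now treats a non-LRU page as a migration candidate when balloon_page_movable() says so. That helper lives in include/linux/balloon_compaction.h and is outside this mm-only excerpt; the sketch below only illustrates the kind of lockless pre-check it performs, inferred from the comments in balloon_page_isolate(), and is not the committed implementation.

/*
 * Rough illustration: a page is a balloon-migration candidate when its
 * ->mapping is flagged as a balloon mapping and it still holds the single
 * reference a resident balloon page has.  balloon_page_isolate() then
 * re-verifies this under the page lock after taking an extra reference.
 */
static inline bool balloon_page_movable_sketch(struct page *page)
{
	return page_count(page) == 1 && page->mapping &&
	       mapping_balloon(page->mapping);	/* assumed predicate paired
						   with mapping_set_balloon() */
}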
diff --git a/mm/dmapool.c b/mm/dmapool.c
index da1b0f0b8709..c69781e97cf9 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -332,6 +332,30 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
332 retval = offset + page->vaddr; 332 retval = offset + page->vaddr;
333 *handle = offset + page->dma; 333 *handle = offset + page->dma;
334#ifdef DMAPOOL_DEBUG 334#ifdef DMAPOOL_DEBUG
335 {
336 int i;
337 u8 *data = retval;
338 /* page->offset is stored in first 4 bytes */
339 for (i = sizeof(page->offset); i < pool->size; i++) {
340 if (data[i] == POOL_POISON_FREED)
341 continue;
342 if (pool->dev)
343 dev_err(pool->dev,
344 "dma_pool_alloc %s, %p (corruped)\n",
345 pool->name, retval);
346 else
347 pr_err("dma_pool_alloc %s, %p (corruped)\n",
348 pool->name, retval);
349
350 /*
351 * Dump the first 4 bytes even if they are not
352 * POOL_POISON_FREED
353 */
354 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1,
355 data, pool->size, 1);
356 break;
357 }
358 }
335 memset(retval, POOL_POISON_ALLOCATED, pool->size); 359 memset(retval, POOL_POISON_ALLOCATED, pool->size);
336#endif 360#endif
337 spin_unlock_irqrestore(&pool->lock, flags); 361 spin_unlock_irqrestore(&pool->lock, flags);
diff --git a/mm/highmem.c b/mm/highmem.c
index 2da13a5c50e2..d999077431df 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -99,7 +99,7 @@ struct page *kmap_to_page(void *vaddr)
99 unsigned long addr = (unsigned long)vaddr; 99 unsigned long addr = (unsigned long)vaddr;
100 100
101 if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { 101 if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) {
102 int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; 102 int i = PKMAP_NR(addr);
103 return pte_page(pkmap_page_table[i]); 103 return pte_page(pkmap_page_table[i]);
104 } 104 }
105 105
@@ -137,8 +137,7 @@ static void flush_all_zero_pkmaps(void)
137 * So no dangers, even with speculative execution. 137 * So no dangers, even with speculative execution.
138 */ 138 */
139 page = pte_page(pkmap_page_table[i]); 139 page = pte_page(pkmap_page_table[i]);
140 pte_clear(&init_mm, (unsigned long)page_address(page), 140 pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]);
141 &pkmap_page_table[i]);
142 141
143 set_page_address(page, NULL); 142 set_page_address(page, NULL);
144 need_flush = 1; 143 need_flush = 1;
@@ -324,11 +323,7 @@ struct page_address_map {
324 struct list_head list; 323 struct list_head list;
325}; 324};
326 325
327/* 326static struct page_address_map page_address_maps[LAST_PKMAP];
328 * page_address_map freelist, allocated from page_address_maps.
329 */
330static struct list_head page_address_pool; /* freelist */
331static spinlock_t pool_lock; /* protects page_address_pool */
332 327
333/* 328/*
334 * Hash table bucket 329 * Hash table bucket
@@ -393,14 +388,7 @@ void set_page_address(struct page *page, void *virtual)
393 388
394 pas = page_slot(page); 389 pas = page_slot(page);
395 if (virtual) { /* Add */ 390 if (virtual) { /* Add */
396 BUG_ON(list_empty(&page_address_pool)); 391 pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)];
397
398 spin_lock_irqsave(&pool_lock, flags);
399 pam = list_entry(page_address_pool.next,
400 struct page_address_map, list);
401 list_del(&pam->list);
402 spin_unlock_irqrestore(&pool_lock, flags);
403
404 pam->page = page; 392 pam->page = page;
405 pam->virtual = virtual; 393 pam->virtual = virtual;
406 394
@@ -413,9 +401,6 @@ void set_page_address(struct page *page, void *virtual)
413 if (pam->page == page) { 401 if (pam->page == page) {
414 list_del(&pam->list); 402 list_del(&pam->list);
415 spin_unlock_irqrestore(&pas->lock, flags); 403 spin_unlock_irqrestore(&pas->lock, flags);
416 spin_lock_irqsave(&pool_lock, flags);
417 list_add_tail(&pam->list, &page_address_pool);
418 spin_unlock_irqrestore(&pool_lock, flags);
419 goto done; 404 goto done;
420 } 405 }
421 } 406 }
@@ -425,20 +410,14 @@ done:
425 return; 410 return;
426} 411}
427 412
428static struct page_address_map page_address_maps[LAST_PKMAP];
429
430void __init page_address_init(void) 413void __init page_address_init(void)
431{ 414{
432 int i; 415 int i;
433 416
434 INIT_LIST_HEAD(&page_address_pool);
435 for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
436 list_add(&page_address_maps[i].list, &page_address_pool);
437 for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { 417 for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
438 INIT_LIST_HEAD(&page_address_htable[i].lh); 418 INIT_LIST_HEAD(&page_address_htable[i].lh);
439 spin_lock_init(&page_address_htable[i].lock); 419 spin_lock_init(&page_address_htable[i].lock);
440 } 420 }
441 spin_lock_init(&pool_lock);
442} 421}
443 422
444#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 423#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
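
The highmem.c changes above can drop the page_address_pool freelist and its pool_lock because every permanent-kmap slot now maps one-to-one onto an entry of the static page_address_maps[] array through PKMAP_NR(). A minimal sketch of that index relationship, assuming the usual architecture definitions (the real macros live in the arch's <asm/highmem.h> and may differ in detail):

/* Illustrative only; not the committed macros. */
static inline unsigned long sketch_pkmap_addr(unsigned int nr)
{
	return PKMAP_BASE + ((unsigned long)nr << PAGE_SHIFT);
}

static inline unsigned int sketch_pkmap_nr(unsigned long vaddr)
{
	/* inverse of the above for 0 <= nr < LAST_PKMAP */
	return (vaddr - PKMAP_BASE) >> PAGE_SHIFT;
}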
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 40f17c34b415..5f902e20e8c0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -606,6 +606,15 @@ static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
606 return pmd; 606 return pmd;
607} 607}
608 608
609static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma)
610{
611 pmd_t entry;
612 entry = mk_pmd(page, vma->vm_page_prot);
613 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
614 entry = pmd_mkhuge(entry);
615 return entry;
616}
617
609static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, 618static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
610 struct vm_area_struct *vma, 619 struct vm_area_struct *vma,
611 unsigned long haddr, pmd_t *pmd, 620 unsigned long haddr, pmd_t *pmd,
@@ -629,9 +638,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
629 pte_free(mm, pgtable); 638 pte_free(mm, pgtable);
630 } else { 639 } else {
631 pmd_t entry; 640 pmd_t entry;
632 entry = mk_pmd(page, vma->vm_page_prot); 641 entry = mk_huge_pmd(page, vma);
633 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
634 entry = pmd_mkhuge(entry);
635 /* 642 /*
636 * The spinlocking to take the lru_lock inside 643 * The spinlocking to take the lru_lock inside
637 * page_add_new_anon_rmap() acts as a full memory 644 * page_add_new_anon_rmap() acts as a full memory
@@ -777,6 +784,28 @@ out:
777 return ret; 784 return ret;
778} 785}
779 786
787void huge_pmd_set_accessed(struct mm_struct *mm,
788 struct vm_area_struct *vma,
789 unsigned long address,
790 pmd_t *pmd, pmd_t orig_pmd,
791 int dirty)
792{
793 pmd_t entry;
794 unsigned long haddr;
795
796 spin_lock(&mm->page_table_lock);
797 if (unlikely(!pmd_same(*pmd, orig_pmd)))
798 goto unlock;
799
800 entry = pmd_mkyoung(orig_pmd);
801 haddr = address & HPAGE_PMD_MASK;
802 if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty))
803 update_mmu_cache_pmd(vma, address, pmd);
804
805unlock:
806 spin_unlock(&mm->page_table_lock);
807}
808
780static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, 809static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
781 struct vm_area_struct *vma, 810 struct vm_area_struct *vma,
782 unsigned long address, 811 unsigned long address,
@@ -951,9 +980,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
951 } else { 980 } else {
952 pmd_t entry; 981 pmd_t entry;
953 VM_BUG_ON(!PageHead(page)); 982 VM_BUG_ON(!PageHead(page));
954 entry = mk_pmd(new_page, vma->vm_page_prot); 983 entry = mk_huge_pmd(new_page, vma);
955 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
956 entry = pmd_mkhuge(entry);
957 pmdp_clear_flush(vma, haddr, pmd); 984 pmdp_clear_flush(vma, haddr, pmd);
958 page_add_new_anon_rmap(new_page, vma, haddr); 985 page_add_new_anon_rmap(new_page, vma, haddr);
959 set_pmd_at(mm, haddr, pmd, entry); 986 set_pmd_at(mm, haddr, pmd, entry);
@@ -1146,22 +1173,14 @@ pmd_t *page_check_address_pmd(struct page *page,
1146 unsigned long address, 1173 unsigned long address,
1147 enum page_check_address_pmd_flag flag) 1174 enum page_check_address_pmd_flag flag)
1148{ 1175{
1149 pgd_t *pgd;
1150 pud_t *pud;
1151 pmd_t *pmd, *ret = NULL; 1176 pmd_t *pmd, *ret = NULL;
1152 1177
1153 if (address & ~HPAGE_PMD_MASK) 1178 if (address & ~HPAGE_PMD_MASK)
1154 goto out; 1179 goto out;
1155 1180
1156 pgd = pgd_offset(mm, address); 1181 pmd = mm_find_pmd(mm, address);
1157 if (!pgd_present(*pgd)) 1182 if (!pmd)
1158 goto out;
1159
1160 pud = pud_offset(pgd, address);
1161 if (!pud_present(*pud))
1162 goto out; 1183 goto out;
1163
1164 pmd = pmd_offset(pud, address);
1165 if (pmd_none(*pmd)) 1184 if (pmd_none(*pmd))
1166 goto out; 1185 goto out;
1167 if (pmd_page(*pmd) != page) 1186 if (pmd_page(*pmd) != page)
@@ -1701,64 +1720,49 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte)
1701 } 1720 }
1702} 1721}
1703 1722
1704static void release_all_pte_pages(pte_t *pte)
1705{
1706 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1707}
1708
1709static int __collapse_huge_page_isolate(struct vm_area_struct *vma, 1723static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1710 unsigned long address, 1724 unsigned long address,
1711 pte_t *pte) 1725 pte_t *pte)
1712{ 1726{
1713 struct page *page; 1727 struct page *page;
1714 pte_t *_pte; 1728 pte_t *_pte;
1715 int referenced = 0, isolated = 0, none = 0; 1729 int referenced = 0, none = 0;
1716 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; 1730 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1717 _pte++, address += PAGE_SIZE) { 1731 _pte++, address += PAGE_SIZE) {
1718 pte_t pteval = *_pte; 1732 pte_t pteval = *_pte;
1719 if (pte_none(pteval)) { 1733 if (pte_none(pteval)) {
1720 if (++none <= khugepaged_max_ptes_none) 1734 if (++none <= khugepaged_max_ptes_none)
1721 continue; 1735 continue;
1722 else { 1736 else
1723 release_pte_pages(pte, _pte);
1724 goto out; 1737 goto out;
1725 }
1726 } 1738 }
1727 if (!pte_present(pteval) || !pte_write(pteval)) { 1739 if (!pte_present(pteval) || !pte_write(pteval))
1728 release_pte_pages(pte, _pte);
1729 goto out; 1740 goto out;
1730 }
1731 page = vm_normal_page(vma, address, pteval); 1741 page = vm_normal_page(vma, address, pteval);
1732 if (unlikely(!page)) { 1742 if (unlikely(!page))
1733 release_pte_pages(pte, _pte);
1734 goto out; 1743 goto out;
1735 } 1744
1736 VM_BUG_ON(PageCompound(page)); 1745 VM_BUG_ON(PageCompound(page));
1737 BUG_ON(!PageAnon(page)); 1746 BUG_ON(!PageAnon(page));
1738 VM_BUG_ON(!PageSwapBacked(page)); 1747 VM_BUG_ON(!PageSwapBacked(page));
1739 1748
1740 /* cannot use mapcount: can't collapse if there's a gup pin */ 1749 /* cannot use mapcount: can't collapse if there's a gup pin */
1741 if (page_count(page) != 1) { 1750 if (page_count(page) != 1)
1742 release_pte_pages(pte, _pte);
1743 goto out; 1751 goto out;
1744 }
1745 /* 1752 /*
1746 * We can do it before isolate_lru_page because the 1753 * We can do it before isolate_lru_page because the
1747 * page can't be freed from under us. NOTE: PG_lock 1754 * page can't be freed from under us. NOTE: PG_lock
1748 * is needed to serialize against split_huge_page 1755 * is needed to serialize against split_huge_page
1749 * when invoked from the VM. 1756 * when invoked from the VM.
1750 */ 1757 */
1751 if (!trylock_page(page)) { 1758 if (!trylock_page(page))
1752 release_pte_pages(pte, _pte);
1753 goto out; 1759 goto out;
1754 }
1755 /* 1760 /*
1756 * Isolate the page to avoid collapsing an hugepage 1761 * Isolate the page to avoid collapsing an hugepage
1757 * currently in use by the VM. 1762 * currently in use by the VM.
1758 */ 1763 */
1759 if (isolate_lru_page(page)) { 1764 if (isolate_lru_page(page)) {
1760 unlock_page(page); 1765 unlock_page(page);
1761 release_pte_pages(pte, _pte);
1762 goto out; 1766 goto out;
1763 } 1767 }
1764 /* 0 stands for page_is_file_cache(page) == false */ 1768 /* 0 stands for page_is_file_cache(page) == false */
@@ -1771,12 +1775,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1771 mmu_notifier_test_young(vma->vm_mm, address)) 1775 mmu_notifier_test_young(vma->vm_mm, address))
1772 referenced = 1; 1776 referenced = 1;
1773 } 1777 }
1774 if (unlikely(!referenced)) 1778 if (likely(referenced))
1775 release_all_pte_pages(pte); 1779 return 1;
1776 else
1777 isolated = 1;
1778out: 1780out:
1779 return isolated; 1781 release_pte_pages(pte, _pte);
1782 return 0;
1780} 1783}
1781 1784
1782static void __collapse_huge_page_copy(pte_t *pte, struct page *page, 1785static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
@@ -1918,14 +1921,26 @@ static struct page
1918} 1921}
1919#endif 1922#endif
1920 1923
1924static bool hugepage_vma_check(struct vm_area_struct *vma)
1925{
1926 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1927 (vma->vm_flags & VM_NOHUGEPAGE))
1928 return false;
1929
1930 if (!vma->anon_vma || vma->vm_ops)
1931 return false;
1932 if (is_vma_temporary_stack(vma))
1933 return false;
1934 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
1935 return true;
1936}
1937
1921static void collapse_huge_page(struct mm_struct *mm, 1938static void collapse_huge_page(struct mm_struct *mm,
1922 unsigned long address, 1939 unsigned long address,
1923 struct page **hpage, 1940 struct page **hpage,
1924 struct vm_area_struct *vma, 1941 struct vm_area_struct *vma,
1925 int node) 1942 int node)
1926{ 1943{
1927 pgd_t *pgd;
1928 pud_t *pud;
1929 pmd_t *pmd, _pmd; 1944 pmd_t *pmd, _pmd;
1930 pte_t *pte; 1945 pte_t *pte;
1931 pgtable_t pgtable; 1946 pgtable_t pgtable;
@@ -1960,28 +1975,12 @@ static void collapse_huge_page(struct mm_struct *mm,
1960 hend = vma->vm_end & HPAGE_PMD_MASK; 1975 hend = vma->vm_end & HPAGE_PMD_MASK;
1961 if (address < hstart || address + HPAGE_PMD_SIZE > hend) 1976 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1962 goto out; 1977 goto out;
1963 1978 if (!hugepage_vma_check(vma))
1964 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1965 (vma->vm_flags & VM_NOHUGEPAGE))
1966 goto out;
1967
1968 if (!vma->anon_vma || vma->vm_ops)
1969 goto out;
1970 if (is_vma_temporary_stack(vma))
1971 goto out; 1979 goto out;
1972 VM_BUG_ON(vma->vm_flags & VM_NO_THP); 1980 pmd = mm_find_pmd(mm, address);
1973 1981 if (!pmd)
1974 pgd = pgd_offset(mm, address);
1975 if (!pgd_present(*pgd))
1976 goto out; 1982 goto out;
1977 1983 if (pmd_trans_huge(*pmd))
1978 pud = pud_offset(pgd, address);
1979 if (!pud_present(*pud))
1980 goto out;
1981
1982 pmd = pmd_offset(pud, address);
1983 /* pmd can't go away or become huge under us */
1984 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1985 goto out; 1984 goto out;
1986 1985
1987 anon_vma_lock(vma->anon_vma); 1986 anon_vma_lock(vma->anon_vma);
@@ -2028,9 +2027,7 @@ static void collapse_huge_page(struct mm_struct *mm,
2028 __SetPageUptodate(new_page); 2027 __SetPageUptodate(new_page);
2029 pgtable = pmd_pgtable(_pmd); 2028 pgtable = pmd_pgtable(_pmd);
2030 2029
2031 _pmd = mk_pmd(new_page, vma->vm_page_prot); 2030 _pmd = mk_huge_pmd(new_page, vma);
2032 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
2033 _pmd = pmd_mkhuge(_pmd);
2034 2031
2035 /* 2032 /*
2036 * spin_lock() below is not the equivalent of smp_wmb(), so 2033 * spin_lock() below is not the equivalent of smp_wmb(), so
@@ -2064,8 +2061,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2064 unsigned long address, 2061 unsigned long address,
2065 struct page **hpage) 2062 struct page **hpage)
2066{ 2063{
2067 pgd_t *pgd;
2068 pud_t *pud;
2069 pmd_t *pmd; 2064 pmd_t *pmd;
2070 pte_t *pte, *_pte; 2065 pte_t *pte, *_pte;
2071 int ret = 0, referenced = 0, none = 0; 2066 int ret = 0, referenced = 0, none = 0;
@@ -2076,16 +2071,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
2076 2071
2077 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 2072 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
2078 2073
2079 pgd = pgd_offset(mm, address); 2074 pmd = mm_find_pmd(mm, address);
2080 if (!pgd_present(*pgd)) 2075 if (!pmd)
2081 goto out;
2082
2083 pud = pud_offset(pgd, address);
2084 if (!pud_present(*pud))
2085 goto out; 2076 goto out;
2086 2077 if (pmd_trans_huge(*pmd))
2087 pmd = pmd_offset(pud, address);
2088 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
2089 goto out; 2078 goto out;
2090 2079
2091 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 2080 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2193,20 +2182,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2193 progress++; 2182 progress++;
2194 break; 2183 break;
2195 } 2184 }
2196 2185 if (!hugepage_vma_check(vma)) {
2197 if ((!(vma->vm_flags & VM_HUGEPAGE) && 2186skip:
2198 !khugepaged_always()) ||
2199 (vma->vm_flags & VM_NOHUGEPAGE)) {
2200 skip:
2201 progress++; 2187 progress++;
2202 continue; 2188 continue;
2203 } 2189 }
2204 if (!vma->anon_vma || vma->vm_ops)
2205 goto skip;
2206 if (is_vma_temporary_stack(vma))
2207 goto skip;
2208 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2209
2210 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2190 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2211 hend = vma->vm_end & HPAGE_PMD_MASK; 2191 hend = vma->vm_end & HPAGE_PMD_MASK;
2212 if (hstart >= hend) 2192 if (hstart >= hend)
@@ -2379,22 +2359,12 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2379static void split_huge_page_address(struct mm_struct *mm, 2359static void split_huge_page_address(struct mm_struct *mm,
2380 unsigned long address) 2360 unsigned long address)
2381{ 2361{
2382 pgd_t *pgd;
2383 pud_t *pud;
2384 pmd_t *pmd; 2362 pmd_t *pmd;
2385 2363
2386 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); 2364 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2387 2365
2388 pgd = pgd_offset(mm, address); 2366 pmd = mm_find_pmd(mm, address);
2389 if (!pgd_present(*pgd)) 2367 if (!pmd)
2390 return;
2391
2392 pud = pud_offset(pgd, address);
2393 if (!pud_present(*pud))
2394 return;
2395
2396 pmd = pmd_offset(pud, address);
2397 if (!pmd_present(*pmd))
2398 return; 2368 return;
2399 /* 2369 /*
2400 * Caller holds the mmap_sem write mode, so a huge pmd cannot 2370 * Caller holds the mmap_sem write mode, so a huge pmd cannot
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 59a0059b39e2..1ef2cd4ae3c9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void)
1800 * remove hstate attributes from any nodes that have them. 1800 * remove hstate attributes from any nodes that have them.
1801 */ 1801 */
1802 for (nid = 0; nid < nr_node_ids; nid++) 1802 for (nid = 0; nid < nr_node_ids; nid++)
1803 hugetlb_unregister_node(&node_devices[nid]); 1803 hugetlb_unregister_node(node_devices[nid]);
1804} 1804}
1805 1805
1806/* 1806/*
@@ -1845,7 +1845,7 @@ static void hugetlb_register_all_nodes(void)
1845 int nid; 1845 int nid;
1846 1846
1847 for_each_node_state(nid, N_HIGH_MEMORY) { 1847 for_each_node_state(nid, N_HIGH_MEMORY) {
1848 struct node *node = &node_devices[nid]; 1848 struct node *node = node_devices[nid];
1849 if (node->dev.id == nid) 1849 if (node->dev.id == nid)
1850 hugetlb_register_node(node); 1850 hugetlb_register_node(node);
1851 } 1851 }
diff --git a/mm/internal.h b/mm/internal.h
index a4fa284f6bc2..52d1fa957194 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -92,6 +92,11 @@ extern int isolate_lru_page(struct page *page);
92extern void putback_lru_page(struct page *page); 92extern void putback_lru_page(struct page *page);
93 93
94/* 94/*
95 * in mm/rmap.c:
96 */
97extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
98
99/*
95 * in mm/page_alloc.c 100 * in mm/page_alloc.c
96 */ 101 */
97extern void __free_pages_bootmem(struct page *page, unsigned int order); 102extern void __free_pages_bootmem(struct page *page, unsigned int order);
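
The declaration above points at a helper added in mm/rmap.c; rmap.c appears in the diffstat but its hunks are not included in this excerpt. Judging from the open-coded pgd/pud/pmd walks it replaces in huge_memory.c above and in ksm.c below, it most likely has roughly the following shape; treat this as a reconstruction, not the committed code.

/* Likely shape of mm_find_pmd(), reconstructed from the removed walks. */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd = NULL;

	pgd = pgd_offset(mm, address);
	if (!pgd_present(*pgd))
		goto out;

	pud = pud_offset(pgd, address);
	if (!pud_present(*pud))
		goto out;

	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		pmd = NULL;	/* callers only test for NULL */
out:
	return pmd;
}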
diff --git a/mm/ksm.c b/mm/ksm.c
index ae539f0b8aa1..382d930a0bf1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
778 struct page *kpage, pte_t orig_pte) 778 struct page *kpage, pte_t orig_pte)
779{ 779{
780 struct mm_struct *mm = vma->vm_mm; 780 struct mm_struct *mm = vma->vm_mm;
781 pgd_t *pgd;
782 pud_t *pud;
783 pmd_t *pmd; 781 pmd_t *pmd;
784 pte_t *ptep; 782 pte_t *ptep;
785 spinlock_t *ptl; 783 spinlock_t *ptl;
@@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
792 if (addr == -EFAULT) 790 if (addr == -EFAULT)
793 goto out; 791 goto out;
794 792
795 pgd = pgd_offset(mm, addr); 793 pmd = mm_find_pmd(mm, addr);
796 if (!pgd_present(*pgd)) 794 if (!pmd)
797 goto out; 795 goto out;
798
799 pud = pud_offset(pgd, addr);
800 if (!pud_present(*pud))
801 goto out;
802
803 pmd = pmd_offset(pud, addr);
804 BUG_ON(pmd_trans_huge(*pmd)); 796 BUG_ON(pmd_trans_huge(*pmd));
805 if (!pmd_present(*pmd))
806 goto out;
807 797
808 mmun_start = addr; 798 mmun_start = addr;
809 mmun_end = addr + PAGE_SIZE; 799 mmun_end = addr + PAGE_SIZE;
@@ -1929,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1929 if (ksm_run != flags) { 1919 if (ksm_run != flags) {
1930 ksm_run = flags; 1920 ksm_run = flags;
1931 if (flags & KSM_RUN_UNMERGE) { 1921 if (flags & KSM_RUN_UNMERGE) {
1932 int oom_score_adj; 1922 set_current_oom_origin();
1933
1934 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1935 err = unmerge_and_remove_all_rmap_items(); 1923 err = unmerge_and_remove_all_rmap_items();
1936 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, 1924 clear_current_oom_origin();
1937 oom_score_adj);
1938 if (err) { 1925 if (err) {
1939 ksm_run = KSM_RUN_STOP; 1926 ksm_run = KSM_RUN_STOP;
1940 count = err; 1927 count = err;
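
The ksm.c hunk above replaces the racy save/restore of oom_score_adj with the set_current_oom_origin()/clear_current_oom_origin() pair added by the "mm, oom: fix race when specifying a thread as the oom origin" patch in this series. Those helpers live outside mm/ and are not shown here; conceptually they just set and clear a per-process "prefer me as an OOM victim" marker, roughly as sketched below. The flag and field names are assumptions, not the committed definitions.

/* Conceptual sketch only; the real helpers are in include/linux/oom.h. */
#define SKETCH_OOM_ORIGIN	(1 << 0)	/* assumed flag */

static inline void sketch_set_current_oom_origin(void)
{
	current->signal->oom_flags |= SKETCH_OOM_ORIGIN;
}

static inline void sketch_clear_current_oom_origin(void)
{
	current->signal->oom_flags &= ~SKETCH_OOM_ORIGIN;
}

/* The OOM killer then checks the marker instead of a boosted score. */
static inline bool sketch_oom_task_origin(const struct task_struct *p)
{
	return p->signal->oom_flags & SKETCH_OOM_ORIGIN;
}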
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dd39ba000b31..cf6d0df4849c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1498,8 +1498,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1498 return limit; 1498 return limit;
1499} 1499}
1500 1500
1501void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1501static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1502 int order) 1502 int order)
1503{ 1503{
1504 struct mem_cgroup *iter; 1504 struct mem_cgroup *iter;
1505 unsigned long chosen_points = 0; 1505 unsigned long chosen_points = 0;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 8b20278be6a6..108c52fa60f6 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -781,16 +781,16 @@ static struct page_state {
781 { compound, compound, "huge", me_huge_page }, 781 { compound, compound, "huge", me_huge_page },
782#endif 782#endif
783 783
784 { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, 784 { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty },
785 { sc|dirty, sc, "swapcache", me_swapcache_clean }, 785 { sc|dirty, sc, "clean swapcache", me_swapcache_clean },
786 786
787 { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, 787 { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty },
788 { unevict, unevict, "unevictable LRU", me_pagecache_clean}, 788 { unevict, unevict, "clean unevictable LRU", me_pagecache_clean },
789 789
790 { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, 790 { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty },
791 { mlock, mlock, "mlocked LRU", me_pagecache_clean }, 791 { mlock, mlock, "clean mlocked LRU", me_pagecache_clean },
792 792
793 { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, 793 { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty },
794 { lru|dirty, lru, "clean LRU", me_pagecache_clean }, 794 { lru|dirty, lru, "clean LRU", me_pagecache_clean },
795 795
796 /* 796 /*
@@ -812,14 +812,14 @@ static struct page_state {
812#undef slab 812#undef slab
813#undef reserved 813#undef reserved
814 814
815/*
816 * "Dirty/Clean" indication is not 100% accurate due to the possibility of
817 * setting PG_dirty outside page lock. See also comment above set_page_dirty().
818 */
815static void action_result(unsigned long pfn, char *msg, int result) 819static void action_result(unsigned long pfn, char *msg, int result)
816{ 820{
817 struct page *page = pfn_to_page(pfn); 821 pr_err("MCE %#lx: %s page recovery: %s\n",
818 822 pfn, msg, action_name[result]);
819 printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
820 pfn,
821 PageDirty(page) ? "dirty " : "",
822 msg, action_name[result]);
823} 823}
824 824
825static int page_action(struct page_state *ps, struct page *p, 825static int page_action(struct page_state *ps, struct page *p,
@@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1385 * Isolate the page, so that it doesn't get reallocated if it 1385 * Isolate the page, so that it doesn't get reallocated if it
1386 * was free. 1386 * was free.
1387 */ 1387 */
1388 set_migratetype_isolate(p); 1388 set_migratetype_isolate(p, true);
1389 /* 1389 /*
1390 * When the target page is a free hugepage, just remove it 1390 * When the target page is a free hugepage, just remove it
1391 * from free hugepage list. 1391 * from free hugepage list.
diff --git a/mm/memory.c b/mm/memory.c
index 221fc9ffcab1..765377385632 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3537,8 +3537,9 @@ retry:
3537 3537
3538 barrier(); 3538 barrier();
3539 if (pmd_trans_huge(orig_pmd)) { 3539 if (pmd_trans_huge(orig_pmd)) {
3540 if (flags & FAULT_FLAG_WRITE && 3540 unsigned int dirty = flags & FAULT_FLAG_WRITE;
3541 !pmd_write(orig_pmd) && 3541
3542 if (dirty && !pmd_write(orig_pmd) &&
3542 !pmd_trans_splitting(orig_pmd)) { 3543 !pmd_trans_splitting(orig_pmd)) {
3543 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3544 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3544 orig_pmd); 3545 orig_pmd);
@@ -3550,6 +3551,9 @@ retry:
3550 if (unlikely(ret & VM_FAULT_OOM)) 3551 if (unlikely(ret & VM_FAULT_OOM))
3551 goto retry; 3552 goto retry;
3552 return ret; 3553 return ret;
3554 } else {
3555 huge_pmd_set_accessed(mm, vma, address, pmd,
3556 orig_pmd, dirty);
3553 } 3557 }
3554 return 0; 3558 return 0;
3555 } 3559 }
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index e4eeacae2b91..de9cb14ae753 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -205,7 +205,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
205 zone_span_writelock(zone); 205 zone_span_writelock(zone);
206 206
207 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 207 old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
208 if (start_pfn < zone->zone_start_pfn) 208 if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
209 zone->zone_start_pfn = start_pfn; 209 zone->zone_start_pfn = start_pfn;
210 210
211 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - 211 zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -214,13 +214,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
214 zone_span_writeunlock(zone); 214 zone_span_writeunlock(zone);
215} 215}
216 216
217static void resize_zone(struct zone *zone, unsigned long start_pfn,
218 unsigned long end_pfn)
219{
220 zone_span_writelock(zone);
221
222 if (end_pfn - start_pfn) {
223 zone->zone_start_pfn = start_pfn;
224 zone->spanned_pages = end_pfn - start_pfn;
225 } else {
226 /*
227 * keep it consistent with free_area_init_core():
228 * if spanned_pages == 0, then keep start_pfn == 0
229 */
230 zone->zone_start_pfn = 0;
231 zone->spanned_pages = 0;
232 }
233
234 zone_span_writeunlock(zone);
235}
236
237static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
238 unsigned long end_pfn)
239{
240 enum zone_type zid = zone_idx(zone);
241 int nid = zone->zone_pgdat->node_id;
242 unsigned long pfn;
243
244 for (pfn = start_pfn; pfn < end_pfn; pfn++)
245 set_page_links(pfn_to_page(pfn), zid, nid, pfn);
246}
247
248static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
249 unsigned long start_pfn, unsigned long end_pfn)
250{
251 int ret;
252 unsigned long flags;
253 unsigned long z1_start_pfn;
254
255 if (!z1->wait_table) {
256 ret = init_currently_empty_zone(z1, start_pfn,
257 end_pfn - start_pfn, MEMMAP_HOTPLUG);
258 if (ret)
259 return ret;
260 }
261
262 pgdat_resize_lock(z1->zone_pgdat, &flags);
263
264 /* can't move pfns which are higher than @z2 */
265 if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
266 goto out_fail;
267 /* the part being moved out must be at the leftmost of @z2 */
268 if (start_pfn > z2->zone_start_pfn)
269 goto out_fail;
270 /* must include/overlap */
271 if (end_pfn <= z2->zone_start_pfn)
272 goto out_fail;
273
274 /* use start_pfn for z1's start_pfn if z1 is empty */
275 if (z1->spanned_pages)
276 z1_start_pfn = z1->zone_start_pfn;
277 else
278 z1_start_pfn = start_pfn;
279
280 resize_zone(z1, z1_start_pfn, end_pfn);
281 resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
282
283 pgdat_resize_unlock(z1->zone_pgdat, &flags);
284
285 fix_zone_id(z1, start_pfn, end_pfn);
286
287 return 0;
288out_fail:
289 pgdat_resize_unlock(z1->zone_pgdat, &flags);
290 return -1;
291}
292
293static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
294 unsigned long start_pfn, unsigned long end_pfn)
295{
296 int ret;
297 unsigned long flags;
298 unsigned long z2_end_pfn;
299
300 if (!z2->wait_table) {
301 ret = init_currently_empty_zone(z2, start_pfn,
302 end_pfn - start_pfn, MEMMAP_HOTPLUG);
303 if (ret)
304 return ret;
305 }
306
307 pgdat_resize_lock(z1->zone_pgdat, &flags);
308
309 /* can't move pfns which are lower than @z1 */
310 if (z1->zone_start_pfn > start_pfn)
311 goto out_fail;
312 /* the part being moved out must be at the rightmost of @z1 */
313 if (z1->zone_start_pfn + z1->spanned_pages > end_pfn)
314 goto out_fail;
315 /* must include/overlap */
316 if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
317 goto out_fail;
318
319 /* use end_pfn for z2's end_pfn if z2 is empty */
320 if (z2->spanned_pages)
321 z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
322 else
323 z2_end_pfn = end_pfn;
324
325 resize_zone(z1, z1->zone_start_pfn, start_pfn);
326 resize_zone(z2, start_pfn, z2_end_pfn);
327
328 pgdat_resize_unlock(z1->zone_pgdat, &flags);
329
330 fix_zone_id(z2, start_pfn, end_pfn);
331
332 return 0;
333out_fail:
334 pgdat_resize_unlock(z1->zone_pgdat, &flags);
335 return -1;
336}
337
217static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, 338static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
218 unsigned long end_pfn) 339 unsigned long end_pfn)
219{ 340{
220 unsigned long old_pgdat_end_pfn = 341 unsigned long old_pgdat_end_pfn =
221 pgdat->node_start_pfn + pgdat->node_spanned_pages; 342 pgdat->node_start_pfn + pgdat->node_spanned_pages;
222 343
223 if (start_pfn < pgdat->node_start_pfn) 344 if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
224 pgdat->node_start_pfn = start_pfn; 345 pgdat->node_start_pfn = start_pfn;
225 346
226 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - 347 pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
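
A concrete example of what the helpers added in the hunk above do (the pfn values are invented for illustration):

/*
 * Suppose ZONE_NORMAL spans [0x10000, 0x20000) and ZONE_MOVABLE spans
 * [0x20000, 0x30000).  Onlining [0x20000, 0x24000) as ONLINE_KERNEL ends
 * up calling move_pfn_range_left(NORMAL, MOVABLE, 0x20000, 0x24000): the
 * range passes the three sanity checks (it does not extend past MOVABLE,
 * it starts at MOVABLE's leftmost pfn, and it overlaps MOVABLE), so
 * resize_zone() grows NORMAL to [0x10000, 0x24000), shrinks MOVABLE to
 * [0x24000, 0x30000), and fix_zone_id() relinks the moved struct pages to
 * their new zone.  move_pfn_range_right() is the mirror operation used by
 * ONLINE_MOVABLE when the range sits at the right edge of the zone just
 * below ZONE_MOVABLE.
 */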
@@ -460,8 +581,61 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
460 return 0; 581 return 0;
461} 582}
462 583
584/* ensure every online node has NORMAL memory */
585static bool can_online_high_movable(struct zone *zone)
586{
587 return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
588}
589
590/* check which states in node_states will be changed when onlining memory */
591static void node_states_check_changes_online(unsigned long nr_pages,
592 struct zone *zone, struct memory_notify *arg)
593{
594 int nid = zone_to_nid(zone);
595 enum zone_type zone_last = ZONE_NORMAL;
596
597 /*
598 * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
599 * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL.
600 *
601 * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
602 * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
603 */
604 if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
605 zone_last = ZONE_MOVABLE;
606
607 /*
608 * if the memory to be onlined is in a zone of 0...zone_last, and
609 * the zones of 0...zone_last don't have memory before onlining, we will
610 * need to set the node to node_states[N_NORMAL_MEMORY] after
611 * the memory is onlined.
612 */
613 if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
614 arg->status_change_nid_normal = nid;
615 else
616 arg->status_change_nid_normal = -1;
617
618 /*
619 * if the node doesn't have memory before onlining, we will need to
620 * set the node to node_states[N_HIGH_MEMORY] after the memory
621 * is onlined.
622 */
623 if (!node_state(nid, N_HIGH_MEMORY))
624 arg->status_change_nid = nid;
625 else
626 arg->status_change_nid = -1;
627}
628
629static void node_states_set_node(int node, struct memory_notify *arg)
630{
631 if (arg->status_change_nid_normal >= 0)
632 node_set_state(node, N_NORMAL_MEMORY);
633
634 node_set_state(node, N_HIGH_MEMORY);
635}
463 636
464int __ref online_pages(unsigned long pfn, unsigned long nr_pages) 637
638int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
465{ 639{
466 unsigned long onlined_pages = 0; 640 unsigned long onlined_pages = 0;
467 struct zone *zone; 641 struct zone *zone;
@@ -471,13 +645,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
471 struct memory_notify arg; 645 struct memory_notify arg;
472 646
473 lock_memory_hotplug(); 647 lock_memory_hotplug();
648 /*
649 * This doesn't need a lock to do pfn_to_page().
650 * The section can't be removed here because of the
651 * memory_block->state_mutex.
652 */
653 zone = page_zone(pfn_to_page(pfn));
654
655 if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
656 !can_online_high_movable(zone)) {
657 unlock_memory_hotplug();
658 return -1;
659 }
660
661 if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
662 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
663 unlock_memory_hotplug();
664 return -1;
665 }
666 }
667 if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
668 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
669 unlock_memory_hotplug();
670 return -1;
671 }
672 }
673
674 /* Previous code may have changed the zone of the pfn range */
675 zone = page_zone(pfn_to_page(pfn));
676
474 arg.start_pfn = pfn; 677 arg.start_pfn = pfn;
475 arg.nr_pages = nr_pages; 678 arg.nr_pages = nr_pages;
476 arg.status_change_nid = -1; 679 node_states_check_changes_online(nr_pages, zone, &arg);
477 680
478 nid = page_to_nid(pfn_to_page(pfn)); 681 nid = page_to_nid(pfn_to_page(pfn));
479 if (node_present_pages(nid) == 0)
480 arg.status_change_nid = nid;
481 682
482 ret = memory_notify(MEM_GOING_ONLINE, &arg); 683 ret = memory_notify(MEM_GOING_ONLINE, &arg);
483 ret = notifier_to_errno(ret); 684 ret = notifier_to_errno(ret);
@@ -487,23 +688,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
487 return ret; 688 return ret;
488 } 689 }
489 /* 690 /*
490 * This doesn't need a lock to do pfn_to_page().
491 * The section can't be removed here because of the
492 * memory_block->state_mutex.
493 */
494 zone = page_zone(pfn_to_page(pfn));
495 /*
496 * If this zone is not populated, then it is not in zonelist. 691 * If this zone is not populated, then it is not in zonelist.
497 * This means the page allocator ignores this zone. 692 * This means the page allocator ignores this zone.
498 * So, zonelist must be updated after online. 693 * So, zonelist must be updated after online.
499 */ 694 */
500 mutex_lock(&zonelists_mutex); 695 mutex_lock(&zonelists_mutex);
501 if (!populated_zone(zone)) 696 if (!populated_zone(zone)) {
502 need_zonelists_rebuild = 1; 697 need_zonelists_rebuild = 1;
698 build_all_zonelists(NULL, zone);
699 }
503 700
504 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, 701 ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
505 online_pages_range); 702 online_pages_range);
506 if (ret) { 703 if (ret) {
704 if (need_zonelists_rebuild)
705 zone_pcp_reset(zone);
507 mutex_unlock(&zonelists_mutex); 706 mutex_unlock(&zonelists_mutex);
508 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", 707 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
509 (unsigned long long) pfn << PAGE_SHIFT, 708 (unsigned long long) pfn << PAGE_SHIFT,
@@ -517,9 +716,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
517 zone->present_pages += onlined_pages; 716 zone->present_pages += onlined_pages;
518 zone->zone_pgdat->node_present_pages += onlined_pages; 717 zone->zone_pgdat->node_present_pages += onlined_pages;
519 if (onlined_pages) { 718 if (onlined_pages) {
520 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 719 node_states_set_node(zone_to_nid(zone), &arg);
521 if (need_zonelists_rebuild) 720 if (need_zonelists_rebuild)
522 build_all_zonelists(NULL, zone); 721 build_all_zonelists(NULL, NULL);
523 else 722 else
524 zone_pcp_update(zone); 723 zone_pcp_update(zone);
525 } 724 }
@@ -847,7 +1046,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
847{ 1046{
848 int ret; 1047 int ret;
849 long offlined = *(long *)data; 1048 long offlined = *(long *)data;
850 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); 1049 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
851 offlined = nr_pages; 1050 offlined = nr_pages;
852 if (!ret) 1051 if (!ret)
853 *(long *)data += offlined; 1052 *(long *)data += offlined;
@@ -867,6 +1066,91 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
867 return offlined; 1066 return offlined;
868} 1067}
869 1068
1069/* ensure the node has NORMAL memory if it is still online */
1070static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
1071{
1072 struct pglist_data *pgdat = zone->zone_pgdat;
1073 unsigned long present_pages = 0;
1074 enum zone_type zt;
1075
1076 for (zt = 0; zt <= ZONE_NORMAL; zt++)
1077 present_pages += pgdat->node_zones[zt].present_pages;
1078
1079 if (present_pages > nr_pages)
1080 return true;
1081
1082 present_pages = 0;
1083 for (; zt <= ZONE_MOVABLE; zt++)
1084 present_pages += pgdat->node_zones[zt].present_pages;
1085
1086 /*
1087 * we can't offline the last normal memory until all
1088 * higher memory is offlined.
1089 */
1090 return present_pages == 0;
1091}
1092
1093/* check which state of node_states will be changed when offlining memory */
1094static void node_states_check_changes_offline(unsigned long nr_pages,
1095 struct zone *zone, struct memory_notify *arg)
1096{
1097 struct pglist_data *pgdat = zone->zone_pgdat;
1098 unsigned long present_pages = 0;
1099 enum zone_type zt, zone_last = ZONE_NORMAL;
1100
1101 /*
1102 * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
1103 * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL.
1104 *
1105 * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
1106 * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
1107 */
1108 if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
1109 zone_last = ZONE_MOVABLE;
1110
1111 /*
1112 * check whether node_states[N_NORMAL_MEMORY] will be changed.
1113 * If the memory to be offline is in a zone of 0...zone_last,
1114 * and it is the last present memory, 0...zone_last will
1115 * become empty after offline, thus we can determine we will
1116 * need to clear the node from node_states[N_NORMAL_MEMORY].
1117 */
1118 for (zt = 0; zt <= zone_last; zt++)
1119 present_pages += pgdat->node_zones[zt].present_pages;
1120 if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
1121 arg->status_change_nid_normal = zone_to_nid(zone);
1122 else
1123 arg->status_change_nid_normal = -1;
1124
1125 /*
1126 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
1127 */
1128 zone_last = ZONE_MOVABLE;
1129
1130 /*
1131 * check whether node_states[N_HIGH_MEMORY] will be changed
1132 * If we try to offline the last present @nr_pages from the node,
1133 * we can determine we will need to clear the node from
1134 * node_states[N_HIGH_MEMORY].
1135 */
1136 for (; zt <= zone_last; zt++)
1137 present_pages += pgdat->node_zones[zt].present_pages;
1138 if (nr_pages >= present_pages)
1139 arg->status_change_nid = zone_to_nid(zone);
1140 else
1141 arg->status_change_nid = -1;
1142}
1143
1144static void node_states_clear_node(int node, struct memory_notify *arg)
1145{
1146 if (arg->status_change_nid_normal >= 0)
1147 node_clear_state(node, N_NORMAL_MEMORY);
1148
1149 if ((N_HIGH_MEMORY != N_NORMAL_MEMORY) &&
1150 (arg->status_change_nid >= 0))
1151 node_clear_state(node, N_HIGH_MEMORY);
1152}
1153
870static int __ref __offline_pages(unsigned long start_pfn, 1154static int __ref __offline_pages(unsigned long start_pfn,
871 unsigned long end_pfn, unsigned long timeout) 1155 unsigned long end_pfn, unsigned long timeout)
872{ 1156{
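
The helpers above reduce to an arithmetic check: does the range being offlined cover every remaining page in the zones that feed a given node_states bit? A minimal userspace model of the N_NORMAL_MEMORY side of that check, assuming the HIGHMEM zone layout and made-up per-zone totals (the real code repeats the same sum up to ZONE_MOVABLE for N_HIGH_MEMORY and walks pgdat->node_zones[] under the hotplug lock):

#include <stdbool.h>
#include <stdio.h>

/* toy zone layout; names and counts are invented for the example */
enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, ZONE_MOVABLE, NR_ZONES };

/* true if offlining nr_pages from zone 'zt' empties 0..ZONE_NORMAL,
 * i.e. N_NORMAL_MEMORY would have to be cleared for the node */
static bool clears_normal(const unsigned long present[NR_ZONES],
			  int zt, unsigned long nr_pages)
{
	unsigned long sum = 0;
	int i;

	for (i = 0; i <= ZONE_NORMAL; i++)
		sum += present[i];
	return zt <= ZONE_NORMAL && nr_pages >= sum;
}

int main(void)
{
	unsigned long present[NR_ZONES] = { 0, 16384, 0, 8192 };

	printf("offline 16384 normal pages -> clear N_NORMAL_MEMORY? %d\n",
	       clears_normal(present, ZONE_NORMAL, 16384));
	printf("offline  4096 normal pages -> clear N_NORMAL_MEMORY? %d\n",
	       clears_normal(present, ZONE_NORMAL, 4096));
	return 0;
}
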
@@ -893,16 +1177,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
893 node = zone_to_nid(zone); 1177 node = zone_to_nid(zone);
894 nr_pages = end_pfn - start_pfn; 1178 nr_pages = end_pfn - start_pfn;
895 1179
1180 ret = -EINVAL;
1181 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
1182 goto out;
1183
896 /* set above range as isolated */ 1184 /* set above range as isolated */
897 ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1185 ret = start_isolate_page_range(start_pfn, end_pfn,
1186 MIGRATE_MOVABLE, true);
898 if (ret) 1187 if (ret)
899 goto out; 1188 goto out;
900 1189
901 arg.start_pfn = start_pfn; 1190 arg.start_pfn = start_pfn;
902 arg.nr_pages = nr_pages; 1191 arg.nr_pages = nr_pages;
903 arg.status_change_nid = -1; 1192 node_states_check_changes_offline(nr_pages, zone, &arg);
904 if (nr_pages >= node_present_pages(node))
905 arg.status_change_nid = node;
906 1193
907 ret = memory_notify(MEM_GOING_OFFLINE, &arg); 1194 ret = memory_notify(MEM_GOING_OFFLINE, &arg);
908 ret = notifier_to_errno(ret); 1195 ret = notifier_to_errno(ret);
@@ -975,10 +1262,9 @@ repeat:
975 } else 1262 } else
976 zone_pcp_update(zone); 1263 zone_pcp_update(zone);
977 1264
978 if (!node_present_pages(node)) { 1265 node_states_clear_node(node, &arg);
979 node_clear_state(node, N_HIGH_MEMORY); 1266 if (arg.status_change_nid >= 0)
980 kswapd_stop(node); 1267 kswapd_stop(node);
981 }
982 1268
983 vm_total_pages = nr_free_pagecache_pages(); 1269 vm_total_pages = nr_free_pagecache_pages();
984 writeback_set_ratelimit(); 1270 writeback_set_ratelimit();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ea600da8940..05b28361a39b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1907,7 +1907,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1907 unsigned long addr, int node) 1907 unsigned long addr, int node)
1908{ 1908{
1909 struct mempolicy *pol; 1909 struct mempolicy *pol;
1910 struct zonelist *zl;
1911 struct page *page; 1910 struct page *page;
1912 unsigned int cpuset_mems_cookie; 1911 unsigned int cpuset_mems_cookie;
1913 1912
@@ -1926,23 +1925,11 @@ retry_cpuset:
1926 1925
1927 return page; 1926 return page;
1928 } 1927 }
1929 zl = policy_zonelist(gfp, pol, node); 1928 page = __alloc_pages_nodemask(gfp, order,
1930 if (unlikely(mpol_needs_cond_ref(pol))) { 1929 policy_zonelist(gfp, pol, node),
1931 /*
1932 * slow path: ref counted shared policy
1933 */
1934 struct page *page = __alloc_pages_nodemask(gfp, order,
1935 zl, policy_nodemask(gfp, pol));
1936 __mpol_put(pol);
1937 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1938 goto retry_cpuset;
1939 return page;
1940 }
1941 /*
1942 * fast path: default or task policy
1943 */
1944 page = __alloc_pages_nodemask(gfp, order, zl,
1945 policy_nodemask(gfp, pol)); 1930 policy_nodemask(gfp, pol));
1931 if (unlikely(mpol_needs_cond_ref(pol)))
1932 __mpol_put(pol);
1946 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) 1933 if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
1947 goto retry_cpuset; 1934 goto retry_cpuset;
1948 return page; 1935 return page;
diff --git a/mm/migrate.c b/mm/migrate.c
index 77ed2d773705..3f675ca08279 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,6 +35,7 @@
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h> 36#include <linux/hugetlb_cgroup.h>
37#include <linux/gfp.h> 37#include <linux/gfp.h>
38#include <linux/balloon_compaction.h>
38 39
39#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
40 41
@@ -79,7 +80,30 @@ void putback_lru_pages(struct list_head *l)
79 list_del(&page->lru); 80 list_del(&page->lru);
80 dec_zone_page_state(page, NR_ISOLATED_ANON + 81 dec_zone_page_state(page, NR_ISOLATED_ANON +
81 page_is_file_cache(page)); 82 page_is_file_cache(page));
82 putback_lru_page(page); 83 putback_lru_page(page);
84 }
85}
86
87/*
88 * Put previously isolated pages back onto the appropriate lists
89 * from where they were once taken off for compaction/migration.
90 *
91 * This function shall be used instead of putback_lru_pages(),
92 * whenever the isolated pageset has been built by isolate_migratepages_range()
93 */
94void putback_movable_pages(struct list_head *l)
95{
96 struct page *page;
97 struct page *page2;
98
99 list_for_each_entry_safe(page, page2, l, lru) {
100 list_del(&page->lru);
101 dec_zone_page_state(page, NR_ISOLATED_ANON +
102 page_is_file_cache(page));
103 if (unlikely(balloon_page_movable(page)))
104 balloon_page_putback(page);
105 else
106 putback_lru_page(page);
83 } 107 }
84} 108}
85 109
@@ -91,8 +115,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
91{ 115{
92 struct mm_struct *mm = vma->vm_mm; 116 struct mm_struct *mm = vma->vm_mm;
93 swp_entry_t entry; 117 swp_entry_t entry;
94 pgd_t *pgd;
95 pud_t *pud;
96 pmd_t *pmd; 118 pmd_t *pmd;
97 pte_t *ptep, pte; 119 pte_t *ptep, pte;
98 spinlock_t *ptl; 120 spinlock_t *ptl;
@@ -103,19 +125,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
103 goto out; 125 goto out;
104 ptl = &mm->page_table_lock; 126 ptl = &mm->page_table_lock;
105 } else { 127 } else {
106 pgd = pgd_offset(mm, addr); 128 pmd = mm_find_pmd(mm, addr);
107 if (!pgd_present(*pgd)) 129 if (!pmd)
108 goto out;
109
110 pud = pud_offset(pgd, addr);
111 if (!pud_present(*pud))
112 goto out; 130 goto out;
113
114 pmd = pmd_offset(pud, addr);
115 if (pmd_trans_huge(*pmd)) 131 if (pmd_trans_huge(*pmd))
116 goto out; 132 goto out;
117 if (!pmd_present(*pmd))
118 goto out;
119 133
120 ptep = pte_offset_map(pmd, addr); 134 ptep = pte_offset_map(pmd, addr);
121 135
@@ -286,7 +300,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
286 /* Anonymous page without mapping */ 300 /* Anonymous page without mapping */
287 if (page_count(page) != 1) 301 if (page_count(page) != 1)
288 return -EAGAIN; 302 return -EAGAIN;
289 return 0; 303 return MIGRATEPAGE_SUCCESS;
290 } 304 }
291 305
292 spin_lock_irq(&mapping->tree_lock); 306 spin_lock_irq(&mapping->tree_lock);
@@ -356,7 +370,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
356 } 370 }
357 spin_unlock_irq(&mapping->tree_lock); 371 spin_unlock_irq(&mapping->tree_lock);
358 372
359 return 0; 373 return MIGRATEPAGE_SUCCESS;
360} 374}
361 375
362/* 376/*
@@ -372,7 +386,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
372 if (!mapping) { 386 if (!mapping) {
373 if (page_count(page) != 1) 387 if (page_count(page) != 1)
374 return -EAGAIN; 388 return -EAGAIN;
375 return 0; 389 return MIGRATEPAGE_SUCCESS;
376 } 390 }
377 391
378 spin_lock_irq(&mapping->tree_lock); 392 spin_lock_irq(&mapping->tree_lock);
@@ -399,7 +413,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
399 page_unfreeze_refs(page, expected_count - 1); 413 page_unfreeze_refs(page, expected_count - 1);
400 414
401 spin_unlock_irq(&mapping->tree_lock); 415 spin_unlock_irq(&mapping->tree_lock);
402 return 0; 416 return MIGRATEPAGE_SUCCESS;
403} 417}
404 418
405/* 419/*
@@ -486,11 +500,11 @@ int migrate_page(struct address_space *mapping,
486 500
487 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); 501 rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
488 502
489 if (rc) 503 if (rc != MIGRATEPAGE_SUCCESS)
490 return rc; 504 return rc;
491 505
492 migrate_page_copy(newpage, page); 506 migrate_page_copy(newpage, page);
493 return 0; 507 return MIGRATEPAGE_SUCCESS;
494} 508}
495EXPORT_SYMBOL(migrate_page); 509EXPORT_SYMBOL(migrate_page);
496 510
@@ -513,7 +527,7 @@ int buffer_migrate_page(struct address_space *mapping,
513 527
514 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); 528 rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
515 529
516 if (rc) 530 if (rc != MIGRATEPAGE_SUCCESS)
517 return rc; 531 return rc;
518 532
519 /* 533 /*
@@ -549,7 +563,7 @@ int buffer_migrate_page(struct address_space *mapping,
549 563
550 } while (bh != head); 564 } while (bh != head);
551 565
552 return 0; 566 return MIGRATEPAGE_SUCCESS;
553} 567}
554EXPORT_SYMBOL(buffer_migrate_page); 568EXPORT_SYMBOL(buffer_migrate_page);
555#endif 569#endif
@@ -628,7 +642,7 @@ static int fallback_migrate_page(struct address_space *mapping,
628 * 642 *
629 * Return value: 643 * Return value:
630 * < 0 - error code 644 * < 0 - error code
631 * == 0 - success 645 * MIGRATEPAGE_SUCCESS - success
632 */ 646 */
633static int move_to_new_page(struct page *newpage, struct page *page, 647static int move_to_new_page(struct page *newpage, struct page *page,
634 int remap_swapcache, enum migrate_mode mode) 648 int remap_swapcache, enum migrate_mode mode)
@@ -665,7 +679,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
665 else 679 else
666 rc = fallback_migrate_page(mapping, newpage, page, mode); 680 rc = fallback_migrate_page(mapping, newpage, page, mode);
667 681
668 if (rc) { 682 if (rc != MIGRATEPAGE_SUCCESS) {
669 newpage->mapping = NULL; 683 newpage->mapping = NULL;
670 } else { 684 } else {
671 if (remap_swapcache) 685 if (remap_swapcache)
@@ -778,6 +792,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
778 } 792 }
779 } 793 }
780 794
795 if (unlikely(balloon_page_movable(page))) {
796 /*
797 * A ballooned page does not need any special attention from
798 * physical to virtual reverse mapping procedures.
799 * Skip any attempt to unmap PTEs or to remap swap cache,
800 * in order to avoid burning cycles at rmap level, and perform
 801 * the page migration right away (protected by page lock).
802 */
803 rc = balloon_page_migrate(newpage, page, mode);
804 goto uncharge;
805 }
806
781 /* 807 /*
782 * Corner case handling: 808 * Corner case handling:
783 * 1. When a new swap-cache page is read into, it is added to the LRU 809 * 1. When a new swap-cache page is read into, it is added to the LRU
@@ -814,7 +840,9 @@ skip_unmap:
814 put_anon_vma(anon_vma); 840 put_anon_vma(anon_vma);
815 841
816uncharge: 842uncharge:
817 mem_cgroup_end_migration(mem, page, newpage, rc == 0); 843 mem_cgroup_end_migration(mem, page, newpage,
844 (rc == MIGRATEPAGE_SUCCESS ||
845 rc == MIGRATEPAGE_BALLOON_SUCCESS));
818unlock: 846unlock:
819 unlock_page(page); 847 unlock_page(page);
820out: 848out:
@@ -846,6 +874,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
846 goto out; 874 goto out;
847 875
848 rc = __unmap_and_move(page, newpage, force, offlining, mode); 876 rc = __unmap_and_move(page, newpage, force, offlining, mode);
877
878 if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
879 /*
880 * A ballooned page has been migrated already.
 881 * Now it is time to wrap up the counters,
 882 * hand the page back to Buddy and return.
883 */
884 dec_zone_page_state(page, NR_ISOLATED_ANON +
885 page_is_file_cache(page));
886 balloon_page_free(page);
887 return MIGRATEPAGE_SUCCESS;
888 }
849out: 889out:
850 if (rc != -EAGAIN) { 890 if (rc != -EAGAIN) {
851 /* 891 /*
@@ -987,7 +1027,7 @@ int migrate_pages(struct list_head *from,
987 case -EAGAIN: 1027 case -EAGAIN:
988 retry++; 1028 retry++;
989 break; 1029 break;
990 case 0: 1030 case MIGRATEPAGE_SUCCESS:
991 break; 1031 break;
992 default: 1032 default:
993 /* Permanent failure */ 1033 /* Permanent failure */
@@ -996,15 +1036,12 @@ int migrate_pages(struct list_head *from,
996 } 1036 }
997 } 1037 }
998 } 1038 }
999 rc = 0; 1039 rc = nr_failed + retry;
1000out: 1040out:
1001 if (!swapwrite) 1041 if (!swapwrite)
1002 current->flags &= ~PF_SWAPWRITE; 1042 current->flags &= ~PF_SWAPWRITE;
1003 1043
1004 if (rc) 1044 return rc;
1005 return rc;
1006
1007 return nr_failed + retry;
1008} 1045}
1009 1046
1010int migrate_huge_page(struct page *hpage, new_page_t get_new_page, 1047int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
@@ -1024,7 +1061,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1024 /* try again */ 1061 /* try again */
1025 cond_resched(); 1062 cond_resched();
1026 break; 1063 break;
1027 case 0: 1064 case MIGRATEPAGE_SUCCESS:
1028 goto out; 1065 goto out;
1029 default: 1066 default:
1030 rc = -EIO; 1067 rc = -EIO;
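
With the hunks above, migrate_pages() stops collapsing its result to zero: a negative value still means a hard error, while a non-negative value is now the number of pages that could not be migrated. A toy userspace illustration of that convention; the results[] array is invented and MIGRATEPAGE_SUCCESS is defined locally as 0 purely for the model:

#include <stdio.h>

#define MIGRATEPAGE_SUCCESS 0	/* local stand-in for the kernel constant */

int main(void)
{
	/* pretend per-page outcomes from one migration pass */
	int results[] = { MIGRATEPAGE_SUCCESS, -1, MIGRATEPAGE_SUCCESS, -1, -1 };
	int nr_failed = 0, i;

	for (i = 0; i < 5; i++)
		if (results[i] != MIGRATEPAGE_SUCCESS)
			nr_failed++;

	/* callers now get this count back instead of a bare 0 */
	printf("pages not migrated: %d\n", nr_failed);
	return 0;
}
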
diff --git a/mm/mmap.c b/mm/mmap.c
index 7d416055f08c..f940062c8d4b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -31,6 +31,7 @@
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h> 32#include <linux/khugepaged.h>
33#include <linux/uprobes.h> 33#include <linux/uprobes.h>
34#include <linux/rbtree_augmented.h>
34 35
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
@@ -311,40 +312,88 @@ out:
311 return retval; 312 return retval;
312} 313}
313 314
315static long vma_compute_subtree_gap(struct vm_area_struct *vma)
316{
317 unsigned long max, subtree_gap;
318 max = vma->vm_start;
319 if (vma->vm_prev)
320 max -= vma->vm_prev->vm_end;
321 if (vma->vm_rb.rb_left) {
322 subtree_gap = rb_entry(vma->vm_rb.rb_left,
323 struct vm_area_struct, vm_rb)->rb_subtree_gap;
324 if (subtree_gap > max)
325 max = subtree_gap;
326 }
327 if (vma->vm_rb.rb_right) {
328 subtree_gap = rb_entry(vma->vm_rb.rb_right,
329 struct vm_area_struct, vm_rb)->rb_subtree_gap;
330 if (subtree_gap > max)
331 max = subtree_gap;
332 }
333 return max;
334}
335
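
vma_compute_subtree_gap() tracks, per rbtree node, the free space between a vma and its predecessor, and rb_subtree_gap caches the maximum of that value over the node's subtree. A flat userspace model of the same quantity, with a made-up layout and the whole sorted array standing in for a subtree:

#include <stdio.h>

struct vma { unsigned long start, end; };	/* [start, end), sorted */

/* largest gap preceding any vma: start minus the previous end (or 0) */
static unsigned long max_gap(const struct vma *vmas, int n)
{
	unsigned long prev_end = 0, max = 0;
	int i;

	for (i = 0; i < n; i++) {
		unsigned long gap = vmas[i].start - prev_end;
		if (gap > max)
			max = gap;
		prev_end = vmas[i].end;
	}
	return max;
}

int main(void)
{
	/* hypothetical layout: gaps of 0x1000, 0x2000 and 0x4000 bytes */
	struct vma vmas[] = {
		{ 0x1000, 0x3000 }, { 0x5000, 0x8000 }, { 0xc000, 0xd000 },
	};

	printf("largest gap below any vma: 0x%lx\n", max_gap(vmas, 3));
	return 0;
}
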
314#ifdef CONFIG_DEBUG_VM_RB 336#ifdef CONFIG_DEBUG_VM_RB
315static int browse_rb(struct rb_root *root) 337static int browse_rb(struct rb_root *root)
316{ 338{
317 int i = 0, j; 339 int i = 0, j, bug = 0;
318 struct rb_node *nd, *pn = NULL; 340 struct rb_node *nd, *pn = NULL;
319 unsigned long prev = 0, pend = 0; 341 unsigned long prev = 0, pend = 0;
320 342
321 for (nd = rb_first(root); nd; nd = rb_next(nd)) { 343 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
322 struct vm_area_struct *vma; 344 struct vm_area_struct *vma;
323 vma = rb_entry(nd, struct vm_area_struct, vm_rb); 345 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
324 if (vma->vm_start < prev) 346 if (vma->vm_start < prev) {
325 printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; 347 printk("vm_start %lx prev %lx\n", vma->vm_start, prev);
326 if (vma->vm_start < pend) 348 bug = 1;
349 }
350 if (vma->vm_start < pend) {
327 printk("vm_start %lx pend %lx\n", vma->vm_start, pend); 351 printk("vm_start %lx pend %lx\n", vma->vm_start, pend);
328 if (vma->vm_start > vma->vm_end) 352 bug = 1;
329 printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); 353 }
354 if (vma->vm_start > vma->vm_end) {
355 printk("vm_end %lx < vm_start %lx\n",
356 vma->vm_end, vma->vm_start);
357 bug = 1;
358 }
359 if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
360 printk("free gap %lx, correct %lx\n",
361 vma->rb_subtree_gap,
362 vma_compute_subtree_gap(vma));
363 bug = 1;
364 }
330 i++; 365 i++;
331 pn = nd; 366 pn = nd;
332 prev = vma->vm_start; 367 prev = vma->vm_start;
333 pend = vma->vm_end; 368 pend = vma->vm_end;
334 } 369 }
335 j = 0; 370 j = 0;
336 for (nd = pn; nd; nd = rb_prev(nd)) { 371 for (nd = pn; nd; nd = rb_prev(nd))
337 j++; 372 j++;
373 if (i != j) {
374 printk("backwards %d, forwards %d\n", j, i);
375 bug = 1;
376 }
377 return bug ? -1 : i;
378}
379
380static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
381{
382 struct rb_node *nd;
383
384 for (nd = rb_first(root); nd; nd = rb_next(nd)) {
385 struct vm_area_struct *vma;
386 vma = rb_entry(nd, struct vm_area_struct, vm_rb);
387 BUG_ON(vma != ignore &&
388 vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
338 } 389 }
339 if (i != j)
340 printk("backwards %d, forwards %d\n", j, i), i = 0;
341 return i;
342} 390}
343 391
344void validate_mm(struct mm_struct *mm) 392void validate_mm(struct mm_struct *mm)
345{ 393{
346 int bug = 0; 394 int bug = 0;
347 int i = 0; 395 int i = 0;
396 unsigned long highest_address = 0;
348 struct vm_area_struct *vma = mm->mmap; 397 struct vm_area_struct *vma = mm->mmap;
349 while (vma) { 398 while (vma) {
350 struct anon_vma_chain *avc; 399 struct anon_vma_chain *avc;
@@ -352,20 +401,73 @@ void validate_mm(struct mm_struct *mm)
352 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) 401 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
353 anon_vma_interval_tree_verify(avc); 402 anon_vma_interval_tree_verify(avc);
354 vma_unlock_anon_vma(vma); 403 vma_unlock_anon_vma(vma);
404 highest_address = vma->vm_end;
355 vma = vma->vm_next; 405 vma = vma->vm_next;
356 i++; 406 i++;
357 } 407 }
358 if (i != mm->map_count) 408 if (i != mm->map_count) {
359 printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; 409 printk("map_count %d vm_next %d\n", mm->map_count, i);
410 bug = 1;
411 }
412 if (highest_address != mm->highest_vm_end) {
413 printk("mm->highest_vm_end %lx, found %lx\n",
414 mm->highest_vm_end, highest_address);
415 bug = 1;
416 }
360 i = browse_rb(&mm->mm_rb); 417 i = browse_rb(&mm->mm_rb);
361 if (i != mm->map_count) 418 if (i != mm->map_count) {
362 printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; 419 printk("map_count %d rb %d\n", mm->map_count, i);
420 bug = 1;
421 }
363 BUG_ON(bug); 422 BUG_ON(bug);
364} 423}
365#else 424#else
425#define validate_mm_rb(root, ignore) do { } while (0)
366#define validate_mm(mm) do { } while (0) 426#define validate_mm(mm) do { } while (0)
367#endif 427#endif
368 428
429RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
430 unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
431
432/*
433 * Update augmented rbtree rb_subtree_gap values after vma->vm_start or
434 * vma->vm_prev->vm_end values changed, without modifying the vma's position
435 * in the rbtree.
436 */
437static void vma_gap_update(struct vm_area_struct *vma)
438{
439 /*
440 * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
 441	 * function that does exactly what we want.
442 */
443 vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
444}
445
446static inline void vma_rb_insert(struct vm_area_struct *vma,
447 struct rb_root *root)
448{
449 /* All rb_subtree_gap values must be consistent prior to insertion */
450 validate_mm_rb(root, NULL);
451
452 rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
453}
454
455static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
456{
457 /*
458 * All rb_subtree_gap values must be consistent prior to erase,
459 * with the possible exception of the vma being erased.
460 */
461 validate_mm_rb(root, vma);
462
463 /*
464 * Note rb_erase_augmented is a fairly large inline function,
465 * so make sure we instantiate it only once with our desired
466 * augmented rbtree callbacks.
467 */
468 rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
469}
470
369/* 471/*
370 * vma has some anon_vma assigned, and is already inserted on that 472 * vma has some anon_vma assigned, and is already inserted on that
371 * anon_vma's interval trees. 473 * anon_vma's interval trees.
@@ -435,8 +537,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr,
435void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 537void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
436 struct rb_node **rb_link, struct rb_node *rb_parent) 538 struct rb_node **rb_link, struct rb_node *rb_parent)
437{ 539{
540 /* Update tracking information for the gap following the new vma. */
541 if (vma->vm_next)
542 vma_gap_update(vma->vm_next);
543 else
544 mm->highest_vm_end = vma->vm_end;
545
546 /*
547 * vma->vm_prev wasn't known when we followed the rbtree to find the
548 * correct insertion point for that vma. As a result, we could not
 549	 * update the rb_subtree_gap values of the vma's vm_rb parents on the way down.
550 * So, we first insert the vma with a zero rb_subtree_gap value
551 * (to be consistent with what we did on the way down), and then
552 * immediately update the gap to the correct value. Finally we
553 * rebalance the rbtree after all augmented values have been set.
554 */
438 rb_link_node(&vma->vm_rb, rb_parent, rb_link); 555 rb_link_node(&vma->vm_rb, rb_parent, rb_link);
439 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 556 vma->rb_subtree_gap = 0;
557 vma_gap_update(vma);
558 vma_rb_insert(vma, &mm->mm_rb);
440} 559}
441 560
442static void __vma_link_file(struct vm_area_struct *vma) 561static void __vma_link_file(struct vm_area_struct *vma)
@@ -512,12 +631,12 @@ static inline void
512__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 631__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
513 struct vm_area_struct *prev) 632 struct vm_area_struct *prev)
514{ 633{
515 struct vm_area_struct *next = vma->vm_next; 634 struct vm_area_struct *next;
516 635
517 prev->vm_next = next; 636 vma_rb_erase(vma, &mm->mm_rb);
637 prev->vm_next = next = vma->vm_next;
518 if (next) 638 if (next)
519 next->vm_prev = prev; 639 next->vm_prev = prev;
520 rb_erase(&vma->vm_rb, &mm->mm_rb);
521 if (mm->mmap_cache == vma) 640 if (mm->mmap_cache == vma)
522 mm->mmap_cache = prev; 641 mm->mmap_cache = prev;
523} 642}
@@ -539,6 +658,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
539 struct rb_root *root = NULL; 658 struct rb_root *root = NULL;
540 struct anon_vma *anon_vma = NULL; 659 struct anon_vma *anon_vma = NULL;
541 struct file *file = vma->vm_file; 660 struct file *file = vma->vm_file;
661 bool start_changed = false, end_changed = false;
542 long adjust_next = 0; 662 long adjust_next = 0;
543 int remove_next = 0; 663 int remove_next = 0;
544 664
@@ -629,8 +749,14 @@ again: remove_next = 1 + (end > next->vm_end);
629 vma_interval_tree_remove(next, root); 749 vma_interval_tree_remove(next, root);
630 } 750 }
631 751
632 vma->vm_start = start; 752 if (start != vma->vm_start) {
633 vma->vm_end = end; 753 vma->vm_start = start;
754 start_changed = true;
755 }
756 if (end != vma->vm_end) {
757 vma->vm_end = end;
758 end_changed = true;
759 }
634 vma->vm_pgoff = pgoff; 760 vma->vm_pgoff = pgoff;
635 if (adjust_next) { 761 if (adjust_next) {
636 next->vm_start += adjust_next << PAGE_SHIFT; 762 next->vm_start += adjust_next << PAGE_SHIFT;
@@ -659,6 +785,15 @@ again: remove_next = 1 + (end > next->vm_end);
659 * (it may either follow vma or precede it). 785 * (it may either follow vma or precede it).
660 */ 786 */
661 __insert_vm_struct(mm, insert); 787 __insert_vm_struct(mm, insert);
788 } else {
789 if (start_changed)
790 vma_gap_update(vma);
791 if (end_changed) {
792 if (!next)
793 mm->highest_vm_end = end;
794 else if (!adjust_next)
795 vma_gap_update(next);
796 }
662 } 797 }
663 798
664 if (anon_vma) { 799 if (anon_vma) {
@@ -692,10 +827,13 @@ again: remove_next = 1 + (end > next->vm_end);
692 * we must remove another next too. It would clutter 827 * we must remove another next too. It would clutter
693 * up the code too much to do both in one go. 828 * up the code too much to do both in one go.
694 */ 829 */
695 if (remove_next == 2) { 830 next = vma->vm_next;
696 next = vma->vm_next; 831 if (remove_next == 2)
697 goto again; 832 goto again;
698 } 833 else if (next)
834 vma_gap_update(next);
835 else
836 mm->highest_vm_end = end;
699 } 837 }
700 if (insert && file) 838 if (insert && file)
701 uprobe_mmap(insert); 839 uprobe_mmap(insert);
@@ -1167,8 +1305,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1167 * memory so no accounting is necessary 1305 * memory so no accounting is necessary
1168 */ 1306 */
1169 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, 1307 file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len,
1170 VM_NORESERVE, &user, 1308 VM_NORESERVE,
1171 HUGETLB_ANONHUGE_INODE); 1309 &user, HUGETLB_ANONHUGE_INODE,
1310 (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
1172 if (IS_ERR(file)) 1311 if (IS_ERR(file))
1173 return PTR_ERR(file); 1312 return PTR_ERR(file);
1174 } 1313 }
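
The extra hugetlb_file_setup() argument is the huge page size requested by userspace, encoded as log2(page size) in the high mmap() flag bits. A small userspace sketch of that encode/decode step; MAP_HUGE_SHIFT and MAP_HUGE_MASK are defined locally with the values the uapi headers of this series use, so treat them as illustrative:

#include <stdio.h>

#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_MASK  0x3f

int main(void)
{
	/* ask for 2MB huge pages: encode log2(2MB) = 21 into the flags */
	unsigned long flags = 21UL << MAP_HUGE_SHIFT;

	/* the same decode step now done in the mmap_pgoff() hugetlb path */
	unsigned long page_shift = (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK;

	printf("requested huge page size: %lu kB\n",
	       (1UL << page_shift) / 1024);
	return 0;
}
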
@@ -1414,6 +1553,206 @@ unacct_error:
1414 return error; 1553 return error;
1415} 1554}
1416 1555
1556unsigned long unmapped_area(struct vm_unmapped_area_info *info)
1557{
1558 /*
1559 * We implement the search by looking for an rbtree node that
1560 * immediately follows a suitable gap. That is,
1561 * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length;
1562 * - gap_end = vma->vm_start >= info->low_limit + length;
1563 * - gap_end - gap_start >= length
1564 */
1565
1566 struct mm_struct *mm = current->mm;
1567 struct vm_area_struct *vma;
1568 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1569
1570 /* Adjust search length to account for worst case alignment overhead */
1571 length = info->length + info->align_mask;
1572 if (length < info->length)
1573 return -ENOMEM;
1574
1575 /* Adjust search limits by the desired length */
1576 if (info->high_limit < length)
1577 return -ENOMEM;
1578 high_limit = info->high_limit - length;
1579
1580 if (info->low_limit > high_limit)
1581 return -ENOMEM;
1582 low_limit = info->low_limit + length;
1583
1584 /* Check if rbtree root looks promising */
1585 if (RB_EMPTY_ROOT(&mm->mm_rb))
1586 goto check_highest;
1587 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1588 if (vma->rb_subtree_gap < length)
1589 goto check_highest;
1590
1591 while (true) {
1592 /* Visit left subtree if it looks promising */
1593 gap_end = vma->vm_start;
1594 if (gap_end >= low_limit && vma->vm_rb.rb_left) {
1595 struct vm_area_struct *left =
1596 rb_entry(vma->vm_rb.rb_left,
1597 struct vm_area_struct, vm_rb);
1598 if (left->rb_subtree_gap >= length) {
1599 vma = left;
1600 continue;
1601 }
1602 }
1603
1604 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1605check_current:
1606 /* Check if current node has a suitable gap */
1607 if (gap_start > high_limit)
1608 return -ENOMEM;
1609 if (gap_end >= low_limit && gap_end - gap_start >= length)
1610 goto found;
1611
1612 /* Visit right subtree if it looks promising */
1613 if (vma->vm_rb.rb_right) {
1614 struct vm_area_struct *right =
1615 rb_entry(vma->vm_rb.rb_right,
1616 struct vm_area_struct, vm_rb);
1617 if (right->rb_subtree_gap >= length) {
1618 vma = right;
1619 continue;
1620 }
1621 }
1622
1623 /* Go back up the rbtree to find next candidate node */
1624 while (true) {
1625 struct rb_node *prev = &vma->vm_rb;
1626 if (!rb_parent(prev))
1627 goto check_highest;
1628 vma = rb_entry(rb_parent(prev),
1629 struct vm_area_struct, vm_rb);
1630 if (prev == vma->vm_rb.rb_left) {
1631 gap_start = vma->vm_prev->vm_end;
1632 gap_end = vma->vm_start;
1633 goto check_current;
1634 }
1635 }
1636 }
1637
1638check_highest:
1639 /* Check highest gap, which does not precede any rbtree node */
1640 gap_start = mm->highest_vm_end;
1641 gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */
1642 if (gap_start > high_limit)
1643 return -ENOMEM;
1644
1645found:
1646 /* We found a suitable gap. Clip it with the original low_limit. */
1647 if (gap_start < info->low_limit)
1648 gap_start = info->low_limit;
1649
1650 /* Adjust gap address to the desired alignment */
1651 gap_start += (info->align_offset - gap_start) & info->align_mask;
1652
1653 VM_BUG_ON(gap_start + info->length > info->high_limit);
1654 VM_BUG_ON(gap_start + info->length > gap_end);
1655 return gap_start;
1656}
1657
1658unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
1659{
1660 struct mm_struct *mm = current->mm;
1661 struct vm_area_struct *vma;
1662 unsigned long length, low_limit, high_limit, gap_start, gap_end;
1663
1664 /* Adjust search length to account for worst case alignment overhead */
1665 length = info->length + info->align_mask;
1666 if (length < info->length)
1667 return -ENOMEM;
1668
1669 /*
1670 * Adjust search limits by the desired length.
1671 * See implementation comment at top of unmapped_area().
1672 */
1673 gap_end = info->high_limit;
1674 if (gap_end < length)
1675 return -ENOMEM;
1676 high_limit = gap_end - length;
1677
1678 if (info->low_limit > high_limit)
1679 return -ENOMEM;
1680 low_limit = info->low_limit + length;
1681
1682 /* Check highest gap, which does not precede any rbtree node */
1683 gap_start = mm->highest_vm_end;
1684 if (gap_start <= high_limit)
1685 goto found_highest;
1686
1687 /* Check if rbtree root looks promising */
1688 if (RB_EMPTY_ROOT(&mm->mm_rb))
1689 return -ENOMEM;
1690 vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb);
1691 if (vma->rb_subtree_gap < length)
1692 return -ENOMEM;
1693
1694 while (true) {
1695 /* Visit right subtree if it looks promising */
1696 gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
1697 if (gap_start <= high_limit && vma->vm_rb.rb_right) {
1698 struct vm_area_struct *right =
1699 rb_entry(vma->vm_rb.rb_right,
1700 struct vm_area_struct, vm_rb);
1701 if (right->rb_subtree_gap >= length) {
1702 vma = right;
1703 continue;
1704 }
1705 }
1706
1707check_current:
1708 /* Check if current node has a suitable gap */
1709 gap_end = vma->vm_start;
1710 if (gap_end < low_limit)
1711 return -ENOMEM;
1712 if (gap_start <= high_limit && gap_end - gap_start >= length)
1713 goto found;
1714
1715 /* Visit left subtree if it looks promising */
1716 if (vma->vm_rb.rb_left) {
1717 struct vm_area_struct *left =
1718 rb_entry(vma->vm_rb.rb_left,
1719 struct vm_area_struct, vm_rb);
1720 if (left->rb_subtree_gap >= length) {
1721 vma = left;
1722 continue;
1723 }
1724 }
1725
1726 /* Go back up the rbtree to find next candidate node */
1727 while (true) {
1728 struct rb_node *prev = &vma->vm_rb;
1729 if (!rb_parent(prev))
1730 return -ENOMEM;
1731 vma = rb_entry(rb_parent(prev),
1732 struct vm_area_struct, vm_rb);
1733 if (prev == vma->vm_rb.rb_right) {
1734 gap_start = vma->vm_prev ?
1735 vma->vm_prev->vm_end : 0;
1736 goto check_current;
1737 }
1738 }
1739 }
1740
1741found:
1742 /* We found a suitable gap. Clip it with the original high_limit. */
1743 if (gap_end > info->high_limit)
1744 gap_end = info->high_limit;
1745
1746found_highest:
1747 /* Compute highest gap address at the desired alignment */
1748 gap_end -= info->length;
1749 gap_end -= (gap_end - info->align_offset) & info->align_mask;
1750
1751 VM_BUG_ON(gap_end < info->low_limit);
1752 VM_BUG_ON(gap_end < gap_start);
1753 return gap_end;
1754}
1755
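
The search above prunes the rbtree with rb_subtree_gap instead of walking vmas linearly, but the per-gap conditions it enforces are exactly the ones listed in the comment at the top of unmapped_area(). A linear userspace model of the bottom-up case, with made-up mappings and no alignment handling:

#include <stdio.h>

struct vma { unsigned long start, end; };	/* [start, end), sorted */

/*
 * Return the lowest address where a mapping of 'length' bytes fits
 * between the given vmas, entirely inside [low_limit, high_limit),
 * or 0 if no such gap exists (the real code returns -ENOMEM).
 */
static unsigned long find_gap(const struct vma *vmas, int n,
			      unsigned long length,
			      unsigned long low_limit,
			      unsigned long high_limit)
{
	unsigned long prev_end = 0;
	int i;

	if (high_limit < length || low_limit > high_limit - length)
		return 0;

	for (i = 0; i <= n; i++) {
		/* gap between the previous vma (or 0) and the next one */
		unsigned long gap_end = (i < n) ? vmas[i].start : high_limit;
		unsigned long addr = prev_end > low_limit ? prev_end : low_limit;

		if (gap_end > high_limit)
			gap_end = high_limit;
		if (gap_end >= addr && gap_end - addr >= length)
			return addr;
		if (i < n)
			prev_end = vmas[i].end;
	}
	return 0;
}

int main(void)
{
	/* hypothetical address space layout */
	struct vma vmas[] = { { 0x1000, 0x3000 }, { 0x5000, 0x6000 } };

	printf("0x%lx\n", find_gap(vmas, 2, 0x2000, 0x1000, 0x10000));
	return 0;
}
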
1417/* Get an address range which is currently unmapped. 1756/* Get an address range which is currently unmapped.
1418 * For shmat() with addr=0. 1757 * For shmat() with addr=0.
1419 * 1758 *
@@ -1432,7 +1771,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1432{ 1771{
1433 struct mm_struct *mm = current->mm; 1772 struct mm_struct *mm = current->mm;
1434 struct vm_area_struct *vma; 1773 struct vm_area_struct *vma;
1435 unsigned long start_addr; 1774 struct vm_unmapped_area_info info;
1436 1775
1437 if (len > TASK_SIZE) 1776 if (len > TASK_SIZE)
1438 return -ENOMEM; 1777 return -ENOMEM;
@@ -1447,40 +1786,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
1447 (!vma || addr + len <= vma->vm_start)) 1786 (!vma || addr + len <= vma->vm_start))
1448 return addr; 1787 return addr;
1449 } 1788 }
1450 if (len > mm->cached_hole_size) {
1451 start_addr = addr = mm->free_area_cache;
1452 } else {
1453 start_addr = addr = TASK_UNMAPPED_BASE;
1454 mm->cached_hole_size = 0;
1455 }
1456 1789
1457full_search: 1790 info.flags = 0;
1458 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 1791 info.length = len;
1459 /* At this point: (!vma || addr < vma->vm_end). */ 1792 info.low_limit = TASK_UNMAPPED_BASE;
1460 if (TASK_SIZE - len < addr) { 1793 info.high_limit = TASK_SIZE;
1461 /* 1794 info.align_mask = 0;
1462 * Start a new search - just in case we missed 1795 return vm_unmapped_area(&info);
1463 * some holes.
1464 */
1465 if (start_addr != TASK_UNMAPPED_BASE) {
1466 addr = TASK_UNMAPPED_BASE;
1467 start_addr = addr;
1468 mm->cached_hole_size = 0;
1469 goto full_search;
1470 }
1471 return -ENOMEM;
1472 }
1473 if (!vma || addr + len <= vma->vm_start) {
1474 /*
1475 * Remember the place where we stopped the search:
1476 */
1477 mm->free_area_cache = addr + len;
1478 return addr;
1479 }
1480 if (addr + mm->cached_hole_size < vma->vm_start)
1481 mm->cached_hole_size = vma->vm_start - addr;
1482 addr = vma->vm_end;
1483 }
1484} 1796}
1485#endif 1797#endif
1486 1798
@@ -1505,7 +1817,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1505{ 1817{
1506 struct vm_area_struct *vma; 1818 struct vm_area_struct *vma;
1507 struct mm_struct *mm = current->mm; 1819 struct mm_struct *mm = current->mm;
1508 unsigned long addr = addr0, start_addr; 1820 unsigned long addr = addr0;
1821 struct vm_unmapped_area_info info;
1509 1822
1510 /* requested length too big for entire address space */ 1823 /* requested length too big for entire address space */
1511 if (len > TASK_SIZE) 1824 if (len > TASK_SIZE)
@@ -1523,53 +1836,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
1523 return addr; 1836 return addr;
1524 } 1837 }
1525 1838
1526 /* check if free_area_cache is useful for us */ 1839 info.flags = VM_UNMAPPED_AREA_TOPDOWN;
1527 if (len <= mm->cached_hole_size) { 1840 info.length = len;
1528 mm->cached_hole_size = 0; 1841 info.low_limit = PAGE_SIZE;
1529 mm->free_area_cache = mm->mmap_base; 1842 info.high_limit = mm->mmap_base;
1530 } 1843 info.align_mask = 0;
1531 1844 addr = vm_unmapped_area(&info);
1532try_again:
1533 /* either no address requested or can't fit in requested address hole */
1534 start_addr = addr = mm->free_area_cache;
1535
1536 if (addr < len)
1537 goto fail;
1538
1539 addr -= len;
1540 do {
1541 /*
1542 * Lookup failure means no vma is above this address,
1543 * else if new region fits below vma->vm_start,
1544 * return with success:
1545 */
1546 vma = find_vma(mm, addr);
1547 if (!vma || addr+len <= vma->vm_start)
1548 /* remember the address as a hint for next time */
1549 return (mm->free_area_cache = addr);
1550
1551 /* remember the largest hole we saw so far */
1552 if (addr + mm->cached_hole_size < vma->vm_start)
1553 mm->cached_hole_size = vma->vm_start - addr;
1554
1555 /* try just below the current vma->vm_start */
1556 addr = vma->vm_start-len;
1557 } while (len < vma->vm_start);
1558
1559fail:
1560 /*
1561 * if hint left us with no space for the requested
1562 * mapping then try again:
1563 *
1564 * Note: this is different with the case of bottomup
1565 * which does the fully line-search, but we use find_vma
1566 * here that causes some holes skipped.
1567 */
1568 if (start_addr != mm->mmap_base) {
1569 mm->free_area_cache = mm->mmap_base;
1570 mm->cached_hole_size = 0;
1571 goto try_again;
1572 }
1573 1845
1574 /* 1846 /*
1575 * A failed mmap() very likely causes application failure, 1847 * A failed mmap() very likely causes application failure,
@@ -1577,14 +1849,13 @@ fail:
1577 * can happen with large stack limits and large mmap() 1849 * can happen with large stack limits and large mmap()
1578 * allocations. 1850 * allocations.
1579 */ 1851 */
1580 mm->cached_hole_size = ~0UL; 1852 if (addr & ~PAGE_MASK) {
1581 mm->free_area_cache = TASK_UNMAPPED_BASE; 1853 VM_BUG_ON(addr != -ENOMEM);
1582 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 1854 info.flags = 0;
1583 /* 1855 info.low_limit = TASK_UNMAPPED_BASE;
1584 * Restore the topdown base: 1856 info.high_limit = TASK_SIZE;
1585 */ 1857 addr = vm_unmapped_area(&info);
1586 mm->free_area_cache = mm->mmap_base; 1858 }
1587 mm->cached_hole_size = ~0UL;
1588 1859
1589 return addr; 1860 return addr;
1590} 1861}
@@ -1797,6 +2068,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1797 anon_vma_interval_tree_pre_update_vma(vma); 2068 anon_vma_interval_tree_pre_update_vma(vma);
1798 vma->vm_end = address; 2069 vma->vm_end = address;
1799 anon_vma_interval_tree_post_update_vma(vma); 2070 anon_vma_interval_tree_post_update_vma(vma);
2071 if (vma->vm_next)
2072 vma_gap_update(vma->vm_next);
2073 else
2074 vma->vm_mm->highest_vm_end = address;
1800 perf_event_mmap(vma); 2075 perf_event_mmap(vma);
1801 } 2076 }
1802 } 2077 }
@@ -1851,6 +2126,7 @@ int expand_downwards(struct vm_area_struct *vma,
1851 vma->vm_start = address; 2126 vma->vm_start = address;
1852 vma->vm_pgoff -= grow; 2127 vma->vm_pgoff -= grow;
1853 anon_vma_interval_tree_post_update_vma(vma); 2128 anon_vma_interval_tree_post_update_vma(vma);
2129 vma_gap_update(vma);
1854 perf_event_mmap(vma); 2130 perf_event_mmap(vma);
1855 } 2131 }
1856 } 2132 }
@@ -1973,14 +2249,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1973 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 2249 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1974 vma->vm_prev = NULL; 2250 vma->vm_prev = NULL;
1975 do { 2251 do {
1976 rb_erase(&vma->vm_rb, &mm->mm_rb); 2252 vma_rb_erase(vma, &mm->mm_rb);
1977 mm->map_count--; 2253 mm->map_count--;
1978 tail_vma = vma; 2254 tail_vma = vma;
1979 vma = vma->vm_next; 2255 vma = vma->vm_next;
1980 } while (vma && vma->vm_start < end); 2256 } while (vma && vma->vm_start < end);
1981 *insertion_point = vma; 2257 *insertion_point = vma;
1982 if (vma) 2258 if (vma) {
1983 vma->vm_prev = prev; 2259 vma->vm_prev = prev;
2260 vma_gap_update(vma);
2261 } else
2262 mm->highest_vm_end = prev ? prev->vm_end : 0;
1984 tail_vma->vm_next = NULL; 2263 tail_vma->vm_next = NULL;
1985 if (mm->unmap_area == arch_unmap_area) 2264 if (mm->unmap_area == arch_unmap_area)
1986 addr = prev ? prev->vm_end : mm->mmap_base; 2265 addr = prev ? prev->vm_end : mm->mmap_base;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 79e0f3e24831..18f1ae2b45de 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task;
44int sysctl_oom_dump_tasks = 1; 44int sysctl_oom_dump_tasks = 1;
45static DEFINE_SPINLOCK(zone_scan_lock); 45static DEFINE_SPINLOCK(zone_scan_lock);
46 46
47/*
48 * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
49 * @old_val: old oom_score_adj for compare
50 * @new_val: new oom_score_adj for swap
51 *
52 * Sets the oom_score_adj value for current to @new_val iff its present value is
53 * @old_val. Usually used to reinstate a previous value to prevent racing with
54 * userspacing tuning the value in the interim.
55 */
56void compare_swap_oom_score_adj(int old_val, int new_val)
57{
58 struct sighand_struct *sighand = current->sighand;
59
60 spin_lock_irq(&sighand->siglock);
61 if (current->signal->oom_score_adj == old_val)
62 current->signal->oom_score_adj = new_val;
63 trace_oom_score_adj_update(current);
64 spin_unlock_irq(&sighand->siglock);
65}
66
67/**
68 * test_set_oom_score_adj() - set current's oom_score_adj and return old value
69 * @new_val: new oom_score_adj value
70 *
71 * Sets the oom_score_adj value for current to @new_val with proper
72 * synchronization and returns the old value. Usually used to temporarily
73 * set a value, save the old value in the caller, and then reinstate it later.
74 */
75int test_set_oom_score_adj(int new_val)
76{
77 struct sighand_struct *sighand = current->sighand;
78 int old_val;
79
80 spin_lock_irq(&sighand->siglock);
81 old_val = current->signal->oom_score_adj;
82 current->signal->oom_score_adj = new_val;
83 trace_oom_score_adj_update(current);
84 spin_unlock_irq(&sighand->siglock);
85
86 return old_val;
87}
88
89#ifdef CONFIG_NUMA 47#ifdef CONFIG_NUMA
90/** 48/**
91 * has_intersects_mems_allowed() - check task eligiblity for kill 49 * has_intersects_mems_allowed() - check task eligiblity for kill
@@ -193,7 +151,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
193 if (!p) 151 if (!p)
194 return 0; 152 return 0;
195 153
196 adj = p->signal->oom_score_adj; 154 adj = (long)p->signal->oom_score_adj;
197 if (adj == OOM_SCORE_ADJ_MIN) { 155 if (adj == OOM_SCORE_ADJ_MIN) {
198 task_unlock(p); 156 task_unlock(p);
199 return 0; 157 return 0;
@@ -310,26 +268,20 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
310 if (!task->mm) 268 if (!task->mm)
311 return OOM_SCAN_CONTINUE; 269 return OOM_SCAN_CONTINUE;
312 270
313 if (task->flags & PF_EXITING) { 271 /*
272 * If task is allocating a lot of memory and has been marked to be
273 * killed first if it triggers an oom, then select it.
274 */
275 if (oom_task_origin(task))
276 return OOM_SCAN_SELECT;
277
278 if (task->flags & PF_EXITING && !force_kill) {
314 /* 279 /*
315 * If task is current and is in the process of releasing memory, 280 * If this task is not being ptraced on exit, then wait for it
316 * allow the "kill" to set TIF_MEMDIE, which will allow it to 281 * to finish before killing some other task unnecessarily.
317 * access memory reserves. Otherwise, it may stall forever.
318 *
319 * The iteration isn't broken here, however, in case other
320 * threads are found to have already been oom killed.
321 */ 282 */
322 if (task == current) 283 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
323 return OOM_SCAN_SELECT; 284 return OOM_SCAN_ABORT;
324 else if (!force_kill) {
325 /*
326 * If this task is not being ptraced on exit, then wait
327 * for it to finish before killing some other task
328 * unnecessarily.
329 */
330 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
331 return OOM_SCAN_ABORT;
332 }
333 } 285 }
334 return OOM_SCAN_OK; 286 return OOM_SCAN_OK;
335} 287}
@@ -412,7 +364,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
412 continue; 364 continue;
413 } 365 }
414 366
415 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", 367 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n",
416 task->pid, from_kuid(&init_user_ns, task_uid(task)), 368 task->pid, from_kuid(&init_user_ns, task_uid(task)),
417 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 369 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
418 task->mm->nr_ptes, 370 task->mm->nr_ptes,
@@ -428,7 +380,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
428{ 380{
429 task_lock(current); 381 task_lock(current);
430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 382 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
431 "oom_score_adj=%d\n", 383 "oom_score_adj=%hd\n",
432 current->comm, gfp_mask, order, 384 current->comm, gfp_mask, order,
433 current->signal->oom_score_adj); 385 current->signal->oom_score_adj);
434 cpuset_print_task_mems_allowed(current); 386 cpuset_print_task_mems_allowed(current);
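
oom_score_adj is now a short, so the printk formats switch to %hd and oom_badness() widens the value to long before scaling it. A standalone illustration of the widening and the format change; the page count and the /1000 factor merely stand in for the totalpages-based scaling the real function applies:

#include <stdio.h>

int main(void)
{
	short oom_score_adj = 1000;		/* new type of the per-task knob */
	unsigned long totalpages = 1UL << 20;	/* made-up allocation domain size */

	/* widen before multiplying, as oom_badness() now does */
	long adj = (long)oom_score_adj;
	long bonus = adj * (long)(totalpages / 1000);

	printf("oom_score_adj=%hd adds %ld badness points\n",
	       oom_score_adj, bonus);
	return 0;
}
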
@@ -706,11 +658,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
706 return; 658 return;
707 659
708 /* 660 /*
709 * If current has a pending SIGKILL, then automatically select it. The 661 * If current has a pending SIGKILL or is exiting, then automatically
710 * goal is to allow it to allocate so that it may quickly exit and free 662 * select it. The goal is to allow it to allocate so that it may
711 * its memory. 663 * quickly exit and free its memory.
712 */ 664 */
713 if (fatal_signal_pending(current)) { 665 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
714 set_thread_flag(TIF_MEMDIE); 666 set_thread_flag(TIF_MEMDIE);
715 return; 667 return;
716 } 668 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 830893b2b3c7..6f4271224493 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1069,7 +1069,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
1069} 1069}
1070 1070
1071/* 1071/*
1072 * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() 1072 * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
1073 * will look to see if it needs to start dirty throttling. 1073 * will look to see if it needs to start dirty throttling.
1074 * 1074 *
1075 * If dirty_poll_interval is too low, big NUMA machines will call the expensive 1075 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
@@ -1436,9 +1436,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
1436DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; 1436DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1437 1437
1438/** 1438/**
1439 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 1439 * balance_dirty_pages_ratelimited - balance dirty memory state
1440 * @mapping: address_space which was dirtied 1440 * @mapping: address_space which was dirtied
1441 * @nr_pages_dirtied: number of pages which the caller has just dirtied
1442 * 1441 *
1443 * Processes which are dirtying memory should call in here once for each page 1442 * Processes which are dirtying memory should call in here once for each page
1444 * which was newly dirtied. The function will periodically check the system's 1443 * which was newly dirtied. The function will periodically check the system's
@@ -1449,8 +1448,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
1449 * limit we decrease the ratelimiting by a lot, to prevent individual processes 1448 * limit we decrease the ratelimiting by a lot, to prevent individual processes
1450 * from overshooting the limit by (ratelimit_pages) each. 1449 * from overshooting the limit by (ratelimit_pages) each.
1451 */ 1450 */
1452void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, 1451void balance_dirty_pages_ratelimited(struct address_space *mapping)
1453 unsigned long nr_pages_dirtied)
1454{ 1452{
1455 struct backing_dev_info *bdi = mapping->backing_dev_info; 1453 struct backing_dev_info *bdi = mapping->backing_dev_info;
1456 int ratelimit; 1454 int ratelimit;
@@ -1484,6 +1482,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1484 */ 1482 */
1485 p = &__get_cpu_var(dirty_throttle_leaks); 1483 p = &__get_cpu_var(dirty_throttle_leaks);
1486 if (*p > 0 && current->nr_dirtied < ratelimit) { 1484 if (*p > 0 && current->nr_dirtied < ratelimit) {
1485 unsigned long nr_pages_dirtied;
1487 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); 1486 nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
1488 *p -= nr_pages_dirtied; 1487 *p -= nr_pages_dirtied;
1489 current->nr_dirtied += nr_pages_dirtied; 1488 current->nr_dirtied += nr_pages_dirtied;
@@ -1493,7 +1492,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
1493 if (unlikely(current->nr_dirtied >= ratelimit)) 1492 if (unlikely(current->nr_dirtied >= ratelimit))
1494 balance_dirty_pages(mapping, current->nr_dirtied); 1493 balance_dirty_pages(mapping, current->nr_dirtied);
1495} 1494}
1496EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); 1495EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
1497 1496
1498void throttle_vm_writeout(gfp_t gfp_mask) 1497void throttle_vm_writeout(gfp_t gfp_mask)
1499{ 1498{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e208f0ad68c..5a8d339d282a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -667,11 +667,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
667 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 667 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
668 __free_one_page(page, zone, 0, mt); 668 __free_one_page(page, zone, 0, mt);
669 trace_mm_page_pcpu_drain(page, 0, mt); 669 trace_mm_page_pcpu_drain(page, 0, mt);
670 if (is_migrate_cma(mt)) 670 if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); 671 __mod_zone_page_state(zone, NR_FREE_PAGES, 1);
672 if (is_migrate_cma(mt))
673 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
674 }
672 } while (--to_free && --batch_free && !list_empty(list)); 675 } while (--to_free && --batch_free && !list_empty(list));
673 } 676 }
674 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
675 spin_unlock(&zone->lock); 677 spin_unlock(&zone->lock);
676} 678}
677 679
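
free_pcppages_bulk() previously bumped NR_FREE_PAGES once by the whole batch; it now accounts each page individually and skips pages sitting in MIGRATE_ISOLATE pageblocks, since isolated pages are not really available to the allocator. A small userspace model of that per-page accounting decision, with invented migratetypes:

#include <stdio.h>

enum { MT_MOVABLE, MT_CMA, MT_ISOLATE };	/* toy migratetypes */

int main(void)
{
	int types[] = { MT_MOVABLE, MT_ISOLATE, MT_CMA, MT_MOVABLE };
	long nr_free = 0, nr_free_cma = 0;
	int i;

	for (i = 0; i < 4; i++) {
		if (types[i] == MT_ISOLATE)
			continue;	/* isolated blocks don't count as free */
		nr_free++;
		if (types[i] == MT_CMA)
			nr_free_cma++;
	}

	printf("NR_FREE_PAGES += %ld, NR_FREE_CMA_PAGES += %ld\n",
	       nr_free, nr_free_cma);
	return 0;
}
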
@@ -1392,21 +1394,22 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
1392 1394
1393 zone = page_zone(page); 1395 zone = page_zone(page);
1394 order = page_order(page); 1396 order = page_order(page);
1397 mt = get_pageblock_migratetype(page);
1395 1398
1396 /* Obey watermarks as if the page was being allocated */ 1399 if (mt != MIGRATE_ISOLATE) {
1397 watermark = low_wmark_pages(zone) + (1 << order); 1400 /* Obey watermarks as if the page was being allocated */
1398 if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) 1401 watermark = low_wmark_pages(zone) + (1 << order);
1399 return 0; 1402 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
1403 return 0;
1404
1405 __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
1406 }
1400 1407
1401 /* Remove page from free list */ 1408 /* Remove page from free list */
1402 list_del(&page->lru); 1409 list_del(&page->lru);
1403 zone->free_area[order].nr_free--; 1410 zone->free_area[order].nr_free--;
1404 rmv_page_order(page); 1411 rmv_page_order(page);
1405 1412
1406 mt = get_pageblock_migratetype(page);
1407 if (unlikely(mt != MIGRATE_ISOLATE))
1408 __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
1409
1410 if (alloc_order != order) 1413 if (alloc_order != order)
1411 expand(zone, page, alloc_order, order, 1414 expand(zone, page, alloc_order, order,
1412 &zone->free_area[order], migratetype); 1415 &zone->free_area[order], migratetype);
@@ -1871,7 +1874,7 @@ zonelist_scan:
1871 */ 1874 */
1872 for_each_zone_zonelist_nodemask(zone, z, zonelist, 1875 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1873 high_zoneidx, nodemask) { 1876 high_zoneidx, nodemask) {
1874 if (NUMA_BUILD && zlc_active && 1877 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1875 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1878 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1876 continue; 1879 continue;
1877 if ((alloc_flags & ALLOC_CPUSET) && 1880 if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1917,7 +1920,8 @@ zonelist_scan:
1917 classzone_idx, alloc_flags)) 1920 classzone_idx, alloc_flags))
1918 goto try_this_zone; 1921 goto try_this_zone;
1919 1922
1920 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { 1923 if (IS_ENABLED(CONFIG_NUMA) &&
1924 !did_zlc_setup && nr_online_nodes > 1) {
1921 /* 1925 /*
1922 * we do zlc_setup if there are multiple nodes 1926 * we do zlc_setup if there are multiple nodes
1923 * and before considering the first zone allowed 1927 * and before considering the first zone allowed
@@ -1936,7 +1940,7 @@ zonelist_scan:
1936 * As we may have just activated ZLC, check if the first 1940 * As we may have just activated ZLC, check if the first
1937 * eligible zone has failed zone_reclaim recently. 1941 * eligible zone has failed zone_reclaim recently.
1938 */ 1942 */
1939 if (NUMA_BUILD && zlc_active && 1943 if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
1940 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1944 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1941 continue; 1945 continue;
1942 1946
@@ -1962,11 +1966,11 @@ try_this_zone:
1962 if (page) 1966 if (page)
1963 break; 1967 break;
1964this_zone_full: 1968this_zone_full:
1965 if (NUMA_BUILD) 1969 if (IS_ENABLED(CONFIG_NUMA))
1966 zlc_mark_zone_full(zonelist, z); 1970 zlc_mark_zone_full(zonelist, z);
1967 } 1971 }
1968 1972
1969 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1973 if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
1970 /* Disable zlc cache for second zonelist scan */ 1974 /* Disable zlc cache for second zonelist scan */
1971 zlc_active = 0; 1975 zlc_active = 0;
1972 goto zonelist_scan; 1976 goto zonelist_scan;
@@ -2266,7 +2270,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2266 return NULL; 2270 return NULL;
2267 2271
2268 /* After successful reclaim, reconsider all zones for allocation */ 2272 /* After successful reclaim, reconsider all zones for allocation */
2269 if (NUMA_BUILD) 2273 if (IS_ENABLED(CONFIG_NUMA))
2270 zlc_clear_zones_full(zonelist); 2274 zlc_clear_zones_full(zonelist);
2271 2275
2272retry: 2276retry:
@@ -2412,7 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2412 * allowed per node queues are empty and that nodes are 2416 * allowed per node queues are empty and that nodes are
2413 * over allocated. 2417 * over allocated.
2414 */ 2418 */
2415 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 2419 if (IS_ENABLED(CONFIG_NUMA) &&
2420 (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
2416 goto nopage; 2421 goto nopage;
2417 2422
2418restart: 2423restart:
@@ -2819,7 +2824,7 @@ unsigned int nr_free_pagecache_pages(void)
2819 2824
2820static inline void show_node(struct zone *zone) 2825static inline void show_node(struct zone *zone)
2821{ 2826{
2822 if (NUMA_BUILD) 2827 if (IS_ENABLED(CONFIG_NUMA))
2823 printk("Node %d ", zone_to_nid(zone)); 2828 printk("Node %d ", zone_to_nid(zone));
2824} 2829}
2825 2830
@@ -2877,6 +2882,31 @@ out:
2877 2882
2878#define K(x) ((x) << (PAGE_SHIFT-10)) 2883#define K(x) ((x) << (PAGE_SHIFT-10))
2879 2884
2885static void show_migration_types(unsigned char type)
2886{
2887 static const char types[MIGRATE_TYPES] = {
2888 [MIGRATE_UNMOVABLE] = 'U',
2889 [MIGRATE_RECLAIMABLE] = 'E',
2890 [MIGRATE_MOVABLE] = 'M',
2891 [MIGRATE_RESERVE] = 'R',
2892#ifdef CONFIG_CMA
2893 [MIGRATE_CMA] = 'C',
2894#endif
2895 [MIGRATE_ISOLATE] = 'I',
2896 };
2897 char tmp[MIGRATE_TYPES + 1];
2898 char *p = tmp;
2899 int i;
2900
2901 for (i = 0; i < MIGRATE_TYPES; i++) {
2902 if (type & (1 << i))
2903 *p++ = types[i];
2904 }
2905
2906 *p = '\0';
2907 printk("(%s) ", tmp);
2908}
2909
2880/* 2910/*
2881 * Show free area list (used inside shift_scroll-lock stuff) 2911 * Show free area list (used inside shift_scroll-lock stuff)
2882 * We also calculate the percentage fragmentation. We do this by counting the 2912 * We also calculate the percentage fragmentation. We do this by counting the
@@ -3005,6 +3035,7 @@ void show_free_areas(unsigned int filter)
3005 3035
3006 for_each_populated_zone(zone) { 3036 for_each_populated_zone(zone) {
3007 unsigned long nr[MAX_ORDER], flags, order, total = 0; 3037 unsigned long nr[MAX_ORDER], flags, order, total = 0;
3038 unsigned char types[MAX_ORDER];
3008 3039
3009 if (skip_free_areas_node(filter, zone_to_nid(zone))) 3040 if (skip_free_areas_node(filter, zone_to_nid(zone)))
3010 continue; 3041 continue;
@@ -3013,12 +3044,24 @@ void show_free_areas(unsigned int filter)
3013 3044
3014 spin_lock_irqsave(&zone->lock, flags); 3045 spin_lock_irqsave(&zone->lock, flags);
3015 for (order = 0; order < MAX_ORDER; order++) { 3046 for (order = 0; order < MAX_ORDER; order++) {
3016 nr[order] = zone->free_area[order].nr_free; 3047 struct free_area *area = &zone->free_area[order];
3048 int type;
3049
3050 nr[order] = area->nr_free;
3017 total += nr[order] << order; 3051 total += nr[order] << order;
3052
3053 types[order] = 0;
3054 for (type = 0; type < MIGRATE_TYPES; type++) {
3055 if (!list_empty(&area->free_list[type]))
3056 types[order] |= 1 << type;
3057 }
3018 } 3058 }
3019 spin_unlock_irqrestore(&zone->lock, flags); 3059 spin_unlock_irqrestore(&zone->lock, flags);
3020 for (order = 0; order < MAX_ORDER; order++) 3060 for (order = 0; order < MAX_ORDER; order++) {
3021 printk("%lu*%lukB ", nr[order], K(1UL) << order); 3061 printk("%lu*%lukB ", nr[order], K(1UL) << order);
3062 if (nr[order])
3063 show_migration_types(types[order]);
3064 }
3022 printk("= %lukB\n", K(total)); 3065 printk("= %lukB\n", K(total));
3023 } 3066 }
3024 3067
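
The new show_free_areas() bookkeeping above records, per order, which migratetype free lists are non-empty as a one-bit-per-type mask, and show_migration_types() later decodes that mask into the single-letter codes U/E/M/R/C/I. Here is a self-contained sketch of the same encode/decode round trip, with a reduced demo enum standing in for the kernel's migrate types.

#include <stdio.h>
#include <stdbool.h>

/* Reduced, illustrative migrate-type set (no CMA entry in this sketch). */
enum demo_migrate_type {
        DEMO_UNMOVABLE,
        DEMO_RECLAIMABLE,
        DEMO_MOVABLE,
        DEMO_RESERVE,
        DEMO_ISOLATE,
        DEMO_MIGRATE_TYPES
};

/* Same decoding scheme as show_migration_types(): one letter per set bit. */
static void show_migration_types(unsigned char type)
{
        static const char codes[DEMO_MIGRATE_TYPES] = { 'U', 'E', 'M', 'R', 'I' };
        char tmp[DEMO_MIGRATE_TYPES + 1];
        char *p = tmp;

        for (int i = 0; i < DEMO_MIGRATE_TYPES; i++)
                if (type & (1 << i))
                        *p++ = codes[i];
        *p = '\0';
        printf("(%s) ", tmp);
}

int main(void)
{
        /* Pretend these free lists are non-empty for some order. */
        bool list_nonempty[DEMO_MIGRATE_TYPES] = {
                [DEMO_RECLAIMABLE] = true, [DEMO_MOVABLE] = true,
        };
        unsigned char types = 0;

        for (int t = 0; t < DEMO_MIGRATE_TYPES; t++)
                if (list_nonempty[t])
                        types |= 1 << t;

        printf("32*4kB ");
        show_migration_types(types);    /* prints "(EM) " */
        printf("= 128kB\n");
        return 0;
}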
@@ -5175,10 +5218,6 @@ static void __setup_per_zone_wmarks(void)
5175 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5218 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5176 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5219 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5177 5220
5178 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
5179 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
5180 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
5181
5182 setup_zone_migrate_reserve(zone); 5221 setup_zone_migrate_reserve(zone);
5183 spin_unlock_irqrestore(&zone->lock, flags); 5222 spin_unlock_irqrestore(&zone->lock, flags);
5184 } 5223 }
@@ -5576,7 +5615,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5576 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't 5615 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5577 * expect this function should be exact. 5616 * expect this function should be exact.
5578 */ 5617 */
5579bool has_unmovable_pages(struct zone *zone, struct page *page, int count) 5618bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
5619 bool skip_hwpoisoned_pages)
5580{ 5620{
5581 unsigned long pfn, iter, found; 5621 unsigned long pfn, iter, found;
5582 int mt; 5622 int mt;
@@ -5611,6 +5651,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5611 continue; 5651 continue;
5612 } 5652 }
5613 5653
5654 /*
5655 * The HWPoisoned page may be not in buddy system, and
5656 * page_count() is not 0.
5657 */
5658 if (skip_hwpoisoned_pages && PageHWPoison(page))
5659 continue;
5660
5614 if (!PageLRU(page)) 5661 if (!PageLRU(page))
5615 found++; 5662 found++;
5616 /* 5663 /*
@@ -5653,7 +5700,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5653 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5700 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5654 return false; 5701 return false;
5655 5702
5656 return !has_unmovable_pages(zone, page, 0); 5703 return !has_unmovable_pages(zone, page, 0, true);
5657} 5704}
5658 5705
5659#ifdef CONFIG_CMA 5706#ifdef CONFIG_CMA
@@ -5711,58 +5758,10 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
5711 0, false, MIGRATE_SYNC); 5758 0, false, MIGRATE_SYNC);
5712 } 5759 }
5713 5760
5714 putback_lru_pages(&cc->migratepages); 5761 putback_movable_pages(&cc->migratepages);
5715 return ret > 0 ? 0 : ret; 5762 return ret > 0 ? 0 : ret;
5716} 5763}
5717 5764
5718/*
5719 * Update zone's cma pages counter used for watermark level calculation.
5720 */
5721static inline void __update_cma_watermarks(struct zone *zone, int count)
5722{
5723 unsigned long flags;
5724 spin_lock_irqsave(&zone->lock, flags);
5725 zone->min_cma_pages += count;
5726 spin_unlock_irqrestore(&zone->lock, flags);
5727 setup_per_zone_wmarks();
5728}
5729
5730/*
5731 * Trigger memory pressure bump to reclaim some pages in order to be able to
5732 * allocate 'count' pages in single page units. Does similar work as
5733 *__alloc_pages_slowpath() function.
5734 */
5735static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5736{
5737 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5738 struct zonelist *zonelist = node_zonelist(0, gfp_mask);
5739 int did_some_progress = 0;
5740 int order = 1;
5741
5742 /*
5743 * Increase level of watermarks to force kswapd do his job
5744 * to stabilise at new watermark level.
5745 */
5746 __update_cma_watermarks(zone, count);
5747
5748 /* Obey watermarks as if the page was being allocated */
5749 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
5750 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
5751
5752 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
5753 NULL);
5754 if (!did_some_progress) {
5755 /* Exhausted what can be done so it's blamo time */
5756 out_of_memory(zonelist, gfp_mask, order, NULL, false);
5757 }
5758 }
5759
5760 /* Restore original watermark levels. */
5761 __update_cma_watermarks(zone, -count);
5762
5763 return count;
5764}
5765
5766/** 5765/**
5767 * alloc_contig_range() -- tries to allocate given range of pages 5766 * alloc_contig_range() -- tries to allocate given range of pages
5768 * @start: start PFN to allocate 5767 * @start: start PFN to allocate
@@ -5786,7 +5785,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5786int alloc_contig_range(unsigned long start, unsigned long end, 5785int alloc_contig_range(unsigned long start, unsigned long end,
5787 unsigned migratetype) 5786 unsigned migratetype)
5788{ 5787{
5789 struct zone *zone = page_zone(pfn_to_page(start));
5790 unsigned long outer_start, outer_end; 5788 unsigned long outer_start, outer_end;
5791 int ret = 0, order; 5789 int ret = 0, order;
5792 5790
@@ -5824,7 +5822,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5824 */ 5822 */
5825 5823
5826 ret = start_isolate_page_range(pfn_max_align_down(start), 5824 ret = start_isolate_page_range(pfn_max_align_down(start),
5827 pfn_max_align_up(end), migratetype); 5825 pfn_max_align_up(end), migratetype,
5826 false);
5828 if (ret) 5827 if (ret)
5829 return ret; 5828 return ret;
5830 5829
@@ -5863,18 +5862,13 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5863 } 5862 }
5864 5863
5865 /* Make sure the range is really isolated. */ 5864 /* Make sure the range is really isolated. */
5866 if (test_pages_isolated(outer_start, end)) { 5865 if (test_pages_isolated(outer_start, end, false)) {
5867 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", 5866 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5868 outer_start, end); 5867 outer_start, end);
5869 ret = -EBUSY; 5868 ret = -EBUSY;
5870 goto done; 5869 goto done;
5871 } 5870 }
5872 5871
5873 /*
5874 * Reclaim enough pages to make sure that contiguous allocation
5875 * will not starve the system.
5876 */
5877 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5878 5872
5879 /* Grab isolated pages from freelists. */ 5873 /* Grab isolated pages from freelists. */
5880 outer_end = isolate_freepages_range(&cc, outer_start, end); 5874 outer_end = isolate_freepages_range(&cc, outer_start, end);
@@ -5932,7 +5926,6 @@ void __meminit zone_pcp_update(struct zone *zone)
5932} 5926}
5933#endif 5927#endif
5934 5928
5935#ifdef CONFIG_MEMORY_HOTREMOVE
5936void zone_pcp_reset(struct zone *zone) 5929void zone_pcp_reset(struct zone *zone)
5937{ 5930{
5938 unsigned long flags; 5931 unsigned long flags;
@@ -5952,6 +5945,7 @@ void zone_pcp_reset(struct zone *zone)
5952 local_irq_restore(flags); 5945 local_irq_restore(flags);
5953} 5946}
5954 5947
5948#ifdef CONFIG_MEMORY_HOTREMOVE
5955/* 5949/*
5956 * All pages in the range must be isolated before calling this. 5950 * All pages in the range must be isolated before calling this.
5957 */ 5951 */
@@ -5978,6 +5972,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5978 continue; 5972 continue;
5979 } 5973 }
5980 page = pfn_to_page(pfn); 5974 page = pfn_to_page(pfn);
5975 /*
5976 * The HWPoisoned page may be not in buddy system, and
5977 * page_count() is not 0.
5978 */
5979 if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
5980 pfn++;
5981 SetPageReserved(page);
5982 continue;
5983 }
5984
5981 BUG_ON(page_count(page)); 5985 BUG_ON(page_count(page));
5982 BUG_ON(!PageBuddy(page)); 5986 BUG_ON(!PageBuddy(page));
5983 order = page_order(page); 5987 order = page_order(page);
@@ -5988,8 +5992,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
5988 list_del(&page->lru); 5992 list_del(&page->lru);
5989 rmv_page_order(page); 5993 rmv_page_order(page);
5990 zone->free_area[order].nr_free--; 5994 zone->free_area[order].nr_free--;
5991 __mod_zone_page_state(zone, NR_FREE_PAGES,
5992 - (1UL << order));
5993 for (i = 0; i < (1 << order); i++) 5995 for (i = 0; i < (1 << order); i++)
5994 SetPageReserved((page+i)); 5996 SetPageReserved((page+i));
5995 pfn += (1 << order); 5997 pfn += (1 << order);
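
The __offline_isolated_pages() hunk above steps over hwpoisoned pages one pfn at a time and marks them reserved, since they were never returned to the buddy allocator, while free chunks are still consumed 1 << order pages at a stretch. A toy model of that walk, with plain arrays standing in for struct page state:

#include <stdio.h>

#define DEMO_PAGES 16

static int page_order[DEMO_PAGES];      /* order of the free chunk starting here */
static int page_hwpoison[DEMO_PAGES];
static int page_reserved[DEMO_PAGES];

static void offline_isolated_pages(unsigned long start, unsigned long end)
{
        unsigned long pfn = start;

        while (pfn < end) {
                /* Poisoned page: not in the buddy lists, reserve it singly. */
                if (page_hwpoison[pfn]) {
                        page_reserved[pfn] = 1;
                        pfn++;
                        continue;
                }
                /* Free buddy chunk: reserve the whole 1 << order run. */
                int order = page_order[pfn];
                for (unsigned long i = 0; i < (1UL << order); i++)
                        page_reserved[pfn + i] = 1;
                pfn += 1UL << order;
        }
}

int main(void)
{
        page_order[0] = 2;      /* pfns 0-3 form one order-2 buddy chunk */
        page_hwpoison[4] = 1;   /* pfn 4 is poisoned, never freed to buddy */
        page_order[5] = 0;

        offline_isolated_pages(0, 6);
        for (int i = 0; i < 6; i++)
                printf("pfn %d reserved=%d\n", i, page_reserved[i]);
        return 0;
}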
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5ddad0c6daa6..44db00e253ed 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
251 mn->nr_pages, mn->status_change_nid); 251 mn->nr_pages, mn->status_change_nid);
252 break; 252 break;
253 case MEM_CANCEL_ONLINE: 253 case MEM_CANCEL_ONLINE:
254 offline_page_cgroup(mn->start_pfn,
255 mn->nr_pages, mn->status_change_nid);
256 break;
254 case MEM_GOING_OFFLINE: 257 case MEM_GOING_OFFLINE:
255 break; 258 break;
256 case MEM_ONLINE: 259 case MEM_ONLINE:
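
The page_cgroup.c fix makes MEM_CANCEL_ONLINE release the per-section metadata that MEM_GOING_ONLINE allocated, instead of falling through to a no-op. A minimal sketch of that undo-on-cancel shape in a hotplug-style callback; the demo_* names and events are invented for illustration.

#include <stdio.h>
#include <stdlib.h>

/* Illustrative hotplug-style events; not the kernel's <linux/memory.h>. */
enum demo_event {
        DEMO_GOING_ONLINE,
        DEMO_CANCEL_ONLINE,
        DEMO_ONLINE,
        DEMO_GOING_OFFLINE,
        DEMO_OFFLINE,
};

static void *section_metadata;

static int demo_alloc_metadata(void)
{
        section_metadata = calloc(1, 4096);
        return section_metadata ? 0 : -1;
}

static void demo_free_metadata(void)
{
        free(section_metadata);
        section_metadata = NULL;
}

/*
 * If GOING_ONLINE allocated per-section state, then CANCEL_ONLINE must
 * release it again, exactly like OFFLINE, instead of silently doing nothing.
 */
static int demo_callback(enum demo_event ev)
{
        switch (ev) {
        case DEMO_GOING_ONLINE:
                return demo_alloc_metadata();
        case DEMO_CANCEL_ONLINE:
        case DEMO_OFFLINE:
                demo_free_metadata();
                break;
        default:
                break;
        }
        return 0;
}

int main(void)
{
        if (demo_callback(DEMO_GOING_ONLINE))
                return 1;
        /* Something later vetoed the online, so the undo path must run. */
        demo_callback(DEMO_CANCEL_ONLINE);
        printf("metadata released: %s\n", section_metadata ? "no" : "yes");
        return 0;
}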
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f2f5b4818e94..9d2264ea4606 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -30,7 +30,7 @@ static void restore_pageblock_isolate(struct page *page, int migratetype)
30 zone->nr_pageblock_isolate--; 30 zone->nr_pageblock_isolate--;
31} 31}
32 32
33int set_migratetype_isolate(struct page *page) 33int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages)
34{ 34{
35 struct zone *zone; 35 struct zone *zone;
36 unsigned long flags, pfn; 36 unsigned long flags, pfn;
@@ -66,7 +66,8 @@ int set_migratetype_isolate(struct page *page)
66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
67 * We just check MOVABLE pages. 67 * We just check MOVABLE pages.
68 */ 68 */
69 if (!has_unmovable_pages(zone, page, arg.pages_found)) 69 if (!has_unmovable_pages(zone, page, arg.pages_found,
70 skip_hwpoisoned_pages))
70 ret = 0; 71 ret = 0;
71 72
72 /* 73 /*
@@ -134,7 +135,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
134 * Returns 0 on success and -EBUSY if any part of range cannot be isolated. 135 * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
135 */ 136 */
136int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, 137int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
137 unsigned migratetype) 138 unsigned migratetype, bool skip_hwpoisoned_pages)
138{ 139{
139 unsigned long pfn; 140 unsigned long pfn;
140 unsigned long undo_pfn; 141 unsigned long undo_pfn;
@@ -147,7 +148,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
147 pfn < end_pfn; 148 pfn < end_pfn;
148 pfn += pageblock_nr_pages) { 149 pfn += pageblock_nr_pages) {
149 page = __first_valid_page(pfn, pageblock_nr_pages); 150 page = __first_valid_page(pfn, pageblock_nr_pages);
150 if (page && set_migratetype_isolate(page)) { 151 if (page &&
152 set_migratetype_isolate(page, skip_hwpoisoned_pages)) {
151 undo_pfn = pfn; 153 undo_pfn = pfn;
152 goto undo; 154 goto undo;
153 } 155 }
@@ -190,7 +192,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
190 * Returns 1 if all pages in the range are isolated. 192 * Returns 1 if all pages in the range are isolated.
191 */ 193 */
192static int 194static int
193__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 195__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
196 bool skip_hwpoisoned_pages)
194{ 197{
195 struct page *page; 198 struct page *page;
196 199
@@ -220,6 +223,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
220 else if (page_count(page) == 0 && 223 else if (page_count(page) == 0 &&
221 get_freepage_migratetype(page) == MIGRATE_ISOLATE) 224 get_freepage_migratetype(page) == MIGRATE_ISOLATE)
222 pfn += 1; 225 pfn += 1;
226 else if (skip_hwpoisoned_pages && PageHWPoison(page)) {
227 /*
228 * The HWPoisoned page may be not in buddy
229 * system, and page_count() is not 0.
230 */
231 pfn++;
232 continue;
233 }
223 else 234 else
224 break; 235 break;
225 } 236 }
@@ -228,7 +239,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
228 return 1; 239 return 1;
229} 240}
230 241
231int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 242int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
243 bool skip_hwpoisoned_pages)
232{ 244{
233 unsigned long pfn, flags; 245 unsigned long pfn, flags;
234 struct page *page; 246 struct page *page;
@@ -251,7 +263,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
251 /* Check all pages are free or Marked as ISOLATED */ 263 /* Check all pages are free or Marked as ISOLATED */
252 zone = page_zone(page); 264 zone = page_zone(page);
253 spin_lock_irqsave(&zone->lock, flags); 265 spin_lock_irqsave(&zone->lock, flags);
254 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); 266 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
267 skip_hwpoisoned_pages);
255 spin_unlock_irqrestore(&zone->lock, flags); 268 spin_unlock_irqrestore(&zone->lock, flags);
256 return ret ? 0 : -EBUSY; 269 return ret ? 0 : -EBUSY;
257} 270}
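
Throughout page_isolation.c a skip_hwpoisoned_pages flag is now threaded down to the page-by-page check, so that poisoned pages, which may sit outside the buddy allocator with a nonzero refcount, do not make an otherwise isolated range look busy. A small sketch of that check over toy page descriptors (the demo_page flags stand in for PageBuddy() and PageHWPoison()):

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct demo_page {
        bool buddy;     /* free and in the buddy allocator */
        bool isolated;  /* free with MIGRATE_ISOLATE set */
        bool hwpoison;  /* marked hardware-poisoned */
};

/*
 * Shape of the new __test_page_isolated_in_pageblock() logic: every page must
 * be free, except that the caller may ask for HWPoisoned pages to be skipped.
 */
static bool range_is_isolated(const struct demo_page *pages, size_t n,
                              bool skip_hwpoisoned_pages)
{
        for (size_t i = 0; i < n; i++) {
                if (pages[i].buddy || pages[i].isolated)
                        continue;
                if (skip_hwpoisoned_pages && pages[i].hwpoison)
                        continue;
                return false;
        }
        return true;
}

int main(void)
{
        struct demo_page block[3] = {
                { .isolated = true },
                { .hwpoison = true },   /* pinned by memory-failure handling */
                { .buddy = true },
        };

        printf("strict check:  %d\n", range_is_isolated(block, 3, false));
        printf("lenient check: %d\n", range_is_isolated(block, 3, true));
        return 0;
}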
diff --git a/mm/rmap.c b/mm/rmap.c
index 2ee1ef0f317b..cf7e99a87c32 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -562,6 +562,27 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
562 return address; 562 return address;
563} 563}
564 564
565pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
566{
567 pgd_t *pgd;
568 pud_t *pud;
569 pmd_t *pmd = NULL;
570
571 pgd = pgd_offset(mm, address);
572 if (!pgd_present(*pgd))
573 goto out;
574
575 pud = pud_offset(pgd, address);
576 if (!pud_present(*pud))
577 goto out;
578
579 pmd = pmd_offset(pud, address);
580 if (!pmd_present(*pmd))
581 pmd = NULL;
582out:
583 return pmd;
584}
585
565/* 586/*
566 * Check that @page is mapped at @address into @mm. 587 * Check that @page is mapped at @address into @mm.
567 * 588 *
@@ -574,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
574pte_t *__page_check_address(struct page *page, struct mm_struct *mm, 595pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
575 unsigned long address, spinlock_t **ptlp, int sync) 596 unsigned long address, spinlock_t **ptlp, int sync)
576{ 597{
577 pgd_t *pgd;
578 pud_t *pud;
579 pmd_t *pmd; 598 pmd_t *pmd;
580 pte_t *pte; 599 pte_t *pte;
581 spinlock_t *ptl; 600 spinlock_t *ptl;
@@ -586,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
586 goto check; 605 goto check;
587 } 606 }
588 607
589 pgd = pgd_offset(mm, address); 608 pmd = mm_find_pmd(mm, address);
590 if (!pgd_present(*pgd)) 609 if (!pmd)
591 return NULL;
592
593 pud = pud_offset(pgd, address);
594 if (!pud_present(*pud))
595 return NULL; 610 return NULL;
596 611
597 pmd = pmd_offset(pud, address);
598 if (!pmd_present(*pmd))
599 return NULL;
600 if (pmd_trans_huge(*pmd)) 612 if (pmd_trans_huge(*pmd))
601 return NULL; 613 return NULL;
602 614
@@ -1139,9 +1151,11 @@ void page_remove_rmap(struct page *page)
1139 * containing the swap entry, but page not yet written to swap. 1151 * containing the swap entry, but page not yet written to swap.
1140 * 1152 *
1141 * And we can skip it on file pages, so long as the filesystem 1153 * And we can skip it on file pages, so long as the filesystem
1142 * participates in dirty tracking; but need to catch shm and tmpfs 1154 * participates in dirty tracking (note that this is not only an
1143 * and ramfs pages which have been modified since creation by read 1155 * optimization but also solves problems caused by dirty flag in
1144 * fault. 1156 * storage key getting set by a write from inside kernel); but need to
1157 * catch shm and tmpfs and ramfs pages which have been modified since
1158 * creation by read fault.
1145 * 1159 *
1146 * Note that mapping must be decided above, before decrementing 1160 * Note that mapping must be decided above, before decrementing
1147 * mapcount (which luckily provides a barrier): once page is unmapped, 1161 * mapcount (which luckily provides a barrier): once page is unmapped,
@@ -1345,8 +1359,6 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1345 struct vm_area_struct *vma, struct page *check_page) 1359 struct vm_area_struct *vma, struct page *check_page)
1346{ 1360{
1347 struct mm_struct *mm = vma->vm_mm; 1361 struct mm_struct *mm = vma->vm_mm;
1348 pgd_t *pgd;
1349 pud_t *pud;
1350 pmd_t *pmd; 1362 pmd_t *pmd;
1351 pte_t *pte; 1363 pte_t *pte;
1352 pte_t pteval; 1364 pte_t pteval;
@@ -1366,16 +1378,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1366 if (end > vma->vm_end) 1378 if (end > vma->vm_end)
1367 end = vma->vm_end; 1379 end = vma->vm_end;
1368 1380
1369 pgd = pgd_offset(mm, address); 1381 pmd = mm_find_pmd(mm, address);
1370 if (!pgd_present(*pgd)) 1382 if (!pmd)
1371 return ret;
1372
1373 pud = pud_offset(pgd, address);
1374 if (!pud_present(*pud))
1375 return ret;
1376
1377 pmd = pmd_offset(pud, address);
1378 if (!pmd_present(*pmd))
1379 return ret; 1383 return ret;
1380 1384
1381 mmun_start = address; 1385 mmun_start = address;
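
The rmap.c hunks collapse three open-coded pgd/pud/pmd descents into a single mm_find_pmd() helper that returns NULL as soon as any level is missing. A minimal sketch of the same consolidation on an invented three-level structure (all demo_* names are illustrative, not kernel types):

#include <stdio.h>
#include <stddef.h>

struct demo_pmd { int present; };
struct demo_pud { int present; struct demo_pmd *pmd; };
struct demo_pgd { int present; struct demo_pud *pud; };
struct demo_mm  { struct demo_pgd *pgd; };

/*
 * Do the whole descent in one place and hand back NULL as soon as any level
 * is missing, so callers stop open-coding the three-step walk.
 */
static struct demo_pmd *demo_find_pmd(struct demo_mm *mm)
{
        struct demo_pgd *pgd = mm->pgd;
        struct demo_pud *pud;
        struct demo_pmd *pmd = NULL;

        if (!pgd || !pgd->present)
                goto out;
        pud = pgd->pud;
        if (!pud || !pud->present)
                goto out;
        pmd = pud->pmd;
        if (pmd && !pmd->present)
                pmd = NULL;
out:
        return pmd;
}

int main(void)
{
        struct demo_pmd pmd = { .present = 1 };
        struct demo_pud pud = { .present = 1, .pmd = &pmd };
        struct demo_pgd pgd = { .present = 1, .pud = &pud };
        struct demo_mm mm = { .pgd = &pgd };

        printf("pmd found: %s\n", demo_find_pmd(&mm) ? "yes" : "no");
        pud.present = 0;
        printf("pmd found after dropping pud: %s\n",
               demo_find_pmd(&mm) ? "yes" : "no");
        return 0;
}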
diff --git a/mm/slub.c b/mm/slub.c
index a0d698467f70..487f0bdd53c0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3573,7 +3573,7 @@ static void slab_mem_offline_callback(void *arg)
3573 struct memory_notify *marg = arg; 3573 struct memory_notify *marg = arg;
3574 int offline_node; 3574 int offline_node;
3575 3575
3576 offline_node = marg->status_change_nid; 3576 offline_node = marg->status_change_nid_normal;
3577 3577
3578 /* 3578 /*
3579 * If the node still has available memory. we need kmem_cache_node 3579 * If the node still has available memory. we need kmem_cache_node
@@ -3606,7 +3606,7 @@ static int slab_mem_going_online_callback(void *arg)
3606 struct kmem_cache_node *n; 3606 struct kmem_cache_node *n;
3607 struct kmem_cache *s; 3607 struct kmem_cache *s;
3608 struct memory_notify *marg = arg; 3608 struct memory_notify *marg = arg;
3609 int nid = marg->status_change_nid; 3609 int nid = marg->status_change_nid_normal;
3610 int ret = 0; 3610 int ret = 0;
3611 3611
3612 /* 3612 /*
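
The slub.c change switches the hotplug callbacks from status_change_nid to status_change_nid_normal, on the reasoning that slab pages can only come from nodes with NORMAL memory, so a node that only gained or lost MOVABLE memory needs no per-node slab setup. A toy sketch of that decision; the struct below is an invented stand-in for struct memory_notify.

#include <stdio.h>

/* Toy version of the notifier payload: -1 means "no such node affected". */
struct demo_memory_notify {
        int status_change_nid;          /* node transition for any zone */
        int status_change_nid_normal;   /* node transition for NORMAL memory */
};

static void slab_mem_going_online(const struct demo_memory_notify *arg)
{
        int nid = arg->status_change_nid_normal;

        if (nid < 0) {
                printf("no NORMAL memory added, nothing to allocate\n");
                return;
        }
        printf("would allocate per-node slab structures for node %d\n", nid);
}

int main(void)
{
        struct demo_memory_notify movable_only = {
                .status_change_nid = 1, .status_change_nid_normal = -1,
        };
        struct demo_memory_notify with_normal = {
                .status_change_nid = 2, .status_change_nid_normal = 2,
        };

        slab_mem_going_online(&movable_only);
        slab_mem_going_online(&with_normal);
        return 0;
}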
diff --git a/mm/sparse.c b/mm/sparse.c
index a83de2f72b30..6b5fb762e2ca 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
638got_map_page: 638got_map_page:
639 ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); 639 ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
640got_map_ptr: 640got_map_ptr:
641 memset(ret, 0, memmap_size);
642 641
643 return ret; 642 return ret;
644} 643}
@@ -758,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
758 goto out; 757 goto out;
759 } 758 }
760 759
760 memset(memmap, 0, sizeof(struct page) * nr_pages);
761
761 ms->section_mem_map |= SECTION_MARKED_PRESENT; 762 ms->section_mem_map |= SECTION_MARKED_PRESENT;
762 763
763 ret = sparse_init_one_section(ms, section_nr, memmap, usemap); 764 ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
@@ -771,6 +772,27 @@ out:
771 return ret; 772 return ret;
772} 773}
773 774
775#ifdef CONFIG_MEMORY_FAILURE
776static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
777{
778 int i;
779
780 if (!memmap)
781 return;
782
783 for (i = 0; i < PAGES_PER_SECTION; i++) {
784 if (PageHWPoison(&memmap[i])) {
785 atomic_long_sub(1, &mce_bad_pages);
786 ClearPageHWPoison(&memmap[i]);
787 }
788 }
789}
790#else
791static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
792{
793}
794#endif
795
774void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) 796void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
775{ 797{
776 struct page *memmap = NULL; 798 struct page *memmap = NULL;
@@ -784,6 +806,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
784 ms->pageblock_flags = NULL; 806 ms->pageblock_flags = NULL;
785 } 807 }
786 808
809 clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION);
787 free_section_usemap(memmap, usemap); 810 free_section_usemap(memmap, usemap);
788} 811}
789#endif 812#endif
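
sparse_remove_one_section() now calls clear_hwpoisoned_pages() unconditionally and lets the CONFIG_MEMORY_FAILURE #ifdef live with the helper, the disabled case getting an empty inline stub. A self-contained sketch of that idiom; DEMO_MEMORY_FAILURE and the counter are stand-ins for CONFIG_MEMORY_FAILURE and mce_bad_pages.

#include <stdio.h>

#define DEMO_MEMORY_FAILURE 1   /* comment out to compile the stub instead */

static long bad_pages_counter;

#ifdef DEMO_MEMORY_FAILURE
static void clear_hwpoisoned_pages(int *poisoned, int nr_pages)
{
        for (int i = 0; i < nr_pages; i++) {
                if (poisoned[i]) {
                        bad_pages_counter--;
                        poisoned[i] = 0;
                }
        }
}
#else
static inline void clear_hwpoisoned_pages(int *poisoned, int nr_pages)
{
        /* Nothing to undo when memory-failure handling is compiled out. */
}
#endif

int main(void)
{
        int poisoned[4] = { 0, 1, 0, 1 };

        bad_pages_counter = 2;
        /* The teardown path calls this unconditionally, keeping it #ifdef-free. */
        clear_hwpoisoned_pages(poisoned, 4);
        printf("bad pages left: %ld\n", bad_pages_counter);
        return 0;
}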
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f91a25547ffe..e97a0e5aea91 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1443 return generic_swapfile_activate(sis, swap_file, span); 1443 return generic_swapfile_activate(sis, swap_file, span);
1444} 1444}
1445 1445
1446static void enable_swap_info(struct swap_info_struct *p, int prio, 1446static void _enable_swap_info(struct swap_info_struct *p, int prio,
1447 unsigned char *swap_map, 1447 unsigned char *swap_map,
1448 unsigned long *frontswap_map) 1448 unsigned long *frontswap_map)
1449{ 1449{
1450 int i, prev; 1450 int i, prev;
1451 1451
1452 spin_lock(&swap_lock);
1453 if (prio >= 0) 1452 if (prio >= 0)
1454 p->prio = prio; 1453 p->prio = prio;
1455 else 1454 else
@@ -1472,10 +1471,25 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1472 swap_list.head = swap_list.next = p->type; 1471 swap_list.head = swap_list.next = p->type;
1473 else 1472 else
1474 swap_info[prev]->next = p->type; 1473 swap_info[prev]->next = p->type;
1474}
1475
1476static void enable_swap_info(struct swap_info_struct *p, int prio,
1477 unsigned char *swap_map,
1478 unsigned long *frontswap_map)
1479{
1480 spin_lock(&swap_lock);
1481 _enable_swap_info(p, prio, swap_map, frontswap_map);
1475 frontswap_init(p->type); 1482 frontswap_init(p->type);
1476 spin_unlock(&swap_lock); 1483 spin_unlock(&swap_lock);
1477} 1484}
1478 1485
1486static void reinsert_swap_info(struct swap_info_struct *p)
1487{
1488 spin_lock(&swap_lock);
1489 _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1490 spin_unlock(&swap_lock);
1491}
1492
1479SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1493SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1480{ 1494{
1481 struct swap_info_struct *p = NULL; 1495 struct swap_info_struct *p = NULL;
@@ -1484,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1484 struct address_space *mapping; 1498 struct address_space *mapping;
1485 struct inode *inode; 1499 struct inode *inode;
1486 struct filename *pathname; 1500 struct filename *pathname;
1487 int oom_score_adj;
1488 int i, type, prev; 1501 int i, type, prev;
1489 int err; 1502 int err;
1490 1503
@@ -1543,19 +1556,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1543 p->flags &= ~SWP_WRITEOK; 1556 p->flags &= ~SWP_WRITEOK;
1544 spin_unlock(&swap_lock); 1557 spin_unlock(&swap_lock);
1545 1558
1546 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1559 set_current_oom_origin();
1547 err = try_to_unuse(type, false, 0); /* force all pages to be unused */ 1560 err = try_to_unuse(type, false, 0); /* force all pages to be unused */
1548 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1561 clear_current_oom_origin();
1549 1562
1550 if (err) { 1563 if (err) {
1551 /*
1552 * reading p->prio and p->swap_map outside the lock is
1553 * safe here because only sys_swapon and sys_swapoff
1554 * change them, and there can be no other sys_swapon or
1555 * sys_swapoff for this swap_info_struct at this point.
1556 */
1557 /* re-insert swap space back into swap_list */ 1564 /* re-insert swap space back into swap_list */
1558 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); 1565 reinsert_swap_info(p);
1559 goto out_dput; 1566 goto out_dput;
1560 } 1567 }
1561 1568
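
The swapfile.c refactor splits enable_swap_info() into a lock-free _enable_swap_info() core plus two locked wrappers, so the swapoff failure path can re-insert the entry without repeating the one-time frontswap setup. A minimal pthread sketch of the same locked-wrapper pattern (build with -pthread; every demo_* name is invented):

#include <pthread.h>
#include <stdio.h>

/* Toy swap bookkeeping guarded by one lock, echoing swap_lock. */
static pthread_mutex_t demo_swap_lock = PTHREAD_MUTEX_INITIALIZER;
static int demo_prio;
static int demo_enabled;

/* Core work; the leading underscore marks "caller already holds the lock". */
static void _enable_swap_info(int prio)
{
        demo_prio = prio;
        demo_enabled = 1;
}

static void enable_swap_info(int prio)
{
        pthread_mutex_lock(&demo_swap_lock);
        _enable_swap_info(prio);
        /* one-time setup (frontswap_init() in the real code) goes here only */
        pthread_mutex_unlock(&demo_swap_lock);
}

static void reinsert_swap_info(void)
{
        pthread_mutex_lock(&demo_swap_lock);
        /* swapoff failure path: put the entry back with its old priority. */
        _enable_swap_info(demo_prio);
        pthread_mutex_unlock(&demo_swap_lock);
}

int main(void)
{
        enable_swap_info(5);
        reinsert_swap_info();
        printf("enabled=%d prio=%d\n", demo_enabled, demo_prio);
        return 0;
}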
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 78e08300db21..5123a169ab7b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p)
2550 2550
2551static void show_numa_info(struct seq_file *m, struct vm_struct *v) 2551static void show_numa_info(struct seq_file *m, struct vm_struct *v)
2552{ 2552{
2553 if (NUMA_BUILD) { 2553 if (IS_ENABLED(CONFIG_NUMA)) {
2554 unsigned int nr, *counters = m->private; 2554 unsigned int nr, *counters = m->private;
2555 2555
2556 if (!counters) 2556 if (!counters)
@@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file)
2615 unsigned int *ptr = NULL; 2615 unsigned int *ptr = NULL;
2616 int ret; 2616 int ret;
2617 2617
2618 if (NUMA_BUILD) { 2618 if (IS_ENABLED(CONFIG_NUMA)) {
2619 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2619 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
2620 if (ptr == NULL) 2620 if (ptr == NULL)
2621 return -ENOMEM; 2621 return -ENOMEM;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b7ed37675644..157bb116dec8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1679,13 +1679,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1679 1679
1680 if (global_reclaim(sc)) { 1680 if (global_reclaim(sc)) {
1681 free = zone_page_state(zone, NR_FREE_PAGES); 1681 free = zone_page_state(zone, NR_FREE_PAGES);
1682 /* If we have very few page cache pages,
1683 force-scan anon pages. */
1684 if (unlikely(file + free <= high_wmark_pages(zone))) { 1682 if (unlikely(file + free <= high_wmark_pages(zone))) {
1683 /*
1684 * If we have very few page cache pages, force-scan
1685 * anon pages.
1686 */
1685 fraction[0] = 1; 1687 fraction[0] = 1;
1686 fraction[1] = 0; 1688 fraction[1] = 0;
1687 denominator = 1; 1689 denominator = 1;
1688 goto out; 1690 goto out;
1691 } else if (!inactive_file_is_low_global(zone)) {
1692 /*
1693 * There is enough inactive page cache, do not
1694 * reclaim anything from the working set right now.
1695 */
1696 fraction[0] = 0;
1697 fraction[1] = 1;
1698 denominator = 1;
1699 goto out;
1689 } 1700 }
1690 } 1701 }
1691 1702
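
The get_scan_count() hunk adds a second early exit: with almost no page cache left, the scan is forced to anon only, and with plenty of inactive page cache, the anon working set is left alone and only file pages are scanned. A small standalone sketch of those two decisions; the numbers and the fallback branch are placeholders, not the kernel's recent_scanned/recent_rotated math.

#include <stdio.h>
#include <stdbool.h>

struct demo_scan_fractions {
        unsigned long anon;     /* fraction[0] in get_scan_count() */
        unsigned long file;     /* fraction[1] */
        unsigned long denominator;
};

static struct demo_scan_fractions
pick_fractions(unsigned long file, unsigned long free,
               unsigned long high_wmark, bool inactive_file_is_low)
{
        struct demo_scan_fractions f = { .denominator = 1 };

        if (file + free <= high_wmark) {
                /* Very little page cache: force-scan anon pages. */
                f.anon = 1;
                f.file = 0;
        } else if (!inactive_file_is_low) {
                /* Plenty of inactive cache: do not touch the working set. */
                f.anon = 0;
                f.file = 1;
        } else {
                /* Placeholder: the real code derives this ratio from
                   recent_scanned/recent_rotated statistics. */
                f.anon = 1;
                f.file = 1;
                f.denominator = 2;
        }
        return f;
}

int main(void)
{
        struct demo_scan_fractions f;

        f = pick_fractions(100, 50, 200, false);
        printf("low cache:   anon=%lu file=%lu\n", f.anon, f.file);
        f = pick_fractions(10000, 500, 200, false);
        printf("ample cache: anon=%lu file=%lu\n", f.anon, f.file);
        return 0;
}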
@@ -1752,7 +1763,7 @@ out:
1752/* Use reclaim/compaction for costly allocs or under memory pressure */ 1763/* Use reclaim/compaction for costly allocs or under memory pressure */
1753static bool in_reclaim_compaction(struct scan_control *sc) 1764static bool in_reclaim_compaction(struct scan_control *sc)
1754{ 1765{
1755 if (COMPACTION_BUILD && sc->order && 1766 if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
1756 (sc->order > PAGE_ALLOC_COSTLY_ORDER || 1767 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
1757 sc->priority < DEF_PRIORITY - 2)) 1768 sc->priority < DEF_PRIORITY - 2))
1758 return true; 1769 return true;
@@ -2005,7 +2016,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2005 if (zone->all_unreclaimable && 2016 if (zone->all_unreclaimable &&
2006 sc->priority != DEF_PRIORITY) 2017 sc->priority != DEF_PRIORITY)
2007 continue; /* Let kswapd poll it */ 2018 continue; /* Let kswapd poll it */
2008 if (COMPACTION_BUILD) { 2019 if (IS_ENABLED(CONFIG_COMPACTION)) {
2009 /* 2020 /*
2010 * If we already have plenty of memory free for 2021 * If we already have plenty of memory free for
2011 * compaction in this zone, don't free any more. 2022 * compaction in this zone, don't free any more.
@@ -2421,7 +2432,8 @@ static bool zone_balanced(struct zone *zone, int order,
2421 balance_gap, classzone_idx, 0)) 2432 balance_gap, classzone_idx, 0))
2422 return false; 2433 return false;
2423 2434
2424 if (COMPACTION_BUILD && order && !compaction_suitable(zone, order)) 2435 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2436 !compaction_suitable(zone, order))
2425 return false; 2437 return false;
2426 2438
2427 return true; 2439 return true;
@@ -2684,7 +2696,7 @@ loop_again:
2684 * Do not reclaim more than needed for compaction. 2696 * Do not reclaim more than needed for compaction.
2685 */ 2697 */
2686 testorder = order; 2698 testorder = order;
2687 if (COMPACTION_BUILD && order && 2699 if (IS_ENABLED(CONFIG_COMPACTION) && order &&
2688 compaction_suitable(zone, order) != 2700 compaction_suitable(zone, order) !=
2689 COMPACT_SKIPPED) 2701 COMPACT_SKIPPED)
2690 testorder = 0; 2702 testorder = 0;
@@ -2951,7 +2963,7 @@ static int kswapd(void *p)
2951 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2963 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2952 balanced_classzone_idx = classzone_idx; 2964 balanced_classzone_idx = classzone_idx;
2953 for ( ; ; ) { 2965 for ( ; ; ) {
2954 int ret; 2966 bool ret;
2955 2967
2956 /* 2968 /*
2957 * If the last balance_pgdat was unsuccessful it's unlikely a 2969 * If the last balance_pgdat was unsuccessful it's unlikely a