author		Linus Torvalds <torvalds@linux-foundation.org>	2012-10-09 03:23:15 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-10-09 03:23:15 -0400
commit		9e2d8656f5e8aa214e66b462680cf86b210b74a8 (patch)
tree		f67d62e896cedf75599ea45f9ecf9999c6ad24cd /mm
parent		1ea4f4f8405cc1ceec23f2d261bc3775785e6712 (diff)
parent		9e695d2ecc8451cc2c1603d60b5c8e7f5581923a (diff)
Merge branch 'akpm' (Andrew's patch-bomb)
Merge patches from Andrew Morton:
 "A few misc things and very nearly all of the MM tree. A tremendous
  amount of stuff (again), including a significant rbtree library
  rework."

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (160 commits)
  sparc64: Support transparent huge pages.
  mm: thp: Use more portable PMD clearing sequenece in zap_huge_pmd().
  mm: Add and use update_mmu_cache_pmd() in transparent huge page code.
  sparc64: Document PGD and PMD layout.
  sparc64: Eliminate PTE table memory wastage.
  sparc64: Halve the size of PTE tables
  sparc64: Only support 4MB huge pages and 8KB base pages.
  memory-hotplug: suppress "Trying to free nonexistent resource <XXXXXXXXXXXXXXXX-YYYYYYYYYYYYYYYY>" warning
  mm: memcg: clean up mm_match_cgroup() signature
  mm: document PageHuge somewhat
  mm: use %pK for /proc/vmallocinfo
  mm, thp: fix mlock statistics
  mm, thp: fix mapped pages avoiding unevictable list on mlock
  memory-hotplug: update memory block's state and notify userspace
  memory-hotplug: preparation to notify memory block's state at memory hot remove
  mm: avoid section mismatch warning for memblock_type_name
  make GFP_NOTRACK definition unconditional
  cma: decrease cc.nr_migratepages after reclaiming pagelist
  CMA: migrate mlocked pages
  kpageflags: fix wrong KPF_THP on non-huge compound pages
  ...
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |    3
-rw-r--r--  mm/Makefile          |    4
-rw-r--r--  mm/bootmem.c         |   10
-rw-r--r--  mm/compaction.c      |  562
-rw-r--r--  mm/filemap.c         |    6
-rw-r--r--  mm/filemap_xip.c     |   10
-rw-r--r--  mm/fremap.c          |   16
-rw-r--r--  mm/huge_memory.c     |  440
-rw-r--r--  mm/hugetlb.c         |   34
-rw-r--r--  mm/internal.h        |   52
-rw-r--r--  mm/interval_tree.c   |  112
-rw-r--r--  mm/kmemleak.c        |  100
-rw-r--r--  mm/ksm.c             |   40
-rw-r--r--  mm/madvise.c         |    8
-rw-r--r--  mm/memblock.c        |    5
-rw-r--r--  mm/memcontrol.c      |   22
-rw-r--r--  mm/memory-failure.c  |    8
-rw-r--r--  mm/memory.c          |  115
-rw-r--r--  mm/memory_hotplug.c  |   77
-rw-r--r--  mm/mempolicy.c       |  148
-rw-r--r--  mm/mlock.c           |   27
-rw-r--r--  mm/mmap.c            |  207
-rw-r--r--  mm/mmu_notifier.c    |  103
-rw-r--r--  mm/mremap.c          |   73
-rw-r--r--  mm/nobootmem.c       |    5
-rw-r--r--  mm/nommu.c           |   33
-rw-r--r--  mm/oom_kill.c        |    4
-rw-r--r--  mm/page_alloc.c      |  317
-rw-r--r--  mm/page_isolation.c  |   43
-rw-r--r--  mm/pgtable-generic.c |   50
-rw-r--r--  mm/prio_tree.c       |  208
-rw-r--r--  mm/rmap.c            |  159
-rw-r--r--  mm/shmem.c           |    3
-rw-r--r--  mm/swap.c            |   13
-rw-r--r--  mm/truncate.c        |    3
-rw-r--r--  mm/vmalloc.c         |    5
-rw-r--r--  mm/vmscan.c          |  111
-rw-r--r--  mm/vmstat.c          |   14
38 files changed, 1830 insertions, 1320 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d5c8019c6627..a3f8dddaaab3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -191,6 +191,7 @@ config SPLIT_PTLOCK_CPUS
 # support for memory compaction
 config COMPACTION
 	bool "Allow for memory compaction"
+	def_bool y
 	select MIGRATION
 	depends on MMU
 	help
@@ -318,7 +319,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
 
 config TRANSPARENT_HUGEPAGE
 	bool "Transparent Hugepage Support"
-	depends on X86 && MMU
+	depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select COMPACTION
 	help
 	  Transparent Hugepages allows the kernel to use huge pages and
diff --git a/mm/Makefile b/mm/Makefile
index 92753e2d82da..6b025f80af34 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -14,9 +14,9 @@ endif
 obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
-			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
+			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
-			   compaction.o $(mmu-y)
+			   compaction.o interval_tree.o $(mmu-y)
 
 obj-y += init-mm.o
 
diff --git a/mm/bootmem.c b/mm/bootmem.c
index f468185b3b28..434be4ae7a04 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -198,6 +198,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 			int order = ilog2(BITS_PER_LONG);
 
 			__free_pages_bootmem(pfn_to_page(start), order);
+			fixup_zone_present_pages(page_to_nid(pfn_to_page(start)),
+					start, start + BITS_PER_LONG);
 			count += BITS_PER_LONG;
 			start += BITS_PER_LONG;
 		} else {
@@ -208,6 +210,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 				if (vec & 1) {
 					page = pfn_to_page(start + off);
 					__free_pages_bootmem(page, 0);
+					fixup_zone_present_pages(
+						page_to_nid(page),
+						start + off, start + off + 1);
 					count++;
 				}
 				vec >>= 1;
@@ -221,8 +226,11 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 	pages = bdata->node_low_pfn - bdata->node_min_pfn;
 	pages = bootmem_bootmap_pages(pages);
 	count += pages;
-	while (pages--)
+	while (pages--) {
+		fixup_zone_present_pages(page_to_nid(page),
+				page_to_pfn(page), page_to_pfn(page) + 1);
 		__free_pages_bootmem(page++, 0);
+	}
 
 	bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 7fcd3a52e68d..2c4ce17651d8 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,111 @@ static inline bool migrate_async_suitable(int migratetype)
 	return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
 }
 
+#ifdef CONFIG_COMPACTION
+/* Returns true if the pageblock should be scanned for pages to isolate. */
+static inline bool isolation_suitable(struct compact_control *cc,
+					struct page *page)
+{
+	if (cc->ignore_skip_hint)
+		return true;
+
+	return !get_pageblock_skip(page);
+}
+
+/*
+ * This function is called to clear all cached information on pageblocks that
+ * should be skipped for page isolation when the migrate and free page scanner
+ * meet.
+ */
+static void __reset_isolation_suitable(struct zone *zone)
+{
+	unsigned long start_pfn = zone->zone_start_pfn;
+	unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
+	unsigned long pfn;
+
+	zone->compact_cached_migrate_pfn = start_pfn;
+	zone->compact_cached_free_pfn = end_pfn;
+	zone->compact_blockskip_flush = false;
+
+	/* Walk the zone and mark every pageblock as suitable for isolation */
+	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+		struct page *page;
+
+		cond_resched();
+
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+		if (zone != page_zone(page))
+			continue;
+
+		clear_pageblock_skip(page);
+	}
+}
+
+void reset_isolation_suitable(pg_data_t *pgdat)
+{
+	int zoneid;
+
+	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+		struct zone *zone = &pgdat->node_zones[zoneid];
+		if (!populated_zone(zone))
+			continue;
+
+		/* Only flush if a full compaction finished recently */
+		if (zone->compact_blockskip_flush)
+			__reset_isolation_suitable(zone);
+	}
+}
+
+/*
+ * If no pages were isolated then mark this pageblock to be skipped in the
+ * future. The information is later cleared by __reset_isolation_suitable().
+ */
+static void update_pageblock_skip(struct compact_control *cc,
+			struct page *page, unsigned long nr_isolated,
+			bool migrate_scanner)
+{
+	struct zone *zone = cc->zone;
+	if (!page)
+		return;
+
+	if (!nr_isolated) {
+		unsigned long pfn = page_to_pfn(page);
+		set_pageblock_skip(page);
+
+		/* Update where compaction should restart */
+		if (migrate_scanner) {
+			if (!cc->finished_update_migrate &&
+			    pfn > zone->compact_cached_migrate_pfn)
+				zone->compact_cached_migrate_pfn = pfn;
+		} else {
+			if (!cc->finished_update_free &&
+			    pfn < zone->compact_cached_free_pfn)
+				zone->compact_cached_free_pfn = pfn;
+		}
+	}
+}
+#else
+static inline bool isolation_suitable(struct compact_control *cc,
+					struct page *page)
+{
+	return true;
+}
+
+static void update_pageblock_skip(struct compact_control *cc,
+			struct page *page, unsigned long nr_isolated,
+			bool migrate_scanner)
+{
+}
+#endif /* CONFIG_COMPACTION */
+
+static inline bool should_release_lock(spinlock_t *lock)
+{
+	return need_resched() || spin_is_contended(lock);
+}
+
 /*
  * Compaction requires the taking of some coarse locks that are potentially
  * very heavily contended. Check if the process needs to be scheduled or
@@ -62,7 +167,7 @@ static inline bool migrate_async_suitable(int migratetype)
 static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 				      bool locked, struct compact_control *cc)
 {
-	if (need_resched() || spin_is_contended(lock)) {
+	if (should_release_lock(lock)) {
 		if (locked) {
 			spin_unlock_irqrestore(lock, *flags);
 			locked = false;
@@ -70,14 +175,11 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
 
 		/* async aborts if taking too long or contended */
 		if (!cc->sync) {
-			if (cc->contended)
-				*cc->contended = true;
+			cc->contended = true;
 			return false;
 		}
 
 		cond_resched();
-		if (fatal_signal_pending(current))
-			return false;
 	}
 
 	if (!locked)
@@ -91,44 +193,139 @@ static inline bool compact_trylock_irqsave(spinlock_t *lock,
 	return compact_checklock_irqsave(lock, flags, false, cc);
 }
 
+/* Returns true if the page is within a block suitable for migration to */
+static bool suitable_migration_target(struct page *page)
+{
+	int migratetype = get_pageblock_migratetype(page);
+
+	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
+	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
+		return false;
+
+	/* If the page is a large free page, then allow migration */
+	if (PageBuddy(page) && page_order(page) >= pageblock_order)
+		return true;
+
+	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
+	if (migrate_async_suitable(migratetype))
+		return true;
+
+	/* Otherwise skip the block */
+	return false;
+}
+
+static void compact_capture_page(struct compact_control *cc)
+{
+	unsigned long flags;
+	int mtype, mtype_low, mtype_high;
+
+	if (!cc->page || *cc->page)
+		return;
+
+	/*
+	 * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP
+	 * regardless of the migratetype of the freelist is is captured from.
+	 * This is fine because the order for a high-order MIGRATE_MOVABLE
+	 * allocation is typically at least a pageblock size and overall
+	 * fragmentation is not impaired. Other allocation types must
+	 * capture pages from their own migratelist because otherwise they
+	 * could pollute other pageblocks like MIGRATE_MOVABLE with
+	 * difficult to move pages and making fragmentation worse overall.
+	 */
+	if (cc->migratetype == MIGRATE_MOVABLE) {
+		mtype_low = 0;
+		mtype_high = MIGRATE_PCPTYPES;
+	} else {
+		mtype_low = cc->migratetype;
+		mtype_high = cc->migratetype + 1;
+	}
+
+	/* Speculatively examine the free lists without zone lock */
+	for (mtype = mtype_low; mtype < mtype_high; mtype++) {
+		int order;
+		for (order = cc->order; order < MAX_ORDER; order++) {
+			struct page *page;
+			struct free_area *area;
+			area = &(cc->zone->free_area[order]);
+			if (list_empty(&area->free_list[mtype]))
+				continue;
+
+			/* Take the lock and attempt capture of the page */
+			if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
+				return;
+			if (!list_empty(&area->free_list[mtype])) {
+				page = list_entry(area->free_list[mtype].next,
+							struct page, lru);
+				if (capture_free_page(page, cc->order, mtype)) {
+					spin_unlock_irqrestore(&cc->zone->lock,
+									flags);
+					*cc->page = page;
+					return;
+				}
+			}
+			spin_unlock_irqrestore(&cc->zone->lock, flags);
+		}
+	}
+}
+
 /*
  * Isolate free pages onto a private freelist. Caller must hold zone->lock.
  * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
  * pages inside of the pageblock (even though it may still end up isolating
  * some pages).
  */
-static unsigned long isolate_freepages_block(unsigned long blockpfn,
+static unsigned long isolate_freepages_block(struct compact_control *cc,
+				unsigned long blockpfn,
 				unsigned long end_pfn,
 				struct list_head *freelist,
 				bool strict)
 {
 	int nr_scanned = 0, total_isolated = 0;
-	struct page *cursor;
+	struct page *cursor, *valid_page = NULL;
+	unsigned long nr_strict_required = end_pfn - blockpfn;
+	unsigned long flags;
+	bool locked = false;
 
 	cursor = pfn_to_page(blockpfn);
 
-	/* Isolate free pages. This assumes the block is valid */
+	/* Isolate free pages. */
 	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
 		int isolated, i;
 		struct page *page = cursor;
 
-		if (!pfn_valid_within(blockpfn)) {
-			if (strict)
-				return 0;
-			continue;
-		}
 		nr_scanned++;
+		if (!pfn_valid_within(blockpfn))
+			continue;
+		if (!valid_page)
+			valid_page = page;
+		if (!PageBuddy(page))
+			continue;
 
-		if (!PageBuddy(page)) {
-			if (strict)
-				return 0;
+		/*
+		 * The zone lock must be held to isolate freepages.
+		 * Unfortunately this is a very coarse lock and can be
+		 * heavily contended if there are parallel allocations
+		 * or parallel compactions. For async compaction do not
+		 * spin on the lock and we acquire the lock as late as
+		 * possible.
+		 */
+		locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
+								locked, cc);
+		if (!locked)
+			break;
+
+		/* Recheck this is a suitable migration target under lock */
+		if (!strict && !suitable_migration_target(page))
+			break;
+
+		/* Recheck this is a buddy page under lock */
+		if (!PageBuddy(page))
 			continue;
-		}
 
 		/* Found a free page, break it into order-0 pages */
 		isolated = split_free_page(page);
 		if (!isolated && strict)
-			return 0;
+			break;
 		total_isolated += isolated;
 		for (i = 0; i < isolated; i++) {
 			list_add(&page->lru, freelist);
@@ -143,6 +340,22 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
 	}
 
 	trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
+
+	/*
+	 * If strict isolation is requested by CMA then check that all the
+	 * pages requested were isolated. If there were any failures, 0 is
+	 * returned and CMA will fail.
+	 */
+	if (strict && nr_strict_required != total_isolated)
+		total_isolated = 0;
+
+	if (locked)
+		spin_unlock_irqrestore(&cc->zone->lock, flags);
+
+	/* Update the pageblock-skip if the whole pageblock was scanned */
+	if (blockpfn == end_pfn)
+		update_pageblock_skip(cc, valid_page, total_isolated, false);
+
 	return total_isolated;
 }
 
@@ -160,17 +373,14 @@ static unsigned long isolate_freepages_block(unsigned long blockpfn,
  * a free page).
  */
 unsigned long
-isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
+isolate_freepages_range(struct compact_control *cc,
+			unsigned long start_pfn, unsigned long end_pfn)
 {
-	unsigned long isolated, pfn, block_end_pfn, flags;
-	struct zone *zone = NULL;
+	unsigned long isolated, pfn, block_end_pfn;
 	LIST_HEAD(freelist);
 
-	if (pfn_valid(start_pfn))
-		zone = page_zone(pfn_to_page(start_pfn));
-
 	for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
-		if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
+		if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
 			break;
 
 		/*
@@ -180,10 +390,8 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
 		block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
 		block_end_pfn = min(block_end_pfn, end_pfn);
 
-		spin_lock_irqsave(&zone->lock, flags);
-		isolated = isolate_freepages_block(pfn, block_end_pfn,
+		isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
 						   &freelist, true);
-		spin_unlock_irqrestore(&zone->lock, flags);
 
 		/*
 		 * In strict mode, isolate_freepages_block() returns 0 if
@@ -253,6 +461,7 @@ static bool too_many_isolated(struct zone *zone)
  * @cc: Compaction control structure.
  * @low_pfn: The first PFN of the range.
  * @end_pfn: The one-past-the-last PFN of the range.
+ * @unevictable: true if it allows to isolate unevictable pages
  *
  * Isolate all pages that can be migrated from the range specified by
  * [low_pfn, end_pfn). Returns zero if there is a fatal signal
@@ -268,7 +477,7 @@ static bool too_many_isolated(struct zone *zone)
  */
 unsigned long
 isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
-			   unsigned long low_pfn, unsigned long end_pfn)
+			   unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
 {
 	unsigned long last_pageblock_nr = 0, pageblock_nr;
 	unsigned long nr_scanned = 0, nr_isolated = 0;
@@ -276,7 +485,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	isolate_mode_t mode = 0;
 	struct lruvec *lruvec;
 	unsigned long flags;
-	bool locked;
+	bool locked = false;
+	struct page *page = NULL, *valid_page = NULL;
 
 	/*
 	 * Ensure that there are not too many pages isolated from the LRU
@@ -296,23 +506,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
 	/* Time to isolate some pages for migration */
 	cond_resched();
-	spin_lock_irqsave(&zone->lru_lock, flags);
-	locked = true;
 	for (; low_pfn < end_pfn; low_pfn++) {
-		struct page *page;
-
 		/* give a chance to irqs before checking need_resched() */
-		if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
-			spin_unlock_irqrestore(&zone->lru_lock, flags);
-			locked = false;
+		if (locked && !((low_pfn+1) % SWAP_CLUSTER_MAX)) {
+			if (should_release_lock(&zone->lru_lock)) {
+				spin_unlock_irqrestore(&zone->lru_lock, flags);
+				locked = false;
+			}
 		}
 
-		/* Check if it is ok to still hold the lock */
-		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
-							locked, cc);
-		if (!locked)
-			break;
-
 		/*
 		 * migrate_pfn does not necessarily start aligned to a
 		 * pageblock. Ensure that pfn_valid is called when moving
@@ -340,6 +542,14 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if (page_zone(page) != zone)
 			continue;
 
+		if (!valid_page)
+			valid_page = page;
+
+		/* If isolation recently failed, do not retry */
+		pageblock_nr = low_pfn >> pageblock_order;
+		if (!isolation_suitable(cc, page))
+			goto next_pageblock;
+
 		/* Skip if free */
 		if (PageBuddy(page))
 			continue;
@@ -349,24 +559,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		 * migration is optimistic to see if the minimum amount of work
 		 * satisfies the allocation
 		 */
-		pageblock_nr = low_pfn >> pageblock_order;
 		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
 		    !migrate_async_suitable(get_pageblock_migratetype(page))) {
-			low_pfn += pageblock_nr_pages;
-			low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
-			last_pageblock_nr = pageblock_nr;
-			continue;
+			cc->finished_update_migrate = true;
+			goto next_pageblock;
 		}
 
+		/* Check may be lockless but that's ok as we recheck later */
 		if (!PageLRU(page))
 			continue;
 
 		/*
-		 * PageLRU is set, and lru_lock excludes isolation,
-		 * splitting and collapsing (collapsing has already
-		 * happened if PageLRU is set).
+		 * PageLRU is set. lru_lock normally excludes isolation
+		 * splitting and collapsing (collapsing has already happened
+		 * if PageLRU is set) but the lock is not necessarily taken
+		 * here and it is wasteful to take it just to check transhuge.
+		 * Check TransHuge without lock and skip the whole pageblock if
+		 * it's either a transhuge or hugetlbfs page, as calling
+		 * compound_order() without preventing THP from splitting the
+		 * page underneath us may return surprising results.
 		 */
 		if (PageTransHuge(page)) {
+			if (!locked)
+				goto next_pageblock;
+			low_pfn += (1 << compound_order(page)) - 1;
+			continue;
+		}
+
+		/* Check if it is ok to still hold the lock */
+		locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
+								locked, cc);
+		if (!locked || fatal_signal_pending(current))
+			break;
+
+		/* Recheck PageLRU and PageTransHuge under lock */
+		if (!PageLRU(page))
+			continue;
+		if (PageTransHuge(page)) {
 			low_pfn += (1 << compound_order(page)) - 1;
 			continue;
 		}
@@ -374,6 +603,9 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		if (!cc->sync)
 			mode |= ISOLATE_ASYNC_MIGRATE;
 
+		if (unevictable)
+			mode |= ISOLATE_UNEVICTABLE;
+
 		lruvec = mem_cgroup_page_lruvec(page, zone);
 
 		/* Try isolate the page */
@@ -383,6 +615,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 		VM_BUG_ON(PageTransCompound(page));
 
 		/* Successfully isolated */
+		cc->finished_update_migrate = true;
 		del_page_from_lru_list(page, lruvec, page_lru(page));
 		list_add(&page->lru, migratelist);
 		cc->nr_migratepages++;
@@ -393,6 +626,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 			++low_pfn;
 			break;
 		}
+
+		continue;
+
+next_pageblock:
+		low_pfn += pageblock_nr_pages;
+		low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+		last_pageblock_nr = pageblock_nr;
 	}
 
 	acct_isolated(zone, locked, cc);
@@ -400,6 +640,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 	if (locked)
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 
+	/* Update the pageblock-skip if the whole pageblock was scanned */
+	if (low_pfn == end_pfn)
+		update_pageblock_skip(cc, valid_page, nr_isolated, true);
+
 	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
 
 	return low_pfn;
@@ -407,43 +651,6 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
 
 #endif /* CONFIG_COMPACTION || CONFIG_CMA */
 #ifdef CONFIG_COMPACTION
-
-/* Returns true if the page is within a block suitable for migration to */
-static bool suitable_migration_target(struct page *page)
-{
-
-	int migratetype = get_pageblock_migratetype(page);
-
-	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
-	if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
-		return false;
-
-	/* If the page is a large free page, then allow migration */
-	if (PageBuddy(page) && page_order(page) >= pageblock_order)
-		return true;
-
-	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
-	if (migrate_async_suitable(migratetype))
-		return true;
-
-	/* Otherwise skip the block */
-	return false;
-}
-
-/*
- * Returns the start pfn of the last page block in a zone. This is the starting
- * point for full compaction of a zone. Compaction searches for free pages from
- * the end of each zone, while isolate_freepages_block scans forward inside each
- * page block.
- */
-static unsigned long start_free_pfn(struct zone *zone)
-{
-	unsigned long free_pfn;
-	free_pfn = zone->zone_start_pfn + zone->spanned_pages;
-	free_pfn &= ~(pageblock_nr_pages-1);
-	return free_pfn;
-}
-
 /*
  * Based on information in the current compact_control, find blocks
  * suitable for isolating free pages from and then isolate them.
@@ -453,7 +660,6 @@ static void isolate_freepages(struct zone *zone,
 {
 	struct page *page;
 	unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
-	unsigned long flags;
 	int nr_freepages = cc->nr_freepages;
 	struct list_head *freelist = &cc->freepages;
 
@@ -501,30 +707,16 @@ static void isolate_freepages(struct zone *zone,
 		if (!suitable_migration_target(page))
 			continue;
 
-		/*
-		 * Found a block suitable for isolating free pages from. Now
-		 * we disabled interrupts, double check things are ok and
-		 * isolate the pages. This is to minimise the time IRQs
-		 * are disabled
-		 */
-		isolated = 0;
+		/* If isolation recently failed, do not retry */
+		if (!isolation_suitable(cc, page))
+			continue;
 
-		/*
-		 * The zone lock must be held to isolate freepages. This
-		 * unfortunately this is a very coarse lock and can be
-		 * heavily contended if there are parallel allocations
-		 * or parallel compactions. For async compaction do not
-		 * spin on the lock
-		 */
-		if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
-			break;
-		if (suitable_migration_target(page)) {
-			end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
-			isolated = isolate_freepages_block(pfn, end_pfn,
-							   freelist, false);
-			nr_freepages += isolated;
-		}
-		spin_unlock_irqrestore(&zone->lock, flags);
+		/* Found a block suitable for isolating free pages from */
+		isolated = 0;
+		end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
+		isolated = isolate_freepages_block(cc, pfn, end_pfn,
+						   freelist, false);
+		nr_freepages += isolated;
 
 		/*
 		 * Record the highest PFN we isolated pages from. When next
@@ -532,17 +724,8 @@ static void isolate_freepages(struct zone *zone,
 		 * page migration may have returned some pages to the allocator
 		 */
 		if (isolated) {
+			cc->finished_update_free = true;
 			high_pfn = max(high_pfn, pfn);
-
-			/*
-			 * If the free scanner has wrapped, update
-			 * compact_cached_free_pfn to point to the highest
-			 * pageblock with free pages. This reduces excessive
-			 * scanning of full pageblocks near the end of the
-			 * zone
-			 */
-			if (cc->order > 0 && cc->wrapped)
-				zone->compact_cached_free_pfn = high_pfn;
 		}
 	}
 
@@ -551,11 +734,6 @@ static void isolate_freepages(struct zone *zone,
 
 	cc->free_pfn = high_pfn;
 	cc->nr_freepages = nr_freepages;
-
-	/* If compact_cached_free_pfn is reset then set it now */
-	if (cc->order > 0 && !cc->wrapped &&
-	    zone->compact_cached_free_pfn == start_free_pfn(zone))
-		zone->compact_cached_free_pfn = high_pfn;
 }
 
 /*
@@ -633,8 +811,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	}
 
 	/* Perform the isolation */
-	low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
-	if (!low_pfn)
+	low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
+	if (!low_pfn || cc->contended)
 		return ISOLATE_ABORT;
 
 	cc->migrate_pfn = low_pfn;
@@ -645,33 +823,24 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
-	unsigned int order;
 	unsigned long watermark;
 
 	if (fatal_signal_pending(current))
 		return COMPACT_PARTIAL;
 
-	/*
-	 * A full (order == -1) compaction run starts at the beginning and
-	 * end of a zone; it completes when the migrate and free scanner meet.
-	 * A partial (order > 0) compaction can start with the free scanner
-	 * at a random point in the zone, and may have to restart.
-	 */
+	/* Compaction run completes if the migrate and free scanner meet */
 	if (cc->free_pfn <= cc->migrate_pfn) {
-		if (cc->order > 0 && !cc->wrapped) {
-			/* We started partway through; restart at the end. */
-			unsigned long free_pfn = start_free_pfn(zone);
-			zone->compact_cached_free_pfn = free_pfn;
-			cc->free_pfn = free_pfn;
-			cc->wrapped = 1;
-			return COMPACT_CONTINUE;
-		}
-		return COMPACT_COMPLETE;
-	}
+		/*
+		 * Mark that the PG_migrate_skip information should be cleared
+		 * by kswapd when it goes to sleep. kswapd does not set the
+		 * flag itself as the decision to be clear should be directly
+		 * based on an allocation request.
+		 */
+		if (!current_is_kswapd())
+			zone->compact_blockskip_flush = true;
 
-	/* We wrapped around and ended up where we started. */
-	if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
 		return COMPACT_COMPLETE;
+	}
 
 	/*
 	 * order == -1 is expected when compacting via
@@ -688,14 +857,22 @@ static int compact_finished(struct zone *zone,
 		return COMPACT_CONTINUE;
 
 	/* Direct compactor: Is a suitable page free? */
-	for (order = cc->order; order < MAX_ORDER; order++) {
-		/* Job done if page is free of the right migratetype */
-		if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
-			return COMPACT_PARTIAL;
-
-		/* Job done if allocation would set block type */
-		if (order >= pageblock_order && zone->free_area[order].nr_free)
+	if (cc->page) {
+		/* Was a suitable page captured? */
+		if (*cc->page)
 			return COMPACT_PARTIAL;
+	} else {
+		unsigned int order;
+		for (order = cc->order; order < MAX_ORDER; order++) {
+			struct free_area *area = &zone->free_area[cc->order];
+			/* Job done if page is free of the right migratetype */
+			if (!list_empty(&area->free_list[cc->migratetype]))
+				return COMPACT_PARTIAL;
+
+			/* Job done if allocation would set block type */
+			if (cc->order >= pageblock_order && area->nr_free)
+				return COMPACT_PARTIAL;
+		}
 	}
 
 	return COMPACT_CONTINUE;
@@ -754,6 +931,8 @@ unsigned long compaction_suitable(struct zone *zone, int order)
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	int ret;
+	unsigned long start_pfn = zone->zone_start_pfn;
+	unsigned long end_pfn = zone->zone_start_pfn + zone->spanned_pages;
 
 	ret = compaction_suitable(zone, cc->order);
 	switch (ret) {
@@ -766,18 +945,30 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		;
 	}
 
-	/* Setup to move all movable pages to the end of the zone */
-	cc->migrate_pfn = zone->zone_start_pfn;
-
-	if (cc->order > 0) {
-		/* Incremental compaction. Start where the last one stopped. */
-		cc->free_pfn = zone->compact_cached_free_pfn;
-		cc->start_free_pfn = cc->free_pfn;
-	} else {
-		/* Order == -1 starts at the end of the zone. */
-		cc->free_pfn = start_free_pfn(zone);
+	/*
+	 * Setup to move all movable pages to the end of the zone. Used cached
+	 * information on where the scanners should start but check that it
+	 * is initialised by ensuring the values are within zone boundaries.
+	 */
+	cc->migrate_pfn = zone->compact_cached_migrate_pfn;
+	cc->free_pfn = zone->compact_cached_free_pfn;
+	if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) {
+		cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1);
+		zone->compact_cached_free_pfn = cc->free_pfn;
+	}
+	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) {
+		cc->migrate_pfn = start_pfn;
+		zone->compact_cached_migrate_pfn = cc->migrate_pfn;
 	}
 
+	/*
+	 * Clear pageblock skip if there were failures recently and compaction
+	 * is about to be retried after being deferred. kswapd does not do
+	 * this reset as it'll reset the cached information when going to sleep.
+	 */
+	if (compaction_restarting(zone, cc->order) && !current_is_kswapd())
+		__reset_isolation_suitable(zone);
+
 	migrate_prep_local();
 
 	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
@@ -787,6 +978,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		switch (isolate_migratepages(zone, cc)) {
 		case ISOLATE_ABORT:
 			ret = COMPACT_PARTIAL;
+			putback_lru_pages(&cc->migratepages);
+			cc->nr_migratepages = 0;
 			goto out;
 		case ISOLATE_NONE:
 			continue;
@@ -817,6 +1010,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				goto out;
 			}
 		}
+
+		/* Capture a page now if it is a suitable size */
+		compact_capture_page(cc);
 	}
 
 out:
@@ -829,8 +1025,10 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone,
 					 int order, gfp_t gfp_mask,
-					 bool sync, bool *contended)
+					 bool sync, bool *contended,
+					 struct page **page)
 {
+	unsigned long ret;
 	struct compact_control cc = {
 		.nr_freepages = 0,
 		.nr_migratepages = 0,
@@ -838,12 +1036,18 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
 		.sync = sync,
-		.contended = contended,
+		.page = page,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
 
-	return compact_zone(zone, &cc);
+	ret = compact_zone(zone, &cc);
+
+	VM_BUG_ON(!list_empty(&cc.freepages));
+	VM_BUG_ON(!list_empty(&cc.migratepages));
+
+	*contended = cc.contended;
+	return ret;
 }
 
 int sysctl_extfrag_threshold = 500;
@@ -855,12 +1059,14 @@ int sysctl_extfrag_threshold = 500;
  * @gfp_mask: The GFP mask of the current allocation
  * @nodemask: The allowed nodes to allocate from
  * @sync: Whether migration is synchronous or not
+ * @contended: Return value that is true if compaction was aborted due to lock contention
+ * @page: Optionally capture a free page of the requested order during compaction
  *
  * This is the main entry point for direct page compaction.
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *nodemask,
-			bool sync, bool *contended)
+			bool sync, bool *contended, struct page **page)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
@@ -868,28 +1074,30 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	struct zoneref *z;
 	struct zone *zone;
 	int rc = COMPACT_SKIPPED;
+	int alloc_flags = 0;
 
-	/*
-	 * Check whether it is worth even starting compaction. The order check is
-	 * made because an assumption is made that the page allocator can satisfy
-	 * the "cheaper" orders without taking special steps
-	 */
+	/* Check if the GFP flags allow compaction */
 	if (!order || !may_enter_fs || !may_perform_io)
 		return rc;
 
 	count_vm_event(COMPACTSTALL);
 
+#ifdef CONFIG_CMA
+	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+		alloc_flags |= ALLOC_CMA;
+#endif
 	/* Compact each zone in the list */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
 								nodemask) {
 		int status;
 
 		status = compact_zone_order(zone, order, gfp_mask, sync,
-						contended);
+						contended, page);
 		rc = max(status, rc);
 
 		/* If a normal allocation would succeed, stop compacting */
-		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
+		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
+				      alloc_flags))
 			break;
 	}
 
@@ -940,6 +1148,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
 	struct compact_control cc = {
 		.order = order,
 		.sync = false,
+		.page = NULL,
 	};
 
 	return __compact_pgdat(pgdat, &cc);
@@ -950,6 +1159,7 @@ static int compact_node(int nid)
 	struct compact_control cc = {
 		.order = -1,
 		.sync = true,
+		.page = NULL,
 	};
 
 	return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/filemap.c b/mm/filemap.c
index 384344575c37..83efee76a5c0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1607,13 +1607,13 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * Do we have something in the page cache already?
 	 */
 	page = find_get_page(mapping, offset);
-	if (likely(page)) {
+	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
 		/*
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
 		do_async_mmap_readahead(vma, ra, file, page, offset);
-	} else {
+	} else if (!page) {
 		/* No page in the page cache at all */
 		do_sync_mmap_readahead(vma, ra, file, offset);
 		count_vm_event(PGMAJFAULT);
@@ -1737,6 +1737,7 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
 const struct vm_operations_struct generic_file_vm_ops = {
 	.fault		= filemap_fault,
 	.page_mkwrite	= filemap_page_mkwrite,
+	.remap_pages	= generic_file_remap_pages,
 };
 
 /* This is used for a general mmap of a disk file */
@@ -1749,7 +1750,6 @@ int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
 		return -ENOEXEC;
 	file_accessed(file);
 	vma->vm_ops = &generic_file_vm_ops;
-	vma->vm_flags |= VM_CAN_NONLINEAR;
 	return 0;
 }
 
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 13e013b1270c..a912da6ddfd4 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -167,7 +167,6 @@ __xip_unmap (struct address_space * mapping,
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm;
-	struct prio_tree_iter iter;
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
@@ -184,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
 
 retry:
 	mutex_lock(&mapping->i_mmap_mutex);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		mm = vma->vm_mm;
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -193,11 +192,13 @@ retry:
 		if (pte) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
-			pteval = ptep_clear_flush_notify(vma, address, pte);
+			pteval = ptep_clear_flush(vma, address, pte);
 			page_remove_rmap(page);
 			dec_mm_counter(mm, MM_FILEPAGES);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
+			/* must invalidate_page _before_ freeing the page */
+			mmu_notifier_invalidate_page(mm, address);
 			page_cache_release(page);
 		}
 	}
@@ -305,6 +306,7 @@ out:
 static const struct vm_operations_struct xip_file_vm_ops = {
 	.fault	= xip_file_fault,
 	.page_mkwrite	= filemap_page_mkwrite,
+	.remap_pages = generic_file_remap_pages,
 };
 
 int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -313,7 +315,7 @@ int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
 
 	file_accessed(file);
 	vma->vm_ops = &xip_file_vm_ops;
-	vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP;
+	vma->vm_flags |= VM_MIXEDMAP;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xip_file_mmap);
diff --git a/mm/fremap.c b/mm/fremap.c
index 048659c0c03d..3899a86851ce 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,6 +5,7 @@
  *
  * started by Ingo Molnar, Copyright (C) 2002, 2003
  */
+#include <linux/export.h>
 #include <linux/backing-dev.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
@@ -80,9 +81,10 @@ out:
 	return err;
 }
 
-static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long addr, unsigned long size, pgoff_t pgoff)
+int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long size, pgoff_t pgoff)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	int err;
 
 	do {
@@ -95,9 +97,9 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
 		pgoff++;
 	} while (size);
 
 	return 0;
-
 }
+EXPORT_SYMBOL(generic_file_remap_pages);
 
 /**
  * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
@@ -167,7 +169,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	if (vma->vm_private_data && !(vma->vm_flags & VM_NONLINEAR))
 		goto out;
 
-	if (!(vma->vm_flags & VM_CAN_NONLINEAR))
+	if (!vma->vm_ops->remap_pages)
 		goto out;
 
 	if (start < vma->vm_start || start + size > vma->vm_end)
@@ -212,7 +214,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 		mutex_lock(&mapping->i_mmap_mutex);
 		flush_dcache_mmap_lock(mapping);
 		vma->vm_flags |= VM_NONLINEAR;
-		vma_prio_tree_remove(vma, &mapping->i_mmap);
+		vma_interval_tree_remove(vma, &mapping->i_mmap);
 		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 		flush_dcache_mmap_unlock(mapping);
 		mutex_unlock(&mapping->i_mmap_mutex);
@@ -228,7 +230,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	}
 
 	mmu_notifier_invalidate_range_start(mm, start, start + size);
-	err = populate_range(mm, vma, start, size, pgoff);
+	err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
 	mmu_notifier_invalidate_range_end(mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
 		if (vma->vm_flags & VM_LOCKED) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 141dbb695097..a863af26c79c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -102,10 +102,7 @@ static int set_recommended_min_free_kbytes(void)
 	unsigned long recommended_min;
 	extern int min_free_kbytes;
 
-	if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
-		      &transparent_hugepage_flags) &&
-	    !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
-		      &transparent_hugepage_flags))
+	if (!khugepaged_enabled())
 		return 0;
 
 	for_each_populated_zone(zone)
@@ -139,12 +136,6 @@ static int start_khugepaged(void)
 {
 	int err = 0;
 	if (khugepaged_enabled()) {
-		int wakeup;
-		if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
-			err = -ENOMEM;
-			goto out;
-		}
-		mutex_lock(&khugepaged_mutex);
 		if (!khugepaged_thread)
 			khugepaged_thread = kthread_run(khugepaged, NULL,
 							"khugepaged");
@@ -154,16 +145,16 @@ static int start_khugepaged(void)
 			err = PTR_ERR(khugepaged_thread);
 			khugepaged_thread = NULL;
 		}
-		wakeup = !list_empty(&khugepaged_scan.mm_head);
-		mutex_unlock(&khugepaged_mutex);
-		if (wakeup)
+
+		if (!list_empty(&khugepaged_scan.mm_head))
 			wake_up_interruptible(&khugepaged_wait);
 
 		set_recommended_min_free_kbytes();
-	} else
-		/* wakeup to exit */
-		wake_up_interruptible(&khugepaged_wait);
-out:
+	} else if (khugepaged_thread) {
+		kthread_stop(khugepaged_thread);
+		khugepaged_thread = NULL;
+	}
+
 	return err;
 }
 
@@ -224,18 +215,16 @@ static ssize_t enabled_store(struct kobject *kobj,
 			 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
 
 	if (ret > 0) {
-		int err = start_khugepaged();
+		int err;
+
+		mutex_lock(&khugepaged_mutex);
+		err = start_khugepaged();
+		mutex_unlock(&khugepaged_mutex);
+
 		if (err)
 			ret = err;
 	}
 
-	if (ret > 0 &&
-	    (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
-		      &transparent_hugepage_flags) ||
-	     test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
-		      &transparent_hugepage_flags)))
-		set_recommended_min_free_kbytes();
-
 	return ret;
 }
 static struct kobj_attribute enabled_attr =
@@ -570,8 +559,6 @@ static int __init hugepage_init(void)
 
 	start_khugepaged();
 
-	set_recommended_min_free_kbytes();
-
 	return 0;
 out:
 	hugepage_exit_sysfs(hugepage_kobj);
@@ -611,19 +598,6 @@ out:
 }
 __setup("transparent_hugepage=", setup_transparent_hugepage);
 
-static void prepare_pmd_huge_pte(pgtable_t pgtable,
-				 struct mm_struct *mm)
-{
-	assert_spin_locked(&mm->page_table_lock);
-
-	/* FIFO */
-	if (!mm->pmd_huge_pte)
-		INIT_LIST_HEAD(&pgtable->lru);
-	else
-		list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
-	mm->pmd_huge_pte = pgtable;
-}
-
 static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
 	if (likely(vma->vm_flags & VM_WRITE))
@@ -665,7 +639,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 	 */
 	page_add_new_anon_rmap(page, vma, haddr);
 	set_pmd_at(mm, haddr, pmd, entry);
-	prepare_pmd_huge_pte(pgtable, mm);
+	pgtable_trans_huge_deposit(mm, pgtable);
 	add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
 	mm->nr_ptes++;
 	spin_unlock(&mm->page_table_lock);
@@ -791,7 +765,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pmdp_set_wrprotect(src_mm, addr, src_pmd);
 	pmd = pmd_mkold(pmd_wrprotect(pmd));
 	set_pmd_at(dst_mm, addr, dst_pmd, pmd);
-	prepare_pmd_huge_pte(pgtable, dst_mm);
+	pgtable_trans_huge_deposit(dst_mm, pgtable);
 	dst_mm->nr_ptes++;
 
 	ret = 0;
@@ -802,25 +776,6 @@ out:
 	return ret;
 }
 
-/* no "address" argument so destroys page coloring of some arch */
-pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
-{
-	pgtable_t pgtable;
-
-	assert_spin_locked(&mm->page_table_lock);
-
-	/* FIFO */
-	pgtable = mm->pmd_huge_pte;
-	if (list_empty(&pgtable->lru))
-		mm->pmd_huge_pte = NULL;
-	else {
-		mm->pmd_huge_pte = list_entry(pgtable->lru.next,
-					      struct page, lru);
-		list_del(&pgtable->lru);
-	}
-	return pgtable;
-}
-
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long address,
@@ -832,6 +787,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 	pmd_t _pmd;
 	int ret = 0, i;
 	struct page **pages;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;	/* For mmu_notifiers */
 
 	pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
 			GFP_KERNEL);
@@ -868,15 +825,19 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
868 cond_resched(); 825 cond_resched();
869 } 826 }
870 827
828 mmun_start = haddr;
829 mmun_end = haddr + HPAGE_PMD_SIZE;
830 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
831
871 spin_lock(&mm->page_table_lock); 832 spin_lock(&mm->page_table_lock);
872 if (unlikely(!pmd_same(*pmd, orig_pmd))) 833 if (unlikely(!pmd_same(*pmd, orig_pmd)))
873 goto out_free_pages; 834 goto out_free_pages;
874 VM_BUG_ON(!PageHead(page)); 835 VM_BUG_ON(!PageHead(page));
875 836
876 pmdp_clear_flush_notify(vma, haddr, pmd); 837 pmdp_clear_flush(vma, haddr, pmd);
877 /* leave pmd empty until pte is filled */ 838 /* leave pmd empty until pte is filled */
878 839
879 pgtable = get_pmd_huge_pte(mm); 840 pgtable = pgtable_trans_huge_withdraw(mm);
880 pmd_populate(mm, &_pmd, pgtable); 841 pmd_populate(mm, &_pmd, pgtable);
881 842
882 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { 843 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
@@ -896,6 +857,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
896 page_remove_rmap(page); 857 page_remove_rmap(page);
897 spin_unlock(&mm->page_table_lock); 858 spin_unlock(&mm->page_table_lock);
898 859
860 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
861
899 ret |= VM_FAULT_WRITE; 862 ret |= VM_FAULT_WRITE;
900 put_page(page); 863 put_page(page);
901 864
@@ -904,6 +867,7 @@ out:
904 867
905out_free_pages: 868out_free_pages:
906 spin_unlock(&mm->page_table_lock); 869 spin_unlock(&mm->page_table_lock);
870 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
907 mem_cgroup_uncharge_start(); 871 mem_cgroup_uncharge_start();
908 for (i = 0; i < HPAGE_PMD_NR; i++) { 872 for (i = 0; i < HPAGE_PMD_NR; i++) {
909 mem_cgroup_uncharge_page(pages[i]); 873 mem_cgroup_uncharge_page(pages[i]);
@@ -920,6 +884,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
920 int ret = 0; 884 int ret = 0;
921 struct page *page, *new_page; 885 struct page *page, *new_page;
922 unsigned long haddr; 886 unsigned long haddr;
887 unsigned long mmun_start; /* For mmu_notifiers */
888 unsigned long mmun_end; /* For mmu_notifiers */
923 889
924 VM_BUG_ON(!vma->anon_vma); 890 VM_BUG_ON(!vma->anon_vma);
925 spin_lock(&mm->page_table_lock); 891 spin_lock(&mm->page_table_lock);
@@ -934,7 +900,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
934 entry = pmd_mkyoung(orig_pmd); 900 entry = pmd_mkyoung(orig_pmd);
935 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 901 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
936 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) 902 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
937 update_mmu_cache(vma, address, entry); 903 update_mmu_cache_pmd(vma, address, pmd);
938 ret |= VM_FAULT_WRITE; 904 ret |= VM_FAULT_WRITE;
939 goto out_unlock; 905 goto out_unlock;
940 } 906 }
@@ -970,38 +936,47 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
970 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); 936 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
971 __SetPageUptodate(new_page); 937 __SetPageUptodate(new_page);
972 938
939 mmun_start = haddr;
940 mmun_end = haddr + HPAGE_PMD_SIZE;
941 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
942
973 spin_lock(&mm->page_table_lock); 943 spin_lock(&mm->page_table_lock);
974 put_page(page); 944 put_page(page);
975 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 945 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
976 spin_unlock(&mm->page_table_lock); 946 spin_unlock(&mm->page_table_lock);
977 mem_cgroup_uncharge_page(new_page); 947 mem_cgroup_uncharge_page(new_page);
978 put_page(new_page); 948 put_page(new_page);
979 goto out; 949 goto out_mn;
980 } else { 950 } else {
981 pmd_t entry; 951 pmd_t entry;
982 VM_BUG_ON(!PageHead(page)); 952 VM_BUG_ON(!PageHead(page));
983 entry = mk_pmd(new_page, vma->vm_page_prot); 953 entry = mk_pmd(new_page, vma->vm_page_prot);
984 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); 954 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
985 entry = pmd_mkhuge(entry); 955 entry = pmd_mkhuge(entry);
986 pmdp_clear_flush_notify(vma, haddr, pmd); 956 pmdp_clear_flush(vma, haddr, pmd);
987 page_add_new_anon_rmap(new_page, vma, haddr); 957 page_add_new_anon_rmap(new_page, vma, haddr);
988 set_pmd_at(mm, haddr, pmd, entry); 958 set_pmd_at(mm, haddr, pmd, entry);
989 update_mmu_cache(vma, address, entry); 959 update_mmu_cache_pmd(vma, address, pmd);
990 page_remove_rmap(page); 960 page_remove_rmap(page);
991 put_page(page); 961 put_page(page);
992 ret |= VM_FAULT_WRITE; 962 ret |= VM_FAULT_WRITE;
993 } 963 }
994out_unlock:
995 spin_unlock(&mm->page_table_lock); 964 spin_unlock(&mm->page_table_lock);
965out_mn:
966 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
996out: 967out:
997 return ret; 968 return ret;
969out_unlock:
970 spin_unlock(&mm->page_table_lock);
971 return ret;
998} 972}
999 973
1000struct page *follow_trans_huge_pmd(struct mm_struct *mm, 974struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1001 unsigned long addr, 975 unsigned long addr,
1002 pmd_t *pmd, 976 pmd_t *pmd,
1003 unsigned int flags) 977 unsigned int flags)
1004{ 978{
979 struct mm_struct *mm = vma->vm_mm;
1005 struct page *page = NULL; 980 struct page *page = NULL;
1006 981
1007 assert_spin_locked(&mm->page_table_lock); 982 assert_spin_locked(&mm->page_table_lock);
@@ -1024,6 +999,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
1024 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); 999 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
1025 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); 1000 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
1026 } 1001 }
1002 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1003 if (page->mapping && trylock_page(page)) {
1004 lru_add_drain();
1005 if (page->mapping)
1006 mlock_vma_page(page);
1007 unlock_page(page);
1008 }
1009 }
1027 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; 1010 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1028 VM_BUG_ON(!PageCompound(page)); 1011 VM_BUG_ON(!PageCompound(page));
1029 if (flags & FOLL_GET) 1012 if (flags & FOLL_GET)
@@ -1041,9 +1024,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1041 if (__pmd_trans_huge_lock(pmd, vma) == 1) { 1024 if (__pmd_trans_huge_lock(pmd, vma) == 1) {
1042 struct page *page; 1025 struct page *page;
1043 pgtable_t pgtable; 1026 pgtable_t pgtable;
1044 pgtable = get_pmd_huge_pte(tlb->mm); 1027 pmd_t orig_pmd;
1045 page = pmd_page(*pmd); 1028 pgtable = pgtable_trans_huge_withdraw(tlb->mm);
1046 pmd_clear(pmd); 1029 orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd);
1030 page = pmd_page(orig_pmd);
1047 tlb_remove_pmd_tlb_entry(tlb, pmd, addr); 1031 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1048 page_remove_rmap(page); 1032 page_remove_rmap(page);
1049 VM_BUG_ON(page_mapcount(page) < 0); 1033 VM_BUG_ON(page_mapcount(page) < 0);
@@ -1207,7 +1191,11 @@ static int __split_huge_page_splitting(struct page *page,
1207 struct mm_struct *mm = vma->vm_mm; 1191 struct mm_struct *mm = vma->vm_mm;
1208 pmd_t *pmd; 1192 pmd_t *pmd;
1209 int ret = 0; 1193 int ret = 0;
1194 /* For mmu_notifiers */
1195 const unsigned long mmun_start = address;
1196 const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
1210 1197
1198 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1211 spin_lock(&mm->page_table_lock); 1199 spin_lock(&mm->page_table_lock);
1212 pmd = page_check_address_pmd(page, mm, address, 1200 pmd = page_check_address_pmd(page, mm, address,
1213 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); 1201 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
@@ -1219,10 +1207,11 @@ static int __split_huge_page_splitting(struct page *page,
1219 * and it won't wait on the anon_vma->root->mutex to 1207 * and it won't wait on the anon_vma->root->mutex to
1220 * serialize against split_huge_page*. 1208 * serialize against split_huge_page*.
1221 */ 1209 */
1222 pmdp_splitting_flush_notify(vma, address, pmd); 1210 pmdp_splitting_flush(vma, address, pmd);
1223 ret = 1; 1211 ret = 1;
1224 } 1212 }
1225 spin_unlock(&mm->page_table_lock); 1213 spin_unlock(&mm->page_table_lock);
1214 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1226 1215
1227 return ret; 1216 return ret;
1228} 1217}
@@ -1358,11 +1347,11 @@ static int __split_huge_page_map(struct page *page,
1358 pmd = page_check_address_pmd(page, mm, address, 1347 pmd = page_check_address_pmd(page, mm, address,
1359 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); 1348 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1360 if (pmd) { 1349 if (pmd) {
1361 pgtable = get_pmd_huge_pte(mm); 1350 pgtable = pgtable_trans_huge_withdraw(mm);
1362 pmd_populate(mm, &_pmd, pgtable); 1351 pmd_populate(mm, &_pmd, pgtable);
1363 1352
1364 for (i = 0, haddr = address; i < HPAGE_PMD_NR; 1353 haddr = address;
1365 i++, haddr += PAGE_SIZE) { 1354 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1366 pte_t *pte, entry; 1355 pte_t *pte, entry;
1367 BUG_ON(PageCompound(page+i)); 1356 BUG_ON(PageCompound(page+i));
1368 entry = mk_pte(page + i, vma->vm_page_prot); 1357 entry = mk_pte(page + i, vma->vm_page_prot);
@@ -1406,8 +1395,7 @@ static int __split_huge_page_map(struct page *page,
1406 * SMP TLB and finally we write the non-huge version 1395 * SMP TLB and finally we write the non-huge version
1407 * of the pmd entry with pmd_populate. 1396 * of the pmd entry with pmd_populate.
1408 */ 1397 */
1409 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); 1398 pmdp_invalidate(vma, address, pmd);
1410 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1411 pmd_populate(mm, pmd, pgtable); 1399 pmd_populate(mm, pmd, pgtable);
1412 ret = 1; 1400 ret = 1;
1413 } 1401 }
@@ -1421,18 +1409,17 @@ static void __split_huge_page(struct page *page,
1421 struct anon_vma *anon_vma) 1409 struct anon_vma *anon_vma)
1422{ 1410{
1423 int mapcount, mapcount2; 1411 int mapcount, mapcount2;
1412 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1424 struct anon_vma_chain *avc; 1413 struct anon_vma_chain *avc;
1425 1414
1426 BUG_ON(!PageHead(page)); 1415 BUG_ON(!PageHead(page));
1427 BUG_ON(PageTail(page)); 1416 BUG_ON(PageTail(page));
1428 1417
1429 mapcount = 0; 1418 mapcount = 0;
1430 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1419 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1431 struct vm_area_struct *vma = avc->vma; 1420 struct vm_area_struct *vma = avc->vma;
1432 unsigned long addr = vma_address(page, vma); 1421 unsigned long addr = vma_address(page, vma);
1433 BUG_ON(is_vma_temporary_stack(vma)); 1422 BUG_ON(is_vma_temporary_stack(vma));
1434 if (addr == -EFAULT)
1435 continue;
1436 mapcount += __split_huge_page_splitting(page, vma, addr); 1423 mapcount += __split_huge_page_splitting(page, vma, addr);
1437 } 1424 }
1438 /* 1425 /*
@@ -1453,12 +1440,10 @@ static void __split_huge_page(struct page *page,
1453 __split_huge_page_refcount(page); 1440 __split_huge_page_refcount(page);
1454 1441
1455 mapcount2 = 0; 1442 mapcount2 = 0;
1456 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1443 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1457 struct vm_area_struct *vma = avc->vma; 1444 struct vm_area_struct *vma = avc->vma;
1458 unsigned long addr = vma_address(page, vma); 1445 unsigned long addr = vma_address(page, vma);
1459 BUG_ON(is_vma_temporary_stack(vma)); 1446 BUG_ON(is_vma_temporary_stack(vma));
1460 if (addr == -EFAULT)
1461 continue;
1462 mapcount2 += __split_huge_page_map(page, vma, addr); 1447 mapcount2 += __split_huge_page_map(page, vma, addr);
1463 } 1448 }
1464 if (mapcount != mapcount2) 1449 if (mapcount != mapcount2)
@@ -1491,12 +1476,13 @@ out:
1491 return ret; 1476 return ret;
1492} 1477}
1493 1478
1494#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \ 1479#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1495 VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1496 1480
1497int hugepage_madvise(struct vm_area_struct *vma, 1481int hugepage_madvise(struct vm_area_struct *vma,
1498 unsigned long *vm_flags, int advice) 1482 unsigned long *vm_flags, int advice)
1499{ 1483{
1484 struct mm_struct *mm = vma->vm_mm;
1485
1500 switch (advice) { 1486 switch (advice) {
1501 case MADV_HUGEPAGE: 1487 case MADV_HUGEPAGE:
1502 /* 1488 /*
@@ -1504,6 +1490,8 @@ int hugepage_madvise(struct vm_area_struct *vma,
1504 */ 1490 */
1505 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP)) 1491 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1506 return -EINVAL; 1492 return -EINVAL;
1493 if (mm->def_flags & VM_NOHUGEPAGE)
1494 return -EINVAL;
1507 *vm_flags &= ~VM_NOHUGEPAGE; 1495 *vm_flags &= ~VM_NOHUGEPAGE;
1508 *vm_flags |= VM_HUGEPAGE; 1496 *vm_flags |= VM_HUGEPAGE;
1509 /* 1497 /*
@@ -1655,11 +1643,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1655 if (vma->vm_ops) 1643 if (vma->vm_ops)
1656 /* khugepaged not yet working on file or special mappings */ 1644 /* khugepaged not yet working on file or special mappings */
1657 return 0; 1645 return 0;
1658 /* 1646 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
1659 * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1660 * true too, verify it here.
1661 */
1662 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1663 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 1647 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1664 hend = vma->vm_end & HPAGE_PMD_MASK; 1648 hend = vma->vm_end & HPAGE_PMD_MASK;
1665 if (hstart < hend) 1649 if (hstart < hend)
@@ -1833,28 +1817,35 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1833 } 1817 }
1834} 1818}
1835 1819
1836static void collapse_huge_page(struct mm_struct *mm, 1820static void khugepaged_alloc_sleep(void)
1837 unsigned long address,
1838 struct page **hpage,
1839 struct vm_area_struct *vma,
1840 int node)
1841{ 1821{
1842 pgd_t *pgd; 1822 wait_event_freezable_timeout(khugepaged_wait, false,
1843 pud_t *pud; 1823 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
1844 pmd_t *pmd, _pmd; 1824}
1845 pte_t *pte;
1846 pgtable_t pgtable;
1847 struct page *new_page;
1848 spinlock_t *ptl;
1849 int isolated;
1850 unsigned long hstart, hend;
1851 1825
1852 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 1826#ifdef CONFIG_NUMA
1853#ifndef CONFIG_NUMA 1827static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
1854 up_read(&mm->mmap_sem); 1828{
1855 VM_BUG_ON(!*hpage); 1829 if (IS_ERR(*hpage)) {
1856 new_page = *hpage; 1830 if (!*wait)
1857#else 1831 return false;
1832
1833 *wait = false;
1834 *hpage = NULL;
1835 khugepaged_alloc_sleep();
1836 } else if (*hpage) {
1837 put_page(*hpage);
1838 *hpage = NULL;
1839 }
1840
1841 return true;
1842}
1843
1844static struct page
1845*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
1846 struct vm_area_struct *vma, unsigned long address,
1847 int node)
1848{
1858 VM_BUG_ON(*hpage); 1849 VM_BUG_ON(*hpage);
1859 /* 1850 /*
1860 * Allocate the page while the vma is still valid and under 1851 * Allocate the page while the vma is still valid and under
@@ -1866,7 +1857,7 @@ static void collapse_huge_page(struct mm_struct *mm,
1866 * mmap_sem in read mode is good idea also to allow greater 1857 * mmap_sem in read mode is good idea also to allow greater
1867 * scalability. 1858 * scalability.
1868 */ 1859 */
1869 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, 1860 *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1870 node, __GFP_OTHER_NODE); 1861 node, __GFP_OTHER_NODE);
1871 1862
1872 /* 1863 /*
@@ -1874,20 +1865,85 @@ static void collapse_huge_page(struct mm_struct *mm,
1874 * preparation for taking it in write mode. 1865 * preparation for taking it in write mode.
1875 */ 1866 */
1876 up_read(&mm->mmap_sem); 1867 up_read(&mm->mmap_sem);
1877 if (unlikely(!new_page)) { 1868 if (unlikely(!*hpage)) {
1878 count_vm_event(THP_COLLAPSE_ALLOC_FAILED); 1869 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1879 *hpage = ERR_PTR(-ENOMEM); 1870 *hpage = ERR_PTR(-ENOMEM);
1880 return; 1871 return NULL;
1881 } 1872 }
1882#endif
1883 1873
1884 count_vm_event(THP_COLLAPSE_ALLOC); 1874 count_vm_event(THP_COLLAPSE_ALLOC);
1885 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 1875 return *hpage;
1886#ifdef CONFIG_NUMA 1876}
1887 put_page(new_page); 1877#else
1878static struct page *khugepaged_alloc_hugepage(bool *wait)
1879{
1880 struct page *hpage;
1881
1882 do {
1883 hpage = alloc_hugepage(khugepaged_defrag());
1884 if (!hpage) {
1885 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1886 if (!*wait)
1887 return NULL;
1888
1889 *wait = false;
1890 khugepaged_alloc_sleep();
1891 } else
1892 count_vm_event(THP_COLLAPSE_ALLOC);
1893 } while (unlikely(!hpage) && likely(khugepaged_enabled()));
1894
1895 return hpage;
1896}
1897
1898static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
1899{
1900 if (!*hpage)
1901 *hpage = khugepaged_alloc_hugepage(wait);
1902
1903 if (unlikely(!*hpage))
1904 return false;
1905
1906 return true;
1907}
1908
1909static struct page
1910*khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm,
1911 struct vm_area_struct *vma, unsigned long address,
1912 int node)
1913{
1914 up_read(&mm->mmap_sem);
1915 VM_BUG_ON(!*hpage);
1916 return *hpage;
1917}
1888#endif 1918#endif
1919
1920static void collapse_huge_page(struct mm_struct *mm,
1921 unsigned long address,
1922 struct page **hpage,
1923 struct vm_area_struct *vma,
1924 int node)
1925{
1926 pgd_t *pgd;
1927 pud_t *pud;
1928 pmd_t *pmd, _pmd;
1929 pte_t *pte;
1930 pgtable_t pgtable;
1931 struct page *new_page;
1932 spinlock_t *ptl;
1933 int isolated;
1934 unsigned long hstart, hend;
1935 unsigned long mmun_start; /* For mmu_notifiers */
1936 unsigned long mmun_end; /* For mmu_notifiers */
1937
1938 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1939
1940 /* release the mmap_sem read lock. */
1941 new_page = khugepaged_alloc_page(hpage, mm, vma, address, node);
1942 if (!new_page)
1943 return;
1944
1945 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
1889 return; 1946 return;
1890 }
1891 1947
1892 /* 1948 /*
1893 * Prevent all access to pagetables with the exception of 1949 * Prevent all access to pagetables with the exception of
@@ -1912,11 +1968,7 @@ static void collapse_huge_page(struct mm_struct *mm,
1912 goto out; 1968 goto out;
1913 if (is_vma_temporary_stack(vma)) 1969 if (is_vma_temporary_stack(vma))
1914 goto out; 1970 goto out;
1915 /* 1971 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
1916 * If is_pfn_mapping() is true is_learn_pfn_mapping() must be
1917 * true too, verify it here.
1918 */
1919 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1920 1972
1921 pgd = pgd_offset(mm, address); 1973 pgd = pgd_offset(mm, address);
1922 if (!pgd_present(*pgd)) 1974 if (!pgd_present(*pgd))
@@ -1936,6 +1988,9 @@ static void collapse_huge_page(struct mm_struct *mm,
1936 pte = pte_offset_map(pmd, address); 1988 pte = pte_offset_map(pmd, address);
1937 ptl = pte_lockptr(mm, pmd); 1989 ptl = pte_lockptr(mm, pmd);
1938 1990
1991 mmun_start = address;
1992 mmun_end = address + HPAGE_PMD_SIZE;
1993 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1939 spin_lock(&mm->page_table_lock); /* probably unnecessary */ 1994 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1940 /* 1995 /*
1941 * After this gup_fast can't run anymore. This also removes 1996 * After this gup_fast can't run anymore. This also removes
@@ -1943,8 +1998,9 @@ static void collapse_huge_page(struct mm_struct *mm,
1943 * huge and small TLB entries for the same virtual address 1998 * huge and small TLB entries for the same virtual address
1944 * to avoid the risk of CPU bugs in that area. 1999 * to avoid the risk of CPU bugs in that area.
1945 */ 2000 */
1946 _pmd = pmdp_clear_flush_notify(vma, address, pmd); 2001 _pmd = pmdp_clear_flush(vma, address, pmd);
1947 spin_unlock(&mm->page_table_lock); 2002 spin_unlock(&mm->page_table_lock);
2003 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1948 2004
1949 spin_lock(ptl); 2005 spin_lock(ptl);
1950 isolated = __collapse_huge_page_isolate(vma, address, pte); 2006 isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1970,8 +2026,6 @@ static void collapse_huge_page(struct mm_struct *mm,
1970 pte_unmap(pte); 2026 pte_unmap(pte);
1971 __SetPageUptodate(new_page); 2027 __SetPageUptodate(new_page);
1972 pgtable = pmd_pgtable(_pmd); 2028 pgtable = pmd_pgtable(_pmd);
1973 VM_BUG_ON(page_count(pgtable) != 1);
1974 VM_BUG_ON(page_mapcount(pgtable) != 0);
1975 2029
1976 _pmd = mk_pmd(new_page, vma->vm_page_prot); 2030 _pmd = mk_pmd(new_page, vma->vm_page_prot);
1977 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); 2031 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
@@ -1988,13 +2042,12 @@ static void collapse_huge_page(struct mm_struct *mm,
1988 BUG_ON(!pmd_none(*pmd)); 2042 BUG_ON(!pmd_none(*pmd));
1989 page_add_new_anon_rmap(new_page, vma, address); 2043 page_add_new_anon_rmap(new_page, vma, address);
1990 set_pmd_at(mm, address, pmd, _pmd); 2044 set_pmd_at(mm, address, pmd, _pmd);
1991 update_mmu_cache(vma, address, _pmd); 2045 update_mmu_cache_pmd(vma, address, pmd);
1992 prepare_pmd_huge_pte(pgtable, mm); 2046 pgtable_trans_huge_deposit(mm, pgtable);
1993 spin_unlock(&mm->page_table_lock); 2047 spin_unlock(&mm->page_table_lock);
1994 2048
1995#ifndef CONFIG_NUMA
1996 *hpage = NULL; 2049 *hpage = NULL;
1997#endif 2050
1998 khugepaged_pages_collapsed++; 2051 khugepaged_pages_collapsed++;
1999out_up_write: 2052out_up_write:
2000 up_write(&mm->mmap_sem); 2053 up_write(&mm->mmap_sem);
@@ -2002,9 +2055,6 @@ out_up_write:
2002 2055
2003out: 2056out:
2004 mem_cgroup_uncharge_page(new_page); 2057 mem_cgroup_uncharge_page(new_page);
2005#ifdef CONFIG_NUMA
2006 put_page(new_page);
2007#endif
2008 goto out_up_write; 2058 goto out_up_write;
2009} 2059}
2010 2060
@@ -2154,12 +2204,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2154 goto skip; 2204 goto skip;
2155 if (is_vma_temporary_stack(vma)) 2205 if (is_vma_temporary_stack(vma))
2156 goto skip; 2206 goto skip;
2157 /* 2207 VM_BUG_ON(vma->vm_flags & VM_NO_THP);
2158 * If is_pfn_mapping() is true is_learn_pfn_mapping()
2159 * must be true too, verify it here.
2160 */
2161 VM_BUG_ON(is_linear_pfn_mapping(vma) ||
2162 vma->vm_flags & VM_NO_THP);
2163 2208
2164 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; 2209 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2165 hend = vma->vm_end & HPAGE_PMD_MASK; 2210 hend = vma->vm_end & HPAGE_PMD_MASK;
@@ -2234,32 +2279,23 @@ static int khugepaged_has_work(void)
2234static int khugepaged_wait_event(void) 2279static int khugepaged_wait_event(void)
2235{ 2280{
2236 return !list_empty(&khugepaged_scan.mm_head) || 2281 return !list_empty(&khugepaged_scan.mm_head) ||
2237 !khugepaged_enabled(); 2282 kthread_should_stop();
2238} 2283}
2239 2284
2240static void khugepaged_do_scan(struct page **hpage) 2285static void khugepaged_do_scan(void)
2241{ 2286{
2287 struct page *hpage = NULL;
2242 unsigned int progress = 0, pass_through_head = 0; 2288 unsigned int progress = 0, pass_through_head = 0;
2243 unsigned int pages = khugepaged_pages_to_scan; 2289 unsigned int pages = khugepaged_pages_to_scan;
2290 bool wait = true;
2244 2291
2245 barrier(); /* write khugepaged_pages_to_scan to local stack */ 2292 barrier(); /* write khugepaged_pages_to_scan to local stack */
2246 2293
2247 while (progress < pages) { 2294 while (progress < pages) {
2248 cond_resched(); 2295 if (!khugepaged_prealloc_page(&hpage, &wait))
2249
2250#ifndef CONFIG_NUMA
2251 if (!*hpage) {
2252 *hpage = alloc_hugepage(khugepaged_defrag());
2253 if (unlikely(!*hpage)) {
2254 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2255 break;
2256 }
2257 count_vm_event(THP_COLLAPSE_ALLOC);
2258 }
2259#else
2260 if (IS_ERR(*hpage))
2261 break; 2296 break;
2262#endif 2297
2298 cond_resched();
2263 2299
2264 if (unlikely(kthread_should_stop() || freezing(current))) 2300 if (unlikely(kthread_should_stop() || freezing(current)))
2265 break; 2301 break;
@@ -2270,73 +2306,32 @@ static void khugepaged_do_scan(struct page **hpage)
2270 if (khugepaged_has_work() && 2306 if (khugepaged_has_work() &&
2271 pass_through_head < 2) 2307 pass_through_head < 2)
2272 progress += khugepaged_scan_mm_slot(pages - progress, 2308 progress += khugepaged_scan_mm_slot(pages - progress,
2273 hpage); 2309 &hpage);
2274 else 2310 else
2275 progress = pages; 2311 progress = pages;
2276 spin_unlock(&khugepaged_mm_lock); 2312 spin_unlock(&khugepaged_mm_lock);
2277 } 2313 }
2278}
2279 2314
2280static void khugepaged_alloc_sleep(void) 2315 if (!IS_ERR_OR_NULL(hpage))
2281{ 2316 put_page(hpage);
2282 wait_event_freezable_timeout(khugepaged_wait, false,
2283 msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
2284} 2317}
2285 2318
2286#ifndef CONFIG_NUMA 2319static void khugepaged_wait_work(void)
2287static struct page *khugepaged_alloc_hugepage(void)
2288{ 2320{
2289 struct page *hpage; 2321 try_to_freeze();
2290
2291 do {
2292 hpage = alloc_hugepage(khugepaged_defrag());
2293 if (!hpage) {
2294 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2295 khugepaged_alloc_sleep();
2296 } else
2297 count_vm_event(THP_COLLAPSE_ALLOC);
2298 } while (unlikely(!hpage) &&
2299 likely(khugepaged_enabled()));
2300 return hpage;
2301}
2302#endif
2303 2322
2304static void khugepaged_loop(void) 2323 if (khugepaged_has_work()) {
2305{ 2324 if (!khugepaged_scan_sleep_millisecs)
2306 struct page *hpage; 2325 return;
2307 2326
2308#ifdef CONFIG_NUMA 2327 wait_event_freezable_timeout(khugepaged_wait,
2309 hpage = NULL; 2328 kthread_should_stop(),
2310#endif 2329 msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2311 while (likely(khugepaged_enabled())) { 2330 return;
2312#ifndef CONFIG_NUMA
2313 hpage = khugepaged_alloc_hugepage();
2314 if (unlikely(!hpage))
2315 break;
2316#else
2317 if (IS_ERR(hpage)) {
2318 khugepaged_alloc_sleep();
2319 hpage = NULL;
2320 }
2321#endif
2322
2323 khugepaged_do_scan(&hpage);
2324#ifndef CONFIG_NUMA
2325 if (hpage)
2326 put_page(hpage);
2327#endif
2328 try_to_freeze();
2329 if (unlikely(kthread_should_stop()))
2330 break;
2331 if (khugepaged_has_work()) {
2332 if (!khugepaged_scan_sleep_millisecs)
2333 continue;
2334 wait_event_freezable_timeout(khugepaged_wait, false,
2335 msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
2336 } else if (khugepaged_enabled())
2337 wait_event_freezable(khugepaged_wait,
2338 khugepaged_wait_event());
2339 } 2331 }
2332
2333 if (khugepaged_enabled())
2334 wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
2340} 2335}
2341 2336
2342static int khugepaged(void *none) 2337static int khugepaged(void *none)
@@ -2346,20 +2341,9 @@ static int khugepaged(void *none)
2346 set_freezable(); 2341 set_freezable();
2347 set_user_nice(current, 19); 2342 set_user_nice(current, 19);
2348 2343
2349 /* serialize with start_khugepaged() */ 2344 while (!kthread_should_stop()) {
2350 mutex_lock(&khugepaged_mutex); 2345 khugepaged_do_scan();
2351 2346 khugepaged_wait_work();
2352 for (;;) {
2353 mutex_unlock(&khugepaged_mutex);
2354 VM_BUG_ON(khugepaged_thread != current);
2355 khugepaged_loop();
2356 VM_BUG_ON(khugepaged_thread != current);
2357
2358 mutex_lock(&khugepaged_mutex);
2359 if (!khugepaged_enabled())
2360 break;
2361 if (unlikely(kthread_should_stop()))
2362 break;
2363 } 2347 }
2364 2348
2365 spin_lock(&khugepaged_mm_lock); 2349 spin_lock(&khugepaged_mm_lock);
@@ -2368,10 +2352,6 @@ static int khugepaged(void *none)
2368 if (mm_slot) 2352 if (mm_slot)
2369 collect_mm_slot(mm_slot); 2353 collect_mm_slot(mm_slot);
2370 spin_unlock(&khugepaged_mm_lock); 2354 spin_unlock(&khugepaged_mm_lock);
2371
2372 khugepaged_thread = NULL;
2373 mutex_unlock(&khugepaged_mutex);
2374
2375 return 0; 2355 return 0;
2376} 2356}
2377 2357
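Editor's note on the huge_memory.c hunks above: the open-coded prepare_pmd_huge_pte()/get_pmd_huge_pte() pair is replaced by the generic pgtable_trans_huge_deposit()/pgtable_trans_huge_withdraw() helpers. As a rough userspace analogue of that interface only (not the kernel implementation; struct names and fields here are invented for illustration), the pattern is a per-mm stash where a preallocated page table is deposited when the huge PMD is installed and withdrawn again when the PMD is zapped or split:

#include <stdio.h>

/* Stand-in for a deposited page table; purely illustrative names. */
struct pgtable_stub {
        struct pgtable_stub *next;
        int id;
};

/* Stand-in for the per-mm stash (mm->pmd_huge_pte in the kernel). */
struct mm_stub {
        struct pgtable_stub *head;
        struct pgtable_stub *tail;
};

/* Deposit: stash a preallocated table at huge-PMD install time. */
static void deposit(struct mm_stub *mm, struct pgtable_stub *pt)
{
        pt->next = NULL;
        if (mm->tail)
                mm->tail->next = pt;
        else
                mm->head = pt;
        mm->tail = pt;
}

/* Withdraw: take the oldest stashed table back (FIFO), NULL if none. */
static struct pgtable_stub *withdraw(struct mm_stub *mm)
{
        struct pgtable_stub *pt = mm->head;

        if (pt) {
                mm->head = pt->next;
                if (!mm->head)
                        mm->tail = NULL;
        }
        return pt;
}

int main(void)
{
        struct mm_stub mm = { NULL, NULL };
        struct pgtable_stub a = { NULL, 1 }, b = { NULL, 2 };
        struct pgtable_stub *pt;

        deposit(&mm, &a);
        deposit(&mm, &b);
        while ((pt = withdraw(&mm)))
                printf("withdrew table %d\n", pt->id);
        return 0;
}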
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc727122dd44..59a0059b39e2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,7 +30,6 @@
30#include <linux/hugetlb.h> 30#include <linux/hugetlb.h>
31#include <linux/hugetlb_cgroup.h> 31#include <linux/hugetlb_cgroup.h>
32#include <linux/node.h> 32#include <linux/node.h>
33#include <linux/hugetlb_cgroup.h>
34#include "internal.h" 33#include "internal.h"
35 34
36const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 35const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -637,6 +636,7 @@ static void free_huge_page(struct page *page)
637 h->surplus_huge_pages--; 636 h->surplus_huge_pages--;
638 h->surplus_huge_pages_node[nid]--; 637 h->surplus_huge_pages_node[nid]--;
639 } else { 638 } else {
639 arch_clear_hugepage_flags(page);
640 enqueue_huge_page(h, page); 640 enqueue_huge_page(h, page);
641 } 641 }
642 spin_unlock(&hugetlb_lock); 642 spin_unlock(&hugetlb_lock);
@@ -671,6 +671,11 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
671 } 671 }
672} 672}
673 673
674/*
675 * PageHuge() only returns true for hugetlbfs pages, but not for normal or
676 * transparent huge pages. See the PageTransHuge() documentation for more
677 * details.
678 */
674int PageHuge(struct page *page) 679int PageHuge(struct page *page)
675{ 680{
676 compound_page_dtor *dtor; 681 compound_page_dtor *dtor;
@@ -2355,13 +2360,15 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2355 struct page *page; 2360 struct page *page;
2356 struct hstate *h = hstate_vma(vma); 2361 struct hstate *h = hstate_vma(vma);
2357 unsigned long sz = huge_page_size(h); 2362 unsigned long sz = huge_page_size(h);
2363 const unsigned long mmun_start = start; /* For mmu_notifiers */
2364 const unsigned long mmun_end = end; /* For mmu_notifiers */
2358 2365
2359 WARN_ON(!is_vm_hugetlb_page(vma)); 2366 WARN_ON(!is_vm_hugetlb_page(vma));
2360 BUG_ON(start & ~huge_page_mask(h)); 2367 BUG_ON(start & ~huge_page_mask(h));
2361 BUG_ON(end & ~huge_page_mask(h)); 2368 BUG_ON(end & ~huge_page_mask(h));
2362 2369
2363 tlb_start_vma(tlb, vma); 2370 tlb_start_vma(tlb, vma);
2364 mmu_notifier_invalidate_range_start(mm, start, end); 2371 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2365again: 2372again:
2366 spin_lock(&mm->page_table_lock); 2373 spin_lock(&mm->page_table_lock);
2367 for (address = start; address < end; address += sz) { 2374 for (address = start; address < end; address += sz) {
@@ -2425,7 +2432,7 @@ again:
2425 if (address < end && !ref_page) 2432 if (address < end && !ref_page)
2426 goto again; 2433 goto again;
2427 } 2434 }
2428 mmu_notifier_invalidate_range_end(mm, start, end); 2435 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2429 tlb_end_vma(tlb, vma); 2436 tlb_end_vma(tlb, vma);
2430} 2437}
2431 2438
@@ -2473,7 +2480,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2473 struct hstate *h = hstate_vma(vma); 2480 struct hstate *h = hstate_vma(vma);
2474 struct vm_area_struct *iter_vma; 2481 struct vm_area_struct *iter_vma;
2475 struct address_space *mapping; 2482 struct address_space *mapping;
2476 struct prio_tree_iter iter;
2477 pgoff_t pgoff; 2483 pgoff_t pgoff;
2478 2484
2479 /* 2485 /*
@@ -2481,7 +2487,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2481 * from page cache lookup which is in HPAGE_SIZE units. 2487 * from page cache lookup which is in HPAGE_SIZE units.
2482 */ 2488 */
2483 address = address & huge_page_mask(h); 2489 address = address & huge_page_mask(h);
2484 pgoff = vma_hugecache_offset(h, vma, address); 2490 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) +
2491 vma->vm_pgoff;
2485 mapping = vma->vm_file->f_dentry->d_inode->i_mapping; 2492 mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
2486 2493
2487 /* 2494 /*
@@ -2490,7 +2497,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2490 * __unmap_hugepage_range() is called as the lock is already held 2497 * __unmap_hugepage_range() is called as the lock is already held
2491 */ 2498 */
2492 mutex_lock(&mapping->i_mmap_mutex); 2499 mutex_lock(&mapping->i_mmap_mutex);
2493 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 2500 vma_interval_tree_foreach(iter_vma, &mapping->i_mmap, pgoff, pgoff) {
2494 /* Do not unmap the current VMA */ 2501 /* Do not unmap the current VMA */
2495 if (iter_vma == vma) 2502 if (iter_vma == vma)
2496 continue; 2503 continue;
@@ -2525,6 +2532,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2525 struct page *old_page, *new_page; 2532 struct page *old_page, *new_page;
2526 int avoidcopy; 2533 int avoidcopy;
2527 int outside_reserve = 0; 2534 int outside_reserve = 0;
2535 unsigned long mmun_start; /* For mmu_notifiers */
2536 unsigned long mmun_end; /* For mmu_notifiers */
2528 2537
2529 old_page = pte_page(pte); 2538 old_page = pte_page(pte);
2530 2539
@@ -2611,6 +2620,9 @@ retry_avoidcopy:
2611 pages_per_huge_page(h)); 2620 pages_per_huge_page(h));
2612 __SetPageUptodate(new_page); 2621 __SetPageUptodate(new_page);
2613 2622
2623 mmun_start = address & huge_page_mask(h);
2624 mmun_end = mmun_start + huge_page_size(h);
2625 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2614 /* 2626 /*
2615 * Retake the page_table_lock to check for racing updates 2627 * Retake the page_table_lock to check for racing updates
2616 * before the page tables are altered 2628 * before the page tables are altered
@@ -2619,9 +2631,6 @@ retry_avoidcopy:
2619 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2631 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2620 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2632 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2621 /* Break COW */ 2633 /* Break COW */
2622 mmu_notifier_invalidate_range_start(mm,
2623 address & huge_page_mask(h),
2624 (address & huge_page_mask(h)) + huge_page_size(h));
2625 huge_ptep_clear_flush(vma, address, ptep); 2634 huge_ptep_clear_flush(vma, address, ptep);
2626 set_huge_pte_at(mm, address, ptep, 2635 set_huge_pte_at(mm, address, ptep,
2627 make_huge_pte(vma, new_page, 1)); 2636 make_huge_pte(vma, new_page, 1));
@@ -2629,10 +2638,11 @@ retry_avoidcopy:
2629 hugepage_add_new_anon_rmap(new_page, vma, address); 2638 hugepage_add_new_anon_rmap(new_page, vma, address);
2630 /* Make the old page be freed below */ 2639 /* Make the old page be freed below */
2631 new_page = old_page; 2640 new_page = old_page;
2632 mmu_notifier_invalidate_range_end(mm,
2633 address & huge_page_mask(h),
2634 (address & huge_page_mask(h)) + huge_page_size(h));
2635 } 2641 }
2642 spin_unlock(&mm->page_table_lock);
2643 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2644 /* Caller expects lock to be held */
2645 spin_lock(&mm->page_table_lock);
2636 page_cache_release(new_page); 2646 page_cache_release(new_page);
2637 page_cache_release(old_page); 2647 page_cache_release(old_page);
2638 return 0; 2648 return 0;
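Editor's note on the unmap_ref_private() hunk above: the page offset is now open-coded in base-page (PAGE_SIZE) units, ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff, so it can be used directly as a key into the new vma_interval_tree. A minimal userspace check of that arithmetic, with invented addresses and a 4 KiB page size assumed:

#include <stdio.h>

#define PAGE_SHIFT 12UL   /* 4 KiB base pages assumed for this example */

int main(void)
{
        /* Hypothetical VMA: maps file offset page 512 at 0x700000000000. */
        unsigned long vm_start = 0x700000000000UL;
        unsigned long vm_pgoff = 512;
        unsigned long address  = vm_start + (3UL << 21); /* third 2 MiB page */

        unsigned long pgoff = ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff;

        printf("page offset into the mapping: %lu\n", pgoff);
        return 0;
}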
diff --git a/mm/internal.h b/mm/internal.h
index b8c91b342e24..a4fa284f6bc2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,26 +118,27 @@ struct compact_control {
118 unsigned long nr_freepages; /* Number of isolated free pages */ 118 unsigned long nr_freepages; /* Number of isolated free pages */
119 unsigned long nr_migratepages; /* Number of pages to migrate */ 119 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */ 120 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long start_free_pfn; /* where we started the search */
122 unsigned long migrate_pfn; /* isolate_migratepages search base */ 121 unsigned long migrate_pfn; /* isolate_migratepages search base */
123 bool sync; /* Synchronous migration */ 122 bool sync; /* Synchronous migration */
124 bool wrapped; /* Order > 0 compactions are 123 bool ignore_skip_hint; /* Scan blocks even if marked skip */
125 incremental, once free_pfn 124 bool finished_update_free; /* True when the zone cached pfns are
126 and migrate_pfn meet, we restart 125 * no longer being updated
127 from the top of the zone; 126 */
128 remember we wrapped around. */ 127 bool finished_update_migrate;
129 128
130 int order; /* order a direct compactor needs */ 129 int order; /* order a direct compactor needs */
131 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 130 int migratetype; /* MOVABLE, RECLAIMABLE etc */
132 struct zone *zone; 131 struct zone *zone;
133 bool *contended; /* True if a lock was contended */ 132 bool contended; /* True if a lock was contended */
133 struct page **page; /* Page captured of requested size */
134}; 134};
135 135
136unsigned long 136unsigned long
137isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); 137isolate_freepages_range(struct compact_control *cc,
138 unsigned long start_pfn, unsigned long end_pfn);
138unsigned long 139unsigned long
139isolate_migratepages_range(struct zone *zone, struct compact_control *cc, 140isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
140 unsigned long low_pfn, unsigned long end_pfn); 141 unsigned long low_pfn, unsigned long end_pfn, bool unevictable);
141 142
142#endif 143#endif
143 144
@@ -167,9 +168,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
167} 168}
168 169
169/* 170/*
170 * Called only in fault path via page_evictable() for a new page 171 * Called only in fault path, to determine if a new page is being
171 * to determine if it's being mapped into a LOCKED vma. 172 * mapped into a LOCKED vma. If it is, mark page as mlocked.
172 * If so, mark page as mlocked.
173 */ 173 */
174static inline int mlocked_vma_newpage(struct vm_area_struct *vma, 174static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
175 struct page *page) 175 struct page *page)
@@ -180,7 +180,8 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
180 return 0; 180 return 0;
181 181
182 if (!TestSetPageMlocked(page)) { 182 if (!TestSetPageMlocked(page)) {
183 inc_zone_page_state(page, NR_MLOCK); 183 mod_zone_page_state(page_zone(page), NR_MLOCK,
184 hpage_nr_pages(page));
184 count_vm_event(UNEVICTABLE_PGMLOCKED); 185 count_vm_event(UNEVICTABLE_PGMLOCKED);
185 } 186 }
186 return 1; 187 return 1;
@@ -201,12 +202,7 @@ extern void munlock_vma_page(struct page *page);
201 * If called for a page that is still mapped by mlocked vmas, all we do 202 * If called for a page that is still mapped by mlocked vmas, all we do
202 * is revert to lazy LRU behaviour -- semantics are not broken. 203 * is revert to lazy LRU behaviour -- semantics are not broken.
203 */ 204 */
204extern void __clear_page_mlock(struct page *page); 205extern void clear_page_mlock(struct page *page);
205static inline void clear_page_mlock(struct page *page)
206{
207 if (unlikely(TestClearPageMlocked(page)))
208 __clear_page_mlock(page);
209}
210 206
211/* 207/*
212 * mlock_migrate_page - called only from migrate_page_copy() to 208 * mlock_migrate_page - called only from migrate_page_copy() to
@@ -340,7 +336,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
340#define ZONE_RECLAIM_FULL -1 336#define ZONE_RECLAIM_FULL -1
341#define ZONE_RECLAIM_SOME 0 337#define ZONE_RECLAIM_SOME 0
342#define ZONE_RECLAIM_SUCCESS 1 338#define ZONE_RECLAIM_SUCCESS 1
343#endif
344 339
345extern int hwpoison_filter(struct page *p); 340extern int hwpoison_filter(struct page *p);
346 341
@@ -356,3 +351,20 @@ extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
356 unsigned long, unsigned long); 351 unsigned long, unsigned long);
357 352
358extern void set_pageblock_order(void); 353extern void set_pageblock_order(void);
354unsigned long reclaim_clean_pages_from_list(struct zone *zone,
355 struct list_head *page_list);
356/* The ALLOC_WMARK bits are used as an index to zone->watermark */
357#define ALLOC_WMARK_MIN WMARK_MIN
358#define ALLOC_WMARK_LOW WMARK_LOW
359#define ALLOC_WMARK_HIGH WMARK_HIGH
360#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
361
362/* Mask to get the watermark bits */
363#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
364
365#define ALLOC_HARDER 0x10 /* try to alloc harder */
366#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
367#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
368#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
369
370#endif /* __MM_INTERNAL_H */
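Editor's note on the ALLOC_* definitions moved into mm/internal.h above: the low bits of alloc_flags index zone->watermark[], ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS - 1) extracts that index, and the higher bits are independent modifiers. A standalone sketch of how the bits combine; the WMARK_* indices are assumed to be 0, 1, 2 as in the zone watermark enum:

#include <stdio.h>

#define WMARK_MIN              0
#define WMARK_LOW              1
#define WMARK_HIGH             2

#define ALLOC_WMARK_MIN        WMARK_MIN
#define ALLOC_WMARK_LOW        WMARK_LOW
#define ALLOC_WMARK_HIGH       WMARK_HIGH
#define ALLOC_NO_WATERMARKS    0x04
#define ALLOC_WMARK_MASK       (ALLOC_NO_WATERMARKS - 1)
#define ALLOC_HARDER           0x10
#define ALLOC_HIGH             0x20
#define ALLOC_CPUSET           0x40
#define ALLOC_CMA              0x80

int main(void)
{
        int alloc_flags = ALLOC_WMARK_LOW | ALLOC_CPUSET | ALLOC_CMA;

        /* The low two bits select the watermark; the rest are modifiers. */
        printf("watermark index: %d\n", alloc_flags & ALLOC_WMARK_MASK);
        printf("checks watermarks: %s\n",
               (alloc_flags & ALLOC_NO_WATERMARKS) ? "no" : "yes");
        return 0;
}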
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
new file mode 100644
index 000000000000..4a5822a586e6
--- /dev/null
+++ b/mm/interval_tree.c
@@ -0,0 +1,112 @@
1/*
2 * mm/interval_tree.c - interval tree for mapping->i_mmap
3 *
4 * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
5 *
6 * This file is released under the GPL v2.
7 */
8
9#include <linux/mm.h>
10#include <linux/fs.h>
11#include <linux/rmap.h>
12#include <linux/interval_tree_generic.h>
13
14static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
15{
16 return v->vm_pgoff;
17}
18
19static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
20{
21 return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
22}
23
24INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
25 unsigned long, shared.linear.rb_subtree_last,
26 vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
27
28/* Insert node immediately after prev in the interval tree */
29void vma_interval_tree_insert_after(struct vm_area_struct *node,
30 struct vm_area_struct *prev,
31 struct rb_root *root)
32{
33 struct rb_node **link;
34 struct vm_area_struct *parent;
35 unsigned long last = vma_last_pgoff(node);
36
37 VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
38
39 if (!prev->shared.linear.rb.rb_right) {
40 parent = prev;
41 link = &prev->shared.linear.rb.rb_right;
42 } else {
43 parent = rb_entry(prev->shared.linear.rb.rb_right,
44 struct vm_area_struct, shared.linear.rb);
45 if (parent->shared.linear.rb_subtree_last < last)
46 parent->shared.linear.rb_subtree_last = last;
47 while (parent->shared.linear.rb.rb_left) {
48 parent = rb_entry(parent->shared.linear.rb.rb_left,
49 struct vm_area_struct, shared.linear.rb);
50 if (parent->shared.linear.rb_subtree_last < last)
51 parent->shared.linear.rb_subtree_last = last;
52 }
53 link = &parent->shared.linear.rb.rb_left;
54 }
55
56 node->shared.linear.rb_subtree_last = last;
57 rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
58 rb_insert_augmented(&node->shared.linear.rb, root,
59 &vma_interval_tree_augment);
60}
61
62static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
63{
64 return vma_start_pgoff(avc->vma);
65}
66
67static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
68{
69 return vma_last_pgoff(avc->vma);
70}
71
72INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
73 avc_start_pgoff, avc_last_pgoff,
74 static inline, __anon_vma_interval_tree)
75
76void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
77 struct rb_root *root)
78{
79#ifdef CONFIG_DEBUG_VM_RB
80 node->cached_vma_start = avc_start_pgoff(node);
81 node->cached_vma_last = avc_last_pgoff(node);
82#endif
83 __anon_vma_interval_tree_insert(node, root);
84}
85
86void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
87 struct rb_root *root)
88{
89 __anon_vma_interval_tree_remove(node, root);
90}
91
92struct anon_vma_chain *
93anon_vma_interval_tree_iter_first(struct rb_root *root,
94 unsigned long first, unsigned long last)
95{
96 return __anon_vma_interval_tree_iter_first(root, first, last);
97}
98
99struct anon_vma_chain *
100anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
101 unsigned long first, unsigned long last)
102{
103 return __anon_vma_interval_tree_iter_next(node, first, last);
104}
105
106#ifdef CONFIG_DEBUG_VM_RB
107void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
108{
109 WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
110 WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
111}
112#endif
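Editor's note on the new mm/interval_tree.c above: VMAs are keyed by a closed pgoff interval, [vm_pgoff, vm_pgoff + pages - 1], and the *_foreach iterators return every VMA whose interval overlaps the query. A userspace restatement of those two key functions and the overlap condition, with an invented struct and a 4 KiB page size assumed:

#include <stdio.h>

#define PAGE_SHIFT 12UL   /* 4 KiB pages assumed for this example */

/* Cut-down stand-in for the fields vma_interval_tree keys on. */
struct vma_stub {
        unsigned long vm_start, vm_end;  /* byte addresses */
        unsigned long vm_pgoff;          /* file offset, in pages */
};

static unsigned long vma_start_pgoff(const struct vma_stub *v)
{
        return v->vm_pgoff;
}

static unsigned long vma_last_pgoff(const struct vma_stub *v)
{
        return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
}

/* Closed-interval overlap test: what the iterators implement via the tree. */
static int vma_overlaps(const struct vma_stub *v,
                        unsigned long first, unsigned long last)
{
        return vma_start_pgoff(v) <= last && first <= vma_last_pgoff(v);
}

int main(void)
{
        struct vma_stub v = { 0x10000, 0x14000, 8 };  /* 4 pages at pgoff 8 */

        printf("last pgoff: %lu\n", vma_last_pgoff(&v));      /* prints 11 */
        printf("overlaps [10,20]: %d\n", vma_overlaps(&v, 10, 20)); /* 1 */
        printf("overlaps [12,20]: %d\n", vma_overlaps(&v, 12, 20)); /* 0 */
        return 0;
}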
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 0de83b4541e9..a217cc544060 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -29,7 +29,7 @@
29 * - kmemleak_lock (rwlock): protects the object_list modifications and 29 * - kmemleak_lock (rwlock): protects the object_list modifications and
30 * accesses to the object_tree_root. The object_list is the main list 30 * accesses to the object_tree_root. The object_list is the main list
31 * holding the metadata (struct kmemleak_object) for the allocated memory 31 * holding the metadata (struct kmemleak_object) for the allocated memory
32 * blocks. The object_tree_root is a priority search tree used to look-up 32 * blocks. The object_tree_root is a red black tree used to look-up
33 * metadata based on a pointer to the corresponding memory block. The 33 * metadata based on a pointer to the corresponding memory block. The
34 * kmemleak_object structures are added to the object_list and 34 * kmemleak_object structures are added to the object_list and
35 * object_tree_root in the create_object() function called from the 35 * object_tree_root in the create_object() function called from the
@@ -71,7 +71,7 @@
71#include <linux/delay.h> 71#include <linux/delay.h>
72#include <linux/export.h> 72#include <linux/export.h>
73#include <linux/kthread.h> 73#include <linux/kthread.h>
74#include <linux/prio_tree.h> 74#include <linux/rbtree.h>
75#include <linux/fs.h> 75#include <linux/fs.h>
76#include <linux/debugfs.h> 76#include <linux/debugfs.h>
77#include <linux/seq_file.h> 77#include <linux/seq_file.h>
@@ -132,7 +132,7 @@ struct kmemleak_scan_area {
132 * Structure holding the metadata for each allocated memory block. 132 * Structure holding the metadata for each allocated memory block.
133 * Modifications to such objects should be made while holding the 133 * Modifications to such objects should be made while holding the
134 * object->lock. Insertions or deletions from object_list, gray_list or 134 * object->lock. Insertions or deletions from object_list, gray_list or
135 * tree_node are already protected by the corresponding locks or mutex (see 135 * rb_node are already protected by the corresponding locks or mutex (see
136 * the notes on locking above). These objects are reference-counted 136 * the notes on locking above). These objects are reference-counted
137 * (use_count) and freed using the RCU mechanism. 137 * (use_count) and freed using the RCU mechanism.
138 */ 138 */
@@ -141,7 +141,7 @@ struct kmemleak_object {
141 unsigned long flags; /* object status flags */ 141 unsigned long flags; /* object status flags */
142 struct list_head object_list; 142 struct list_head object_list;
143 struct list_head gray_list; 143 struct list_head gray_list;
144 struct prio_tree_node tree_node; 144 struct rb_node rb_node;
145 struct rcu_head rcu; /* object_list lockless traversal */ 145 struct rcu_head rcu; /* object_list lockless traversal */
146 /* object usage count; object freed when use_count == 0 */ 146 /* object usage count; object freed when use_count == 0 */
147 atomic_t use_count; 147 atomic_t use_count;
@@ -182,9 +182,9 @@ struct kmemleak_object {
182static LIST_HEAD(object_list); 182static LIST_HEAD(object_list);
183/* the list of gray-colored objects (see color_gray comment below) */ 183/* the list of gray-colored objects (see color_gray comment below) */
184static LIST_HEAD(gray_list); 184static LIST_HEAD(gray_list);
185/* prio search tree for object boundaries */ 185/* search tree for object boundaries */
186static struct prio_tree_root object_tree_root; 186static struct rb_root object_tree_root = RB_ROOT;
187/* rw_lock protecting the access to object_list and prio_tree_root */ 187/* rw_lock protecting the access to object_list and object_tree_root */
188static DEFINE_RWLOCK(kmemleak_lock); 188static DEFINE_RWLOCK(kmemleak_lock);
189 189
190/* allocation caches for kmemleak internal data */ 190/* allocation caches for kmemleak internal data */
@@ -380,7 +380,7 @@ static void dump_object_info(struct kmemleak_object *object)
380 trace.entries = object->trace; 380 trace.entries = object->trace;
381 381
382 pr_notice("Object 0x%08lx (size %zu):\n", 382 pr_notice("Object 0x%08lx (size %zu):\n",
383 object->tree_node.start, object->size); 383 object->pointer, object->size);
384 pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", 384 pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
385 object->comm, object->pid, object->jiffies); 385 object->comm, object->pid, object->jiffies);
386 pr_notice(" min_count = %d\n", object->min_count); 386 pr_notice(" min_count = %d\n", object->min_count);
@@ -392,32 +392,32 @@ static void dump_object_info(struct kmemleak_object *object)
392} 392}
393 393
394/* 394/*
395 * Look-up a memory block metadata (kmemleak_object) in the priority search 395 * Look-up a memory block metadata (kmemleak_object) in the object search
396 * tree based on a pointer value. If alias is 0, only values pointing to the 396 * tree based on a pointer value. If alias is 0, only values pointing to the
397 * beginning of the memory block are allowed. The kmemleak_lock must be held 397 * beginning of the memory block are allowed. The kmemleak_lock must be held
398 * when calling this function. 398 * when calling this function.
399 */ 399 */
400static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) 400static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
401{ 401{
402 struct prio_tree_node *node; 402 struct rb_node *rb = object_tree_root.rb_node;
403 struct prio_tree_iter iter; 403
404 struct kmemleak_object *object; 404 while (rb) {
405 405 struct kmemleak_object *object =
406 prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); 406 rb_entry(rb, struct kmemleak_object, rb_node);
407 node = prio_tree_next(&iter); 407 if (ptr < object->pointer)
408 if (node) { 408 rb = object->rb_node.rb_left;
409 object = prio_tree_entry(node, struct kmemleak_object, 409 else if (object->pointer + object->size <= ptr)
410 tree_node); 410 rb = object->rb_node.rb_right;
411 if (!alias && object->pointer != ptr) { 411 else if (object->pointer == ptr || alias)
412 return object;
413 else {
412 kmemleak_warn("Found object by alias at 0x%08lx\n", 414 kmemleak_warn("Found object by alias at 0x%08lx\n",
413 ptr); 415 ptr);
414 dump_object_info(object); 416 dump_object_info(object);
415 object = NULL; 417 break;
416 } 418 }
417 } else 419 }
418 object = NULL; 420 return NULL;
419
420 return object;
421} 421}
422 422
423/* 423/*
@@ -471,7 +471,7 @@ static void put_object(struct kmemleak_object *object)
471} 471}
472 472
473/* 473/*
474 * Look up an object in the prio search tree and increase its use_count. 474 * Look up an object in the object search tree and increase its use_count.
475 */ 475 */
476static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) 476static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
477{ 477{
@@ -516,8 +516,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
516 int min_count, gfp_t gfp) 516 int min_count, gfp_t gfp)
517{ 517{
518 unsigned long flags; 518 unsigned long flags;
519 struct kmemleak_object *object; 519 struct kmemleak_object *object, *parent;
520 struct prio_tree_node *node; 520 struct rb_node **link, *rb_parent;
521 521
522 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); 522 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
523 if (!object) { 523 if (!object) {
@@ -560,31 +560,34 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
560 /* kernel backtrace */ 560 /* kernel backtrace */
561 object->trace_len = __save_stack_trace(object->trace); 561 object->trace_len = __save_stack_trace(object->trace);
562 562
563 INIT_PRIO_TREE_NODE(&object->tree_node);
564 object->tree_node.start = ptr;
565 object->tree_node.last = ptr + size - 1;
566
567 write_lock_irqsave(&kmemleak_lock, flags); 563 write_lock_irqsave(&kmemleak_lock, flags);
568 564
569 min_addr = min(min_addr, ptr); 565 min_addr = min(min_addr, ptr);
570 max_addr = max(max_addr, ptr + size); 566 max_addr = max(max_addr, ptr + size);
571 node = prio_tree_insert(&object_tree_root, &object->tree_node); 567 link = &object_tree_root.rb_node;
572 /* 568 rb_parent = NULL;
573 * The code calling the kernel does not yet have the pointer to the 569 while (*link) {
574 * memory block to be able to free it. However, we still hold the 570 rb_parent = *link;
575 * kmemleak_lock here in case parts of the kernel started freeing 571 parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
576 * random memory blocks. 572 if (ptr + size <= parent->pointer)
577 */ 573 link = &parent->rb_node.rb_left;
578 if (node != &object->tree_node) { 574 else if (parent->pointer + parent->size <= ptr)
579 kmemleak_stop("Cannot insert 0x%lx into the object search tree " 575 link = &parent->rb_node.rb_right;
580 "(already existing)\n", ptr); 576 else {
581 object = lookup_object(ptr, 1); 577 kmemleak_stop("Cannot insert 0x%lx into the object "
582 spin_lock(&object->lock); 578 "search tree (overlaps existing)\n",
583 dump_object_info(object); 579 ptr);
584 spin_unlock(&object->lock); 580 kmem_cache_free(object_cache, object);
585 581 object = parent;
586 goto out; 582 spin_lock(&object->lock);
583 dump_object_info(object);
584 spin_unlock(&object->lock);
585 goto out;
586 }
587 } 587 }
588 rb_link_node(&object->rb_node, rb_parent, link);
589 rb_insert_color(&object->rb_node, &object_tree_root);
590
588 list_add_tail_rcu(&object->object_list, &object_list); 591 list_add_tail_rcu(&object->object_list, &object_list);
589out: 592out:
590 write_unlock_irqrestore(&kmemleak_lock, flags); 593 write_unlock_irqrestore(&kmemleak_lock, flags);
@@ -600,7 +603,7 @@ static void __delete_object(struct kmemleak_object *object)
600 unsigned long flags; 603 unsigned long flags;
601 604
602 write_lock_irqsave(&kmemleak_lock, flags); 605 write_lock_irqsave(&kmemleak_lock, flags);
603 prio_tree_remove(&object_tree_root, &object->tree_node); 606 rb_erase(&object->rb_node, &object_tree_root);
604 list_del_rcu(&object->object_list); 607 list_del_rcu(&object->object_list);
605 write_unlock_irqrestore(&kmemleak_lock, flags); 608 write_unlock_irqrestore(&kmemleak_lock, flags);
606 609
@@ -1766,7 +1769,6 @@ void __init kmemleak_init(void)
1766 1769
1767 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); 1770 object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
1768 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); 1771 scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
1769 INIT_PRIO_TREE_ROOT(&object_tree_root);
1770 1772
1771 if (crt_early_log >= ARRAY_SIZE(early_log)) 1773 if (crt_early_log >= ARRAY_SIZE(early_log))
1772 pr_warning("Early log buffer exceeded (%d), please increase " 1774 pr_warning("Early log buffer exceeded (%d), please increase "
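Editor's note on the kmemleak.c hunks above: the prio_tree lookup is replaced by a walk of an address-ordered tree of non-overlapping [pointer, pointer + size) blocks. The kernel uses a red-black tree (struct rb_node); the sketch below reproduces only the comparison discipline of the new lookup_object(), using a plain unbalanced binary search tree and invented names:

#include <stdio.h>
#include <stdlib.h>

struct block {
        unsigned long start;
        size_t size;
        struct block *left, *right;
};

static struct block *insert(struct block *root, unsigned long start, size_t size)
{
        if (!root) {
                struct block *b = calloc(1, sizeof(*b));

                if (!b)
                        exit(1);
                b->start = start;
                b->size = size;
                return b;
        }
        if (start + size <= root->start)
                root->left = insert(root->left, start, size);
        else if (root->start + root->size <= start)
                root->right = insert(root->right, start, size);
        /* overlapping inserts are silently ignored in this sketch */
        return root;
}

/* Find the block containing ptr; with alias == 0 only the block start matches. */
static struct block *lookup(struct block *root, unsigned long ptr, int alias)
{
        while (root) {
                if (ptr < root->start)
                        root = root->left;
                else if (root->start + root->size <= ptr)
                        root = root->right;
                else
                        return (alias || root->start == ptr) ? root : NULL;
        }
        return NULL;
}

int main(void)
{
        struct block *root = NULL;

        root = insert(root, 0x1000, 0x100);
        root = insert(root, 0x2000, 0x200);

        printf("0x2080 -> %p\n", (void *)lookup(root, 0x2080, 1));
        printf("0x2080 (no alias) -> %p\n", (void *)lookup(root, 0x2080, 0));
        return 0;
}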
diff --git a/mm/ksm.c b/mm/ksm.c
index 47c885368890..ae539f0b8aa1 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -709,15 +709,22 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
709 spinlock_t *ptl; 709 spinlock_t *ptl;
710 int swapped; 710 int swapped;
711 int err = -EFAULT; 711 int err = -EFAULT;
712 unsigned long mmun_start; /* For mmu_notifiers */
713 unsigned long mmun_end; /* For mmu_notifiers */
712 714
713 addr = page_address_in_vma(page, vma); 715 addr = page_address_in_vma(page, vma);
714 if (addr == -EFAULT) 716 if (addr == -EFAULT)
715 goto out; 717 goto out;
716 718
717 BUG_ON(PageTransCompound(page)); 719 BUG_ON(PageTransCompound(page));
720
721 mmun_start = addr;
722 mmun_end = addr + PAGE_SIZE;
723 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
724
718 ptep = page_check_address(page, mm, addr, &ptl, 0); 725 ptep = page_check_address(page, mm, addr, &ptl, 0);
719 if (!ptep) 726 if (!ptep)
720 goto out; 727 goto out_mn;
721 728
722 if (pte_write(*ptep) || pte_dirty(*ptep)) { 729 if (pte_write(*ptep) || pte_dirty(*ptep)) {
723 pte_t entry; 730 pte_t entry;
@@ -752,6 +759,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
752 759
753out_unlock: 760out_unlock:
754 pte_unmap_unlock(ptep, ptl); 761 pte_unmap_unlock(ptep, ptl);
762out_mn:
763 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
755out: 764out:
756 return err; 765 return err;
757} 766}
@@ -776,6 +785,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
776 spinlock_t *ptl; 785 spinlock_t *ptl;
777 unsigned long addr; 786 unsigned long addr;
778 int err = -EFAULT; 787 int err = -EFAULT;
788 unsigned long mmun_start; /* For mmu_notifiers */
789 unsigned long mmun_end; /* For mmu_notifiers */
779 790
780 addr = page_address_in_vma(page, vma); 791 addr = page_address_in_vma(page, vma);
781 if (addr == -EFAULT) 792 if (addr == -EFAULT)
@@ -794,10 +805,14 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
794 if (!pmd_present(*pmd)) 805 if (!pmd_present(*pmd))
795 goto out; 806 goto out;
796 807
808 mmun_start = addr;
809 mmun_end = addr + PAGE_SIZE;
810 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
811
797 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); 812 ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
798 if (!pte_same(*ptep, orig_pte)) { 813 if (!pte_same(*ptep, orig_pte)) {
799 pte_unmap_unlock(ptep, ptl); 814 pte_unmap_unlock(ptep, ptl);
800 goto out; 815 goto out_mn;
801 } 816 }
802 817
803 get_page(kpage); 818 get_page(kpage);
@@ -814,6 +829,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
814 829
815 pte_unmap_unlock(ptep, ptl); 830 pte_unmap_unlock(ptep, ptl);
816 err = 0; 831 err = 0;
832out_mn:
833 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
817out: 834out:
818 return err; 835 return err;
819} 836}
@@ -1469,10 +1486,14 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
1469 */ 1486 */
1470 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | 1487 if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
1471 VM_PFNMAP | VM_IO | VM_DONTEXPAND | 1488 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1472 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | 1489 VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
1473 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
1474 return 0; /* just ignore the advice */ 1490 return 0; /* just ignore the advice */
1475 1491
1492#ifdef VM_SAO
1493 if (*vm_flags & VM_SAO)
1494 return 0;
1495#endif
1496
1476 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { 1497 if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
1477 err = __ksm_enter(mm); 1498 err = __ksm_enter(mm);
1478 if (err) 1499 if (err)
@@ -1582,7 +1603,7 @@ struct page *ksm_does_need_to_copy(struct page *page,
1582 SetPageSwapBacked(new_page); 1603 SetPageSwapBacked(new_page);
1583 __set_page_locked(new_page); 1604 __set_page_locked(new_page);
1584 1605
1585 if (page_evictable(new_page, vma)) 1606 if (!mlocked_vma_newpage(vma, new_page))
1586 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); 1607 lru_cache_add_lru(new_page, LRU_ACTIVE_ANON);
1587 else 1608 else
1588 add_page_to_unevictable_list(new_page); 1609 add_page_to_unevictable_list(new_page);
@@ -1614,7 +1635,8 @@ again:
1614 struct vm_area_struct *vma; 1635 struct vm_area_struct *vma;
1615 1636
1616 anon_vma_lock(anon_vma); 1637 anon_vma_lock(anon_vma);
1617 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1638 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1639 0, ULONG_MAX) {
1618 vma = vmac->vma; 1640 vma = vmac->vma;
1619 if (rmap_item->address < vma->vm_start || 1641 if (rmap_item->address < vma->vm_start ||
1620 rmap_item->address >= vma->vm_end) 1642 rmap_item->address >= vma->vm_end)
@@ -1667,7 +1689,8 @@ again:
1667 struct vm_area_struct *vma; 1689 struct vm_area_struct *vma;
1668 1690
1669 anon_vma_lock(anon_vma); 1691 anon_vma_lock(anon_vma);
1670 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1692 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1693 0, ULONG_MAX) {
1671 vma = vmac->vma; 1694 vma = vmac->vma;
1672 if (rmap_item->address < vma->vm_start || 1695 if (rmap_item->address < vma->vm_start ||
1673 rmap_item->address >= vma->vm_end) 1696 rmap_item->address >= vma->vm_end)
@@ -1719,7 +1742,8 @@ again:
1719 struct vm_area_struct *vma; 1742 struct vm_area_struct *vma;
1720 1743
1721 anon_vma_lock(anon_vma); 1744 anon_vma_lock(anon_vma);
1722 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1745 anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
1746 0, ULONG_MAX) {
1723 vma = vmac->vma; 1747 vma = vmac->vma;
1724 if (rmap_item->address < vma->vm_start || 1748 if (rmap_item->address < vma->vm_start ||
1725 rmap_item->address >= vma->vm_end) 1749 rmap_item->address >= vma->vm_end)
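
Note on the ksm.c hunks: write_protect_page() and replace_page() now bracket the PTE update with mmu_notifier_invalidate_range_start()/_end() over the single page being touched, and every failure path taken after the start call is routed through the matching end call (the new out_mn labels). A small userspace sketch of that pairing discipline; begin_notify()/end_notify() and map_pte() are invented stand-ins, not kernel interfaces.

/* Sketch of the notifier-bracketing pattern: once "range start" has been
 * issued, every exit path must pass through the matching "range end". */
#include <stdio.h>
#include <stdbool.h>

static void begin_notify(unsigned long start, unsigned long end)
{
    printf("invalidate range start: [%#lx, %#lx)\n", start, end);
}

static void end_notify(unsigned long start, unsigned long end)
{
    printf("invalidate range end:   [%#lx, %#lx)\n", start, end);
}

static bool map_pte(unsigned long addr)
{
    return (addr & 0xfff) == 0;     /* pretend only aligned addresses map */
}

static int write_protect(unsigned long addr, unsigned long page_size)
{
    unsigned long start = addr, end = addr + page_size;
    int err = -1;

    begin_notify(start, end);       /* issued before touching the PTE */

    if (!map_pte(addr))
        goto out_notify;            /* failure path still ends the range */

    /* ... clear the writable bit here ... */
    err = 0;

out_notify:
    end_notify(start, end);         /* always paired with begin_notify() */
    return err;
}

int main(void)
{
    write_protect(0x1000UL, 4096);
    write_protect(0x2004UL, 4096);
    return 0;
}
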
diff --git a/mm/madvise.c b/mm/madvise.c
index 14d260fa0d17..03dfa5c7adb3 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -69,10 +69,14 @@ static long madvise_behavior(struct vm_area_struct * vma,
69 new_flags &= ~VM_DONTCOPY; 69 new_flags &= ~VM_DONTCOPY;
70 break; 70 break;
71 case MADV_DONTDUMP: 71 case MADV_DONTDUMP:
72 new_flags |= VM_NODUMP; 72 new_flags |= VM_DONTDUMP;
73 break; 73 break;
74 case MADV_DODUMP: 74 case MADV_DODUMP:
75 new_flags &= ~VM_NODUMP; 75 if (new_flags & VM_SPECIAL) {
76 error = -EINVAL;
77 goto out;
78 }
79 new_flags &= ~VM_DONTDUMP;
76 break; 80 break;
77 case MADV_MERGEABLE: 81 case MADV_MERGEABLE:
78 case MADV_UNMERGEABLE: 82 case MADV_UNMERGEABLE:
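
Note on the madvise() hunk: the vma flag becomes VM_DONTDUMP and MADV_DODUMP now fails with -EINVAL on special mappings, but the userspace interface is unchanged. The sketch below simply exercises the MADV_DONTDUMP/MADV_DODUMP round trip on an anonymous mapping, guarded by #ifdef in case the libc headers predate the flags.

/* Userspace view of the flags this hunk touches: MADV_DONTDUMP marks a
 * mapping to be skipped by core dumps, MADV_DODUMP undoes it.  The
 * in-kernel VM_DONTDUMP / VM_SPECIAL handling is invisible from here. */
#define _DEFAULT_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 1 << 20;
    void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

#ifdef MADV_DONTDUMP
    if (madvise(p, len, MADV_DONTDUMP))
        perror("madvise(MADV_DONTDUMP)");
    else
        puts("region excluded from core dumps");

    if (madvise(p, len, MADV_DODUMP))
        perror("madvise(MADV_DODUMP)");     /* EINVAL on special mappings */
    else
        puts("region included in core dumps again");
#else
    puts("MADV_DONTDUMP not available in these headers");
#endif
    munmap(p, len);
    return 0;
}
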
diff --git a/mm/memblock.c b/mm/memblock.c
index 82aa349d2f7a..931eef145af5 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -41,7 +41,8 @@ static int memblock_memory_in_slab __initdata_memblock = 0;
41static int memblock_reserved_in_slab __initdata_memblock = 0; 41static int memblock_reserved_in_slab __initdata_memblock = 0;
42 42
43/* inline so we don't get a warning when pr_debug is compiled out */ 43/* inline so we don't get a warning when pr_debug is compiled out */
44static inline const char *memblock_type_name(struct memblock_type *type) 44static __init_memblock const char *
45memblock_type_name(struct memblock_type *type)
45{ 46{
46 if (type == &memblock.memory) 47 if (type == &memblock.memory)
47 return "memory"; 48 return "memory";
@@ -756,7 +757,7 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
756 return ret; 757 return ret;
757 758
758 for (i = start_rgn; i < end_rgn; i++) 759 for (i = start_rgn; i < end_rgn; i++)
759 type->regions[i].nid = nid; 760 memblock_set_region_node(&type->regions[i], nid);
760 761
761 memblock_merge_regions(type); 762 memblock_merge_regions(type);
762 return 0; 763 return 0;
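
Note on the memblock hunk: the direct write of regions[i].nid is replaced by the memblock_set_region_node() accessor, so configurations that compile the node id out only need a stub helper and callers stay unchanged. A hedged userspace sketch of that accessor pattern; CONFIG_FAKE_NUMA, struct region and the helper names are invented for illustration.

/* Sketch: wrap the optional field behind set/get helpers so the caller
 * never references a member that may not exist in some configurations. */
#include <stdio.h>

#define CONFIG_FAKE_NUMA 1

struct region {
    unsigned long base;
    unsigned long size;
#if CONFIG_FAKE_NUMA
    int nid;
#endif
};

static inline void set_region_node(struct region *r, int nid)
{
#if CONFIG_FAKE_NUMA
    r->nid = nid;
#else
    (void)r;
    (void)nid;      /* field doesn't exist; helper compiles to nothing */
#endif
}

static inline int region_node(const struct region *r)
{
#if CONFIG_FAKE_NUMA
    return r->nid;
#else
    (void)r;
    return 0;
#endif
}

int main(void)
{
    struct region r = { .base = 0x100000, .size = 0x200000 };

    set_region_node(&r, 1);
    printf("region %#lx+%#lx on node %d\n", r.base, r.size, region_node(&r));
    return 0;
}
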
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a72f2ffdc3d0..7acf43bf04a2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -51,6 +51,7 @@
51#include <linux/oom.h> 51#include <linux/oom.h>
52#include "internal.h" 52#include "internal.h"
53#include <net/sock.h> 53#include <net/sock.h>
54#include <net/ip.h>
54#include <net/tcp_memcontrol.h> 55#include <net/tcp_memcontrol.h>
55 56
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
@@ -326,7 +327,7 @@ struct mem_cgroup {
326 struct mem_cgroup_stat_cpu nocpu_base; 327 struct mem_cgroup_stat_cpu nocpu_base;
327 spinlock_t pcp_counter_lock; 328 spinlock_t pcp_counter_lock;
328 329
329#ifdef CONFIG_INET 330#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
330 struct tcp_memcontrol tcp_mem; 331 struct tcp_memcontrol tcp_mem;
331#endif 332#endif
332}; 333};
@@ -411,12 +412,14 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
411 return container_of(s, struct mem_cgroup, css); 412 return container_of(s, struct mem_cgroup, css);
412} 413}
413 414
415static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
416{
417 return (memcg == root_mem_cgroup);
418}
419
414/* Writing them here to avoid exposing memcg's inner layout */ 420/* Writing them here to avoid exposing memcg's inner layout */
415#ifdef CONFIG_MEMCG_KMEM 421#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
416#include <net/sock.h>
417#include <net/ip.h>
418 422
419static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
420void sock_update_memcg(struct sock *sk) 423void sock_update_memcg(struct sock *sk)
421{ 424{
422 if (mem_cgroup_sockets_enabled) { 425 if (mem_cgroup_sockets_enabled) {
@@ -461,7 +464,6 @@ void sock_release_memcg(struct sock *sk)
461 } 464 }
462} 465}
463 466
464#ifdef CONFIG_INET
465struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 467struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
466{ 468{
467 if (!memcg || mem_cgroup_is_root(memcg)) 469 if (!memcg || mem_cgroup_is_root(memcg))
@@ -470,10 +472,7 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
470 return &memcg->tcp_mem.cg_proto; 472 return &memcg->tcp_mem.cg_proto;
471} 473}
472EXPORT_SYMBOL(tcp_proto_cgroup); 474EXPORT_SYMBOL(tcp_proto_cgroup);
473#endif /* CONFIG_INET */
474#endif /* CONFIG_MEMCG_KMEM */
475 475
476#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
477static void disarm_sock_keys(struct mem_cgroup *memcg) 476static void disarm_sock_keys(struct mem_cgroup *memcg)
478{ 477{
479 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) 478 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
@@ -1016,11 +1015,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
1016 iter != NULL; \ 1015 iter != NULL; \
1017 iter = mem_cgroup_iter(NULL, iter, NULL)) 1016 iter = mem_cgroup_iter(NULL, iter, NULL))
1018 1017
1019static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
1020{
1021 return (memcg == root_mem_cgroup);
1022}
1023
1024void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1018void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1025{ 1019{
1026 struct mem_cgroup *memcg; 1020 struct mem_cgroup *memcg;
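
Note on the memcontrol.c hunks: the TCP accounting pieces are gathered under a single CONFIG_MEMCG_KMEM && CONFIG_INET guard, the needed #includes move to the top of the file, and mem_cgroup_is_root() is defined unconditionally ahead of its guarded users instead of being forward-declared inside the guarded block. A toy sketch of that layout; the CONFIG_DEMO_* macros, struct fields and function names are invented purely to show the guard structure.

/* Sketch: the hooks only exist when both options are enabled, and the
 * helper they rely on is defined unconditionally before the guarded code. */
#include <stdio.h>

#define CONFIG_DEMO_MEMCG_KMEM 1
#define CONFIG_DEMO_INET 1

struct memcg {
    int is_root;
#if defined(CONFIG_DEMO_MEMCG_KMEM) && defined(CONFIG_DEMO_INET)
    long tcp_bytes;     /* stands in for struct tcp_memcontrol */
#endif
};

/* defined unconditionally, ahead of any guarded user */
static inline int memcg_is_root(const struct memcg *m)
{
    return m->is_root;
}

#if defined(CONFIG_DEMO_MEMCG_KMEM) && defined(CONFIG_DEMO_INET)
static void account_tcp(struct memcg *m, long bytes)
{
    if (memcg_is_root(m))
        return;         /* the root group is never limited */
    m->tcp_bytes += bytes;
}
#endif

int main(void)
{
    struct memcg m = { .is_root = 0 };

#if defined(CONFIG_DEMO_MEMCG_KMEM) && defined(CONFIG_DEMO_INET)
    account_tcp(&m, 4096);
    printf("tcp bytes charged: %ld\n", m.tcp_bytes);
#else
    printf("tcp accounting compiled out\n");
#endif
    return 0;
}
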
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a6e2141a6610..6c5899b9034a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -400,18 +400,21 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
400 struct vm_area_struct *vma; 400 struct vm_area_struct *vma;
401 struct task_struct *tsk; 401 struct task_struct *tsk;
402 struct anon_vma *av; 402 struct anon_vma *av;
403 pgoff_t pgoff;
403 404
404 av = page_lock_anon_vma(page); 405 av = page_lock_anon_vma(page);
405 if (av == NULL) /* Not actually mapped anymore */ 406 if (av == NULL) /* Not actually mapped anymore */
406 return; 407 return;
407 408
409 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
408 read_lock(&tasklist_lock); 410 read_lock(&tasklist_lock);
409 for_each_process (tsk) { 411 for_each_process (tsk) {
410 struct anon_vma_chain *vmac; 412 struct anon_vma_chain *vmac;
411 413
412 if (!task_early_kill(tsk)) 414 if (!task_early_kill(tsk))
413 continue; 415 continue;
414 list_for_each_entry(vmac, &av->head, same_anon_vma) { 416 anon_vma_interval_tree_foreach(vmac, &av->rb_root,
417 pgoff, pgoff) {
415 vma = vmac->vma; 418 vma = vmac->vma;
416 if (!page_mapped_in_vma(page, vma)) 419 if (!page_mapped_in_vma(page, vma))
417 continue; 420 continue;
@@ -431,7 +434,6 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
431{ 434{
432 struct vm_area_struct *vma; 435 struct vm_area_struct *vma;
433 struct task_struct *tsk; 436 struct task_struct *tsk;
434 struct prio_tree_iter iter;
435 struct address_space *mapping = page->mapping; 437 struct address_space *mapping = page->mapping;
436 438
437 mutex_lock(&mapping->i_mmap_mutex); 439 mutex_lock(&mapping->i_mmap_mutex);
@@ -442,7 +444,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
442 if (!task_early_kill(tsk)) 444 if (!task_early_kill(tsk))
443 continue; 445 continue;
444 446
445 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, 447 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
446 pgoff) { 448 pgoff) {
447 /* 449 /*
448 * Send early kill signal to tasks where a vma covers 450 * Send early kill signal to tasks where a vma covers
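
Note on the memory-failure.c hunks: both collect_procs_anon() and collect_procs_file() now walk an interval tree with the poisoned page's offset as both the start and end of the query, i.e. a stabbing query for every mapping that covers that one offset. A userspace sketch of that query shape; a linear scan over an array stands in for the rb-tree-backed interval tree, and all names are invented.

/* Sketch: visit every range whose [first, last] covers the query range.
 * With start == last this degenerates to a point (stabbing) query. */
#include <stdio.h>

struct mapping_range {
    const char *owner;
    unsigned long first;    /* first page offset covered */
    unsigned long last;     /* last page offset covered, inclusive */
};

static void for_each_covering(const struct mapping_range *v, int n,
                              unsigned long start, unsigned long last)
{
    for (int i = 0; i < n; i++)
        if (v[i].first <= last && start <= v[i].last)
            printf("  %s maps offsets [%lu, %lu]\n",
                   v[i].owner, v[i].first, v[i].last);
}

int main(void)
{
    struct mapping_range vmas[] = {
        { "task A", 0, 15 },
        { "task B", 8, 11 },
        { "task C", 32, 47 },
    };
    unsigned long pgoff = 10;   /* offset of the poisoned page */

    printf("mappings covering page offset %lu:\n", pgoff);
    for_each_covering(vmas, 3, pgoff, pgoff);   /* stabbing query */
    return 0;
}
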
diff --git a/mm/memory.c b/mm/memory.c
index 57361708d1a5..fb135ba4aba9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -712,7 +712,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
712 add_taint(TAINT_BAD_PAGE); 712 add_taint(TAINT_BAD_PAGE);
713} 713}
714 714
715static inline int is_cow_mapping(vm_flags_t flags) 715static inline bool is_cow_mapping(vm_flags_t flags)
716{ 716{
717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 717 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
718} 718}
@@ -1039,6 +1039,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1039 unsigned long next; 1039 unsigned long next;
1040 unsigned long addr = vma->vm_start; 1040 unsigned long addr = vma->vm_start;
1041 unsigned long end = vma->vm_end; 1041 unsigned long end = vma->vm_end;
1042 unsigned long mmun_start; /* For mmu_notifiers */
1043 unsigned long mmun_end; /* For mmu_notifiers */
1044 bool is_cow;
1042 int ret; 1045 int ret;
1043 1046
1044 /* 1047 /*
@@ -1047,7 +1050,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1047 * readonly mappings. The tradeoff is that copy_page_range is more 1050 * readonly mappings. The tradeoff is that copy_page_range is more
1048 * efficient than faulting. 1051 * efficient than faulting.
1049 */ 1052 */
1050 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) { 1053 if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
1054 VM_PFNMAP | VM_MIXEDMAP))) {
1051 if (!vma->anon_vma) 1055 if (!vma->anon_vma)
1052 return 0; 1056 return 0;
1053 } 1057 }
@@ -1055,12 +1059,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1055 if (is_vm_hugetlb_page(vma)) 1059 if (is_vm_hugetlb_page(vma))
1056 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 1060 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
1057 1061
1058 if (unlikely(is_pfn_mapping(vma))) { 1062 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
1059 /* 1063 /*
1060 * We do not free on error cases below as remove_vma 1064 * We do not free on error cases below as remove_vma
1061 * gets called on error from higher level routine 1065 * gets called on error from higher level routine
1062 */ 1066 */
1063 ret = track_pfn_vma_copy(vma); 1067 ret = track_pfn_copy(vma);
1064 if (ret) 1068 if (ret)
1065 return ret; 1069 return ret;
1066 } 1070 }
@@ -1071,8 +1075,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1071 * parent mm. And a permission downgrade will only happen if 1075 * parent mm. And a permission downgrade will only happen if
1072 * is_cow_mapping() returns true. 1076 * is_cow_mapping() returns true.
1073 */ 1077 */
1074 if (is_cow_mapping(vma->vm_flags)) 1078 is_cow = is_cow_mapping(vma->vm_flags);
1075 mmu_notifier_invalidate_range_start(src_mm, addr, end); 1079 mmun_start = addr;
1080 mmun_end = end;
1081 if (is_cow)
1082 mmu_notifier_invalidate_range_start(src_mm, mmun_start,
1083 mmun_end);
1076 1084
1077 ret = 0; 1085 ret = 0;
1078 dst_pgd = pgd_offset(dst_mm, addr); 1086 dst_pgd = pgd_offset(dst_mm, addr);
@@ -1088,9 +1096,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1088 } 1096 }
1089 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 1097 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
1090 1098
1091 if (is_cow_mapping(vma->vm_flags)) 1099 if (is_cow)
1092 mmu_notifier_invalidate_range_end(src_mm, 1100 mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
1093 vma->vm_start, end);
1094 return ret; 1101 return ret;
1095} 1102}
1096 1103
@@ -1327,8 +1334,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1327 if (vma->vm_file) 1334 if (vma->vm_file)
1328 uprobe_munmap(vma, start, end); 1335 uprobe_munmap(vma, start, end);
1329 1336
1330 if (unlikely(is_pfn_mapping(vma))) 1337 if (unlikely(vma->vm_flags & VM_PFNMAP))
1331 untrack_pfn_vma(vma, 0, 0); 1338 untrack_pfn(vma, 0, 0);
1332 1339
1333 if (start != end) { 1340 if (start != end) {
1334 if (unlikely(is_vm_hugetlb_page(vma))) { 1341 if (unlikely(is_vm_hugetlb_page(vma))) {
@@ -1521,7 +1528,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1521 spin_unlock(&mm->page_table_lock); 1528 spin_unlock(&mm->page_table_lock);
1522 wait_split_huge_page(vma->anon_vma, pmd); 1529 wait_split_huge_page(vma->anon_vma, pmd);
1523 } else { 1530 } else {
1524 page = follow_trans_huge_pmd(mm, address, 1531 page = follow_trans_huge_pmd(vma, address,
1525 pmd, flags); 1532 pmd, flags);
1526 spin_unlock(&mm->page_table_lock); 1533 spin_unlock(&mm->page_table_lock);
1527 goto out; 1534 goto out;
@@ -1576,12 +1583,12 @@ split_fallthrough:
1576 if (page->mapping && trylock_page(page)) { 1583 if (page->mapping && trylock_page(page)) {
1577 lru_add_drain(); /* push cached pages to LRU */ 1584 lru_add_drain(); /* push cached pages to LRU */
1578 /* 1585 /*
1579 * Because we lock page here and migration is 1586 * Because we lock page here, and migration is
1580 * blocked by the pte's page reference, we need 1587 * blocked by the pte's page reference, and we
1581 * only check for file-cache page truncation. 1588 * know the page is still mapped, we don't even
1589 * need to check for file-cache page truncation.
1582 */ 1590 */
1583 if (page->mapping) 1591 mlock_vma_page(page);
1584 mlock_vma_page(page);
1585 unlock_page(page); 1592 unlock_page(page);
1586 } 1593 }
1587 } 1594 }
@@ -2085,6 +2092,11 @@ out:
2085 * ask for a shared writable mapping! 2092 * ask for a shared writable mapping!
2086 * 2093 *
2087 * The page does not need to be reserved. 2094 * The page does not need to be reserved.
2095 *
2096 * Usually this function is called from f_op->mmap() handler
2097 * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
2098 * Caller must set VM_MIXEDMAP on vma if it wants to call this
2099 * function from other places, for example from page-fault handler.
2088 */ 2100 */
2089int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 2101int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2090 struct page *page) 2102 struct page *page)
@@ -2093,7 +2105,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
2093 return -EFAULT; 2105 return -EFAULT;
2094 if (!page_count(page)) 2106 if (!page_count(page))
2095 return -EINVAL; 2107 return -EINVAL;
2096 vma->vm_flags |= VM_INSERTPAGE; 2108 if (!(vma->vm_flags & VM_MIXEDMAP)) {
2109 BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
2110 BUG_ON(vma->vm_flags & VM_PFNMAP);
2111 vma->vm_flags |= VM_MIXEDMAP;
2112 }
2097 return insert_page(vma, addr, page, vma->vm_page_prot); 2113 return insert_page(vma, addr, page, vma->vm_page_prot);
2098} 2114}
2099EXPORT_SYMBOL(vm_insert_page); 2115EXPORT_SYMBOL(vm_insert_page);
@@ -2132,7 +2148,7 @@ out:
2132 * @addr: target user address of this page 2148 * @addr: target user address of this page
2133 * @pfn: source kernel pfn 2149 * @pfn: source kernel pfn
2134 * 2150 *
2135 * Similar to vm_inert_page, this allows drivers to insert individual pages 2151 * Similar to vm_insert_page, this allows drivers to insert individual pages
2136 * they've allocated into a user vma. Same comments apply. 2152 * they've allocated into a user vma. Same comments apply.
2137 * 2153 *
2138 * This function should only be called from a vm_ops->fault handler, and 2154 * This function should only be called from a vm_ops->fault handler, and
@@ -2162,14 +2178,11 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
2162 2178
2163 if (addr < vma->vm_start || addr >= vma->vm_end) 2179 if (addr < vma->vm_start || addr >= vma->vm_end)
2164 return -EFAULT; 2180 return -EFAULT;
2165 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) 2181 if (track_pfn_insert(vma, &pgprot, pfn))
2166 return -EINVAL; 2182 return -EINVAL;
2167 2183
2168 ret = insert_pfn(vma, addr, pfn, pgprot); 2184 ret = insert_pfn(vma, addr, pfn, pgprot);
2169 2185
2170 if (ret)
2171 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
2172
2173 return ret; 2186 return ret;
2174} 2187}
2175EXPORT_SYMBOL(vm_insert_pfn); 2188EXPORT_SYMBOL(vm_insert_pfn);
@@ -2290,37 +2303,30 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2290 * rest of the world about it: 2303 * rest of the world about it:
2291 * VM_IO tells people not to look at these pages 2304 * VM_IO tells people not to look at these pages
2292 * (accesses can have side effects). 2305 * (accesses can have side effects).
2293 * VM_RESERVED is specified all over the place, because
2294 * in 2.4 it kept swapout's vma scan off this vma; but
2295 * in 2.6 the LRU scan won't even find its pages, so this
2296 * flag means no more than count its pages in reserved_vm,
2297 * and omit it from core dump, even when VM_IO turned off.
2298 * VM_PFNMAP tells the core MM that the base pages are just 2306 * VM_PFNMAP tells the core MM that the base pages are just
2299 * raw PFN mappings, and do not have a "struct page" associated 2307 * raw PFN mappings, and do not have a "struct page" associated
2300 * with them. 2308 * with them.
2309 * VM_DONTEXPAND
2310 * Disable vma merging and expanding with mremap().
2311 * VM_DONTDUMP
2312 * Omit vma from core dump, even when VM_IO turned off.
2301 * 2313 *
2302 * There's a horrible special case to handle copy-on-write 2314 * There's a horrible special case to handle copy-on-write
2303 * behaviour that some programs depend on. We mark the "original" 2315 * behaviour that some programs depend on. We mark the "original"
2304 * un-COW'ed pages by matching them up with "vma->vm_pgoff". 2316 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
2317 * See vm_normal_page() for details.
2305 */ 2318 */
2306 if (addr == vma->vm_start && end == vma->vm_end) { 2319 if (is_cow_mapping(vma->vm_flags)) {
2320 if (addr != vma->vm_start || end != vma->vm_end)
2321 return -EINVAL;
2307 vma->vm_pgoff = pfn; 2322 vma->vm_pgoff = pfn;
2308 vma->vm_flags |= VM_PFN_AT_MMAP; 2323 }
2309 } else if (is_cow_mapping(vma->vm_flags))
2310 return -EINVAL;
2311
2312 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
2313 2324
2314 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); 2325 err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
2315 if (err) { 2326 if (err)
2316 /*
2317 * To indicate that track_pfn related cleanup is not
2318 * needed from higher level routine calling unmap_vmas
2319 */
2320 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
2321 vma->vm_flags &= ~VM_PFN_AT_MMAP;
2322 return -EINVAL; 2327 return -EINVAL;
2323 } 2328
2329 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
2324 2330
2325 BUG_ON(addr >= end); 2331 BUG_ON(addr >= end);
2326 pfn -= addr >> PAGE_SHIFT; 2332 pfn -= addr >> PAGE_SHIFT;
@@ -2335,7 +2341,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
2335 } while (pgd++, addr = next, addr != end); 2341 } while (pgd++, addr = next, addr != end);
2336 2342
2337 if (err) 2343 if (err)
2338 untrack_pfn_vma(vma, pfn, PAGE_ALIGN(size)); 2344 untrack_pfn(vma, pfn, PAGE_ALIGN(size));
2339 2345
2340 return err; 2346 return err;
2341} 2347}
@@ -2516,11 +2522,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2516 spinlock_t *ptl, pte_t orig_pte) 2522 spinlock_t *ptl, pte_t orig_pte)
2517 __releases(ptl) 2523 __releases(ptl)
2518{ 2524{
2519 struct page *old_page, *new_page; 2525 struct page *old_page, *new_page = NULL;
2520 pte_t entry; 2526 pte_t entry;
2521 int ret = 0; 2527 int ret = 0;
2522 int page_mkwrite = 0; 2528 int page_mkwrite = 0;
2523 struct page *dirty_page = NULL; 2529 struct page *dirty_page = NULL;
2530 unsigned long mmun_start; /* For mmu_notifiers */
2531 unsigned long mmun_end; /* For mmu_notifiers */
2532 bool mmun_called = false; /* For mmu_notifiers */
2524 2533
2525 old_page = vm_normal_page(vma, address, orig_pte); 2534 old_page = vm_normal_page(vma, address, orig_pte);
2526 if (!old_page) { 2535 if (!old_page) {
@@ -2698,6 +2707,11 @@ gotten:
2698 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2707 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2699 goto oom_free_new; 2708 goto oom_free_new;
2700 2709
2710 mmun_start = address & PAGE_MASK;
2711 mmun_end = (address & PAGE_MASK) + PAGE_SIZE;
2712 mmun_called = true;
2713 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
2714
2701 /* 2715 /*
2702 * Re-check the pte - we dropped the lock 2716 * Re-check the pte - we dropped the lock
2703 */ 2717 */
@@ -2764,6 +2778,8 @@ gotten:
2764 page_cache_release(new_page); 2778 page_cache_release(new_page);
2765unlock: 2779unlock:
2766 pte_unmap_unlock(page_table, ptl); 2780 pte_unmap_unlock(page_table, ptl);
2781 if (mmun_called)
2782 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
2767 if (old_page) { 2783 if (old_page) {
2768 /* 2784 /*
2769 * Don't let another task, with possibly unlocked vma, 2785 * Don't let another task, with possibly unlocked vma,
@@ -2801,14 +2817,13 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2801 zap_page_range_single(vma, start_addr, end_addr - start_addr, details); 2817 zap_page_range_single(vma, start_addr, end_addr - start_addr, details);
2802} 2818}
2803 2819
2804static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2820static inline void unmap_mapping_range_tree(struct rb_root *root,
2805 struct zap_details *details) 2821 struct zap_details *details)
2806{ 2822{
2807 struct vm_area_struct *vma; 2823 struct vm_area_struct *vma;
2808 struct prio_tree_iter iter;
2809 pgoff_t vba, vea, zba, zea; 2824 pgoff_t vba, vea, zba, zea;
2810 2825
2811 vma_prio_tree_foreach(vma, &iter, root, 2826 vma_interval_tree_foreach(vma, root,
2812 details->first_index, details->last_index) { 2827 details->first_index, details->last_index) {
2813 2828
2814 vba = vma->vm_pgoff; 2829 vba = vma->vm_pgoff;
@@ -2839,7 +2854,7 @@ static inline void unmap_mapping_range_list(struct list_head *head,
2839 * across *all* the pages in each nonlinear VMA, not just the pages 2854 * across *all* the pages in each nonlinear VMA, not just the pages
2840 * whose virtual address lies outside the file truncation point. 2855 * whose virtual address lies outside the file truncation point.
2841 */ 2856 */
2842 list_for_each_entry(vma, head, shared.vm_set.list) { 2857 list_for_each_entry(vma, head, shared.nonlinear) {
2843 details->nonlinear_vma = vma; 2858 details->nonlinear_vma = vma;
2844 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details); 2859 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2845 } 2860 }
@@ -2883,7 +2898,7 @@ void unmap_mapping_range(struct address_space *mapping,
2883 2898
2884 2899
2885 mutex_lock(&mapping->i_mmap_mutex); 2900 mutex_lock(&mapping->i_mmap_mutex);
2886 if (unlikely(!prio_tree_empty(&mapping->i_mmap))) 2901 if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
2887 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2902 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2888 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2903 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2889 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2904 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
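
Note on the memory.c hunks: several of them pivot on is_cow_mapping() — only private-writable mappings are copy-on-write, so only they need the mmu_notifier bracket in copy_page_range(), and remap_pfn_range() only stashes the pfn in vm_pgoff for them. The check itself is just (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; the sketch below tabulates it with local stand-in flag values, not the kernel's.

/* Sketch: a mapping is COW when it may be written but is not shared. */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_VM_SHARED   0x1UL
#define DEMO_VM_MAYWRITE 0x2UL

static inline bool is_cow_mapping(unsigned long flags)
{
    return (flags & (DEMO_VM_SHARED | DEMO_VM_MAYWRITE)) == DEMO_VM_MAYWRITE;
}

int main(void)
{
    struct { const char *what; unsigned long flags; } cases[] = {
        { "private read-only", 0 },
        { "private writable ", DEMO_VM_MAYWRITE },
        { "shared writable  ", DEMO_VM_SHARED | DEMO_VM_MAYWRITE },
        { "shared read-only ", DEMO_VM_SHARED },
    };

    for (int i = 0; i < 4; i++)
        printf("%s -> %s\n", cases[i].what,
               is_cow_mapping(cases[i].flags) ? "COW" : "not COW");
    return 0;
}

Only the "private writable" row is COW, which is why fork() write-protects PTEs (and so must notify secondary MMUs) for those mappings alone.
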
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6a5b90d0cfd7..56b758ae57d2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page,
106void __ref put_page_bootmem(struct page *page) 106void __ref put_page_bootmem(struct page *page)
107{ 107{
108 unsigned long type; 108 unsigned long type;
109 struct zone *zone;
109 110
110 type = (unsigned long) page->lru.next; 111 type = (unsigned long) page->lru.next;
111 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || 112 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
@@ -116,6 +117,12 @@ void __ref put_page_bootmem(struct page *page)
116 set_page_private(page, 0); 117 set_page_private(page, 0);
117 INIT_LIST_HEAD(&page->lru); 118 INIT_LIST_HEAD(&page->lru);
118 __free_pages_bootmem(page, 0); 119 __free_pages_bootmem(page, 0);
120
121 zone = page_zone(page);
122 zone_span_writelock(zone);
123 zone->present_pages++;
124 zone_span_writeunlock(zone);
125 totalram_pages++;
119 } 126 }
120 127
121} 128}
@@ -362,11 +369,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
362 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 369 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
363 BUG_ON(nr_pages % PAGES_PER_SECTION); 370 BUG_ON(nr_pages % PAGES_PER_SECTION);
364 371
372 release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
373
365 sections_to_remove = nr_pages / PAGES_PER_SECTION; 374 sections_to_remove = nr_pages / PAGES_PER_SECTION;
366 for (i = 0; i < sections_to_remove; i++) { 375 for (i = 0; i < sections_to_remove; i++) {
367 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 376 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
368 release_mem_region(pfn << PAGE_SHIFT,
369 PAGES_PER_SECTION << PAGE_SHIFT);
370 ret = __remove_section(zone, __pfn_to_section(pfn)); 377 ret = __remove_section(zone, __pfn_to_section(pfn));
371 if (ret) 378 if (ret)
372 break; 379 break;
@@ -756,13 +763,6 @@ static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
756 return 0; 763 return 0;
757} 764}
758 765
759static struct page *
760hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
761{
762 /* This should be improooooved!! */
763 return alloc_page(GFP_HIGHUSER_MOVABLE);
764}
765
766#define NR_OFFLINE_AT_ONCE_PAGES (256) 766#define NR_OFFLINE_AT_ONCE_PAGES (256)
767static int 767static int
768do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 768do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
@@ -813,8 +813,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
813 putback_lru_pages(&source); 813 putback_lru_pages(&source);
814 goto out; 814 goto out;
815 } 815 }
816 /* this function returns # of failed pages */ 816
817 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 817 /*
818 * alloc_migrate_target should be improooooved!!
819 * migrate_pages returns # of failed pages.
820 */
821 ret = migrate_pages(&source, alloc_migrate_target, 0,
818 true, MIGRATE_SYNC); 822 true, MIGRATE_SYNC);
819 if (ret) 823 if (ret)
820 putback_lru_pages(&source); 824 putback_lru_pages(&source);
@@ -870,7 +874,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
870 return offlined; 874 return offlined;
871} 875}
872 876
873static int __ref offline_pages(unsigned long start_pfn, 877static int __ref __offline_pages(unsigned long start_pfn,
874 unsigned long end_pfn, unsigned long timeout) 878 unsigned long end_pfn, unsigned long timeout)
875{ 879{
876 unsigned long pfn, nr_pages, expire; 880 unsigned long pfn, nr_pages, expire;
@@ -970,8 +974,13 @@ repeat:
970 974
971 init_per_zone_wmark_min(); 975 init_per_zone_wmark_min();
972 976
973 if (!populated_zone(zone)) 977 if (!populated_zone(zone)) {
974 zone_pcp_reset(zone); 978 zone_pcp_reset(zone);
979 mutex_lock(&zonelists_mutex);
980 build_all_zonelists(NULL, NULL);
981 mutex_unlock(&zonelists_mutex);
982 } else
983 zone_pcp_update(zone);
975 984
976 if (!node_present_pages(node)) { 985 if (!node_present_pages(node)) {
977 node_clear_state(node, N_HIGH_MEMORY); 986 node_clear_state(node, N_HIGH_MEMORY);
@@ -998,15 +1007,55 @@ out:
998 return ret; 1007 return ret;
999} 1008}
1000 1009
1010int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1011{
1012 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1013}
1014
1001int remove_memory(u64 start, u64 size) 1015int remove_memory(u64 start, u64 size)
1002{ 1016{
1017 struct memory_block *mem = NULL;
1018 struct mem_section *section;
1003 unsigned long start_pfn, end_pfn; 1019 unsigned long start_pfn, end_pfn;
1020 unsigned long pfn, section_nr;
1021 int ret;
1004 1022
1005 start_pfn = PFN_DOWN(start); 1023 start_pfn = PFN_DOWN(start);
1006 end_pfn = start_pfn + PFN_DOWN(size); 1024 end_pfn = start_pfn + PFN_DOWN(size);
1007 return offline_pages(start_pfn, end_pfn, 120 * HZ); 1025
1026 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1027 section_nr = pfn_to_section_nr(pfn);
1028 if (!present_section_nr(section_nr))
1029 continue;
1030
1031 section = __nr_to_section(section_nr);
1032 /* same memblock? */
1033 if (mem)
1034 if ((section_nr >= mem->start_section_nr) &&
1035 (section_nr <= mem->end_section_nr))
1036 continue;
1037
1038 mem = find_memory_block_hinted(section, mem);
1039 if (!mem)
1040 continue;
1041
1042 ret = offline_memory_block(mem);
1043 if (ret) {
1044 kobject_put(&mem->dev.kobj);
1045 return ret;
1046 }
1047 }
1048
1049 if (mem)
1050 kobject_put(&mem->dev.kobj);
1051
1052 return 0;
1008} 1053}
1009#else 1054#else
1055int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1056{
1057 return -EINVAL;
1058}
1010int remove_memory(u64 start, u64 size) 1059int remove_memory(u64 start, u64 size)
1011{ 1060{
1012 return -EINVAL; 1061 return -EINVAL;
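
Note on the memory_hotplug.c hunks: the new remove_memory() walks the pfn range one memory section at a time, skips sections that fall inside the memory block it has already handled, and offlines each block once. A simplified userspace sketch of that walk; the section and block sizes and the block lookup are stand-ins for the sparse-memory machinery.

/* Sketch: iterate by section, act once per block of consecutive sections. */
#include <stdio.h>

#define PAGES_PER_SECTION_DEMO  4096UL  /* pfns per section (illustrative) */
#define SECTIONS_PER_BLOCK_DEMO 8UL     /* sections per memory block */

static unsigned long section_of(unsigned long pfn)
{
    return pfn / PAGES_PER_SECTION_DEMO;
}

static unsigned long block_of(unsigned long section_nr)
{
    return section_nr / SECTIONS_PER_BLOCK_DEMO;
}

int main(void)
{
    unsigned long start_pfn = 0x40000, end_pfn = 0x60000;
    long current_block = -1;

    for (unsigned long pfn = start_pfn; pfn < end_pfn;
         pfn += PAGES_PER_SECTION_DEMO) {
        unsigned long sec = section_of(pfn);
        long blk = (long)block_of(sec);

        if (blk == current_block)
            continue;               /* same block, already handled */
        current_block = blk;
        printf("offlining memory block %ld (first section %lu)\n", blk, sec);
    }
    return 0;
}
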
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4ada3be6e252..0b78fb9ea65b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -607,6 +607,42 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
607 return first; 607 return first;
608} 608}
609 609
610/*
611 * Apply policy to a single VMA
612 * This must be called with the mmap_sem held for writing.
613 */
614static int vma_replace_policy(struct vm_area_struct *vma,
615 struct mempolicy *pol)
616{
617 int err;
618 struct mempolicy *old;
619 struct mempolicy *new;
620
621 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
622 vma->vm_start, vma->vm_end, vma->vm_pgoff,
623 vma->vm_ops, vma->vm_file,
624 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
625
626 new = mpol_dup(pol);
627 if (IS_ERR(new))
628 return PTR_ERR(new);
629
630 if (vma->vm_ops && vma->vm_ops->set_policy) {
631 err = vma->vm_ops->set_policy(vma, new);
632 if (err)
633 goto err_out;
634 }
635
636 old = vma->vm_policy;
637 vma->vm_policy = new; /* protected by mmap_sem */
638 mpol_put(old);
639
640 return 0;
641 err_out:
642 mpol_put(new);
643 return err;
644}
645
610/* Step 2: apply policy to a range and do splits. */ 646/* Step 2: apply policy to a range and do splits. */
611static int mbind_range(struct mm_struct *mm, unsigned long start, 647static int mbind_range(struct mm_struct *mm, unsigned long start,
612 unsigned long end, struct mempolicy *new_pol) 648 unsigned long end, struct mempolicy *new_pol)
@@ -655,23 +691,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
655 if (err) 691 if (err)
656 goto out; 692 goto out;
657 } 693 }
658 694 err = vma_replace_policy(vma, new_pol);
659 /* 695 if (err)
660 * Apply policy to a single VMA. The reference counting of 696 goto out;
661 * policy for vma_policy linkages has already been handled by
662 * vma_merge and split_vma as necessary. If this is a shared
663 * policy then ->set_policy will increment the reference count
664 * for an sp node.
665 */
666 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
667 vma->vm_start, vma->vm_end, vma->vm_pgoff,
668 vma->vm_ops, vma->vm_file,
669 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
670 if (vma->vm_ops && vma->vm_ops->set_policy) {
671 err = vma->vm_ops->set_policy(vma, new_pol);
672 if (err)
673 goto out;
674 }
675 } 697 }
676 698
677 out: 699 out:
@@ -924,15 +946,18 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
924 nodemask_t nmask; 946 nodemask_t nmask;
925 LIST_HEAD(pagelist); 947 LIST_HEAD(pagelist);
926 int err = 0; 948 int err = 0;
927 struct vm_area_struct *vma;
928 949
929 nodes_clear(nmask); 950 nodes_clear(nmask);
930 node_set(source, nmask); 951 node_set(source, nmask);
931 952
932 vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 953 /*
954 * This does not "check" the range but isolates all pages that
955 * need migration. Between passing in the full user address
956 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
957 */
958 VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
959 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
933 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 960 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
934 if (IS_ERR(vma))
935 return PTR_ERR(vma);
936 961
937 if (!list_empty(&pagelist)) { 962 if (!list_empty(&pagelist)) {
938 err = migrate_pages(&pagelist, new_node_page, dest, 963 err = migrate_pages(&pagelist, new_node_page, dest,
@@ -1530,8 +1555,18 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
1530 addr); 1555 addr);
1531 if (vpol) 1556 if (vpol)
1532 pol = vpol; 1557 pol = vpol;
1533 } else if (vma->vm_policy) 1558 } else if (vma->vm_policy) {
1534 pol = vma->vm_policy; 1559 pol = vma->vm_policy;
1560
1561 /*
1562 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1563 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1564 * count on these policies which will be dropped by
1565 * mpol_cond_put() later
1566 */
1567 if (mpol_needs_cond_ref(pol))
1568 mpol_get(pol);
1569 }
1535 } 1570 }
1536 if (!pol) 1571 if (!pol)
1537 pol = &default_policy; 1572 pol = &default_policy;
@@ -2061,7 +2096,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2061 */ 2096 */
2062 2097
2063/* lookup first element intersecting start-end */ 2098/* lookup first element intersecting start-end */
2064/* Caller holds sp->lock */ 2099/* Caller holds sp->mutex */
2065static struct sp_node * 2100static struct sp_node *
2066sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) 2101sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2067{ 2102{
@@ -2125,36 +2160,50 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2125 2160
2126 if (!sp->root.rb_node) 2161 if (!sp->root.rb_node)
2127 return NULL; 2162 return NULL;
2128 spin_lock(&sp->lock); 2163 mutex_lock(&sp->mutex);
2129 sn = sp_lookup(sp, idx, idx+1); 2164 sn = sp_lookup(sp, idx, idx+1);
2130 if (sn) { 2165 if (sn) {
2131 mpol_get(sn->policy); 2166 mpol_get(sn->policy);
2132 pol = sn->policy; 2167 pol = sn->policy;
2133 } 2168 }
2134 spin_unlock(&sp->lock); 2169 mutex_unlock(&sp->mutex);
2135 return pol; 2170 return pol;
2136} 2171}
2137 2172
2173static void sp_free(struct sp_node *n)
2174{
2175 mpol_put(n->policy);
2176 kmem_cache_free(sn_cache, n);
2177}
2178
2138static void sp_delete(struct shared_policy *sp, struct sp_node *n) 2179static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2139{ 2180{
2140 pr_debug("deleting %lx-l%lx\n", n->start, n->end); 2181 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
2141 rb_erase(&n->nd, &sp->root); 2182 rb_erase(&n->nd, &sp->root);
2142 mpol_put(n->policy); 2183 sp_free(n);
2143 kmem_cache_free(sn_cache, n);
2144} 2184}
2145 2185
2146static struct sp_node *sp_alloc(unsigned long start, unsigned long end, 2186static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2147 struct mempolicy *pol) 2187 struct mempolicy *pol)
2148{ 2188{
2149 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 2189 struct sp_node *n;
2190 struct mempolicy *newpol;
2150 2191
2192 n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2151 if (!n) 2193 if (!n)
2152 return NULL; 2194 return NULL;
2195
2196 newpol = mpol_dup(pol);
2197 if (IS_ERR(newpol)) {
2198 kmem_cache_free(sn_cache, n);
2199 return NULL;
2200 }
2201 newpol->flags |= MPOL_F_SHARED;
2202
2153 n->start = start; 2203 n->start = start;
2154 n->end = end; 2204 n->end = end;
2155 mpol_get(pol); 2205 n->policy = newpol;
2156 pol->flags |= MPOL_F_SHARED; /* for unref */ 2206
2157 n->policy = pol;
2158 return n; 2207 return n;
2159} 2208}
2160 2209
@@ -2162,10 +2211,10 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2162static int shared_policy_replace(struct shared_policy *sp, unsigned long start, 2211static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2163 unsigned long end, struct sp_node *new) 2212 unsigned long end, struct sp_node *new)
2164{ 2213{
2165 struct sp_node *n, *new2 = NULL; 2214 struct sp_node *n;
2215 int ret = 0;
2166 2216
2167restart: 2217 mutex_lock(&sp->mutex);
2168 spin_lock(&sp->lock);
2169 n = sp_lookup(sp, start, end); 2218 n = sp_lookup(sp, start, end);
2170 /* Take care of old policies in the same range. */ 2219 /* Take care of old policies in the same range. */
2171 while (n && n->start < end) { 2220 while (n && n->start < end) {
@@ -2178,16 +2227,14 @@ restart:
2178 } else { 2227 } else {
2179 /* Old policy spanning whole new range. */ 2228 /* Old policy spanning whole new range. */
2180 if (n->end > end) { 2229 if (n->end > end) {
2230 struct sp_node *new2;
2231 new2 = sp_alloc(end, n->end, n->policy);
2181 if (!new2) { 2232 if (!new2) {
2182 spin_unlock(&sp->lock); 2233 ret = -ENOMEM;
2183 new2 = sp_alloc(end, n->end, n->policy); 2234 goto out;
2184 if (!new2)
2185 return -ENOMEM;
2186 goto restart;
2187 } 2235 }
2188 n->end = start; 2236 n->end = start;
2189 sp_insert(sp, new2); 2237 sp_insert(sp, new2);
2190 new2 = NULL;
2191 break; 2238 break;
2192 } else 2239 } else
2193 n->end = start; 2240 n->end = start;
@@ -2198,12 +2245,9 @@ restart:
2198 } 2245 }
2199 if (new) 2246 if (new)
2200 sp_insert(sp, new); 2247 sp_insert(sp, new);
2201 spin_unlock(&sp->lock); 2248out:
2202 if (new2) { 2249 mutex_unlock(&sp->mutex);
2203 mpol_put(new2->policy); 2250 return ret;
2204 kmem_cache_free(sn_cache, new2);
2205 }
2206 return 0;
2207} 2251}
2208 2252
2209/** 2253/**
@@ -2221,7 +2265,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2221 int ret; 2265 int ret;
2222 2266
2223 sp->root = RB_ROOT; /* empty tree == default mempolicy */ 2267 sp->root = RB_ROOT; /* empty tree == default mempolicy */
2224 spin_lock_init(&sp->lock); 2268 mutex_init(&sp->mutex);
2225 2269
2226 if (mpol) { 2270 if (mpol) {
2227 struct vm_area_struct pvma; 2271 struct vm_area_struct pvma;
@@ -2275,7 +2319,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
2275 } 2319 }
2276 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); 2320 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2277 if (err && new) 2321 if (err && new)
2278 kmem_cache_free(sn_cache, new); 2322 sp_free(new);
2279 return err; 2323 return err;
2280} 2324}
2281 2325
@@ -2287,16 +2331,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
2287 2331
2288 if (!p->root.rb_node) 2332 if (!p->root.rb_node)
2289 return; 2333 return;
2290 spin_lock(&p->lock); 2334 mutex_lock(&p->mutex);
2291 next = rb_first(&p->root); 2335 next = rb_first(&p->root);
2292 while (next) { 2336 while (next) {
2293 n = rb_entry(next, struct sp_node, nd); 2337 n = rb_entry(next, struct sp_node, nd);
2294 next = rb_next(&n->nd); 2338 next = rb_next(&n->nd);
2295 rb_erase(&n->nd, &p->root); 2339 sp_delete(p, n);
2296 mpol_put(n->policy);
2297 kmem_cache_free(sn_cache, n);
2298 } 2340 }
2299 spin_unlock(&p->lock); 2341 mutex_unlock(&p->mutex);
2300} 2342}
2301 2343
2302/* assumes fs == KERNEL_DS */ 2344/* assumes fs == KERNEL_DS */
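
Note on the mempolicy.c hunks: converting shared_policy from a spinlock to a mutex lets shared_policy_replace() allocate the split node directly under the lock, removing the unlock/allocate/restart loop. The range surgery itself is unchanged: an old node overlapping [start, end) keeps its head, the new policy takes the middle, and any tail past end becomes a freshly allocated node. A toy sketch of that splitting, with a small array in place of the sp_node rb-tree and all values invented.

/* Sketch: replace [start, end) inside one existing policy range. */
#include <stdio.h>

struct range { unsigned long start, end; int policy; };

int main(void)
{
    struct range old = { 0, 100, 1 };       /* one node spanning the file */
    unsigned long start = 30, end = 60;     /* incoming range */
    int new_policy = 2;

    struct range pieces[3];
    int n = 0;

    if (old.start < start)                  /* head keeps the old policy */
        pieces[n++] = (struct range){ old.start, start, old.policy };
    pieces[n++] = (struct range){ start, end, new_policy };
    if (old.end > end)                      /* tail split off, old policy */
        pieces[n++] = (struct range){ end, old.end, old.policy };

    for (int i = 0; i < n; i++)
        printf("[%3lu, %3lu) -> policy %d\n",
               pieces[i].start, pieces[i].end, pieces[i].policy);
    return 0;
}
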
diff --git a/mm/mlock.c b/mm/mlock.c
index ef726e8aa8e9..f0b9ce572fc7 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -51,15 +51,13 @@ EXPORT_SYMBOL(can_do_mlock);
51/* 51/*
52 * LRU accounting for clear_page_mlock() 52 * LRU accounting for clear_page_mlock()
53 */ 53 */
54void __clear_page_mlock(struct page *page) 54void clear_page_mlock(struct page *page)
55{ 55{
56 VM_BUG_ON(!PageLocked(page)); 56 if (!TestClearPageMlocked(page))
57
58 if (!page->mapping) { /* truncated ? */
59 return; 57 return;
60 }
61 58
62 dec_zone_page_state(page, NR_MLOCK); 59 mod_zone_page_state(page_zone(page), NR_MLOCK,
60 -hpage_nr_pages(page));
63 count_vm_event(UNEVICTABLE_PGCLEARED); 61 count_vm_event(UNEVICTABLE_PGCLEARED);
64 if (!isolate_lru_page(page)) { 62 if (!isolate_lru_page(page)) {
65 putback_lru_page(page); 63 putback_lru_page(page);
@@ -81,7 +79,8 @@ void mlock_vma_page(struct page *page)
81 BUG_ON(!PageLocked(page)); 79 BUG_ON(!PageLocked(page));
82 80
83 if (!TestSetPageMlocked(page)) { 81 if (!TestSetPageMlocked(page)) {
84 inc_zone_page_state(page, NR_MLOCK); 82 mod_zone_page_state(page_zone(page), NR_MLOCK,
83 hpage_nr_pages(page));
85 count_vm_event(UNEVICTABLE_PGMLOCKED); 84 count_vm_event(UNEVICTABLE_PGMLOCKED);
86 if (!isolate_lru_page(page)) 85 if (!isolate_lru_page(page))
87 putback_lru_page(page); 86 putback_lru_page(page);
@@ -108,7 +107,8 @@ void munlock_vma_page(struct page *page)
108 BUG_ON(!PageLocked(page)); 107 BUG_ON(!PageLocked(page));
109 108
110 if (TestClearPageMlocked(page)) { 109 if (TestClearPageMlocked(page)) {
111 dec_zone_page_state(page, NR_MLOCK); 110 mod_zone_page_state(page_zone(page), NR_MLOCK,
111 -hpage_nr_pages(page));
112 if (!isolate_lru_page(page)) { 112 if (!isolate_lru_page(page)) {
113 int ret = SWAP_AGAIN; 113 int ret = SWAP_AGAIN;
114 114
@@ -227,7 +227,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
227 if (vma->vm_flags & (VM_IO | VM_PFNMAP)) 227 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
228 goto no_mlock; 228 goto no_mlock;
229 229
230 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || 230 if (!((vma->vm_flags & VM_DONTEXPAND) ||
231 is_vm_hugetlb_page(vma) || 231 is_vm_hugetlb_page(vma) ||
232 vma == get_gate_vma(current->mm))) { 232 vma == get_gate_vma(current->mm))) {
233 233
@@ -290,14 +290,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
290 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); 290 page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
291 if (page && !IS_ERR(page)) { 291 if (page && !IS_ERR(page)) {
292 lock_page(page); 292 lock_page(page);
293 /* 293 munlock_vma_page(page);
294 * Like in __mlock_vma_pages_range(),
295 * because we lock page here and migration is
296 * blocked by the elevated reference, we need
297 * only check for file-cache page truncation.
298 */
299 if (page->mapping)
300 munlock_vma_page(page);
301 unlock_page(page); 294 unlock_page(page);
302 put_page(page); 295 put_page(page);
303 } 296 }
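
Note on the mlock.c hunks: the NR_MLOCK updates switch to mod_zone_page_state() with ±hpage_nr_pages(page), so a transparent huge page moves the counter by its full number of base pages rather than by one, and clear_page_mlock() now tests and clears the flag itself. A toy userspace sketch of that accounting; the structures and the 512-page figure for a 2 MiB THP are illustrative stand-ins for the zone statistics.

/* Sketch: move the counter by the page's base-page count, under the same
 * test-and-set / test-and-clear discipline the kernel flags use. */
#include <stdio.h>
#include <stdbool.h>

struct demo_page {
    bool mlocked;
    bool huge;
};

static long nr_mlock;   /* stands in for the per-zone NR_MLOCK counter */

static int nr_base_pages(const struct demo_page *page)
{
    return page->huge ? 512 : 1;    /* hpage_nr_pages() analogue */
}

static void mlock_page(struct demo_page *page)
{
    if (!page->mlocked) {           /* TestSetPageMlocked() analogue */
        page->mlocked = true;
        nr_mlock += nr_base_pages(page);
    }
}

static void munlock_page(struct demo_page *page)
{
    if (page->mlocked) {            /* TestClearPageMlocked() analogue */
        page->mlocked = false;
        nr_mlock -= nr_base_pages(page);
    }
}

int main(void)
{
    struct demo_page small = { false, false }, thp = { false, true };

    mlock_page(&small);
    mlock_page(&thp);
    printf("NR_MLOCK after mlock: %ld base pages\n", nr_mlock);

    munlock_page(&thp);
    munlock_page(&small);
    printf("NR_MLOCK after munlock: %ld base pages\n", nr_mlock);
    return 0;
}
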
diff --git a/mm/mmap.c b/mm/mmap.c
index 872441e81914..2d942353d681 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -51,12 +51,6 @@ static void unmap_region(struct mm_struct *mm,
51 struct vm_area_struct *vma, struct vm_area_struct *prev, 51 struct vm_area_struct *vma, struct vm_area_struct *prev,
52 unsigned long start, unsigned long end); 52 unsigned long start, unsigned long end);
53 53
54/*
55 * WARNING: the debugging will use recursive algorithms so never enable this
56 * unless you know what you are doing.
57 */
58#undef DEBUG_MM_RB
59
60/* description of effects of mapping type and prot in current implementation. 54/* description of effects of mapping type and prot in current implementation.
61 * this is due to the limited x86 page protection hardware. The expected 55 * this is due to the limited x86 page protection hardware. The expected
62 * behavior is in parens: 56 * behavior is in parens:
@@ -199,14 +193,14 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
199 193
200 flush_dcache_mmap_lock(mapping); 194 flush_dcache_mmap_lock(mapping);
201 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 195 if (unlikely(vma->vm_flags & VM_NONLINEAR))
202 list_del_init(&vma->shared.vm_set.list); 196 list_del_init(&vma->shared.nonlinear);
203 else 197 else
204 vma_prio_tree_remove(vma, &mapping->i_mmap); 198 vma_interval_tree_remove(vma, &mapping->i_mmap);
205 flush_dcache_mmap_unlock(mapping); 199 flush_dcache_mmap_unlock(mapping);
206} 200}
207 201
208/* 202/*
209 * Unlink a file-based vm structure from its prio_tree, to hide 203 * Unlink a file-based vm structure from its interval tree, to hide
210 * vma from rmap and vmtruncate before freeing its page tables. 204 * vma from rmap and vmtruncate before freeing its page tables.
211 */ 205 */
212void unlink_file_vma(struct vm_area_struct *vma) 206void unlink_file_vma(struct vm_area_struct *vma)
@@ -231,11 +225,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
231 might_sleep(); 225 might_sleep();
232 if (vma->vm_ops && vma->vm_ops->close) 226 if (vma->vm_ops && vma->vm_ops->close)
233 vma->vm_ops->close(vma); 227 vma->vm_ops->close(vma);
234 if (vma->vm_file) { 228 if (vma->vm_file)
235 fput(vma->vm_file); 229 fput(vma->vm_file);
236 if (vma->vm_flags & VM_EXECUTABLE)
237 removed_exe_file_vma(vma->vm_mm);
238 }
239 mpol_put(vma_policy(vma)); 230 mpol_put(vma_policy(vma));
240 kmem_cache_free(vm_area_cachep, vma); 231 kmem_cache_free(vm_area_cachep, vma);
241 return next; 232 return next;
@@ -306,7 +297,7 @@ out:
306 return retval; 297 return retval;
307} 298}
308 299
309#ifdef DEBUG_MM_RB 300#ifdef CONFIG_DEBUG_VM_RB
310static int browse_rb(struct rb_root *root) 301static int browse_rb(struct rb_root *root)
311{ 302{
312 int i = 0, j; 303 int i = 0, j;
@@ -340,9 +331,12 @@ void validate_mm(struct mm_struct *mm)
340{ 331{
341 int bug = 0; 332 int bug = 0;
342 int i = 0; 333 int i = 0;
343 struct vm_area_struct *tmp = mm->mmap; 334 struct vm_area_struct *vma = mm->mmap;
344 while (tmp) { 335 while (vma) {
345 tmp = tmp->vm_next; 336 struct anon_vma_chain *avc;
337 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
338 anon_vma_interval_tree_verify(avc);
339 vma = vma->vm_next;
346 i++; 340 i++;
347 } 341 }
348 if (i != mm->map_count) 342 if (i != mm->map_count)
@@ -356,17 +350,46 @@ void validate_mm(struct mm_struct *mm)
356#define validate_mm(mm) do { } while (0) 350#define validate_mm(mm) do { } while (0)
357#endif 351#endif
358 352
359static struct vm_area_struct * 353/*
360find_vma_prepare(struct mm_struct *mm, unsigned long addr, 354 * vma has some anon_vma assigned, and is already inserted on that
361 struct vm_area_struct **pprev, struct rb_node ***rb_link, 355 * anon_vma's interval trees.
362 struct rb_node ** rb_parent) 356 *
357 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
358 * vma must be removed from the anon_vma's interval trees using
359 * anon_vma_interval_tree_pre_update_vma().
360 *
361 * After the update, the vma will be reinserted using
362 * anon_vma_interval_tree_post_update_vma().
363 *
364 * The entire update must be protected by exclusive mmap_sem and by
365 * the root anon_vma's mutex.
366 */
367static inline void
368anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
363{ 369{
364 struct vm_area_struct * vma; 370 struct anon_vma_chain *avc;
365 struct rb_node ** __rb_link, * __rb_parent, * rb_prev; 371
372 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
373 anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
374}
375
376static inline void
377anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
378{
379 struct anon_vma_chain *avc;
380
381 list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
382 anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
383}
384
385static int find_vma_links(struct mm_struct *mm, unsigned long addr,
386 unsigned long end, struct vm_area_struct **pprev,
387 struct rb_node ***rb_link, struct rb_node **rb_parent)
388{
389 struct rb_node **__rb_link, *__rb_parent, *rb_prev;
366 390
367 __rb_link = &mm->mm_rb.rb_node; 391 __rb_link = &mm->mm_rb.rb_node;
368 rb_prev = __rb_parent = NULL; 392 rb_prev = __rb_parent = NULL;
369 vma = NULL;
370 393
371 while (*__rb_link) { 394 while (*__rb_link) {
372 struct vm_area_struct *vma_tmp; 395 struct vm_area_struct *vma_tmp;
@@ -375,9 +398,9 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
375 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); 398 vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb);
376 399
377 if (vma_tmp->vm_end > addr) { 400 if (vma_tmp->vm_end > addr) {
378 vma = vma_tmp; 401 /* Fail if an existing vma overlaps the area */
379 if (vma_tmp->vm_start <= addr) 402 if (vma_tmp->vm_start < end)
380 break; 403 return -ENOMEM;
381 __rb_link = &__rb_parent->rb_left; 404 __rb_link = &__rb_parent->rb_left;
382 } else { 405 } else {
383 rb_prev = __rb_parent; 406 rb_prev = __rb_parent;
@@ -390,7 +413,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
390 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); 413 *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
391 *rb_link = __rb_link; 414 *rb_link = __rb_link;
392 *rb_parent = __rb_parent; 415 *rb_parent = __rb_parent;
393 return vma; 416 return 0;
394} 417}
395 418
396void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 419void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
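
Note on the hunk above: anon_vma_interval_tree_pre_update_vma()/post_update_vma() exist because vm_start/vm_end/vm_pgoff are the keys of the vma's interval-tree entries, so the vma has to come off every tree before those fields change and be reinserted afterwards. A userspace sketch of that remove/mutate/reinsert discipline, using a sorted array keyed by start as a stand-in for the interval tree; names and sizes are illustrative.

/* Sketch: never mutate a key while the element sits in a sorted container. */
#include <stdio.h>
#include <string.h>

struct entry { unsigned long start, end; };

static void insert_sorted(struct entry *arr, int *n, struct entry e)
{
    int i = *n;

    while (i > 0 && arr[i - 1].start > e.start) {   /* keep key order */
        arr[i] = arr[i - 1];
        i--;
    }
    arr[i] = e;
    (*n)++;
}

static void remove_at(struct entry *arr, int *n, int idx)
{
    memmove(&arr[idx], &arr[idx + 1], (*n - idx - 1) * sizeof(*arr));
    (*n)--;
}

int main(void)
{
    struct entry tree[4];
    int n = 0;

    insert_sorted(tree, &n, (struct entry){ 0x1000, 0x2000 });
    insert_sorted(tree, &n, (struct entry){ 0x5000, 0x6000 });

    /* "pre-update": take the entry out before touching its key */
    struct entry vma = tree[0];
    remove_at(tree, &n, 0);

    vma.start = 0x7000;             /* mutate the key fields */
    vma.end = 0x8000;

    /* "post-update": reinsert so ordering invariants still hold */
    insert_sorted(tree, &n, vma);

    for (int i = 0; i < n; i++)
        printf("[%#lx, %#lx)\n", tree[i].start, tree[i].end);
    return 0;
}
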
@@ -417,7 +440,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
417 if (unlikely(vma->vm_flags & VM_NONLINEAR)) 440 if (unlikely(vma->vm_flags & VM_NONLINEAR))
418 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 441 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
419 else 442 else
420 vma_prio_tree_insert(vma, &mapping->i_mmap); 443 vma_interval_tree_insert(vma, &mapping->i_mmap);
421 flush_dcache_mmap_unlock(mapping); 444 flush_dcache_mmap_unlock(mapping);
422 } 445 }
423} 446}
@@ -455,15 +478,16 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
455 478
456/* 479/*
457 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the 480 * Helper for vma_adjust() in the split_vma insert case: insert a vma into the
458 * mm's list and rbtree. It has already been inserted into the prio_tree. 481 * mm's list and rbtree. It has already been inserted into the interval tree.
459 */ 482 */
460static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) 483static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
461{ 484{
462 struct vm_area_struct *__vma, *prev; 485 struct vm_area_struct *prev;
463 struct rb_node **rb_link, *rb_parent; 486 struct rb_node **rb_link, *rb_parent;
464 487
465 __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); 488 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
466 BUG_ON(__vma && __vma->vm_start < vma->vm_end); 489 &prev, &rb_link, &rb_parent))
490 BUG();
467 __vma_link(mm, vma, prev, rb_link, rb_parent); 491 __vma_link(mm, vma, prev, rb_link, rb_parent);
468 mm->map_count++; 492 mm->map_count++;
469} 493}
@@ -496,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
496 struct vm_area_struct *next = vma->vm_next; 520 struct vm_area_struct *next = vma->vm_next;
497 struct vm_area_struct *importer = NULL; 521 struct vm_area_struct *importer = NULL;
498 struct address_space *mapping = NULL; 522 struct address_space *mapping = NULL;
499 struct prio_tree_root *root = NULL; 523 struct rb_root *root = NULL;
500 struct anon_vma *anon_vma = NULL; 524 struct anon_vma *anon_vma = NULL;
501 struct file *file = vma->vm_file; 525 struct file *file = vma->vm_file;
502 long adjust_next = 0; 526 long adjust_next = 0;
@@ -559,7 +583,7 @@ again: remove_next = 1 + (end > next->vm_end);
559 mutex_lock(&mapping->i_mmap_mutex); 583 mutex_lock(&mapping->i_mmap_mutex);
560 if (insert) { 584 if (insert) {
561 /* 585 /*
562 * Put into prio_tree now, so instantiated pages 586 * Put into interval tree now, so instantiated pages
563 * are visible to arm/parisc __flush_dcache_page 587 * are visible to arm/parisc __flush_dcache_page
564 * throughout; but we cannot insert into address 588 * throughout; but we cannot insert into address
565 * space until vma start or end is updated. 589 * space until vma start or end is updated.
@@ -570,22 +594,23 @@ again: remove_next = 1 + (end > next->vm_end);
570 594
571 vma_adjust_trans_huge(vma, start, end, adjust_next); 595 vma_adjust_trans_huge(vma, start, end, adjust_next);
572 596
573 /* 597 anon_vma = vma->anon_vma;
574 * When changing only vma->vm_end, we don't really need anon_vma 598 if (!anon_vma && adjust_next)
575 * lock. This is a fairly rare case by itself, but the anon_vma 599 anon_vma = next->anon_vma;
576 * lock may be shared between many sibling processes. Skipping 600 if (anon_vma) {
577 * the lock for brk adjustments makes a difference sometimes. 601 VM_BUG_ON(adjust_next && next->anon_vma &&
578 */ 602 anon_vma != next->anon_vma);
579 if (vma->anon_vma && (importer || start != vma->vm_start)) {
580 anon_vma = vma->anon_vma;
581 anon_vma_lock(anon_vma); 603 anon_vma_lock(anon_vma);
604 anon_vma_interval_tree_pre_update_vma(vma);
605 if (adjust_next)
606 anon_vma_interval_tree_pre_update_vma(next);
582 } 607 }
583 608
584 if (root) { 609 if (root) {
585 flush_dcache_mmap_lock(mapping); 610 flush_dcache_mmap_lock(mapping);
586 vma_prio_tree_remove(vma, root); 611 vma_interval_tree_remove(vma, root);
587 if (adjust_next) 612 if (adjust_next)
588 vma_prio_tree_remove(next, root); 613 vma_interval_tree_remove(next, root);
589 } 614 }
590 615
591 vma->vm_start = start; 616 vma->vm_start = start;
@@ -598,8 +623,8 @@ again: remove_next = 1 + (end > next->vm_end);
598 623
599 if (root) { 624 if (root) {
600 if (adjust_next) 625 if (adjust_next)
601 vma_prio_tree_insert(next, root); 626 vma_interval_tree_insert(next, root);
602 vma_prio_tree_insert(vma, root); 627 vma_interval_tree_insert(vma, root);
603 flush_dcache_mmap_unlock(mapping); 628 flush_dcache_mmap_unlock(mapping);
604 } 629 }
605 630
@@ -620,8 +645,12 @@ again: remove_next = 1 + (end > next->vm_end);
620 __insert_vm_struct(mm, insert); 645 __insert_vm_struct(mm, insert);
621 } 646 }
622 647
623 if (anon_vma) 648 if (anon_vma) {
649 anon_vma_interval_tree_post_update_vma(vma);
650 if (adjust_next)
651 anon_vma_interval_tree_post_update_vma(next);
624 anon_vma_unlock(anon_vma); 652 anon_vma_unlock(anon_vma);
653 }
625 if (mapping) 654 if (mapping)
626 mutex_unlock(&mapping->i_mmap_mutex); 655 mutex_unlock(&mapping->i_mmap_mutex);
627 656
@@ -636,8 +665,6 @@ again: remove_next = 1 + (end > next->vm_end);
636 if (file) { 665 if (file) {
637 uprobe_munmap(next, next->vm_start, next->vm_end); 666 uprobe_munmap(next, next->vm_start, next->vm_end);
638 fput(file); 667 fput(file);
639 if (next->vm_flags & VM_EXECUTABLE)
640 removed_exe_file_vma(mm);
641 } 668 }
642 if (next->anon_vma) 669 if (next->anon_vma)
643 anon_vma_merge(vma, next); 670 anon_vma_merge(vma, next);
@@ -669,8 +696,7 @@ again: remove_next = 1 + (end > next->vm_end);
669static inline int is_mergeable_vma(struct vm_area_struct *vma, 696static inline int is_mergeable_vma(struct vm_area_struct *vma,
670 struct file *file, unsigned long vm_flags) 697 struct file *file, unsigned long vm_flags)
671{ 698{
672 /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ 699 if (vma->vm_flags ^ vm_flags)
673 if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR)
674 return 0; 700 return 0;
675 if (vma->vm_file != file) 701 if (vma->vm_file != file)
676 return 0; 702 return 0;
@@ -951,8 +977,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
951 mm->exec_vm += pages; 977 mm->exec_vm += pages;
952 } else if (flags & stack_flags) 978 } else if (flags & stack_flags)
953 mm->stack_vm += pages; 979 mm->stack_vm += pages;
954 if (flags & (VM_RESERVED|VM_IO))
955 mm->reserved_vm += pages;
956} 980}
957#endif /* CONFIG_PROC_FS */ 981#endif /* CONFIG_PROC_FS */
958 982
@@ -1190,7 +1214,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1190 return 0; 1214 return 0;
1191 1215
1192 /* Specialty mapping? */ 1216 /* Specialty mapping? */
1193 if (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) 1217 if (vm_flags & VM_PFNMAP)
1194 return 0; 1218 return 0;
1195 1219
1196 /* Can the mapping track the dirty pages? */ 1220 /* Can the mapping track the dirty pages? */
@@ -1229,8 +1253,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
1229 /* Clear old maps */ 1253 /* Clear old maps */
1230 error = -ENOMEM; 1254 error = -ENOMEM;
1231munmap_back: 1255munmap_back:
1232 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 1256 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
1233 if (vma && vma->vm_start < addr + len) {
1234 if (do_munmap(mm, addr, len)) 1257 if (do_munmap(mm, addr, len))
1235 return -ENOMEM; 1258 return -ENOMEM;
1236 goto munmap_back; 1259 goto munmap_back;
@@ -1305,8 +1328,6 @@ munmap_back:
1305 error = file->f_op->mmap(file, vma); 1328 error = file->f_op->mmap(file, vma);
1306 if (error) 1329 if (error)
1307 goto unmap_and_free_vma; 1330 goto unmap_and_free_vma;
1308 if (vm_flags & VM_EXECUTABLE)
1309 added_exe_file_vma(mm);
1310 1331
1311 /* Can addr have changed?? 1332 /* Can addr have changed??
1312 * 1333 *
@@ -1757,13 +1778,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1757 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { 1778 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
1758 error = acct_stack_growth(vma, size, grow); 1779 error = acct_stack_growth(vma, size, grow);
1759 if (!error) { 1780 if (!error) {
1781 anon_vma_interval_tree_pre_update_vma(vma);
1760 vma->vm_end = address; 1782 vma->vm_end = address;
1783 anon_vma_interval_tree_post_update_vma(vma);
1761 perf_event_mmap(vma); 1784 perf_event_mmap(vma);
1762 } 1785 }
1763 } 1786 }
1764 } 1787 }
1765 vma_unlock_anon_vma(vma); 1788 vma_unlock_anon_vma(vma);
1766 khugepaged_enter_vma_merge(vma); 1789 khugepaged_enter_vma_merge(vma);
1790 validate_mm(vma->vm_mm);
1767 return error; 1791 return error;
1768} 1792}
1769#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1793#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1807,14 +1831,17 @@ int expand_downwards(struct vm_area_struct *vma,
1807 if (grow <= vma->vm_pgoff) { 1831 if (grow <= vma->vm_pgoff) {
1808 error = acct_stack_growth(vma, size, grow); 1832 error = acct_stack_growth(vma, size, grow);
1809 if (!error) { 1833 if (!error) {
1834 anon_vma_interval_tree_pre_update_vma(vma);
1810 vma->vm_start = address; 1835 vma->vm_start = address;
1811 vma->vm_pgoff -= grow; 1836 vma->vm_pgoff -= grow;
1837 anon_vma_interval_tree_post_update_vma(vma);
1812 perf_event_mmap(vma); 1838 perf_event_mmap(vma);
1813 } 1839 }
1814 } 1840 }
1815 } 1841 }
1816 vma_unlock_anon_vma(vma); 1842 vma_unlock_anon_vma(vma);
1817 khugepaged_enter_vma_merge(vma); 1843 khugepaged_enter_vma_merge(vma);
1844 validate_mm(vma->vm_mm);
1818 return error; 1845 return error;
1819} 1846}
1820 1847
@@ -1988,11 +2015,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1988 if (anon_vma_clone(new, vma)) 2015 if (anon_vma_clone(new, vma))
1989 goto out_free_mpol; 2016 goto out_free_mpol;
1990 2017
1991 if (new->vm_file) { 2018 if (new->vm_file)
1992 get_file(new->vm_file); 2019 get_file(new->vm_file);
1993 if (vma->vm_flags & VM_EXECUTABLE)
1994 added_exe_file_vma(mm);
1995 }
1996 2020
1997 if (new->vm_ops && new->vm_ops->open) 2021 if (new->vm_ops && new->vm_ops->open)
1998 new->vm_ops->open(new); 2022 new->vm_ops->open(new);
@@ -2010,11 +2034,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2010 /* Clean everything up if vma_adjust failed. */ 2034 /* Clean everything up if vma_adjust failed. */
2011 if (new->vm_ops && new->vm_ops->close) 2035 if (new->vm_ops && new->vm_ops->close)
2012 new->vm_ops->close(new); 2036 new->vm_ops->close(new);
2013 if (new->vm_file) { 2037 if (new->vm_file)
2014 if (vma->vm_flags & VM_EXECUTABLE)
2015 removed_exe_file_vma(mm);
2016 fput(new->vm_file); 2038 fput(new->vm_file);
2017 }
2018 unlink_anon_vmas(new); 2039 unlink_anon_vmas(new);
2019 out_free_mpol: 2040 out_free_mpol:
2020 mpol_put(pol); 2041 mpol_put(pol);
@@ -2199,8 +2220,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
2199 * Clear old maps. this also does some error checking for us 2220 * Clear old maps. this also does some error checking for us
2200 */ 2221 */
2201 munmap_back: 2222 munmap_back:
2202 vma = find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 2223 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) {
2203 if (vma && vma->vm_start < addr + len) {
2204 if (do_munmap(mm, addr, len)) 2224 if (do_munmap(mm, addr, len))
2205 return -ENOMEM; 2225 return -ENOMEM;
2206 goto munmap_back; 2226 goto munmap_back;
@@ -2314,10 +2334,10 @@ void exit_mmap(struct mm_struct *mm)
2314 * and into the inode's i_mmap tree. If vm_file is non-NULL 2334 * and into the inode's i_mmap tree. If vm_file is non-NULL
2315 * then i_mmap_mutex is taken here. 2335 * then i_mmap_mutex is taken here.
2316 */ 2336 */
2317int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) 2337int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
2318{ 2338{
2319 struct vm_area_struct * __vma, * prev; 2339 struct vm_area_struct *prev;
2320 struct rb_node ** rb_link, * rb_parent; 2340 struct rb_node **rb_link, *rb_parent;
2321 2341
2322 /* 2342 /*
2323 * The vm_pgoff of a purely anonymous vma should be irrelevant 2343 * The vm_pgoff of a purely anonymous vma should be irrelevant
@@ -2335,8 +2355,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2335 BUG_ON(vma->anon_vma); 2355 BUG_ON(vma->anon_vma);
2336 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; 2356 vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
2337 } 2357 }
2338 __vma = find_vma_prepare(mm,vma->vm_start,&prev,&rb_link,&rb_parent); 2358 if (find_vma_links(mm, vma->vm_start, vma->vm_end,
2339 if (__vma && __vma->vm_start < vma->vm_end) 2359 &prev, &rb_link, &rb_parent))
2340 return -ENOMEM; 2360 return -ENOMEM;
2341 if ((vma->vm_flags & VM_ACCOUNT) && 2361 if ((vma->vm_flags & VM_ACCOUNT) &&
2342 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2362 security_vm_enough_memory_mm(mm, vma_pages(vma)))
@@ -2351,7 +2371,8 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2351 * prior to moving page table entries, to effect an mremap move. 2371 * prior to moving page table entries, to effect an mremap move.
2352 */ 2372 */
2353struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, 2373struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2354 unsigned long addr, unsigned long len, pgoff_t pgoff) 2374 unsigned long addr, unsigned long len, pgoff_t pgoff,
2375 bool *need_rmap_locks)
2355{ 2376{
2356 struct vm_area_struct *vma = *vmap; 2377 struct vm_area_struct *vma = *vmap;
2357 unsigned long vma_start = vma->vm_start; 2378 unsigned long vma_start = vma->vm_start;
@@ -2370,7 +2391,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2370 faulted_in_anon_vma = false; 2391 faulted_in_anon_vma = false;
2371 } 2392 }
2372 2393
2373 find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); 2394 if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent))
2395 return NULL; /* should never get here */
2374 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, 2396 new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
2375 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); 2397 vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma));
2376 if (new_vma) { 2398 if (new_vma) {
@@ -2392,32 +2414,29 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2392 * linear if there are no pages mapped yet. 2414 * linear if there are no pages mapped yet.
2393 */ 2415 */
2394 VM_BUG_ON(faulted_in_anon_vma); 2416 VM_BUG_ON(faulted_in_anon_vma);
2395 *vmap = new_vma; 2417 *vmap = vma = new_vma;
2396 } else 2418 }
2397 anon_vma_moveto_tail(new_vma); 2419 *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
2398 } else { 2420 } else {
2399 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2421 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2400 if (new_vma) { 2422 if (new_vma) {
2401 *new_vma = *vma; 2423 *new_vma = *vma;
2424 new_vma->vm_start = addr;
2425 new_vma->vm_end = addr + len;
2426 new_vma->vm_pgoff = pgoff;
2402 pol = mpol_dup(vma_policy(vma)); 2427 pol = mpol_dup(vma_policy(vma));
2403 if (IS_ERR(pol)) 2428 if (IS_ERR(pol))
2404 goto out_free_vma; 2429 goto out_free_vma;
2430 vma_set_policy(new_vma, pol);
2405 INIT_LIST_HEAD(&new_vma->anon_vma_chain); 2431 INIT_LIST_HEAD(&new_vma->anon_vma_chain);
2406 if (anon_vma_clone(new_vma, vma)) 2432 if (anon_vma_clone(new_vma, vma))
2407 goto out_free_mempol; 2433 goto out_free_mempol;
2408 vma_set_policy(new_vma, pol); 2434 if (new_vma->vm_file)
2409 new_vma->vm_start = addr;
2410 new_vma->vm_end = addr + len;
2411 new_vma->vm_pgoff = pgoff;
2412 if (new_vma->vm_file) {
2413 get_file(new_vma->vm_file); 2435 get_file(new_vma->vm_file);
2414
2415 if (vma->vm_flags & VM_EXECUTABLE)
2416 added_exe_file_vma(mm);
2417 }
2418 if (new_vma->vm_ops && new_vma->vm_ops->open) 2436 if (new_vma->vm_ops && new_vma->vm_ops->open)
2419 new_vma->vm_ops->open(new_vma); 2437 new_vma->vm_ops->open(new_vma);
2420 vma_link(mm, new_vma, prev, rb_link, rb_parent); 2438 vma_link(mm, new_vma, prev, rb_link, rb_parent);
2439 *need_rmap_locks = false;
2421 } 2440 }
2422 } 2441 }
2423 return new_vma; 2442 return new_vma;
@@ -2535,7 +2554,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
2535 2554
2536static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2555static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2537{ 2556{
2538 if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { 2557 if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
2539 /* 2558 /*
2540 * The LSB of head.next can't change from under us 2559 * The LSB of head.next can't change from under us
2541 * because we hold the mm_all_locks_mutex. 2560 * because we hold the mm_all_locks_mutex.
@@ -2551,7 +2570,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2551 * anon_vma->root->mutex. 2570 * anon_vma->root->mutex.
2552 */ 2571 */
2553 if (__test_and_set_bit(0, (unsigned long *) 2572 if (__test_and_set_bit(0, (unsigned long *)
2554 &anon_vma->root->head.next)) 2573 &anon_vma->root->rb_root.rb_node))
2555 BUG(); 2574 BUG();
2556 } 2575 }
2557} 2576}
@@ -2592,7 +2611,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2592 * A single task can't take more than one mm_take_all_locks() in a row 2611 * A single task can't take more than one mm_take_all_locks() in a row
2593 * or it would deadlock. 2612 * or it would deadlock.
2594 * 2613 *
2595 * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in 2614 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
2596 * mapping->flags avoid to take the same lock twice, if more than one 2615 * mapping->flags avoid to take the same lock twice, if more than one
2597 * vma in this mm is backed by the same anon_vma or address_space. 2616 * vma in this mm is backed by the same anon_vma or address_space.
2598 * 2617 *
@@ -2639,13 +2658,13 @@ out_unlock:
2639 2658
2640static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2659static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2641{ 2660{
2642 if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { 2661 if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_node)) {
2643 /* 2662 /*
2644 * The LSB of head.next can't change to 0 from under 2663 * The LSB of head.next can't change to 0 from under
2645 * us because we hold the mm_all_locks_mutex. 2664 * us because we hold the mm_all_locks_mutex.
2646 * 2665 *
2647 * We must however clear the bitflag before unlocking 2666 * We must however clear the bitflag before unlocking
2648 * the vma so the users using the anon_vma->head will 2667 * the vma so the users using the anon_vma->rb_root will
2649 * never see our bitflag. 2668 * never see our bitflag.
2650 * 2669 *
2651 * No need of atomic instructions here, head.next 2670 * No need of atomic instructions here, head.next
@@ -2653,7 +2672,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2653 * anon_vma->root->mutex. 2672 * anon_vma->root->mutex.
2654 */ 2673 */
2655 if (!__test_and_clear_bit(0, (unsigned long *) 2674 if (!__test_and_clear_bit(0, (unsigned long *)
2656 &anon_vma->root->head.next)) 2675 &anon_vma->root->rb_root.rb_node))
2657 BUG(); 2676 BUG();
2658 anon_vma_unlock(anon_vma); 2677 anon_vma_unlock(anon_vma);
2659 } 2678 }
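
Taken together, the vma_adjust(), expand_upwards() and expand_downwards() hunks above follow one bracketing pattern whenever vm_start, vm_end or vm_pgoff change while the vma sits on the file and anon_vma interval trees: take i_mmap_mutex and the anon_vma lock, pull the vma off both trees (vma_interval_tree_remove() plus the pre_update helper), update the fields, then re-insert (vma_interval_tree_insert() plus the post_update helper) before dropping the locks. A condensed sketch of that ordering — the adjust_next/insert cases and all error handling are deliberately omitted, and resize_mapped_vma() is an illustrative name, not a kernel symbol:

static void resize_mapped_vma(struct vm_area_struct *vma,
                              unsigned long start, unsigned long end,
                              pgoff_t pgoff)
{
        struct address_space *mapping = vma->vm_file ?
                                        vma->vm_file->f_mapping : NULL;

        if (mapping)
                mutex_lock(&mapping->i_mmap_mutex);
        if (vma->anon_vma) {
                anon_vma_lock(vma->anon_vma);
                anon_vma_interval_tree_pre_update_vma(vma);
        }
        if (mapping) {
                flush_dcache_mmap_lock(mapping);
                vma_interval_tree_remove(vma, &mapping->i_mmap);
        }

        /* Both trees key off these fields, so the node must be off-tree. */
        vma->vm_start = start;
        vma->vm_end = end;
        vma->vm_pgoff = pgoff;

        if (mapping) {
                vma_interval_tree_insert(vma, &mapping->i_mmap);
                flush_dcache_mmap_unlock(mapping);
        }
        if (vma->anon_vma) {
                anon_vma_interval_tree_post_update_vma(vma);
                anon_vma_unlock(vma->anon_vma);
        }
        if (mapping)
                mutex_unlock(&mapping->i_mmap_mutex);
}
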
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 862b60822d9f..479a1e751a73 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -14,10 +14,14 @@
14#include <linux/export.h> 14#include <linux/export.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/srcu.h>
17#include <linux/rcupdate.h> 18#include <linux/rcupdate.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/slab.h> 20#include <linux/slab.h>
20 21
22/* global SRCU for all MMs */
23static struct srcu_struct srcu;
24
21/* 25/*
22 * This function can't run concurrently against mmu_notifier_register 26 * This function can't run concurrently against mmu_notifier_register
23 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap 27 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
@@ -25,8 +29,8 @@
25 * in parallel despite there being no task using this mm any more, 29 * in parallel despite there being no task using this mm any more,
26 * through the vmas outside of the exit_mmap context, such as with 30 * through the vmas outside of the exit_mmap context, such as with
27 * vmtruncate. This serializes against mmu_notifier_unregister with 31 * vmtruncate. This serializes against mmu_notifier_unregister with
28 * the mmu_notifier_mm->lock in addition to RCU and it serializes 32 * the mmu_notifier_mm->lock in addition to SRCU and it serializes
29 * against the other mmu notifiers with RCU. struct mmu_notifier_mm 33 * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
30 * can't go away from under us as exit_mmap holds an mm_count pin 34 * can't go away from under us as exit_mmap holds an mm_count pin
31 * itself. 35 * itself.
32 */ 36 */
@@ -34,12 +38,13 @@ void __mmu_notifier_release(struct mm_struct *mm)
34{ 38{
35 struct mmu_notifier *mn; 39 struct mmu_notifier *mn;
36 struct hlist_node *n; 40 struct hlist_node *n;
41 int id;
37 42
38 /* 43 /*
39 * RCU here will block mmu_notifier_unregister until 44 * SRCU here will block mmu_notifier_unregister until
40 * ->release returns. 45 * ->release returns.
41 */ 46 */
42 rcu_read_lock(); 47 id = srcu_read_lock(&srcu);
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) 48 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /* 49 /*
45 * if ->release runs before mmu_notifier_unregister it 50 * if ->release runs before mmu_notifier_unregister it
@@ -50,7 +55,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
50 */ 55 */
51 if (mn->ops->release) 56 if (mn->ops->release)
52 mn->ops->release(mn, mm); 57 mn->ops->release(mn, mm);
53 rcu_read_unlock(); 58 srcu_read_unlock(&srcu, id);
54 59
55 spin_lock(&mm->mmu_notifier_mm->lock); 60 spin_lock(&mm->mmu_notifier_mm->lock);
56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 61 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -68,7 +73,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
68 spin_unlock(&mm->mmu_notifier_mm->lock); 73 spin_unlock(&mm->mmu_notifier_mm->lock);
69 74
70 /* 75 /*
71 * synchronize_rcu here prevents mmu_notifier_release to 76 * synchronize_srcu here prevents mmu_notifier_release to
72 * return to exit_mmap (which would proceed freeing all pages 77 * return to exit_mmap (which would proceed freeing all pages
73 * in the mm) until the ->release method returns, if it was 78 * in the mm) until the ->release method returns, if it was
74 * invoked by mmu_notifier_unregister. 79 * invoked by mmu_notifier_unregister.
@@ -76,7 +81,7 @@ void __mmu_notifier_release(struct mm_struct *mm)
76 * The mmu_notifier_mm can't go away from under us because one 81 * The mmu_notifier_mm can't go away from under us because one
77 * mm_count is hold by exit_mmap. 82 * mm_count is hold by exit_mmap.
78 */ 83 */
79 synchronize_rcu(); 84 synchronize_srcu(&srcu);
80} 85}
81 86
82/* 87/*
@@ -89,14 +94,14 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
89{ 94{
90 struct mmu_notifier *mn; 95 struct mmu_notifier *mn;
91 struct hlist_node *n; 96 struct hlist_node *n;
92 int young = 0; 97 int young = 0, id;
93 98
94 rcu_read_lock(); 99 id = srcu_read_lock(&srcu);
95 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 100 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
96 if (mn->ops->clear_flush_young) 101 if (mn->ops->clear_flush_young)
97 young |= mn->ops->clear_flush_young(mn, mm, address); 102 young |= mn->ops->clear_flush_young(mn, mm, address);
98 } 103 }
99 rcu_read_unlock(); 104 srcu_read_unlock(&srcu, id);
100 105
101 return young; 106 return young;
102} 107}
@@ -106,9 +111,9 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
106{ 111{
107 struct mmu_notifier *mn; 112 struct mmu_notifier *mn;
108 struct hlist_node *n; 113 struct hlist_node *n;
109 int young = 0; 114 int young = 0, id;
110 115
111 rcu_read_lock(); 116 id = srcu_read_lock(&srcu);
112 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 117 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
113 if (mn->ops->test_young) { 118 if (mn->ops->test_young) {
114 young = mn->ops->test_young(mn, mm, address); 119 young = mn->ops->test_young(mn, mm, address);
@@ -116,7 +121,7 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
116 break; 121 break;
117 } 122 }
118 } 123 }
119 rcu_read_unlock(); 124 srcu_read_unlock(&srcu, id);
120 125
121 return young; 126 return young;
122} 127}
@@ -126,19 +131,14 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
126{ 131{
127 struct mmu_notifier *mn; 132 struct mmu_notifier *mn;
128 struct hlist_node *n; 133 struct hlist_node *n;
134 int id;
129 135
130 rcu_read_lock(); 136 id = srcu_read_lock(&srcu);
131 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
132 if (mn->ops->change_pte) 138 if (mn->ops->change_pte)
133 mn->ops->change_pte(mn, mm, address, pte); 139 mn->ops->change_pte(mn, mm, address, pte);
134 /*
135 * Some drivers don't have change_pte,
136 * so we must call invalidate_page in that case.
137 */
138 else if (mn->ops->invalidate_page)
139 mn->ops->invalidate_page(mn, mm, address);
140 } 140 }
141 rcu_read_unlock(); 141 srcu_read_unlock(&srcu, id);
142} 142}
143 143
144void __mmu_notifier_invalidate_page(struct mm_struct *mm, 144void __mmu_notifier_invalidate_page(struct mm_struct *mm,
@@ -146,13 +146,14 @@ void __mmu_notifier_invalidate_page(struct mm_struct *mm,
146{ 146{
147 struct mmu_notifier *mn; 147 struct mmu_notifier *mn;
148 struct hlist_node *n; 148 struct hlist_node *n;
149 int id;
149 150
150 rcu_read_lock(); 151 id = srcu_read_lock(&srcu);
151 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 152 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
152 if (mn->ops->invalidate_page) 153 if (mn->ops->invalidate_page)
153 mn->ops->invalidate_page(mn, mm, address); 154 mn->ops->invalidate_page(mn, mm, address);
154 } 155 }
155 rcu_read_unlock(); 156 srcu_read_unlock(&srcu, id);
156} 157}
157 158
158void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, 159void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
@@ -160,13 +161,14 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
160{ 161{
161 struct mmu_notifier *mn; 162 struct mmu_notifier *mn;
162 struct hlist_node *n; 163 struct hlist_node *n;
164 int id;
163 165
164 rcu_read_lock(); 166 id = srcu_read_lock(&srcu);
165 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 167 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
166 if (mn->ops->invalidate_range_start) 168 if (mn->ops->invalidate_range_start)
167 mn->ops->invalidate_range_start(mn, mm, start, end); 169 mn->ops->invalidate_range_start(mn, mm, start, end);
168 } 170 }
169 rcu_read_unlock(); 171 srcu_read_unlock(&srcu, id);
170} 172}
171 173
172void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, 174void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -174,13 +176,14 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
174{ 176{
175 struct mmu_notifier *mn; 177 struct mmu_notifier *mn;
176 struct hlist_node *n; 178 struct hlist_node *n;
179 int id;
177 180
178 rcu_read_lock(); 181 id = srcu_read_lock(&srcu);
179 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { 182 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
180 if (mn->ops->invalidate_range_end) 183 if (mn->ops->invalidate_range_end)
181 mn->ops->invalidate_range_end(mn, mm, start, end); 184 mn->ops->invalidate_range_end(mn, mm, start, end);
182 } 185 }
183 rcu_read_unlock(); 186 srcu_read_unlock(&srcu, id);
184} 187}
185 188
186static int do_mmu_notifier_register(struct mmu_notifier *mn, 189static int do_mmu_notifier_register(struct mmu_notifier *mn,
@@ -192,22 +195,29 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
192 195
193 BUG_ON(atomic_read(&mm->mm_users) <= 0); 196 BUG_ON(atomic_read(&mm->mm_users) <= 0);
194 197
195 ret = -ENOMEM; 198 /*
196 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); 199 * Verify that mmu_notifier_init() already run and the global srcu is
197 if (unlikely(!mmu_notifier_mm)) 200 * initialized.
198 goto out; 201 */
202 BUG_ON(!srcu.per_cpu_ref);
199 203
200 if (take_mmap_sem) 204 if (take_mmap_sem)
201 down_write(&mm->mmap_sem); 205 down_write(&mm->mmap_sem);
202 ret = mm_take_all_locks(mm); 206 ret = mm_take_all_locks(mm);
203 if (unlikely(ret)) 207 if (unlikely(ret))
204 goto out_cleanup; 208 goto out;
205 209
206 if (!mm_has_notifiers(mm)) { 210 if (!mm_has_notifiers(mm)) {
211 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm),
212 GFP_KERNEL);
213 if (unlikely(!mmu_notifier_mm)) {
214 ret = -ENOMEM;
215 goto out_of_mem;
216 }
207 INIT_HLIST_HEAD(&mmu_notifier_mm->list); 217 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
208 spin_lock_init(&mmu_notifier_mm->lock); 218 spin_lock_init(&mmu_notifier_mm->lock);
219
209 mm->mmu_notifier_mm = mmu_notifier_mm; 220 mm->mmu_notifier_mm = mmu_notifier_mm;
210 mmu_notifier_mm = NULL;
211 } 221 }
212 atomic_inc(&mm->mm_count); 222 atomic_inc(&mm->mm_count);
213 223
@@ -223,13 +233,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
223 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); 233 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
224 spin_unlock(&mm->mmu_notifier_mm->lock); 234 spin_unlock(&mm->mmu_notifier_mm->lock);
225 235
236out_of_mem:
226 mm_drop_all_locks(mm); 237 mm_drop_all_locks(mm);
227out_cleanup: 238out:
228 if (take_mmap_sem) 239 if (take_mmap_sem)
229 up_write(&mm->mmap_sem); 240 up_write(&mm->mmap_sem);
230 /* kfree() does nothing if mmu_notifier_mm is NULL */ 241
231 kfree(mmu_notifier_mm);
232out:
233 BUG_ON(atomic_read(&mm->mm_users) <= 0); 242 BUG_ON(atomic_read(&mm->mm_users) <= 0);
234 return ret; 243 return ret;
235} 244}
@@ -274,8 +283,8 @@ void __mmu_notifier_mm_destroy(struct mm_struct *mm)
274/* 283/*
275 * This releases the mm_count pin automatically and frees the mm 284 * This releases the mm_count pin automatically and frees the mm
276 * structure if it was the last user of it. It serializes against 285 * structure if it was the last user of it. It serializes against
277 * running mmu notifiers with RCU and against mmu_notifier_unregister 286 * running mmu notifiers with SRCU and against mmu_notifier_unregister
278 * with the unregister lock + RCU. All sptes must be dropped before 287 * with the unregister lock + SRCU. All sptes must be dropped before
279 * calling mmu_notifier_unregister. ->release or any other notifier 288 * calling mmu_notifier_unregister. ->release or any other notifier
280 * method may be invoked concurrently with mmu_notifier_unregister, 289 * method may be invoked concurrently with mmu_notifier_unregister,
281 * and only after mmu_notifier_unregister returned we're guaranteed 290 * and only after mmu_notifier_unregister returned we're guaranteed
@@ -287,11 +296,12 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
287 296
288 if (!hlist_unhashed(&mn->hlist)) { 297 if (!hlist_unhashed(&mn->hlist)) {
289 /* 298 /*
290 * RCU here will force exit_mmap to wait ->release to finish 299 * SRCU here will force exit_mmap to wait ->release to finish
291 * before freeing the pages. 300 * before freeing the pages.
292 */ 301 */
293 rcu_read_lock(); 302 int id;
294 303
304 id = srcu_read_lock(&srcu);
295 /* 305 /*
296 * exit_mmap will block in mmu_notifier_release to 306 * exit_mmap will block in mmu_notifier_release to
297 * guarantee ->release is called before freeing the 307 * guarantee ->release is called before freeing the
@@ -299,7 +309,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
299 */ 309 */
300 if (mn->ops->release) 310 if (mn->ops->release)
301 mn->ops->release(mn, mm); 311 mn->ops->release(mn, mm);
302 rcu_read_unlock(); 312 srcu_read_unlock(&srcu, id);
303 313
304 spin_lock(&mm->mmu_notifier_mm->lock); 314 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist); 315 hlist_del_rcu(&mn->hlist);
@@ -310,10 +320,17 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
310 * Wait any running method to finish, of course including 320 * Wait any running method to finish, of course including
311 * ->release if it was run by mmu_notifier_release instead of us. 321
312 */ 322 */
313 synchronize_rcu(); 323 synchronize_srcu(&srcu);
314 324
315 BUG_ON(atomic_read(&mm->mm_count) <= 0); 325 BUG_ON(atomic_read(&mm->mm_count) <= 0);
316 326
317 mmdrop(mm); 327 mmdrop(mm);
318} 328}
319EXPORT_SYMBOL_GPL(mmu_notifier_unregister); 329EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
330
331static int __init mmu_notifier_init(void)
332{
333 return init_srcu_struct(&srcu);
334}
335
336module_init(mmu_notifier_init);
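
The conversion above swaps the plain RCU read side for SRCU so that notifier callbacks such as ->invalidate_range_start() may sleep: readers take srcu_read_lock() and pass the returned index to srcu_read_unlock(), while unregister removes the entry and waits with synchronize_srcu() before the object may be freed. A stripped-down sketch of the same pattern — my_notifier, my_srcu, my_list, my_lock and the function names are illustrative, not kernel APIs; the four-argument hlist_for_each_entry_rcu() matches the kernel version this diff targets:

#include <linux/module.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>

struct my_notifier {
        struct hlist_node hlist;
        void (*callback)(struct my_notifier *mn);
};

static struct srcu_struct my_srcu;
static HLIST_HEAD(my_list);
static DEFINE_SPINLOCK(my_lock);

static void my_register(struct my_notifier *mn)
{
        spin_lock(&my_lock);
        hlist_add_head_rcu(&mn->hlist, &my_list);
        spin_unlock(&my_lock);
}

static void my_notify_all(void)
{
        struct my_notifier *mn;
        struct hlist_node *n;
        int id;

        id = srcu_read_lock(&my_srcu);          /* sleepable read side */
        hlist_for_each_entry_rcu(mn, n, &my_list, hlist)
                mn->callback(mn);               /* callback may block */
        srcu_read_unlock(&my_srcu, id);
}

static void my_unregister(struct my_notifier *mn)
{
        spin_lock(&my_lock);
        hlist_del_rcu(&mn->hlist);
        spin_unlock(&my_lock);
        /* Wait for in-flight callbacks before the caller frees *mn. */
        synchronize_srcu(&my_srcu);
}

static int __init my_srcu_init(void)
{
        return init_srcu_struct(&my_srcu);      /* must run before first use */
}
module_init(my_srcu_init);
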
diff --git a/mm/mremap.c b/mm/mremap.c
index cc06d0e48d05..1b61c2d3307a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -71,22 +71,41 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
71static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, 71static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
72 unsigned long old_addr, unsigned long old_end, 72 unsigned long old_addr, unsigned long old_end,
73 struct vm_area_struct *new_vma, pmd_t *new_pmd, 73 struct vm_area_struct *new_vma, pmd_t *new_pmd,
74 unsigned long new_addr) 74 unsigned long new_addr, bool need_rmap_locks)
75{ 75{
76 struct address_space *mapping = NULL; 76 struct address_space *mapping = NULL;
77 struct anon_vma *anon_vma = NULL;
77 struct mm_struct *mm = vma->vm_mm; 78 struct mm_struct *mm = vma->vm_mm;
78 pte_t *old_pte, *new_pte, pte; 79 pte_t *old_pte, *new_pte, pte;
79 spinlock_t *old_ptl, *new_ptl; 80 spinlock_t *old_ptl, *new_ptl;
80 81
81 if (vma->vm_file) { 82 /*
82 /* 83 * When need_rmap_locks is true, we take the i_mmap_mutex and anon_vma
83 * Subtle point from Rajesh Venkatasubramanian: before 84 * locks to ensure that rmap will always observe either the old or the
84 * moving file-based ptes, we must lock truncate_pagecache 85 * new ptes. This is the easiest way to avoid races with
85 * out, since it might clean the dst vma before the src vma, 86 * truncate_pagecache(), page migration, etc...
86 * and we propagate stale pages into the dst afterward. 87 *
87 */ 88 * When need_rmap_locks is false, we use other ways to avoid
88 mapping = vma->vm_file->f_mapping; 89 * such races:
89 mutex_lock(&mapping->i_mmap_mutex); 90 *
91 * - During exec() shift_arg_pages(), we use a specially tagged vma
92 * which rmap call sites look for using is_vma_temporary_stack().
93 *
94 * - During mremap(), new_vma is often known to be placed after vma
95 * in rmap traversal order. This ensures rmap will always observe
96 * either the old pte, or the new pte, or both (the page table locks
97 * serialize access to individual ptes, but only rmap traversal
98 * order guarantees that we won't miss both the old and new ptes).
99 */
100 if (need_rmap_locks) {
101 if (vma->vm_file) {
102 mapping = vma->vm_file->f_mapping;
103 mutex_lock(&mapping->i_mmap_mutex);
104 }
105 if (vma->anon_vma) {
106 anon_vma = vma->anon_vma;
107 anon_vma_lock(anon_vma);
108 }
90 } 109 }
91 110
92 /* 111 /*
@@ -114,6 +133,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
114 spin_unlock(new_ptl); 133 spin_unlock(new_ptl);
115 pte_unmap(new_pte - 1); 134 pte_unmap(new_pte - 1);
116 pte_unmap_unlock(old_pte - 1, old_ptl); 135 pte_unmap_unlock(old_pte - 1, old_ptl);
136 if (anon_vma)
137 anon_vma_unlock(anon_vma);
117 if (mapping) 138 if (mapping)
118 mutex_unlock(&mapping->i_mmap_mutex); 139 mutex_unlock(&mapping->i_mmap_mutex);
119} 140}
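
The comment block just above spells out when move_ptes() still needs the rmap locks; condensed, the bracketing it adds looks like the sketch below. The PTE-copy loop and the page-table locking are elided, and move_ptes_rmap_sketch() is an illustrative name, not a kernel symbol:

static void move_ptes_rmap_sketch(struct vm_area_struct *vma,
                                  bool need_rmap_locks)
{
        struct address_space *mapping = NULL;
        struct anon_vma *anon_vma = NULL;

        if (need_rmap_locks) {
                if (vma->vm_file) {
                        mapping = vma->vm_file->f_mapping;
                        mutex_lock(&mapping->i_mmap_mutex);
                }
                if (vma->anon_vma) {
                        anon_vma = vma->anon_vma;
                        anon_vma_lock(anon_vma);
                }
        }

        /* ... copy and clear the ptes under both page table locks ... */

        if (anon_vma)
                anon_vma_unlock(anon_vma);
        if (mapping)
                mutex_unlock(&mapping->i_mmap_mutex);
}
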
@@ -122,16 +143,21 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
122 143
123unsigned long move_page_tables(struct vm_area_struct *vma, 144unsigned long move_page_tables(struct vm_area_struct *vma,
124 unsigned long old_addr, struct vm_area_struct *new_vma, 145 unsigned long old_addr, struct vm_area_struct *new_vma,
125 unsigned long new_addr, unsigned long len) 146 unsigned long new_addr, unsigned long len,
147 bool need_rmap_locks)
126{ 148{
127 unsigned long extent, next, old_end; 149 unsigned long extent, next, old_end;
128 pmd_t *old_pmd, *new_pmd; 150 pmd_t *old_pmd, *new_pmd;
129 bool need_flush = false; 151 bool need_flush = false;
152 unsigned long mmun_start; /* For mmu_notifiers */
153 unsigned long mmun_end; /* For mmu_notifiers */
130 154
131 old_end = old_addr + len; 155 old_end = old_addr + len;
132 flush_cache_range(vma, old_addr, old_end); 156 flush_cache_range(vma, old_addr, old_end);
133 157
134 mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); 158 mmun_start = old_addr;
159 mmun_end = old_end;
160 mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
135 161
136 for (; old_addr < old_end; old_addr += extent, new_addr += extent) { 162 for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
137 cond_resched(); 163 cond_resched();
@@ -169,13 +195,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
169 if (extent > LATENCY_LIMIT) 195 if (extent > LATENCY_LIMIT)
170 extent = LATENCY_LIMIT; 196 extent = LATENCY_LIMIT;
171 move_ptes(vma, old_pmd, old_addr, old_addr + extent, 197 move_ptes(vma, old_pmd, old_addr, old_addr + extent,
172 new_vma, new_pmd, new_addr); 198 new_vma, new_pmd, new_addr, need_rmap_locks);
173 need_flush = true; 199 need_flush = true;
174 } 200 }
175 if (likely(need_flush)) 201 if (likely(need_flush))
176 flush_tlb_range(vma, old_end-len, old_addr); 202 flush_tlb_range(vma, old_end-len, old_addr);
177 203
178 mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); 204 mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
179 205
180 return len + old_addr - old_end; /* how much done */ 206 return len + old_addr - old_end; /* how much done */
181} 207}
@@ -193,6 +219,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
193 unsigned long hiwater_vm; 219 unsigned long hiwater_vm;
194 int split = 0; 220 int split = 0;
195 int err; 221 int err;
222 bool need_rmap_locks;
196 223
197 /* 224 /*
198 * We'd prefer to avoid failure later on in do_munmap: 225 * We'd prefer to avoid failure later on in do_munmap:
@@ -214,27 +241,21 @@ static unsigned long move_vma(struct vm_area_struct *vma,
214 return err; 241 return err;
215 242
216 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); 243 new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
217 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); 244 new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
245 &need_rmap_locks);
218 if (!new_vma) 246 if (!new_vma)
219 return -ENOMEM; 247 return -ENOMEM;
220 248
221 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); 249 moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
250 need_rmap_locks);
222 if (moved_len < old_len) { 251 if (moved_len < old_len) {
223 /* 252 /*
224 * Before moving the page tables from the new vma to
225 * the old vma, we need to be sure the old vma is
226 * queued after new vma in the same_anon_vma list to
227 * prevent SMP races with rmap_walk (that could lead
228 * rmap_walk to miss some page table).
229 */
230 anon_vma_moveto_tail(vma);
231
232 /*
233 * On error, move entries back from new area to old, 253 * On error, move entries back from new area to old,
234 * which will succeed since page tables still there, 254 * which will succeed since page tables still there,
235 * and then proceed to unmap new area instead of old. 255 * and then proceed to unmap new area instead of old.
236 */ 256 */
237 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len); 257 move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
258 true);
238 vma = new_vma; 259 vma = new_vma;
239 old_len = new_len; 260 old_len = new_len;
240 old_addr = new_addr; 261 old_addr = new_addr;
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 405573010f99..714d5d650470 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -116,6 +116,8 @@ static unsigned long __init __free_memory_core(phys_addr_t start,
116 return 0; 116 return 0;
117 117
118 __free_pages_memory(start_pfn, end_pfn); 118 __free_pages_memory(start_pfn, end_pfn);
119 fixup_zone_present_pages(pfn_to_nid(start >> PAGE_SHIFT),
120 start_pfn, end_pfn);
119 121
120 return end_pfn - start_pfn; 122 return end_pfn - start_pfn;
121} 123}
@@ -126,6 +128,7 @@ unsigned long __init free_low_memory_core_early(int nodeid)
126 phys_addr_t start, end, size; 128 phys_addr_t start, end, size;
127 u64 i; 129 u64 i;
128 130
131 reset_zone_present_pages();
129 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) 132 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
130 count += __free_memory_core(start, end); 133 count += __free_memory_core(start, end);
131 134
@@ -162,8 +165,6 @@ unsigned long __init free_all_bootmem(void)
162 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id 165 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
163 * because in some case like Node0 doesn't have RAM installed 166 * because in some case like Node0 doesn't have RAM installed
164 * low ram will be on Node1 167 * low ram will be on Node1
165 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
166 * will be used instead of only Node0 related
167 */ 168 */
168 return free_low_memory_core_early(MAX_NUMNODES); 169 return free_low_memory_core_early(MAX_NUMNODES);
169} 170}
diff --git a/mm/nommu.c b/mm/nommu.c
index dee2ff89fd58..45131b41bcdb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -698,7 +698,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
698 698
699 mutex_lock(&mapping->i_mmap_mutex); 699 mutex_lock(&mapping->i_mmap_mutex);
700 flush_dcache_mmap_lock(mapping); 700 flush_dcache_mmap_lock(mapping);
701 vma_prio_tree_insert(vma, &mapping->i_mmap); 701 vma_interval_tree_insert(vma, &mapping->i_mmap);
702 flush_dcache_mmap_unlock(mapping); 702 flush_dcache_mmap_unlock(mapping);
703 mutex_unlock(&mapping->i_mmap_mutex); 703 mutex_unlock(&mapping->i_mmap_mutex);
704 } 704 }
@@ -764,7 +764,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
764 764
765 mutex_lock(&mapping->i_mmap_mutex); 765 mutex_lock(&mapping->i_mmap_mutex);
766 flush_dcache_mmap_lock(mapping); 766 flush_dcache_mmap_lock(mapping);
767 vma_prio_tree_remove(vma, &mapping->i_mmap); 767 vma_interval_tree_remove(vma, &mapping->i_mmap);
768 flush_dcache_mmap_unlock(mapping); 768 flush_dcache_mmap_unlock(mapping);
769 mutex_unlock(&mapping->i_mmap_mutex); 769 mutex_unlock(&mapping->i_mmap_mutex);
770 } 770 }
@@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
789 kenter("%p", vma); 789 kenter("%p", vma);
790 if (vma->vm_ops && vma->vm_ops->close) 790 if (vma->vm_ops && vma->vm_ops->close)
791 vma->vm_ops->close(vma); 791 vma->vm_ops->close(vma);
792 if (vma->vm_file) { 792 if (vma->vm_file)
793 fput(vma->vm_file); 793 fput(vma->vm_file);
794 if (vma->vm_flags & VM_EXECUTABLE)
795 removed_exe_file_vma(mm);
796 }
797 put_nommu_region(vma->vm_region); 794 put_nommu_region(vma->vm_region);
798 kmem_cache_free(vm_area_cachep, vma); 795 kmem_cache_free(vm_area_cachep, vma);
799} 796}
@@ -1284,10 +1281,6 @@ unsigned long do_mmap_pgoff(struct file *file,
1284 if (file) { 1281 if (file) {
1285 region->vm_file = get_file(file); 1282 region->vm_file = get_file(file);
1286 vma->vm_file = get_file(file); 1283 vma->vm_file = get_file(file);
1287 if (vm_flags & VM_EXECUTABLE) {
1288 added_exe_file_vma(current->mm);
1289 vma->vm_mm = current->mm;
1290 }
1291 } 1284 }
1292 1285
1293 down_write(&nommu_region_sem); 1286 down_write(&nommu_region_sem);
@@ -1440,8 +1433,6 @@ error:
1440 kmem_cache_free(vm_region_jar, region); 1433 kmem_cache_free(vm_region_jar, region);
1441 if (vma->vm_file) 1434 if (vma->vm_file)
1442 fput(vma->vm_file); 1435 fput(vma->vm_file);
1443 if (vma->vm_flags & VM_EXECUTABLE)
1444 removed_exe_file_vma(vma->vm_mm);
1445 kmem_cache_free(vm_area_cachep, vma); 1436 kmem_cache_free(vm_area_cachep, vma);
1446 kleave(" = %d", ret); 1437 kleave(" = %d", ret);
1447 return ret; 1438 return ret;
@@ -1820,7 +1811,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1820 if (addr != (pfn << PAGE_SHIFT)) 1811 if (addr != (pfn << PAGE_SHIFT))
1821 return -EINVAL; 1812 return -EINVAL;
1822 1813
1823 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 1814 vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
1824 return 0; 1815 return 0;
1825} 1816}
1826EXPORT_SYMBOL(remap_pfn_range); 1817EXPORT_SYMBOL(remap_pfn_range);
@@ -1961,6 +1952,14 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1961} 1952}
1962EXPORT_SYMBOL(filemap_fault); 1953EXPORT_SYMBOL(filemap_fault);
1963 1954
1955int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
1956 unsigned long size, pgoff_t pgoff)
1957{
1958 BUG();
1959 return 0;
1960}
1961EXPORT_SYMBOL(generic_file_remap_pages);
1962
1964static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, 1963static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
1965 unsigned long addr, void *buf, int len, int write) 1964 unsigned long addr, void *buf, int len, int write)
1966{ 1965{
@@ -2045,7 +2044,6 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2045 size_t newsize) 2044 size_t newsize)
2046{ 2045{
2047 struct vm_area_struct *vma; 2046 struct vm_area_struct *vma;
2048 struct prio_tree_iter iter;
2049 struct vm_region *region; 2047 struct vm_region *region;
2050 pgoff_t low, high; 2048 pgoff_t low, high;
2051 size_t r_size, r_top; 2049 size_t r_size, r_top;
@@ -2057,8 +2055,7 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2057 mutex_lock(&inode->i_mapping->i_mmap_mutex); 2055 mutex_lock(&inode->i_mapping->i_mmap_mutex);
2058 2056
2059 /* search for VMAs that fall within the dead zone */ 2057 /* search for VMAs that fall within the dead zone */
2060 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2058 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap, low, high) {
2061 low, high) {
2062 /* found one - only interested if it's shared out of the page 2059 /* found one - only interested if it's shared out of the page
2063 * cache */ 2060 * cache */
2064 if (vma->vm_flags & VM_SHARED) { 2061 if (vma->vm_flags & VM_SHARED) {
@@ -2074,8 +2071,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
2074 * we don't check for any regions that start beyond the EOF as there 2071 * we don't check for any regions that start beyond the EOF as there
2075 * shouldn't be any 2072 * shouldn't be any
2076 */ 2073 */
2077 vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, 2074 vma_interval_tree_foreach(vma, &inode->i_mapping->i_mmap,
2078 0, ULONG_MAX) { 2075 0, ULONG_MAX) {
2079 if (!(vma->vm_flags & VM_SHARED)) 2076 if (!(vma->vm_flags & VM_SHARED))
2080 continue; 2077 continue;
2081 2078
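
With the prio_tree iterator object gone, walking the vmas that map a page-offset range of a file reduces to the pattern below; callers hold i_mmap_mutex around the walk, as both hunks above do. for_each_mapping_vma() is an illustrative name, not a kernel symbol:

static void for_each_mapping_vma(struct address_space *mapping,
                                 pgoff_t first, pgoff_t last)
{
        struct vm_area_struct *vma;

        mutex_lock(&mapping->i_mmap_mutex);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, first, last) {
                /* vma overlaps [first, last], measured in pages */
        }
        mutex_unlock(&mapping->i_mmap_mutex);
}
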
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 198600861638..79e0f3e24831 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -428,8 +428,8 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
428{ 428{
429 task_lock(current); 429 task_lock(current);
430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " 430 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
431 "oom_adj=%d, oom_score_adj=%d\n", 431 "oom_score_adj=%d\n",
432 current->comm, gfp_mask, order, current->signal->oom_adj, 432 current->comm, gfp_mask, order,
433 current->signal->oom_score_adj); 433 current->signal->oom_score_adj);
434 cpuset_print_task_mems_allowed(current); 434 cpuset_print_task_mems_allowed(current);
435 task_unlock(current); 435 task_unlock(current);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c13ea7538891..bb90971182bd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -558,7 +558,8 @@ static inline void __free_one_page(struct page *page,
558 if (page_is_guard(buddy)) { 558 if (page_is_guard(buddy)) {
559 clear_page_guard_flag(buddy); 559 clear_page_guard_flag(buddy);
560 set_page_private(page, 0); 560 set_page_private(page, 0);
561 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 561 __mod_zone_freepage_state(zone, 1 << order,
562 migratetype);
562 } else { 563 } else {
563 list_del(&buddy->lru); 564 list_del(&buddy->lru);
564 zone->free_area[order].nr_free--; 565 zone->free_area[order].nr_free--;
@@ -597,17 +598,6 @@ out:
597 zone->free_area[order].nr_free++; 598 zone->free_area[order].nr_free++;
598} 599}
599 600
600/*
601 * free_page_mlock() -- clean up attempts to free and mlocked() page.
602 * Page should not be on lru, so no need to fix that up.
603 * free_pages_check() will verify...
604 */
605static inline void free_page_mlock(struct page *page)
606{
607 __dec_zone_page_state(page, NR_MLOCK);
608 __count_vm_event(UNEVICTABLE_MLOCKFREED);
609}
610
611static inline int free_pages_check(struct page *page) 601static inline int free_pages_check(struct page *page)
612{ 602{
613 if (unlikely(page_mapcount(page) | 603 if (unlikely(page_mapcount(page) |
@@ -668,12 +658,17 @@ static void free_pcppages_bulk(struct zone *zone, int count,
668 batch_free = to_free; 658 batch_free = to_free;
669 659
670 do { 660 do {
661 int mt; /* migratetype of the to-be-freed page */
662
671 page = list_entry(list->prev, struct page, lru); 663 page = list_entry(list->prev, struct page, lru);
672 /* must delete as __free_one_page list manipulates */ 664 /* must delete as __free_one_page list manipulates */
673 list_del(&page->lru); 665 list_del(&page->lru);
666 mt = get_freepage_migratetype(page);
674 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ 667 /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
675 __free_one_page(page, zone, 0, page_private(page)); 668 __free_one_page(page, zone, 0, mt);
676 trace_mm_page_pcpu_drain(page, 0, page_private(page)); 669 trace_mm_page_pcpu_drain(page, 0, mt);
670 if (is_migrate_cma(mt))
671 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
677 } while (--to_free && --batch_free && !list_empty(list)); 672 } while (--to_free && --batch_free && !list_empty(list));
678 } 673 }
679 __mod_zone_page_state(zone, NR_FREE_PAGES, count); 674 __mod_zone_page_state(zone, NR_FREE_PAGES, count);
@@ -688,7 +683,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
688 zone->pages_scanned = 0; 683 zone->pages_scanned = 0;
689 684
690 __free_one_page(page, zone, order, migratetype); 685 __free_one_page(page, zone, order, migratetype);
691 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); 686 if (unlikely(migratetype != MIGRATE_ISOLATE))
687 __mod_zone_freepage_state(zone, 1 << order, migratetype);
692 spin_unlock(&zone->lock); 688 spin_unlock(&zone->lock);
693} 689}
694 690
@@ -721,17 +717,16 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
721static void __free_pages_ok(struct page *page, unsigned int order) 717static void __free_pages_ok(struct page *page, unsigned int order)
722{ 718{
723 unsigned long flags; 719 unsigned long flags;
724 int wasMlocked = __TestClearPageMlocked(page); 720 int migratetype;
725 721
726 if (!free_pages_prepare(page, order)) 722 if (!free_pages_prepare(page, order))
727 return; 723 return;
728 724
729 local_irq_save(flags); 725 local_irq_save(flags);
730 if (unlikely(wasMlocked))
731 free_page_mlock(page);
732 __count_vm_events(PGFREE, 1 << order); 726 __count_vm_events(PGFREE, 1 << order);
733 free_one_page(page_zone(page), page, order, 727 migratetype = get_pageblock_migratetype(page);
734 get_pageblock_migratetype(page)); 728 set_freepage_migratetype(page, migratetype);
729 free_one_page(page_zone(page), page, order, migratetype);
735 local_irq_restore(flags); 730 local_irq_restore(flags);
736} 731}
737 732
@@ -811,7 +806,8 @@ static inline void expand(struct zone *zone, struct page *page,
811 set_page_guard_flag(&page[size]); 806 set_page_guard_flag(&page[size]);
812 set_page_private(&page[size], high); 807 set_page_private(&page[size], high);
813 /* Guard pages are not available for any usage */ 808 /* Guard pages are not available for any usage */
814 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); 809 __mod_zone_freepage_state(zone, -(1 << high),
810 migratetype);
815 continue; 811 continue;
816 } 812 }
817#endif 813#endif
@@ -915,7 +911,7 @@ static int fallbacks[MIGRATE_TYPES][4] = {
915 * Note that start_page and end_pages are not aligned on a pageblock 911 * Note that start_page and end_pages are not aligned on a pageblock
916 * boundary. If alignment is required, use move_freepages_block() 912 * boundary. If alignment is required, use move_freepages_block()
917 */ 913 */
918static int move_freepages(struct zone *zone, 914int move_freepages(struct zone *zone,
919 struct page *start_page, struct page *end_page, 915 struct page *start_page, struct page *end_page,
920 int migratetype) 916 int migratetype)
921{ 917{
@@ -951,6 +947,7 @@ static int move_freepages(struct zone *zone,
951 order = page_order(page); 947 order = page_order(page);
952 list_move(&page->lru, 948 list_move(&page->lru,
953 &zone->free_area[order].free_list[migratetype]); 949 &zone->free_area[order].free_list[migratetype]);
950 set_freepage_migratetype(page, migratetype);
954 page += 1 << order; 951 page += 1 << order;
955 pages_moved += 1 << order; 952 pages_moved += 1 << order;
956 } 953 }
@@ -1135,8 +1132,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1135 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) 1132 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1136 mt = migratetype; 1133 mt = migratetype;
1137 } 1134 }
1138 set_page_private(page, mt); 1135 set_freepage_migratetype(page, mt);
1139 list = &page->lru; 1136 list = &page->lru;
1137 if (is_migrate_cma(mt))
1138 __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
1139 -(1 << order));
1140 } 1140 }
1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
1142 spin_unlock(&zone->lock); 1142 spin_unlock(&zone->lock);
@@ -1296,16 +1296,13 @@ void free_hot_cold_page(struct page *page, int cold)
1296 struct per_cpu_pages *pcp; 1296 struct per_cpu_pages *pcp;
1297 unsigned long flags; 1297 unsigned long flags;
1298 int migratetype; 1298 int migratetype;
1299 int wasMlocked = __TestClearPageMlocked(page);
1300 1299
1301 if (!free_pages_prepare(page, 0)) 1300 if (!free_pages_prepare(page, 0))
1302 return; 1301 return;
1303 1302
1304 migratetype = get_pageblock_migratetype(page); 1303 migratetype = get_pageblock_migratetype(page);
1305 set_page_private(page, migratetype); 1304 set_freepage_migratetype(page, migratetype);
1306 local_irq_save(flags); 1305 local_irq_save(flags);
1307 if (unlikely(wasMlocked))
1308 free_page_mlock(page);
1309 __count_vm_event(PGFREE); 1306 __count_vm_event(PGFREE);
1310 1307
1311 /* 1308 /*
@@ -1380,20 +1377,16 @@ void split_page(struct page *page, unsigned int order)
1380} 1377}
1381 1378
1382/* 1379/*
1383 * Similar to split_page except the page is already free. As this is only 1380 * Similar to the split_page family of functions except that the page
1384 * being used for migration, the migratetype of the block also changes. 1381 * required at the given order and being isolated now to prevent races
1385 * As this is called with interrupts disabled, the caller is responsible 1382 * with parallel allocators
1386 * for calling arch_alloc_page() and kernel_map_page() after interrupts
1387 * are enabled.
1388 *
1389 * Note: this is probably too low level an operation for use in drivers.
1390 * Please consult with lkml before using this in your driver.
1391 */ 1383 */
1392int split_free_page(struct page *page) 1384int capture_free_page(struct page *page, int alloc_order, int migratetype)
1393{ 1385{
1394 unsigned int order; 1386 unsigned int order;
1395 unsigned long watermark; 1387 unsigned long watermark;
1396 struct zone *zone; 1388 struct zone *zone;
1389 int mt;
1397 1390
1398 BUG_ON(!PageBuddy(page)); 1391 BUG_ON(!PageBuddy(page));
1399 1392
@@ -1409,12 +1402,16 @@ int split_free_page(struct page *page)
1409 list_del(&page->lru); 1402 list_del(&page->lru);
1410 zone->free_area[order].nr_free--; 1403 zone->free_area[order].nr_free--;
1411 rmv_page_order(page); 1404 rmv_page_order(page);
1412 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
1413 1405
1414 /* Split into individual pages */ 1406 mt = get_pageblock_migratetype(page);
1415 set_page_refcounted(page); 1407 if (unlikely(mt != MIGRATE_ISOLATE))
1416 split_page(page, order); 1408 __mod_zone_freepage_state(zone, -(1UL << order), mt);
1417 1409
1410 if (alloc_order != order)
1411 expand(zone, page, alloc_order, order,
1412 &zone->free_area[order], migratetype);
1413
1414 /* Set the pageblock if the captured page is at least a pageblock */
1418 if (order >= pageblock_order - 1) { 1415 if (order >= pageblock_order - 1) {
1419 struct page *endpage = page + (1 << order) - 1; 1416 struct page *endpage = page + (1 << order) - 1;
1420 for (; page < endpage; page += pageblock_nr_pages) { 1417 for (; page < endpage; page += pageblock_nr_pages) {
@@ -1425,7 +1422,35 @@ int split_free_page(struct page *page)
1425 } 1422 }
1426 } 1423 }
1427 1424
1428 return 1 << order; 1425 return 1UL << order;
1426}
1427
1428/*
1429 * Similar to split_page except the page is already free. As this is only
1430 * being used for migration, the migratetype of the block also changes.
1431 * As this is called with interrupts disabled, the caller is responsible
1432 * for calling arch_alloc_page() and kernel_map_page() after interrupts
1433 * are enabled.
1434 *
1435 * Note: this is probably too low level an operation for use in drivers.
1436 * Please consult with lkml before using this in your driver.
1437 */
1438int split_free_page(struct page *page)
1439{
1440 unsigned int order;
1441 int nr_pages;
1442
1443 BUG_ON(!PageBuddy(page));
1444 order = page_order(page);
1445
1446 nr_pages = capture_free_page(page, order, 0);
1447 if (!nr_pages)
1448 return 0;
1449
1450 /* Split into individual pages */
1451 set_page_refcounted(page);
1452 split_page(page, order);
1453 return nr_pages;
1429} 1454}
1430 1455
1431/* 1456/*
@@ -1484,7 +1509,8 @@ again:
1484 spin_unlock(&zone->lock); 1509 spin_unlock(&zone->lock);
1485 if (!page) 1510 if (!page)
1486 goto failed; 1511 goto failed;
1487 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); 1512 __mod_zone_freepage_state(zone, -(1 << order),
1513 get_pageblock_migratetype(page));
1488 } 1514 }
1489 1515
1490 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1516 __count_zone_vm_events(PGALLOC, zone, 1 << order);
@@ -1501,19 +1527,6 @@ failed:
1501 return NULL; 1527 return NULL;
1502} 1528}
1503 1529
1504/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1505#define ALLOC_WMARK_MIN WMARK_MIN
1506#define ALLOC_WMARK_LOW WMARK_LOW
1507#define ALLOC_WMARK_HIGH WMARK_HIGH
1508#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1509
1510/* Mask to get the watermark bits */
1511#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1512
1513#define ALLOC_HARDER 0x10 /* try to alloc harder */
1514#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1515#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
1516
1517#ifdef CONFIG_FAIL_PAGE_ALLOC 1530#ifdef CONFIG_FAIL_PAGE_ALLOC
1518 1531
1519static struct { 1532static struct {
@@ -1608,7 +1621,11 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1608 min -= min / 2; 1621 min -= min / 2;
1609 if (alloc_flags & ALLOC_HARDER) 1622 if (alloc_flags & ALLOC_HARDER)
1610 min -= min / 4; 1623 min -= min / 4;
1611 1624#ifdef CONFIG_CMA
1625 /* If allocation can't use CMA areas don't use free CMA pages */
1626 if (!(alloc_flags & ALLOC_CMA))
1627 free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
1628#endif
1612 if (free_pages <= min + lowmem_reserve) 1629 if (free_pages <= min + lowmem_reserve)
1613 return false; 1630 return false;
1614 for (o = 0; o < order; o++) { 1631 for (o = 0; o < order; o++) {
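
The CONFIG_CMA block added above keeps free pages that sit in CMA pageblocks from satisfying the watermark check for allocations that are not allowed to use them. Condensed — the per-order free_area loop of the real __zone_watermark_ok() is omitted, and watermark_ok_sketch() is an illustrative name, not a kernel symbol:

static bool watermark_ok_sketch(struct zone *z, unsigned long mark,
                                long lowmem_reserve, int alloc_flags,
                                long free_pages)
{
        long min = mark;

        if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;
        if (alloc_flags & ALLOC_HARDER)
                min -= min / 4;
#ifdef CONFIG_CMA
        /* Allocations without ALLOC_CMA must not lean on CMA free pages. */
        if (!(alloc_flags & ALLOC_CMA))
                free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
        return free_pages > min + lowmem_reserve;
}
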
@@ -1782,6 +1799,22 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
1782 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1799 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1783} 1800}
1784 1801
1802static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1803{
1804 return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes);
1805}
1806
1807static void __paginginit init_zone_allows_reclaim(int nid)
1808{
1809 int i;
1810
1811 for_each_online_node(i)
1812 if (node_distance(nid, i) <= RECLAIM_DISTANCE) {
1813 node_set(i, NODE_DATA(nid)->reclaim_nodes);
1814 zone_reclaim_mode = 1;
1815 }
1816}
1817
1785#else /* CONFIG_NUMA */ 1818#else /* CONFIG_NUMA */
1786 1819
1787static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1820static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
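
init_zone_allows_reclaim() above precomputes, once per node, the set of nodes whose distance is within RECLAIM_DISTANCE, so the hot path only has to test a node mask (see the zone_reclaim_mode hunk below). A self-contained model of that precomputation, using an invented four-node distance table and a plain bitmask instead of a nodemask_t; this is a sketch, not kernel code.

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES         4
#define RECLAIM_DISTANCE 30   /* the kernel's default threshold */

/* invented SLIT-style distance table: 10 = local, 20 = near, 40 = far */
static const int distance[NR_NODES][NR_NODES] = {
        { 10, 20, 40, 40 },
        { 20, 10, 40, 40 },
        { 40, 40, 10, 20 },
        { 40, 40, 20, 10 },
};

static unsigned int reclaim_nodes[NR_NODES];    /* one bitmask per node */

static void init_zone_allows_reclaim(int nid)
{
        for (int i = 0; i < NR_NODES; i++)
                if (distance[nid][i] <= RECLAIM_DISTANCE)
                        reclaim_nodes[nid] |= 1u << i;
}

/* simplified: the kernel tests the target node's mask for the local node */
static bool zone_allows_reclaim(int local_nid, int target_nid)
{
        return reclaim_nodes[local_nid] & (1u << target_nid);
}

int main(void)
{
        for (int nid = 0; nid < NR_NODES; nid++)
                init_zone_allows_reclaim(nid);

        printf("%d\n", zone_allows_reclaim(0, 1));  /* 1: distance 20 */
        printf("%d\n", zone_allows_reclaim(0, 2));  /* 0: distance 40 */
        return 0;
}
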
@@ -1802,6 +1835,15 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1802static void zlc_clear_zones_full(struct zonelist *zonelist) 1835static void zlc_clear_zones_full(struct zonelist *zonelist)
1803{ 1836{
1804} 1837}
1838
1839static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
1840{
1841 return true;
1842}
1843
1844static inline void init_zone_allows_reclaim(int nid)
1845{
1846}
1805#endif /* CONFIG_NUMA */ 1847#endif /* CONFIG_NUMA */
1806 1848
1807/* 1849/*
@@ -1886,7 +1928,8 @@ zonelist_scan:
1886 did_zlc_setup = 1; 1928 did_zlc_setup = 1;
1887 } 1929 }
1888 1930
1889 if (zone_reclaim_mode == 0) 1931 if (zone_reclaim_mode == 0 ||
1932 !zone_allows_reclaim(preferred_zone, zone))
1890 goto this_zone_full; 1933 goto this_zone_full;
1891 1934
1892 /* 1935 /*
@@ -2105,7 +2148,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2105 bool *contended_compaction, bool *deferred_compaction, 2148 bool *contended_compaction, bool *deferred_compaction,
2106 unsigned long *did_some_progress) 2149 unsigned long *did_some_progress)
2107{ 2150{
2108 struct page *page; 2151 struct page *page = NULL;
2109 2152
2110 if (!order) 2153 if (!order)
2111 return NULL; 2154 return NULL;
@@ -2118,10 +2161,16 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2118 current->flags |= PF_MEMALLOC; 2161 current->flags |= PF_MEMALLOC;
2119 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2162 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2120 nodemask, sync_migration, 2163 nodemask, sync_migration,
2121 contended_compaction); 2164 contended_compaction, &page);
2122 current->flags &= ~PF_MEMALLOC; 2165 current->flags &= ~PF_MEMALLOC;
2123 if (*did_some_progress != COMPACT_SKIPPED) {
2124 2166
2167 /* If compaction captured a page, prep and use it */
2168 if (page) {
2169 prep_new_page(page, order, gfp_mask);
2170 goto got_page;
2171 }
2172
2173 if (*did_some_progress != COMPACT_SKIPPED) {
2125 /* Page migration frees to the PCP lists but we want merging */ 2174 /* Page migration frees to the PCP lists but we want merging */
2126 drain_pages(get_cpu()); 2175 drain_pages(get_cpu());
2127 put_cpu(); 2176 put_cpu();
@@ -2131,6 +2180,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2131 alloc_flags & ~ALLOC_NO_WATERMARKS, 2180 alloc_flags & ~ALLOC_NO_WATERMARKS,
2132 preferred_zone, migratetype); 2181 preferred_zone, migratetype);
2133 if (page) { 2182 if (page) {
2183got_page:
2184 preferred_zone->compact_blockskip_flush = false;
2134 preferred_zone->compact_considered = 0; 2185 preferred_zone->compact_considered = 0;
2135 preferred_zone->compact_defer_shift = 0; 2186 preferred_zone->compact_defer_shift = 0;
2136 if (order >= preferred_zone->compact_order_failed) 2187 if (order >= preferred_zone->compact_order_failed)
@@ -2315,7 +2366,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2315 unlikely(test_thread_flag(TIF_MEMDIE)))) 2366 unlikely(test_thread_flag(TIF_MEMDIE))))
2316 alloc_flags |= ALLOC_NO_WATERMARKS; 2367 alloc_flags |= ALLOC_NO_WATERMARKS;
2317 } 2368 }
2318 2369#ifdef CONFIG_CMA
2370 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2371 alloc_flags |= ALLOC_CMA;
2372#endif
2319 return alloc_flags; 2373 return alloc_flags;
2320} 2374}
2321 2375
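
This hunk, together with the matching fast-path hunk in __alloc_pages_nodemask() further down, sets ALLOC_CMA only for movable requests so that the watermark check above may count CMA free pages for them. A toy derivation of the flags follows; it is not kernel code, the migratetype lookup is reduced to a single bit test, and the __GFP_* and ALLOC_CMA values are assumptions made for the sketch.

#include <stdio.h>

#define __GFP_MOVABLE   0x08u   /* assumed to match the 3.6-era values */
#define __GFP_HIGH      0x20u

#define ALLOC_WMARK_MIN 0x00    /* as in the ALLOC_* block removed above */
#define ALLOC_HIGH      0x20
#define ALLOC_CPUSET    0x40
#define ALLOC_CMA       0x80    /* assumed value, for illustration only */

static unsigned int gfp_to_alloc_flags(unsigned int gfp_mask)
{
        unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

        if (gfp_mask & __GFP_HIGH)
                alloc_flags |= ALLOC_HIGH;

        /* only movable allocations may be satisfied from CMA pageblocks */
        if (gfp_mask & __GFP_MOVABLE)
                alloc_flags |= ALLOC_CMA;

        return alloc_flags;
}

int main(void)
{
        printf("movable request:   %#x\n", gfp_to_alloc_flags(__GFP_MOVABLE));
        printf("unmovable request: %#x\n", gfp_to_alloc_flags(0));
        return 0;
}
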
@@ -2362,9 +2416,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2362 goto nopage; 2416 goto nopage;
2363 2417
2364restart: 2418restart:
2365 if (!(gfp_mask & __GFP_NO_KSWAPD)) 2419 wake_all_kswapd(order, zonelist, high_zoneidx,
2366 wake_all_kswapd(order, zonelist, high_zoneidx, 2420 zone_idx(preferred_zone));
2367 zone_idx(preferred_zone));
2368 2421
2369 /* 2422 /*
2370 * OK, we're below the kswapd watermark and have kicked background 2423 * OK, we're below the kswapd watermark and have kicked background
@@ -2441,7 +2494,7 @@ rebalance:
2441 * system then fail the allocation instead of entering direct reclaim. 2494 * system then fail the allocation instead of entering direct reclaim.
2442 */ 2495 */
2443 if ((deferred_compaction || contended_compaction) && 2496 if ((deferred_compaction || contended_compaction) &&
2444 (gfp_mask & __GFP_NO_KSWAPD)) 2497 (gfp_mask & (__GFP_MOVABLE|__GFP_REPEAT)) == __GFP_MOVABLE)
2445 goto nopage; 2498 goto nopage;
2446 2499
2447 /* Try direct reclaim and then allocating */ 2500 /* Try direct reclaim and then allocating */
@@ -2541,6 +2594,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2541 struct page *page = NULL; 2594 struct page *page = NULL;
2542 int migratetype = allocflags_to_migratetype(gfp_mask); 2595 int migratetype = allocflags_to_migratetype(gfp_mask);
2543 unsigned int cpuset_mems_cookie; 2596 unsigned int cpuset_mems_cookie;
2597 int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
2544 2598
2545 gfp_mask &= gfp_allowed_mask; 2599 gfp_mask &= gfp_allowed_mask;
2546 2600
@@ -2569,9 +2623,13 @@ retry_cpuset:
2569 if (!preferred_zone) 2623 if (!preferred_zone)
2570 goto out; 2624 goto out;
2571 2625
2626#ifdef CONFIG_CMA
2627 if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
2628 alloc_flags |= ALLOC_CMA;
2629#endif
2572 /* First allocation attempt */ 2630 /* First allocation attempt */
2573 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 2631 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
2574 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, 2632 zonelist, high_zoneidx, alloc_flags,
2575 preferred_zone, migratetype); 2633 preferred_zone, migratetype);
2576 if (unlikely(!page)) 2634 if (unlikely(!page))
2577 page = __alloc_pages_slowpath(gfp_mask, order, 2635 page = __alloc_pages_slowpath(gfp_mask, order,
@@ -2852,7 +2910,8 @@ void show_free_areas(unsigned int filter)
2852 " unevictable:%lu" 2910 " unevictable:%lu"
2853 " dirty:%lu writeback:%lu unstable:%lu\n" 2911 " dirty:%lu writeback:%lu unstable:%lu\n"
2854 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" 2912 " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
2855 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", 2913 " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
2914 " free_cma:%lu\n",
2856 global_page_state(NR_ACTIVE_ANON), 2915 global_page_state(NR_ACTIVE_ANON),
2857 global_page_state(NR_INACTIVE_ANON), 2916 global_page_state(NR_INACTIVE_ANON),
2858 global_page_state(NR_ISOLATED_ANON), 2917 global_page_state(NR_ISOLATED_ANON),
@@ -2869,7 +2928,8 @@ void show_free_areas(unsigned int filter)
2869 global_page_state(NR_FILE_MAPPED), 2928 global_page_state(NR_FILE_MAPPED),
2870 global_page_state(NR_SHMEM), 2929 global_page_state(NR_SHMEM),
2871 global_page_state(NR_PAGETABLE), 2930 global_page_state(NR_PAGETABLE),
2872 global_page_state(NR_BOUNCE)); 2931 global_page_state(NR_BOUNCE),
2932 global_page_state(NR_FREE_CMA_PAGES));
2873 2933
2874 for_each_populated_zone(zone) { 2934 for_each_populated_zone(zone) {
2875 int i; 2935 int i;
@@ -2901,6 +2961,7 @@ void show_free_areas(unsigned int filter)
2901 " pagetables:%lukB" 2961 " pagetables:%lukB"
2902 " unstable:%lukB" 2962 " unstable:%lukB"
2903 " bounce:%lukB" 2963 " bounce:%lukB"
2964 " free_cma:%lukB"
2904 " writeback_tmp:%lukB" 2965 " writeback_tmp:%lukB"
2905 " pages_scanned:%lu" 2966 " pages_scanned:%lu"
2906 " all_unreclaimable? %s" 2967 " all_unreclaimable? %s"
@@ -2930,6 +2991,7 @@ void show_free_areas(unsigned int filter)
2930 K(zone_page_state(zone, NR_PAGETABLE)), 2991 K(zone_page_state(zone, NR_PAGETABLE)),
2931 K(zone_page_state(zone, NR_UNSTABLE_NFS)), 2992 K(zone_page_state(zone, NR_UNSTABLE_NFS)),
2932 K(zone_page_state(zone, NR_BOUNCE)), 2993 K(zone_page_state(zone, NR_BOUNCE)),
2994 K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
2933 K(zone_page_state(zone, NR_WRITEBACK_TEMP)), 2995 K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
2934 zone->pages_scanned, 2996 zone->pages_scanned,
2935 (zone->all_unreclaimable ? "yes" : "no") 2997 (zone->all_unreclaimable ? "yes" : "no")
@@ -3328,21 +3390,13 @@ static void build_zonelists(pg_data_t *pgdat)
3328 j = 0; 3390 j = 0;
3329 3391
3330 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 3392 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
3331 int distance = node_distance(local_node, node);
3332
3333 /*
3334 * If another node is sufficiently far away then it is better
3335 * to reclaim pages in a zone before going off node.
3336 */
3337 if (distance > RECLAIM_DISTANCE)
3338 zone_reclaim_mode = 1;
3339
3340 /* 3393 /*
3341 * We don't want to pressure a particular node. 3394 * We don't want to pressure a particular node.
3342 * So adding penalty to the first node in same 3395 * So adding penalty to the first node in same
3343 * distance group to make it round-robin. 3396 * distance group to make it round-robin.
3344 */ 3397 */
3345 if (distance != node_distance(local_node, prev_node)) 3398 if (node_distance(local_node, node) !=
3399 node_distance(local_node, prev_node))
3346 node_load[node] = load; 3400 node_load[node] = load;
3347 3401
3348 prev_node = node; 3402 prev_node = node;
@@ -4438,11 +4492,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4438 4492
4439 zone->spanned_pages = size; 4493 zone->spanned_pages = size;
4440 zone->present_pages = realsize; 4494 zone->present_pages = realsize;
4441#if defined CONFIG_COMPACTION || defined CONFIG_CMA
4442 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4443 zone->spanned_pages;
4444 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4445#endif
4446#ifdef CONFIG_NUMA 4495#ifdef CONFIG_NUMA
4447 zone->node = nid; 4496 zone->node = nid;
4448 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4497 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4521,6 +4570,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4521 4570
4522 pgdat->node_id = nid; 4571 pgdat->node_id = nid;
4523 pgdat->node_start_pfn = node_start_pfn; 4572 pgdat->node_start_pfn = node_start_pfn;
4573 init_zone_allows_reclaim(nid);
4524 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4574 calculate_node_totalpages(pgdat, zones_size, zholes_size);
4525 4575
4526 alloc_node_mem_map(pgdat); 4576 alloc_node_mem_map(pgdat);
@@ -4879,7 +4929,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4879 zone_movable_pfn[i] << PAGE_SHIFT); 4929 zone_movable_pfn[i] << PAGE_SHIFT);
4880 } 4930 }
4881 4931
4882 /* Print out the early_node_map[] */ 4932 /* Print out the early node map */
4883 printk("Early memory node ranges\n"); 4933 printk("Early memory node ranges\n");
4884 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4934 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4885 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, 4935 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
@@ -5619,47 +5669,28 @@ static unsigned long pfn_max_align_up(unsigned long pfn)
5619 pageblock_nr_pages)); 5669 pageblock_nr_pages));
5620} 5670}
5621 5671
5622static struct page *
5623__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
5624 int **resultp)
5625{
5626 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
5627
5628 if (PageHighMem(page))
5629 gfp_mask |= __GFP_HIGHMEM;
5630
5631 return alloc_page(gfp_mask);
5632}
5633
5634/* [start, end) must belong to a single zone. */ 5672/* [start, end) must belong to a single zone. */
5635static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) 5673static int __alloc_contig_migrate_range(struct compact_control *cc,
5674 unsigned long start, unsigned long end)
5636{ 5675{
5637 /* This function is based on compact_zone() from compaction.c. */ 5676 /* This function is based on compact_zone() from compaction.c. */
5638 5677 unsigned long nr_reclaimed;
5639 unsigned long pfn = start; 5678 unsigned long pfn = start;
5640 unsigned int tries = 0; 5679 unsigned int tries = 0;
5641 int ret = 0; 5680 int ret = 0;
5642 5681
5643 struct compact_control cc = {
5644 .nr_migratepages = 0,
5645 .order = -1,
5646 .zone = page_zone(pfn_to_page(start)),
5647 .sync = true,
5648 };
5649 INIT_LIST_HEAD(&cc.migratepages);
5650
5651 migrate_prep_local(); 5682 migrate_prep_local();
5652 5683
5653 while (pfn < end || !list_empty(&cc.migratepages)) { 5684 while (pfn < end || !list_empty(&cc->migratepages)) {
5654 if (fatal_signal_pending(current)) { 5685 if (fatal_signal_pending(current)) {
5655 ret = -EINTR; 5686 ret = -EINTR;
5656 break; 5687 break;
5657 } 5688 }
5658 5689
5659 if (list_empty(&cc.migratepages)) { 5690 if (list_empty(&cc->migratepages)) {
5660 cc.nr_migratepages = 0; 5691 cc->nr_migratepages = 0;
5661 pfn = isolate_migratepages_range(cc.zone, &cc, 5692 pfn = isolate_migratepages_range(cc->zone, cc,
5662 pfn, end); 5693 pfn, end, true);
5663 if (!pfn) { 5694 if (!pfn) {
5664 ret = -EINTR; 5695 ret = -EINTR;
5665 break; 5696 break;
@@ -5670,12 +5701,16 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
5670 break; 5701 break;
5671 } 5702 }
5672 5703
5673 ret = migrate_pages(&cc.migratepages, 5704 nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
5674 __alloc_contig_migrate_alloc, 5705 &cc->migratepages);
5706 cc->nr_migratepages -= nr_reclaimed;
5707
5708 ret = migrate_pages(&cc->migratepages,
5709 alloc_migrate_target,
5675 0, false, MIGRATE_SYNC); 5710 0, false, MIGRATE_SYNC);
5676 } 5711 }
5677 5712
5678 putback_lru_pages(&cc.migratepages); 5713 putback_lru_pages(&cc->migratepages);
5679 return ret > 0 ? 0 : ret; 5714 return ret > 0 ? 0 : ret;
5680} 5715}
5681 5716
@@ -5754,6 +5789,15 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5754 unsigned long outer_start, outer_end; 5789 unsigned long outer_start, outer_end;
5755 int ret = 0, order; 5790 int ret = 0, order;
5756 5791
5792 struct compact_control cc = {
5793 .nr_migratepages = 0,
5794 .order = -1,
5795 .zone = page_zone(pfn_to_page(start)),
5796 .sync = true,
5797 .ignore_skip_hint = true,
5798 };
5799 INIT_LIST_HEAD(&cc.migratepages);
5800
5757 /* 5801 /*
5758 * What we do here is we mark all pageblocks in range as 5802 * What we do here is we mark all pageblocks in range as
5759 * MIGRATE_ISOLATE. Because pageblock and max order pages may 5803 * MIGRATE_ISOLATE. Because pageblock and max order pages may
@@ -5783,7 +5827,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5783 if (ret) 5827 if (ret)
5784 goto done; 5828 goto done;
5785 5829
5786 ret = __alloc_contig_migrate_range(start, end); 5830 ret = __alloc_contig_migrate_range(&cc, start, end);
5787 if (ret) 5831 if (ret)
5788 goto done; 5832 goto done;
5789 5833
@@ -5832,7 +5876,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
5832 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); 5876 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5833 5877
5834 /* Grab isolated pages from freelists. */ 5878 /* Grab isolated pages from freelists. */
5835 outer_end = isolate_freepages_range(outer_start, end); 5879 outer_end = isolate_freepages_range(&cc, outer_start, end);
5836 if (!outer_end) { 5880 if (!outer_end) {
5837 ret = -EBUSY; 5881 ret = -EBUSY;
5838 goto done; 5882 goto done;
@@ -5874,6 +5918,7 @@ static int __meminit __zone_pcp_update(void *data)
5874 local_irq_save(flags); 5918 local_irq_save(flags);
5875 if (pcp->count > 0) 5919 if (pcp->count > 0)
5876 free_pcppages_bulk(zone, pcp->count, pcp); 5920 free_pcppages_bulk(zone, pcp->count, pcp);
5921 drain_zonestat(zone, pset);
5877 setup_pageset(pset, batch); 5922 setup_pageset(pset, batch);
5878 local_irq_restore(flags); 5923 local_irq_restore(flags);
5879 } 5924 }
@@ -5890,10 +5935,16 @@ void __meminit zone_pcp_update(struct zone *zone)
5890void zone_pcp_reset(struct zone *zone) 5935void zone_pcp_reset(struct zone *zone)
5891{ 5936{
5892 unsigned long flags; 5937 unsigned long flags;
5938 int cpu;
5939 struct per_cpu_pageset *pset;
5893 5940
5894 /* avoid races with drain_pages() */ 5941 /* avoid races with drain_pages() */
5895 local_irq_save(flags); 5942 local_irq_save(flags);
5896 if (zone->pageset != &boot_pageset) { 5943 if (zone->pageset != &boot_pageset) {
5944 for_each_online_cpu(cpu) {
5945 pset = per_cpu_ptr(zone->pageset, cpu);
5946 drain_zonestat(zone, pset);
5947 }
5897 free_percpu(zone->pageset); 5948 free_percpu(zone->pageset);
5898 zone->pageset = &boot_pageset; 5949 zone->pageset = &boot_pageset;
5899 } 5950 }
@@ -6047,3 +6098,37 @@ void dump_page(struct page *page)
6047 dump_page_flags(page->flags); 6098 dump_page_flags(page->flags);
6048 mem_cgroup_print_bad_page(page); 6099 mem_cgroup_print_bad_page(page);
6049} 6100}
6101
6102/* reset zone->present_pages */
6103void reset_zone_present_pages(void)
6104{
6105 struct zone *z;
6106 int i, nid;
6107
6108 for_each_node_state(nid, N_HIGH_MEMORY) {
6109 for (i = 0; i < MAX_NR_ZONES; i++) {
6110 z = NODE_DATA(nid)->node_zones + i;
6111 z->present_pages = 0;
6112 }
6113 }
6114}
6115
6116/* calculate zone's present pages in buddy system */
6117void fixup_zone_present_pages(int nid, unsigned long start_pfn,
6118 unsigned long end_pfn)
6119{
6120 struct zone *z;
6121 unsigned long zone_start_pfn, zone_end_pfn;
6122 int i;
6123
6124 for (i = 0; i < MAX_NR_ZONES; i++) {
6125 z = NODE_DATA(nid)->node_zones + i;
6126 zone_start_pfn = z->zone_start_pfn;
6127 zone_end_pfn = zone_start_pfn + z->spanned_pages;
6128
6129 /* if the two regions intersect */
6130 if (!(zone_start_pfn >= end_pfn || zone_end_pfn <= start_pfn))
6131 z->present_pages += min(end_pfn, zone_end_pfn) -
6132 max(start_pfn, zone_start_pfn);
6133 }
6134}
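
fixup_zone_present_pages() above credits each zone with the portion of a [start_pfn, end_pfn) range that overlaps the zone's span, an intersection test followed by min/max clamping. The same calculation in isolation, over an invented three-zone layout (a sketch, not kernel code):

#include <stdio.h>

struct zone_span { const char *name; unsigned long start_pfn, end_pfn; };

static struct zone_span zones[] = {
        { "DMA",     0,      4096   },   /* invented layout */
        { "Normal",  4096,   262144 },
        { "HighMem", 262144, 524288 },
};
#define NR_ZONES (sizeof(zones) / sizeof(zones[0]))

static unsigned long present[NR_ZONES];

static void fixup_present_pages(unsigned long start_pfn, unsigned long end_pfn)
{
        for (unsigned int i = 0; i < NR_ZONES; i++) {
                unsigned long zs = zones[i].start_pfn;
                unsigned long ze = zones[i].end_pfn;

                /* the two half-open pfn ranges intersect */
                if (!(zs >= end_pfn || ze <= start_pfn)) {
                        unsigned long lo = start_pfn > zs ? start_pfn : zs;
                        unsigned long hi = end_pfn < ze ? end_pfn : ze;

                        present[i] += hi - lo;
                }
        }
}

int main(void)
{
        fixup_present_pages(1024, 300000);      /* spans all three zones */
        for (unsigned int i = 0; i < NR_ZONES; i++)
                printf("%-7s present: %lu pages\n", zones[i].name, present[i]);
        return 0;
}
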
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 247d1f175739..f2f5b4818e94 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -76,8 +76,13 @@ int set_migratetype_isolate(struct page *page)
76 76
77out: 77out:
78 if (!ret) { 78 if (!ret) {
79 unsigned long nr_pages;
80 int migratetype = get_pageblock_migratetype(page);
81
79 set_pageblock_isolate(page); 82 set_pageblock_isolate(page);
80 move_freepages_block(zone, page, MIGRATE_ISOLATE); 83 nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE);
84
85 __mod_zone_freepage_state(zone, -nr_pages, migratetype);
81 } 86 }
82 87
83 spin_unlock_irqrestore(&zone->lock, flags); 88 spin_unlock_irqrestore(&zone->lock, flags);
@@ -89,12 +94,14 @@ out:
89void unset_migratetype_isolate(struct page *page, unsigned migratetype) 94void unset_migratetype_isolate(struct page *page, unsigned migratetype)
90{ 95{
91 struct zone *zone; 96 struct zone *zone;
92 unsigned long flags; 97 unsigned long flags, nr_pages;
98
93 zone = page_zone(page); 99 zone = page_zone(page);
94 spin_lock_irqsave(&zone->lock, flags); 100 spin_lock_irqsave(&zone->lock, flags);
95 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 101 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
96 goto out; 102 goto out;
97 move_freepages_block(zone, page, migratetype); 103 nr_pages = move_freepages_block(zone, page, migratetype);
104 __mod_zone_freepage_state(zone, nr_pages, migratetype);
98 restore_pageblock_isolate(page, migratetype); 105 restore_pageblock_isolate(page, migratetype);
99out: 106out:
100 spin_unlock_irqrestore(&zone->lock, flags); 107 spin_unlock_irqrestore(&zone->lock, flags);
@@ -193,10 +200,25 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
193 continue; 200 continue;
194 } 201 }
195 page = pfn_to_page(pfn); 202 page = pfn_to_page(pfn);
196 if (PageBuddy(page)) 203 if (PageBuddy(page)) {
204 /*
 205 * If a race between isolation and allocation happens,
 206 * some free pages could be on the MIGRATE_MOVABLE list
 207 * even though the migratetype of the page's pageblock
 208 * is MIGRATE_ISOLATE. Catch it and move the page onto the
 209 * MIGRATE_ISOLATE list.
210 */
211 if (get_freepage_migratetype(page) != MIGRATE_ISOLATE) {
212 struct page *end_page;
213
214 end_page = page + (1 << page_order(page)) - 1;
215 move_freepages(page_zone(page), page, end_page,
216 MIGRATE_ISOLATE);
217 }
197 pfn += 1 << page_order(page); 218 pfn += 1 << page_order(page);
219 }
198 else if (page_count(page) == 0 && 220 else if (page_count(page) == 0 &&
199 page_private(page) == MIGRATE_ISOLATE) 221 get_freepage_migratetype(page) == MIGRATE_ISOLATE)
200 pfn += 1; 222 pfn += 1;
201 else 223 else
202 break; 224 break;
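
The new PageBuddy branch above closes a race: a page freed while its pageblock was being isolated can still sit on its old free list, tagged with its old migratetype, even though the pageblock is now MIGRATE_ISOLATE, so the verification pass moves it rather than reporting failure. A toy rendition of that fix-up; it is not kernel code, and the page/free-list representation is invented.

#include <stdio.h>

enum migratetype { MIGRATE_MOVABLE, MIGRATE_ISOLATE };

struct free_chunk {
        unsigned long pfn;
        int order;
        enum migratetype freelist_type;  /* list the chunk currently sits on */
};

int main(void)
{
        /* the whole pageblock was just marked MIGRATE_ISOLATE, but one
         * chunk was freed concurrently and landed on the movable list */
        struct free_chunk chunks[] = {
                { .pfn = 0, .order = 2, .freelist_type = MIGRATE_ISOLATE },
                { .pfn = 4, .order = 2, .freelist_type = MIGRATE_MOVABLE },
                { .pfn = 8, .order = 3, .freelist_type = MIGRATE_ISOLATE },
        };

        for (unsigned int i = 0; i < sizeof(chunks) / sizeof(chunks[0]); i++) {
                struct free_chunk *c = &chunks[i];

                if (c->freelist_type != MIGRATE_ISOLATE) {
                        /* lost the race: move it onto the isolate free list */
                        c->freelist_type = MIGRATE_ISOLATE;
                        printf("moved pfn %lu (order %d) to MIGRATE_ISOLATE\n",
                               c->pfn, c->order);
                }
        }
        return 0;
}
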
@@ -233,3 +255,14 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
233 spin_unlock_irqrestore(&zone->lock, flags); 255 spin_unlock_irqrestore(&zone->lock, flags);
234 return ret ? 0 : -EBUSY; 256 return ret ? 0 : -EBUSY;
235} 257}
258
259struct page *alloc_migrate_target(struct page *page, unsigned long private,
260 int **resultp)
261{
262 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
263
264 if (PageHighMem(page))
265 gfp_mask |= __GFP_HIGHMEM;
266
267 return alloc_page(gfp_mask);
268}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 74c0ddaa6fa0..e642627da6b7 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -120,3 +120,53 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
120} 120}
121#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 121#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
122#endif 122#endif
123
124#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
125#ifdef CONFIG_TRANSPARENT_HUGEPAGE
126void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable)
127{
128 assert_spin_locked(&mm->page_table_lock);
129
130 /* FIFO */
131 if (!mm->pmd_huge_pte)
132 INIT_LIST_HEAD(&pgtable->lru);
133 else
134 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
135 mm->pmd_huge_pte = pgtable;
136}
137#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
138#endif
139
140#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
141#ifdef CONFIG_TRANSPARENT_HUGEPAGE
142/* no "address" argument, so this destroys page coloring on some arches */
143pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm)
144{
145 pgtable_t pgtable;
146
147 assert_spin_locked(&mm->page_table_lock);
148
149 /* FIFO */
150 pgtable = mm->pmd_huge_pte;
151 if (list_empty(&pgtable->lru))
152 mm->pmd_huge_pte = NULL;
153 else {
154 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
155 struct page, lru);
156 list_del(&pgtable->lru);
157 }
158 return pgtable;
159}
160#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
161#endif
162
163#ifndef __HAVE_ARCH_PMDP_INVALIDATE
164#ifdef CONFIG_TRANSPARENT_HUGEPAGE
165void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
166 pmd_t *pmdp)
167{
168 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
169 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
170}
171#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
172#endif
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
deleted file mode 100644
index 799dcfd7cd8c..000000000000
--- a/mm/prio_tree.c
+++ /dev/null
@@ -1,208 +0,0 @@
1/*
2 * mm/prio_tree.c - priority search tree for mapping->i_mmap
3 *
4 * Copyright (C) 2004, Rajesh Venkatasubramanian <vrajesh@umich.edu>
5 *
6 * This file is released under the GPL v2.
7 *
8 * Based on the radix priority search tree proposed by Edward M. McCreight
9 * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985
10 *
11 * 02Feb2004 Initial version
12 */
13
14#include <linux/mm.h>
15#include <linux/prio_tree.h>
16#include <linux/prefetch.h>
17
18/*
19 * See lib/prio_tree.c for details on the general radix priority search tree
20 * code.
21 */
22
23/*
24 * The following #defines are mirrored from lib/prio_tree.c. They're only used
25 * for debugging, and should be removed (along with the debugging code using
26 * them) when switching also VMAs to the regular prio_tree code.
27 */
28
29#define RADIX_INDEX(vma) ((vma)->vm_pgoff)
30#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT)
31/* avoid overflow */
32#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1))
33
34/*
35 * Radix priority search tree for address_space->i_mmap
36 *
37 * For each vma that maps a unique set of file pages, i.e., a unique [radix_index,
38 * heap_index] value, we have a corresponding priority search tree node. If
39 * multiple vmas have identical [radix_index, heap_index] value, then one of
40 * them is used as a tree node and others are stored in a vm_set list. The tree
41 * node points to the first vma (head) of the list using vm_set.head.
42 *
43 * prio_tree_root
44 * |
45 * A vm_set.head
46 * / \ /
47 * L R -> H-I-J-K-M-N-O-P-Q-S
48 * ^ ^ <-- vm_set.list -->
49 * tree nodes
50 *
51 * We need some way to identify whether a vma is a tree node, head of a vm_set
52 * list, or just a member of a vm_set list. We cannot use vm_flags to store
53 * such information. The reason is, in the above figure, it is possible that
54 * vm_flags' of R and H are covered by the different mmap_sems. When R is
55 * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold
56 * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now.
57 * That's why some trick involving shared.vm_set.parent is used for identifying
58 * tree nodes and list head nodes.
59 *
60 * vma radix priority search tree node rules:
61 *
62 * vma->shared.vm_set.parent != NULL ==> a tree node
63 * vma->shared.vm_set.head != NULL ==> list of others mapping same range
64 * vma->shared.vm_set.head == NULL ==> no others map the same range
65 *
66 * vma->shared.vm_set.parent == NULL
67 * vma->shared.vm_set.head != NULL ==> list head of vmas mapping same range
68 * vma->shared.vm_set.head == NULL ==> a list node
69 */
70
71/*
72 * Add a new vma known to map the same set of pages as the old vma:
73 * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
74 * Note that it just happens to work correctly on i_mmap_nonlinear too.
75 */
76void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
77{
78 /* Leave these BUG_ONs till prio_tree patch stabilizes */
79 BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old));
80 BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old));
81
82 vma->shared.vm_set.head = NULL;
83 vma->shared.vm_set.parent = NULL;
84
85 if (!old->shared.vm_set.parent)
86 list_add(&vma->shared.vm_set.list,
87 &old->shared.vm_set.list);
88 else if (old->shared.vm_set.head)
89 list_add_tail(&vma->shared.vm_set.list,
90 &old->shared.vm_set.head->shared.vm_set.list);
91 else {
92 INIT_LIST_HEAD(&vma->shared.vm_set.list);
93 vma->shared.vm_set.head = old;
94 old->shared.vm_set.head = vma;
95 }
96}
97
98void vma_prio_tree_insert(struct vm_area_struct *vma,
99 struct prio_tree_root *root)
100{
101 struct prio_tree_node *ptr;
102 struct vm_area_struct *old;
103
104 vma->shared.vm_set.head = NULL;
105
106 ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
107 if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
108 old = prio_tree_entry(ptr, struct vm_area_struct,
109 shared.prio_tree_node);
110 vma_prio_tree_add(vma, old);
111 }
112}
113
114void vma_prio_tree_remove(struct vm_area_struct *vma,
115 struct prio_tree_root *root)
116{
117 struct vm_area_struct *node, *head, *new_head;
118
119 if (!vma->shared.vm_set.head) {
120 if (!vma->shared.vm_set.parent)
121 list_del_init(&vma->shared.vm_set.list);
122 else
123 raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
124 } else {
125 /* Leave this BUG_ON till prio_tree patch stabilizes */
126 BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
127 if (vma->shared.vm_set.parent) {
128 head = vma->shared.vm_set.head;
129 if (!list_empty(&head->shared.vm_set.list)) {
130 new_head = list_entry(
131 head->shared.vm_set.list.next,
132 struct vm_area_struct,
133 shared.vm_set.list);
134 list_del_init(&head->shared.vm_set.list);
135 } else
136 new_head = NULL;
137
138 raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
139 &head->shared.prio_tree_node);
140 head->shared.vm_set.head = new_head;
141 if (new_head)
142 new_head->shared.vm_set.head = head;
143
144 } else {
145 node = vma->shared.vm_set.head;
146 if (!list_empty(&vma->shared.vm_set.list)) {
147 new_head = list_entry(
148 vma->shared.vm_set.list.next,
149 struct vm_area_struct,
150 shared.vm_set.list);
151 list_del_init(&vma->shared.vm_set.list);
152 node->shared.vm_set.head = new_head;
153 new_head->shared.vm_set.head = node;
154 } else
155 node->shared.vm_set.head = NULL;
156 }
157 }
158}
159
160/*
161 * Helper function to enumerate vmas that map a given file page or a set of
162 * contiguous file pages. The function returns vmas that at least map a single
163 * page in the given range of contiguous file pages.
164 */
165struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
166 struct prio_tree_iter *iter)
167{
168 struct prio_tree_node *ptr;
169 struct vm_area_struct *next;
170
171 if (!vma) {
172 /*
173 * First call is with NULL vma
174 */
175 ptr = prio_tree_next(iter);
176 if (ptr) {
177 next = prio_tree_entry(ptr, struct vm_area_struct,
178 shared.prio_tree_node);
179 prefetch(next->shared.vm_set.head);
180 return next;
181 } else
182 return NULL;
183 }
184
185 if (vma->shared.vm_set.parent) {
186 if (vma->shared.vm_set.head) {
187 next = vma->shared.vm_set.head;
188 prefetch(next->shared.vm_set.list.next);
189 return next;
190 }
191 } else {
192 next = list_entry(vma->shared.vm_set.list.next,
193 struct vm_area_struct, shared.vm_set.list);
194 if (!next->shared.vm_set.head) {
195 prefetch(next->shared.vm_set.list.next);
196 return next;
197 }
198 }
199
200 ptr = prio_tree_next(iter);
201 if (ptr) {
202 next = prio_tree_entry(ptr, struct vm_area_struct,
203 shared.prio_tree_node);
204 prefetch(next->shared.vm_set.head);
205 return next;
206 } else
207 return NULL;
208}
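
The rmap.c changes that follow replace walks of this removed structure with vma_interval_tree_foreach()/anon_vma_interval_tree_foreach() over the degenerate range [pgoff, pgoff]. The query those walks need is simply "which vmas map this file page"; the sketch below answers it with a linear scan over invented data, whereas the kernel's augmented rbtree interval tree answers the same question in O(log n + matches). This is an illustration only, not the kernel implementation.

#include <stdio.h>

struct vma_span {
        const char *name;
        unsigned long pgoff;    /* first file page mapped by the vma */
        unsigned long pages;    /* number of pages it maps */
};

static const struct vma_span vmas[] = {
        { "vma A", 0,  16 },    /* invented mappings of the same file */
        { "vma B", 8,  4  },
        { "vma C", 32, 8  },
};

/* report every vma whose [pgoff, pgoff + pages) range covers @pgoff */
static void for_each_vma_mapping(unsigned long pgoff)
{
        for (unsigned int i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++) {
                unsigned long start = vmas[i].pgoff;
                unsigned long last = start + vmas[i].pages - 1;

                if (start <= pgoff && pgoff <= last)
                        printf("page %lu is mapped by %s\n",
                               pgoff, vmas[i].name);
        }
}

int main(void)
{
        for_each_vma_mapping(10);       /* hits vma A and vma B */
        for_each_vma_mapping(35);       /* hits vma C only */
        return 0;
}
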
diff --git a/mm/rmap.c b/mm/rmap.c
index 0f3b7cda2a24..7df7984d476c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -127,12 +127,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
127 avc->vma = vma; 127 avc->vma = vma;
128 avc->anon_vma = anon_vma; 128 avc->anon_vma = anon_vma;
129 list_add(&avc->same_vma, &vma->anon_vma_chain); 129 list_add(&avc->same_vma, &vma->anon_vma_chain);
130 130 anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
131 /*
132 * It's critical to add new vmas to the tail of the anon_vma,
133 * see comment in huge_memory.c:__split_huge_page().
134 */
135 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
136} 131}
137 132
138/** 133/**
@@ -269,51 +264,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
269} 264}
270 265
271/* 266/*
272 * Some rmap walk that needs to find all ptes/hugepmds without false
273 * negatives (like migrate and split_huge_page) running concurrent
274 * with operations that copy or move pagetables (like mremap() and
275 * fork()) to be safe. They depend on the anon_vma "same_anon_vma"
276 * list to be in a certain order: the dst_vma must be placed after the
277 * src_vma in the list. This is always guaranteed by fork() but
278 * mremap() needs to call this function to enforce it in case the
279 * dst_vma isn't newly allocated and chained with the anon_vma_clone()
280 * function but just an extension of a pre-existing vma through
281 * vma_merge.
282 *
283 * NOTE: the same_anon_vma list can still be changed by other
284 * processes while mremap runs because mremap doesn't hold the
285 * anon_vma mutex to prevent modifications to the list while it
286 * runs. All we need to enforce is that the relative order of this
287 * process vmas isn't changing (we don't care about other vmas
288 * order). Each vma corresponds to an anon_vma_chain structure so
289 * there's no risk that other processes calling anon_vma_moveto_tail()
290 * and changing the same_anon_vma list under mremap() will screw with
291 * the relative order of this process vmas in the list, because we
292 * the relative order of this process's vmas in the list, because
293 * they can't alter the order of any vma that belongs to this
294 * concurrently with mremap() coming from this process because we hold
295 * the mmap_sem for the whole mremap(). fork() ordering dependency
296 * also shouldn't be affected because fork() only cares that the
297 * parent vmas are placed in the list before the child vmas and
298 * anon_vma_moveto_tail() won't reorder vmas from either the fork()
299 * parent or child.
300 */
301void anon_vma_moveto_tail(struct vm_area_struct *dst)
302{
303 struct anon_vma_chain *pavc;
304 struct anon_vma *root = NULL;
305
306 list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) {
307 struct anon_vma *anon_vma = pavc->anon_vma;
308 VM_BUG_ON(pavc->vma != dst);
309 root = lock_anon_vma_root(root, anon_vma);
310 list_del(&pavc->same_anon_vma);
311 list_add_tail(&pavc->same_anon_vma, &anon_vma->head);
312 }
313 unlock_anon_vma_root(root);
314}
315
316/*
317 * Attach vma to its own anon_vma, as well as to the anon_vmas that 267 * Attach vma to its own anon_vma, as well as to the anon_vmas that
318 * the corresponding VMA in the parent process is attached to. 268 * the corresponding VMA in the parent process is attached to.
319 * Returns 0 on success, non-zero on failure. 269 * Returns 0 on success, non-zero on failure.
@@ -381,13 +331,13 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
381 struct anon_vma *anon_vma = avc->anon_vma; 331 struct anon_vma *anon_vma = avc->anon_vma;
382 332
383 root = lock_anon_vma_root(root, anon_vma); 333 root = lock_anon_vma_root(root, anon_vma);
384 list_del(&avc->same_anon_vma); 334 anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
385 335
386 /* 336 /*
387 * Leave empty anon_vmas on the list - we'll need 337 * Leave empty anon_vmas on the list - we'll need
388 * to free them outside the lock. 338 * to free them outside the lock.
389 */ 339 */
390 if (list_empty(&anon_vma->head)) 340 if (RB_EMPTY_ROOT(&anon_vma->rb_root))
391 continue; 341 continue;
392 342
393 list_del(&avc->same_vma); 343 list_del(&avc->same_vma);
@@ -416,7 +366,7 @@ static void anon_vma_ctor(void *data)
416 366
417 mutex_init(&anon_vma->mutex); 367 mutex_init(&anon_vma->mutex);
418 atomic_set(&anon_vma->refcount, 0); 368 atomic_set(&anon_vma->refcount, 0);
419 INIT_LIST_HEAD(&anon_vma->head); 369 anon_vma->rb_root = RB_ROOT;
420} 370}
421 371
422void __init anon_vma_init(void) 372void __init anon_vma_init(void)
@@ -560,22 +510,26 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
560 510
561/* 511/*
562 * At what user virtual address is page expected in @vma? 512 * At what user virtual address is page expected in @vma?
563 * Returns virtual address or -EFAULT if page's index/offset is not
564 * within the range mapped the @vma.
565 */ 513 */
566inline unsigned long 514static inline unsigned long
567vma_address(struct page *page, struct vm_area_struct *vma) 515__vma_address(struct page *page, struct vm_area_struct *vma)
568{ 516{
569 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 517 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
570 unsigned long address;
571 518
572 if (unlikely(is_vm_hugetlb_page(vma))) 519 if (unlikely(is_vm_hugetlb_page(vma)))
573 pgoff = page->index << huge_page_order(page_hstate(page)); 520 pgoff = page->index << huge_page_order(page_hstate(page));
574 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 521
575 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 522 return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
576 /* page should be within @vma mapping range */ 523}
577 return -EFAULT; 524
578 } 525inline unsigned long
526vma_address(struct page *page, struct vm_area_struct *vma)
527{
528 unsigned long address = __vma_address(page, vma);
529
530 /* page should be within @vma mapping range */
531 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
532
579 return address; 533 return address;
580} 534}
581 535
@@ -585,6 +539,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
585 */ 539 */
586unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 540unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
587{ 541{
542 unsigned long address;
588 if (PageAnon(page)) { 543 if (PageAnon(page)) {
589 struct anon_vma *page__anon_vma = page_anon_vma(page); 544 struct anon_vma *page__anon_vma = page_anon_vma(page);
590 /* 545 /*
@@ -600,7 +555,10 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
600 return -EFAULT; 555 return -EFAULT;
601 } else 556 } else
602 return -EFAULT; 557 return -EFAULT;
603 return vma_address(page, vma); 558 address = __vma_address(page, vma);
559 if (unlikely(address < vma->vm_start || address >= vma->vm_end))
560 return -EFAULT;
561 return address;
604} 562}
605 563
606/* 564/*
@@ -674,8 +632,8 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
674 pte_t *pte; 632 pte_t *pte;
675 spinlock_t *ptl; 633 spinlock_t *ptl;
676 634
677 address = vma_address(page, vma); 635 address = __vma_address(page, vma);
678 if (address == -EFAULT) /* out of vma range */ 636 if (unlikely(address < vma->vm_start || address >= vma->vm_end))
679 return 0; 637 return 0;
680 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); 638 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
681 if (!pte) /* the page is not in this mm */ 639 if (!pte) /* the page is not in this mm */
@@ -769,6 +727,7 @@ static int page_referenced_anon(struct page *page,
769{ 727{
770 unsigned int mapcount; 728 unsigned int mapcount;
771 struct anon_vma *anon_vma; 729 struct anon_vma *anon_vma;
730 pgoff_t pgoff;
772 struct anon_vma_chain *avc; 731 struct anon_vma_chain *avc;
773 int referenced = 0; 732 int referenced = 0;
774 733
@@ -777,11 +736,10 @@ static int page_referenced_anon(struct page *page,
777 return referenced; 736 return referenced;
778 737
779 mapcount = page_mapcount(page); 738 mapcount = page_mapcount(page);
780 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 739 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
740 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
781 struct vm_area_struct *vma = avc->vma; 741 struct vm_area_struct *vma = avc->vma;
782 unsigned long address = vma_address(page, vma); 742 unsigned long address = vma_address(page, vma);
783 if (address == -EFAULT)
784 continue;
785 /* 743 /*
786 * If we are reclaiming on behalf of a cgroup, skip 744 * If we are reclaiming on behalf of a cgroup, skip
787 * counting on behalf of references from different 745 * counting on behalf of references from different
@@ -820,7 +778,6 @@ static int page_referenced_file(struct page *page,
820 struct address_space *mapping = page->mapping; 778 struct address_space *mapping = page->mapping;
821 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 779 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
822 struct vm_area_struct *vma; 780 struct vm_area_struct *vma;
823 struct prio_tree_iter iter;
824 int referenced = 0; 781 int referenced = 0;
825 782
826 /* 783 /*
@@ -846,10 +803,8 @@ static int page_referenced_file(struct page *page,
846 */ 803 */
847 mapcount = page_mapcount(page); 804 mapcount = page_mapcount(page);
848 805
849 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 806 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
850 unsigned long address = vma_address(page, vma); 807 unsigned long address = vma_address(page, vma);
851 if (address == -EFAULT)
852 continue;
853 /* 808 /*
854 * If we are reclaiming on behalf of a cgroup, skip 809 * If we are reclaiming on behalf of a cgroup, skip
855 * counting on behalf of references from different 810 * counting on behalf of references from different
@@ -929,7 +884,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
929 pte_t entry; 884 pte_t entry;
930 885
931 flush_cache_page(vma, address, pte_pfn(*pte)); 886 flush_cache_page(vma, address, pte_pfn(*pte));
932 entry = ptep_clear_flush_notify(vma, address, pte); 887 entry = ptep_clear_flush(vma, address, pte);
933 entry = pte_wrprotect(entry); 888 entry = pte_wrprotect(entry);
934 entry = pte_mkclean(entry); 889 entry = pte_mkclean(entry);
935 set_pte_at(mm, address, pte, entry); 890 set_pte_at(mm, address, pte, entry);
@@ -937,6 +892,9 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
937 } 892 }
938 893
939 pte_unmap_unlock(pte, ptl); 894 pte_unmap_unlock(pte, ptl);
895
896 if (ret)
897 mmu_notifier_invalidate_page(mm, address);
940out: 898out:
941 return ret; 899 return ret;
942} 900}
@@ -945,17 +903,14 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
945{ 903{
946 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 904 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
947 struct vm_area_struct *vma; 905 struct vm_area_struct *vma;
948 struct prio_tree_iter iter;
949 int ret = 0; 906 int ret = 0;
950 907
951 BUG_ON(PageAnon(page)); 908 BUG_ON(PageAnon(page));
952 909
953 mutex_lock(&mapping->i_mmap_mutex); 910 mutex_lock(&mapping->i_mmap_mutex);
954 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 911 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
955 if (vma->vm_flags & VM_SHARED) { 912 if (vma->vm_flags & VM_SHARED) {
956 unsigned long address = vma_address(page, vma); 913 unsigned long address = vma_address(page, vma);
957 if (address == -EFAULT)
958 continue;
959 ret += page_mkclean_one(page, vma, address); 914 ret += page_mkclean_one(page, vma, address);
960 } 915 }
961 } 916 }
@@ -1128,7 +1083,7 @@ void page_add_new_anon_rmap(struct page *page,
1128 else 1083 else
1129 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1084 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1130 __page_set_anon_rmap(page, vma, address, 1); 1085 __page_set_anon_rmap(page, vma, address, 1);
1131 if (page_evictable(page, vma)) 1086 if (!mlocked_vma_newpage(vma, page))
1132 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 1087 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
1133 else 1088 else
1134 add_page_to_unevictable_list(page); 1089 add_page_to_unevictable_list(page);
@@ -1203,7 +1158,10 @@ void page_remove_rmap(struct page *page)
1203 } else { 1158 } else {
1204 __dec_zone_page_state(page, NR_FILE_MAPPED); 1159 __dec_zone_page_state(page, NR_FILE_MAPPED);
1205 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); 1160 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
1161 mem_cgroup_end_update_page_stat(page, &locked, &flags);
1206 } 1162 }
1163 if (unlikely(PageMlocked(page)))
1164 clear_page_mlock(page);
1207 /* 1165 /*
1208 * It would be tidy to reset the PageAnon mapping here, 1166 * It would be tidy to reset the PageAnon mapping here,
1209 * but that might overwrite a racing page_add_anon_rmap 1167 * but that might overwrite a racing page_add_anon_rmap
@@ -1213,6 +1171,7 @@ void page_remove_rmap(struct page *page)
1213 * Leaving it set also helps swapoff to reinstate ptes 1171 * Leaving it set also helps swapoff to reinstate ptes
1214 * faster for those pages still in swapcache. 1172 * faster for those pages still in swapcache.
1215 */ 1173 */
1174 return;
1216out: 1175out:
1217 if (!anon) 1176 if (!anon)
1218 mem_cgroup_end_update_page_stat(page, &locked, &flags); 1177 mem_cgroup_end_update_page_stat(page, &locked, &flags);
@@ -1256,7 +1215,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1256 1215
1257 /* Nuke the page table entry. */ 1216 /* Nuke the page table entry. */
1258 flush_cache_page(vma, address, page_to_pfn(page)); 1217 flush_cache_page(vma, address, page_to_pfn(page));
1259 pteval = ptep_clear_flush_notify(vma, address, pte); 1218 pteval = ptep_clear_flush(vma, address, pte);
1260 1219
1261 /* Move the dirty bit to the physical page now the pte is gone. */ 1220 /* Move the dirty bit to the physical page now the pte is gone. */
1262 if (pte_dirty(pteval)) 1221 if (pte_dirty(pteval))
@@ -1318,6 +1277,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1318 1277
1319out_unmap: 1278out_unmap:
1320 pte_unmap_unlock(pte, ptl); 1279 pte_unmap_unlock(pte, ptl);
1280 if (ret != SWAP_FAIL)
1281 mmu_notifier_invalidate_page(mm, address);
1321out: 1282out:
1322 return ret; 1283 return ret;
1323 1284
@@ -1382,6 +1343,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1382 spinlock_t *ptl; 1343 spinlock_t *ptl;
1383 struct page *page; 1344 struct page *page;
1384 unsigned long address; 1345 unsigned long address;
1346 unsigned long mmun_start; /* For mmu_notifiers */
1347 unsigned long mmun_end; /* For mmu_notifiers */
1385 unsigned long end; 1348 unsigned long end;
1386 int ret = SWAP_AGAIN; 1349 int ret = SWAP_AGAIN;
1387 int locked_vma = 0; 1350 int locked_vma = 0;
@@ -1405,6 +1368,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1405 if (!pmd_present(*pmd)) 1368 if (!pmd_present(*pmd))
1406 return ret; 1369 return ret;
1407 1370
1371 mmun_start = address;
1372 mmun_end = end;
1373 mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
1374
1408 /* 1375 /*
1409 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, 1376 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
1410 * keep the sem while scanning the cluster for mlocking pages. 1377 * keep the sem while scanning the cluster for mlocking pages.
@@ -1438,7 +1405,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1438 1405
1439 /* Nuke the page table entry. */ 1406 /* Nuke the page table entry. */
1440 flush_cache_page(vma, address, pte_pfn(*pte)); 1407 flush_cache_page(vma, address, pte_pfn(*pte));
1441 pteval = ptep_clear_flush_notify(vma, address, pte); 1408 pteval = ptep_clear_flush(vma, address, pte);
1442 1409
1443 /* If nonlinear, store the file page offset in the pte. */ 1410 /* If nonlinear, store the file page offset in the pte. */
1444 if (page->index != linear_page_index(vma, address)) 1411 if (page->index != linear_page_index(vma, address))
@@ -1454,6 +1421,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1454 (*mapcount)--; 1421 (*mapcount)--;
1455 } 1422 }
1456 pte_unmap_unlock(pte - 1, ptl); 1423 pte_unmap_unlock(pte - 1, ptl);
1424 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
1457 if (locked_vma) 1425 if (locked_vma)
1458 up_read(&vma->vm_mm->mmap_sem); 1426 up_read(&vma->vm_mm->mmap_sem);
1459 return ret; 1427 return ret;
@@ -1492,6 +1460,7 @@ bool is_vma_temporary_stack(struct vm_area_struct *vma)
1492static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) 1460static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1493{ 1461{
1494 struct anon_vma *anon_vma; 1462 struct anon_vma *anon_vma;
1463 pgoff_t pgoff;
1495 struct anon_vma_chain *avc; 1464 struct anon_vma_chain *avc;
1496 int ret = SWAP_AGAIN; 1465 int ret = SWAP_AGAIN;
1497 1466
@@ -1499,7 +1468,8 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1499 if (!anon_vma) 1468 if (!anon_vma)
1500 return ret; 1469 return ret;
1501 1470
1502 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1471 pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1472 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1503 struct vm_area_struct *vma = avc->vma; 1473 struct vm_area_struct *vma = avc->vma;
1504 unsigned long address; 1474 unsigned long address;
1505 1475
@@ -1516,8 +1486,6 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
1516 continue; 1486 continue;
1517 1487
1518 address = vma_address(page, vma); 1488 address = vma_address(page, vma);
1519 if (address == -EFAULT)
1520 continue;
1521 ret = try_to_unmap_one(page, vma, address, flags); 1489 ret = try_to_unmap_one(page, vma, address, flags);
1522 if (ret != SWAP_AGAIN || !page_mapped(page)) 1490 if (ret != SWAP_AGAIN || !page_mapped(page))
1523 break; 1491 break;
@@ -1547,7 +1515,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1547 struct address_space *mapping = page->mapping; 1515 struct address_space *mapping = page->mapping;
1548 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1516 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1549 struct vm_area_struct *vma; 1517 struct vm_area_struct *vma;
1550 struct prio_tree_iter iter;
1551 int ret = SWAP_AGAIN; 1518 int ret = SWAP_AGAIN;
1552 unsigned long cursor; 1519 unsigned long cursor;
1553 unsigned long max_nl_cursor = 0; 1520 unsigned long max_nl_cursor = 0;
@@ -1555,10 +1522,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1555 unsigned int mapcount; 1522 unsigned int mapcount;
1556 1523
1557 mutex_lock(&mapping->i_mmap_mutex); 1524 mutex_lock(&mapping->i_mmap_mutex);
1558 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1525 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1559 unsigned long address = vma_address(page, vma); 1526 unsigned long address = vma_address(page, vma);
1560 if (address == -EFAULT)
1561 continue;
1562 ret = try_to_unmap_one(page, vma, address, flags); 1527 ret = try_to_unmap_one(page, vma, address, flags);
1563 if (ret != SWAP_AGAIN || !page_mapped(page)) 1528 if (ret != SWAP_AGAIN || !page_mapped(page))
1564 goto out; 1529 goto out;
@@ -1576,7 +1541,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1576 goto out; 1541 goto out;
1577 1542
1578 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1543 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1579 shared.vm_set.list) { 1544 shared.nonlinear) {
1580 cursor = (unsigned long) vma->vm_private_data; 1545 cursor = (unsigned long) vma->vm_private_data;
1581 if (cursor > max_nl_cursor) 1546 if (cursor > max_nl_cursor)
1582 max_nl_cursor = cursor; 1547 max_nl_cursor = cursor;
@@ -1608,7 +1573,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1608 1573
1609 do { 1574 do {
1610 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1575 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
1611 shared.vm_set.list) { 1576 shared.nonlinear) {
1612 cursor = (unsigned long) vma->vm_private_data; 1577 cursor = (unsigned long) vma->vm_private_data;
1613 while ( cursor < max_nl_cursor && 1578 while ( cursor < max_nl_cursor &&
1614 cursor < vma->vm_end - vma->vm_start) { 1579 cursor < vma->vm_end - vma->vm_start) {
@@ -1631,7 +1596,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1631 * in locked vmas). Reset cursor on all unreserved nonlinear 1596 * in locked vmas). Reset cursor on all unreserved nonlinear
1632 * vmas, now forgetting on which ones it had fallen behind. 1597 * vmas, now forgetting on which ones it had fallen behind.
1633 */ 1598 */
1634 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 1599 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
1635 vma->vm_private_data = NULL; 1600 vma->vm_private_data = NULL;
1636out: 1601out:
1637 mutex_unlock(&mapping->i_mmap_mutex); 1602 mutex_unlock(&mapping->i_mmap_mutex);
@@ -1716,6 +1681,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1716 struct vm_area_struct *, unsigned long, void *), void *arg) 1681 struct vm_area_struct *, unsigned long, void *), void *arg)
1717{ 1682{
1718 struct anon_vma *anon_vma; 1683 struct anon_vma *anon_vma;
1684 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1719 struct anon_vma_chain *avc; 1685 struct anon_vma_chain *avc;
1720 int ret = SWAP_AGAIN; 1686 int ret = SWAP_AGAIN;
1721 1687
@@ -1729,11 +1695,9 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1729 if (!anon_vma) 1695 if (!anon_vma)
1730 return ret; 1696 return ret;
1731 anon_vma_lock(anon_vma); 1697 anon_vma_lock(anon_vma);
1732 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1698 anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
1733 struct vm_area_struct *vma = avc->vma; 1699 struct vm_area_struct *vma = avc->vma;
1734 unsigned long address = vma_address(page, vma); 1700 unsigned long address = vma_address(page, vma);
1735 if (address == -EFAULT)
1736 continue;
1737 ret = rmap_one(page, vma, address, arg); 1701 ret = rmap_one(page, vma, address, arg);
1738 if (ret != SWAP_AGAIN) 1702 if (ret != SWAP_AGAIN)
1739 break; 1703 break;
@@ -1748,16 +1712,13 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1748 struct address_space *mapping = page->mapping; 1712 struct address_space *mapping = page->mapping;
1749 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1713 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
1750 struct vm_area_struct *vma; 1714 struct vm_area_struct *vma;
1751 struct prio_tree_iter iter;
1752 int ret = SWAP_AGAIN; 1715 int ret = SWAP_AGAIN;
1753 1716
1754 if (!mapping) 1717 if (!mapping)
1755 return ret; 1718 return ret;
1756 mutex_lock(&mapping->i_mmap_mutex); 1719 mutex_lock(&mapping->i_mmap_mutex);
1757 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1720 vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
1758 unsigned long address = vma_address(page, vma); 1721 unsigned long address = vma_address(page, vma);
1759 if (address == -EFAULT)
1760 continue;
1761 ret = rmap_one(page, vma, address, arg); 1722 ret = rmap_one(page, vma, address, arg);
1762 if (ret != SWAP_AGAIN) 1723 if (ret != SWAP_AGAIN)
1763 break; 1724 break;
diff --git a/mm/shmem.c b/mm/shmem.c
index d3752110c8c7..cc12072f8787 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1339,7 +1339,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1339{ 1339{
1340 file_accessed(file); 1340 file_accessed(file);
1341 vma->vm_ops = &shmem_vm_ops; 1341 vma->vm_ops = &shmem_vm_ops;
1342 vma->vm_flags |= VM_CAN_NONLINEAR;
1343 return 0; 1342 return 0;
1344} 1343}
1345 1344
@@ -2643,6 +2642,7 @@ static const struct vm_operations_struct shmem_vm_ops = {
2643 .set_policy = shmem_set_policy, 2642 .set_policy = shmem_set_policy,
2644 .get_policy = shmem_get_policy, 2643 .get_policy = shmem_get_policy,
2645#endif 2644#endif
2645 .remap_pages = generic_file_remap_pages,
2646}; 2646};
2647 2647
2648static struct dentry *shmem_mount(struct file_system_type *fs_type, 2648static struct dentry *shmem_mount(struct file_system_type *fs_type,
@@ -2836,7 +2836,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2836 fput(vma->vm_file); 2836 fput(vma->vm_file);
2837 vma->vm_file = file; 2837 vma->vm_file = file;
2838 vma->vm_ops = &shmem_vm_ops; 2838 vma->vm_ops = &shmem_vm_ops;
2839 vma->vm_flags |= VM_CAN_NONLINEAR;
2840 return 0; 2839 return 0;
2841} 2840}
2842 2841
diff --git a/mm/swap.c b/mm/swap.c
index 77825883298f..6310dc2008ff 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -446,13 +446,22 @@ void mark_page_accessed(struct page *page)
446} 446}
447EXPORT_SYMBOL(mark_page_accessed); 447EXPORT_SYMBOL(mark_page_accessed);
448 448
449/*
450 * Order of operations is important: flush the pagevec when it's already
451 * full, not when adding the last page, to make sure that last page is
452 * not added to the LRU directly when passed to this function. Because
453 * mark_page_accessed() (called after this when writing) only activates
454 * pages that are on the LRU, linear writes in subpage chunks would see
455 * every PAGEVEC_SIZE'th page activated, which is unexpected.
456 */
449void __lru_cache_add(struct page *page, enum lru_list lru) 457void __lru_cache_add(struct page *page, enum lru_list lru)
450{ 458{
451 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; 459 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
452 460
453 page_cache_get(page); 461 page_cache_get(page);
454 if (!pagevec_add(pvec, page)) 462 if (!pagevec_space(pvec))
455 __pagevec_lru_add(pvec, lru); 463 __pagevec_lru_add(pvec, lru);
464 pagevec_add(pvec, page);
456 put_cpu_var(lru_add_pvecs); 465 put_cpu_var(lru_add_pvecs);
457} 466}
458EXPORT_SYMBOL(__lru_cache_add); 467EXPORT_SYMBOL(__lru_cache_add);
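
The comment added above is the whole point of the __lru_cache_add() change: drain the pagevec when it is already full before adding the new page, so the page being added never lands on the LRU in the same call and a mark_page_accessed() immediately afterwards cannot activate it. A toy batch buffer demonstrating that ordering follows (a sketch, not kernel code; BATCH_SIZE stands in for PAGEVEC_SIZE).

#include <stdio.h>

#define BATCH_SIZE 4    /* stand-in for PAGEVEC_SIZE */

static int batch[BATCH_SIZE];
static int count;

static void flush_batch(void)
{
        printf("flushed to LRU:");
        for (int i = 0; i < count; i++)
                printf(" %d", batch[i]);
        printf("\n");
        count = 0;
}

/* drain first if already full, then buffer the new item (new behaviour) */
static void lru_cache_add(int page)
{
        if (count == BATCH_SIZE)
                flush_batch();
        batch[count++] = page;
}

int main(void)
{
        for (int page = 1; page <= 5; page++)
                lru_cache_add(page);

        /* page 5 is still only buffered: a mark_page_accessed()-style check
         * of the LRU at this point would not see it */
        printf("still buffered: %d item(s), newest = %d\n",
               count, batch[count - 1]);
        return 0;
}

With the previous order (add first, then flush when the add fills the pagevec), page 4 would have reached the LRU in the very call that added it, which is exactly the behaviour the new comment warns about.
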
@@ -742,7 +751,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
 
 	SetPageLRU(page_tail);
 
-	if (page_evictable(page_tail, NULL)) {
+	if (page_evictable(page_tail)) {
 		if (PageActive(page)) {
 			SetPageActive(page_tail);
 			active = 1;
diff --git a/mm/truncate.c b/mm/truncate.c
index 75801acdaac7..d51ce92d6e83 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -107,7 +107,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
 
 	cancel_dirty_page(page, PAGE_CACHE_SIZE);
 
-	clear_page_mlock(page);
 	ClearPageMappedToDisk(page);
 	delete_from_page_cache(page);
 	return 0;
@@ -132,7 +131,6 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 	if (page_has_private(page) && !try_to_release_page(page, 0))
 		return 0;
 
-	clear_page_mlock(page);
 	ret = remove_mapping(mapping, page);
 
 	return ret;
@@ -398,7 +396,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	if (PageDirty(page))
 		goto failed;
 
-	clear_page_mlock(page);
 	BUG_ON(page_has_private(page));
 	__delete_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2bb90b1d241c..78e08300db21 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2163,8 +2163,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
 		usize -= PAGE_SIZE;
 	} while (usize > 0);
 
-	/* Prevent "things" like memory migration? VM_flags need a cleanup... */
-	vma->vm_flags |= VM_RESERVED;
+	vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
 
 	return 0;
 }
@@ -2572,7 +2571,7 @@ static int s_show(struct seq_file *m, void *p)
 {
 	struct vm_struct *v = p;
 
-	seq_printf(m, "0x%p-0x%p %7ld",
+	seq_printf(m, "0x%pK-0x%pK %7ld",
 		v->addr, v->addr + v->size, v->size);
 
 	if (v->caller)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 99b434b674c0..2624edcfb420 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -553,7 +553,7 @@ void putback_lru_page(struct page *page)
 redo:
 	ClearPageUnevictable(page);
 
-	if (page_evictable(page, NULL)) {
+	if (page_evictable(page)) {
 		/*
 		 * For evictable pages, we can use the cache.
 		 * In event of a race, worst case is we end up with an
@@ -587,7 +587,7 @@ redo:
 	 * page is on unevictable list, it never be freed. To avoid that,
 	 * check after we added it to the list, again.
 	 */
-	if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
+	if (lru == LRU_UNEVICTABLE && page_evictable(page)) {
 		if (!isolate_lru_page(page)) {
 			put_page(page);
 			goto redo;
@@ -674,8 +674,10 @@ static enum page_references page_check_references(struct page *page,
 static unsigned long shrink_page_list(struct list_head *page_list,
 				      struct zone *zone,
 				      struct scan_control *sc,
+				      enum ttu_flags ttu_flags,
 				      unsigned long *ret_nr_dirty,
-				      unsigned long *ret_nr_writeback)
+				      unsigned long *ret_nr_writeback,
+				      bool force_reclaim)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
@@ -689,10 +691,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 	mem_cgroup_uncharge_start();
 	while (!list_empty(page_list)) {
-		enum page_references references;
 		struct address_space *mapping;
 		struct page *page;
 		int may_enter_fs;
+		enum page_references references = PAGEREF_RECLAIM_CLEAN;
 
 		cond_resched();
 
@@ -707,7 +709,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 		sc->nr_scanned++;
 
-		if (unlikely(!page_evictable(page, NULL)))
+		if (unlikely(!page_evictable(page)))
 			goto cull_mlocked;
 
 		if (!sc->may_unmap && page_mapped(page))
@@ -758,7 +760,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			wait_on_page_writeback(page);
 		}
 
-		references = page_check_references(page, sc);
+		if (!force_reclaim)
+			references = page_check_references(page, sc);
+
 		switch (references) {
 		case PAGEREF_ACTIVATE:
 			goto activate_locked;
@@ -788,7 +792,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * processes. Try to unmap it here.
 		 */
 		if (page_mapped(page) && mapping) {
-			switch (try_to_unmap(page, TTU_UNMAP)) {
+			switch (try_to_unmap(page, ttu_flags)) {
 			case SWAP_FAIL:
 				goto activate_locked;
 			case SWAP_AGAIN:
@@ -960,6 +964,33 @@ keep:
 	return nr_reclaimed;
 }
 
+unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+					    struct list_head *page_list)
+{
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
+		.may_unmap = 1,
+	};
+	unsigned long ret, dummy1, dummy2;
+	struct page *page, *next;
+	LIST_HEAD(clean_pages);
+
+	list_for_each_entry_safe(page, next, page_list, lru) {
+		if (page_is_file_cache(page) && !PageDirty(page)) {
+			ClearPageActive(page);
+			list_move(&page->lru, &clean_pages);
+		}
+	}
+
+	ret = shrink_page_list(&clean_pages, zone, &sc,
+				TTU_UNMAP|TTU_IGNORE_ACCESS,
+				&dummy1, &dummy2, true);
+	list_splice(&clean_pages, page_list);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
+	return ret;
+}
+
 /*
  * Attempt to remove the specified page from its LRU. Only take this page
  * if it is of the appropriate PageActive status. Pages which are being
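
reclaim_clean_pages_from_list() above is new, introduced for the contiguous-allocation (CMA) path, which would rather drop clean file pages than migrate them: it pulls only clean file-backed pages off an already-isolated list, runs them through shrink_page_list() with TTU_IGNORE_ACCESS and force_reclaim set, and splices whatever survived back. A userspace sketch of the same filter/reclaim/splice shape on a hand-rolled singly linked list (struct toy_page and both helpers are invented, and the "reclaim" step simply drops everything it is given):

/* Toy version of the "move clean file pages to a private list, reclaim
 * them, splice the leftovers back" pattern.  Not kernel code.
 */
#include <stdio.h>
#include <stddef.h>

struct toy_page {
	int id;
	int is_file_cache;
	int dirty;
	struct toy_page *next;
};

/* "Reclaim" every page on *list and empty it; returns how many went away. */
static int toy_shrink_list(struct toy_page **list)
{
	int nr = 0;

	while (*list) {
		printf("reclaimed clean page %d\n", (*list)->id);
		*list = (*list)->next;
		nr++;
	}
	return nr;
}

static int toy_reclaim_clean(struct toy_page **page_list)
{
	struct toy_page *clean = NULL, **pp = page_list;

	/* Pass 1: unlink clean file-cache pages onto a private list. */
	while (*pp) {
		struct toy_page *page = *pp;

		if (page->is_file_cache && !page->dirty) {
			*pp = page->next;	/* unlink from page_list */
			page->next = clean;
			clean = page;
		} else {
			pp = &page->next;
		}
	}

	/* Pass 2: reclaim the private list; anything left over would be
	 * spliced back onto page_list (nothing is left in this toy). */
	return toy_shrink_list(&clean);
}

int main(void)
{
	struct toy_page p2 = { 2, 1, 1, NULL };		/* dirty file page, kept */
	struct toy_page p1 = { 1, 1, 0, &p2 };		/* clean file page, dropped */
	struct toy_page *list = &p1;

	printf("reclaimed %d pages\n", toy_reclaim_clean(&list));
	return 0;
}

The point of the private clean_pages list is that the expensive reclaim step only ever sees pages that are cheap to drop.
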
@@ -978,8 +1009,8 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
 	if (!PageLRU(page))
 		return ret;
 
-	/* Do not give back unevictable pages for compaction */
-	if (PageUnevictable(page))
+	/* Compaction should not handle unevictable pages but CMA can do so */
+	if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
 		return ret;
 
 	ret = -EBUSY;
@@ -1186,7 +1217,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 
 		VM_BUG_ON(PageLRU(page));
 		list_del(&page->lru);
-		if (unlikely(!page_evictable(page, NULL))) {
+		if (unlikely(!page_evictable(page))) {
 			spin_unlock_irq(&zone->lru_lock);
 			putback_lru_page(page);
 			spin_lock_irq(&zone->lru_lock);
@@ -1278,8 +1309,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	if (nr_taken == 0)
 		return 0;
 
-	nr_reclaimed = shrink_page_list(&page_list, zone, sc,
-					&nr_dirty, &nr_writeback);
+	nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
+					&nr_dirty, &nr_writeback, false);
 
 	spin_lock_irq(&zone->lru_lock);
 
@@ -1439,7 +1470,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
 
-		if (unlikely(!page_evictable(page, NULL))) {
+		if (unlikely(!page_evictable(page))) {
 			putback_lru_page(page);
 			continue;
 		}
@@ -1729,6 +1760,28 @@ static bool in_reclaim_compaction(struct scan_control *sc)
 	return false;
 }
 
+#ifdef CONFIG_COMPACTION
+/*
+ * If compaction is deferred for sc->order then scale the number of pages
+ * reclaimed based on the number of consecutive allocation failures
+ */
+static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
+			struct lruvec *lruvec, struct scan_control *sc)
+{
+	struct zone *zone = lruvec_zone(lruvec);
+
+	if (zone->compact_order_failed <= sc->order)
+		pages_for_compaction <<= zone->compact_defer_shift;
+	return pages_for_compaction;
+}
+#else
+static unsigned long scale_for_compaction(unsigned long pages_for_compaction,
+			struct lruvec *lruvec, struct scan_control *sc)
+{
+	return pages_for_compaction;
+}
+#endif
+
 /*
  * Reclaim/compaction is used for high-order allocation requests. It reclaims
  * order-0 pages before compacting the zone. should_continue_reclaim() returns
@@ -1776,6 +1829,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
 	 * inactive lists are large enough, continue reclaiming
 	 */
 	pages_for_compaction = (2UL << sc->order);
+
+	pages_for_compaction = scale_for_compaction(pages_for_compaction,
+						    lruvec, sc);
 	inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
 	if (nr_swap_pages > 0)
 		inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
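
scale_for_compaction(), added a few hunks up and wired in here, left-shifts the baseline reclaim target (2UL << sc->order) by the zone's compact_defer_shift whenever compaction is being deferred for this order, so should_continue_reclaim() keeps reclaiming for longer before compaction is retried. The growth is easy to see with a few concrete numbers (plain arithmetic, no kernel API involved):

/* Shows how the reclaim target grows as compaction keeps being deferred:
 * pages_for_compaction starts at 2 << order and doubles per deferral level.
 * Toy numbers only.
 */
#include <stdio.h>

int main(void)
{
	unsigned long order = 3;		/* a high-order allocation */
	unsigned long base = 2UL << order;	/* 16 pages */

	for (unsigned int defer_shift = 0; defer_shift <= 6; defer_shift++) {
		unsigned long scaled = base << defer_shift;

		printf("compact_defer_shift=%u -> reclaim at least %lu pages\n",
		       defer_shift, scaled);
	}
	return 0;
}

Each additional deferral level doubles the target, so a repeatedly failing order-3 request moves from 16 pages at shift 0 up to 1024 at shift 6.
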
@@ -2839,6 +2895,14 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	 */
 	set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
 
+	/*
+	 * Compaction records what page blocks it recently failed to
+	 * isolate pages from and skips them in the future scanning.
+	 * When kswapd is going to sleep, it is reasonable to assume
+	 * that pages and compaction may succeed so reset the cache.
+	 */
+	reset_isolation_suitable(pgdat);
+
 	if (!kthread_should_stop())
 		schedule();
 
@@ -3101,9 +3165,9 @@ int kswapd_run(int nid)
 	if (IS_ERR(pgdat->kswapd)) {
 		/* failure at boot is fatal */
 		BUG_ON(system_state == SYSTEM_BOOTING);
-		printk("Failed to start kswapd on node %d\n",nid);
 		pgdat->kswapd = NULL;
-		ret = -1;
+		pr_err("Failed to start kswapd on node %d\n", nid);
+		ret = PTR_ERR(pgdat->kswapd);
 	}
 	return ret;
 }
@@ -3350,27 +3414,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 /*
  * page_evictable - test whether a page is evictable
  * @page: the page to test
- * @vma: the VMA in which the page is or will be mapped, may be NULL
  *
  * Test whether page is evictable--i.e., should be placed on active/inactive
- * lists vs unevictable list. The vma argument is !NULL when called from the
- * fault path to determine how to instantate a new page.
+ * lists vs unevictable list.
  *
  * Reasons page might not be evictable:
  * (1) page's mapping marked unevictable
  * (2) page is part of an mlocked VMA
  *
  */
-int page_evictable(struct page *page, struct vm_area_struct *vma)
+int page_evictable(struct page *page)
 {
-
-	if (mapping_unevictable(page_mapping(page)))
-		return 0;
-
-	if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
-		return 0;
-
-	return 1;
+	return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
 }
 
 #ifdef CONFIG_SHMEM
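
The page_evictable() rewrite above collapses two early-return tests into a single expression and drops the vma argument, which every caller touched by this diff was passing as NULL anyway. For the two documented reasons a page can be unevictable, the before/after logic is equivalent, as a trivial truth-table check shows (toy flags stand in for the real mapping and page tests; the vma/mlocked_vma_newpage() clause is omitted since callers passed vma == NULL):

/* Before/after shape of the evictability test.  The two predicates are
 * stand-ins for mapping_unevictable(page_mapping(page)) and
 * PageMlocked(page); nothing here touches real mm structures.
 */
#include <stdio.h>
#include <stdbool.h>

struct toy_page {
	bool mapping_unevictable;
	bool mlocked;
};

/* Old style: a chain of early returns. */
static int page_evictable_old(const struct toy_page *page)
{
	if (page->mapping_unevictable)
		return 0;
	if (page->mlocked)
		return 0;
	return 1;
}

/* New style: one expression, same truth table. */
static int page_evictable_new(const struct toy_page *page)
{
	return !page->mapping_unevictable && !page->mlocked;
}

int main(void)
{
	for (int m = 0; m < 2; m++)
		for (int l = 0; l < 2; l++) {
			struct toy_page page = { m, l };

			printf("unevictable_mapping=%d mlocked=%d -> old=%d new=%d\n",
			       m, l, page_evictable_old(&page),
			       page_evictable_new(&page));
		}
	return 0;
}

Both versions agree on all four combinations.
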
@@ -3408,7 +3463,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
 		if (!PageLRU(page) || !PageUnevictable(page))
 			continue;
 
-		if (page_evictable(page, NULL)) {
+		if (page_evictable(page)) {
 			enum lru_list lru = page_lru_base_type(page);
 
 			VM_BUG_ON(PageActive(page));
diff --git a/mm/vmstat.c b/mm/vmstat.c
index b3e3b9d525d0..c7370579111b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -495,6 +495,18 @@ void refresh_cpu_vm_stats(int cpu)
 			atomic_long_add(global_diff[i], &vm_stat[i]);
 }
 
+void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
+{
+	int i;
+
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		if (pset->vm_stat_diff[i]) {
+			int v = pset->vm_stat_diff[i];
+			pset->vm_stat_diff[i] = 0;
+			atomic_long_add(v, &zone->vm_stat[i]);
+			atomic_long_add(v, &vm_stat[i]);
+		}
+}
 #endif
 
 #ifdef CONFIG_NUMA
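
drain_zonestat() above walks a per-cpu pageset's pending vm_stat_diff deltas, zeroes each one, and adds it to both the zone-wide and the global counter, so no counts are lost when a pageset is being torn down. A compact sketch of that fold with plain longs standing in for atomic_long_t (all names and sizes here are illustrative):

/* Toy version of folding per-CPU stat deltas into shared counters. */
#include <stdio.h>

#define NR_ITEMS 3

struct toy_pageset {
	long vm_stat_diff[NR_ITEMS];	/* per-CPU pending deltas */
};

struct toy_zone {
	long vm_stat[NR_ITEMS];		/* zone-wide counters */
};

static long global_stat[NR_ITEMS];	/* system-wide counters */

static void toy_drain_zonestat(struct toy_zone *zone, struct toy_pageset *pset)
{
	for (int i = 0; i < NR_ITEMS; i++)
		if (pset->vm_stat_diff[i]) {
			long v = pset->vm_stat_diff[i];

			pset->vm_stat_diff[i] = 0;	/* delta is consumed */
			zone->vm_stat[i] += v;
			global_stat[i] += v;
		}
}

int main(void)
{
	struct toy_zone zone = { { 100, 50, 7 } };
	struct toy_pageset pset = { { 3, -2, 0 } };

	toy_drain_zonestat(&zone, &pset);
	for (int i = 0; i < NR_ITEMS; i++)
		printf("item %d: zone=%ld global=%ld pending=%ld\n",
		       i, zone.vm_stat[i], global_stat[i],
		       pset.vm_stat_diff[i]);
	return 0;
}

The important property is that each delta is consumed exactly once: it is read, cleared, and only then applied to the shared counters.
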
@@ -722,6 +734,7 @@ const char * const vmstat_text[] = {
722 "numa_other", 734 "numa_other",
723#endif 735#endif
724 "nr_anon_transparent_hugepages", 736 "nr_anon_transparent_hugepages",
737 "nr_free_cma",
725 "nr_dirty_threshold", 738 "nr_dirty_threshold",
726 "nr_dirty_background_threshold", 739 "nr_dirty_background_threshold",
727 740
@@ -781,7 +794,6 @@ const char * const vmstat_text[] = {
781 "unevictable_pgs_munlocked", 794 "unevictable_pgs_munlocked",
782 "unevictable_pgs_cleared", 795 "unevictable_pgs_cleared",
783 "unevictable_pgs_stranded", 796 "unevictable_pgs_stranded",
784 "unevictable_pgs_mlockfreed",
785 797
786#ifdef CONFIG_TRANSPARENT_HUGEPAGE 798#ifdef CONFIG_TRANSPARENT_HUGEPAGE
787 "thp_fault_alloc", 799 "thp_fault_alloc",