author    Ingo Molnar <mingo@elte.hu>  2011-02-14 05:55:18 -0500
committer Ingo Molnar <mingo@elte.hu>  2011-02-14 05:55:18 -0500
commit    d2137d5af4259f50c19addb8246a186c9ffac325 (patch)
tree      2f7e309f9cf8ef2f2698532c226edda38021fe69 /mm
parent    f005fe12b90c5b9fe180a09209a893e09affa8aa (diff)
parent    795abaf1e4e188c4171e3cd3dbb11a9fcacaf505 (diff)
Merge branch 'linus' into x86/bootmem
Conflicts:
	arch/x86/mm/numa_64.c

Merge reason: fix the conflict, update to latest -rc and pick up this
dependent fix from Yinghai:

  e6d2e2b2b1e1: memblock: don't adjust size in memblock_find_base()

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 40
-rw-r--r--  mm/Makefile | 3
-rw-r--r--  mm/compaction.c | 186
-rw-r--r--  mm/dmapool.c | 16
-rw-r--r--  mm/filemap.c | 25
-rw-r--r--  mm/huge_memory.c | 2354
-rw-r--r--  mm/hugetlb.c | 114
-rw-r--r--  mm/internal.h | 7
-rw-r--r--  mm/kmemleak-test.c | 6
-rw-r--r--  mm/kmemleak.c | 13
-rw-r--r--  mm/ksm.c | 88
-rw-r--r--  mm/madvise.c | 10
-rw-r--r--  mm/memblock.c | 10
-rw-r--r--  mm/memcontrol.c | 431
-rw-r--r--  mm/memory-failure.c | 118
-rw-r--r--  mm/memory.c | 360
-rw-r--r--  mm/memory_hotplug.c | 52
-rw-r--r--  mm/mempolicy.c | 26
-rw-r--r--  mm/migrate.c | 134
-rw-r--r--  mm/mincore.c | 7
-rw-r--r--  mm/mlock.c | 170
-rw-r--r--  mm/mmap.c | 33
-rw-r--r--  mm/mmu_notifier.c | 20
-rw-r--r--  mm/mmzone.c | 21
-rw-r--r--  mm/mprotect.c | 20
-rw-r--r--  mm/mremap.c | 9
-rw-r--r--  mm/nommu.c | 35
-rw-r--r--  mm/page-writeback.c | 11
-rw-r--r--  mm/page_alloc.c | 216
-rw-r--r--  mm/pagewalk.c | 6
-rw-r--r--  mm/percpu-vm.c | 2
-rw-r--r--  mm/percpu.c | 12
-rw-r--r--  mm/pgtable-generic.c | 121
-rw-r--r--  mm/rmap.c | 93
-rw-r--r--  mm/shmem.c | 9
-rw-r--r--  mm/slab.c | 82
-rw-r--r--  mm/slob.c | 5
-rw-r--r--  mm/slub.c | 89
-rw-r--r--  mm/sparse-vmemmap.c | 2
-rw-r--r--  mm/sparse.c | 4
-rw-r--r--  mm/swap.c | 131
-rw-r--r--  mm/swap_state.c | 6
-rw-r--r--  mm/swapfile.c | 9
-rw-r--r--  mm/truncate.c | 15
-rw-r--r--  mm/util.c | 21
-rw-r--r--  mm/vmalloc.c | 118
-rw-r--r--  mm/vmscan.c | 435
-rw-r--r--  mm/vmstat.c | 206
48 files changed, 4789 insertions(+), 1112 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index c2c8a4a11898..e9c0c61f2ddd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS
 config COMPACTION
 	bool "Allow for memory compaction"
 	select MIGRATION
-	depends on EXPERIMENTAL && HUGETLB_PAGE && MMU
+	depends on MMU
 	help
 	  Allows the compaction of memory for the allocation of huge pages.
 
@@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS
 
 	  See Documentation/nommu-mmap.txt for more information.
 
+config TRANSPARENT_HUGEPAGE
+	bool "Transparent Hugepage Support"
+	depends on X86 && MMU
+	select COMPACTION
+	help
+	  Transparent Hugepages allows the kernel to use huge pages and
+	  huge tlb transparently to the applications whenever possible.
+	  This feature can improve computing performance to certain
+	  applications by speeding up page faults during memory
+	  allocation, by reducing the number of tlb misses and by speeding
+	  up the pagetable walking.
+
+	  If memory constrained on embedded, you may want to say N.
+
+choice
+	prompt "Transparent Hugepage Support sysfs defaults"
+	depends on TRANSPARENT_HUGEPAGE
+	default TRANSPARENT_HUGEPAGE_ALWAYS
+	help
+	  Selects the sysfs defaults for Transparent Hugepage Support.
+
+	config TRANSPARENT_HUGEPAGE_ALWAYS
+		bool "always"
+	help
+	  Enabling Transparent Hugepage always, can increase the
+	  memory footprint of applications without a guaranteed
+	  benefit but it will work automatically for all applications.
+
+	config TRANSPARENT_HUGEPAGE_MADVISE
+		bool "madvise"
+	help
+	  Enabling Transparent Hugepage madvise, will only provide a
+	  performance improvement benefit to the applications using
+	  madvise(MADV_HUGEPAGE) but it won't risk to increase the
+	  memory footprint of applications without a guaranteed
+	  benefit.
+endchoice
+
 #
 # UP and nommu archs use km based percpu allocator
 #
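
The TRANSPARENT_HUGEPAGE_MADVISE default described in the help text above only applies huge pages to ranges that opt in with madvise(MADV_HUGEPAGE). A minimal user-space sketch of that opt-in follows; it is illustrative only and not part of this patch, and the fallback #define assumes the kernel's ABI value in case an older libc header lacks the symbol.

#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14		/* kernel ABI value; older libc headers may lack it */
#endif

#define LEN (16UL << 20)		/* 16 MB, a multiple of the 2 MB huge page size */

int main(void)
{
	/* Anonymous private memory is what THP (and khugepaged) operate on. */
	void *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Hint that this range should be backed by huge pages when possible. */
	if (madvise(p, LEN, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");
	return 0;
}
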
diff --git a/mm/Makefile b/mm/Makefile
index f73f75a29f82..2b1b575ae712 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,7 @@
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
-			   vmalloc.o pagewalk.o
+			   vmalloc.o pagewalk.o pgtable-generic.o
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
@@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 4d709ee59013..8be430b812de 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,9 @@
 #include <linux/sysfs.h>
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/compaction.h>
+
 /*
  * compact_control is used to track pages being migrated and the free pages
  * they are being migrated to during memory compaction. The free_pfn starts
@@ -30,6 +33,7 @@ struct compact_control {
 	unsigned long nr_migratepages;	/* Number of pages to migrate */
 	unsigned long free_pfn;		/* isolate_freepages search base */
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
+	bool sync;			/* Synchronous migration */
 
 	/* Account for isolated anon and file pages */
 	unsigned long nr_anon;
@@ -38,6 +42,8 @@ struct compact_control {
 	unsigned int order;		/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
+
+	int compact_mode;
 };
 
 static unsigned long release_freepages(struct list_head *freelist)
@@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
 				struct list_head *freelist)
 {
 	unsigned long zone_end_pfn, end_pfn;
-	int total_isolated = 0;
+	int nr_scanned = 0, total_isolated = 0;
 	struct page *cursor;
 
 	/* Get the last PFN we should scan for free pages at */
@@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
 
 		if (!pfn_valid_within(blockpfn))
 			continue;
+		nr_scanned++;
 
 		if (!PageBuddy(page))
 			continue;
@@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
 		}
 	}
 
+	trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
 	return total_isolated;
 }
 
@@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
 					struct compact_control *cc)
 {
 	unsigned long low_pfn, end_pfn;
+	unsigned long last_pageblock_nr = 0, pageblock_nr;
+	unsigned long nr_scanned = 0, nr_isolated = 0;
 	struct list_head *migratelist = &cc->migratepages;
 
 	/* Do not scan outside zone boundaries */
@@ -266,21 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone,
 		struct page *page;
 		if (!pfn_valid_within(low_pfn))
 			continue;
+		nr_scanned++;
 
 		/* Get the page and skip if free */
 		page = pfn_to_page(low_pfn);
 		if (PageBuddy(page))
 			continue;
 
+		/*
+		 * For async migration, also only scan in MOVABLE blocks. Async
+		 * migration is optimistic to see if the minimum amount of work
+		 * satisfies the allocation
+		 */
+		pageblock_nr = low_pfn >> pageblock_order;
+		if (!cc->sync && last_pageblock_nr != pageblock_nr &&
+				get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
+			low_pfn += pageblock_nr_pages;
+			low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
+			last_pageblock_nr = pageblock_nr;
+			continue;
+		}
+
+		if (!PageLRU(page))
+			continue;
+
+		/*
+		 * PageLRU is set, and lru_lock excludes isolation,
+		 * splitting and collapsing (collapsing has already
+		 * happened if PageLRU is set).
+		 */
+		if (PageTransHuge(page)) {
+			low_pfn += (1 << compound_order(page)) - 1;
+			continue;
+		}
+
 		/* Try isolate the page */
 		if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
 			continue;
 
+		VM_BUG_ON(PageTransCompound(page));
+
 		/* Successfully isolated */
 		del_page_from_lru_list(zone, page, page_lru(page));
 		list_add(&page->lru, migratelist);
-		mem_cgroup_del_lru(page);
 		cc->nr_migratepages++;
+		nr_isolated++;
 
 		/* Avoid isolating too much */
 		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
@@ -292,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
 	spin_unlock_irq(&zone->lru_lock);
 	cc->migrate_pfn = low_pfn;
 
+	trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+
 	return cc->nr_migratepages;
 }
 
@@ -342,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc)
 }
 
 static int compact_finished(struct zone *zone,
 			    struct compact_control *cc)
 {
 	unsigned int order;
-	unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
+	unsigned long watermark;
 
 	if (fatal_signal_pending(current))
 		return COMPACT_PARTIAL;
@@ -355,12 +397,31 @@ static int compact_finished(struct zone *zone,
 		return COMPACT_COMPLETE;
 
 	/* Compaction run is not finished if the watermark is not met */
+	if (cc->compact_mode != COMPACT_MODE_KSWAPD)
+		watermark = low_wmark_pages(zone);
+	else
+		watermark = high_wmark_pages(zone);
+	watermark += (1 << cc->order);
+
 	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
 		return COMPACT_CONTINUE;
 
+	/*
+	 * order == -1 is expected when compacting via
+	 * /proc/sys/vm/compact_memory
+	 */
 	if (cc->order == -1)
 		return COMPACT_CONTINUE;
 
+	/*
+	 * Generating only one page of the right order is not enough
+	 * for kswapd, we must continue until we're above the high
+	 * watermark as a pool for high order GFP_ATOMIC allocations
+	 * too.
+	 */
+	if (cc->compact_mode == COMPACT_MODE_KSWAPD)
+		return COMPACT_CONTINUE;
+
 	/* Direct compactor: Is a suitable page free? */
 	for (order = cc->order; order < MAX_ORDER; order++) {
 		/* Job done if page is free of the right migratetype */
@@ -375,10 +436,69 @@ static int compact_finished(struct zone *zone,
 	return COMPACT_CONTINUE;
 }
 
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ *   COMPACT_SKIPPED  - If there are too few free pages for compaction
+ *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
+ *   COMPACT_CONTINUE - If compaction should run now
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+	int fragindex;
+	unsigned long watermark;
+
+	/*
+	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
+	 * This is because during migration, copies of pages need to be
+	 * allocated and for a short time, the footprint is higher
+	 */
+	watermark = low_wmark_pages(zone) + (2UL << order);
+	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+		return COMPACT_SKIPPED;
+
+	/*
+	 * order == -1 is expected when compacting via
+	 * /proc/sys/vm/compact_memory
+	 */
+	if (order == -1)
+		return COMPACT_CONTINUE;
+
+	/*
+	 * fragmentation index determines if allocation failures are due to
+	 * low memory or external fragmentation
+	 *
+	 * index of -1 implies allocations might succeed depending on watermarks
+	 * index towards 0 implies failure is due to lack of memory
+	 * index towards 1000 implies failure is due to fragmentation
+	 *
+	 * Only compact if a failure would be due to fragmentation.
+	 */
+	fragindex = fragmentation_index(zone, order);
+	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+		return COMPACT_SKIPPED;
+
+	if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+		return COMPACT_PARTIAL;
+
+	return COMPACT_CONTINUE;
+}
+
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	int ret;
 
+	ret = compaction_suitable(zone, cc->order);
+	switch (ret) {
+	case COMPACT_PARTIAL:
+	case COMPACT_SKIPPED:
+		/* Compaction is likely to fail */
+		return ret;
+	case COMPACT_CONTINUE:
+		/* Fall through to compaction */
+		;
+	}
+
 	/* Setup to move all movable pages to the end of the zone */
 	cc->migrate_pfn = zone->zone_start_pfn;
 	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
@@ -394,7 +514,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 
 		nr_migrate = cc->nr_migratepages;
 		migrate_pages(&cc->migratepages, compaction_alloc,
-				(unsigned long)cc, 0);
+				(unsigned long)cc, false,
+				cc->sync);
 		update_nr_listpages(cc);
 		nr_remaining = cc->nr_migratepages;
 
@@ -402,6 +523,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 		count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
 		if (nr_remaining)
 			count_vm_events(COMPACTPAGEFAILED, nr_remaining);
+		trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
+						nr_remaining);
 
 		/* Release LRU pages not migrated */
 		if (!list_empty(&cc->migratepages)) {
@@ -418,8 +541,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	return ret;
 }
 
-static unsigned long compact_zone_order(struct zone *zone,
-						int order, gfp_t gfp_mask)
+unsigned long compact_zone_order(struct zone *zone,
+						int order, gfp_t gfp_mask,
+						bool sync,
+						int compact_mode)
 {
 	struct compact_control cc = {
 		.nr_freepages = 0,
@@ -427,6 +552,8 @@ static unsigned long compact_zone_order(struct zone *zone,
 		.order = order,
 		.migratetype = allocflags_to_migratetype(gfp_mask),
 		.zone = zone,
+		.sync = sync,
+		.compact_mode = compact_mode,
 	};
 	INIT_LIST_HEAD(&cc.freepages);
 	INIT_LIST_HEAD(&cc.migratepages);
@@ -442,16 +569,17 @@ int sysctl_extfrag_threshold = 500;
  * @order: The order of the current allocation
  * @gfp_mask: The GFP mask of the current allocation
  * @nodemask: The allowed nodes to allocate from
+ * @sync: Whether migration is synchronous or not
  *
  * This is the main entry point for direct page compaction.
  */
 unsigned long try_to_compact_pages(struct zonelist *zonelist,
-			int order, gfp_t gfp_mask, nodemask_t *nodemask)
+			int order, gfp_t gfp_mask, nodemask_t *nodemask,
+			bool sync)
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
 	int may_perform_io = gfp_mask & __GFP_IO;
-	unsigned long watermark;
 	struct zoneref *z;
 	struct zone *zone;
 	int rc = COMPACT_SKIPPED;
@@ -461,7 +589,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	 * made because an assumption is made that the page allocator can satisfy
 	 * the "cheaper" orders without taking special steps
 	 */
-	if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io)
+	if (!order || !may_enter_fs || !may_perform_io)
 		return rc;
 
 	count_vm_event(COMPACTSTALL);
@@ -469,43 +597,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	/* Compact each zone in the list */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
 								nodemask) {
-		int fragindex;
 		int status;
 
-		/*
-		 * Watermarks for order-0 must be met for compaction. Note
-		 * the 2UL. This is because during migration, copies of
-		 * pages need to be allocated and for a short time, the
-		 * footprint is higher
-		 */
-		watermark = low_wmark_pages(zone) + (2UL << order);
-		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
-			continue;
-
-		/*
-		 * fragmentation index determines if allocation failures are
-		 * due to low memory or external fragmentation
-		 *
-		 * index of -1 implies allocations might succeed depending
-		 * on watermarks
-		 * index towards 0 implies failure is due to lack of memory
-		 * index towards 1000 implies failure is due to fragmentation
-		 *
-		 * Only compact if a failure would be due to fragmentation.
-		 */
-		fragindex = fragmentation_index(zone, order);
-		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
-			continue;
-
-		if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
-			rc = COMPACT_PARTIAL;
-			break;
-		}
-
-		status = compact_zone_order(zone, order, gfp_mask);
+		status = compact_zone_order(zone, order, gfp_mask, sync,
+						COMPACT_MODE_DIRECT_RECLAIM);
 		rc = max(status, rc);
 
-		if (zone_watermark_ok(zone, order, watermark, 0, 0))
+		/* If a normal allocation would succeed, stop compacting */
+		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
 			break;
 	}
 
@@ -532,6 +631,7 @@ static int compact_node(int nid)
 			.nr_freepages = 0,
 			.nr_migratepages = 0,
 			.order = -1,
+			.compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
 		};
 
 		zone = &pgdat->node_zones[zoneid];
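
compaction_suitable() above folds the per-zone watermark and fragmentation-index checks that try_to_compact_pages() used to open-code into a single predicate, and callers are now expected to branch on its three return codes before doing any scanning, as the new switch in compact_zone() shows. A condensed, illustrative restatement of that calling convention follows; the helper name is invented for the example, while the symbols are the ones introduced by this patch.

/* Illustrative only: the three verdicts a compaction_suitable() caller handles. */
static bool should_run_compaction(struct zone *zone, int order)
{
	switch (compaction_suitable(zone, order)) {
	case COMPACT_SKIPPED:		/* too few free pages to migrate into */
	case COMPACT_PARTIAL:		/* the allocation should already succeed */
		return false;		/* either way, scanning would be wasted work */
	case COMPACT_CONTINUE:
	default:
		return true;		/* failure looks fragmentation-bound: compact */
	}
}
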
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 4df2de77e069..03bf3bb4519a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 	if (mem_flags & __GFP_WAIT) {
 		DECLARE_WAITQUEUE(wait, current);
 
-		__set_current_state(TASK_INTERRUPTIBLE);
+		__set_current_state(TASK_UNINTERRUPTIBLE);
 		__add_wait_queue(&pool->waitq, &wait);
 		spin_unlock_irqrestore(&pool->lock, flags);
 
@@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc);
 
 static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
 {
-	unsigned long flags;
 	struct dma_page *page;
 
-	spin_lock_irqsave(&pool->lock, flags);
 	list_for_each_entry(page, &pool->page_list, page_list) {
 		if (dma < page->dma)
 			continue;
 		if (dma < (page->dma + pool->allocation))
-			goto done;
+			return page;
 	}
-	page = NULL;
- done:
-	spin_unlock_irqrestore(&pool->lock, flags);
-	return page;
+	return NULL;
 }
 
 /**
@@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 	unsigned long flags;
 	unsigned int offset;
 
+	spin_lock_irqsave(&pool->lock, flags);
 	page = pool_find_page(pool, dma);
 	if (!page) {
+		spin_unlock_irqrestore(&pool->lock, flags);
 		if (pool->dev)
 			dev_err(pool->dev,
 				"dma_pool_free %s, %p/%lx (bad dma)\n",
@@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 	offset = vaddr - page->vaddr;
 #ifdef DMAPOOL_DEBUG
 	if ((dma - page->dma) != offset) {
+		spin_unlock_irqrestore(&pool->lock, flags);
 		if (pool->dev)
 			dev_err(pool->dev,
 				"dma_pool_free %s, %p (bad vaddr)/%Lx\n",
@@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 			chain = *(int *)(page->vaddr + chain);
 			continue;
 		}
+		spin_unlock_irqrestore(&pool->lock, flags);
 		if (pool->dev)
 			dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
 				"already free\n", pool->name,
@@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
 	memset(vaddr, POOL_POISON_FREED, pool->size);
 #endif
 
-	spin_lock_irqsave(&pool->lock, flags);
 	page->in_use--;
 	*(int *)vaddr = page->offset;
 	page->offset = offset;
diff --git a/mm/filemap.c b/mm/filemap.c
index ea89840fc65f..83a45d35468b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -102,9 +102,6 @@
  *  ->inode_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
  *
- *  ->task->proc_lock
- *    ->dcache_lock		(proc_pid_lookup)
- *
  *  (code doesn't rely on that order, so you could switch it around)
  *  ->tasklist_lock		(memory_failure, collect_procs_ao)
  *    ->i_mmap_lock
@@ -143,13 +140,18 @@ void __remove_from_page_cache(struct page *page)
 void remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
+	void (*freepage)(struct page *);
 
 	BUG_ON(!PageLocked(page));
 
+	freepage = mapping->a_ops->freepage;
 	spin_lock_irq(&mapping->tree_lock);
 	__remove_from_page_cache(page);
 	spin_unlock_irq(&mapping->tree_lock);
 	mem_cgroup_uncharge_cache_page(page);
+
+	if (freepage)
+		freepage(page);
 }
 EXPORT_SYMBOL(remove_from_page_cache);
 
@@ -296,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
 				continue;
 
 			wait_on_page_writeback(page);
-			if (PageError(page))
+			if (TestClearPageError(page))
 				ret = -EIO;
 		}
 		pagevec_release(&pvec);
@@ -835,9 +837,6 @@ repeat:
 		if (radix_tree_deref_retry(page))
 			goto restart;
 
-		if (page->mapping == NULL || page->index != index)
-			break;
-
 		if (!page_cache_get_speculative(page))
 			goto repeat;
 
@@ -847,6 +846,16 @@ repeat:
 			goto repeat;
 		}
 
+		/*
+		 * must check mapping and index after taking the ref.
+		 * otherwise we can get both false positives and false
+		 * negatives, which is just confusing to the caller.
+		 */
+		if (page->mapping == NULL || page->index != index) {
+			page_cache_release(page);
+			break;
+		}
+
 		pages[ret] = page;
 		ret++;
 		index++;
@@ -2218,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
 		gfp_notmask = __GFP_FS;
 repeat:
 	page = find_lock_page(mapping, index);
-	if (likely(page))
+	if (page)
 		return page;
 
 	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
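
The comment added in find_get_pages() above captures the lockless page-cache rule: after a speculative reference is taken, page->mapping and page->index must be re-checked, otherwise a page that was freed and reused concurrently could be returned for the wrong slot. A condensed, illustrative restatement of that pattern follows; the helper name is invented for the example, the calls are the ones used in the hunk above.

/*
 * Illustrative only: the revalidate-after-speculative-get pattern used above.
 * Returns the page with an elevated refcount, or NULL if the caller should
 * treat the slot as a miss (or retry the lookup).
 */
static struct page *get_and_revalidate(struct address_space *mapping,
				       pgoff_t index, struct page *page)
{
	if (!page_cache_get_speculative(page))
		return NULL;			/* page was being freed; retry */

	if (page->mapping != mapping || page->index != index) {
		page_cache_release(page);	/* stale hit: not this slot anymore */
		return NULL;
	}
	return page;
}
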
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000000..e62ddb8f24b6
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2354 @@
1/*
2 * Copyright (C) 2009 Red Hat, Inc.
3 *
4 * This work is licensed under the terms of the GNU GPL, version 2. See
5 * the COPYING file in the top-level directory.
6 */
7
8#include <linux/mm.h>
9#include <linux/sched.h>
10#include <linux/highmem.h>
11#include <linux/hugetlb.h>
12#include <linux/mmu_notifier.h>
13#include <linux/rmap.h>
14#include <linux/swap.h>
15#include <linux/mm_inline.h>
16#include <linux/kthread.h>
17#include <linux/khugepaged.h>
18#include <linux/freezer.h>
19#include <linux/mman.h>
20#include <asm/tlb.h>
21#include <asm/pgalloc.h>
22#include "internal.h"
23
24/*
25 * By default transparent hugepage support is enabled for all mappings
26 * and khugepaged scans all mappings. Defrag is only invoked by
27 * khugepaged hugepage allocations and by page faults inside
28 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
29 * allocations.
30 */
31unsigned long transparent_hugepage_flags __read_mostly =
32#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
33 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
34#endif
35#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
36 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
37#endif
38 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
39 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
40
41/* default scan 8*512 pte (or vmas) every 30 second */
42static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
43static unsigned int khugepaged_pages_collapsed;
44static unsigned int khugepaged_full_scans;
45static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
46/* during fragmentation poll the hugepage allocator once every minute */
47static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
48static struct task_struct *khugepaged_thread __read_mostly;
49static DEFINE_MUTEX(khugepaged_mutex);
50static DEFINE_SPINLOCK(khugepaged_mm_lock);
51static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
52/*
53 * default collapse hugepages if there is at least one pte mapped like
54 * it would have happened if the vma was large enough during page
55 * fault.
56 */
57static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
58
59static int khugepaged(void *none);
60static int mm_slots_hash_init(void);
61static int khugepaged_slab_init(void);
62static void khugepaged_slab_free(void);
63
64#define MM_SLOTS_HASH_HEADS 1024
65static struct hlist_head *mm_slots_hash __read_mostly;
66static struct kmem_cache *mm_slot_cache __read_mostly;
67
68/**
69 * struct mm_slot - hash lookup from mm to mm_slot
70 * @hash: hash collision list
71 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
72 * @mm: the mm that this information is valid for
73 */
74struct mm_slot {
75 struct hlist_node hash;
76 struct list_head mm_node;
77 struct mm_struct *mm;
78};
79
80/**
81 * struct khugepaged_scan - cursor for scanning
82 * @mm_head: the head of the mm list to scan
83 * @mm_slot: the current mm_slot we are scanning
84 * @address: the next address inside that to be scanned
85 *
86 * There is only the one khugepaged_scan instance of this cursor structure.
87 */
88struct khugepaged_scan {
89 struct list_head mm_head;
90 struct mm_slot *mm_slot;
91 unsigned long address;
92} khugepaged_scan = {
93 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
94};
95
96
97static int set_recommended_min_free_kbytes(void)
98{
99 struct zone *zone;
100 int nr_zones = 0;
101 unsigned long recommended_min;
102 extern int min_free_kbytes;
103
104 if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
105 &transparent_hugepage_flags) &&
106 !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
107 &transparent_hugepage_flags))
108 return 0;
109
110 for_each_populated_zone(zone)
111 nr_zones++;
112
113 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
114 recommended_min = pageblock_nr_pages * nr_zones * 2;
115
116 /*
117 * Make sure that on average at least two pageblocks are almost free
118 * of another type, one for a migratetype to fall back to and a
119 * second to avoid subsequent fallbacks of other types There are 3
120 * MIGRATE_TYPES we care about.
121 */
122 recommended_min += pageblock_nr_pages * nr_zones *
123 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
124
125 /* don't ever allow to reserve more than 5% of the lowmem */
126 recommended_min = min(recommended_min,
127 (unsigned long) nr_free_buffer_pages() / 20);
128 recommended_min <<= (PAGE_SHIFT-10);
129
130 if (recommended_min > min_free_kbytes)
131 min_free_kbytes = recommended_min;
132 setup_per_zone_wmarks();
133 return 0;
134}
135late_initcall(set_recommended_min_free_kbytes);
136
137static int start_khugepaged(void)
138{
139 int err = 0;
140 if (khugepaged_enabled()) {
141 int wakeup;
142 if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
143 err = -ENOMEM;
144 goto out;
145 }
146 mutex_lock(&khugepaged_mutex);
147 if (!khugepaged_thread)
148 khugepaged_thread = kthread_run(khugepaged, NULL,
149 "khugepaged");
150 if (unlikely(IS_ERR(khugepaged_thread))) {
151 printk(KERN_ERR
152 "khugepaged: kthread_run(khugepaged) failed\n");
153 err = PTR_ERR(khugepaged_thread);
154 khugepaged_thread = NULL;
155 }
156 wakeup = !list_empty(&khugepaged_scan.mm_head);
157 mutex_unlock(&khugepaged_mutex);
158 if (wakeup)
159 wake_up_interruptible(&khugepaged_wait);
160
161 set_recommended_min_free_kbytes();
162 } else
163 /* wakeup to exit */
164 wake_up_interruptible(&khugepaged_wait);
165out:
166 return err;
167}
168
169#ifdef CONFIG_SYSFS
170
171static ssize_t double_flag_show(struct kobject *kobj,
172 struct kobj_attribute *attr, char *buf,
173 enum transparent_hugepage_flag enabled,
174 enum transparent_hugepage_flag req_madv)
175{
176 if (test_bit(enabled, &transparent_hugepage_flags)) {
177 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
178 return sprintf(buf, "[always] madvise never\n");
179 } else if (test_bit(req_madv, &transparent_hugepage_flags))
180 return sprintf(buf, "always [madvise] never\n");
181 else
182 return sprintf(buf, "always madvise [never]\n");
183}
184static ssize_t double_flag_store(struct kobject *kobj,
185 struct kobj_attribute *attr,
186 const char *buf, size_t count,
187 enum transparent_hugepage_flag enabled,
188 enum transparent_hugepage_flag req_madv)
189{
190 if (!memcmp("always", buf,
191 min(sizeof("always")-1, count))) {
192 set_bit(enabled, &transparent_hugepage_flags);
193 clear_bit(req_madv, &transparent_hugepage_flags);
194 } else if (!memcmp("madvise", buf,
195 min(sizeof("madvise")-1, count))) {
196 clear_bit(enabled, &transparent_hugepage_flags);
197 set_bit(req_madv, &transparent_hugepage_flags);
198 } else if (!memcmp("never", buf,
199 min(sizeof("never")-1, count))) {
200 clear_bit(enabled, &transparent_hugepage_flags);
201 clear_bit(req_madv, &transparent_hugepage_flags);
202 } else
203 return -EINVAL;
204
205 return count;
206}
207
208static ssize_t enabled_show(struct kobject *kobj,
209 struct kobj_attribute *attr, char *buf)
210{
211 return double_flag_show(kobj, attr, buf,
212 TRANSPARENT_HUGEPAGE_FLAG,
213 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
214}
215static ssize_t enabled_store(struct kobject *kobj,
216 struct kobj_attribute *attr,
217 const char *buf, size_t count)
218{
219 ssize_t ret;
220
221 ret = double_flag_store(kobj, attr, buf, count,
222 TRANSPARENT_HUGEPAGE_FLAG,
223 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
224
225 if (ret > 0) {
226 int err = start_khugepaged();
227 if (err)
228 ret = err;
229 }
230
231 if (ret > 0 &&
232 (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
233 &transparent_hugepage_flags) ||
234 test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
235 &transparent_hugepage_flags)))
236 set_recommended_min_free_kbytes();
237
238 return ret;
239}
240static struct kobj_attribute enabled_attr =
241 __ATTR(enabled, 0644, enabled_show, enabled_store);
242
243static ssize_t single_flag_show(struct kobject *kobj,
244 struct kobj_attribute *attr, char *buf,
245 enum transparent_hugepage_flag flag)
246{
247 if (test_bit(flag, &transparent_hugepage_flags))
248 return sprintf(buf, "[yes] no\n");
249 else
250 return sprintf(buf, "yes [no]\n");
251}
252static ssize_t single_flag_store(struct kobject *kobj,
253 struct kobj_attribute *attr,
254 const char *buf, size_t count,
255 enum transparent_hugepage_flag flag)
256{
257 if (!memcmp("yes", buf,
258 min(sizeof("yes")-1, count))) {
259 set_bit(flag, &transparent_hugepage_flags);
260 } else if (!memcmp("no", buf,
261 min(sizeof("no")-1, count))) {
262 clear_bit(flag, &transparent_hugepage_flags);
263 } else
264 return -EINVAL;
265
266 return count;
267}
268
269/*
270 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
271 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
272 * memory just to allocate one more hugepage.
273 */
274static ssize_t defrag_show(struct kobject *kobj,
275 struct kobj_attribute *attr, char *buf)
276{
277 return double_flag_show(kobj, attr, buf,
278 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
279 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
280}
281static ssize_t defrag_store(struct kobject *kobj,
282 struct kobj_attribute *attr,
283 const char *buf, size_t count)
284{
285 return double_flag_store(kobj, attr, buf, count,
286 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
287 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
288}
289static struct kobj_attribute defrag_attr =
290 __ATTR(defrag, 0644, defrag_show, defrag_store);
291
292#ifdef CONFIG_DEBUG_VM
293static ssize_t debug_cow_show(struct kobject *kobj,
294 struct kobj_attribute *attr, char *buf)
295{
296 return single_flag_show(kobj, attr, buf,
297 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
298}
299static ssize_t debug_cow_store(struct kobject *kobj,
300 struct kobj_attribute *attr,
301 const char *buf, size_t count)
302{
303 return single_flag_store(kobj, attr, buf, count,
304 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
305}
306static struct kobj_attribute debug_cow_attr =
307 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
308#endif /* CONFIG_DEBUG_VM */
309
310static struct attribute *hugepage_attr[] = {
311 &enabled_attr.attr,
312 &defrag_attr.attr,
313#ifdef CONFIG_DEBUG_VM
314 &debug_cow_attr.attr,
315#endif
316 NULL,
317};
318
319static struct attribute_group hugepage_attr_group = {
320 .attrs = hugepage_attr,
321};
322
323static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
324 struct kobj_attribute *attr,
325 char *buf)
326{
327 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
328}
329
330static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
331 struct kobj_attribute *attr,
332 const char *buf, size_t count)
333{
334 unsigned long msecs;
335 int err;
336
337 err = strict_strtoul(buf, 10, &msecs);
338 if (err || msecs > UINT_MAX)
339 return -EINVAL;
340
341 khugepaged_scan_sleep_millisecs = msecs;
342 wake_up_interruptible(&khugepaged_wait);
343
344 return count;
345}
346static struct kobj_attribute scan_sleep_millisecs_attr =
347 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
348 scan_sleep_millisecs_store);
349
350static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
351 struct kobj_attribute *attr,
352 char *buf)
353{
354 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
355}
356
357static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
358 struct kobj_attribute *attr,
359 const char *buf, size_t count)
360{
361 unsigned long msecs;
362 int err;
363
364 err = strict_strtoul(buf, 10, &msecs);
365 if (err || msecs > UINT_MAX)
366 return -EINVAL;
367
368 khugepaged_alloc_sleep_millisecs = msecs;
369 wake_up_interruptible(&khugepaged_wait);
370
371 return count;
372}
373static struct kobj_attribute alloc_sleep_millisecs_attr =
374 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
375 alloc_sleep_millisecs_store);
376
377static ssize_t pages_to_scan_show(struct kobject *kobj,
378 struct kobj_attribute *attr,
379 char *buf)
380{
381 return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
382}
383static ssize_t pages_to_scan_store(struct kobject *kobj,
384 struct kobj_attribute *attr,
385 const char *buf, size_t count)
386{
387 int err;
388 unsigned long pages;
389
390 err = strict_strtoul(buf, 10, &pages);
391 if (err || !pages || pages > UINT_MAX)
392 return -EINVAL;
393
394 khugepaged_pages_to_scan = pages;
395
396 return count;
397}
398static struct kobj_attribute pages_to_scan_attr =
399 __ATTR(pages_to_scan, 0644, pages_to_scan_show,
400 pages_to_scan_store);
401
402static ssize_t pages_collapsed_show(struct kobject *kobj,
403 struct kobj_attribute *attr,
404 char *buf)
405{
406 return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
407}
408static struct kobj_attribute pages_collapsed_attr =
409 __ATTR_RO(pages_collapsed);
410
411static ssize_t full_scans_show(struct kobject *kobj,
412 struct kobj_attribute *attr,
413 char *buf)
414{
415 return sprintf(buf, "%u\n", khugepaged_full_scans);
416}
417static struct kobj_attribute full_scans_attr =
418 __ATTR_RO(full_scans);
419
420static ssize_t khugepaged_defrag_show(struct kobject *kobj,
421 struct kobj_attribute *attr, char *buf)
422{
423 return single_flag_show(kobj, attr, buf,
424 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
425}
426static ssize_t khugepaged_defrag_store(struct kobject *kobj,
427 struct kobj_attribute *attr,
428 const char *buf, size_t count)
429{
430 return single_flag_store(kobj, attr, buf, count,
431 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
432}
433static struct kobj_attribute khugepaged_defrag_attr =
434 __ATTR(defrag, 0644, khugepaged_defrag_show,
435 khugepaged_defrag_store);
436
437/*
438 * max_ptes_none controls if khugepaged should collapse hugepages over
439 * any unmapped ptes in turn potentially increasing the memory
440 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
441 * reduce the available free memory in the system as it
442 * runs. Increasing max_ptes_none will instead potentially reduce the
443 * free memory in the system during the khugepaged scan.
444 */
445static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
446 struct kobj_attribute *attr,
447 char *buf)
448{
449 return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
450}
451static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
452 struct kobj_attribute *attr,
453 const char *buf, size_t count)
454{
455 int err;
456 unsigned long max_ptes_none;
457
458 err = strict_strtoul(buf, 10, &max_ptes_none);
459 if (err || max_ptes_none > HPAGE_PMD_NR-1)
460 return -EINVAL;
461
462 khugepaged_max_ptes_none = max_ptes_none;
463
464 return count;
465}
466static struct kobj_attribute khugepaged_max_ptes_none_attr =
467 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
468 khugepaged_max_ptes_none_store);
469
470static struct attribute *khugepaged_attr[] = {
471 &khugepaged_defrag_attr.attr,
472 &khugepaged_max_ptes_none_attr.attr,
473 &pages_to_scan_attr.attr,
474 &pages_collapsed_attr.attr,
475 &full_scans_attr.attr,
476 &scan_sleep_millisecs_attr.attr,
477 &alloc_sleep_millisecs_attr.attr,
478 NULL,
479};
480
481static struct attribute_group khugepaged_attr_group = {
482 .attrs = khugepaged_attr,
483 .name = "khugepaged",
484};
485#endif /* CONFIG_SYSFS */
486
487static int __init hugepage_init(void)
488{
489 int err;
490#ifdef CONFIG_SYSFS
491 static struct kobject *hugepage_kobj;
492#endif
493
494 err = -EINVAL;
495 if (!has_transparent_hugepage()) {
496 transparent_hugepage_flags = 0;
497 goto out;
498 }
499
500#ifdef CONFIG_SYSFS
501 err = -ENOMEM;
502 hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
503 if (unlikely(!hugepage_kobj)) {
504 printk(KERN_ERR "hugepage: failed kobject create\n");
505 goto out;
506 }
507
508 err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
509 if (err) {
510 printk(KERN_ERR "hugepage: failed register hugeage group\n");
511 goto out;
512 }
513
514 err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
515 if (err) {
516 printk(KERN_ERR "hugepage: failed register hugeage group\n");
517 goto out;
518 }
519#endif
520
521 err = khugepaged_slab_init();
522 if (err)
523 goto out;
524
525 err = mm_slots_hash_init();
526 if (err) {
527 khugepaged_slab_free();
528 goto out;
529 }
530
531 /*
532 * By default disable transparent hugepages on smaller systems,
533 * where the extra memory used could hurt more than TLB overhead
534 * is likely to save. The admin can still enable it through /sys.
535 */
536 if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
537 transparent_hugepage_flags = 0;
538
539 start_khugepaged();
540
541 set_recommended_min_free_kbytes();
542
543out:
544 return err;
545}
546module_init(hugepage_init)
547
548static int __init setup_transparent_hugepage(char *str)
549{
550 int ret = 0;
551 if (!str)
552 goto out;
553 if (!strcmp(str, "always")) {
554 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
555 &transparent_hugepage_flags);
556 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
557 &transparent_hugepage_flags);
558 ret = 1;
559 } else if (!strcmp(str, "madvise")) {
560 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
561 &transparent_hugepage_flags);
562 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
563 &transparent_hugepage_flags);
564 ret = 1;
565 } else if (!strcmp(str, "never")) {
566 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
567 &transparent_hugepage_flags);
568 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
569 &transparent_hugepage_flags);
570 ret = 1;
571 }
572out:
573 if (!ret)
574 printk(KERN_WARNING
575 "transparent_hugepage= cannot parse, ignored\n");
576 return ret;
577}
578__setup("transparent_hugepage=", setup_transparent_hugepage);
579
580static void prepare_pmd_huge_pte(pgtable_t pgtable,
581 struct mm_struct *mm)
582{
583 assert_spin_locked(&mm->page_table_lock);
584
585 /* FIFO */
586 if (!mm->pmd_huge_pte)
587 INIT_LIST_HEAD(&pgtable->lru);
588 else
589 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
590 mm->pmd_huge_pte = pgtable;
591}
592
593static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
594{
595 if (likely(vma->vm_flags & VM_WRITE))
596 pmd = pmd_mkwrite(pmd);
597 return pmd;
598}
599
600static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
601 struct vm_area_struct *vma,
602 unsigned long haddr, pmd_t *pmd,
603 struct page *page)
604{
605 int ret = 0;
606 pgtable_t pgtable;
607
608 VM_BUG_ON(!PageCompound(page));
609 pgtable = pte_alloc_one(mm, haddr);
610 if (unlikely(!pgtable)) {
611 mem_cgroup_uncharge_page(page);
612 put_page(page);
613 return VM_FAULT_OOM;
614 }
615
616 clear_huge_page(page, haddr, HPAGE_PMD_NR);
617 __SetPageUptodate(page);
618
619 spin_lock(&mm->page_table_lock);
620 if (unlikely(!pmd_none(*pmd))) {
621 spin_unlock(&mm->page_table_lock);
622 mem_cgroup_uncharge_page(page);
623 put_page(page);
624 pte_free(mm, pgtable);
625 } else {
626 pmd_t entry;
627 entry = mk_pmd(page, vma->vm_page_prot);
628 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
629 entry = pmd_mkhuge(entry);
630 /*
631 * The spinlocking to take the lru_lock inside
632 * page_add_new_anon_rmap() acts as a full memory
633 * barrier to be sure clear_huge_page writes become
634 * visible after the set_pmd_at() write.
635 */
636 page_add_new_anon_rmap(page, vma, haddr);
637 set_pmd_at(mm, haddr, pmd, entry);
638 prepare_pmd_huge_pte(pgtable, mm);
639 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
640 spin_unlock(&mm->page_table_lock);
641 }
642
643 return ret;
644}
645
646static inline gfp_t alloc_hugepage_gfpmask(int defrag)
647{
648 return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
649}
650
651static inline struct page *alloc_hugepage_vma(int defrag,
652 struct vm_area_struct *vma,
653 unsigned long haddr)
654{
655 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
656 HPAGE_PMD_ORDER, vma, haddr);
657}
658
659#ifndef CONFIG_NUMA
660static inline struct page *alloc_hugepage(int defrag)
661{
662 return alloc_pages(alloc_hugepage_gfpmask(defrag),
663 HPAGE_PMD_ORDER);
664}
665#endif
666
667int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
668 unsigned long address, pmd_t *pmd,
669 unsigned int flags)
670{
671 struct page *page;
672 unsigned long haddr = address & HPAGE_PMD_MASK;
673 pte_t *pte;
674
675 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
676 if (unlikely(anon_vma_prepare(vma)))
677 return VM_FAULT_OOM;
678 if (unlikely(khugepaged_enter(vma)))
679 return VM_FAULT_OOM;
680 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
681 vma, haddr);
682 if (unlikely(!page))
683 goto out;
684 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
685 put_page(page);
686 goto out;
687 }
688
689 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
690 }
691out:
692 /*
693 * Use __pte_alloc instead of pte_alloc_map, because we can't
694 * run pte_offset_map on the pmd, if an huge pmd could
695 * materialize from under us from a different thread.
696 */
697 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
698 return VM_FAULT_OOM;
699 /* if an huge pmd materialized from under us just retry later */
700 if (unlikely(pmd_trans_huge(*pmd)))
701 return 0;
702 /*
703 * A regular pmd is established and it can't morph into a huge pmd
704 * from under us anymore at this point because we hold the mmap_sem
705 * read mode and khugepaged takes it in write mode. So now it's
706 * safe to run pte_offset_map().
707 */
708 pte = pte_offset_map(pmd, address);
709 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
710}
711
712int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
713 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
714 struct vm_area_struct *vma)
715{
716 struct page *src_page;
717 pmd_t pmd;
718 pgtable_t pgtable;
719 int ret;
720
721 ret = -ENOMEM;
722 pgtable = pte_alloc_one(dst_mm, addr);
723 if (unlikely(!pgtable))
724 goto out;
725
726 spin_lock(&dst_mm->page_table_lock);
727 spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
728
729 ret = -EAGAIN;
730 pmd = *src_pmd;
731 if (unlikely(!pmd_trans_huge(pmd))) {
732 pte_free(dst_mm, pgtable);
733 goto out_unlock;
734 }
735 if (unlikely(pmd_trans_splitting(pmd))) {
736 /* split huge page running from under us */
737 spin_unlock(&src_mm->page_table_lock);
738 spin_unlock(&dst_mm->page_table_lock);
739 pte_free(dst_mm, pgtable);
740
741 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
742 goto out;
743 }
744 src_page = pmd_page(pmd);
745 VM_BUG_ON(!PageHead(src_page));
746 get_page(src_page);
747 page_dup_rmap(src_page);
748 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
749
750 pmdp_set_wrprotect(src_mm, addr, src_pmd);
751 pmd = pmd_mkold(pmd_wrprotect(pmd));
752 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
753 prepare_pmd_huge_pte(pgtable, dst_mm);
754
755 ret = 0;
756out_unlock:
757 spin_unlock(&src_mm->page_table_lock);
758 spin_unlock(&dst_mm->page_table_lock);
759out:
760 return ret;
761}
762
763/* no "address" argument so destroys page coloring of some arch */
764pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
765{
766 pgtable_t pgtable;
767
768 assert_spin_locked(&mm->page_table_lock);
769
770 /* FIFO */
771 pgtable = mm->pmd_huge_pte;
772 if (list_empty(&pgtable->lru))
773 mm->pmd_huge_pte = NULL;
774 else {
775 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
776 struct page, lru);
777 list_del(&pgtable->lru);
778 }
779 return pgtable;
780}
781
782static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
783 struct vm_area_struct *vma,
784 unsigned long address,
785 pmd_t *pmd, pmd_t orig_pmd,
786 struct page *page,
787 unsigned long haddr)
788{
789 pgtable_t pgtable;
790 pmd_t _pmd;
791 int ret = 0, i;
792 struct page **pages;
793
794 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
795 GFP_KERNEL);
796 if (unlikely(!pages)) {
797 ret |= VM_FAULT_OOM;
798 goto out;
799 }
800
801 for (i = 0; i < HPAGE_PMD_NR; i++) {
802 pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
803 vma, address);
804 if (unlikely(!pages[i] ||
805 mem_cgroup_newpage_charge(pages[i], mm,
806 GFP_KERNEL))) {
807 if (pages[i])
808 put_page(pages[i]);
809 mem_cgroup_uncharge_start();
810 while (--i >= 0) {
811 mem_cgroup_uncharge_page(pages[i]);
812 put_page(pages[i]);
813 }
814 mem_cgroup_uncharge_end();
815 kfree(pages);
816 ret |= VM_FAULT_OOM;
817 goto out;
818 }
819 }
820
821 for (i = 0; i < HPAGE_PMD_NR; i++) {
822 copy_user_highpage(pages[i], page + i,
823 haddr + PAGE_SHIFT*i, vma);
824 __SetPageUptodate(pages[i]);
825 cond_resched();
826 }
827
828 spin_lock(&mm->page_table_lock);
829 if (unlikely(!pmd_same(*pmd, orig_pmd)))
830 goto out_free_pages;
831 VM_BUG_ON(!PageHead(page));
832
833 pmdp_clear_flush_notify(vma, haddr, pmd);
834 /* leave pmd empty until pte is filled */
835
836 pgtable = get_pmd_huge_pte(mm);
837 pmd_populate(mm, &_pmd, pgtable);
838
839 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
840 pte_t *pte, entry;
841 entry = mk_pte(pages[i], vma->vm_page_prot);
842 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
843 page_add_new_anon_rmap(pages[i], vma, haddr);
844 pte = pte_offset_map(&_pmd, haddr);
845 VM_BUG_ON(!pte_none(*pte));
846 set_pte_at(mm, haddr, pte, entry);
847 pte_unmap(pte);
848 }
849 kfree(pages);
850
851 mm->nr_ptes++;
852 smp_wmb(); /* make pte visible before pmd */
853 pmd_populate(mm, pmd, pgtable);
854 page_remove_rmap(page);
855 spin_unlock(&mm->page_table_lock);
856
857 ret |= VM_FAULT_WRITE;
858 put_page(page);
859
860out:
861 return ret;
862
863out_free_pages:
864 spin_unlock(&mm->page_table_lock);
865 mem_cgroup_uncharge_start();
866 for (i = 0; i < HPAGE_PMD_NR; i++) {
867 mem_cgroup_uncharge_page(pages[i]);
868 put_page(pages[i]);
869 }
870 mem_cgroup_uncharge_end();
871 kfree(pages);
872 goto out;
873}
874
875int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
876 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
877{
878 int ret = 0;
879 struct page *page, *new_page;
880 unsigned long haddr;
881
882 VM_BUG_ON(!vma->anon_vma);
883 spin_lock(&mm->page_table_lock);
884 if (unlikely(!pmd_same(*pmd, orig_pmd)))
885 goto out_unlock;
886
887 page = pmd_page(orig_pmd);
888 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
889 haddr = address & HPAGE_PMD_MASK;
890 if (page_mapcount(page) == 1) {
891 pmd_t entry;
892 entry = pmd_mkyoung(orig_pmd);
893 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
894 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
895 update_mmu_cache(vma, address, entry);
896 ret |= VM_FAULT_WRITE;
897 goto out_unlock;
898 }
899 get_page(page);
900 spin_unlock(&mm->page_table_lock);
901
902 if (transparent_hugepage_enabled(vma) &&
903 !transparent_hugepage_debug_cow())
904 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
905 vma, haddr);
906 else
907 new_page = NULL;
908
909 if (unlikely(!new_page)) {
910 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
911 pmd, orig_pmd, page, haddr);
912 put_page(page);
913 goto out;
914 }
915
916 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
917 put_page(new_page);
918 put_page(page);
919 ret |= VM_FAULT_OOM;
920 goto out;
921 }
922
923 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
924 __SetPageUptodate(new_page);
925
926 spin_lock(&mm->page_table_lock);
927 put_page(page);
928 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
929 mem_cgroup_uncharge_page(new_page);
930 put_page(new_page);
931 } else {
932 pmd_t entry;
933 VM_BUG_ON(!PageHead(page));
934 entry = mk_pmd(new_page, vma->vm_page_prot);
935 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
936 entry = pmd_mkhuge(entry);
937 pmdp_clear_flush_notify(vma, haddr, pmd);
938 page_add_new_anon_rmap(new_page, vma, haddr);
939 set_pmd_at(mm, haddr, pmd, entry);
940 update_mmu_cache(vma, address, entry);
941 page_remove_rmap(page);
942 put_page(page);
943 ret |= VM_FAULT_WRITE;
944 }
945out_unlock:
946 spin_unlock(&mm->page_table_lock);
947out:
948 return ret;
949}
950
951struct page *follow_trans_huge_pmd(struct mm_struct *mm,
952 unsigned long addr,
953 pmd_t *pmd,
954 unsigned int flags)
955{
956 struct page *page = NULL;
957
958 assert_spin_locked(&mm->page_table_lock);
959
960 if (flags & FOLL_WRITE && !pmd_write(*pmd))
961 goto out;
962
963 page = pmd_page(*pmd);
964 VM_BUG_ON(!PageHead(page));
965 if (flags & FOLL_TOUCH) {
966 pmd_t _pmd;
967 /*
968 * We should set the dirty bit only for FOLL_WRITE but
969 * for now the dirty bit in the pmd is meaningless.
 970 * If the dirty bit ever becomes meaningful and we
 971 * only set it with FOLL_WRITE, an atomic set_bit
 972 * will be required on the pmd to set the young bit,
 973 * instead of the current set_pmd_at.
974 */
975 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
976 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
977 }
978 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
979 VM_BUG_ON(!PageCompound(page));
980 if (flags & FOLL_GET)
981 get_page(page);
982
983out:
984 return page;
985}
986
987int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
988 pmd_t *pmd)
989{
990 int ret = 0;
991
992 spin_lock(&tlb->mm->page_table_lock);
993 if (likely(pmd_trans_huge(*pmd))) {
994 if (unlikely(pmd_trans_splitting(*pmd))) {
995 spin_unlock(&tlb->mm->page_table_lock);
996 wait_split_huge_page(vma->anon_vma,
997 pmd);
998 } else {
999 struct page *page;
1000 pgtable_t pgtable;
1001 pgtable = get_pmd_huge_pte(tlb->mm);
1002 page = pmd_page(*pmd);
1003 pmd_clear(pmd);
1004 page_remove_rmap(page);
1005 VM_BUG_ON(page_mapcount(page) < 0);
1006 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1007 VM_BUG_ON(!PageHead(page));
1008 spin_unlock(&tlb->mm->page_table_lock);
1009 tlb_remove_page(tlb, page);
1010 pte_free(tlb->mm, pgtable);
1011 ret = 1;
1012 }
1013 } else
1014 spin_unlock(&tlb->mm->page_table_lock);
1015
1016 return ret;
1017}
1018
1019int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1020 unsigned long addr, unsigned long end,
1021 unsigned char *vec)
1022{
1023 int ret = 0;
1024
1025 spin_lock(&vma->vm_mm->page_table_lock);
1026 if (likely(pmd_trans_huge(*pmd))) {
1027 ret = !pmd_trans_splitting(*pmd);
1028 spin_unlock(&vma->vm_mm->page_table_lock);
1029 if (unlikely(!ret))
1030 wait_split_huge_page(vma->anon_vma, pmd);
1031 else {
1032 /*
1033 * All logical pages in the range are present
1034 * if backed by a huge page.
1035 */
1036 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1037 }
1038 } else
1039 spin_unlock(&vma->vm_mm->page_table_lock);
1040
1041 return ret;
1042}
1043
1044int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1045 unsigned long addr, pgprot_t newprot)
1046{
1047 struct mm_struct *mm = vma->vm_mm;
1048 int ret = 0;
1049
1050 spin_lock(&mm->page_table_lock);
1051 if (likely(pmd_trans_huge(*pmd))) {
1052 if (unlikely(pmd_trans_splitting(*pmd))) {
1053 spin_unlock(&mm->page_table_lock);
1054 wait_split_huge_page(vma->anon_vma, pmd);
1055 } else {
1056 pmd_t entry;
1057
1058 entry = pmdp_get_and_clear(mm, addr, pmd);
1059 entry = pmd_modify(entry, newprot);
1060 set_pmd_at(mm, addr, pmd, entry);
1061 spin_unlock(&vma->vm_mm->page_table_lock);
1062 flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
1063 ret = 1;
1064 }
1065 } else
1066 spin_unlock(&vma->vm_mm->page_table_lock);
1067
1068 return ret;
1069}
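
For illustration only (not part of the patch): an mprotect() over a 2 MB-aligned, THP-backed range is what reaches change_huge_pmd() above, which rewrites the huge pmd in place instead of splitting it. A minimal userspace sketch, assuming the usual x86 2 MB huge-pmd size; the over-map-and-align trick is just demo scaffolding.

/* Hedged sketch: exercise a pmd-granular protection change from userspace. */
#include <stdio.h>
#include <stdint.h>
#include <sys/mman.h>

#define HPAGE (2UL << 20)	/* assumed HPAGE_PMD_SIZE on x86 */

int main(void)
{
	size_t len = 2 * HPAGE;
	/* Over-map so a 2 MB-aligned window can be carved out of the mapping. */
	char *raw = mmap(NULL, len + HPAGE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (raw == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	char *p = (char *)(((uintptr_t)raw + HPAGE - 1) & ~(HPAGE - 1));

	for (size_t i = 0; i < len; i += 4096)	/* fault the range in */
		p[i] = 1;
	/* Protection change over one aligned huge-pmd-sized unit; with THP
	 * backing this is handled at pmd granularity rather than per pte. */
	if (mprotect(p, HPAGE, PROT_READ))
		perror("mprotect");
	return 0;
}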
1070
1071pmd_t *page_check_address_pmd(struct page *page,
1072 struct mm_struct *mm,
1073 unsigned long address,
1074 enum page_check_address_pmd_flag flag)
1075{
1076 pgd_t *pgd;
1077 pud_t *pud;
1078 pmd_t *pmd, *ret = NULL;
1079
1080 if (address & ~HPAGE_PMD_MASK)
1081 goto out;
1082
1083 pgd = pgd_offset(mm, address);
1084 if (!pgd_present(*pgd))
1085 goto out;
1086
1087 pud = pud_offset(pgd, address);
1088 if (!pud_present(*pud))
1089 goto out;
1090
1091 pmd = pmd_offset(pud, address);
1092 if (pmd_none(*pmd))
1093 goto out;
1094 if (pmd_page(*pmd) != page)
1095 goto out;
1096 /*
1097 * split_vma() may create temporary aliased mappings. There is
 1098 * no risk as long as all huge pmds are found and have their
1099 * splitting bit set before __split_huge_page_refcount
1100 * runs. Finding the same huge pmd more than once during the
1101 * same rmap walk is not a problem.
1102 */
1103 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1104 pmd_trans_splitting(*pmd))
1105 goto out;
1106 if (pmd_trans_huge(*pmd)) {
1107 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1108 !pmd_trans_splitting(*pmd));
1109 ret = pmd;
1110 }
1111out:
1112 return ret;
1113}
1114
1115static int __split_huge_page_splitting(struct page *page,
1116 struct vm_area_struct *vma,
1117 unsigned long address)
1118{
1119 struct mm_struct *mm = vma->vm_mm;
1120 pmd_t *pmd;
1121 int ret = 0;
1122
1123 spin_lock(&mm->page_table_lock);
1124 pmd = page_check_address_pmd(page, mm, address,
1125 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
1126 if (pmd) {
1127 /*
1128 * We can't temporarily set the pmd to null in order
 1129 * to split it; the pmd must remain marked huge at all
1130 * times or the VM won't take the pmd_trans_huge paths
1131 * and it won't wait on the anon_vma->root->lock to
1132 * serialize against split_huge_page*.
1133 */
1134 pmdp_splitting_flush_notify(vma, address, pmd);
1135 ret = 1;
1136 }
1137 spin_unlock(&mm->page_table_lock);
1138
1139 return ret;
1140}
1141
1142static void __split_huge_page_refcount(struct page *page)
1143{
1144 int i;
1145 unsigned long head_index = page->index;
1146 struct zone *zone = page_zone(page);
1147 int zonestat;
1148
 1149 /* prevent PageLRU from going away from under us, and freeze lru stats */
1150 spin_lock_irq(&zone->lru_lock);
1151 compound_lock(page);
1152
1153 for (i = 1; i < HPAGE_PMD_NR; i++) {
1154 struct page *page_tail = page + i;
1155
1156 /* tail_page->_count cannot change */
1157 atomic_sub(atomic_read(&page_tail->_count), &page->_count);
1158 BUG_ON(page_count(page) <= 0);
1159 atomic_add(page_mapcount(page) + 1, &page_tail->_count);
1160 BUG_ON(atomic_read(&page_tail->_count) <= 0);
1161
1162 /* after clearing PageTail the gup refcount can be released */
1163 smp_mb();
1164
1165 /*
 1166 * retain the hwpoison flag of the poisoned tail page:
 1167 * this keeps memory-failure from killing the wrong
 1168 * process on a KVM guest after the split.
1169 */
1170 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
1171 page_tail->flags |= (page->flags &
1172 ((1L << PG_referenced) |
1173 (1L << PG_swapbacked) |
1174 (1L << PG_mlocked) |
1175 (1L << PG_uptodate)));
1176 page_tail->flags |= (1L << PG_dirty);
1177
1178 /*
1179 * 1) clear PageTail before overwriting first_page
1180 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1181 */
1182 smp_wmb();
1183
1184 /*
1185 * __split_huge_page_splitting() already set the
1186 * splitting bit in all pmd that could map this
1187 * hugepage, that will ensure no CPU can alter the
1188 * mapcount on the head page. The mapcount is only
1189 * accounted in the head page and it has to be
1190 * transferred to all tail pages in the below code. So
 1191 * for this code to be safe, the mapcount can't change
 1192 * during the split. But that doesn't mean userland can't
1193 * keep changing and reading the page contents while
1194 * we transfer the mapcount, so the pmd splitting
1195 * status is achieved setting a reserved bit in the
1196 * pmd, not by clearing the present bit.
1197 */
1198 BUG_ON(page_mapcount(page_tail));
1199 page_tail->_mapcount = page->_mapcount;
1200
1201 BUG_ON(page_tail->mapping);
1202 page_tail->mapping = page->mapping;
1203
1204 page_tail->index = ++head_index;
1205
1206 BUG_ON(!PageAnon(page_tail));
1207 BUG_ON(!PageUptodate(page_tail));
1208 BUG_ON(!PageDirty(page_tail));
1209 BUG_ON(!PageSwapBacked(page_tail));
1210
1211 mem_cgroup_split_huge_fixup(page, page_tail);
1212
1213 lru_add_page_tail(zone, page, page_tail);
1214 }
1215
1216 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1217 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1218
1219 /*
1220 * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
1221 * so adjust those appropriately if this page is on the LRU.
1222 */
1223 if (PageLRU(page)) {
1224 zonestat = NR_LRU_BASE + page_lru(page);
1225 __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
1226 }
1227
1228 ClearPageCompound(page);
1229 compound_unlock(page);
1230 spin_unlock_irq(&zone->lru_lock);
1231
1232 for (i = 1; i < HPAGE_PMD_NR; i++) {
1233 struct page *page_tail = page + i;
1234 BUG_ON(page_count(page_tail) <= 0);
1235 /*
 1236 * Tail pages may be freed if there wasn't any mapping,
 1237 * e.g. if add_to_swap() is running on an lru page that
 1238 * had its mapping zapped. Freeing these pages requires
 1239 * taking the lru_lock, so we do the put_page of the
 1240 * tail pages after the split is complete.
1241 */
1242 put_page(page_tail);
1243 }
1244
1245 /*
 1246 * Only the head page (now a regular page) is required
1247 * to be pinned by the caller.
1248 */
1249 BUG_ON(page_count(page) <= 0);
1250}
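
For illustration only (not part of the patch): the NR_ANON_TRANSPARENT_HUGEPAGES adjustment above is what kernels with this series report as the AnonHugePages line in /proc/meminfo, so splits and collapses can be observed from userspace. A minimal reader, assuming only that line name:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");
	if (!f) {
		perror("/proc/meminfo");
		return 1;
	}
	/* Print only the counter derived from NR_ANON_TRANSPARENT_HUGEPAGES. */
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "AnonHugePages:", 14))
			fputs(line, stdout);
	fclose(f);
	return 0;
}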
1251
1252static int __split_huge_page_map(struct page *page,
1253 struct vm_area_struct *vma,
1254 unsigned long address)
1255{
1256 struct mm_struct *mm = vma->vm_mm;
1257 pmd_t *pmd, _pmd;
1258 int ret = 0, i;
1259 pgtable_t pgtable;
1260 unsigned long haddr;
1261
1262 spin_lock(&mm->page_table_lock);
1263 pmd = page_check_address_pmd(page, mm, address,
1264 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1265 if (pmd) {
1266 pgtable = get_pmd_huge_pte(mm);
1267 pmd_populate(mm, &_pmd, pgtable);
1268
1269 for (i = 0, haddr = address; i < HPAGE_PMD_NR;
1270 i++, haddr += PAGE_SIZE) {
1271 pte_t *pte, entry;
1272 BUG_ON(PageCompound(page+i));
1273 entry = mk_pte(page + i, vma->vm_page_prot);
1274 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1275 if (!pmd_write(*pmd))
1276 entry = pte_wrprotect(entry);
1277 else
1278 BUG_ON(page_mapcount(page) != 1);
1279 if (!pmd_young(*pmd))
1280 entry = pte_mkold(entry);
1281 pte = pte_offset_map(&_pmd, haddr);
1282 BUG_ON(!pte_none(*pte));
1283 set_pte_at(mm, haddr, pte, entry);
1284 pte_unmap(pte);
1285 }
1286
1287 mm->nr_ptes++;
1288 smp_wmb(); /* make pte visible before pmd */
1289 /*
1290 * Up to this point the pmd is present and huge and
1291 * userland has the whole access to the hugepage
1292 * during the split (which happens in place). If we
1293 * overwrite the pmd with the not-huge version
1294 * pointing to the pte here (which of course we could
1295 * if all CPUs were bug free), userland could trigger
1296 * a small page size TLB miss on the small sized TLB
1297 * while the hugepage TLB entry is still established
 1298 * in the huge TLB. Some CPUs don't like that. See
 1299 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
 1300 * Erratum 383 on page 93. Intel should be safe but
 1301 * also warns that it's only safe if the permission
 1302 * and cache attributes of the two entries loaded into
 1303 * the TLB are identical (which should be the case
1304 * here). But it is generally safer to never allow
1305 * small and huge TLB entries for the same virtual
1306 * address to be loaded simultaneously. So instead of
1307 * doing "pmd_populate(); flush_tlb_range();" we first
1308 * mark the current pmd notpresent (atomically because
1309 * here the pmd_trans_huge and pmd_trans_splitting
1310 * must remain set at all times on the pmd until the
1311 * split is complete for this pmd), then we flush the
1312 * SMP TLB and finally we write the non-huge version
1313 * of the pmd entry with pmd_populate.
1314 */
1315 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
1316 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1317 pmd_populate(mm, pmd, pgtable);
1318 ret = 1;
1319 }
1320 spin_unlock(&mm->page_table_lock);
1321
1322 return ret;
1323}
1324
1325/* must be called with anon_vma->root->lock held */
1326static void __split_huge_page(struct page *page,
1327 struct anon_vma *anon_vma)
1328{
1329 int mapcount, mapcount2;
1330 struct anon_vma_chain *avc;
1331
1332 BUG_ON(!PageHead(page));
1333 BUG_ON(PageTail(page));
1334
1335 mapcount = 0;
1336 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1337 struct vm_area_struct *vma = avc->vma;
1338 unsigned long addr = vma_address(page, vma);
1339 BUG_ON(is_vma_temporary_stack(vma));
1340 if (addr == -EFAULT)
1341 continue;
1342 mapcount += __split_huge_page_splitting(page, vma, addr);
1343 }
1344 /*
1345 * It is critical that new vmas are added to the tail of the
 1346 * anon_vma list. This guarantees that if copy_huge_pmd() runs
1347 * and establishes a child pmd before
1348 * __split_huge_page_splitting() freezes the parent pmd (so if
1349 * we fail to prevent copy_huge_pmd() from running until the
1350 * whole __split_huge_page() is complete), we will still see
1351 * the newly established pmd of the child later during the
1352 * walk, to be able to set it as pmd_trans_splitting too.
1353 */
1354 if (mapcount != page_mapcount(page))
1355 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1356 mapcount, page_mapcount(page));
1357 BUG_ON(mapcount != page_mapcount(page));
1358
1359 __split_huge_page_refcount(page);
1360
1361 mapcount2 = 0;
1362 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1363 struct vm_area_struct *vma = avc->vma;
1364 unsigned long addr = vma_address(page, vma);
1365 BUG_ON(is_vma_temporary_stack(vma));
1366 if (addr == -EFAULT)
1367 continue;
1368 mapcount2 += __split_huge_page_map(page, vma, addr);
1369 }
1370 if (mapcount != mapcount2)
1371 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
1372 mapcount, mapcount2, page_mapcount(page));
1373 BUG_ON(mapcount != mapcount2);
1374}
1375
1376int split_huge_page(struct page *page)
1377{
1378 struct anon_vma *anon_vma;
1379 int ret = 1;
1380
1381 BUG_ON(!PageAnon(page));
1382 anon_vma = page_lock_anon_vma(page);
1383 if (!anon_vma)
1384 goto out;
1385 ret = 0;
1386 if (!PageCompound(page))
1387 goto out_unlock;
1388
1389 BUG_ON(!PageSwapBacked(page));
1390 __split_huge_page(page, anon_vma);
1391
1392 BUG_ON(PageCompound(page));
1393out_unlock:
1394 page_unlock_anon_vma(anon_vma);
1395out:
1396 return ret;
1397}
1398
1399int hugepage_madvise(struct vm_area_struct *vma,
1400 unsigned long *vm_flags, int advice)
1401{
1402 switch (advice) {
1403 case MADV_HUGEPAGE:
1404 /*
1405 * Be somewhat over-protective like KSM for now!
1406 */
1407 if (*vm_flags & (VM_HUGEPAGE |
1408 VM_SHARED | VM_MAYSHARE |
1409 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1410 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1411 VM_MIXEDMAP | VM_SAO))
1412 return -EINVAL;
1413 *vm_flags &= ~VM_NOHUGEPAGE;
1414 *vm_flags |= VM_HUGEPAGE;
1415 /*
 1416 * If the vma becomes suitable for khugepaged to scan,
 1417 * register it here without waiting for a page fault that
1418 * may not happen any time soon.
1419 */
1420 if (unlikely(khugepaged_enter_vma_merge(vma)))
1421 return -ENOMEM;
1422 break;
1423 case MADV_NOHUGEPAGE:
1424 /*
1425 * Be somewhat over-protective like KSM for now!
1426 */
1427 if (*vm_flags & (VM_NOHUGEPAGE |
1428 VM_SHARED | VM_MAYSHARE |
1429 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1430 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1431 VM_MIXEDMAP | VM_SAO))
1432 return -EINVAL;
1433 *vm_flags &= ~VM_HUGEPAGE;
1434 *vm_flags |= VM_NOHUGEPAGE;
1435 /*
1436 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
 1437 * this vma even if the mm stays registered in khugepaged
 1438 * (it may have been registered before VM_NOHUGEPAGE was set).
1439 */
1440 break;
1441 }
1442
1443 return 0;
1444}
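
For illustration only (not part of the patch): the MADV_HUGEPAGE/MADV_NOHUGEPAGE advice handled above is requested from userspace with madvise(2). A minimal sketch of the intended usage; the fallback value 14 for MADV_HUGEPAGE mirrors the constant this series adds but should be treated as an assumption if the libc headers predate it.

#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* assumption: matches the value this series adds */
#endif

int main(void)
{
	size_t len = 16UL << 20;	/* 16 MB, a multiple of the 2 MB huge-pmd size */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* Sets VM_HUGEPAGE on the vma and registers the mm with khugepaged. */
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");
	/* Touch the range so the fault path (or khugepaged) can install huge pmds. */
	for (size_t i = 0; i < len; i += 4096)
		p[i] = 1;
	return 0;
}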
1445
1446static int __init khugepaged_slab_init(void)
1447{
1448 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1449 sizeof(struct mm_slot),
1450 __alignof__(struct mm_slot), 0, NULL);
1451 if (!mm_slot_cache)
1452 return -ENOMEM;
1453
1454 return 0;
1455}
1456
1457static void __init khugepaged_slab_free(void)
1458{
1459 kmem_cache_destroy(mm_slot_cache);
1460 mm_slot_cache = NULL;
1461}
1462
1463static inline struct mm_slot *alloc_mm_slot(void)
1464{
1465 if (!mm_slot_cache) /* initialization failed */
1466 return NULL;
1467 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1468}
1469
1470static inline void free_mm_slot(struct mm_slot *mm_slot)
1471{
1472 kmem_cache_free(mm_slot_cache, mm_slot);
1473}
1474
1475static int __init mm_slots_hash_init(void)
1476{
1477 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1478 GFP_KERNEL);
1479 if (!mm_slots_hash)
1480 return -ENOMEM;
1481 return 0;
1482}
1483
1484#if 0
1485static void __init mm_slots_hash_free(void)
1486{
1487 kfree(mm_slots_hash);
1488 mm_slots_hash = NULL;
1489}
1490#endif
1491
1492static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1493{
1494 struct mm_slot *mm_slot;
1495 struct hlist_head *bucket;
1496 struct hlist_node *node;
1497
1498 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1499 % MM_SLOTS_HASH_HEADS];
1500 hlist_for_each_entry(mm_slot, node, bucket, hash) {
1501 if (mm == mm_slot->mm)
1502 return mm_slot;
1503 }
1504 return NULL;
1505}
1506
1507static void insert_to_mm_slots_hash(struct mm_struct *mm,
1508 struct mm_slot *mm_slot)
1509{
1510 struct hlist_head *bucket;
1511
1512 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1513 % MM_SLOTS_HASH_HEADS];
1514 mm_slot->mm = mm;
1515 hlist_add_head(&mm_slot->hash, bucket);
1516}
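
For illustration only (not part of the patch): the bucket choice above divides the mm pointer by sizeof(struct mm_struct) before the modulo, because slab-allocated mm_structs are spaced at least that far apart and the low pointer bits alone would collide. A userspace sketch of the same idea; the object size and bucket count below are stand-ins, not values taken from the patch.

#include <stdio.h>

#define NBUCKETS 1024			/* stand-in for MM_SLOTS_HASH_HEADS */

struct fake_mm { char pad[1984]; };	/* stand-in for struct mm_struct */

/* Same scheme as get_mm_slot()/insert_to_mm_slots_hash(): scale the
 * pointer down by the object size, then take it modulo the bucket count. */
static unsigned int bucket_of(const struct fake_mm *mm)
{
	return ((unsigned long)mm / sizeof(struct fake_mm)) % NBUCKETS;
}

int main(void)
{
	struct fake_mm pool[4];

	for (int i = 0; i < 4; i++)
		printf("object %p -> bucket %u\n",
		       (void *)&pool[i], bucket_of(&pool[i]));
	return 0;
}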
1517
1518static inline int khugepaged_test_exit(struct mm_struct *mm)
1519{
1520 return atomic_read(&mm->mm_users) == 0;
1521}
1522
1523int __khugepaged_enter(struct mm_struct *mm)
1524{
1525 struct mm_slot *mm_slot;
1526 int wakeup;
1527
1528 mm_slot = alloc_mm_slot();
1529 if (!mm_slot)
1530 return -ENOMEM;
1531
1532 /* __khugepaged_exit() must not run from under us */
1533 VM_BUG_ON(khugepaged_test_exit(mm));
1534 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1535 free_mm_slot(mm_slot);
1536 return 0;
1537 }
1538
1539 spin_lock(&khugepaged_mm_lock);
1540 insert_to_mm_slots_hash(mm, mm_slot);
1541 /*
1542 * Insert just behind the scanning cursor, to let the area settle
1543 * down a little.
1544 */
1545 wakeup = list_empty(&khugepaged_scan.mm_head);
1546 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1547 spin_unlock(&khugepaged_mm_lock);
1548
1549 atomic_inc(&mm->mm_count);
1550 if (wakeup)
1551 wake_up_interruptible(&khugepaged_wait);
1552
1553 return 0;
1554}
1555
1556int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1557{
1558 unsigned long hstart, hend;
1559 if (!vma->anon_vma)
1560 /*
1561 * Not yet faulted in so we will register later in the
1562 * page fault if needed.
1563 */
1564 return 0;
1565 if (vma->vm_file || vma->vm_ops)
1566 /* khugepaged not yet working on file or special mappings */
1567 return 0;
1568 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1569 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1570 hend = vma->vm_end & HPAGE_PMD_MASK;
1571 if (hstart < hend)
1572 return khugepaged_enter(vma);
1573 return 0;
1574}
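
For illustration only (not part of the patch): hstart rounds vm_start up and hend rounds vm_end down to the huge-pmd boundary, so the mm is registered only when the vma can hold at least one fully aligned huge pmd. A standalone sketch of that arithmetic, assuming the x86 2 MB HPAGE_PMD_SIZE:

#include <stdio.h>

#define HPAGE_PMD_SIZE	(2UL << 20)
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long vm_start = 0x700000123000UL;	/* example, unaligned */
	unsigned long vm_end   = 0x700000923000UL;

	unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; /* round up */
	unsigned long hend   = vm_end & HPAGE_PMD_MASK;                       /* round down */

	printf("hstart=%#lx hend=%#lx -> %s\n", hstart, hend,
	       hstart < hend ? "room for a huge pmd" : "too small or misaligned");
	return 0;
}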
1575
1576void __khugepaged_exit(struct mm_struct *mm)
1577{
1578 struct mm_slot *mm_slot;
1579 int free = 0;
1580
1581 spin_lock(&khugepaged_mm_lock);
1582 mm_slot = get_mm_slot(mm);
1583 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
1584 hlist_del(&mm_slot->hash);
1585 list_del(&mm_slot->mm_node);
1586 free = 1;
1587 }
1588
1589 if (free) {
1590 spin_unlock(&khugepaged_mm_lock);
1591 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1592 free_mm_slot(mm_slot);
1593 mmdrop(mm);
1594 } else if (mm_slot) {
1595 spin_unlock(&khugepaged_mm_lock);
1596 /*
1597 * This is required to serialize against
1598 * khugepaged_test_exit() (which is guaranteed to run
 1599 * under mmap_sem read mode). Stop here (after we
 1600 * return, all pagetables will be destroyed) until
1601 * khugepaged has finished working on the pagetables
1602 * under the mmap_sem.
1603 */
1604 down_write(&mm->mmap_sem);
1605 up_write(&mm->mmap_sem);
1606 } else
1607 spin_unlock(&khugepaged_mm_lock);
1608}
1609
1610static void release_pte_page(struct page *page)
1611{
1612 /* 0 stands for page_is_file_cache(page) == false */
1613 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
1614 unlock_page(page);
1615 putback_lru_page(page);
1616}
1617
1618static void release_pte_pages(pte_t *pte, pte_t *_pte)
1619{
1620 while (--_pte >= pte) {
1621 pte_t pteval = *_pte;
1622 if (!pte_none(pteval))
1623 release_pte_page(pte_page(pteval));
1624 }
1625}
1626
1627static void release_all_pte_pages(pte_t *pte)
1628{
1629 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1630}
1631
1632static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1633 unsigned long address,
1634 pte_t *pte)
1635{
1636 struct page *page;
1637 pte_t *_pte;
1638 int referenced = 0, isolated = 0, none = 0;
1639 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1640 _pte++, address += PAGE_SIZE) {
1641 pte_t pteval = *_pte;
1642 if (pte_none(pteval)) {
1643 if (++none <= khugepaged_max_ptes_none)
1644 continue;
1645 else {
1646 release_pte_pages(pte, _pte);
1647 goto out;
1648 }
1649 }
1650 if (!pte_present(pteval) || !pte_write(pteval)) {
1651 release_pte_pages(pte, _pte);
1652 goto out;
1653 }
1654 page = vm_normal_page(vma, address, pteval);
1655 if (unlikely(!page)) {
1656 release_pte_pages(pte, _pte);
1657 goto out;
1658 }
1659 VM_BUG_ON(PageCompound(page));
1660 BUG_ON(!PageAnon(page));
1661 VM_BUG_ON(!PageSwapBacked(page));
1662
1663 /* cannot use mapcount: can't collapse if there's a gup pin */
1664 if (page_count(page) != 1) {
1665 release_pte_pages(pte, _pte);
1666 goto out;
1667 }
1668 /*
1669 * We can do it before isolate_lru_page because the
1670 * page can't be freed from under us. NOTE: PG_lock
1671 * is needed to serialize against split_huge_page
1672 * when invoked from the VM.
1673 */
1674 if (!trylock_page(page)) {
1675 release_pte_pages(pte, _pte);
1676 goto out;
1677 }
1678 /*
 1679 * Isolate the page to avoid collapsing a hugepage
1680 * currently in use by the VM.
1681 */
1682 if (isolate_lru_page(page)) {
1683 unlock_page(page);
1684 release_pte_pages(pte, _pte);
1685 goto out;
1686 }
1687 /* 0 stands for page_is_file_cache(page) == false */
1688 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
1689 VM_BUG_ON(!PageLocked(page));
1690 VM_BUG_ON(PageLRU(page));
1691
1692 /* If there is no mapped pte young don't collapse the page */
1693 if (pte_young(pteval) || PageReferenced(page) ||
1694 mmu_notifier_test_young(vma->vm_mm, address))
1695 referenced = 1;
1696 }
1697 if (unlikely(!referenced))
1698 release_all_pte_pages(pte);
1699 else
1700 isolated = 1;
1701out:
1702 return isolated;
1703}
1704
1705static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1706 struct vm_area_struct *vma,
1707 unsigned long address,
1708 spinlock_t *ptl)
1709{
1710 pte_t *_pte;
1711 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
1712 pte_t pteval = *_pte;
1713 struct page *src_page;
1714
1715 if (pte_none(pteval)) {
1716 clear_user_highpage(page, address);
1717 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
1718 } else {
1719 src_page = pte_page(pteval);
1720 copy_user_highpage(page, src_page, address, vma);
1721 VM_BUG_ON(page_mapcount(src_page) != 1);
1722 VM_BUG_ON(page_count(src_page) != 2);
1723 release_pte_page(src_page);
1724 /*
1725 * ptl mostly unnecessary, but preempt has to
1726 * be disabled to update the per-cpu stats
1727 * inside page_remove_rmap().
1728 */
1729 spin_lock(ptl);
1730 /*
1731 * paravirt calls inside pte_clear here are
1732 * superfluous.
1733 */
1734 pte_clear(vma->vm_mm, address, _pte);
1735 page_remove_rmap(src_page);
1736 spin_unlock(ptl);
1737 free_page_and_swap_cache(src_page);
1738 }
1739
1740 address += PAGE_SIZE;
1741 page++;
1742 }
1743}
1744
1745static void collapse_huge_page(struct mm_struct *mm,
1746 unsigned long address,
1747 struct page **hpage,
1748 struct vm_area_struct *vma)
1749{
1750 pgd_t *pgd;
1751 pud_t *pud;
1752 pmd_t *pmd, _pmd;
1753 pte_t *pte;
1754 pgtable_t pgtable;
1755 struct page *new_page;
1756 spinlock_t *ptl;
1757 int isolated;
1758 unsigned long hstart, hend;
1759
1760 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1761#ifndef CONFIG_NUMA
1762 VM_BUG_ON(!*hpage);
1763 new_page = *hpage;
1764#else
1765 VM_BUG_ON(*hpage);
1766 /*
1767 * Allocate the page while the vma is still valid and under
1768 * the mmap_sem read mode so there is no memory allocation
 1769 * later when we take the mmap_sem in write mode. This is
 1770 * friendlier behavior (OTOH it may actually hide bugs) towards
 1771 * userland filesystems whose daemons allocate memory in
 1772 * the userland I/O paths. Allocating memory with the
 1773 * mmap_sem in read mode is also a good idea to allow greater
1774 * scalability.
1775 */
1776 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
1777 if (unlikely(!new_page)) {
1778 up_read(&mm->mmap_sem);
1779 *hpage = ERR_PTR(-ENOMEM);
1780 return;
1781 }
1782#endif
1783 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1784 up_read(&mm->mmap_sem);
1785 put_page(new_page);
1786 return;
1787 }
1788
1789 /* after allocating the hugepage upgrade to mmap_sem write mode */
1790 up_read(&mm->mmap_sem);
1791
1792 /*
1793 * Prevent all access to pagetables with the exception of
 1794 * gup_fast (handled later by the ptep_clear_flush) and the VM
 1795 * (handled by the anon_vma lock + PG_lock).
1796 */
1797 down_write(&mm->mmap_sem);
1798 if (unlikely(khugepaged_test_exit(mm)))
1799 goto out;
1800
1801 vma = find_vma(mm, address);
1802 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1803 hend = vma->vm_end & HPAGE_PMD_MASK;
1804 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1805 goto out;
1806
1807 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1808 (vma->vm_flags & VM_NOHUGEPAGE))
1809 goto out;
1810
1811 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
1812 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
1813 goto out;
1814 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1815
1816 pgd = pgd_offset(mm, address);
1817 if (!pgd_present(*pgd))
1818 goto out;
1819
1820 pud = pud_offset(pgd, address);
1821 if (!pud_present(*pud))
1822 goto out;
1823
1824 pmd = pmd_offset(pud, address);
1825 /* pmd can't go away or become huge under us */
1826 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1827 goto out;
1828
1829 anon_vma_lock(vma->anon_vma);
1830
1831 pte = pte_offset_map(pmd, address);
1832 ptl = pte_lockptr(mm, pmd);
1833
1834 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1835 /*
1836 * After this gup_fast can't run anymore. This also removes
1837 * any huge TLB entry from the CPU so we won't allow
1838 * huge and small TLB entries for the same virtual address
1839 * to avoid the risk of CPU bugs in that area.
1840 */
1841 _pmd = pmdp_clear_flush_notify(vma, address, pmd);
1842 spin_unlock(&mm->page_table_lock);
1843
1844 spin_lock(ptl);
1845 isolated = __collapse_huge_page_isolate(vma, address, pte);
1846 spin_unlock(ptl);
1847
1848 if (unlikely(!isolated)) {
1849 pte_unmap(pte);
1850 spin_lock(&mm->page_table_lock);
1851 BUG_ON(!pmd_none(*pmd));
1852 set_pmd_at(mm, address, pmd, _pmd);
1853 spin_unlock(&mm->page_table_lock);
1854 anon_vma_unlock(vma->anon_vma);
1855 goto out;
1856 }
1857
1858 /*
1859 * All pages are isolated and locked so anon_vma rmap
1860 * can't run anymore.
1861 */
1862 anon_vma_unlock(vma->anon_vma);
1863
1864 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
1865 pte_unmap(pte);
1866 __SetPageUptodate(new_page);
1867 pgtable = pmd_pgtable(_pmd);
1868 VM_BUG_ON(page_count(pgtable) != 1);
1869 VM_BUG_ON(page_mapcount(pgtable) != 0);
1870
1871 _pmd = mk_pmd(new_page, vma->vm_page_prot);
1872 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1873 _pmd = pmd_mkhuge(_pmd);
1874
1875 /*
1876 * spin_lock() below is not the equivalent of smp_wmb(), so
 1877 * this is needed to keep the copy_huge_page writes from becoming
 1878 * visible after the set_pmd_at() write.
1879 */
1880 smp_wmb();
1881
1882 spin_lock(&mm->page_table_lock);
1883 BUG_ON(!pmd_none(*pmd));
1884 page_add_new_anon_rmap(new_page, vma, address);
1885 set_pmd_at(mm, address, pmd, _pmd);
 1886 update_mmu_cache(vma, address, _pmd);
1887 prepare_pmd_huge_pte(pgtable, mm);
1888 mm->nr_ptes--;
1889 spin_unlock(&mm->page_table_lock);
1890
1891#ifndef CONFIG_NUMA
1892 *hpage = NULL;
1893#endif
1894 khugepaged_pages_collapsed++;
1895out_up_write:
1896 up_write(&mm->mmap_sem);
1897 return;
1898
1899out:
1900 mem_cgroup_uncharge_page(new_page);
1901#ifdef CONFIG_NUMA
1902 put_page(new_page);
1903#endif
1904 goto out_up_write;
1905}
1906
1907static int khugepaged_scan_pmd(struct mm_struct *mm,
1908 struct vm_area_struct *vma,
1909 unsigned long address,
1910 struct page **hpage)
1911{
1912 pgd_t *pgd;
1913 pud_t *pud;
1914 pmd_t *pmd;
1915 pte_t *pte, *_pte;
1916 int ret = 0, referenced = 0, none = 0;
1917 struct page *page;
1918 unsigned long _address;
1919 spinlock_t *ptl;
1920
1921 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1922
1923 pgd = pgd_offset(mm, address);
1924 if (!pgd_present(*pgd))
1925 goto out;
1926
1927 pud = pud_offset(pgd, address);
1928 if (!pud_present(*pud))
1929 goto out;
1930
1931 pmd = pmd_offset(pud, address);
1932 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1933 goto out;
1934
1935 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1936 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
1937 _pte++, _address += PAGE_SIZE) {
1938 pte_t pteval = *_pte;
1939 if (pte_none(pteval)) {
1940 if (++none <= khugepaged_max_ptes_none)
1941 continue;
1942 else
1943 goto out_unmap;
1944 }
1945 if (!pte_present(pteval) || !pte_write(pteval))
1946 goto out_unmap;
1947 page = vm_normal_page(vma, _address, pteval);
1948 if (unlikely(!page))
1949 goto out_unmap;
1950 VM_BUG_ON(PageCompound(page));
1951 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
1952 goto out_unmap;
1953 /* cannot use mapcount: can't collapse if there's a gup pin */
1954 if (page_count(page) != 1)
1955 goto out_unmap;
1956 if (pte_young(pteval) || PageReferenced(page) ||
1957 mmu_notifier_test_young(vma->vm_mm, address))
1958 referenced = 1;
1959 }
1960 if (referenced)
1961 ret = 1;
1962out_unmap:
1963 pte_unmap_unlock(pte, ptl);
1964 if (ret)
1965 /* collapse_huge_page will return with the mmap_sem released */
1966 collapse_huge_page(mm, address, hpage, vma);
1967out:
1968 return ret;
1969}
1970
1971static void collect_mm_slot(struct mm_slot *mm_slot)
1972{
1973 struct mm_struct *mm = mm_slot->mm;
1974
1975 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
1976
1977 if (khugepaged_test_exit(mm)) {
1978 /* free mm_slot */
1979 hlist_del(&mm_slot->hash);
1980 list_del(&mm_slot->mm_node);
1981
1982 /*
1983 * Not strictly needed because the mm exited already.
1984 *
1985 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1986 */
1987
1988 /* khugepaged_mm_lock actually not necessary for the below */
1989 free_mm_slot(mm_slot);
1990 mmdrop(mm);
1991 }
1992}
1993
1994static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
1995 struct page **hpage)
1996{
1997 struct mm_slot *mm_slot;
1998 struct mm_struct *mm;
1999 struct vm_area_struct *vma;
2000 int progress = 0;
2001
2002 VM_BUG_ON(!pages);
2003 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
2004
2005 if (khugepaged_scan.mm_slot)
2006 mm_slot = khugepaged_scan.mm_slot;
2007 else {
2008 mm_slot = list_entry(khugepaged_scan.mm_head.next,
2009 struct mm_slot, mm_node);
2010 khugepaged_scan.address = 0;
2011 khugepaged_scan.mm_slot = mm_slot;
2012 }
2013 spin_unlock(&khugepaged_mm_lock);
2014
2015 mm = mm_slot->mm;
2016 down_read(&mm->mmap_sem);
2017 if (unlikely(khugepaged_test_exit(mm)))
2018 vma = NULL;
2019 else
2020 vma = find_vma(mm, khugepaged_scan.address);
2021
2022 progress++;
2023 for (; vma; vma = vma->vm_next) {
2024 unsigned long hstart, hend;
2025
2026 cond_resched();
2027 if (unlikely(khugepaged_test_exit(mm))) {
2028 progress++;
2029 break;
2030 }
2031
2032 if ((!(vma->vm_flags & VM_HUGEPAGE) &&
2033 !khugepaged_always()) ||
2034 (vma->vm_flags & VM_NOHUGEPAGE)) {
2035 progress++;
2036 continue;
2037 }
2038
2039 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
2040 if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
2041 khugepaged_scan.address = vma->vm_end;
2042 progress++;
2043 continue;
2044 }
2045 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
2046
2047 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2048 hend = vma->vm_end & HPAGE_PMD_MASK;
2049 if (hstart >= hend) {
2050 progress++;
2051 continue;
2052 }
2053 if (khugepaged_scan.address < hstart)
2054 khugepaged_scan.address = hstart;
2055 if (khugepaged_scan.address > hend) {
2056 khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
2057 progress++;
2058 continue;
2059 }
2060 BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2061
2062 while (khugepaged_scan.address < hend) {
2063 int ret;
2064 cond_resched();
2065 if (unlikely(khugepaged_test_exit(mm)))
2066 goto breakouterloop;
2067
2068 VM_BUG_ON(khugepaged_scan.address < hstart ||
2069 khugepaged_scan.address + HPAGE_PMD_SIZE >
2070 hend);
2071 ret = khugepaged_scan_pmd(mm, vma,
2072 khugepaged_scan.address,
2073 hpage);
2074 /* move to next address */
2075 khugepaged_scan.address += HPAGE_PMD_SIZE;
2076 progress += HPAGE_PMD_NR;
2077 if (ret)
2078 /* we released mmap_sem so break loop */
2079 goto breakouterloop_mmap_sem;
2080 if (progress >= pages)
2081 goto breakouterloop;
2082 }
2083 }
2084breakouterloop:
2085 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2086breakouterloop_mmap_sem:
2087
2088 spin_lock(&khugepaged_mm_lock);
2089 BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2090 /*
2091 * Release the current mm_slot if this mm is about to die, or
2092 * if we scanned all vmas of this mm.
2093 */
2094 if (khugepaged_test_exit(mm) || !vma) {
2095 /*
2096 * Make sure that if mm_users is reaching zero while
2097 * khugepaged runs here, khugepaged_exit will find
2098 * mm_slot not pointing to the exiting mm.
2099 */
2100 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2101 khugepaged_scan.mm_slot = list_entry(
2102 mm_slot->mm_node.next,
2103 struct mm_slot, mm_node);
2104 khugepaged_scan.address = 0;
2105 } else {
2106 khugepaged_scan.mm_slot = NULL;
2107 khugepaged_full_scans++;
2108 }
2109
2110 collect_mm_slot(mm_slot);
2111 }
2112
2113 return progress;
2114}
2115
2116static int khugepaged_has_work(void)
2117{
2118 return !list_empty(&khugepaged_scan.mm_head) &&
2119 khugepaged_enabled();
2120}
2121
2122static int khugepaged_wait_event(void)
2123{
2124 return !list_empty(&khugepaged_scan.mm_head) ||
2125 !khugepaged_enabled();
2126}
2127
2128static void khugepaged_do_scan(struct page **hpage)
2129{
2130 unsigned int progress = 0, pass_through_head = 0;
2131 unsigned int pages = khugepaged_pages_to_scan;
2132
2133 barrier(); /* write khugepaged_pages_to_scan to local stack */
2134
2135 while (progress < pages) {
2136 cond_resched();
2137
2138#ifndef CONFIG_NUMA
2139 if (!*hpage) {
2140 *hpage = alloc_hugepage(khugepaged_defrag());
2141 if (unlikely(!*hpage))
2142 break;
2143 }
2144#else
2145 if (IS_ERR(*hpage))
2146 break;
2147#endif
2148
2149 if (unlikely(kthread_should_stop() || freezing(current)))
2150 break;
2151
2152 spin_lock(&khugepaged_mm_lock);
2153 if (!khugepaged_scan.mm_slot)
2154 pass_through_head++;
2155 if (khugepaged_has_work() &&
2156 pass_through_head < 2)
2157 progress += khugepaged_scan_mm_slot(pages - progress,
2158 hpage);
2159 else
2160 progress = pages;
2161 spin_unlock(&khugepaged_mm_lock);
2162 }
2163}
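
For illustration only (not part of the patch): khugepaged_pages_to_scan and the two sleep intervals used here are exported under /sys/kernel/mm/transparent_hugepage/khugepaged/ by the sysfs code earlier in this file. A small reader for those knobs; the exact file names follow this series, so adjust if a given kernel lays them out differently.

#include <stdio.h>

static void show(const char *name)
{
	char path[256], buf[64];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/mm/transparent_hugepage/khugepaged/%s", name);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%-24s %s", name, buf);
	fclose(f);
}

int main(void)
{
	show("pages_to_scan");
	show("scan_sleep_millisecs");
	show("alloc_sleep_millisecs");
	show("max_ptes_none");
	show("full_scans");
	show("pages_collapsed");
	return 0;
}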
2164
2165static void khugepaged_alloc_sleep(void)
2166{
2167 DEFINE_WAIT(wait);
2168 add_wait_queue(&khugepaged_wait, &wait);
2169 schedule_timeout_interruptible(
2170 msecs_to_jiffies(
2171 khugepaged_alloc_sleep_millisecs));
2172 remove_wait_queue(&khugepaged_wait, &wait);
2173}
2174
2175#ifndef CONFIG_NUMA
2176static struct page *khugepaged_alloc_hugepage(void)
2177{
2178 struct page *hpage;
2179
2180 do {
2181 hpage = alloc_hugepage(khugepaged_defrag());
2182 if (!hpage)
2183 khugepaged_alloc_sleep();
2184 } while (unlikely(!hpage) &&
2185 likely(khugepaged_enabled()));
2186 return hpage;
2187}
2188#endif
2189
2190static void khugepaged_loop(void)
2191{
2192 struct page *hpage;
2193
2194#ifdef CONFIG_NUMA
2195 hpage = NULL;
2196#endif
2197 while (likely(khugepaged_enabled())) {
2198#ifndef CONFIG_NUMA
2199 hpage = khugepaged_alloc_hugepage();
2200 if (unlikely(!hpage))
2201 break;
2202#else
2203 if (IS_ERR(hpage)) {
2204 khugepaged_alloc_sleep();
2205 hpage = NULL;
2206 }
2207#endif
2208
2209 khugepaged_do_scan(&hpage);
2210#ifndef CONFIG_NUMA
2211 if (hpage)
2212 put_page(hpage);
2213#endif
2214 try_to_freeze();
2215 if (unlikely(kthread_should_stop()))
2216 break;
2217 if (khugepaged_has_work()) {
2218 DEFINE_WAIT(wait);
2219 if (!khugepaged_scan_sleep_millisecs)
2220 continue;
2221 add_wait_queue(&khugepaged_wait, &wait);
2222 schedule_timeout_interruptible(
2223 msecs_to_jiffies(
2224 khugepaged_scan_sleep_millisecs));
2225 remove_wait_queue(&khugepaged_wait, &wait);
2226 } else if (khugepaged_enabled())
2227 wait_event_freezable(khugepaged_wait,
2228 khugepaged_wait_event());
2229 }
2230}
2231
2232static int khugepaged(void *none)
2233{
2234 struct mm_slot *mm_slot;
2235
2236 set_freezable();
2237 set_user_nice(current, 19);
2238
2239 /* serialize with start_khugepaged() */
2240 mutex_lock(&khugepaged_mutex);
2241
2242 for (;;) {
2243 mutex_unlock(&khugepaged_mutex);
2244 BUG_ON(khugepaged_thread != current);
2245 khugepaged_loop();
2246 BUG_ON(khugepaged_thread != current);
2247
2248 mutex_lock(&khugepaged_mutex);
2249 if (!khugepaged_enabled())
2250 break;
2251 if (unlikely(kthread_should_stop()))
2252 break;
2253 }
2254
2255 spin_lock(&khugepaged_mm_lock);
2256 mm_slot = khugepaged_scan.mm_slot;
2257 khugepaged_scan.mm_slot = NULL;
2258 if (mm_slot)
2259 collect_mm_slot(mm_slot);
2260 spin_unlock(&khugepaged_mm_lock);
2261
2262 khugepaged_thread = NULL;
2263 mutex_unlock(&khugepaged_mutex);
2264
2265 return 0;
2266}
2267
2268void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2269{
2270 struct page *page;
2271
2272 spin_lock(&mm->page_table_lock);
2273 if (unlikely(!pmd_trans_huge(*pmd))) {
2274 spin_unlock(&mm->page_table_lock);
2275 return;
2276 }
2277 page = pmd_page(*pmd);
2278 VM_BUG_ON(!page_count(page));
2279 get_page(page);
2280 spin_unlock(&mm->page_table_lock);
2281
2282 split_huge_page(page);
2283
2284 put_page(page);
2285 BUG_ON(pmd_trans_huge(*pmd));
2286}
2287
2288static void split_huge_page_address(struct mm_struct *mm,
2289 unsigned long address)
2290{
2291 pgd_t *pgd;
2292 pud_t *pud;
2293 pmd_t *pmd;
2294
2295 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2296
2297 pgd = pgd_offset(mm, address);
2298 if (!pgd_present(*pgd))
2299 return;
2300
2301 pud = pud_offset(pgd, address);
2302 if (!pud_present(*pud))
2303 return;
2304
2305 pmd = pmd_offset(pud, address);
2306 if (!pmd_present(*pmd))
2307 return;
2308 /*
 2309 * Caller holds the mmap_sem in write mode, so a huge pmd cannot
2310 * materialize from under us.
2311 */
2312 split_huge_page_pmd(mm, pmd);
2313}
2314
2315void __vma_adjust_trans_huge(struct vm_area_struct *vma,
2316 unsigned long start,
2317 unsigned long end,
2318 long adjust_next)
2319{
2320 /*
2321 * If the new start address isn't hpage aligned and it could
 2322 * previously contain a hugepage: check if we need to split
 2323 * a huge pmd.
2324 */
2325 if (start & ~HPAGE_PMD_MASK &&
2326 (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2327 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2328 split_huge_page_address(vma->vm_mm, start);
2329
2330 /*
2331 * If the new end address isn't hpage aligned and it could
 2332 * previously contain a hugepage: check if we need to split
 2333 * a huge pmd.
2334 */
2335 if (end & ~HPAGE_PMD_MASK &&
2336 (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2337 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2338 split_huge_page_address(vma->vm_mm, end);
2339
2340 /*
 2341 * If we're also updating vma->vm_next->vm_start, and the new
 2342 * vm_next->vm_start isn't hpage aligned and it could previously
 2343 * contain a hugepage: check if we need to split a huge pmd.
2344 */
2345 if (adjust_next > 0) {
2346 struct vm_area_struct *next = vma->vm_next;
2347 unsigned long nstart = next->vm_start;
2348 nstart += adjust_next << PAGE_SHIFT;
2349 if (nstart & ~HPAGE_PMD_MASK &&
2350 (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2351 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2352 split_huge_page_address(next->vm_mm, nstart);
2353 }
2354}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c4a3558589ab..bb0b7c128015 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma)
394 return 0; 394 return 0;
395} 395}
396 396
397static void clear_gigantic_page(struct page *page,
398 unsigned long addr, unsigned long sz)
399{
400 int i;
401 struct page *p = page;
402
403 might_sleep();
404 for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
405 cond_resched();
406 clear_user_highpage(p, addr + i * PAGE_SIZE);
407 }
408}
409static void clear_huge_page(struct page *page,
410 unsigned long addr, unsigned long sz)
411{
412 int i;
413
414 if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
415 clear_gigantic_page(page, addr, sz);
416 return;
417 }
418
419 might_sleep();
420 for (i = 0; i < sz/PAGE_SIZE; i++) {
421 cond_resched();
422 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
423 }
424}
425
426static void copy_user_gigantic_page(struct page *dst, struct page *src,
427 unsigned long addr, struct vm_area_struct *vma)
428{
429 int i;
430 struct hstate *h = hstate_vma(vma);
431 struct page *dst_base = dst;
432 struct page *src_base = src;
433
434 for (i = 0; i < pages_per_huge_page(h); ) {
435 cond_resched();
436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
437
438 i++;
439 dst = mem_map_next(dst, dst_base, i);
440 src = mem_map_next(src, src_base, i);
441 }
442}
443
444static void copy_user_huge_page(struct page *dst, struct page *src,
445 unsigned long addr, struct vm_area_struct *vma)
446{
447 int i;
448 struct hstate *h = hstate_vma(vma);
449
450 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
451 copy_user_gigantic_page(dst, src, addr, vma);
452 return;
453 }
454
455 might_sleep();
456 for (i = 0; i < pages_per_huge_page(h); i++) {
457 cond_resched();
458 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
459 }
460}
461
462static void copy_gigantic_page(struct page *dst, struct page *src) 397static void copy_gigantic_page(struct page *dst, struct page *src)
463{ 398{
464 int i; 399 int i;
@@ -1428,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1428 1363
1429 return sprintf(buf, "%lu\n", nr_huge_pages); 1364 return sprintf(buf, "%lu\n", nr_huge_pages);
1430} 1365}
1366
1431static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 1367static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1432 struct kobject *kobj, struct kobj_attribute *attr, 1368 struct kobject *kobj, struct kobj_attribute *attr,
1433 const char *buf, size_t len) 1369 const char *buf, size_t len)
@@ -1440,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1440 1376
1441 err = strict_strtoul(buf, 10, &count); 1377 err = strict_strtoul(buf, 10, &count);
1442 if (err) 1378 if (err)
1443 return 0; 1379 goto out;
1444 1380
1445 h = kobj_to_hstate(kobj, &nid); 1381 h = kobj_to_hstate(kobj, &nid);
1382 if (h->order >= MAX_ORDER) {
1383 err = -EINVAL;
1384 goto out;
1385 }
1386
1446 if (nid == NUMA_NO_NODE) { 1387 if (nid == NUMA_NO_NODE) {
1447 /* 1388 /*
1448 * global hstate attribute 1389 * global hstate attribute
@@ -1468,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1468 NODEMASK_FREE(nodes_allowed); 1409 NODEMASK_FREE(nodes_allowed);
1469 1410
1470 return len; 1411 return len;
1412out:
1413 NODEMASK_FREE(nodes_allowed);
1414 return err;
1471} 1415}
1472 1416
1473static ssize_t nr_hugepages_show(struct kobject *kobj, 1417static ssize_t nr_hugepages_show(struct kobject *kobj,
@@ -1510,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1510 struct hstate *h = kobj_to_hstate(kobj, NULL); 1454 struct hstate *h = kobj_to_hstate(kobj, NULL);
1511 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 1455 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1512} 1456}
1457
1513static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 1458static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1514 struct kobj_attribute *attr, const char *buf, size_t count) 1459 struct kobj_attribute *attr, const char *buf, size_t count)
1515{ 1460{
@@ -1517,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1517 unsigned long input; 1462 unsigned long input;
1518 struct hstate *h = kobj_to_hstate(kobj, NULL); 1463 struct hstate *h = kobj_to_hstate(kobj, NULL);
1519 1464
1465 if (h->order >= MAX_ORDER)
1466 return -EINVAL;
1467
1520 err = strict_strtoul(buf, 10, &input); 1468 err = strict_strtoul(buf, 10, &input);
1521 if (err) 1469 if (err)
1522 return 0; 1470 return err;
1523 1471
1524 spin_lock(&hugetlb_lock); 1472 spin_lock(&hugetlb_lock);
1525 h->nr_overcommit_huge_pages = input; 1473 h->nr_overcommit_huge_pages = input;
@@ -1922,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1922{ 1870{
1923 struct hstate *h = &default_hstate; 1871 struct hstate *h = &default_hstate;
1924 unsigned long tmp; 1872 unsigned long tmp;
1873 int ret;
1925 1874
1926 if (!write) 1875 if (!write)
1927 tmp = h->max_huge_pages; 1876 tmp = h->max_huge_pages;
1928 1877
1878 if (write && h->order >= MAX_ORDER)
1879 return -EINVAL;
1880
1929 table->data = &tmp; 1881 table->data = &tmp;
1930 table->maxlen = sizeof(unsigned long); 1882 table->maxlen = sizeof(unsigned long);
1931 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1883 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1884 if (ret)
1885 goto out;
1932 1886
1933 if (write) { 1887 if (write) {
1934 NODEMASK_ALLOC(nodemask_t, nodes_allowed, 1888 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
@@ -1943,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1943 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 1897 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1944 NODEMASK_FREE(nodes_allowed); 1898 NODEMASK_FREE(nodes_allowed);
1945 } 1899 }
1946 1900out:
1947 return 0; 1901 return ret;
1948} 1902}
1949 1903
1950int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1904int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -1982,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1982{ 1936{
1983 struct hstate *h = &default_hstate; 1937 struct hstate *h = &default_hstate;
1984 unsigned long tmp; 1938 unsigned long tmp;
1939 int ret;
1985 1940
1986 if (!write) 1941 if (!write)
1987 tmp = h->nr_overcommit_huge_pages; 1942 tmp = h->nr_overcommit_huge_pages;
1988 1943
1944 if (write && h->order >= MAX_ORDER)
1945 return -EINVAL;
1946
1989 table->data = &tmp; 1947 table->data = &tmp;
1990 table->maxlen = sizeof(unsigned long); 1948 table->maxlen = sizeof(unsigned long);
1991 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1949 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1950 if (ret)
1951 goto out;
1992 1952
1993 if (write) { 1953 if (write) {
1994 spin_lock(&hugetlb_lock); 1954 spin_lock(&hugetlb_lock);
1995 h->nr_overcommit_huge_pages = tmp; 1955 h->nr_overcommit_huge_pages = tmp;
1996 spin_unlock(&hugetlb_lock); 1956 spin_unlock(&hugetlb_lock);
1997 } 1957 }
1998 1958out:
1999 return 0; 1959 return ret;
2000} 1960}
2001 1961
2002#endif /* CONFIG_SYSCTL */ 1962#endif /* CONFIG_SYSCTL */
@@ -2454,7 +2414,8 @@ retry_avoidcopy:
2454 return VM_FAULT_OOM; 2414 return VM_FAULT_OOM;
2455 } 2415 }
2456 2416
2457 copy_user_huge_page(new_page, old_page, address, vma); 2417 copy_user_huge_page(new_page, old_page, address, vma,
2418 pages_per_huge_page(h));
2458 __SetPageUptodate(new_page); 2419 __SetPageUptodate(new_page);
2459 2420
2460 /* 2421 /*
@@ -2558,7 +2519,7 @@ retry:
2558 ret = -PTR_ERR(page); 2519 ret = -PTR_ERR(page);
2559 goto out; 2520 goto out;
2560 } 2521 }
2561 clear_huge_page(page, address, huge_page_size(h)); 2522 clear_huge_page(page, address, pages_per_huge_page(h));
2562 __SetPageUptodate(page); 2523 __SetPageUptodate(page);
2563 2524
2564 if (vma->vm_flags & VM_MAYSHARE) { 2525 if (vma->vm_flags & VM_MAYSHARE) {
@@ -2738,7 +2699,8 @@ out_page_table_lock:
2738 unlock_page(pagecache_page); 2699 unlock_page(pagecache_page);
2739 put_page(pagecache_page); 2700 put_page(pagecache_page);
2740 } 2701 }
2741 unlock_page(page); 2702 if (page != pagecache_page)
2703 unlock_page(page);
2742 2704
2743out_mutex: 2705out_mutex:
2744 mutex_unlock(&hugetlb_instantiation_mutex); 2706 mutex_unlock(&hugetlb_instantiation_mutex);
diff --git a/mm/internal.h b/mm/internal.h
index dedb0aff673f..69488205723d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -134,6 +134,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
134 } 134 }
135} 135}
136 136
137#ifdef CONFIG_TRANSPARENT_HUGEPAGE
138extern unsigned long vma_address(struct page *page,
139 struct vm_area_struct *vma);
140#endif
137#else /* !CONFIG_MMU */ 141#else /* !CONFIG_MMU */
138static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 142static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
139{ 143{
@@ -243,7 +247,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
243 247
244int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 248int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
245 unsigned long start, int len, unsigned int foll_flags, 249 unsigned long start, int len, unsigned int foll_flags,
246 struct page **pages, struct vm_area_struct **vmas); 250 struct page **pages, struct vm_area_struct **vmas,
251 int *nonblocking);
247 252
248#define ZONE_RECLAIM_NOSCAN -2 253#define ZONE_RECLAIM_NOSCAN -2
249#define ZONE_RECLAIM_FULL -1 254#define ZONE_RECLAIM_FULL -1
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index 177a5169bbde..ff0d9779cec8 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void)
75 * after the module is removed. 75 * after the module is removed.
76 */ 76 */
77 for (i = 0; i < 10; i++) { 77 for (i = 0; i < 10; i++) {
78 elem = kmalloc(sizeof(*elem), GFP_KERNEL); 78 elem = kzalloc(sizeof(*elem), GFP_KERNEL);
79 pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); 79 pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
80 if (!elem) 80 if (!elem)
81 return -ENOMEM; 81 return -ENOMEM;
82 memset(elem, 0, sizeof(*elem));
83 INIT_LIST_HEAD(&elem->list); 82 INIT_LIST_HEAD(&elem->list);
84
85 list_add_tail(&elem->list, &test_list); 83 list_add_tail(&elem->list, &test_list);
86 } 84 }
87 85
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index bd9bc214091b..84225f3b7190 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -113,7 +113,9 @@
113#define BYTES_PER_POINTER sizeof(void *) 113#define BYTES_PER_POINTER sizeof(void *)
114 114
115/* GFP bitmask for kmemleak internal allocations */ 115/* GFP bitmask for kmemleak internal allocations */
116#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) 116#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
117 __GFP_NORETRY | __GFP_NOMEMALLOC | \
118 __GFP_NOWARN)
117 119
118/* scanning area inside a memory block */ 120/* scanning area inside a memory block */
119struct kmemleak_scan_area { 121struct kmemleak_scan_area {
@@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
511 struct kmemleak_object *object; 513 struct kmemleak_object *object;
512 struct prio_tree_node *node; 514 struct prio_tree_node *node;
513 515
514 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); 516 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
515 if (!object) { 517 if (!object) {
516 kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); 518 pr_warning("Cannot allocate a kmemleak_object structure\n");
519 kmemleak_disable();
517 return NULL; 520 return NULL;
518 } 521 }
519 522
@@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
734 return; 737 return;
735 } 738 }
736 739
737 area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); 740 area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
738 if (!area) { 741 if (!area) {
739 kmemleak_warn("Cannot allocate a scan area\n"); 742 pr_warning("Cannot allocate a scan area\n");
740 goto out; 743 goto out;
741 } 744 }
742 745
diff --git a/mm/ksm.c b/mm/ksm.c
index 65ab5c7067d9..c2b2a94f9d67 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -34,6 +34,7 @@
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/freezer.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
39#include "internal.h" 40#include "internal.h"
@@ -411,6 +412,20 @@ out:
411 up_read(&mm->mmap_sem); 412 up_read(&mm->mmap_sem);
412} 413}
413 414
415static struct page *page_trans_compound_anon(struct page *page)
416{
417 if (PageTransCompound(page)) {
418 struct page *head = compound_trans_head(page);
419 /*
 420 * head may actually be split and freed from under
421 * us but it's ok here.
422 */
423 if (PageAnon(head))
424 return head;
425 }
426 return NULL;
427}
428
414static struct page *get_mergeable_page(struct rmap_item *rmap_item) 429static struct page *get_mergeable_page(struct rmap_item *rmap_item)
415{ 430{
416 struct mm_struct *mm = rmap_item->mm; 431 struct mm_struct *mm = rmap_item->mm;
@@ -430,7 +445,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
430 page = follow_page(vma, addr, FOLL_GET); 445 page = follow_page(vma, addr, FOLL_GET);
431 if (IS_ERR_OR_NULL(page)) 446 if (IS_ERR_OR_NULL(page))
432 goto out; 447 goto out;
433 if (PageAnon(page)) { 448 if (PageAnon(page) || page_trans_compound_anon(page)) {
434 flush_anon_page(vma, page, addr); 449 flush_anon_page(vma, page, addr);
435 flush_dcache_page(page); 450 flush_dcache_page(page);
436 } else { 451 } else {
@@ -708,6 +723,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
708 if (addr == -EFAULT) 723 if (addr == -EFAULT)
709 goto out; 724 goto out;
710 725
726 BUG_ON(PageTransCompound(page));
711 ptep = page_check_address(page, mm, addr, &ptl, 0); 727 ptep = page_check_address(page, mm, addr, &ptl, 0);
712 if (!ptep) 728 if (!ptep)
713 goto out; 729 goto out;
@@ -783,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
783 goto out; 799 goto out;
784 800
785 pmd = pmd_offset(pud, addr); 801 pmd = pmd_offset(pud, addr);
802 BUG_ON(pmd_trans_huge(*pmd));
786 if (!pmd_present(*pmd)) 803 if (!pmd_present(*pmd))
787 goto out; 804 goto out;
788 805
@@ -800,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
800 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 817 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
801 818
802 page_remove_rmap(page); 819 page_remove_rmap(page);
820 if (!page_mapped(page))
821 try_to_free_swap(page);
803 put_page(page); 822 put_page(page);
804 823
805 pte_unmap_unlock(ptep, ptl); 824 pte_unmap_unlock(ptep, ptl);
@@ -808,6 +827,33 @@ out:
808 return err; 827 return err;
809} 828}
810 829
830static int page_trans_compound_anon_split(struct page *page)
831{
832 int ret = 0;
833 struct page *transhuge_head = page_trans_compound_anon(page);
834 if (transhuge_head) {
835 /* Get the reference on the head to split it. */
836 if (get_page_unless_zero(transhuge_head)) {
837 /*
838 * Recheck we got the reference while the head
839 * was still anonymous.
840 */
841 if (PageAnon(transhuge_head))
842 ret = split_huge_page(transhuge_head);
843 else
844 /*
 845 * Retry later if split_huge_page ran
846 * from under us.
847 */
848 ret = 1;
849 put_page(transhuge_head);
850 } else
 851 /* Retry later if split_huge_page ran from under us. */
852 ret = 1;
853 }
854 return ret;
855}
856
811/* 857/*
812 * try_to_merge_one_page - take two pages and merge them into one 858 * try_to_merge_one_page - take two pages and merge them into one
813 * @vma: the vma that holds the pte pointing to page 859 * @vma: the vma that holds the pte pointing to page
@@ -828,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
828 874
829 if (!(vma->vm_flags & VM_MERGEABLE)) 875 if (!(vma->vm_flags & VM_MERGEABLE))
830 goto out; 876 goto out;
877 if (PageTransCompound(page) && page_trans_compound_anon_split(page))
878 goto out;
879 BUG_ON(PageTransCompound(page));
831 if (!PageAnon(page)) 880 if (!PageAnon(page))
832 goto out; 881 goto out;
833 882
@@ -1247,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1247 1296
1248 slot = ksm_scan.mm_slot; 1297 slot = ksm_scan.mm_slot;
1249 if (slot == &ksm_mm_head) { 1298 if (slot == &ksm_mm_head) {
1299 /*
1300 * A number of pages can hang around indefinitely on per-cpu
1301 * pagevecs, raised page count preventing write_protect_page
1302 * from merging them. Though it doesn't really matter much,
1303 * it is puzzling to see some stuck in pages_volatile until
1304 * other activity jostles them out, and they also prevented
1305 * LTP's KSM test from succeeding deterministically; so drain
1306 * them here (here rather than on entry to ksm_do_scan(),
1307 * so we don't IPI too often when pages_to_scan is set low).
1308 */
1309 lru_add_drain_all();
1310
1250 root_unstable_tree = RB_ROOT; 1311 root_unstable_tree = RB_ROOT;
1251 1312
1252 spin_lock(&ksm_mmlist_lock); 1313 spin_lock(&ksm_mmlist_lock);
@@ -1277,7 +1338,13 @@ next_mm:
1277 if (ksm_test_exit(mm)) 1338 if (ksm_test_exit(mm))
1278 break; 1339 break;
1279 *page = follow_page(vma, ksm_scan.address, FOLL_GET); 1340 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1280 if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { 1341 if (IS_ERR_OR_NULL(*page)) {
1342 ksm_scan.address += PAGE_SIZE;
1343 cond_resched();
1344 continue;
1345 }
1346 if (PageAnon(*page) ||
1347 page_trans_compound_anon(*page)) {
1281 flush_anon_page(vma, *page, ksm_scan.address); 1348 flush_anon_page(vma, *page, ksm_scan.address);
1282 flush_dcache_page(*page); 1349 flush_dcache_page(*page);
1283 rmap_item = get_next_rmap_item(slot, 1350 rmap_item = get_next_rmap_item(slot,
@@ -1291,8 +1358,7 @@ next_mm:
1291 up_read(&mm->mmap_sem); 1358 up_read(&mm->mmap_sem);
1292 return rmap_item; 1359 return rmap_item;
1293 } 1360 }
1294 if (!IS_ERR_OR_NULL(*page)) 1361 put_page(*page);
1295 put_page(*page);
1296 ksm_scan.address += PAGE_SIZE; 1362 ksm_scan.address += PAGE_SIZE;
1297 cond_resched(); 1363 cond_resched();
1298 } 1364 }
@@ -1352,7 +1418,7 @@ static void ksm_do_scan(unsigned int scan_npages)
1352 struct rmap_item *rmap_item; 1418 struct rmap_item *rmap_item;
1353 struct page *uninitialized_var(page); 1419 struct page *uninitialized_var(page);
1354 1420
1355 while (scan_npages--) { 1421 while (scan_npages-- && likely(!freezing(current))) {
1356 cond_resched(); 1422 cond_resched();
1357 rmap_item = scan_get_next_rmap_item(&page); 1423 rmap_item = scan_get_next_rmap_item(&page);
1358 if (!rmap_item) 1424 if (!rmap_item)
@@ -1370,6 +1436,7 @@ static int ksmd_should_run(void)
1370 1436
1371static int ksm_scan_thread(void *nothing) 1437static int ksm_scan_thread(void *nothing)
1372{ 1438{
1439 set_freezable();
1373 set_user_nice(current, 5); 1440 set_user_nice(current, 5);
1374 1441
1375 while (!kthread_should_stop()) { 1442 while (!kthread_should_stop()) {
@@ -1378,11 +1445,13 @@ static int ksm_scan_thread(void *nothing)
1378 ksm_do_scan(ksm_thread_pages_to_scan); 1445 ksm_do_scan(ksm_thread_pages_to_scan);
1379 mutex_unlock(&ksm_thread_mutex); 1446 mutex_unlock(&ksm_thread_mutex);
1380 1447
1448 try_to_freeze();
1449
1381 if (ksmd_should_run()) { 1450 if (ksmd_should_run()) {
1382 schedule_timeout_interruptible( 1451 schedule_timeout_interruptible(
1383 msecs_to_jiffies(ksm_thread_sleep_millisecs)); 1452 msecs_to_jiffies(ksm_thread_sleep_millisecs));
1384 } else { 1453 } else {
1385 wait_event_interruptible(ksm_thread_wait, 1454 wait_event_freezable(ksm_thread_wait,
1386 ksmd_should_run() || kthread_should_stop()); 1455 ksmd_should_run() || kthread_should_stop());
1387 } 1456 }
1388 } 1457 }
@@ -1724,8 +1793,13 @@ static int ksm_memory_callback(struct notifier_block *self,
1724 /* 1793 /*
1725 * Keep it very simple for now: just lock out ksmd and 1794 * Keep it very simple for now: just lock out ksmd and
1726 * MADV_UNMERGEABLE while any memory is going offline. 1795 * MADV_UNMERGEABLE while any memory is going offline.
1796 * mutex_lock_nested() is necessary because lockdep was alarmed
1797 * that here we take ksm_thread_mutex inside notifier chain
1798 * mutex, and later take notifier chain mutex inside
1799 * ksm_thread_mutex to unlock it. But that's safe because both
1800 * are inside mem_hotplug_mutex.
1727 */ 1801 */
1728 mutex_lock(&ksm_thread_mutex); 1802 mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING);
1729 break; 1803 break;
1730 1804
1731 case MEM_OFFLINE: 1805 case MEM_OFFLINE:
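
The ksm.c hunks above teach the scanner to accept anonymous transparent hugepages and to split them (page_trans_compound_anon_split()) before write_protect_page() and replace_page() run, which is what makes the new BUG_ON(PageTransCompound(page)) assertions safe. KSM still only scans ranges an application has opted in to; as a rough userspace illustration (not part of this patch — the buffer size, fill pattern and program structure are made up for the sketch):

	#define _GNU_SOURCE
	#include <sys/mman.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		size_t len = 16 * 1024 * 1024;	/* arbitrary example size */
		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return 1;
		/* opt this range into KSM scanning; ksmd may merge identical pages */
		if (madvise(buf, len, MADV_MERGEABLE))
			return 1;
		memset(buf, 0x5a, len);	/* duplicate-heavy contents */
		pause();		/* keep the mapping alive while ksmd scans */
		return 0;
	}

With this series the pages backing such a range may be THP; ksmd now splits them and merges the resulting 4k pages instead of skipping them.
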
diff --git a/mm/madvise.c b/mm/madvise.c
index 319528b8db74..2221491ed503 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
71 if (error) 71 if (error)
72 goto out; 72 goto out;
73 break; 73 break;
74 case MADV_HUGEPAGE:
75 case MADV_NOHUGEPAGE:
76 error = hugepage_madvise(vma, &new_flags, behavior);
77 if (error)
78 goto out;
79 break;
74 } 80 }
75 81
76 if (new_flags == vma->vm_flags) { 82 if (new_flags == vma->vm_flags) {
@@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior)
283 case MADV_MERGEABLE: 289 case MADV_MERGEABLE:
284 case MADV_UNMERGEABLE: 290 case MADV_UNMERGEABLE:
285#endif 291#endif
292#ifdef CONFIG_TRANSPARENT_HUGEPAGE
293 case MADV_HUGEPAGE:
294 case MADV_NOHUGEPAGE:
295#endif
286 return 1; 296 return 1;
287 297
288 default: 298 default:
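
The madvise.c change wires MADV_HUGEPAGE and MADV_NOHUGEPAGE into madvise_behavior_valid() and hugepage_madvise() only when CONFIG_TRANSPARENT_HUGEPAGE is set; on other kernels the call fails with EINVAL. A minimal userspace sketch of the new hints (illustrative only; set_thp_hint is a hypothetical helper name, error handling trimmed):

	#include <sys/mman.h>

	/*
	 * Ask the kernel to prefer (or avoid) transparent hugepages for an
	 * existing anonymous mapping.  Returns 0 on success, -1 with errno
	 * set (EINVAL on a kernel built without CONFIG_TRANSPARENT_HUGEPAGE).
	 */
	static int set_thp_hint(void *addr, size_t len, int want_huge)
	{
		return madvise(addr, len,
			       want_huge ? MADV_HUGEPAGE : MADV_NOHUGEPAGE);
	}

When the global THP policy is set to madvise-only, only regions flagged with MADV_HUGEPAGE are eligible for huge pages; MADV_NOHUGEPAGE excludes a region even under an "always" policy.
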
diff --git a/mm/memblock.c b/mm/memblock.c
index 400dc62697d7..4618fda975a0 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -137,8 +137,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
137 137
138 BUG_ON(0 == size); 138 BUG_ON(0 == size);
139 139
140 size = memblock_align_up(size, align);
141
142 /* Pump up max_addr */ 140 /* Pump up max_addr */
143 if (end == MEMBLOCK_ALLOC_ACCESSIBLE) 141 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
144 end = memblock.current_limit; 142 end = memblock.current_limit;
@@ -683,13 +681,13 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
683 681
684int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) 682int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
685{ 683{
686 int idx = memblock_search(&memblock.reserved, base); 684 int idx = memblock_search(&memblock.memory, base);
687 685
688 if (idx == -1) 686 if (idx == -1)
689 return 0; 687 return 0;
690 return memblock.reserved.regions[idx].base <= base && 688 return memblock.memory.regions[idx].base <= base &&
691 (memblock.reserved.regions[idx].base + 689 (memblock.memory.regions[idx].base +
692 memblock.reserved.regions[idx].size) >= (base + size); 690 memblock.memory.regions[idx].size) >= (base + size);
693} 691}
694 692
695int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) 693int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
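
The memblock.c hunk makes memblock_is_region_memory() answer "does this range lie entirely inside registered RAM?" by searching memblock.memory rather than memblock.reserved (the old code effectively tested whether the range was reserved). The containment test after the lookup is unchanged; as a standalone sketch of that test, with simplified types and a hypothetical function name:

	#include <stdbool.h>
	#include <stdint.h>

	/*
	 * True if [base, base + size) lies entirely inside the region
	 * [rbase, rbase + rsize) that memblock_search() returned.
	 */
	static bool range_within_region(uint64_t rbase, uint64_t rsize,
					uint64_t base, uint64_t size)
	{
		return rbase <= base && rbase + rsize >= base + size;
	}
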
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2efa8ea07ff7..da53a252b259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,7 +61,14 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
61#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 61#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
62/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 62/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
63int do_swap_account __read_mostly; 63int do_swap_account __read_mostly;
64static int really_do_swap_account __initdata = 1; /* for remember boot option*/ 64
 65/* to remember the boot option */
66#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
67static int really_do_swap_account __initdata = 1;
68#else
69static int really_do_swap_account __initdata = 0;
70#endif
71
65#else 72#else
66#define do_swap_account (0) 73#define do_swap_account (0)
67#endif 74#endif
@@ -278,7 +285,7 @@ enum move_type {
278 285
279/* "mc" and its members are protected by cgroup_mutex */ 286/* "mc" and its members are protected by cgroup_mutex */
280static struct move_charge_struct { 287static struct move_charge_struct {
281 spinlock_t lock; /* for from, to, moving_task */ 288 spinlock_t lock; /* for from, to */
282 struct mem_cgroup *from; 289 struct mem_cgroup *from;
283 struct mem_cgroup *to; 290 struct mem_cgroup *to;
284 unsigned long precharge; 291 unsigned long precharge;
@@ -593,23 +600,24 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
593} 600}
594 601
595static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 602static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
596 struct page_cgroup *pc, 603 bool file, int nr_pages)
597 bool charge)
598{ 604{
599 int val = (charge) ? 1 : -1;
600
601 preempt_disable(); 605 preempt_disable();
602 606
603 if (PageCgroupCache(pc)) 607 if (file)
604 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); 608 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
605 else 609 else
606 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); 610 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
607 611
608 if (charge) 612 /* pagein of a big page is an event. So, ignore page size */
613 if (nr_pages > 0)
609 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); 614 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
610 else 615 else {
611 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); 616 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
612 __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); 617 nr_pages = -nr_pages; /* for event */
618 }
619
620 __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages);
613 621
614 preempt_enable(); 622 preempt_enable();
615} 623}
@@ -808,12 +816,12 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
808 * removed from global LRU. 816 * removed from global LRU.
809 */ 817 */
810 mz = page_cgroup_zoneinfo(pc); 818 mz = page_cgroup_zoneinfo(pc);
811 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 819 /* huge page split is done under lru_lock. so, we have no races. */
820 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
812 if (mem_cgroup_is_root(pc->mem_cgroup)) 821 if (mem_cgroup_is_root(pc->mem_cgroup))
813 return; 822 return;
814 VM_BUG_ON(list_empty(&pc->lru)); 823 VM_BUG_ON(list_empty(&pc->lru));
815 list_del_init(&pc->lru); 824 list_del_init(&pc->lru);
816 return;
817} 825}
818 826
819void mem_cgroup_del_lru(struct page *page) 827void mem_cgroup_del_lru(struct page *page)
@@ -830,13 +838,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
830 return; 838 return;
831 839
832 pc = lookup_page_cgroup(page); 840 pc = lookup_page_cgroup(page);
833 /*
834 * Used bit is set without atomic ops but after smp_wmb().
835 * For making pc->mem_cgroup visible, insert smp_rmb() here.
836 */
837 smp_rmb();
838 /* unused or root page is not rotated. */ 841 /* unused or root page is not rotated. */
839 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) 842 if (!PageCgroupUsed(pc))
843 return;
844 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
845 smp_rmb();
846 if (mem_cgroup_is_root(pc->mem_cgroup))
840 return; 847 return;
841 mz = page_cgroup_zoneinfo(pc); 848 mz = page_cgroup_zoneinfo(pc);
842 list_move(&pc->lru, &mz->lists[lru]); 849 list_move(&pc->lru, &mz->lists[lru]);
@@ -851,16 +858,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
851 return; 858 return;
852 pc = lookup_page_cgroup(page); 859 pc = lookup_page_cgroup(page);
853 VM_BUG_ON(PageCgroupAcctLRU(pc)); 860 VM_BUG_ON(PageCgroupAcctLRU(pc));
854 /*
855 * Used bit is set without atomic ops but after smp_wmb().
856 * For making pc->mem_cgroup visible, insert smp_rmb() here.
857 */
858 smp_rmb();
859 if (!PageCgroupUsed(pc)) 861 if (!PageCgroupUsed(pc))
860 return; 862 return;
861 863 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
864 smp_rmb();
862 mz = page_cgroup_zoneinfo(pc); 865 mz = page_cgroup_zoneinfo(pc);
863 MEM_CGROUP_ZSTAT(mz, lru) += 1; 866 /* huge page split is done under lru_lock. so, we have no races. */
867 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
864 SetPageCgroupAcctLRU(pc); 868 SetPageCgroupAcctLRU(pc);
865 if (mem_cgroup_is_root(pc->mem_cgroup)) 869 if (mem_cgroup_is_root(pc->mem_cgroup))
866 return; 870 return;
@@ -1024,14 +1028,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1024 return NULL; 1028 return NULL;
1025 1029
1026 pc = lookup_page_cgroup(page); 1030 pc = lookup_page_cgroup(page);
1027 /*
1028 * Used bit is set without atomic ops but after smp_wmb().
1029 * For making pc->mem_cgroup visible, insert smp_rmb() here.
1030 */
1031 smp_rmb();
1032 if (!PageCgroupUsed(pc)) 1031 if (!PageCgroupUsed(pc))
1033 return NULL; 1032 return NULL;
1034 1033 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1034 smp_rmb();
1035 mz = page_cgroup_zoneinfo(pc); 1035 mz = page_cgroup_zoneinfo(pc);
1036 if (!mz) 1036 if (!mz)
1037 return NULL; 1037 return NULL;
@@ -1079,7 +1079,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1079 case 0: 1079 case 0:
1080 list_move(&page->lru, dst); 1080 list_move(&page->lru, dst);
1081 mem_cgroup_del_lru(page); 1081 mem_cgroup_del_lru(page);
1082 nr_taken++; 1082 nr_taken += hpage_nr_pages(page);
1083 break; 1083 break;
1084 case -EBUSY: 1084 case -EBUSY:
1085 /* we don't affect global LRU but rotate in our LRU */ 1085 /* we don't affect global LRU but rotate in our LRU */
@@ -1113,6 +1113,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
1113 return false; 1113 return false;
1114} 1114}
1115 1115
1116/**
1117 * mem_cgroup_check_margin - check if the memory cgroup allows charging
1118 * @mem: memory cgroup to check
1119 * @bytes: the number of bytes the caller intends to charge
1120 *
 1121 * Returns true if @mem has enough margin left to be charged @bytes,
 1122 * false if such a charge would exceed the limit.
1123 */
1124static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes)
1125{
1126 if (!res_counter_check_margin(&mem->res, bytes))
1127 return false;
1128 if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes))
1129 return false;
1130 return true;
1131}
1132
1116static unsigned int get_swappiness(struct mem_cgroup *memcg) 1133static unsigned int get_swappiness(struct mem_cgroup *memcg)
1117{ 1134{
1118 struct cgroup *cgrp = memcg->css.cgroup; 1135 struct cgroup *cgrp = memcg->css.cgroup;
@@ -1304,8 +1321,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1304 u64 limit; 1321 u64 limit;
1305 u64 memsw; 1322 u64 memsw;
1306 1323
1307 limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + 1324 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1308 total_swap_pages; 1325 limit += total_swap_pages << PAGE_SHIFT;
1326
1309 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1327 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1310 /* 1328 /*
1311 * If memsw is finite and limits the amount of swap space available 1329 * If memsw is finite and limits the amount of swap space available
@@ -1592,11 +1610,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1592 * possibility of race condition. If there is, we take a lock. 1610 * possibility of race condition. If there is, we take a lock.
1593 */ 1611 */
1594 1612
1595static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) 1613void mem_cgroup_update_page_stat(struct page *page,
1614 enum mem_cgroup_page_stat_item idx, int val)
1596{ 1615{
1597 struct mem_cgroup *mem; 1616 struct mem_cgroup *mem;
1598 struct page_cgroup *pc = lookup_page_cgroup(page); 1617 struct page_cgroup *pc = lookup_page_cgroup(page);
1599 bool need_unlock = false; 1618 bool need_unlock = false;
1619 unsigned long uninitialized_var(flags);
1600 1620
1601 if (unlikely(!pc)) 1621 if (unlikely(!pc))
1602 return; 1622 return;
@@ -1606,39 +1626,36 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
1606 if (unlikely(!mem || !PageCgroupUsed(pc))) 1626 if (unlikely(!mem || !PageCgroupUsed(pc)))
1607 goto out; 1627 goto out;
1608 /* pc->mem_cgroup is unstable ? */ 1628 /* pc->mem_cgroup is unstable ? */
1609 if (unlikely(mem_cgroup_stealed(mem))) { 1629 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
1610 /* take a lock against to access pc->mem_cgroup */ 1630 /* take a lock against to access pc->mem_cgroup */
1611 lock_page_cgroup(pc); 1631 move_lock_page_cgroup(pc, &flags);
1612 need_unlock = true; 1632 need_unlock = true;
1613 mem = pc->mem_cgroup; 1633 mem = pc->mem_cgroup;
1614 if (!mem || !PageCgroupUsed(pc)) 1634 if (!mem || !PageCgroupUsed(pc))
1615 goto out; 1635 goto out;
1616 } 1636 }
1617 1637
1618 this_cpu_add(mem->stat->count[idx], val);
1619
1620 switch (idx) { 1638 switch (idx) {
1621 case MEM_CGROUP_STAT_FILE_MAPPED: 1639 case MEMCG_NR_FILE_MAPPED:
1622 if (val > 0) 1640 if (val > 0)
1623 SetPageCgroupFileMapped(pc); 1641 SetPageCgroupFileMapped(pc);
1624 else if (!page_mapped(page)) 1642 else if (!page_mapped(page))
1625 ClearPageCgroupFileMapped(pc); 1643 ClearPageCgroupFileMapped(pc);
1644 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1626 break; 1645 break;
1627 default: 1646 default:
1628 BUG(); 1647 BUG();
1629 } 1648 }
1630 1649
1650 this_cpu_add(mem->stat->count[idx], val);
1651
1631out: 1652out:
1632 if (unlikely(need_unlock)) 1653 if (unlikely(need_unlock))
1633 unlock_page_cgroup(pc); 1654 move_unlock_page_cgroup(pc, &flags);
1634 rcu_read_unlock(); 1655 rcu_read_unlock();
1635 return; 1656 return;
1636} 1657}
1637 1658EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1638void mem_cgroup_update_file_mapped(struct page *page, int val)
1639{
1640 mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
1641}
1642 1659
1643/* 1660/*
1644 * size of first charge trial. "32" comes from vmscan.c's magic value. 1661 * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1834,27 +1851,39 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1834 if (likely(!ret)) 1851 if (likely(!ret))
1835 return CHARGE_OK; 1852 return CHARGE_OK;
1836 1853
1854 res_counter_uncharge(&mem->res, csize);
1837 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 1855 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
1838 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 1856 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1839 } else 1857 } else
1840 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 1858 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1841 1859 /*
1842 if (csize > PAGE_SIZE) /* change csize and retry */ 1860 * csize can be either a huge page (HPAGE_SIZE), a batch of
1861 * regular pages (CHARGE_SIZE), or a single regular page
1862 * (PAGE_SIZE).
1863 *
1864 * Never reclaim on behalf of optional batching, retry with a
1865 * single page instead.
1866 */
1867 if (csize == CHARGE_SIZE)
1843 return CHARGE_RETRY; 1868 return CHARGE_RETRY;
1844 1869
1845 if (!(gfp_mask & __GFP_WAIT)) 1870 if (!(gfp_mask & __GFP_WAIT))
1846 return CHARGE_WOULDBLOCK; 1871 return CHARGE_WOULDBLOCK;
1847 1872
1848 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 1873 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1849 gfp_mask, flags); 1874 gfp_mask, flags);
1875 if (mem_cgroup_check_margin(mem_over_limit, csize))
1876 return CHARGE_RETRY;
1850 /* 1877 /*
1851 * try_to_free_mem_cgroup_pages() might not give us a full 1878 * Even though the limit is exceeded at this point, reclaim
1852 * picture of reclaim. Some pages are reclaimed and might be 1879 * may have been able to free some pages. Retry the charge
1853 * moved to swap cache or just unmapped from the cgroup. 1880 * before killing the task.
1854 * Check the limit again to see if the reclaim reduced the 1881 *
1855 * current usage of the cgroup before giving up 1882 * Only for regular pages, though: huge pages are rather
1883 * unlikely to succeed so close to the limit, and we fall back
1884 * to regular pages anyway in case of failure.
1856 */ 1885 */
1857 if (ret || mem_cgroup_check_under_limit(mem_over_limit)) 1886 if (csize == PAGE_SIZE && ret)
1858 return CHARGE_RETRY; 1887 return CHARGE_RETRY;
1859 1888
1860 /* 1889 /*
@@ -1879,12 +1908,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1879 * oom-killer can be invoked. 1908 * oom-killer can be invoked.
1880 */ 1909 */
1881static int __mem_cgroup_try_charge(struct mm_struct *mm, 1910static int __mem_cgroup_try_charge(struct mm_struct *mm,
1882 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) 1911 gfp_t gfp_mask,
1912 struct mem_cgroup **memcg, bool oom,
1913 int page_size)
1883{ 1914{
1884 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 1915 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1885 struct mem_cgroup *mem = NULL; 1916 struct mem_cgroup *mem = NULL;
1886 int ret; 1917 int ret;
1887 int csize = CHARGE_SIZE; 1918 int csize = max(CHARGE_SIZE, (unsigned long) page_size);
1888 1919
1889 /* 1920 /*
1890 * Unlike gloval-vm's OOM-kill, we're not in memory shortage 1921 * Unlike gloval-vm's OOM-kill, we're not in memory shortage
@@ -1909,7 +1940,7 @@ again:
1909 VM_BUG_ON(css_is_removed(&mem->css)); 1940 VM_BUG_ON(css_is_removed(&mem->css));
1910 if (mem_cgroup_is_root(mem)) 1941 if (mem_cgroup_is_root(mem))
1911 goto done; 1942 goto done;
1912 if (consume_stock(mem)) 1943 if (page_size == PAGE_SIZE && consume_stock(mem))
1913 goto done; 1944 goto done;
1914 css_get(&mem->css); 1945 css_get(&mem->css);
1915 } else { 1946 } else {
@@ -1917,23 +1948,22 @@ again:
1917 1948
1918 rcu_read_lock(); 1949 rcu_read_lock();
1919 p = rcu_dereference(mm->owner); 1950 p = rcu_dereference(mm->owner);
1920 VM_BUG_ON(!p);
1921 /* 1951 /*
1922 * because we don't have task_lock(), "p" can exit while 1952 * Because we don't have task_lock(), "p" can exit.
1923 * we're here. In that case, "mem" can point to root 1953 * In that case, "mem" can point to root or p can be NULL with
1924 * cgroup but never be NULL. (and task_struct itself is freed 1954 * race with swapoff. Then, we have small risk of mis-accounting.
1925 * by RCU, cgroup itself is RCU safe.) Then, we have small 1955 * But such kind of mis-account by race always happens because
1926 * risk here to get wrong cgroup. But such kind of mis-account 1956 * we don't have cgroup_mutex(). It's overkill and we allow that
1927 * by race always happens because we don't have cgroup_mutex(). 1957 * small race, here.
1928 * It's overkill and we allow that small race, here. 1958 * (*) swapoff et al. will charge against mm-struct not against
1959 * task-struct. So, mm->owner can be NULL.
1929 */ 1960 */
1930 mem = mem_cgroup_from_task(p); 1961 mem = mem_cgroup_from_task(p);
1931 VM_BUG_ON(!mem); 1962 if (!mem || mem_cgroup_is_root(mem)) {
1932 if (mem_cgroup_is_root(mem)) {
1933 rcu_read_unlock(); 1963 rcu_read_unlock();
1934 goto done; 1964 goto done;
1935 } 1965 }
1936 if (consume_stock(mem)) { 1966 if (page_size == PAGE_SIZE && consume_stock(mem)) {
1937 /* 1967 /*
1938 * It seems dagerous to access memcg without css_get(). 1968 * It seems dagerous to access memcg without css_get().
1939 * But considering how consume_stok works, it's not 1969 * But considering how consume_stok works, it's not
@@ -1974,7 +2004,7 @@ again:
1974 case CHARGE_OK: 2004 case CHARGE_OK:
1975 break; 2005 break;
1976 case CHARGE_RETRY: /* not in OOM situation but retry */ 2006 case CHARGE_RETRY: /* not in OOM situation but retry */
1977 csize = PAGE_SIZE; 2007 csize = page_size;
1978 css_put(&mem->css); 2008 css_put(&mem->css);
1979 mem = NULL; 2009 mem = NULL;
1980 goto again; 2010 goto again;
@@ -1995,8 +2025,8 @@ again:
1995 } 2025 }
1996 } while (ret != CHARGE_OK); 2026 } while (ret != CHARGE_OK);
1997 2027
1998 if (csize > PAGE_SIZE) 2028 if (csize > page_size)
1999 refill_stock(mem, csize - PAGE_SIZE); 2029 refill_stock(mem, csize - page_size);
2000 css_put(&mem->css); 2030 css_put(&mem->css);
2001done: 2031done:
2002 *memcg = mem; 2032 *memcg = mem;
@@ -2024,9 +2054,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2024 } 2054 }
2025} 2055}
2026 2056
2027static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) 2057static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2058 int page_size)
2028{ 2059{
2029 __mem_cgroup_cancel_charge(mem, 1); 2060 __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
2030} 2061}
2031 2062
2032/* 2063/*
@@ -2076,15 +2107,13 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2076 return mem; 2107 return mem;
2077} 2108}
2078 2109
2079/*
2080 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
2081 * USED state. If already USED, uncharge and return.
2082 */
2083
2084static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2110static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2085 struct page_cgroup *pc, 2111 struct page_cgroup *pc,
2086 enum charge_type ctype) 2112 enum charge_type ctype,
2113 int page_size)
2087{ 2114{
2115 int nr_pages = page_size >> PAGE_SHIFT;
2116
2088 /* try_charge() can return NULL to *memcg, taking care of it. */ 2117 /* try_charge() can return NULL to *memcg, taking care of it. */
2089 if (!mem) 2118 if (!mem)
2090 return; 2119 return;
@@ -2092,10 +2121,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2092 lock_page_cgroup(pc); 2121 lock_page_cgroup(pc);
2093 if (unlikely(PageCgroupUsed(pc))) { 2122 if (unlikely(PageCgroupUsed(pc))) {
2094 unlock_page_cgroup(pc); 2123 unlock_page_cgroup(pc);
2095 mem_cgroup_cancel_charge(mem); 2124 mem_cgroup_cancel_charge(mem, page_size);
2096 return; 2125 return;
2097 } 2126 }
2098 2127 /*
 2128 * we don't need page_cgroup_lock for tail pages, because they are not
2129 * accessed by any other context at this point.
2130 */
2099 pc->mem_cgroup = mem; 2131 pc->mem_cgroup = mem;
2100 /* 2132 /*
2101 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2133 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2119,8 +2151,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2119 break; 2151 break;
2120 } 2152 }
2121 2153
2122 mem_cgroup_charge_statistics(mem, pc, true); 2154 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
2123
2124 unlock_page_cgroup(pc); 2155 unlock_page_cgroup(pc);
2125 /* 2156 /*
2126 * "charge_statistics" updated event counter. Then, check it. 2157 * "charge_statistics" updated event counter. Then, check it.
@@ -2130,6 +2161,48 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2130 memcg_check_events(mem, pc->page); 2161 memcg_check_events(mem, pc->page);
2131} 2162}
2132 2163
2164#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2165
2166#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2167 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
2168/*
2169 * Because tail pages are not marked as "used", set it. We're under
2170 * zone->lru_lock, 'splitting on pmd' and compound_lock.
2171 */
2172void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2173{
2174 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2175 struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
2176 unsigned long flags;
2177
2178 if (mem_cgroup_disabled())
2179 return;
2180 /*
2181 * We have no races with charge/uncharge but will have races with
2182 * page state accounting.
2183 */
2184 move_lock_page_cgroup(head_pc, &flags);
2185
2186 tail_pc->mem_cgroup = head_pc->mem_cgroup;
2187 smp_wmb(); /* see __commit_charge() */
2188 if (PageCgroupAcctLRU(head_pc)) {
2189 enum lru_list lru;
2190 struct mem_cgroup_per_zone *mz;
2191
2192 /*
2193 * LRU flags cannot be copied because we need to add the tail
2194 * page to the LRU by the generic call, and our hook will be called.
2195 * We hold lru_lock, then, reduce counter directly.
2196 */
2197 lru = page_lru(head);
2198 mz = page_cgroup_zoneinfo(head_pc);
2199 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2200 }
2201 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2202 move_unlock_page_cgroup(head_pc, &flags);
2203}
2204#endif
2205
2133/** 2206/**
2134 * __mem_cgroup_move_account - move account of the page 2207 * __mem_cgroup_move_account - move account of the page
2135 * @pc: page_cgroup of the page. 2208 * @pc: page_cgroup of the page.
@@ -2148,11 +2221,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2148 */ 2221 */
2149 2222
2150static void __mem_cgroup_move_account(struct page_cgroup *pc, 2223static void __mem_cgroup_move_account(struct page_cgroup *pc,
2151 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 2224 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge,
2225 int charge_size)
2152{ 2226{
2227 int nr_pages = charge_size >> PAGE_SHIFT;
2228
2153 VM_BUG_ON(from == to); 2229 VM_BUG_ON(from == to);
2154 VM_BUG_ON(PageLRU(pc->page)); 2230 VM_BUG_ON(PageLRU(pc->page));
2155 VM_BUG_ON(!PageCgroupLocked(pc)); 2231 VM_BUG_ON(!page_is_cgroup_locked(pc));
2156 VM_BUG_ON(!PageCgroupUsed(pc)); 2232 VM_BUG_ON(!PageCgroupUsed(pc));
2157 VM_BUG_ON(pc->mem_cgroup != from); 2233 VM_BUG_ON(pc->mem_cgroup != from);
2158 2234
@@ -2163,14 +2239,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
2163 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2239 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2164 preempt_enable(); 2240 preempt_enable();
2165 } 2241 }
2166 mem_cgroup_charge_statistics(from, pc, false); 2242 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
2167 if (uncharge) 2243 if (uncharge)
2168 /* This is not "cancel", but cancel_charge does all we need. */ 2244 /* This is not "cancel", but cancel_charge does all we need. */
2169 mem_cgroup_cancel_charge(from); 2245 mem_cgroup_cancel_charge(from, charge_size);
2170 2246
2171 /* caller should have done css_get */ 2247 /* caller should have done css_get */
2172 pc->mem_cgroup = to; 2248 pc->mem_cgroup = to;
2173 mem_cgroup_charge_statistics(to, pc, true); 2249 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
2174 /* 2250 /*
2175 * We charges against "to" which may not have any tasks. Then, "to" 2251 * We charges against "to" which may not have any tasks. Then, "to"
2176 * can be under rmdir(). But in current implementation, caller of 2252 * can be under rmdir(). But in current implementation, caller of
@@ -2185,12 +2261,25 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
2185 * __mem_cgroup_move_account() 2261 * __mem_cgroup_move_account()
2186 */ 2262 */
2187static int mem_cgroup_move_account(struct page_cgroup *pc, 2263static int mem_cgroup_move_account(struct page_cgroup *pc,
2188 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 2264 struct mem_cgroup *from, struct mem_cgroup *to,
2265 bool uncharge, int charge_size)
2189{ 2266{
2190 int ret = -EINVAL; 2267 int ret = -EINVAL;
2268 unsigned long flags;
2269 /*
2270 * The page is isolated from LRU. So, collapse function
2271 * will not handle this page. But page splitting can happen.
2272 * Do this check under compound_page_lock(). The caller should
2273 * hold it.
2274 */
2275 if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page))
2276 return -EBUSY;
2277
2191 lock_page_cgroup(pc); 2278 lock_page_cgroup(pc);
2192 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { 2279 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
2193 __mem_cgroup_move_account(pc, from, to, uncharge); 2280 move_lock_page_cgroup(pc, &flags);
2281 __mem_cgroup_move_account(pc, from, to, uncharge, charge_size);
2282 move_unlock_page_cgroup(pc, &flags);
2194 ret = 0; 2283 ret = 0;
2195 } 2284 }
2196 unlock_page_cgroup(pc); 2285 unlock_page_cgroup(pc);
@@ -2214,6 +2303,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
2214 struct cgroup *cg = child->css.cgroup; 2303 struct cgroup *cg = child->css.cgroup;
2215 struct cgroup *pcg = cg->parent; 2304 struct cgroup *pcg = cg->parent;
2216 struct mem_cgroup *parent; 2305 struct mem_cgroup *parent;
2306 int page_size = PAGE_SIZE;
2307 unsigned long flags;
2217 int ret; 2308 int ret;
2218 2309
2219 /* Is ROOT ? */ 2310 /* Is ROOT ? */
@@ -2226,14 +2317,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
2226 if (isolate_lru_page(page)) 2317 if (isolate_lru_page(page))
2227 goto put; 2318 goto put;
2228 2319
2320 if (PageTransHuge(page))
2321 page_size = HPAGE_SIZE;
2322
2229 parent = mem_cgroup_from_cont(pcg); 2323 parent = mem_cgroup_from_cont(pcg);
2230 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 2324 ret = __mem_cgroup_try_charge(NULL, gfp_mask,
2325 &parent, false, page_size);
2231 if (ret || !parent) 2326 if (ret || !parent)
2232 goto put_back; 2327 goto put_back;
2233 2328
2234 ret = mem_cgroup_move_account(pc, child, parent, true); 2329 if (page_size > PAGE_SIZE)
2330 flags = compound_lock_irqsave(page);
2331
2332 ret = mem_cgroup_move_account(pc, child, parent, true, page_size);
2235 if (ret) 2333 if (ret)
2236 mem_cgroup_cancel_charge(parent); 2334 mem_cgroup_cancel_charge(parent, page_size);
2335
2336 if (page_size > PAGE_SIZE)
2337 compound_unlock_irqrestore(page, flags);
2237put_back: 2338put_back:
2238 putback_lru_page(page); 2339 putback_lru_page(page);
2239put: 2340put:
@@ -2252,20 +2353,32 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2252 gfp_t gfp_mask, enum charge_type ctype) 2353 gfp_t gfp_mask, enum charge_type ctype)
2253{ 2354{
2254 struct mem_cgroup *mem = NULL; 2355 struct mem_cgroup *mem = NULL;
2356 int page_size = PAGE_SIZE;
2255 struct page_cgroup *pc; 2357 struct page_cgroup *pc;
2358 bool oom = true;
2256 int ret; 2359 int ret;
2257 2360
2361 if (PageTransHuge(page)) {
2362 page_size <<= compound_order(page);
2363 VM_BUG_ON(!PageTransHuge(page));
2364 /*
2365 * Never OOM-kill a process for a huge page. The
2366 * fault handler will fall back to regular pages.
2367 */
2368 oom = false;
2369 }
2370
2258 pc = lookup_page_cgroup(page); 2371 pc = lookup_page_cgroup(page);
2259 /* can happen at boot */ 2372 /* can happen at boot */
2260 if (unlikely(!pc)) 2373 if (unlikely(!pc))
2261 return 0; 2374 return 0;
2262 prefetchw(pc); 2375 prefetchw(pc);
2263 2376
2264 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 2377 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size);
2265 if (ret || !mem) 2378 if (ret || !mem)
2266 return ret; 2379 return ret;
2267 2380
2268 __mem_cgroup_commit_charge(mem, pc, ctype); 2381 __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
2269 return 0; 2382 return 0;
2270} 2383}
2271 2384
@@ -2274,8 +2387,6 @@ int mem_cgroup_newpage_charge(struct page *page,
2274{ 2387{
2275 if (mem_cgroup_disabled()) 2388 if (mem_cgroup_disabled())
2276 return 0; 2389 return 0;
2277 if (PageCompound(page))
2278 return 0;
2279 /* 2390 /*
2280 * If already mapped, we don't have to account. 2391 * If already mapped, we don't have to account.
2281 * If page cache, page->mapping has address_space. 2392 * If page cache, page->mapping has address_space.
@@ -2381,13 +2492,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2381 if (!mem) 2492 if (!mem)
2382 goto charge_cur_mm; 2493 goto charge_cur_mm;
2383 *ptr = mem; 2494 *ptr = mem;
2384 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 2495 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
2385 css_put(&mem->css); 2496 css_put(&mem->css);
2386 return ret; 2497 return ret;
2387charge_cur_mm: 2498charge_cur_mm:
2388 if (unlikely(!mm)) 2499 if (unlikely(!mm))
2389 mm = &init_mm; 2500 mm = &init_mm;
2390 return __mem_cgroup_try_charge(mm, mask, ptr, true); 2501 return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
2391} 2502}
2392 2503
2393static void 2504static void
@@ -2403,7 +2514,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2403 cgroup_exclude_rmdir(&ptr->css); 2514 cgroup_exclude_rmdir(&ptr->css);
2404 pc = lookup_page_cgroup(page); 2515 pc = lookup_page_cgroup(page);
2405 mem_cgroup_lru_del_before_commit_swapcache(page); 2516 mem_cgroup_lru_del_before_commit_swapcache(page);
2406 __mem_cgroup_commit_charge(ptr, pc, ctype); 2517 __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
2407 mem_cgroup_lru_add_after_commit_swapcache(page); 2518 mem_cgroup_lru_add_after_commit_swapcache(page);
2408 /* 2519 /*
2409 * Now swap is on-memory. This means this page may be 2520 * Now swap is on-memory. This means this page may be
@@ -2452,11 +2563,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2452 return; 2563 return;
2453 if (!mem) 2564 if (!mem)
2454 return; 2565 return;
2455 mem_cgroup_cancel_charge(mem); 2566 mem_cgroup_cancel_charge(mem, PAGE_SIZE);
2456} 2567}
2457 2568
2458static void 2569static void
2459__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) 2570__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
2571 int page_size)
2460{ 2572{
2461 struct memcg_batch_info *batch = NULL; 2573 struct memcg_batch_info *batch = NULL;
2462 bool uncharge_memsw = true; 2574 bool uncharge_memsw = true;
@@ -2483,6 +2595,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2483 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2595 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2484 goto direct_uncharge; 2596 goto direct_uncharge;
2485 2597
2598 if (page_size != PAGE_SIZE)
2599 goto direct_uncharge;
2600
2486 /* 2601 /*
2487 * In typical case, batch->memcg == mem. This means we can 2602 * In typical case, batch->memcg == mem. This means we can
2488 * merge a series of uncharges to an uncharge of res_counter. 2603 * merge a series of uncharges to an uncharge of res_counter.
@@ -2496,9 +2611,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2496 batch->memsw_bytes += PAGE_SIZE; 2611 batch->memsw_bytes += PAGE_SIZE;
2497 return; 2612 return;
2498direct_uncharge: 2613direct_uncharge:
2499 res_counter_uncharge(&mem->res, PAGE_SIZE); 2614 res_counter_uncharge(&mem->res, page_size);
2500 if (uncharge_memsw) 2615 if (uncharge_memsw)
2501 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 2616 res_counter_uncharge(&mem->memsw, page_size);
2502 if (unlikely(batch->memcg != mem)) 2617 if (unlikely(batch->memcg != mem))
2503 memcg_oom_recover(mem); 2618 memcg_oom_recover(mem);
2504 return; 2619 return;
@@ -2510,8 +2625,10 @@ direct_uncharge:
2510static struct mem_cgroup * 2625static struct mem_cgroup *
2511__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2626__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2512{ 2627{
2628 int count;
2513 struct page_cgroup *pc; 2629 struct page_cgroup *pc;
2514 struct mem_cgroup *mem = NULL; 2630 struct mem_cgroup *mem = NULL;
2631 int page_size = PAGE_SIZE;
2515 2632
2516 if (mem_cgroup_disabled()) 2633 if (mem_cgroup_disabled())
2517 return NULL; 2634 return NULL;
@@ -2519,6 +2636,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2519 if (PageSwapCache(page)) 2636 if (PageSwapCache(page))
2520 return NULL; 2637 return NULL;
2521 2638
2639 if (PageTransHuge(page)) {
2640 page_size <<= compound_order(page);
2641 VM_BUG_ON(!PageTransHuge(page));
2642 }
2643
2644 count = page_size >> PAGE_SHIFT;
2522 /* 2645 /*
2523 * Check if our page_cgroup is valid 2646 * Check if our page_cgroup is valid
2524 */ 2647 */
@@ -2551,7 +2674,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2551 break; 2674 break;
2552 } 2675 }
2553 2676
2554 mem_cgroup_charge_statistics(mem, pc, false); 2677 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count);
2555 2678
2556 ClearPageCgroupUsed(pc); 2679 ClearPageCgroupUsed(pc);
2557 /* 2680 /*
@@ -2572,7 +2695,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2572 mem_cgroup_get(mem); 2695 mem_cgroup_get(mem);
2573 } 2696 }
2574 if (!mem_cgroup_is_root(mem)) 2697 if (!mem_cgroup_is_root(mem))
2575 __do_uncharge(mem, ctype); 2698 __do_uncharge(mem, ctype, page_size);
2576 2699
2577 return mem; 2700 return mem;
2578 2701
@@ -2767,6 +2890,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2767 enum charge_type ctype; 2890 enum charge_type ctype;
2768 int ret = 0; 2891 int ret = 0;
2769 2892
2893 VM_BUG_ON(PageTransHuge(page));
2770 if (mem_cgroup_disabled()) 2894 if (mem_cgroup_disabled())
2771 return 0; 2895 return 0;
2772 2896
@@ -2816,7 +2940,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2816 return 0; 2940 return 0;
2817 2941
2818 *ptr = mem; 2942 *ptr = mem;
2819 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); 2943 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
2820 css_put(&mem->css);/* drop extra refcnt */ 2944 css_put(&mem->css);/* drop extra refcnt */
2821 if (ret || *ptr == NULL) { 2945 if (ret || *ptr == NULL) {
2822 if (PageAnon(page)) { 2946 if (PageAnon(page)) {
@@ -2843,13 +2967,13 @@ int mem_cgroup_prepare_migration(struct page *page,
2843 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 2967 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2844 else 2968 else
2845 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2969 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2846 __mem_cgroup_commit_charge(mem, pc, ctype); 2970 __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
2847 return ret; 2971 return ret;
2848} 2972}
2849 2973
2850/* remove redundant charge if migration failed*/ 2974/* remove redundant charge if migration failed*/
2851void mem_cgroup_end_migration(struct mem_cgroup *mem, 2975void mem_cgroup_end_migration(struct mem_cgroup *mem,
2852 struct page *oldpage, struct page *newpage) 2976 struct page *oldpage, struct page *newpage, bool migration_ok)
2853{ 2977{
2854 struct page *used, *unused; 2978 struct page *used, *unused;
2855 struct page_cgroup *pc; 2979 struct page_cgroup *pc;
@@ -2858,8 +2982,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
2858 return; 2982 return;
2859 /* blocks rmdir() */ 2983 /* blocks rmdir() */
2860 cgroup_exclude_rmdir(&mem->css); 2984 cgroup_exclude_rmdir(&mem->css);
2861 /* at migration success, oldpage->mapping is NULL. */ 2985 if (!migration_ok) {
2862 if (oldpage->mapping) {
2863 used = oldpage; 2986 used = oldpage;
2864 unused = newpage; 2987 unused = newpage;
2865 } else { 2988 } else {
@@ -4169,13 +4292,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4169 */ 4292 */
4170 if (!node_state(node, N_NORMAL_MEMORY)) 4293 if (!node_state(node, N_NORMAL_MEMORY))
4171 tmp = -1; 4294 tmp = -1;
4172 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4295 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4173 if (!pn) 4296 if (!pn)
4174 return 1; 4297 return 1;
4175 4298
4176 mem->info.nodeinfo[node] = pn; 4299 mem->info.nodeinfo[node] = pn;
4177 memset(pn, 0, sizeof(*pn));
4178
4179 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4300 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4180 mz = &pn->zoneinfo[zone]; 4301 mz = &pn->zoneinfo[zone];
4181 for_each_lru(l) 4302 for_each_lru(l)
@@ -4199,14 +4320,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
4199 4320
4200 /* Can be very big if MAX_NUMNODES is very big */ 4321 /* Can be very big if MAX_NUMNODES is very big */
4201 if (size < PAGE_SIZE) 4322 if (size < PAGE_SIZE)
4202 mem = kmalloc(size, GFP_KERNEL); 4323 mem = kzalloc(size, GFP_KERNEL);
4203 else 4324 else
4204 mem = vmalloc(size); 4325 mem = vzalloc(size);
4205 4326
4206 if (!mem) 4327 if (!mem)
4207 return NULL; 4328 return NULL;
4208 4329
4209 memset(mem, 0, size);
4210 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4330 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4211 if (!mem->stat) 4331 if (!mem->stat)
4212 goto out_free; 4332 goto out_free;
@@ -4454,7 +4574,8 @@ one_by_one:
4454 batch_count = PRECHARGE_COUNT_AT_ONCE; 4574 batch_count = PRECHARGE_COUNT_AT_ONCE;
4455 cond_resched(); 4575 cond_resched();
4456 } 4576 }
4457 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 4577 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
4578 PAGE_SIZE);
4458 if (ret || !mem) 4579 if (ret || !mem)
4459 /* mem_cgroup_clear_mc() will do uncharge later */ 4580 /* mem_cgroup_clear_mc() will do uncharge later */
4460 return -ENOMEM; 4581 return -ENOMEM;
@@ -4616,6 +4737,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4616 pte_t *pte; 4737 pte_t *pte;
4617 spinlock_t *ptl; 4738 spinlock_t *ptl;
4618 4739
4740 VM_BUG_ON(pmd_trans_huge(*pmd));
4619 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4741 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4620 for (; addr != end; pte++, addr += PAGE_SIZE) 4742 for (; addr != end; pte++, addr += PAGE_SIZE)
4621 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 4743 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4653,10 +4775,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4653 4775
4654static int mem_cgroup_precharge_mc(struct mm_struct *mm) 4776static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4655{ 4777{
4656 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); 4778 unsigned long precharge = mem_cgroup_count_precharge(mm);
4779
4780 VM_BUG_ON(mc.moving_task);
4781 mc.moving_task = current;
4782 return mem_cgroup_do_precharge(precharge);
4657} 4783}
4658 4784
4659static void mem_cgroup_clear_mc(void) 4785/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
4786static void __mem_cgroup_clear_mc(void)
4660{ 4787{
4661 struct mem_cgroup *from = mc.from; 4788 struct mem_cgroup *from = mc.from;
4662 struct mem_cgroup *to = mc.to; 4789 struct mem_cgroup *to = mc.to;
@@ -4691,18 +4818,28 @@ static void mem_cgroup_clear_mc(void)
4691 PAGE_SIZE * mc.moved_swap); 4818 PAGE_SIZE * mc.moved_swap);
4692 } 4819 }
4693 /* we've already done mem_cgroup_get(mc.to) */ 4820 /* we've already done mem_cgroup_get(mc.to) */
4694
4695 mc.moved_swap = 0; 4821 mc.moved_swap = 0;
4696 } 4822 }
4823 memcg_oom_recover(from);
4824 memcg_oom_recover(to);
4825 wake_up_all(&mc.waitq);
4826}
4827
4828static void mem_cgroup_clear_mc(void)
4829{
4830 struct mem_cgroup *from = mc.from;
4831
4832 /*
4833 * we must clear moving_task before waking up waiters at the end of
4834 * task migration.
4835 */
4836 mc.moving_task = NULL;
4837 __mem_cgroup_clear_mc();
4697 spin_lock(&mc.lock); 4838 spin_lock(&mc.lock);
4698 mc.from = NULL; 4839 mc.from = NULL;
4699 mc.to = NULL; 4840 mc.to = NULL;
4700 mc.moving_task = NULL;
4701 spin_unlock(&mc.lock); 4841 spin_unlock(&mc.lock);
4702 mem_cgroup_end_move(from); 4842 mem_cgroup_end_move(from);
4703 memcg_oom_recover(from);
4704 memcg_oom_recover(to);
4705 wake_up_all(&mc.waitq);
4706} 4843}
4707 4844
4708static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 4845static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4729,16 +4866,12 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4729 VM_BUG_ON(mc.precharge); 4866 VM_BUG_ON(mc.precharge);
4730 VM_BUG_ON(mc.moved_charge); 4867 VM_BUG_ON(mc.moved_charge);
4731 VM_BUG_ON(mc.moved_swap); 4868 VM_BUG_ON(mc.moved_swap);
4732 VM_BUG_ON(mc.moving_task);
4733 mem_cgroup_start_move(from); 4869 mem_cgroup_start_move(from);
4734 spin_lock(&mc.lock); 4870 spin_lock(&mc.lock);
4735 mc.from = from; 4871 mc.from = from;
4736 mc.to = mem; 4872 mc.to = mem;
4737 mc.precharge = 0;
4738 mc.moved_charge = 0;
4739 mc.moved_swap = 0;
4740 mc.moving_task = current;
4741 spin_unlock(&mc.lock); 4873 spin_unlock(&mc.lock);
4874 /* We set mc.moving_task later */
4742 4875
4743 ret = mem_cgroup_precharge_mc(mm); 4876 ret = mem_cgroup_precharge_mc(mm);
4744 if (ret) 4877 if (ret)
@@ -4767,6 +4900,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4767 spinlock_t *ptl; 4900 spinlock_t *ptl;
4768 4901
4769retry: 4902retry:
4903 VM_BUG_ON(pmd_trans_huge(*pmd));
4770 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4904 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4771 for (; addr != end; addr += PAGE_SIZE) { 4905 for (; addr != end; addr += PAGE_SIZE) {
4772 pte_t ptent = *(pte++); 4906 pte_t ptent = *(pte++);
@@ -4787,7 +4921,7 @@ retry:
4787 goto put; 4921 goto put;
4788 pc = lookup_page_cgroup(page); 4922 pc = lookup_page_cgroup(page);
4789 if (!mem_cgroup_move_account(pc, 4923 if (!mem_cgroup_move_account(pc,
4790 mc.from, mc.to, false)) { 4924 mc.from, mc.to, false, PAGE_SIZE)) {
4791 mc.precharge--; 4925 mc.precharge--;
4792 /* we uncharge from mc.from later. */ 4926 /* we uncharge from mc.from later. */
4793 mc.moved_charge++; 4927 mc.moved_charge++;
@@ -4832,7 +4966,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4832 struct vm_area_struct *vma; 4966 struct vm_area_struct *vma;
4833 4967
4834 lru_add_drain_all(); 4968 lru_add_drain_all();
4835 down_read(&mm->mmap_sem); 4969retry:
4970 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
4971 /*
4972 * Someone who is holding the mmap_sem might be waiting in
4973 * waitq. So we cancel all extra charges, wake up all waiters,
4974 * and retry. Because we cancel precharges, we might not be able
4975 * to move enough charges, but moving charge is a best-effort
4976 * feature anyway, so it wouldn't be a big problem.
4977 */
4978 __mem_cgroup_clear_mc();
4979 cond_resched();
4980 goto retry;
4981 }
4836 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4982 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4837 int ret; 4983 int ret;
4838 struct mm_walk mem_cgroup_move_charge_walk = { 4984 struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4911,10 +5057,21 @@ struct cgroup_subsys mem_cgroup_subsys = {
4911}; 5057};
4912 5058
4913#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5059#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
5060static int __init enable_swap_account(char *s)
5061{
5062 /* consider enabled if no parameter or 1 is given */
5063 if (!(*s) || !strcmp(s, "=1"))
5064 really_do_swap_account = 1;
5065 else if (!strcmp(s, "=0"))
5066 really_do_swap_account = 0;
5067 return 1;
5068}
5069__setup("swapaccount", enable_swap_account);
4914 5070
4915static int __init disable_swap_account(char *s) 5071static int __init disable_swap_account(char *s)
4916{ 5072{
4917 really_do_swap_account = 0; 5073 printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n");
5074 enable_swap_account("=0");
4918 return 1; 5075 return 1;
4919} 5076}
4920__setup("noswapaccount", disable_swap_account); 5077__setup("noswapaccount", disable_swap_account);
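
The final memcontrol.c hunk introduces a swapaccount= boot parameter and keeps noswapaccount as a deprecated alias, while the CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED hunk earlier in the file selects the built-in default. Going by the parser added above, the kernel command line accepts the following (examples, assuming a kernel built with CONFIG_CGROUP_MEM_RES_CTLR_SWAP):

	swapaccount=1   enable memsw accounting (a bare "swapaccount" does the same)
	swapaccount=0   disable memsw accounting at boot
	noswapaccount   still works, but prints the deprecation warning once
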
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 124324134ff6..0207c2f6f8bd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -51,6 +51,7 @@
51#include <linux/slab.h> 51#include <linux/slab.h>
52#include <linux/swapops.h> 52#include <linux/swapops.h>
53#include <linux/hugetlb.h> 53#include <linux/hugetlb.h>
54#include <linux/memory_hotplug.h>
54#include "internal.h" 55#include "internal.h"
55 56
56int sysctl_memory_failure_early_kill __read_mostly = 0; 57int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -202,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
202#ifdef __ARCH_SI_TRAPNO 203#ifdef __ARCH_SI_TRAPNO
203 si.si_trapno = trapno; 204 si.si_trapno = trapno;
204#endif 205#endif
205 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; 206 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
206 /* 207 /*
207 * Don't use force here, it's convenient if the signal 208 * Don't use force here, it's convenient if the signal
208 * can be temporarily blocked. 209 * can be temporarily blocked.
@@ -232,8 +233,8 @@ void shake_page(struct page *p, int access)
232 } 233 }
233 234
234 /* 235 /*
235 * Only all shrink_slab here (which would also 236 * Only call shrink_slab here (which would also shrink other caches) if
236 * shrink other caches) if access is not potentially fatal. 237 * access is not potentially fatal.
237 */ 238 */
238 if (access) { 239 if (access) {
239 int nr; 240 int nr;
@@ -853,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
853 int ret; 854 int ret;
854 int kill = 1; 855 int kill = 1;
855 struct page *hpage = compound_head(p); 856 struct page *hpage = compound_head(p);
857 struct page *ppage;
856 858
857 if (PageReserved(p) || PageSlab(p)) 859 if (PageReserved(p) || PageSlab(p))
858 return SWAP_SUCCESS; 860 return SWAP_SUCCESS;
@@ -894,6 +896,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
894 } 896 }
895 897
896 /* 898 /*
899 * ppage: poisoned page
 900 * if p is a regular (4k) page,
 901 * ppage == the real poisoned page;
 902 * else p is hugetlb or THP, and ppage == the head page.
903 */
904 ppage = hpage;
905
906 if (PageTransHuge(hpage)) {
907 /*
908 * Verify that this isn't a hugetlbfs head page, the check for
909 * PageAnon is just for avoid tripping a split_huge_page
910 * internal debug check, as split_huge_page refuses to deal with
911 * anything that isn't an anon page. PageAnon can't go away fro
912 * under us because we hold a refcount on the hpage, without a
913 * refcount on the hpage. split_huge_page can't be safely called
914 * in the first place, having a refcount on the tail isn't
915 * enough * to be safe.
916 */
917 if (!PageHuge(hpage) && PageAnon(hpage)) {
918 if (unlikely(split_huge_page(hpage))) {
919 /*
 920 * FIXME: if splitting the THP fails, it is
 921 * better to stop the following operation rather
 922 * than cause a panic by unmapping. The system might
 923 * survive if the page is freed later.
924 */
925 printk(KERN_INFO
926 "MCE %#lx: failed to split THP\n", pfn);
927
928 BUG_ON(!PageHWPoison(p));
929 return SWAP_FAIL;
930 }
931 /* THP is split, so ppage should be the real poisoned page. */
932 ppage = p;
933 }
934 }
935
936 /*
897 * First collect all the processes that have the page 937 * First collect all the processes that have the page
898 * mapped in dirty form. This has to be done before try_to_unmap, 938 * mapped in dirty form. This has to be done before try_to_unmap,
899 * because ttu takes the rmap data structures down. 939 * because ttu takes the rmap data structures down.
@@ -902,12 +942,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
902 * there's nothing that can be done. 942 * there's nothing that can be done.
903 */ 943 */
904 if (kill) 944 if (kill)
905 collect_procs(hpage, &tokill); 945 collect_procs(ppage, &tokill);
906 946
907 ret = try_to_unmap(hpage, ttu); 947 if (hpage != ppage)
948 lock_page_nosync(ppage);
949
950 ret = try_to_unmap(ppage, ttu);
908 if (ret != SWAP_SUCCESS) 951 if (ret != SWAP_SUCCESS)
909 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 952 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
910 pfn, page_mapcount(hpage)); 953 pfn, page_mapcount(ppage));
954
955 if (hpage != ppage)
956 unlock_page(ppage);
911 957
912 /* 958 /*
913 * Now that the dirty bit has been propagated to the 959 * Now that the dirty bit has been propagated to the
@@ -918,7 +964,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
918 * use a more force-full uncatchable kill to prevent 964 * use a more force-full uncatchable kill to prevent
919 * any accesses to the poisoned memory. 965 * any accesses to the poisoned memory.
920 */ 966 */
921 kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, 967 kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
922 ret != SWAP_SUCCESS, p, pfn); 968 ret != SWAP_SUCCESS, p, pfn);
923 969
924 return ret; 970 return ret;
@@ -927,7 +973,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
927static void set_page_hwpoison_huge_page(struct page *hpage) 973static void set_page_hwpoison_huge_page(struct page *hpage)
928{ 974{
929 int i; 975 int i;
930 int nr_pages = 1 << compound_order(hpage); 976 int nr_pages = 1 << compound_trans_order(hpage);
931 for (i = 0; i < nr_pages; i++) 977 for (i = 0; i < nr_pages; i++)
932 SetPageHWPoison(hpage + i); 978 SetPageHWPoison(hpage + i);
933} 979}
@@ -935,7 +981,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
935static void clear_page_hwpoison_huge_page(struct page *hpage) 981static void clear_page_hwpoison_huge_page(struct page *hpage)
936{ 982{
937 int i; 983 int i;
938 int nr_pages = 1 << compound_order(hpage); 984 int nr_pages = 1 << compound_trans_order(hpage);
939 for (i = 0; i < nr_pages; i++) 985 for (i = 0; i < nr_pages; i++)
940 ClearPageHWPoison(hpage + i); 986 ClearPageHWPoison(hpage + i);
941} 987}
@@ -965,7 +1011,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
965 return 0; 1011 return 0;
966 } 1012 }
967 1013
968 nr_pages = 1 << compound_order(hpage); 1014 nr_pages = 1 << compound_trans_order(hpage);
969 atomic_long_add(nr_pages, &mce_bad_pages); 1015 atomic_long_add(nr_pages, &mce_bad_pages);
970 1016
971 /* 1017 /*
@@ -1019,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1019 * The check (unnecessarily) ignores LRU pages being isolated and 1065 * The check (unnecessarily) ignores LRU pages being isolated and
1020 * walked by the page reclaim code, however that's not a big loss. 1066 * walked by the page reclaim code, however that's not a big loss.
1021 */ 1067 */
1022 if (!PageLRU(p) && !PageHuge(p)) 1068 if (!PageHuge(p) && !PageTransCompound(p)) {
1023 shake_page(p, 0); 1069 if (!PageLRU(p))
1024 if (!PageLRU(p) && !PageHuge(p)) { 1070 shake_page(p, 0);
1025 /* 1071 if (!PageLRU(p)) {
1026 * shake_page could have turned it free. 1072 /*
1027 */ 1073 * shake_page could have turned it free.
1028 if (is_free_buddy_page(p)) { 1074 */
1029 action_result(pfn, "free buddy, 2nd try", DELAYED); 1075 if (is_free_buddy_page(p)) {
1030 return 0; 1076 action_result(pfn, "free buddy, 2nd try",
1077 DELAYED);
1078 return 0;
1079 }
1080 action_result(pfn, "non LRU", IGNORED);
1081 put_page(p);
1082 return -EBUSY;
1031 } 1083 }
1032 action_result(pfn, "non LRU", IGNORED);
1033 put_page(p);
1034 return -EBUSY;
1035 } 1084 }
1036 1085
1037 /* 1086 /*
@@ -1061,7 +1110,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1061 * For error on the tail page, we should set PG_hwpoison 1110 * For error on the tail page, we should set PG_hwpoison
1062 * on the head page to show that the hugepage is hwpoisoned 1111 * on the head page to show that the hugepage is hwpoisoned
1063 */ 1112 */
1064 if (PageTail(p) && TestSetPageHWPoison(hpage)) { 1113 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1065 action_result(pfn, "hugepage already hardware poisoned", 1114 action_result(pfn, "hugepage already hardware poisoned",
1066 IGNORED); 1115 IGNORED);
1067 unlock_page(hpage); 1116 unlock_page(hpage);
@@ -1163,7 +1212,7 @@ int unpoison_memory(unsigned long pfn)
1163 return 0; 1212 return 0;
1164 } 1213 }
1165 1214
1166 nr_pages = 1 << compound_order(page); 1215 nr_pages = 1 << compound_trans_order(page);
1167 1216
1168 if (!get_page_unless_zero(page)) { 1217 if (!get_page_unless_zero(page)) {
1169 /* 1218 /*
@@ -1230,11 +1279,10 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1230 return 1; 1279 return 1;
1231 1280
1232 /* 1281 /*
1233 * The lock_system_sleep prevents a race with memory hotplug, 1282 * The lock_memory_hotplug prevents a race with memory hotplug.
1234 * because the isolation assumes there's only a single user.
1235 * This is a big hammer, a better one would be nicer. 1283 * This is a big hammer, a better one would be nicer.
1236 */ 1284 */
1237 lock_system_sleep(); 1285 lock_memory_hotplug();
1238 1286
1239 /* 1287 /*
1240 * Isolate the page, so that it doesn't get reallocated if it 1288 * Isolate the page, so that it doesn't get reallocated if it
@@ -1264,7 +1312,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1264 ret = 1; 1312 ret = 1;
1265 } 1313 }
1266 unset_migratetype_isolate(p); 1314 unset_migratetype_isolate(p);
1267 unlock_system_sleep(); 1315 unlock_memory_hotplug();
1268 return ret; 1316 return ret;
1269} 1317}
1270 1318
@@ -1290,9 +1338,13 @@ static int soft_offline_huge_page(struct page *page, int flags)
1290 /* Keep page count to indicate a given hugepage is isolated. */ 1338 /* Keep page count to indicate a given hugepage is isolated. */
1291 1339
1292 list_add(&hpage->lru, &pagelist); 1340 list_add(&hpage->lru, &pagelist);
1293 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1341 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
1342 true);
1294 if (ret) { 1343 if (ret) {
1295 putback_lru_pages(&pagelist); 1344 struct page *page1, *page2;
1345 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1346 put_page(page1);
1347
1296 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1348 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1297 pfn, ret, page->flags); 1349 pfn, ret, page->flags);
1298 if (ret > 0) 1350 if (ret > 0)
@@ -1301,7 +1353,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1301 } 1353 }
1302done: 1354done:
1303 if (!PageHWPoison(hpage)) 1355 if (!PageHWPoison(hpage))
1304 atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); 1356 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
1305 set_page_hwpoison_huge_page(hpage); 1357 set_page_hwpoison_huge_page(hpage);
1306 dequeue_hwpoisoned_huge_page(hpage); 1358 dequeue_hwpoisoned_huge_page(hpage);
1307 /* keep elevated page count for bad page */ 1359 /* keep elevated page count for bad page */
@@ -1413,8 +1465,10 @@ int soft_offline_page(struct page *page, int flags)
1413 LIST_HEAD(pagelist); 1465 LIST_HEAD(pagelist);
1414 1466
1415 list_add(&page->lru, &pagelist); 1467 list_add(&page->lru, &pagelist);
1416 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1468 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1469 0, true);
1417 if (ret) { 1470 if (ret) {
1471 putback_lru_pages(&pagelist);
1418 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1472 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1419 pfn, ret, page->flags); 1473 pfn, ret, page->flags);
1420 if (ret > 0) 1474 if (ret > 0)
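The memory-failure changes above teach __memory_failure() and soft_offline_page() about THP: the compound page is split first, the poisoned subpage is re-targeted, and sizes come from compound_trans_order(). A minimal userspace sketch for poking the soft-offline path is madvise(MADV_SOFT_OFFLINE) on a touched anonymous page; this is only an illustration, it assumes CONFIG_MEMORY_FAILURE=y and CAP_SYS_ADMIN, and defines the MADV_ value locally in case the libc headers predate it.

/* Soft-offline one anonymous page; needs CONFIG_MEMORY_FAILURE and CAP_SYS_ADMIN. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* value from include/asm-generic/mman-common.h */
#endif

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xaa, psz);	/* make sure a real page backs the mapping */

	/* ask the kernel to migrate the data away and retire the old page */
	if (madvise(p, psz, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");
	else
		printf("soft-offlined; data still reads back as 0x%02x\n",
		       (unsigned)(unsigned char)p[0]);

	munmap(p, psz);
	return 0;
}

On success the contents have been migrated to a fresh page, which is why the final read still sees 0xaa while the old pfn is accounted in mce_bad_pages.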
diff --git a/mm/memory.c b/mm/memory.c
index 02e48aa0ed13..8e8c18324863 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
394 } 394 }
395} 395}
396 396
397int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 397int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
398 pmd_t *pmd, unsigned long address)
398{ 399{
399 pgtable_t new = pte_alloc_one(mm, address); 400 pgtable_t new = pte_alloc_one(mm, address);
401 int wait_split_huge_page;
400 if (!new) 402 if (!new)
401 return -ENOMEM; 403 return -ENOMEM;
402 404
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
416 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 418 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
417 419
418 spin_lock(&mm->page_table_lock); 420 spin_lock(&mm->page_table_lock);
419 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 421 wait_split_huge_page = 0;
422 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
420 mm->nr_ptes++; 423 mm->nr_ptes++;
421 pmd_populate(mm, pmd, new); 424 pmd_populate(mm, pmd, new);
422 new = NULL; 425 new = NULL;
423 } 426 } else if (unlikely(pmd_trans_splitting(*pmd)))
427 wait_split_huge_page = 1;
424 spin_unlock(&mm->page_table_lock); 428 spin_unlock(&mm->page_table_lock);
425 if (new) 429 if (new)
426 pte_free(mm, new); 430 pte_free(mm, new);
431 if (wait_split_huge_page)
432 wait_split_huge_page(vma->anon_vma, pmd);
427 return 0; 433 return 0;
428} 434}
429 435
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
436 smp_wmb(); /* See comment in __pte_alloc */ 442 smp_wmb(); /* See comment in __pte_alloc */
437 443
438 spin_lock(&init_mm.page_table_lock); 444 spin_lock(&init_mm.page_table_lock);
439 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 445 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
440 pmd_populate_kernel(&init_mm, pmd, new); 446 pmd_populate_kernel(&init_mm, pmd, new);
441 new = NULL; 447 new = NULL;
442 } 448 } else
449 VM_BUG_ON(pmd_trans_splitting(*pmd));
443 spin_unlock(&init_mm.page_table_lock); 450 spin_unlock(&init_mm.page_table_lock);
444 if (new) 451 if (new)
445 pte_free_kernel(&init_mm, new); 452 pte_free_kernel(&init_mm, new);
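Both __pte_alloc() and __pte_alloc_kernel() now re-check pmd_none() under the page-table lock and either publish the freshly allocated table or discard it (the user variant additionally waits for a splitting huge pmd). The sketch below is only a userspace analogue of that allocate-outside-the-lock, publish-under-the-lock pattern, with a pthread mutex standing in for mm->page_table_lock; it is not derived from the patch itself.

/* Userspace analogue of __pte_alloc(): allocate outside the lock,
 * publish under the lock only if still empty, free if we lost the race. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static int *shared_table;		/* stands in for *pmd */

static void *populate(void *arg)
{
	int *new = calloc(512, sizeof(int));	/* speculative allocation */

	if (!new)
		return NULL;

	pthread_mutex_lock(&table_lock);
	if (!shared_table) {			/* "pmd_none()" re-check */
		shared_table = new;
		new = NULL;			/* published, do not free */
	}
	pthread_mutex_unlock(&table_lock);

	free(new);				/* lost the race: discard */
	return NULL;
}

int main(void)
{
	pthread_t t[4];
	int i;

	for (i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, populate, NULL);
	for (i = 0; i < 4; i++)
		pthread_join(t[i], NULL);

	printf("table %s\n", shared_table ? "populated exactly once" : "missing");
	free(shared_table);
	return 0;
}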
@@ -719,9 +726,9 @@ out_set_pte:
719 return 0; 726 return 0;
720} 727}
721 728
722static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 729int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
723 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 730 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
724 unsigned long addr, unsigned long end) 731 unsigned long addr, unsigned long end)
725{ 732{
726 pte_t *orig_src_pte, *orig_dst_pte; 733 pte_t *orig_src_pte, *orig_dst_pte;
727 pte_t *src_pte, *dst_pte; 734 pte_t *src_pte, *dst_pte;
@@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
795 src_pmd = pmd_offset(src_pud, addr); 802 src_pmd = pmd_offset(src_pud, addr);
796 do { 803 do {
797 next = pmd_addr_end(addr, end); 804 next = pmd_addr_end(addr, end);
805 if (pmd_trans_huge(*src_pmd)) {
806 int err;
807 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
808 err = copy_huge_pmd(dst_mm, src_mm,
809 dst_pmd, src_pmd, addr, vma);
810 if (err == -ENOMEM)
811 return -ENOMEM;
812 if (!err)
813 continue;
814 /* fall through */
815 }
798 if (pmd_none_or_clear_bad(src_pmd)) 816 if (pmd_none_or_clear_bad(src_pmd))
799 continue; 817 continue;
800 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, 818 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
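copy_pmd_range() now forwards a huge pmd to copy_huge_pmd(), so fork() shares THP-backed anonymous memory copy-on-write at pmd granularity instead of splitting it. A hedged userspace way to exercise that path (assuming CONFIG_TRANSPARENT_HUGEPAGE and an "always" or "madvise" policy) is to fault in a few megabytes, fork, and let the child write; the child's store is the case handle_mm_fault() below routes to do_huge_pmd_wp_page().

/* fork() over a (hopefully) THP-backed area: the huge pmd is shared COW
 * by copy_huge_pmd(); the child's write below breaks that sharing. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14
#endif

#define SZ (4UL << 20)		/* 4 MB: room for at least one 2 MB THP */

int main(void)
{
	char *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	pid_t pid;

	if (p == MAP_FAILED)
		return 1;
	madvise(p, SZ, MADV_HUGEPAGE);	/* hint only; ignore failure */
	memset(p, 'P', SZ);		/* fault the region in before fork */

	pid = fork();
	if (pid == 0) {
		p[0] = 'C';		/* COW write fault in the child */
		_exit(0);
	}
	waitpid(pid, NULL, 0);
	printf("parent still sees '%c' (expected 'P')\n", p[0]);
	return 0;
}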
@@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
997 pmd = pmd_offset(pud, addr); 1015 pmd = pmd_offset(pud, addr);
998 do { 1016 do {
999 next = pmd_addr_end(addr, end); 1017 next = pmd_addr_end(addr, end);
1018 if (pmd_trans_huge(*pmd)) {
1019 if (next-addr != HPAGE_PMD_SIZE) {
1020 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1021 split_huge_page_pmd(vma->vm_mm, pmd);
1022 } else if (zap_huge_pmd(tlb, vma, pmd)) {
1023 (*zap_work)--;
1024 continue;
1025 }
1026 /* fall through */
1027 }
1000 if (pmd_none_or_clear_bad(pmd)) { 1028 if (pmd_none_or_clear_bad(pmd)) {
1001 (*zap_work)--; 1029 (*zap_work)--;
1002 continue; 1030 continue;
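In zap_pmd_range() a huge pmd is zapped in one shot only when the range covers the whole HPAGE_PMD_SIZE; a partial range goes through split_huge_page_pmd() first. From userspace that corresponds to madvise(MADV_DONTNEED) (or munmap) over a sub-range of a THP-backed mapping, as in this hedged sketch (same THP assumptions as above).

/* Discarding 4 KB out of a THP-backed range: the kernel has to split the
 * huge pmd before zap_pte_range() can drop a single pte. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14
#endif

#define SZ (4UL << 20)

int main(void)
{
	char *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	madvise(p, SZ, MADV_HUGEPAGE);
	memset(p, 1, SZ);

	/* covers less than HPAGE_PMD_SIZE: split first, then zap one pte */
	if (madvise(p + 4096, 4096, MADV_DONTNEED))
		perror("madvise(MADV_DONTNEED)");

	printf("p[4096] rereads as %d (zapped page is zero-filled)\n", p[4096]);
	printf("p[0]    rereads as %d (untouched)\n", p[0]);
	return 0;
}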
@@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1262 pud = pud_offset(pgd, address); 1290 pud = pud_offset(pgd, address);
1263 if (pud_none(*pud)) 1291 if (pud_none(*pud))
1264 goto no_page_table; 1292 goto no_page_table;
1265 if (pud_huge(*pud)) { 1293 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1266 BUG_ON(flags & FOLL_GET); 1294 BUG_ON(flags & FOLL_GET);
1267 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 1295 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1268 goto out; 1296 goto out;
@@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1273 pmd = pmd_offset(pud, address); 1301 pmd = pmd_offset(pud, address);
1274 if (pmd_none(*pmd)) 1302 if (pmd_none(*pmd))
1275 goto no_page_table; 1303 goto no_page_table;
1276 if (pmd_huge(*pmd)) { 1304 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1277 BUG_ON(flags & FOLL_GET); 1305 BUG_ON(flags & FOLL_GET);
1278 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1306 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1279 goto out; 1307 goto out;
1280 } 1308 }
1309 if (pmd_trans_huge(*pmd)) {
1310 if (flags & FOLL_SPLIT) {
1311 split_huge_page_pmd(mm, pmd);
1312 goto split_fallthrough;
1313 }
1314 spin_lock(&mm->page_table_lock);
1315 if (likely(pmd_trans_huge(*pmd))) {
1316 if (unlikely(pmd_trans_splitting(*pmd))) {
1317 spin_unlock(&mm->page_table_lock);
1318 wait_split_huge_page(vma->anon_vma, pmd);
1319 } else {
1320 page = follow_trans_huge_pmd(mm, address,
1321 pmd, flags);
1322 spin_unlock(&mm->page_table_lock);
1323 goto out;
1324 }
1325 } else
1326 spin_unlock(&mm->page_table_lock);
1327 /* fall through */
1328 }
1329split_fallthrough:
1281 if (unlikely(pmd_bad(*pmd))) 1330 if (unlikely(pmd_bad(*pmd)))
1282 goto no_page_table; 1331 goto no_page_table;
1283 1332
@@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1310 */ 1359 */
1311 mark_page_accessed(page); 1360 mark_page_accessed(page);
1312 } 1361 }
1362 if (flags & FOLL_MLOCK) {
1363 /*
1364 * The preliminary mapping check is mainly to avoid the
1365 * pointless overhead of lock_page on the ZERO_PAGE
1366 * which might bounce very badly if there is contention.
1367 *
1368 * If the page is already locked, we don't need to
1369 * handle it now - vmscan will handle it later if and
1370 * when it attempts to reclaim the page.
1371 */
1372 if (page->mapping && trylock_page(page)) {
1373 lru_add_drain(); /* push cached pages to LRU */
1374 /*
1375 * Because we lock page here and migration is
1376 * blocked by the pte's page reference, we need
1377 * only check for file-cache page truncation.
1378 */
1379 if (page->mapping)
1380 mlock_vma_page(page);
1381 unlock_page(page);
1382 }
1383 }
1313unlock: 1384unlock:
1314 pte_unmap_unlock(ptep, ptl); 1385 pte_unmap_unlock(ptep, ptl);
1315out: 1386out:
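The FOLL_MLOCK block added to follow_page() is what lets mlock() mark pages Mlocked while get_user_pages() walks them, replacing the private page array that mm/mlock.c used to keep (see the mlock.c hunks further down). A quick, hedged userspace check is simply to lock a mapping and read VmLck from /proc/self/status.

/* mlock() a buffer and report VmLck; needs RLIMIT_MEMLOCK headroom or root. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define SZ (1UL << 20)

int main(void)
{
	char line[256];
	FILE *f;
	char *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED || mlock(p, SZ)) {
		perror("mmap/mlock");
		return 1;
	}
	/* mlock() has already faulted the pages in and marked them Mlocked */

	f = fopen("/proc/self/status", "r");
	while (f && fgets(line, sizeof(line), f))
		if (!strncmp(line, "VmLck", 5))
			fputs(line, stdout);
	if (f)
		fclose(f);

	munlock(p, SZ);
	return 0;
}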
@@ -1341,7 +1412,8 @@ no_page_table:
1341 1412
1342int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1413int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1343 unsigned long start, int nr_pages, unsigned int gup_flags, 1414 unsigned long start, int nr_pages, unsigned int gup_flags,
1344 struct page **pages, struct vm_area_struct **vmas) 1415 struct page **pages, struct vm_area_struct **vmas,
1416 int *nonblocking)
1345{ 1417{
1346 int i; 1418 int i;
1347 unsigned long vm_flags; 1419 unsigned long vm_flags;
@@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1386 pmd = pmd_offset(pud, pg); 1458 pmd = pmd_offset(pud, pg);
1387 if (pmd_none(*pmd)) 1459 if (pmd_none(*pmd))
1388 return i ? : -EFAULT; 1460 return i ? : -EFAULT;
1461 VM_BUG_ON(pmd_trans_huge(*pmd));
1389 pte = pte_offset_map(pmd, pg); 1462 pte = pte_offset_map(pmd, pg);
1390 if (pte_none(*pte)) { 1463 if (pte_none(*pte)) {
1391 pte_unmap(pte); 1464 pte_unmap(pte);
@@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1441 cond_resched(); 1514 cond_resched();
1442 while (!(page = follow_page(vma, start, foll_flags))) { 1515 while (!(page = follow_page(vma, start, foll_flags))) {
1443 int ret; 1516 int ret;
1517 unsigned int fault_flags = 0;
1518
1519 if (foll_flags & FOLL_WRITE)
1520 fault_flags |= FAULT_FLAG_WRITE;
1521 if (nonblocking)
1522 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1444 1523
1445 ret = handle_mm_fault(mm, vma, start, 1524 ret = handle_mm_fault(mm, vma, start,
1446 (foll_flags & FOLL_WRITE) ? 1525 fault_flags);
1447 FAULT_FLAG_WRITE : 0);
1448 1526
1449 if (ret & VM_FAULT_ERROR) { 1527 if (ret & VM_FAULT_ERROR) {
1450 if (ret & VM_FAULT_OOM) 1528 if (ret & VM_FAULT_OOM)
@@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1460 else 1538 else
1461 tsk->min_flt++; 1539 tsk->min_flt++;
1462 1540
1541 if (ret & VM_FAULT_RETRY) {
1542 *nonblocking = 0;
1543 return i;
1544 }
1545
1463 /* 1546 /*
1464 * The VM_FAULT_WRITE bit tells us that 1547 * The VM_FAULT_WRITE bit tells us that
1465 * do_wp_page has broken COW when necessary, 1548 * do_wp_page has broken COW when necessary,
@@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1559 if (force) 1642 if (force)
1560 flags |= FOLL_FORCE; 1643 flags |= FOLL_FORCE;
1561 1644
1562 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 1645 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1646 NULL);
1563} 1647}
1564EXPORT_SYMBOL(get_user_pages); 1648EXPORT_SYMBOL(get_user_pages);
1565 1649
@@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr)
1584 struct page *page; 1668 struct page *page;
1585 1669
1586 if (__get_user_pages(current, current->mm, addr, 1, 1670 if (__get_user_pages(current, current->mm, addr, 1,
1587 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) 1671 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1672 NULL) < 1)
1588 return NULL; 1673 return NULL;
1589 flush_cache_page(vma, addr, page_to_pfn(page)); 1674 flush_cache_page(vma, addr, page_to_pfn(page));
1590 return page; 1675 return page;
@@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1598 pud_t * pud = pud_alloc(mm, pgd, addr); 1683 pud_t * pud = pud_alloc(mm, pgd, addr);
1599 if (pud) { 1684 if (pud) {
1600 pmd_t * pmd = pmd_alloc(mm, pud, addr); 1685 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1601 if (pmd) 1686 if (pmd) {
1687 VM_BUG_ON(pmd_trans_huge(*pmd));
1602 return pte_alloc_map_lock(mm, pmd, addr, ptl); 1688 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1689 }
1603 } 1690 }
1604 return NULL; 1691 return NULL;
1605} 1692}
@@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1818 pmd = pmd_alloc(mm, pud, addr); 1905 pmd = pmd_alloc(mm, pud, addr);
1819 if (!pmd) 1906 if (!pmd)
1820 return -ENOMEM; 1907 return -ENOMEM;
1908 VM_BUG_ON(pmd_trans_huge(*pmd));
1821 do { 1909 do {
1822 next = pmd_addr_end(addr, end); 1910 next = pmd_addr_end(addr, end);
1823 if (remap_pte_range(mm, pmd, addr, next, 1911 if (remap_pte_range(mm, pmd, addr, next,
@@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2048 return same; 2136 return same;
2049} 2137}
2050 2138
2051/*
2052 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
2053 * servicing faults for write access. In the normal case, do always want
2054 * pte_mkwrite. But get_user_pages can cause write faults for mappings
2055 * that do not have writing enabled, when used by access_process_vm.
2056 */
2057static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
2058{
2059 if (likely(vma->vm_flags & VM_WRITE))
2060 pte = pte_mkwrite(pte);
2061 return pte;
2062}
2063
2064static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2139static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2065{ 2140{
2066 /* 2141 /*
@@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2112{ 2187{
2113 struct page *old_page, *new_page; 2188 struct page *old_page, *new_page;
2114 pte_t entry; 2189 pte_t entry;
2115 int reuse = 0, ret = 0; 2190 int ret = 0;
2116 int page_mkwrite = 0; 2191 int page_mkwrite = 0;
2117 struct page *dirty_page = NULL; 2192 struct page *dirty_page = NULL;
2118 2193
@@ -2144,19 +2219,20 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2144 &ptl); 2219 &ptl);
2145 if (!pte_same(*page_table, orig_pte)) { 2220 if (!pte_same(*page_table, orig_pte)) {
2146 unlock_page(old_page); 2221 unlock_page(old_page);
2147 page_cache_release(old_page);
2148 goto unlock; 2222 goto unlock;
2149 } 2223 }
2150 page_cache_release(old_page); 2224 page_cache_release(old_page);
2151 } 2225 }
2152 reuse = reuse_swap_page(old_page); 2226 if (reuse_swap_page(old_page)) {
2153 if (reuse)
2154 /* 2227 /*
2155 * The page is all ours. Move it to our anon_vma so 2228 * The page is all ours. Move it to our anon_vma so
2156 * the rmap code will not search our parent or siblings. 2229 * the rmap code will not search our parent or siblings.
2157 * Protected against the rmap code by the page lock. 2230 * Protected against the rmap code by the page lock.
2158 */ 2231 */
2159 page_move_anon_rmap(old_page, vma, address); 2232 page_move_anon_rmap(old_page, vma, address);
2233 unlock_page(old_page);
2234 goto reuse;
2235 }
2160 unlock_page(old_page); 2236 unlock_page(old_page);
2161 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2237 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2162 (VM_WRITE|VM_SHARED))) { 2238 (VM_WRITE|VM_SHARED))) {
@@ -2212,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2212 &ptl); 2288 &ptl);
2213 if (!pte_same(*page_table, orig_pte)) { 2289 if (!pte_same(*page_table, orig_pte)) {
2214 unlock_page(old_page); 2290 unlock_page(old_page);
2215 page_cache_release(old_page);
2216 goto unlock; 2291 goto unlock;
2217 } 2292 }
2218 2293
@@ -2220,18 +2295,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2220 } 2295 }
2221 dirty_page = old_page; 2296 dirty_page = old_page;
2222 get_page(dirty_page); 2297 get_page(dirty_page);
2223 reuse = 1;
2224 }
2225 2298
2226 if (reuse) {
2227reuse: 2299reuse:
2228 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2300 flush_cache_page(vma, address, pte_pfn(orig_pte));
2229 entry = pte_mkyoung(orig_pte); 2301 entry = pte_mkyoung(orig_pte);
2230 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2302 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2231 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2303 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2232 update_mmu_cache(vma, address, page_table); 2304 update_mmu_cache(vma, address, page_table);
2305 pte_unmap_unlock(page_table, ptl);
2233 ret |= VM_FAULT_WRITE; 2306 ret |= VM_FAULT_WRITE;
2234 goto unlock; 2307
2308 if (!dirty_page)
2309 return ret;
2310
2311 /*
2312 * Yes, Virginia, this is actually required to prevent a race
2313 * with clear_page_dirty_for_io() from clearing the page dirty
2314 * bit after it clear all dirty ptes, but before a racing
2315 * do_wp_page installs a dirty pte.
2316 *
2317 * do_no_page is protected similarly.
2318 */
2319 if (!page_mkwrite) {
2320 wait_on_page_locked(dirty_page);
2321 set_page_dirty_balance(dirty_page, page_mkwrite);
2322 }
2323 put_page(dirty_page);
2324 if (page_mkwrite) {
2325 struct address_space *mapping = dirty_page->mapping;
2326
2327 set_page_dirty(dirty_page);
2328 unlock_page(dirty_page);
2329 page_cache_release(dirty_page);
2330 if (mapping) {
2331 /*
2332 * Some device drivers do not set page.mapping
2333 * but still dirty their pages
2334 */
2335 balance_dirty_pages_ratelimited(mapping);
2336 }
2337 }
2338
2339 /* file_update_time outside page_lock */
2340 if (vma->vm_file)
2341 file_update_time(vma->vm_file);
2342
2343 return ret;
2235 } 2344 }
2236 2345
2237 /* 2346 /*
@@ -2256,16 +2365,6 @@ gotten:
2256 } 2365 }
2257 __SetPageUptodate(new_page); 2366 __SetPageUptodate(new_page);
2258 2367
2259 /*
2260 * Don't let another task, with possibly unlocked vma,
2261 * keep the mlocked page.
2262 */
2263 if ((vma->vm_flags & VM_LOCKED) && old_page) {
2264 lock_page(old_page); /* for LRU manipulation */
2265 clear_page_mlock(old_page);
2266 unlock_page(old_page);
2267 }
2268
2269 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2368 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2270 goto oom_free_new; 2369 goto oom_free_new;
2271 2370
@@ -2333,42 +2432,19 @@ gotten:
2333 2432
2334 if (new_page) 2433 if (new_page)
2335 page_cache_release(new_page); 2434 page_cache_release(new_page);
2336 if (old_page)
2337 page_cache_release(old_page);
2338unlock: 2435unlock:
2339 pte_unmap_unlock(page_table, ptl); 2436 pte_unmap_unlock(page_table, ptl);
2340 if (dirty_page) { 2437 if (old_page) {
2341 /* 2438 /*
2342 * Yes, Virginia, this is actually required to prevent a race 2439 * Don't let another task, with possibly unlocked vma,
2343 * with clear_page_dirty_for_io() from clearing the page dirty 2440 * keep the mlocked page.
2344 * bit after it clear all dirty ptes, but before a racing
2345 * do_wp_page installs a dirty pte.
2346 *
2347 * do_no_page is protected similarly.
2348 */ 2441 */
2349 if (!page_mkwrite) { 2442 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
2350 wait_on_page_locked(dirty_page); 2443 lock_page(old_page); /* LRU manipulation */
2351 set_page_dirty_balance(dirty_page, page_mkwrite); 2444 munlock_vma_page(old_page);
2352 } 2445 unlock_page(old_page);
2353 put_page(dirty_page);
2354 if (page_mkwrite) {
2355 struct address_space *mapping = dirty_page->mapping;
2356
2357 set_page_dirty(dirty_page);
2358 unlock_page(dirty_page);
2359 page_cache_release(dirty_page);
2360 if (mapping) {
2361 /*
2362 * Some device drivers do not set page.mapping
2363 * but still dirty their pages
2364 */
2365 balance_dirty_pages_ratelimited(mapping);
2366 }
2367 } 2446 }
2368 2447 page_cache_release(old_page);
2369 /* file_update_time outside page_lock */
2370 if (vma->vm_file)
2371 file_update_time(vma->vm_file);
2372 } 2448 }
2373 return ret; 2449 return ret;
2374oom_free_new: 2450oom_free_new:
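The do_wp_page() rework above moves the dirty-page balancing and file_update_time() tail out of the old reuse branch without changing what userspace sees: a write fault on a present, read-only pte of a shared writable mapping must leave behind a dirty page that writeback can find. A hedged, generic illustration (the temp file name is arbitrary, not from the patch):

/* A write fault on a present, read-only pte of a MAP_SHARED file mapping:
 * the case the shared-writable branch of do_wp_page() serves. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	char buf[6] = "";
	char first;
	int fd = open("/tmp/wp_demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
	char *p;

	if (fd < 0 || ftruncate(fd, 4096))
		return 1;
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	first = *(volatile char *)p;	/* read fault: pte installed read-only */
	memcpy(p, "hello", 5);		/* write fault on the now-present pte */
	msync(p, 4096, MS_SYNC);	/* let writeback find the dirty page */

	if (pread(fd, buf, 5, 0) == 5)
		printf("file now contains \"%s\" (first byte was %d)\n",
		       buf, first);
	close(fd);
	unlink("/tmp/wp_demo");
	return 0;
}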
@@ -2975,12 +3051,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2975 goto out; 3051 goto out;
2976 } 3052 }
2977 charged = 1; 3053 charged = 1;
2978 /*
2979 * Don't let another task, with possibly unlocked vma,
2980 * keep the mlocked page.
2981 */
2982 if (vma->vm_flags & VM_LOCKED)
2983 clear_page_mlock(vmf.page);
2984 copy_user_highpage(page, vmf.page, address, vma); 3054 copy_user_highpage(page, vmf.page, address, vma);
2985 __SetPageUptodate(page); 3055 __SetPageUptodate(page);
2986 } else { 3056 } else {
@@ -3147,9 +3217,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3147 * but allow concurrent faults), and pte mapped but not yet locked. 3217 * but allow concurrent faults), and pte mapped but not yet locked.
3148 * We return with mmap_sem still held, but pte unmapped and unlocked. 3218 * We return with mmap_sem still held, but pte unmapped and unlocked.
3149 */ 3219 */
3150static inline int handle_pte_fault(struct mm_struct *mm, 3220int handle_pte_fault(struct mm_struct *mm,
3151 struct vm_area_struct *vma, unsigned long address, 3221 struct vm_area_struct *vma, unsigned long address,
3152 pte_t *pte, pmd_t *pmd, unsigned int flags) 3222 pte_t *pte, pmd_t *pmd, unsigned int flags)
3153{ 3223{
3154 pte_t entry; 3224 pte_t entry;
3155 spinlock_t *ptl; 3225 spinlock_t *ptl;
@@ -3228,9 +3298,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3228 pmd = pmd_alloc(mm, pud, address); 3298 pmd = pmd_alloc(mm, pud, address);
3229 if (!pmd) 3299 if (!pmd)
3230 return VM_FAULT_OOM; 3300 return VM_FAULT_OOM;
3231 pte = pte_alloc_map(mm, pmd, address); 3301 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3232 if (!pte) 3302 if (!vma->vm_ops)
3303 return do_huge_pmd_anonymous_page(mm, vma, address,
3304 pmd, flags);
3305 } else {
3306 pmd_t orig_pmd = *pmd;
3307 barrier();
3308 if (pmd_trans_huge(orig_pmd)) {
3309 if (flags & FAULT_FLAG_WRITE &&
3310 !pmd_write(orig_pmd) &&
3311 !pmd_trans_splitting(orig_pmd))
3312 return do_huge_pmd_wp_page(mm, vma, address,
3313 pmd, orig_pmd);
3314 return 0;
3315 }
3316 }
3317
3318 /*
3319 * Use __pte_alloc instead of pte_alloc_map, because we can't
3320 * run pte_offset_map on the pmd, if an huge pmd could
3321 * materialize from under us from a different thread.
3322 */
3323 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
3233 return VM_FAULT_OOM; 3324 return VM_FAULT_OOM;
3325 /* if an huge pmd materialized from under us just retry later */
3326 if (unlikely(pmd_trans_huge(*pmd)))
3327 return 0;
3328 /*
3329 * A regular pmd is established and it can't morph into a huge pmd
3330 * from under us anymore at this point because we hold the mmap_sem
3331 * read mode and khugepaged takes it in write mode. So now it's
3332 * safe to run pte_offset_map().
3333 */
3334 pte = pte_offset_map(pmd, address);
3234 3335
3235 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3336 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3236} 3337}
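handle_mm_fault() now sends the first fault on an empty pmd of a plain anonymous VMA to do_huge_pmd_anonymous_page(), and only falls back to __pte_alloc()/handle_pte_fault() otherwise. A hedged way to see whether huge faults actually happened is to touch a large anonymous mapping and read the AnonHugePages counter this series exports in /proc/meminfo; the value stays 0 if THP is disabled or every allocation fell back to 4 KB pages.

/* Touch a large anonymous mapping, then report the global AnonHugePages
 * counter while the mapping is still alive. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14
#endif

#define SZ (8UL << 20)

int main(void)
{
	char line[128];
	FILE *f;
	char *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	madvise(p, SZ, MADV_HUGEPAGE);	/* hint; harmless if unsupported */
	memset(p, 1, SZ);		/* the faults of interest happen here */

	f = fopen("/proc/meminfo", "r");
	while (f && fgets(line, sizeof(line), f))
		if (!strncmp(line, "AnonHugePages:", 14))
			fputs(line, stdout);
	if (f)
		fclose(f);
	munmap(p, SZ);
	return 0;
}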
@@ -3296,7 +3397,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
3296 vma = find_vma(current->mm, addr); 3397 vma = find_vma(current->mm, addr);
3297 if (!vma) 3398 if (!vma)
3298 return -ENOMEM; 3399 return -ENOMEM;
3299 write = (vma->vm_flags & VM_WRITE) != 0; 3400 /*
3401 * We want to touch writable mappings with a write fault in order
3402 * to break COW, except for shared mappings because these don't COW
3403 * and we would not want to dirty them for nothing.
3404 */
3405 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3300 BUG_ON(addr >= end); 3406 BUG_ON(addr >= end);
3301 BUG_ON(end > vma->vm_end); 3407 BUG_ON(end > vma->vm_end);
3302 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 3408 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3368,6 +3474,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
3368 goto out; 3474 goto out;
3369 3475
3370 pmd = pmd_offset(pud, address); 3476 pmd = pmd_offset(pud, address);
3477 VM_BUG_ON(pmd_trans_huge(*pmd));
3371 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 3478 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3372 goto out; 3479 goto out;
3373 3480
@@ -3608,3 +3715,74 @@ void might_fault(void)
3608} 3715}
3609EXPORT_SYMBOL(might_fault); 3716EXPORT_SYMBOL(might_fault);
3610#endif 3717#endif
3718
3719#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3720static void clear_gigantic_page(struct page *page,
3721 unsigned long addr,
3722 unsigned int pages_per_huge_page)
3723{
3724 int i;
3725 struct page *p = page;
3726
3727 might_sleep();
3728 for (i = 0; i < pages_per_huge_page;
3729 i++, p = mem_map_next(p, page, i)) {
3730 cond_resched();
3731 clear_user_highpage(p, addr + i * PAGE_SIZE);
3732 }
3733}
3734void clear_huge_page(struct page *page,
3735 unsigned long addr, unsigned int pages_per_huge_page)
3736{
3737 int i;
3738
3739 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3740 clear_gigantic_page(page, addr, pages_per_huge_page);
3741 return;
3742 }
3743
3744 might_sleep();
3745 for (i = 0; i < pages_per_huge_page; i++) {
3746 cond_resched();
3747 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3748 }
3749}
3750
3751static void copy_user_gigantic_page(struct page *dst, struct page *src,
3752 unsigned long addr,
3753 struct vm_area_struct *vma,
3754 unsigned int pages_per_huge_page)
3755{
3756 int i;
3757 struct page *dst_base = dst;
3758 struct page *src_base = src;
3759
3760 for (i = 0; i < pages_per_huge_page; ) {
3761 cond_resched();
3762 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3763
3764 i++;
3765 dst = mem_map_next(dst, dst_base, i);
3766 src = mem_map_next(src, src_base, i);
3767 }
3768}
3769
3770void copy_user_huge_page(struct page *dst, struct page *src,
3771 unsigned long addr, struct vm_area_struct *vma,
3772 unsigned int pages_per_huge_page)
3773{
3774 int i;
3775
3776 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3777 copy_user_gigantic_page(dst, src, addr, vma,
3778 pages_per_huge_page);
3779 return;
3780 }
3781
3782 might_sleep();
3783 for (i = 0; i < pages_per_huge_page; i++) {
3784 cond_resched();
3785 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
3786 }
3787}
3788#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
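clear_huge_page() and copy_user_huge_page() are generalized here so both hugetlbfs and THP can clear or copy a compound page one PAGE_SIZE chunk at a time, with cond_resched() between subpages so a 2 MB (or gigantic) operation cannot hog the CPU. The sketch below is only a userspace model of that chunking idea, with sched_yield() standing in for cond_resched().

/* Chunked clear of a large buffer, yielding between 4 KB subpages, the
 * same shape as clear_huge_page(). */
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SZ 4096UL
#define HUGE_SZ (2UL << 20)

static void clear_huge_buffer(char *buf, size_t size)
{
	size_t off;

	for (off = 0; off < size; off += PAGE_SZ) {
		memset(buf + off, 0, PAGE_SZ);
		sched_yield();	/* be nice to other runnable tasks */
	}
}

int main(void)
{
	char *buf = malloc(HUGE_SZ);

	if (!buf)
		return 1;
	memset(buf, 0xff, HUGE_SZ);
	clear_huge_buffer(buf, HUGE_SZ);
	printf("buf[0]=%d buf[last]=%d\n", buf[0], buf[HUGE_SZ - 1]);
	free(buf);
	return 0;
}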
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9260314a221e..321fc7455df7 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,23 @@
34 34
35#include "internal.h" 35#include "internal.h"
36 36
37DEFINE_MUTEX(mem_hotplug_mutex);
38
39void lock_memory_hotplug(void)
40{
41 mutex_lock(&mem_hotplug_mutex);
42
43 /* for exclusive hibernation if CONFIG_HIBERNATION=y */
44 lock_system_sleep();
45}
46
47void unlock_memory_hotplug(void)
48{
49 unlock_system_sleep();
50 mutex_unlock(&mem_hotplug_mutex);
51}
52
53
37/* add this memory to iomem resource */ 54/* add this memory to iomem resource */
38static struct resource *register_memory_resource(u64 start, u64 size) 55static struct resource *register_memory_resource(u64 start, u64 size)
39{ 56{
@@ -65,9 +82,10 @@ static void release_memory_resource(struct resource *res)
65 82
66#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 83#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
67#ifndef CONFIG_SPARSEMEM_VMEMMAP 84#ifndef CONFIG_SPARSEMEM_VMEMMAP
68static void get_page_bootmem(unsigned long info, struct page *page, int type) 85static void get_page_bootmem(unsigned long info, struct page *page,
86 unsigned long type)
69{ 87{
70 atomic_set(&page->_mapcount, type); 88 page->lru.next = (struct list_head *) type;
71 SetPagePrivate(page); 89 SetPagePrivate(page);
72 set_page_private(page, info); 90 set_page_private(page, info);
73 atomic_inc(&page->_count); 91 atomic_inc(&page->_count);
@@ -77,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
77 * so use __ref to tell modpost not to generate a warning */ 95 * so use __ref to tell modpost not to generate a warning */
78void __ref put_page_bootmem(struct page *page) 96void __ref put_page_bootmem(struct page *page)
79{ 97{
80 int type; 98 unsigned long type;
81 99
82 type = atomic_read(&page->_mapcount); 100 type = (unsigned long) page->lru.next;
83 BUG_ON(type >= -1); 101 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
102 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
84 103
85 if (atomic_dec_return(&page->_count) == 1) { 104 if (atomic_dec_return(&page->_count) == 1) {
86 ClearPagePrivate(page); 105 ClearPagePrivate(page);
87 set_page_private(page, 0); 106 set_page_private(page, 0);
88 reset_page_mapcount(page); 107 INIT_LIST_HEAD(&page->lru);
89 __free_pages_bootmem(page, 0); 108 __free_pages_bootmem(page, 0);
90 } 109 }
91 110
@@ -390,6 +409,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
390 int ret; 409 int ret;
391 struct memory_notify arg; 410 struct memory_notify arg;
392 411
412 lock_memory_hotplug();
393 arg.start_pfn = pfn; 413 arg.start_pfn = pfn;
394 arg.nr_pages = nr_pages; 414 arg.nr_pages = nr_pages;
395 arg.status_change_nid = -1; 415 arg.status_change_nid = -1;
@@ -402,6 +422,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
402 ret = notifier_to_errno(ret); 422 ret = notifier_to_errno(ret);
403 if (ret) { 423 if (ret) {
404 memory_notify(MEM_CANCEL_ONLINE, &arg); 424 memory_notify(MEM_CANCEL_ONLINE, &arg);
425 unlock_memory_hotplug();
405 return ret; 426 return ret;
406 } 427 }
407 /* 428 /*
@@ -426,6 +447,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
426 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 447 printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
427 nr_pages, pfn); 448 nr_pages, pfn);
428 memory_notify(MEM_CANCEL_ONLINE, &arg); 449 memory_notify(MEM_CANCEL_ONLINE, &arg);
450 unlock_memory_hotplug();
429 return ret; 451 return ret;
430 } 452 }
431 453
@@ -450,6 +472,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
450 472
451 if (onlined_pages) 473 if (onlined_pages)
452 memory_notify(MEM_ONLINE, &arg); 474 memory_notify(MEM_ONLINE, &arg);
475 unlock_memory_hotplug();
453 476
454 return 0; 477 return 0;
455} 478}
@@ -493,7 +516,7 @@ int mem_online_node(int nid)
493 pg_data_t *pgdat; 516 pg_data_t *pgdat;
494 int ret; 517 int ret;
495 518
496 lock_system_sleep(); 519 lock_memory_hotplug();
497 pgdat = hotadd_new_pgdat(nid, 0); 520 pgdat = hotadd_new_pgdat(nid, 0);
498 if (pgdat) { 521 if (pgdat) {
499 ret = -ENOMEM; 522 ret = -ENOMEM;
@@ -504,7 +527,7 @@ int mem_online_node(int nid)
504 BUG_ON(ret); 527 BUG_ON(ret);
505 528
506out: 529out:
507 unlock_system_sleep(); 530 unlock_memory_hotplug();
508 return ret; 531 return ret;
509} 532}
510 533
@@ -516,7 +539,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
516 struct resource *res; 539 struct resource *res;
517 int ret; 540 int ret;
518 541
519 lock_system_sleep(); 542 lock_memory_hotplug();
520 543
521 res = register_memory_resource(start, size); 544 res = register_memory_resource(start, size);
522 ret = -EEXIST; 545 ret = -EEXIST;
@@ -563,7 +586,7 @@ error:
563 release_memory_resource(res); 586 release_memory_resource(res);
564 587
565out: 588out:
566 unlock_system_sleep(); 589 unlock_memory_hotplug();
567 return ret; 590 return ret;
568} 591}
569EXPORT_SYMBOL_GPL(add_memory); 592EXPORT_SYMBOL_GPL(add_memory);
@@ -716,7 +739,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
716 goto out; 739 goto out;
717 } 740 }
718 /* this function returns # of failed pages */ 741 /* this function returns # of failed pages */
719 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); 742 ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
743 true, true);
720 if (ret) 744 if (ret)
721 putback_lru_pages(&source); 745 putback_lru_pages(&source);
722 } 746 }
@@ -791,7 +815,7 @@ static int offline_pages(unsigned long start_pfn,
791 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 815 if (!test_pages_in_a_zone(start_pfn, end_pfn))
792 return -EINVAL; 816 return -EINVAL;
793 817
794 lock_system_sleep(); 818 lock_memory_hotplug();
795 819
796 zone = page_zone(pfn_to_page(start_pfn)); 820 zone = page_zone(pfn_to_page(start_pfn));
797 node = zone_to_nid(zone); 821 node = zone_to_nid(zone);
@@ -880,7 +904,7 @@ repeat:
880 writeback_set_ratelimit(); 904 writeback_set_ratelimit();
881 905
882 memory_notify(MEM_OFFLINE, &arg); 906 memory_notify(MEM_OFFLINE, &arg);
883 unlock_system_sleep(); 907 unlock_memory_hotplug();
884 return 0; 908 return 0;
885 909
886failed_removal: 910failed_removal:
@@ -891,7 +915,7 @@ failed_removal:
891 undo_isolate_page_range(start_pfn, end_pfn); 915 undo_isolate_page_range(start_pfn, end_pfn);
892 916
893out: 917out:
894 unlock_system_sleep(); 918 unlock_memory_hotplug();
895 return ret; 919 return ret;
896} 920}
897 921
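memory_hotplug.c swaps lock_system_sleep() for a dedicated mem_hotplug_mutex behind lock_memory_hotplug()/unlock_memory_hotplug(), which the hwpoison code above now takes as well; the sysfs interface is unchanged. A hedged sketch of driving it from userspace, assuming CONFIG_MEMORY_HOTPLUG, root, and a real removable block (the block name below is hypothetical):

/* Offline and re-online one memory block through sysfs. */
#include <stdio.h>

static int set_state(const char *block, const char *state)
{
	char path[96];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/memory/%s/state", block);
	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return -1;
	}
	fputs(state, f);
	return fclose(f);	/* 0 on success, EOF if the kernel refused */
}

int main(void)
{
	const char *block = "memory8";	/* hypothetical block number */

	if (set_state(block, "offline") == 0)
		printf("%s offlined, bringing it back\n", block);
	set_state(block, "online");
	return 0;
}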
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4a57f135b76e..368fc9d23610 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
514 pmd = pmd_offset(pud, addr); 514 pmd = pmd_offset(pud, addr);
515 do { 515 do {
516 next = pmd_addr_end(addr, end); 516 next = pmd_addr_end(addr, end);
517 split_huge_page_pmd(vma->vm_mm, pmd);
517 if (pmd_none_or_clear_bad(pmd)) 518 if (pmd_none_or_clear_bad(pmd))
518 continue; 519 continue;
519 if (check_pte_range(vma, pmd, addr, next, nodes, 520 if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -935,7 +936,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
935 return PTR_ERR(vma); 936 return PTR_ERR(vma);
936 937
937 if (!list_empty(&pagelist)) { 938 if (!list_empty(&pagelist)) {
938 err = migrate_pages(&pagelist, new_node_page, dest, 0); 939 err = migrate_pages(&pagelist, new_node_page, dest,
940 false, true);
939 if (err) 941 if (err)
940 putback_lru_pages(&pagelist); 942 putback_lru_pages(&pagelist);
941 } 943 }
@@ -1155,7 +1157,8 @@ static long do_mbind(unsigned long start, unsigned long len,
1155 1157
1156 if (!list_empty(&pagelist)) { 1158 if (!list_empty(&pagelist)) {
1157 nr_failed = migrate_pages(&pagelist, new_vma_page, 1159 nr_failed = migrate_pages(&pagelist, new_vma_page,
1158 (unsigned long)vma, 0); 1160 (unsigned long)vma,
1161 false, true);
1159 if (nr_failed) 1162 if (nr_failed)
1160 putback_lru_pages(&pagelist); 1163 putback_lru_pages(&pagelist);
1161 } 1164 }
@@ -1307,15 +1310,15 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1307 goto out; 1310 goto out;
1308 1311
1309 /* Find the mm_struct */ 1312 /* Find the mm_struct */
1310 read_lock(&tasklist_lock); 1313 rcu_read_lock();
1311 task = pid ? find_task_by_vpid(pid) : current; 1314 task = pid ? find_task_by_vpid(pid) : current;
1312 if (!task) { 1315 if (!task) {
1313 read_unlock(&tasklist_lock); 1316 rcu_read_unlock();
1314 err = -ESRCH; 1317 err = -ESRCH;
1315 goto out; 1318 goto out;
1316 } 1319 }
1317 mm = get_task_mm(task); 1320 mm = get_task_mm(task);
1318 read_unlock(&tasklist_lock); 1321 rcu_read_unlock();
1319 1322
1320 err = -EINVAL; 1323 err = -EINVAL;
1321 if (!mm) 1324 if (!mm)
@@ -1793,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1793} 1796}
1794 1797
1795/** 1798/**
1796 * alloc_page_vma - Allocate a page for a VMA. 1799 * alloc_pages_vma - Allocate a page for a VMA.
1797 * 1800 *
1798 * @gfp: 1801 * @gfp:
1799 * %GFP_USER user allocation. 1802 * %GFP_USER user allocation.
@@ -1802,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1802 * %GFP_FS allocation should not call back into a file system. 1805 * %GFP_FS allocation should not call back into a file system.
1803 * %GFP_ATOMIC don't sleep. 1806 * %GFP_ATOMIC don't sleep.
1804 * 1807 *
1808 * @order: Order of the GFP allocation.
1805 * @vma: Pointer to VMA or NULL if not available. 1809 * @vma: Pointer to VMA or NULL if not available.
1806 * @addr: Virtual Address of the allocation. Must be inside the VMA. 1810 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1807 * 1811 *
@@ -1815,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1815 * Should be called with the mm_sem of the vma held. 1819 * Should be called with the mm_sem of the vma held.
1816 */ 1820 */
1817struct page * 1821struct page *
1818alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) 1822alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1823 unsigned long addr)
1819{ 1824{
1820 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1825 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1821 struct zonelist *zl; 1826 struct zonelist *zl;
@@ -1827,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1827 1832
1828 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1833 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1829 mpol_cond_put(pol); 1834 mpol_cond_put(pol);
1830 page = alloc_page_interleave(gfp, 0, nid); 1835 page = alloc_page_interleave(gfp, order, nid);
1831 put_mems_allowed(); 1836 put_mems_allowed();
1832 return page; 1837 return page;
1833 } 1838 }
@@ -1836,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1836 /* 1841 /*
1837 * slow path: ref counted shared policy 1842 * slow path: ref counted shared policy
1838 */ 1843 */
1839 struct page *page = __alloc_pages_nodemask(gfp, 0, 1844 struct page *page = __alloc_pages_nodemask(gfp, order,
1840 zl, policy_nodemask(gfp, pol)); 1845 zl, policy_nodemask(gfp, pol));
1841 __mpol_put(pol); 1846 __mpol_put(pol);
1842 put_mems_allowed(); 1847 put_mems_allowed();
@@ -1845,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1845 /* 1850 /*
1846 * fast path: default or task policy 1851 * fast path: default or task policy
1847 */ 1852 */
1848 page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); 1853 page = __alloc_pages_nodemask(gfp, order, zl,
1854 policy_nodemask(gfp, pol));
1849 put_mems_allowed(); 1855 put_mems_allowed();
1850 return page; 1856 return page;
1851} 1857}
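The mempolicy changes pass the new migrate_pages() arguments (offlining=false, sync=true), make alloc_pages_vma() order-aware for THP, and use RCU instead of tasklist_lock for the migrate_pages() syscall's task lookup. The usual userspace entry into do_mbind()'s migration path is mbind() with MPOL_MF_MOVE on an already-faulted range; a hedged sketch assuming libnuma's <numaif.h> (link with -lnuma):

/* Bind an already-faulted buffer to node 0 and let the kernel migrate the
 * misplaced pages (do_mbind -> migrate_pages).  On a single-node machine
 * there is simply nothing to move. */
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define SZ (1UL << 20)

int main(void)
{
	unsigned long nodemask = 1UL;	/* node 0 only */
	char *p = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 1, SZ);		/* allocate first, migrate afterwards */

	if (mbind(p, SZ, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
		  MPOL_MF_MOVE))
		perror("mbind");
	else
		printf("range bound to node 0, existing pages migrated\n");
	return 0;
}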
diff --git a/mm/migrate.c b/mm/migrate.c
index fe5a3c6a5426..766115253807 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -35,6 +35,8 @@
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/gfp.h> 36#include <linux/gfp.h>
37 37
38#include <asm/tlbflush.h>
39
38#include "internal.h" 40#include "internal.h"
39 41
40#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 42#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -111,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
111 goto out; 113 goto out;
112 114
113 pmd = pmd_offset(pud, addr); 115 pmd = pmd_offset(pud, addr);
116 if (pmd_trans_huge(*pmd))
117 goto out;
114 if (!pmd_present(*pmd)) 118 if (!pmd_present(*pmd))
115 goto out; 119 goto out;
116 120
@@ -244,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
244 248
245 expected_count = 2 + page_has_private(page); 249 expected_count = 2 + page_has_private(page);
246 if (page_count(page) != expected_count || 250 if (page_count(page) != expected_count ||
247 (struct page *)radix_tree_deref_slot(pslot) != page) { 251 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
248 spin_unlock_irq(&mapping->tree_lock); 252 spin_unlock_irq(&mapping->tree_lock);
249 return -EAGAIN; 253 return -EAGAIN;
250 } 254 }
@@ -316,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
316 320
317 expected_count = 2 + page_has_private(page); 321 expected_count = 2 + page_has_private(page);
318 if (page_count(page) != expected_count || 322 if (page_count(page) != expected_count ||
319 (struct page *)radix_tree_deref_slot(pslot) != page) { 323 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
320 spin_unlock_irq(&mapping->tree_lock); 324 spin_unlock_irq(&mapping->tree_lock);
321 return -EAGAIN; 325 return -EAGAIN;
322 } 326 }
@@ -612,13 +616,12 @@ static int move_to_new_page(struct page *newpage, struct page *page,
612 * to the newly allocated page in newpage. 616 * to the newly allocated page in newpage.
613 */ 617 */
614static int unmap_and_move(new_page_t get_new_page, unsigned long private, 618static int unmap_and_move(new_page_t get_new_page, unsigned long private,
615 struct page *page, int force, int offlining) 619 struct page *page, int force, bool offlining, bool sync)
616{ 620{
617 int rc = 0; 621 int rc = 0;
618 int *result = NULL; 622 int *result = NULL;
619 struct page *newpage = get_new_page(page, private, &result); 623 struct page *newpage = get_new_page(page, private, &result);
620 int remap_swapcache = 1; 624 int remap_swapcache = 1;
621 int rcu_locked = 0;
622 int charge = 0; 625 int charge = 0;
623 struct mem_cgroup *mem = NULL; 626 struct mem_cgroup *mem = NULL;
624 struct anon_vma *anon_vma = NULL; 627 struct anon_vma *anon_vma = NULL;
@@ -630,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
630 /* page was freed from under us. So we are done. */ 633 /* page was freed from under us. So we are done. */
631 goto move_newpage; 634 goto move_newpage;
632 } 635 }
636 if (unlikely(PageTransHuge(page)))
637 if (unlikely(split_huge_page(page)))
638 goto move_newpage;
633 639
634 /* prepare cgroup just returns 0 or -ENOMEM */ 640 /* prepare cgroup just returns 0 or -ENOMEM */
635 rc = -EAGAIN; 641 rc = -EAGAIN;
@@ -637,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
637 if (!trylock_page(page)) { 643 if (!trylock_page(page)) {
638 if (!force) 644 if (!force)
639 goto move_newpage; 645 goto move_newpage;
646
647 /*
648 * It's not safe for direct compaction to call lock_page.
649 * For example, during page readahead pages are added locked
650 * to the LRU. Later, when the IO completes the pages are
651 * marked uptodate and unlocked. However, the queueing
652 * could be merging multiple pages for one bio (e.g.
653 * mpage_readpages). If an allocation happens for the
654 * second or third page, the process can end up locking
655 * the same page twice and deadlocking. Rather than
656 * trying to be clever about what pages can be locked,
657 * avoid the use of lock_page for direct compaction
658 * altogether.
659 */
660 if (current->flags & PF_MEMALLOC)
661 goto move_newpage;
662
640 lock_page(page); 663 lock_page(page);
641 } 664 }
642 665
@@ -663,27 +686,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
663 BUG_ON(charge); 686 BUG_ON(charge);
664 687
665 if (PageWriteback(page)) { 688 if (PageWriteback(page)) {
666 if (!force) 689 if (!force || !sync)
667 goto uncharge; 690 goto uncharge;
668 wait_on_page_writeback(page); 691 wait_on_page_writeback(page);
669 } 692 }
670 /* 693 /*
671 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, 694 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
672 * we cannot notice that anon_vma is freed while we migrate a page. 695 * we cannot notice that anon_vma is freed while we migrate a page.
673 * This rcu_read_lock() delays freeing anon_vma pointer until the end 696 * This get_anon_vma() delays freeing anon_vma pointer until the end
674 * of migration. File cache pages are no problem because of page_lock() 697 * of migration. File cache pages are no problem because of page_lock()
675 * File cache pages may use write_page() or lock_page() in migration, so 698 * File cache pages may use write_page() or lock_page() in migration, so
676 * only anon pages need care here. 699 * only anon pages need care here.
677 */ 700 */
678 if (PageAnon(page)) { 701 if (PageAnon(page)) {
679 rcu_read_lock(); 702 /*
680 rcu_locked = 1; 703 * Only page_lock_anon_vma() understands the subtleties of
681 704 * getting a hold on an anon_vma from outside one of its mms.
682 /* Determine how to safely use anon_vma */ 705 */
683 if (!page_mapped(page)) { 706 anon_vma = page_lock_anon_vma(page);
684 if (!PageSwapCache(page)) 707 if (anon_vma) {
685 goto rcu_unlock; 708 /*
686 709 * Take a reference count on the anon_vma if the
710 * page is mapped so that it is guaranteed to
711 * exist when the page is remapped later
712 */
713 get_anon_vma(anon_vma);
714 page_unlock_anon_vma(anon_vma);
715 } else if (PageSwapCache(page)) {
687 /* 716 /*
688 * We cannot be sure that the anon_vma of an unmapped 717 * We cannot be sure that the anon_vma of an unmapped
689 * swapcache page is safe to use because we don't 718 * swapcache page is safe to use because we don't
@@ -698,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
698 */ 727 */
699 remap_swapcache = 0; 728 remap_swapcache = 0;
700 } else { 729 } else {
701 /* 730 goto uncharge;
702 * Take a reference count on the anon_vma if the
703 * page is mapped so that it is guaranteed to
704 * exist when the page is remapped later
705 */
706 anon_vma = page_anon_vma(page);
707 get_anon_vma(anon_vma);
708 } 731 }
709 } 732 }
710 733
@@ -721,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
721 * free the metadata, so the page can be freed. 744 * free the metadata, so the page can be freed.
722 */ 745 */
723 if (!page->mapping) { 746 if (!page->mapping) {
724 if (!PageAnon(page) && page_has_private(page)) { 747 VM_BUG_ON(PageAnon(page));
725 /* 748 if (page_has_private(page)) {
726 * Go direct to try_to_free_buffers() here because
727 * a) that's what try_to_release_page() would do anyway
728 * b) we may be under rcu_read_lock() here, so we can't
729 * use GFP_KERNEL which is what try_to_release_page()
730 * needs to be effective.
731 */
732 try_to_free_buffers(page); 749 try_to_free_buffers(page);
733 goto rcu_unlock; 750 goto uncharge;
734 } 751 }
735 goto skip_unmap; 752 goto skip_unmap;
736 } 753 }
@@ -744,20 +761,18 @@ skip_unmap:
744 761
745 if (rc && remap_swapcache) 762 if (rc && remap_swapcache)
746 remove_migration_ptes(page, page); 763 remove_migration_ptes(page, page);
747rcu_unlock:
748 764
749 /* Drop an anon_vma reference if we took one */ 765 /* Drop an anon_vma reference if we took one */
750 if (anon_vma) 766 if (anon_vma)
751 drop_anon_vma(anon_vma); 767 drop_anon_vma(anon_vma);
752 768
753 if (rcu_locked)
754 rcu_read_unlock();
755uncharge: 769uncharge:
756 if (!charge) 770 if (!charge)
757 mem_cgroup_end_migration(mem, page, newpage); 771 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
758unlock: 772unlock:
759 unlock_page(page); 773 unlock_page(page);
760 774
775move_newpage:
761 if (rc != -EAGAIN) { 776 if (rc != -EAGAIN) {
762 /* 777 /*
763 * A page that has been migrated has all references 778 * A page that has been migrated has all references
@@ -771,8 +786,6 @@ unlock:
771 putback_lru_page(page); 786 putback_lru_page(page);
772 } 787 }
773 788
774move_newpage:
775
776 /* 789 /*
777 * Move the new page to the LRU. If migration was not successful 790 * Move the new page to the LRU. If migration was not successful
778 * then this will free the page. 791 * then this will free the page.
@@ -808,12 +821,11 @@ move_newpage:
808 */ 821 */
809static int unmap_and_move_huge_page(new_page_t get_new_page, 822static int unmap_and_move_huge_page(new_page_t get_new_page,
810 unsigned long private, struct page *hpage, 823 unsigned long private, struct page *hpage,
811 int force, int offlining) 824 int force, bool offlining, bool sync)
812{ 825{
813 int rc = 0; 826 int rc = 0;
814 int *result = NULL; 827 int *result = NULL;
815 struct page *new_hpage = get_new_page(hpage, private, &result); 828 struct page *new_hpage = get_new_page(hpage, private, &result);
816 int rcu_locked = 0;
817 struct anon_vma *anon_vma = NULL; 829 struct anon_vma *anon_vma = NULL;
818 830
819 if (!new_hpage) 831 if (!new_hpage)
@@ -822,18 +834,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
822 rc = -EAGAIN; 834 rc = -EAGAIN;
823 835
824 if (!trylock_page(hpage)) { 836 if (!trylock_page(hpage)) {
825 if (!force) 837 if (!force || !sync)
826 goto out; 838 goto out;
827 lock_page(hpage); 839 lock_page(hpage);
828 } 840 }
829 841
830 if (PageAnon(hpage)) { 842 if (PageAnon(hpage)) {
831 rcu_read_lock(); 843 anon_vma = page_lock_anon_vma(hpage);
832 rcu_locked = 1; 844 if (anon_vma) {
833 845 get_anon_vma(anon_vma);
834 if (page_mapped(hpage)) { 846 page_unlock_anon_vma(anon_vma);
835 anon_vma = page_anon_vma(hpage);
836 atomic_inc(&anon_vma->external_refcount);
837 } 847 }
838 } 848 }
839 849
@@ -845,16 +855,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
845 if (rc) 855 if (rc)
846 remove_migration_ptes(hpage, hpage); 856 remove_migration_ptes(hpage, hpage);
847 857
848 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, 858 if (anon_vma)
849 &anon_vma->lock)) { 859 drop_anon_vma(anon_vma);
850 int empty = list_empty(&anon_vma->head);
851 spin_unlock(&anon_vma->lock);
852 if (empty)
853 anon_vma_free(anon_vma);
854 }
855
856 if (rcu_locked)
857 rcu_read_unlock();
858out: 860out:
859 unlock_page(hpage); 861 unlock_page(hpage);
860 862
@@ -885,12 +887,13 @@ out:
885 * are movable anymore because to has become empty 887 * are movable anymore because to has become empty
886 * or no retryable pages exist anymore. 888 * or no retryable pages exist anymore.
887 * Caller should call putback_lru_pages to return pages to the LRU 889 * Caller should call putback_lru_pages to return pages to the LRU
888 * or free list. 890 * or free list only if ret != 0.
889 * 891 *
890 * Return: Number of pages not migrated or error code. 892 * Return: Number of pages not migrated or error code.
891 */ 893 */
892int migrate_pages(struct list_head *from, 894int migrate_pages(struct list_head *from,
893 new_page_t get_new_page, unsigned long private, int offlining) 895 new_page_t get_new_page, unsigned long private, bool offlining,
896 bool sync)
894{ 897{
895 int retry = 1; 898 int retry = 1;
896 int nr_failed = 0; 899 int nr_failed = 0;
@@ -910,7 +913,8 @@ int migrate_pages(struct list_head *from,
910 cond_resched(); 913 cond_resched();
911 914
912 rc = unmap_and_move(get_new_page, private, 915 rc = unmap_and_move(get_new_page, private,
913 page, pass > 2, offlining); 916 page, pass > 2, offlining,
917 sync);
914 918
915 switch(rc) { 919 switch(rc) {
916 case -ENOMEM: 920 case -ENOMEM:
@@ -939,7 +943,8 @@ out:
939} 943}
940 944
941int migrate_huge_pages(struct list_head *from, 945int migrate_huge_pages(struct list_head *from,
942 new_page_t get_new_page, unsigned long private, int offlining) 946 new_page_t get_new_page, unsigned long private, bool offlining,
947 bool sync)
943{ 948{
944 int retry = 1; 949 int retry = 1;
945 int nr_failed = 0; 950 int nr_failed = 0;
@@ -955,7 +960,8 @@ int migrate_huge_pages(struct list_head *from,
955 cond_resched(); 960 cond_resched();
956 961
957 rc = unmap_and_move_huge_page(get_new_page, 962 rc = unmap_and_move_huge_page(get_new_page,
958 private, page, pass > 2, offlining); 963 private, page, pass > 2, offlining,
964 sync);
959 965
960 switch(rc) { 966 switch(rc) {
961 case -ENOMEM: 967 case -ENOMEM:
@@ -974,10 +980,6 @@ int migrate_huge_pages(struct list_head *from,
974 } 980 }
975 rc = 0; 981 rc = 0;
976out: 982out:
977
978 list_for_each_entry_safe(page, page2, from, lru)
979 put_page(page);
980
981 if (rc) 983 if (rc)
982 return rc; 984 return rc;
983 985
@@ -1040,7 +1042,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1040 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) 1042 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1041 goto set_status; 1043 goto set_status;
1042 1044
1043 page = follow_page(vma, pp->addr, FOLL_GET); 1045 page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
1044 1046
1045 err = PTR_ERR(page); 1047 err = PTR_ERR(page);
1046 if (IS_ERR(page)) 1048 if (IS_ERR(page))
@@ -1088,7 +1090,7 @@ set_status:
1088 err = 0; 1090 err = 0;
1089 if (!list_empty(&pagelist)) { 1091 if (!list_empty(&pagelist)) {
1090 err = migrate_pages(&pagelist, new_page_node, 1092 err = migrate_pages(&pagelist, new_page_node,
1091 (unsigned long)pm, 0); 1093 (unsigned long)pm, 0, true);
1092 if (err) 1094 if (err)
1093 putback_lru_pages(&pagelist); 1095 putback_lru_pages(&pagelist);
1094 } 1096 }
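migrate.c itself now splits transparent hugepages before migrating them, pins the anon_vma with page_lock_anon_vma()/get_anon_vma() instead of rcu_read_lock(), and threads a sync flag through for asynchronous compaction. The per-page userspace interface is move_pages(2); with nodes == NULL it only reports which node each page sits on, while a real node array is what ends up in unmap_and_move(). Hedged sketch, link with -lnuma:

/* Ask the kernel which NUMA node one of our pages is on via move_pages().
 * Pass a node array instead of NULL to actually migrate. */
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	void *pages[1];
	int status[1] = { -1 };
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 1, 4096);	/* fault it in so a physical page exists */
	pages[0] = p;

	if (move_pages(0 /* self */, 1, pages, NULL, status, MPOL_MF_MOVE))
		perror("move_pages");
	else
		printf("page at %p is on node %d\n", (void *)p, status[0]);
	return 0;
}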
diff --git a/mm/mincore.c b/mm/mincore.c
index 9ac42dc6d7b6..a4e6b9d75c76 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
154 pmd = pmd_offset(pud, addr); 154 pmd = pmd_offset(pud, addr);
155 do { 155 do {
156 next = pmd_addr_end(addr, end); 156 next = pmd_addr_end(addr, end);
157 if (pmd_trans_huge(*pmd)) {
158 if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
159 vec += (next - addr) >> PAGE_SHIFT;
160 continue;
161 }
162 /* fall through */
163 }
157 if (pmd_none_or_clear_bad(pmd)) 164 if (pmd_none_or_clear_bad(pmd))
158 mincore_unmapped_range(vma, addr, next, vec); 165 mincore_unmapped_range(vma, addr, next, vec);
159 else 166 else
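
The vec pointer advanced by the new huge-pmd branch is the per-page residency vector that mincore(2) returns to user space, one byte per PAGE_SIZE page, which is why a mapped huge pmd accounts for (next - addr) >> PAGE_SHIFT entries in one step. For reference, a minimal user-space use of the interface (not part of the patch; error handling kept to a minimum):

	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <sys/mman.h>

	int main(void)
	{
		long page = sysconf(_SC_PAGESIZE);
		size_t len = 4UL << 20;			/* 4 MiB region */
		unsigned char *vec = malloc(len / page);
		char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (!vec || buf == MAP_FAILED)
			return 1;

		buf[0] = 1;				/* fault in the first page */
		if (mincore(buf, len, vec) == 0)
			printf("page 0 resident: %d\n", vec[0] & 1);

		munmap(buf, len);
		free(vec);
		return 0;
	}
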
diff --git a/mm/mlock.c b/mm/mlock.c
index b70919ce4f72..c3924c7f00be 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -155,13 +155,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
155 * vma->vm_mm->mmap_sem must be held for at least read. 155 * vma->vm_mm->mmap_sem must be held for at least read.
156 */ 156 */
157static long __mlock_vma_pages_range(struct vm_area_struct *vma, 157static long __mlock_vma_pages_range(struct vm_area_struct *vma,
158 unsigned long start, unsigned long end) 158 unsigned long start, unsigned long end,
159 int *nonblocking)
159{ 160{
160 struct mm_struct *mm = vma->vm_mm; 161 struct mm_struct *mm = vma->vm_mm;
161 unsigned long addr = start; 162 unsigned long addr = start;
162 struct page *pages[16]; /* 16 gives a reasonable batch */
163 int nr_pages = (end - start) / PAGE_SIZE; 163 int nr_pages = (end - start) / PAGE_SIZE;
164 int ret = 0;
165 int gup_flags; 164 int gup_flags;
166 165
167 VM_BUG_ON(start & ~PAGE_MASK); 166 VM_BUG_ON(start & ~PAGE_MASK);
@@ -170,73 +169,33 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
170 VM_BUG_ON(end > vma->vm_end); 169 VM_BUG_ON(end > vma->vm_end);
171 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 170 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
172 171
173 gup_flags = FOLL_TOUCH | FOLL_GET; 172 gup_flags = FOLL_TOUCH;
174 if (vma->vm_flags & VM_WRITE) 173 /*
174 * We want to touch writable mappings with a write fault in order
175 * to break COW, except for shared mappings because these don't COW
176 * and we would not want to dirty them for nothing.
177 */
178 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
175 gup_flags |= FOLL_WRITE; 179 gup_flags |= FOLL_WRITE;
176 180
181 /*
182 * We want mlock to succeed for regions that have any permissions
183 * other than PROT_NONE.
184 */
185 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
186 gup_flags |= FOLL_FORCE;
187
188 if (vma->vm_flags & VM_LOCKED)
189 gup_flags |= FOLL_MLOCK;
190
177 /* We don't try to access the guard page of a stack vma */ 191 /* We don't try to access the guard page of a stack vma */
178 if (stack_guard_page(vma, start)) { 192 if (stack_guard_page(vma, start)) {
179 addr += PAGE_SIZE; 193 addr += PAGE_SIZE;
180 nr_pages--; 194 nr_pages--;
181 } 195 }
182 196
183 while (nr_pages > 0) { 197 return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
184 int i; 198 NULL, NULL, nonblocking);
185
186 cond_resched();
187
188 /*
189 * get_user_pages makes pages present if we are
190 * setting mlock. and this extra reference count will
191 * disable migration of this page. However, page may
192 * still be truncated out from under us.
193 */
194 ret = __get_user_pages(current, mm, addr,
195 min_t(int, nr_pages, ARRAY_SIZE(pages)),
196 gup_flags, pages, NULL);
197 /*
198 * This can happen for, e.g., VM_NONLINEAR regions before
199 * a page has been allocated and mapped at a given offset,
200 * or for addresses that map beyond end of a file.
201 * We'll mlock the pages if/when they get faulted in.
202 */
203 if (ret < 0)
204 break;
205
206 lru_add_drain(); /* push cached pages to LRU */
207
208 for (i = 0; i < ret; i++) {
209 struct page *page = pages[i];
210
211 if (page->mapping) {
212 /*
213 * That preliminary check is mainly to avoid
214 * the pointless overhead of lock_page on the
215 * ZERO_PAGE: which might bounce very badly if
216 * there is contention. However, we're still
217 * dirtying its cacheline with get/put_page:
218 * we'll add another __get_user_pages flag to
219 * avoid it if that case turns out to matter.
220 */
221 lock_page(page);
222 /*
223 * Because we lock page here and migration is
224 * blocked by the elevated reference, we need
225 * only check for file-cache page truncation.
226 */
227 if (page->mapping)
228 mlock_vma_page(page);
229 unlock_page(page);
230 }
231 put_page(page); /* ref from get_user_pages() */
232 }
233
234 addr += ret * PAGE_SIZE;
235 nr_pages -= ret;
236 ret = 0;
237 }
238
239 return ret; /* 0 or negative error code */
240} 199}
241 200
242/* 201/*
@@ -280,7 +239,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
280 is_vm_hugetlb_page(vma) || 239 is_vm_hugetlb_page(vma) ||
281 vma == get_gate_vma(current))) { 240 vma == get_gate_vma(current))) {
282 241
283 __mlock_vma_pages_range(vma, start, end); 242 __mlock_vma_pages_range(vma, start, end, NULL);
284 243
285 /* Hide errors from mmap() and other callers */ 244 /* Hide errors from mmap() and other callers */
286 return 0; 245 return 0;
@@ -372,18 +331,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
372 int ret = 0; 331 int ret = 0;
373 int lock = newflags & VM_LOCKED; 332 int lock = newflags & VM_LOCKED;
374 333
375 if (newflags == vma->vm_flags || 334 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
376 (vma->vm_flags & (VM_IO | VM_PFNMAP))) 335 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))
377 goto out; /* don't set VM_LOCKED, don't count */ 336 goto out; /* don't set VM_LOCKED, don't count */
378 337
379 if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
380 is_vm_hugetlb_page(vma) ||
381 vma == get_gate_vma(current)) {
382 if (lock)
383 make_pages_present(start, end);
384 goto out; /* don't set VM_LOCKED, don't count */
385 }
386
387 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 338 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
388 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, 339 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
389 vma->vm_file, pgoff, vma_policy(vma)); 340 vma->vm_file, pgoff, vma_policy(vma));
@@ -419,14 +370,10 @@ success:
419 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 370 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
420 */ 371 */
421 372
422 if (lock) { 373 if (lock)
423 vma->vm_flags = newflags; 374 vma->vm_flags = newflags;
424 ret = __mlock_vma_pages_range(vma, start, end); 375 else
425 if (ret < 0)
426 ret = __mlock_posix_error_return(ret);
427 } else {
428 munlock_vma_pages_range(vma, start, end); 376 munlock_vma_pages_range(vma, start, end);
429 }
430 377
431out: 378out:
432 *prev = vma; 379 *prev = vma;
@@ -439,7 +386,8 @@ static int do_mlock(unsigned long start, size_t len, int on)
439 struct vm_area_struct * vma, * prev; 386 struct vm_area_struct * vma, * prev;
440 int error; 387 int error;
441 388
442 len = PAGE_ALIGN(len); 389 VM_BUG_ON(start & ~PAGE_MASK);
390 VM_BUG_ON(len != PAGE_ALIGN(len));
443 end = start + len; 391 end = start + len;
444 if (end < start) 392 if (end < start)
445 return -EINVAL; 393 return -EINVAL;
@@ -482,6 +430,62 @@ static int do_mlock(unsigned long start, size_t len, int on)
482 return error; 430 return error;
483} 431}
484 432
433static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
434{
435 struct mm_struct *mm = current->mm;
436 unsigned long end, nstart, nend;
437 struct vm_area_struct *vma = NULL;
438 int locked = 0;
439 int ret = 0;
440
441 VM_BUG_ON(start & ~PAGE_MASK);
442 VM_BUG_ON(len != PAGE_ALIGN(len));
443 end = start + len;
444
445 for (nstart = start; nstart < end; nstart = nend) {
446 /*
447 * We want to fault in pages for [nstart; end) address range.
448 * Find first corresponding VMA.
449 */
450 if (!locked) {
451 locked = 1;
452 down_read(&mm->mmap_sem);
453 vma = find_vma(mm, nstart);
454 } else if (nstart >= vma->vm_end)
455 vma = vma->vm_next;
456 if (!vma || vma->vm_start >= end)
457 break;
458 /*
459 * Set [nstart; nend) to intersection of desired address
460 * range with the first VMA. Also, skip undesirable VMA types.
461 */
462 nend = min(end, vma->vm_end);
463 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
464 continue;
465 if (nstart < vma->vm_start)
466 nstart = vma->vm_start;
467 /*
468 * Now fault in a range of pages. __mlock_vma_pages_range()
469 * double checks the vma flags, so that it won't mlock pages
470 * if the vma was already munlocked.
471 */
472 ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
473 if (ret < 0) {
474 if (ignore_errors) {
475 ret = 0;
476 continue; /* continue at next VMA */
477 }
478 ret = __mlock_posix_error_return(ret);
479 break;
480 }
481 nend = nstart + ret * PAGE_SIZE;
482 ret = 0;
483 }
484 if (locked)
485 up_read(&mm->mmap_sem);
486 return ret; /* 0 or negative error code */
487}
488
485SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) 489SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
486{ 490{
487 unsigned long locked; 491 unsigned long locked;
@@ -507,6 +511,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
507 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 511 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
508 error = do_mlock(start, len, 1); 512 error = do_mlock(start, len, 1);
509 up_write(&current->mm->mmap_sem); 513 up_write(&current->mm->mmap_sem);
514 if (!error)
515 error = do_mlock_pages(start, len, 0);
510 return error; 516 return error;
511} 517}
512 518
@@ -571,6 +577,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
571 capable(CAP_IPC_LOCK)) 577 capable(CAP_IPC_LOCK))
572 ret = do_mlockall(flags); 578 ret = do_mlockall(flags);
573 up_write(&current->mm->mmap_sem); 579 up_write(&current->mm->mmap_sem);
580 if (!ret && (flags & MCL_CURRENT)) {
581 /* Ignore errors */
582 do_mlock_pages(0, TASK_SIZE, 1);
583 }
574out: 584out:
575 return ret; 585 return ret;
576} 586}
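
With this change do_mlock() only marks the VMAs VM_LOCKED under mmap_sem held for write; the pages are then faulted in by the new do_mlock_pages() pass under mmap_sem held for read, and mlockall(MCL_CURRENT) simply ignores faulting errors. The user-visible interface is unchanged; a minimal usage sketch for reference (not part of the patch):

	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	static char key_material[4096];

	int main(void)
	{
		/* Marks the VMA VM_LOCKED, then populates it in a second pass;
		 * population errors are reported here, unlike with mlockall(). */
		if (mlock(key_material, sizeof(key_material)) != 0) {
			perror("mlock");
			return 1;
		}
		memset(key_material, 0x5a, sizeof(key_material));
		munlock(key_material, sizeof(key_material));
		return 0;
	}
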
diff --git a/mm/mmap.c b/mm/mmap.c
index b179abb1474a..2ec8eb5a9cdd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
29#include <linux/mmu_notifier.h> 29#include <linux/mmu_notifier.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/cacheflush.h> 35#include <asm/cacheflush.h>
@@ -253,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
253 down_write(&mm->mmap_sem); 254 down_write(&mm->mmap_sem);
254 255
255#ifdef CONFIG_COMPAT_BRK 256#ifdef CONFIG_COMPAT_BRK
256 min_brk = mm->end_code; 257 /*
258 * CONFIG_COMPAT_BRK can still be overridden by setting
259 * randomize_va_space to 2, which will still cause mm->start_brk
260 * to be arbitrarily shifted
261 */
262 if (mm->start_brk > PAGE_ALIGN(mm->end_data))
263 min_brk = mm->start_brk;
264 else
265 min_brk = mm->end_data;
257#else 266#else
258 min_brk = mm->start_brk; 267 min_brk = mm->start_brk;
259#endif 268#endif
@@ -588,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end);
588 } 597 }
589 } 598 }
590 599
600 vma_adjust_trans_huge(vma, start, end, adjust_next);
601
591 /* 602 /*
592 * When changing only vma->vm_end, we don't really need anon_vma 603 * When changing only vma->vm_end, we don't really need anon_vma
593 * lock. This is a fairly rare case by itself, but the anon_vma 604 * lock. This is a fairly rare case by itself, but the anon_vma
@@ -815,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
815 end, prev->vm_pgoff, NULL); 826 end, prev->vm_pgoff, NULL);
816 if (err) 827 if (err)
817 return NULL; 828 return NULL;
829 khugepaged_enter_vma_merge(prev);
818 return prev; 830 return prev;
819 } 831 }
820 832
@@ -833,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
833 next->vm_pgoff - pglen, NULL); 845 next->vm_pgoff - pglen, NULL);
834 if (err) 846 if (err)
835 return NULL; 847 return NULL;
848 khugepaged_enter_vma_merge(area);
836 return area; 849 return area;
837 } 850 }
838 851
@@ -1761,6 +1774,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1761 } 1774 }
1762 } 1775 }
1763 vma_unlock_anon_vma(vma); 1776 vma_unlock_anon_vma(vma);
1777 khugepaged_enter_vma_merge(vma);
1764 return error; 1778 return error;
1765} 1779}
1766#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1780#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1808,6 +1822,7 @@ static int expand_downwards(struct vm_area_struct *vma,
1808 } 1822 }
1809 } 1823 }
1810 vma_unlock_anon_vma(vma); 1824 vma_unlock_anon_vma(vma);
1825 khugepaged_enter_vma_merge(vma);
1811 return error; 1826 return error;
1812} 1827}
1813 1828
@@ -2462,6 +2477,7 @@ int install_special_mapping(struct mm_struct *mm,
2462 unsigned long addr, unsigned long len, 2477 unsigned long addr, unsigned long len,
2463 unsigned long vm_flags, struct page **pages) 2478 unsigned long vm_flags, struct page **pages)
2464{ 2479{
2480 int ret;
2465 struct vm_area_struct *vma; 2481 struct vm_area_struct *vma;
2466 2482
2467 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 2483 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
@@ -2479,16 +2495,23 @@ int install_special_mapping(struct mm_struct *mm,
2479 vma->vm_ops = &special_mapping_vmops; 2495 vma->vm_ops = &special_mapping_vmops;
2480 vma->vm_private_data = pages; 2496 vma->vm_private_data = pages;
2481 2497
2482 if (unlikely(insert_vm_struct(mm, vma))) { 2498 ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
2483 kmem_cache_free(vm_area_cachep, vma); 2499 if (ret)
2484 return -ENOMEM; 2500 goto out;
2485 } 2501
2502 ret = insert_vm_struct(mm, vma);
2503 if (ret)
2504 goto out;
2486 2505
2487 mm->total_vm += len >> PAGE_SHIFT; 2506 mm->total_vm += len >> PAGE_SHIFT;
2488 2507
2489 perf_event_mmap(vma); 2508 perf_event_mmap(vma);
2490 2509
2491 return 0; 2510 return 0;
2511
2512out:
2513 kmem_cache_free(vm_area_cachep, vma);
2514 return ret;
2492} 2515}
2493 2516
2494static DEFINE_MUTEX(mm_all_locks_mutex); 2517static DEFINE_MUTEX(mm_all_locks_mutex);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 438951d366f2..8d032de4088e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
100 return young; 100 return young;
101} 101}
102 102
103int __mmu_notifier_test_young(struct mm_struct *mm,
104 unsigned long address)
105{
106 struct mmu_notifier *mn;
107 struct hlist_node *n;
108 int young = 0;
109
110 rcu_read_lock();
111 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
112 if (mn->ops->test_young) {
113 young = mn->ops->test_young(mn, mm, address);
114 if (young)
115 break;
116 }
117 }
118 rcu_read_unlock();
119
120 return young;
121}
122
103void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, 123void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
104 pte_t pte) 124 pte_t pte)
105{ 125{
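
__mmu_notifier_test_young() lets a secondary MMU (KVM, for instance) report that a page has been referenced without clearing that state, the non-destructive counterpart of the existing clear_flush_young path. A driver opts in by filling the new callback in its mmu_notifier_ops; the sketch below is illustrative only, and the my_* names are placeholders rather than anything from this patch:

	static int my_test_young(struct mmu_notifier *mn, struct mm_struct *mm,
				 unsigned long address)
	{
		/* Report whether the secondary MMU referenced the page,
		 * without clearing the accessed state used by reclaim. */
		return my_secondary_mmu_referenced(mn, address);
	}

	static const struct mmu_notifier_ops my_notifier_ops = {
		.clear_flush_young	= my_clear_flush_young,
		.test_young		= my_test_young,
	};
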
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c855..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
87 return 1; 87 return 1;
88} 88}
89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
90
91#ifdef CONFIG_SMP
92/* Called when a more accurate view of NR_FREE_PAGES is needed */
93unsigned long zone_nr_free_pages(struct zone *zone)
94{
95 unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
96
97 /*
98 * While kswapd is awake, it is considered the zone is under some
99 * memory pressure. Under pressure, there is a risk that
100 * per-cpu-counter-drift will allow the min watermark to be breached
101 * potentially causing a live-lock. While kswapd is awake and
102 * free pages are low, get a better estimate for free pages
103 */
104 if (nr_free_pages < zone->percpu_drift_mark &&
105 !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
106 return zone_page_state_snapshot(zone, NR_FREE_PAGES);
107
108 return nr_free_pages;
109}
110#endif /* CONFIG_SMP */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4c5133873097..5a688a2756be 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
78 pte_unmap_unlock(pte - 1, ptl); 78 pte_unmap_unlock(pte - 1, ptl);
79} 79}
80 80
81static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, 81static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
82 unsigned long addr, unsigned long end, pgprot_t newprot, 82 unsigned long addr, unsigned long end, pgprot_t newprot,
83 int dirty_accountable) 83 int dirty_accountable)
84{ 84{
@@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
88 pmd = pmd_offset(pud, addr); 88 pmd = pmd_offset(pud, addr);
89 do { 89 do {
90 next = pmd_addr_end(addr, end); 90 next = pmd_addr_end(addr, end);
91 if (pmd_trans_huge(*pmd)) {
92 if (next - addr != HPAGE_PMD_SIZE)
93 split_huge_page_pmd(vma->vm_mm, pmd);
94 else if (change_huge_pmd(vma, pmd, addr, newprot))
95 continue;
96 /* fall through */
97 }
91 if (pmd_none_or_clear_bad(pmd)) 98 if (pmd_none_or_clear_bad(pmd))
92 continue; 99 continue;
93 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); 100 change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
101 dirty_accountable);
94 } while (pmd++, addr = next, addr != end); 102 } while (pmd++, addr = next, addr != end);
95} 103}
96 104
97static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, 105static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
98 unsigned long addr, unsigned long end, pgprot_t newprot, 106 unsigned long addr, unsigned long end, pgprot_t newprot,
99 int dirty_accountable) 107 int dirty_accountable)
100{ 108{
@@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
106 next = pud_addr_end(addr, end); 114 next = pud_addr_end(addr, end);
107 if (pud_none_or_clear_bad(pud)) 115 if (pud_none_or_clear_bad(pud))
108 continue; 116 continue;
109 change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); 117 change_pmd_range(vma, pud, addr, next, newprot,
118 dirty_accountable);
110 } while (pud++, addr = next, addr != end); 119 } while (pud++, addr = next, addr != end);
111} 120}
112 121
@@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma,
126 next = pgd_addr_end(addr, end); 135 next = pgd_addr_end(addr, end);
127 if (pgd_none_or_clear_bad(pgd)) 136 if (pgd_none_or_clear_bad(pgd))
128 continue; 137 continue;
129 change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); 138 change_pud_range(vma, pgd, addr, next, newprot,
139 dirty_accountable);
130 } while (pgd++, addr = next, addr != end); 140 } while (pgd++, addr = next, addr != end);
131 flush_tlb_range(vma, start, end); 141 flush_tlb_range(vma, start, end);
132} 142}
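
The change_pmd_range() hunk shows the shape pmd-level walkers take throughout this series (mincore, mremap and pagewalk get the same treatment): a transparent huge pmd is either handled in one step when the range covers it exactly, or split back to regular ptes so the existing per-pte loop still applies. Schematically, with handle_huge_pmd() standing in for change_huge_pmd(), mincore_huge_pmd() and friends:

	if (pmd_trans_huge(*pmd)) {
		if (next - addr != HPAGE_PMD_SIZE)
			/* partial coverage: demote to ptes, then fall through */
			split_huge_page_pmd(vma->vm_mm, pmd);
		else if (handle_huge_pmd(vma, pmd, addr, next))
			/* whole huge pmd handled in one step */
			continue;
		/* fall through to the per-pte path below */
	}
	if (pmd_none_or_clear_bad(pmd))
		continue;
	/* ... existing per-pte processing ... */
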
diff --git a/mm/mremap.c b/mm/mremap.c
index 563fbdd6293a..9925b6391b80 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
41 return NULL; 41 return NULL;
42 42
43 pmd = pmd_offset(pud, addr); 43 pmd = pmd_offset(pud, addr);
44 split_huge_page_pmd(mm, pmd);
44 if (pmd_none_or_clear_bad(pmd)) 45 if (pmd_none_or_clear_bad(pmd))
45 return NULL; 46 return NULL;
46 47
47 return pmd; 48 return pmd;
48} 49}
49 50
50static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) 51static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
52 unsigned long addr)
51{ 53{
52 pgd_t *pgd; 54 pgd_t *pgd;
53 pud_t *pud; 55 pud_t *pud;
@@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
62 if (!pmd) 64 if (!pmd)
63 return NULL; 65 return NULL;
64 66
65 if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) 67 VM_BUG_ON(pmd_trans_huge(*pmd));
68 if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
66 return NULL; 69 return NULL;
67 70
68 return pmd; 71 return pmd;
@@ -147,7 +150,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
147 old_pmd = get_old_pmd(vma->vm_mm, old_addr); 150 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
148 if (!old_pmd) 151 if (!old_pmd)
149 continue; 152 continue;
150 new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); 153 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
151 if (!new_pmd) 154 if (!new_pmd)
152 break; 155 break;
153 next = (new_addr + PMD_SIZE) & PMD_MASK; 156 next = (new_addr + PMD_SIZE) & PMD_MASK;
diff --git a/mm/nommu.c b/mm/nommu.c
index 3613517c7592..f59e1424d3db 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,7 +10,7 @@
10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> 10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> 11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> 12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
13 * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org> 13 * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/module.h>
@@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp)
127 127
128int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 128int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
129 unsigned long start, int nr_pages, unsigned int foll_flags, 129 unsigned long start, int nr_pages, unsigned int foll_flags,
130 struct page **pages, struct vm_area_struct **vmas) 130 struct page **pages, struct vm_area_struct **vmas,
131 int *retry)
131{ 132{
132 struct vm_area_struct *vma; 133 struct vm_area_struct *vma;
133 unsigned long vm_flags; 134 unsigned long vm_flags;
@@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
185 if (force) 186 if (force)
186 flags |= FOLL_FORCE; 187 flags |= FOLL_FORCE;
187 188
188 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 189 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
190 NULL);
189} 191}
190EXPORT_SYMBOL(get_user_pages); 192EXPORT_SYMBOL(get_user_pages);
191 193
@@ -328,6 +330,7 @@ void *vmalloc_node(unsigned long size, int node)
328{ 330{
329 return vmalloc(size); 331 return vmalloc(size);
330} 332}
333EXPORT_SYMBOL(vmalloc_node);
331 334
332/** 335/**
333 * vzalloc_node - allocate memory on a specific node with zero fill 336 * vzalloc_node - allocate memory on a specific node with zero fill
@@ -440,6 +443,31 @@ void __attribute__((weak)) vmalloc_sync_all(void)
440{ 443{
441} 444}
442 445
446/**
447 * alloc_vm_area - allocate a range of kernel address space
448 * @size: size of the area
449 *
450 * Returns: NULL on failure, vm_struct on success
451 *
452 * This function reserves a range of kernel address space, and
453 * allocates pagetables to map that range. No actual mappings
454 * are created. If the kernel address space is not shared
455 * between processes, it syncs the pagetable across all
456 * processes.
457 */
458struct vm_struct *alloc_vm_area(size_t size)
459{
460 BUG();
461 return NULL;
462}
463EXPORT_SYMBOL_GPL(alloc_vm_area);
464
465void free_vm_area(struct vm_struct *area)
466{
467 BUG();
468}
469EXPORT_SYMBOL_GPL(free_vm_area);
470
443int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 471int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
444 struct page *page) 472 struct page *page)
445{ 473{
@@ -1717,6 +1745,7 @@ void exit_mmap(struct mm_struct *mm)
1717 mm->mmap = vma->vm_next; 1745 mm->mmap = vma->vm_next;
1718 delete_vma_from_mm(vma); 1746 delete_vma_from_mm(vma);
1719 delete_vma(mm, vma); 1747 delete_vma(mm, vma);
1748 cond_resched();
1720 } 1749 }
1721 1750
1722 kleave(""); 1751 kleave("");
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b840afa89761..2cb01f6ec5d0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -404,15 +404,18 @@ unsigned long determine_dirtyable_memory(void)
404 * - vm.dirty_background_ratio or vm.dirty_background_bytes 404 * - vm.dirty_background_ratio or vm.dirty_background_bytes
405 * - vm.dirty_ratio or vm.dirty_bytes 405 * - vm.dirty_ratio or vm.dirty_bytes
406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and 406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
407 * runtime tasks. 407 * real-time tasks.
408 */ 408 */
409void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) 409void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
410{ 410{
411 unsigned long background; 411 unsigned long background;
412 unsigned long dirty; 412 unsigned long dirty;
413 unsigned long available_memory = determine_dirtyable_memory(); 413 unsigned long uninitialized_var(available_memory);
414 struct task_struct *tsk; 414 struct task_struct *tsk;
415 415
416 if (!vm_dirty_bytes || !dirty_background_bytes)
417 available_memory = determine_dirtyable_memory();
418
416 if (vm_dirty_bytes) 419 if (vm_dirty_bytes)
417 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 420 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
418 else 421 else
@@ -563,7 +566,7 @@ static void balance_dirty_pages(struct address_space *mapping,
563 break; /* We've done our duty */ 566 break; /* We've done our duty */
564 } 567 }
565 trace_wbc_balance_dirty_wait(&wbc, bdi); 568 trace_wbc_balance_dirty_wait(&wbc, bdi);
566 __set_current_state(TASK_INTERRUPTIBLE); 569 __set_current_state(TASK_UNINTERRUPTIBLE);
567 io_schedule_timeout(pause); 570 io_schedule_timeout(pause);
568 571
569 /* 572 /*
@@ -1103,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page);
1103int __set_page_dirty_no_writeback(struct page *page) 1106int __set_page_dirty_no_writeback(struct page *page)
1104{ 1107{
1105 if (!PageDirty(page)) 1108 if (!PageDirty(page))
1106 SetPageDirty(page); 1109 return !TestSetPageDirty(page);
1107 return 0; 1110 return 0;
1108} 1111}
1109 1112
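
The global_dirty_limits() change defers determine_dirtyable_memory() until it is known that at least one of the two limits is ratio-based, since the byte-based knobs do not need it; that is why available_memory can now carry the uninitialized_var() annotation. A worked user-space sketch of the arithmetic with made-up values (PAGE_SIZE and DIV_ROUND_UP are redefined here purely for the demo):

	#include <stdio.h>

	#define PAGE_SIZE 4096UL
	#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

	int main(void)
	{
		unsigned long vm_dirty_bytes = 0;		/* 0 means ratio mode */
		unsigned long vm_dirty_ratio = 20;		/* percent */
		unsigned long available_memory = 1000000;	/* pages, illustrative */
		unsigned long dirty;

		if (vm_dirty_bytes)
			dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
		else
			dirty = vm_dirty_ratio * available_memory / 100;

		printf("dirty limit: %lu pages\n", dirty);	/* prints 200000 */
		return 0;
	}
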
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 19413bfdef92..887ce3bd823d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -104,19 +104,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
104 * only be modified with pm_mutex held, unless the suspend/hibernate code is 104 * only be modified with pm_mutex held, unless the suspend/hibernate code is
105 * guaranteed not to run in parallel with that modification). 105 * guaranteed not to run in parallel with that modification).
106 */ 106 */
107void set_gfp_allowed_mask(gfp_t mask) 107
108static gfp_t saved_gfp_mask;
109
110void pm_restore_gfp_mask(void)
108{ 111{
109 WARN_ON(!mutex_is_locked(&pm_mutex)); 112 WARN_ON(!mutex_is_locked(&pm_mutex));
110 gfp_allowed_mask = mask; 113 if (saved_gfp_mask) {
114 gfp_allowed_mask = saved_gfp_mask;
115 saved_gfp_mask = 0;
116 }
111} 117}
112 118
113gfp_t clear_gfp_allowed_mask(gfp_t mask) 119void pm_restrict_gfp_mask(void)
114{ 120{
115 gfp_t ret = gfp_allowed_mask;
116
117 WARN_ON(!mutex_is_locked(&pm_mutex)); 121 WARN_ON(!mutex_is_locked(&pm_mutex));
118 gfp_allowed_mask &= ~mask; 122 WARN_ON(saved_gfp_mask);
119 return ret; 123 saved_gfp_mask = gfp_allowed_mask;
124 gfp_allowed_mask &= ~GFP_IOFS;
120} 125}
121#endif /* CONFIG_PM_SLEEP */ 126#endif /* CONFIG_PM_SLEEP */
122 127
@@ -352,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
352 } 357 }
353} 358}
354 359
360/* update __split_huge_page_refcount if you change this function */
355static int destroy_compound_page(struct page *page, unsigned long order) 361static int destroy_compound_page(struct page *page, unsigned long order)
356{ 362{
357 int i; 363 int i;
@@ -421,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
421 * 427 *
422 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 428 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
423 */ 429 */
424static inline struct page *
425__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
426{
427 unsigned long buddy_idx = page_idx ^ (1 << order);
428
429 return page + (buddy_idx - page_idx);
430}
431
432static inline unsigned long 430static inline unsigned long
433__find_combined_index(unsigned long page_idx, unsigned int order) 431__find_buddy_index(unsigned long page_idx, unsigned int order)
434{ 432{
435 return (page_idx & ~(1 << order)); 433 return page_idx ^ (1 << order);
436} 434}
437 435
438/* 436/*
@@ -443,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
443 * (c) a page and its buddy have the same order && 441 * (c) a page and its buddy have the same order &&
444 * (d) a page and its buddy are in the same zone. 442 * (d) a page and its buddy are in the same zone.
445 * 443 *
446 * For recording whether a page is in the buddy system, we use PG_buddy. 444 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
447 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 445 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
448 * 446 *
449 * For recording page's order, we use page_private(page). 447 * For recording page's order, we use page_private(page).
450 */ 448 */
@@ -477,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
477 * as necessary, plus some accounting needed to play nicely with other 475 * as necessary, plus some accounting needed to play nicely with other
478 * parts of the VM system. 476 * parts of the VM system.
479 * At each level, we keep a list of pages, which are heads of continuous 477 * At each level, we keep a list of pages, which are heads of continuous
480 * free pages of length of (1 << order) and marked with PG_buddy. Page's 478 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
481 * order is recorded in page_private(page) field. 479 * order is recorded in page_private(page) field.
482 * So when we are allocating or freeing one, we can derive the state of the 480 * So when we are allocating or freeing one, we can derive the state of the
483 * other. That is, if we allocate a small block, and both were 481 * other. That is, if we allocate a small block, and both were
@@ -494,6 +492,7 @@ static inline void __free_one_page(struct page *page,
494{ 492{
495 unsigned long page_idx; 493 unsigned long page_idx;
496 unsigned long combined_idx; 494 unsigned long combined_idx;
495 unsigned long uninitialized_var(buddy_idx);
497 struct page *buddy; 496 struct page *buddy;
498 497
499 if (unlikely(PageCompound(page))) 498 if (unlikely(PageCompound(page)))
@@ -508,7 +507,8 @@ static inline void __free_one_page(struct page *page,
508 VM_BUG_ON(bad_range(zone, page)); 507 VM_BUG_ON(bad_range(zone, page));
509 508
510 while (order < MAX_ORDER-1) { 509 while (order < MAX_ORDER-1) {
511 buddy = __page_find_buddy(page, page_idx, order); 510 buddy_idx = __find_buddy_index(page_idx, order);
511 buddy = page + (buddy_idx - page_idx);
512 if (!page_is_buddy(page, buddy, order)) 512 if (!page_is_buddy(page, buddy, order))
513 break; 513 break;
514 514
@@ -516,7 +516,7 @@ static inline void __free_one_page(struct page *page,
516 list_del(&buddy->lru); 516 list_del(&buddy->lru);
517 zone->free_area[order].nr_free--; 517 zone->free_area[order].nr_free--;
518 rmv_page_order(buddy); 518 rmv_page_order(buddy);
519 combined_idx = __find_combined_index(page_idx, order); 519 combined_idx = buddy_idx & page_idx;
520 page = page + (combined_idx - page_idx); 520 page = page + (combined_idx - page_idx);
521 page_idx = combined_idx; 521 page_idx = combined_idx;
522 order++; 522 order++;
@@ -533,9 +533,10 @@ static inline void __free_one_page(struct page *page,
533 */ 533 */
534 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 534 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
535 struct page *higher_page, *higher_buddy; 535 struct page *higher_page, *higher_buddy;
536 combined_idx = __find_combined_index(page_idx, order); 536 combined_idx = buddy_idx & page_idx;
537 higher_page = page + combined_idx - page_idx; 537 higher_page = page + (combined_idx - page_idx);
538 higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); 538 buddy_idx = __find_buddy_index(combined_idx, order + 1);
539 higher_buddy = page + (buddy_idx - combined_idx);
539 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 540 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
540 list_add_tail(&page->lru, 541 list_add_tail(&page->lru,
541 &zone->free_area[order].free_list[migratetype]); 542 &zone->free_area[order].free_list[migratetype]);
@@ -646,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
646 trace_mm_page_free_direct(page, order); 647 trace_mm_page_free_direct(page, order);
647 kmemcheck_free_shadow(page, order); 648 kmemcheck_free_shadow(page, order);
648 649
649 for (i = 0; i < (1 << order); i++) { 650 if (PageAnon(page))
650 struct page *pg = page + i; 651 page->mapping = NULL;
651 652 for (i = 0; i < (1 << order); i++)
652 if (PageAnon(pg)) 653 bad += free_pages_check(page + i);
653 pg->mapping = NULL;
654 bad += free_pages_check(pg);
655 }
656 if (bad) 654 if (bad)
657 return false; 655 return false;
658 656
@@ -1090,8 +1088,10 @@ static void drain_pages(unsigned int cpu)
1090 pset = per_cpu_ptr(zone->pageset, cpu); 1088 pset = per_cpu_ptr(zone->pageset, cpu);
1091 1089
1092 pcp = &pset->pcp; 1090 pcp = &pset->pcp;
1093 free_pcppages_bulk(zone, pcp->count, pcp); 1091 if (pcp->count) {
1094 pcp->count = 0; 1092 free_pcppages_bulk(zone, pcp->count, pcp);
1093 pcp->count = 0;
1094 }
1095 local_irq_restore(flags); 1095 local_irq_restore(flags);
1096 } 1096 }
1097} 1097}
@@ -1455,24 +1455,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1455#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1455#endif /* CONFIG_FAIL_PAGE_ALLOC */
1456 1456
1457/* 1457/*
1458 * Return 1 if free pages are above 'mark'. This takes into account the order 1458 * Return true if free pages are above 'mark'. This takes into account the order
1459 * of the allocation. 1459 * of the allocation.
1460 */ 1460 */
1461int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1461static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1462 int classzone_idx, int alloc_flags) 1462 int classzone_idx, int alloc_flags, long free_pages)
1463{ 1463{
 1464 /* free_pages may go negative - that's OK */ 1464 /* free_pages may go negative - that's OK */
1465 long min = mark; 1465 long min = mark;
1466 long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
1467 int o; 1466 int o;
1468 1467
1468 free_pages -= (1 << order) + 1;
1469 if (alloc_flags & ALLOC_HIGH) 1469 if (alloc_flags & ALLOC_HIGH)
1470 min -= min / 2; 1470 min -= min / 2;
1471 if (alloc_flags & ALLOC_HARDER) 1471 if (alloc_flags & ALLOC_HARDER)
1472 min -= min / 4; 1472 min -= min / 4;
1473 1473
1474 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1474 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1475 return 0; 1475 return false;
1476 for (o = 0; o < order; o++) { 1476 for (o = 0; o < order; o++) {
1477 /* At the next order, this order's pages become unavailable */ 1477 /* At the next order, this order's pages become unavailable */
1478 free_pages -= z->free_area[o].nr_free << o; 1478 free_pages -= z->free_area[o].nr_free << o;
@@ -1481,9 +1481,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1481 min >>= 1; 1481 min >>= 1;
1482 1482
1483 if (free_pages <= min) 1483 if (free_pages <= min)
1484 return 0; 1484 return false;
1485 } 1485 }
1486 return 1; 1486 return true;
1487}
1488
1489bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1490 int classzone_idx, int alloc_flags)
1491{
1492 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1493 zone_page_state(z, NR_FREE_PAGES));
1494}
1495
1496bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1497 int classzone_idx, int alloc_flags)
1498{
1499 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1500
1501 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1502 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1503
1504 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1505 free_pages);
1487} 1506}
1488 1507
1489#ifdef CONFIG_NUMA 1508#ifdef CONFIG_NUMA
@@ -1788,15 +1807,18 @@ static struct page *
1788__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1807__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1789 struct zonelist *zonelist, enum zone_type high_zoneidx, 1808 struct zonelist *zonelist, enum zone_type high_zoneidx,
1790 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1809 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1791 int migratetype, unsigned long *did_some_progress) 1810 int migratetype, unsigned long *did_some_progress,
1811 bool sync_migration)
1792{ 1812{
1793 struct page *page; 1813 struct page *page;
1794 1814
1795 if (!order || compaction_deferred(preferred_zone)) 1815 if (!order || compaction_deferred(preferred_zone))
1796 return NULL; 1816 return NULL;
1797 1817
1818 current->flags |= PF_MEMALLOC;
1798 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 1819 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1799 nodemask); 1820 nodemask, sync_migration);
1821 current->flags &= ~PF_MEMALLOC;
1800 if (*did_some_progress != COMPACT_SKIPPED) { 1822 if (*did_some_progress != COMPACT_SKIPPED) {
1801 1823
1802 /* Page migration frees to the PCP lists but we want merging */ 1824 /* Page migration frees to the PCP lists but we want merging */
@@ -1832,7 +1854,8 @@ static inline struct page *
1832__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1854__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1833 struct zonelist *zonelist, enum zone_type high_zoneidx, 1855 struct zonelist *zonelist, enum zone_type high_zoneidx,
1834 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1856 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1835 int migratetype, unsigned long *did_some_progress) 1857 int migratetype, unsigned long *did_some_progress,
1858 bool sync_migration)
1836{ 1859{
1837 return NULL; 1860 return NULL;
1838} 1861}
@@ -1847,23 +1870,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1847{ 1870{
1848 struct page *page = NULL; 1871 struct page *page = NULL;
1849 struct reclaim_state reclaim_state; 1872 struct reclaim_state reclaim_state;
1850 struct task_struct *p = current;
1851 bool drained = false; 1873 bool drained = false;
1852 1874
1853 cond_resched(); 1875 cond_resched();
1854 1876
1855 /* We now go into synchronous reclaim */ 1877 /* We now go into synchronous reclaim */
1856 cpuset_memory_pressure_bump(); 1878 cpuset_memory_pressure_bump();
1857 p->flags |= PF_MEMALLOC; 1879 current->flags |= PF_MEMALLOC;
1858 lockdep_set_current_reclaim_state(gfp_mask); 1880 lockdep_set_current_reclaim_state(gfp_mask);
1859 reclaim_state.reclaimed_slab = 0; 1881 reclaim_state.reclaimed_slab = 0;
1860 p->reclaim_state = &reclaim_state; 1882 current->reclaim_state = &reclaim_state;
1861 1883
1862 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 1884 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1863 1885
1864 p->reclaim_state = NULL; 1886 current->reclaim_state = NULL;
1865 lockdep_clear_current_reclaim_state(); 1887 lockdep_clear_current_reclaim_state();
1866 p->flags &= ~PF_MEMALLOC; 1888 current->flags &= ~PF_MEMALLOC;
1867 1889
1868 cond_resched(); 1890 cond_resched();
1869 1891
@@ -1915,19 +1937,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1915 1937
1916static inline 1938static inline
1917void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 1939void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1918 enum zone_type high_zoneidx) 1940 enum zone_type high_zoneidx,
1941 enum zone_type classzone_idx)
1919{ 1942{
1920 struct zoneref *z; 1943 struct zoneref *z;
1921 struct zone *zone; 1944 struct zone *zone;
1922 1945
1923 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1946 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1924 wakeup_kswapd(zone, order); 1947 wakeup_kswapd(zone, order, classzone_idx);
1925} 1948}
1926 1949
1927static inline int 1950static inline int
1928gfp_to_alloc_flags(gfp_t gfp_mask) 1951gfp_to_alloc_flags(gfp_t gfp_mask)
1929{ 1952{
1930 struct task_struct *p = current;
1931 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 1953 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1932 const gfp_t wait = gfp_mask & __GFP_WAIT; 1954 const gfp_t wait = gfp_mask & __GFP_WAIT;
1933 1955
@@ -1943,18 +1965,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1943 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 1965 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1944 1966
1945 if (!wait) { 1967 if (!wait) {
1946 alloc_flags |= ALLOC_HARDER; 1968 /*
1969 * Not worth trying to allocate harder for
1970 * __GFP_NOMEMALLOC even if it can't schedule.
1971 */
1972 if (!(gfp_mask & __GFP_NOMEMALLOC))
1973 alloc_flags |= ALLOC_HARDER;
1947 /* 1974 /*
1948 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1975 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1949 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1976 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1950 */ 1977 */
1951 alloc_flags &= ~ALLOC_CPUSET; 1978 alloc_flags &= ~ALLOC_CPUSET;
1952 } else if (unlikely(rt_task(p)) && !in_interrupt()) 1979 } else if (unlikely(rt_task(current)) && !in_interrupt())
1953 alloc_flags |= ALLOC_HARDER; 1980 alloc_flags |= ALLOC_HARDER;
1954 1981
1955 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 1982 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1956 if (!in_interrupt() && 1983 if (!in_interrupt() &&
1957 ((p->flags & PF_MEMALLOC) || 1984 ((current->flags & PF_MEMALLOC) ||
1958 unlikely(test_thread_flag(TIF_MEMDIE)))) 1985 unlikely(test_thread_flag(TIF_MEMDIE))))
1959 alloc_flags |= ALLOC_NO_WATERMARKS; 1986 alloc_flags |= ALLOC_NO_WATERMARKS;
1960 } 1987 }
@@ -1973,7 +2000,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1973 int alloc_flags; 2000 int alloc_flags;
1974 unsigned long pages_reclaimed = 0; 2001 unsigned long pages_reclaimed = 0;
1975 unsigned long did_some_progress; 2002 unsigned long did_some_progress;
1976 struct task_struct *p = current; 2003 bool sync_migration = false;
1977 2004
1978 /* 2005 /*
1979 * In the slowpath, we sanity check order to avoid ever trying to 2006 * In the slowpath, we sanity check order to avoid ever trying to
@@ -1998,7 +2025,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1998 goto nopage; 2025 goto nopage;
1999 2026
2000restart: 2027restart:
2001 wake_all_kswapd(order, zonelist, high_zoneidx); 2028 if (!(gfp_mask & __GFP_NO_KSWAPD))
2029 wake_all_kswapd(order, zonelist, high_zoneidx,
2030 zone_idx(preferred_zone));
2002 2031
2003 /* 2032 /*
2004 * OK, we're below the kswapd watermark and have kicked background 2033 * OK, we're below the kswapd watermark and have kicked background
@@ -2007,6 +2036,14 @@ restart:
2007 */ 2036 */
2008 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2037 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2009 2038
2039 /*
2040 * Find the true preferred zone if the allocation is unconstrained by
2041 * cpusets.
2042 */
2043 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2044 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2045 &preferred_zone);
2046
2010 /* This is the last chance, in general, before the goto nopage. */ 2047 /* This is the last chance, in general, before the goto nopage. */
2011 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2048 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2012 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2049 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2029,21 +2066,26 @@ rebalance:
2029 goto nopage; 2066 goto nopage;
2030 2067
2031 /* Avoid recursion of direct reclaim */ 2068 /* Avoid recursion of direct reclaim */
2032 if (p->flags & PF_MEMALLOC) 2069 if (current->flags & PF_MEMALLOC)
2033 goto nopage; 2070 goto nopage;
2034 2071
2035 /* Avoid allocations with no watermarks from looping endlessly */ 2072 /* Avoid allocations with no watermarks from looping endlessly */
2036 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2073 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2037 goto nopage; 2074 goto nopage;
2038 2075
2039 /* Try direct compaction */ 2076 /*
2077 * Try direct compaction. The first pass is asynchronous. Subsequent
2078 * attempts after direct reclaim are synchronous
2079 */
2040 page = __alloc_pages_direct_compact(gfp_mask, order, 2080 page = __alloc_pages_direct_compact(gfp_mask, order,
2041 zonelist, high_zoneidx, 2081 zonelist, high_zoneidx,
2042 nodemask, 2082 nodemask,
2043 alloc_flags, preferred_zone, 2083 alloc_flags, preferred_zone,
2044 migratetype, &did_some_progress); 2084 migratetype, &did_some_progress,
2085 sync_migration);
2045 if (page) 2086 if (page)
2046 goto got_pg; 2087 goto got_pg;
2088 sync_migration = true;
2047 2089
2048 /* Try direct reclaim and then allocating */ 2090 /* Try direct reclaim and then allocating */
2049 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2091 page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2097,13 +2139,27 @@ rebalance:
2097 /* Wait for some write requests to complete then retry */ 2139 /* Wait for some write requests to complete then retry */
2098 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2140 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2099 goto rebalance; 2141 goto rebalance;
2142 } else {
2143 /*
2144 * High-order allocations do not necessarily loop after
2145 * direct reclaim and reclaim/compaction depends on compaction
2146 * being called after reclaim so call directly if necessary
2147 */
2148 page = __alloc_pages_direct_compact(gfp_mask, order,
2149 zonelist, high_zoneidx,
2150 nodemask,
2151 alloc_flags, preferred_zone,
2152 migratetype, &did_some_progress,
2153 sync_migration);
2154 if (page)
2155 goto got_pg;
2100 } 2156 }
2101 2157
2102nopage: 2158nopage:
2103 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 2159 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
2104 printk(KERN_WARNING "%s: page allocation failure." 2160 printk(KERN_WARNING "%s: page allocation failure."
2105 " order:%d, mode:0x%x\n", 2161 " order:%d, mode:0x%x\n",
2106 p->comm, order, gfp_mask); 2162 current->comm, order, gfp_mask);
2107 dump_stack(); 2163 dump_stack();
2108 show_mem(); 2164 show_mem();
2109 } 2165 }
@@ -2146,7 +2202,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2146 2202
2147 get_mems_allowed(); 2203 get_mems_allowed();
2148 /* The preferred zone is used for statistics later */ 2204 /* The preferred zone is used for statistics later */
2149 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); 2205 first_zones_zonelist(zonelist, high_zoneidx,
2206 nodemask ? : &cpuset_current_mems_allowed,
2207 &preferred_zone);
2150 if (!preferred_zone) { 2208 if (!preferred_zone) {
2151 put_mems_allowed(); 2209 put_mems_allowed();
2152 return NULL; 2210 return NULL;
@@ -2437,7 +2495,7 @@ void show_free_areas(void)
2437 " all_unreclaimable? %s" 2495 " all_unreclaimable? %s"
2438 "\n", 2496 "\n",
2439 zone->name, 2497 zone->name,
2440 K(zone_nr_free_pages(zone)), 2498 K(zone_page_state(zone, NR_FREE_PAGES)),
2441 K(min_wmark_pages(zone)), 2499 K(min_wmark_pages(zone)),
2442 K(low_wmark_pages(zone)), 2500 K(low_wmark_pages(zone)),
2443 K(high_wmark_pages(zone)), 2501 K(high_wmark_pages(zone)),
@@ -2580,9 +2638,16 @@ static int __parse_numa_zonelist_order(char *s)
2580 2638
2581static __init int setup_numa_zonelist_order(char *s) 2639static __init int setup_numa_zonelist_order(char *s)
2582{ 2640{
2583 if (s) 2641 int ret;
2584 return __parse_numa_zonelist_order(s); 2642
2585 return 0; 2643 if (!s)
2644 return 0;
2645
2646 ret = __parse_numa_zonelist_order(s);
2647 if (ret == 0)
2648 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
2649
2650 return ret;
2586} 2651}
2587early_param("numa_zonelist_order", setup_numa_zonelist_order); 2652early_param("numa_zonelist_order", setup_numa_zonelist_order);
2588 2653
@@ -3008,14 +3073,6 @@ static __init_refok int __build_all_zonelists(void *data)
3008 build_zonelist_cache(pgdat); 3073 build_zonelist_cache(pgdat);
3009 } 3074 }
3010 3075
3011#ifdef CONFIG_MEMORY_HOTPLUG
3012 /* Setup real pagesets for the new zone */
3013 if (data) {
3014 struct zone *zone = data;
3015 setup_zone_pageset(zone);
3016 }
3017#endif
3018
3019 /* 3076 /*
3020 * Initialize the boot_pagesets that are going to be used 3077 * Initialize the boot_pagesets that are going to be used
3021 * for bootstrapping processors. The real pagesets for 3078 * for bootstrapping processors. The real pagesets for
@@ -3064,7 +3121,11 @@ void build_all_zonelists(void *data)
3064 } else { 3121 } else {
3065 /* we have to stop all cpus to guarantee there is no user 3122 /* we have to stop all cpus to guarantee there is no user
3066 of zonelist */ 3123 of zonelist */
3067 stop_machine(__build_all_zonelists, data, NULL); 3124#ifdef CONFIG_MEMORY_HOTPLUG
3125 if (data)
3126 setup_zone_pageset((struct zone *)data);
3127#endif
3128 stop_machine(__build_all_zonelists, NULL, NULL);
3068 /* cpuset refresh routine should be here */ 3129 /* cpuset refresh routine should be here */
3069 } 3130 }
3070 vm_total_pages = nr_free_pagecache_pages(); 3131 vm_total_pages = nr_free_pagecache_pages();
@@ -4045,7 +4106,7 @@ static void __init setup_usemap(struct pglist_data *pgdat,
4045 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 4106 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
4046} 4107}
4047#else 4108#else
4048static void inline setup_usemap(struct pglist_data *pgdat, 4109static inline void setup_usemap(struct pglist_data *pgdat,
4049 struct zone *zone, unsigned long zonesize) {} 4110 struct zone *zone, unsigned long zonesize) {}
4050#endif /* CONFIG_SPARSEMEM */ 4111#endif /* CONFIG_SPARSEMEM */
4051 4112
@@ -5548,7 +5609,6 @@ static struct trace_print_flags pageflag_names[] = {
5548 {1UL << PG_swapcache, "swapcache" }, 5609 {1UL << PG_swapcache, "swapcache" },
5549 {1UL << PG_mappedtodisk, "mappedtodisk" }, 5610 {1UL << PG_mappedtodisk, "mappedtodisk" },
5550 {1UL << PG_reclaim, "reclaim" }, 5611 {1UL << PG_reclaim, "reclaim" },
5551 {1UL << PG_buddy, "buddy" },
5552 {1UL << PG_swapbacked, "swapbacked" }, 5612 {1UL << PG_swapbacked, "swapbacked" },
5553 {1UL << PG_unevictable, "unevictable" }, 5613 {1UL << PG_unevictable, "unevictable" },
5554#ifdef CONFIG_MMU 5614#ifdef CONFIG_MMU
@@ -5596,7 +5656,7 @@ void dump_page(struct page *page)
5596{ 5656{
5597 printk(KERN_ALERT 5657 printk(KERN_ALERT
5598 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 5658 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5599 page, page_count(page), page_mapcount(page), 5659 page, atomic_read(&page->_count), page_mapcount(page),
5600 page->mapping, page->index); 5660 page->mapping, page->index);
5601 dump_page_flags(page->flags); 5661 dump_page_flags(page->flags);
5602} 5662}
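
The buddy hunks replace __page_find_buddy()/__find_combined_index() with inline index arithmetic: the buddy of a block at page_idx is found by flipping bit `order` of the index, and the merged block starts at the lower of the two indices, i.e. buddy_idx & page_idx. A tiny runnable demonstration with illustrative values:

	#include <stdio.h>

	int main(void)
	{
		unsigned long page_idx = 8, order = 3;		  /* 8-page block at index 8 */
		unsigned long buddy_idx = page_idx ^ (1UL << order);	/* 0  */
		unsigned long combined_idx = buddy_idx & page_idx;	/* 0  */

		printf("buddy of %lu at order %lu is %lu; merged block starts at %lu\n",
		       page_idx, order, buddy_idx, combined_idx);
		/* Repeating with order = 4 from index 0 gives buddy_idx = 16. */
		return 0;
	}
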
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 8b1a2ce21ee5..7cfa6ae02303 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
34 pmd = pmd_offset(pud, addr); 34 pmd = pmd_offset(pud, addr);
35 do { 35 do {
36 next = pmd_addr_end(addr, end); 36 next = pmd_addr_end(addr, end);
37 split_huge_page_pmd(walk->mm, pmd);
37 if (pmd_none_or_clear_bad(pmd)) { 38 if (pmd_none_or_clear_bad(pmd)) {
38 if (walk->pte_hole) 39 if (walk->pte_hole)
39 err = walk->pte_hole(addr, next, walk); 40 err = walk->pte_hole(addr, next, walk);
@@ -139,7 +140,6 @@ int walk_page_range(unsigned long addr, unsigned long end,
139 pgd_t *pgd; 140 pgd_t *pgd;
140 unsigned long next; 141 unsigned long next;
141 int err = 0; 142 int err = 0;
142 struct vm_area_struct *vma;
143 143
144 if (addr >= end) 144 if (addr >= end)
145 return err; 145 return err;
@@ -149,15 +149,17 @@ int walk_page_range(unsigned long addr, unsigned long end,
149 149
150 pgd = pgd_offset(walk->mm, addr); 150 pgd = pgd_offset(walk->mm, addr);
151 do { 151 do {
152 struct vm_area_struct *uninitialized_var(vma);
153
152 next = pgd_addr_end(addr, end); 154 next = pgd_addr_end(addr, end);
153 155
156#ifdef CONFIG_HUGETLB_PAGE
154 /* 157 /*
155 * handle hugetlb vma individually because pagetable walk for 158 * handle hugetlb vma individually because pagetable walk for
156 * the hugetlb page is dependent on the architecture and 159 * the hugetlb page is dependent on the architecture and
 157 * we can't handle it in the same manner as non-huge pages. 160 * we can't handle it in the same manner as non-huge pages.
158 */ 161 */
159 vma = find_vma(walk->mm, addr); 162 vma = find_vma(walk->mm, addr);
160#ifdef CONFIG_HUGETLB_PAGE
161 if (vma && is_vm_hugetlb_page(vma)) { 163 if (vma && is_vm_hugetlb_page(vma)) {
162 if (vma->vm_end < next) 164 if (vma->vm_end < next)
163 next = vma->vm_end; 165 next = vma->vm_end;
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 7d9c1d0ebd3f..ea534960a04b 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
421 return NULL; 421 return NULL;
422 422
423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, 423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
424 pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); 424 pcpu_nr_groups, pcpu_atom_size);
425 if (!vms) { 425 if (!vms) {
426 pcpu_free_chunk(chunk); 426 pcpu_free_chunk(chunk);
427 return NULL; 427 return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index efe816856a9d..3f930018aa60 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -258,7 +258,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
258 258
259/* 259/*
260 * (Un)populated page region iterators. Iterate over (un)populated 260 * (Un)populated page region iterators. Iterate over (un)populated
261 * page regions betwen @start and @end in @chunk. @rs and @re should 261 * page regions between @start and @end in @chunk. @rs and @re should
262 * be integer variables and will be set to start and end page index of 262 * be integer variables and will be set to start and end page index of
263 * the current region. 263 * the current region.
264 */ 264 */
@@ -293,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size)
293 293
294 if (size <= PAGE_SIZE) 294 if (size <= PAGE_SIZE)
295 return kzalloc(size, GFP_KERNEL); 295 return kzalloc(size, GFP_KERNEL);
296 else { 296 else
297 void *ptr = vmalloc(size); 297 return vzalloc(size);
298 if (ptr)
299 memset(ptr, 0, size);
300 return ptr;
301 }
302} 298}
303 299
304/** 300/**
@@ -1268,7 +1264,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1268 1264
1269 /* we're done parsing the input, undefine BUG macro and dump config */ 1265 /* we're done parsing the input, undefine BUG macro and dump config */
1270#undef PCPU_SETUP_BUG_ON 1266#undef PCPU_SETUP_BUG_ON
1271 pcpu_dump_alloc_info(KERN_INFO, ai); 1267 pcpu_dump_alloc_info(KERN_DEBUG, ai);
1272 1268
1273 pcpu_nr_groups = ai->nr_groups; 1269 pcpu_nr_groups = ai->nr_groups;
1274 pcpu_group_offsets = group_offsets; 1270 pcpu_group_offsets = group_offsets;
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
new file mode 100644
index 000000000000..eb663fb533e0
--- /dev/null
+++ b/mm/pgtable-generic.c
@@ -0,0 +1,121 @@
1/*
2 * mm/pgtable-generic.c
3 *
4 * Generic pgtable methods declared in asm-generic/pgtable.h
5 *
6 * Copyright (C) 2010 Linus Torvalds
7 */
8
9#include <linux/pagemap.h>
10#include <asm/tlb.h>
11#include <asm-generic/pgtable.h>
12
13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
14/*
15 * Only sets the access flags (dirty, accessed, and
16 * writable). Furthermore, we know it always gets set to a "more
17 * permissive" setting, which allows most architectures to optimize
18 * this. We return whether the PTE actually changed, which in turn
 19 * instructs the caller to do things like update_mmu_cache(). This
20 * used to be done in the caller, but sparc needs minor faults to
21 * force that call on sun4c so we changed this macro slightly
22 */
23int ptep_set_access_flags(struct vm_area_struct *vma,
24 unsigned long address, pte_t *ptep,
25 pte_t entry, int dirty)
26{
27 int changed = !pte_same(*ptep, entry);
28 if (changed) {
29 set_pte_at(vma->vm_mm, address, ptep, entry);
30 flush_tlb_page(vma, address);
31 }
32 return changed;
33}
34#endif
35
36#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
37int pmdp_set_access_flags(struct vm_area_struct *vma,
38 unsigned long address, pmd_t *pmdp,
39 pmd_t entry, int dirty)
40{
41#ifdef CONFIG_TRANSPARENT_HUGEPAGE
42 int changed = !pmd_same(*pmdp, entry);
43 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
44 if (changed) {
45 set_pmd_at(vma->vm_mm, address, pmdp, entry);
46 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
47 }
48 return changed;
49#else /* CONFIG_TRANSPARENT_HUGEPAGE */
50 BUG();
51 return 0;
52#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
53}
54#endif
55
56#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
57int ptep_clear_flush_young(struct vm_area_struct *vma,
58 unsigned long address, pte_t *ptep)
59{
60 int young;
61 young = ptep_test_and_clear_young(vma, address, ptep);
62 if (young)
63 flush_tlb_page(vma, address);
64 return young;
65}
66#endif
67
68#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
69int pmdp_clear_flush_young(struct vm_area_struct *vma,
70 unsigned long address, pmd_t *pmdp)
71{
72 int young;
73#ifndef CONFIG_TRANSPARENT_HUGEPAGE
74 BUG();
75#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
76 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
77 young = pmdp_test_and_clear_young(vma, address, pmdp);
78 if (young)
79 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
80 return young;
81}
82#endif
83
84#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
85pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
86 pte_t *ptep)
87{
88 pte_t pte;
89 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
90 flush_tlb_page(vma, address);
91 return pte;
92}
93#endif
94
95#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
96#ifdef CONFIG_TRANSPARENT_HUGEPAGE
97pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
98 pmd_t *pmdp)
99{
100 pmd_t pmd;
101 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
102 pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
103 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
104 return pmd;
105}
106#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
107#endif
108
109#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
110#ifdef CONFIG_TRANSPARENT_HUGEPAGE
111pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
112 pmd_t *pmdp)
113{
114 pmd_t pmd = pmd_mksplitting(*pmdp);
115 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
116 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
117 /* tlb flush only to serialize against gup-fast */
118 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
119}
120#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
121#endif
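Each helper in the new mm/pgtable-generic.c is guarded by #ifndef __HAVE_ARCH_<NAME>, so an architecture header that defines the macro (and ships its own routine) silently replaces the generic fallback above. Below is a standalone sketch of that override mechanism, using invented names (__HAVE_ARCH_DOUBLE_IT, double_it) rather than the real pgtable hooks.

#include <stdio.h>

/*
 * An "arch header" that wants its own version would do:
 *     #define __HAVE_ARCH_DOUBLE_IT
 *     static inline int double_it(int x) { return x << 1; }
 * and the generic fallback below would then not be compiled at all.
 */

#ifndef __HAVE_ARCH_DOUBLE_IT
static int double_it(int x)		/* generic, unoptimized fallback */
{
	return x + x;
}
#endif

int main(void)
{
	printf("%d\n", double_it(21));
	return 0;
}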
diff --git a/mm/rmap.c b/mm/rmap.c
index 1a8bf76bfd03..f21f4a1d6a1c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -94,7 +94,7 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
94 * anonymous pages mapped into it with that anon_vma. 94 * anonymous pages mapped into it with that anon_vma.
95 * 95 *
96 * The common case will be that we already have one, but if 96 * The common case will be that we already have one, but if
97 * if not we either need to find an adjacent mapping that we 97 * not we either need to find an adjacent mapping that we
98 * can re-use the anon_vma from (very common when the only 98 * can re-use the anon_vma from (very common when the only
99 * reason for splitting a vma has been mprotect()), or we 99 * reason for splitting a vma has been mprotect()), or we
100 * allocate a new one. 100 * allocate a new one.
@@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
177 list_add(&avc->same_vma, &vma->anon_vma_chain); 177 list_add(&avc->same_vma, &vma->anon_vma_chain);
178 178
179 anon_vma_lock(anon_vma); 179 anon_vma_lock(anon_vma);
180 /*
181 * It's critical to add new vmas to the tail of the anon_vma,
182 * see comment in huge_memory.c:__split_huge_page().
183 */
180 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 184 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
181 anon_vma_unlock(anon_vma); 185 anon_vma_unlock(anon_vma);
182} 186}
@@ -360,7 +364,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
360 * Returns virtual address or -EFAULT if page's index/offset is not 364 * Returns virtual address or -EFAULT if page's index/offset is not
361 * within the range mapped the @vma. 365 * within the range mapped the @vma.
362 */ 366 */
363static inline unsigned long 367inline unsigned long
364vma_address(struct page *page, struct vm_area_struct *vma) 368vma_address(struct page *page, struct vm_area_struct *vma)
365{ 369{
366 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 370 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -435,6 +439,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
435 pmd = pmd_offset(pud, address); 439 pmd = pmd_offset(pud, address);
436 if (!pmd_present(*pmd)) 440 if (!pmd_present(*pmd))
437 return NULL; 441 return NULL;
442 if (pmd_trans_huge(*pmd))
443 return NULL;
438 444
439 pte = pte_offset_map(pmd, address); 445 pte = pte_offset_map(pmd, address);
440 /* Make a quick check before getting the lock */ 446 /* Make a quick check before getting the lock */
@@ -489,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
489 unsigned long *vm_flags) 495 unsigned long *vm_flags)
490{ 496{
491 struct mm_struct *mm = vma->vm_mm; 497 struct mm_struct *mm = vma->vm_mm;
492 pte_t *pte;
493 spinlock_t *ptl;
494 int referenced = 0; 498 int referenced = 0;
495 499
496 pte = page_check_address(page, mm, address, &ptl, 0);
497 if (!pte)
498 goto out;
499
500 /* 500 /*
501 * Don't want to elevate referenced for mlocked page that gets this far, 501 * Don't want to elevate referenced for mlocked page that gets this far,
502 * in order that it progresses to try_to_unmap and is moved to the 502 * in order that it progresses to try_to_unmap and is moved to the
503 * unevictable list. 503 * unevictable list.
504 */ 504 */
505 if (vma->vm_flags & VM_LOCKED) { 505 if (vma->vm_flags & VM_LOCKED) {
506 *mapcount = 1; /* break early from loop */ 506 *mapcount = 0; /* break early from loop */
507 *vm_flags |= VM_LOCKED; 507 *vm_flags |= VM_LOCKED;
508 goto out_unmap; 508 goto out;
509 }
510
511 if (ptep_clear_flush_young_notify(vma, address, pte)) {
512 /*
513 * Don't treat a reference through a sequentially read
514 * mapping as such. If the page has been used in
515 * another mapping, we will catch it; if this other
516 * mapping is already gone, the unmap path will have
517 * set PG_referenced or activated the page.
518 */
519 if (likely(!VM_SequentialReadHint(vma)))
520 referenced++;
521 } 509 }
522 510
523 /* Pretend the page is referenced if the task has the 511 /* Pretend the page is referenced if the task has the
@@ -526,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
526 rwsem_is_locked(&mm->mmap_sem)) 514 rwsem_is_locked(&mm->mmap_sem))
527 referenced++; 515 referenced++;
528 516
529out_unmap: 517 if (unlikely(PageTransHuge(page))) {
518 pmd_t *pmd;
519
520 spin_lock(&mm->page_table_lock);
521 pmd = page_check_address_pmd(page, mm, address,
522 PAGE_CHECK_ADDRESS_PMD_FLAG);
523 if (pmd && !pmd_trans_splitting(*pmd) &&
524 pmdp_clear_flush_young_notify(vma, address, pmd))
525 referenced++;
526 spin_unlock(&mm->page_table_lock);
527 } else {
528 pte_t *pte;
529 spinlock_t *ptl;
530
531 pte = page_check_address(page, mm, address, &ptl, 0);
532 if (!pte)
533 goto out;
534
535 if (ptep_clear_flush_young_notify(vma, address, pte)) {
536 /*
537 * Don't treat a reference through a sequentially read
538 * mapping as such. If the page has been used in
539 * another mapping, we will catch it; if this other
540 * mapping is already gone, the unmap path will have
541 * set PG_referenced or activated the page.
542 */
543 if (likely(!VM_SequentialReadHint(vma)))
544 referenced++;
545 }
546 pte_unmap_unlock(pte, ptl);
547 }
548
530 (*mapcount)--; 549 (*mapcount)--;
531 pte_unmap_unlock(pte, ptl);
532 550
533 if (referenced) 551 if (referenced)
534 *vm_flags |= vma->vm_flags; 552 *vm_flags |= vma->vm_flags;
@@ -864,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page,
864 struct vm_area_struct *vma, unsigned long address, int exclusive) 882 struct vm_area_struct *vma, unsigned long address, int exclusive)
865{ 883{
866 int first = atomic_inc_and_test(&page->_mapcount); 884 int first = atomic_inc_and_test(&page->_mapcount);
867 if (first) 885 if (first) {
868 __inc_zone_page_state(page, NR_ANON_PAGES); 886 if (!PageTransHuge(page))
887 __inc_zone_page_state(page, NR_ANON_PAGES);
888 else
889 __inc_zone_page_state(page,
890 NR_ANON_TRANSPARENT_HUGEPAGES);
891 }
869 if (unlikely(PageKsm(page))) 892 if (unlikely(PageKsm(page)))
870 return; 893 return;
871 894
@@ -893,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page,
893 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 916 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
894 SetPageSwapBacked(page); 917 SetPageSwapBacked(page);
895 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 918 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
896 __inc_zone_page_state(page, NR_ANON_PAGES); 919 if (!PageTransHuge(page))
920 __inc_zone_page_state(page, NR_ANON_PAGES);
921 else
922 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
897 __page_set_anon_rmap(page, vma, address, 1); 923 __page_set_anon_rmap(page, vma, address, 1);
898 if (page_evictable(page, vma)) 924 if (page_evictable(page, vma))
899 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 925 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -911,7 +937,7 @@ void page_add_file_rmap(struct page *page)
911{ 937{
912 if (atomic_inc_and_test(&page->_mapcount)) { 938 if (atomic_inc_and_test(&page->_mapcount)) {
913 __inc_zone_page_state(page, NR_FILE_MAPPED); 939 __inc_zone_page_state(page, NR_FILE_MAPPED);
914 mem_cgroup_update_file_mapped(page, 1); 940 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
915 } 941 }
916} 942}
917 943
@@ -946,10 +972,14 @@ void page_remove_rmap(struct page *page)
946 return; 972 return;
947 if (PageAnon(page)) { 973 if (PageAnon(page)) {
948 mem_cgroup_uncharge_page(page); 974 mem_cgroup_uncharge_page(page);
949 __dec_zone_page_state(page, NR_ANON_PAGES); 975 if (!PageTransHuge(page))
976 __dec_zone_page_state(page, NR_ANON_PAGES);
977 else
978 __dec_zone_page_state(page,
979 NR_ANON_TRANSPARENT_HUGEPAGES);
950 } else { 980 } else {
951 __dec_zone_page_state(page, NR_FILE_MAPPED); 981 __dec_zone_page_state(page, NR_FILE_MAPPED);
952 mem_cgroup_update_file_mapped(page, -1); 982 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
953 } 983 }
954 /* 984 /*
955 * It would be tidy to reset the PageAnon mapping here, 985 * It would be tidy to reset the PageAnon mapping here,
@@ -1202,7 +1232,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1202 return ret; 1232 return ret;
1203} 1233}
1204 1234
1205static bool is_vma_temporary_stack(struct vm_area_struct *vma) 1235bool is_vma_temporary_stack(struct vm_area_struct *vma)
1206{ 1236{
1207 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1237 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1208 1238
@@ -1400,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1400 int ret; 1430 int ret;
1401 1431
1402 BUG_ON(!PageLocked(page)); 1432 BUG_ON(!PageLocked(page));
1433 VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
1403 1434
1404 if (unlikely(PageKsm(page))) 1435 if (unlikely(PageKsm(page)))
1405 ret = try_to_unmap_ksm(page, flags); 1436 ret = try_to_unmap_ksm(page, flags);
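The rmap hunks above account anonymous huge pages in NR_ANON_TRANSPARENT_HUGEPAGES rather than NR_ANON_PAGES. A quick userspace way to watch that counter, assuming it is exported through /proc/vmstat under the lowercase name used below; adjust the string if your kernel exports it differently.

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *key = "nr_anon_transparent_hugepages";
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strncmp(line, key, strlen(key)) == 0)
			fputs(line, stdout);	/* e.g. "nr_anon_transparent_hugepages 3" */
	fclose(f);
	return 0;
}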
diff --git a/mm/shmem.c b/mm/shmem.c
index 47fdeeb9d636..5ee67c990602 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2415,13 +2415,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
2415 return &p->vfs_inode; 2415 return &p->vfs_inode;
2416} 2416}
2417 2417
2418static void shmem_i_callback(struct rcu_head *head)
2419{
2420 struct inode *inode = container_of(head, struct inode, i_rcu);
2421 INIT_LIST_HEAD(&inode->i_dentry);
2422 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2423}
2424
2418static void shmem_destroy_inode(struct inode *inode) 2425static void shmem_destroy_inode(struct inode *inode)
2419{ 2426{
2420 if ((inode->i_mode & S_IFMT) == S_IFREG) { 2427 if ((inode->i_mode & S_IFMT) == S_IFREG) {
2421 /* only struct inode is valid if it's an inline symlink */ 2428 /* only struct inode is valid if it's an inline symlink */
2422 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2429 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2423 } 2430 }
2424 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2431 call_rcu(&inode->i_rcu, shmem_i_callback);
2425} 2432}
2426 2433
2427static void init_once(void *foo) 2434static void init_once(void *foo)
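shmem_i_callback() above receives only the embedded struct rcu_head and relies on container_of() to walk back to the inode that owns it before freeing. A self-contained illustration of that pointer arithmetic; rcu_head_demo and inode_demo are demo types standing in for the kernel structures.

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rcu_head_demo { void (*func)(struct rcu_head_demo *head); };

struct inode_demo {
	long			i_ino;
	struct rcu_head_demo	i_rcu;
};

static void free_callback(struct rcu_head_demo *head)
{
	/* the callback only gets 'head'; recover the enclosing inode */
	struct inode_demo *inode = container_of(head, struct inode_demo, i_rcu);

	printf("freeing inode %ld\n", inode->i_ino);
}

int main(void)
{
	struct inode_demo inode = { .i_ino = 42 };

	/* the RCU core would invoke this after a grace period */
	free_callback(&inode.i_rcu);
	return 0;
}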
diff --git a/mm/slab.c b/mm/slab.c
index b1e40dafbab3..37961d1f584f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -284,7 +284,7 @@ struct kmem_list3 {
284 * Need this for bootstrapping a per node allocator. 284 * Need this for bootstrapping a per node allocator.
285 */ 285 */
286#define NUM_INIT_LISTS (3 * MAX_NUMNODES) 286#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
287struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; 287static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
288#define CACHE_CACHE 0 288#define CACHE_CACHE 0
289#define SIZE_AC MAX_NUMNODES 289#define SIZE_AC MAX_NUMNODES
290#define SIZE_L3 (2 * MAX_NUMNODES) 290#define SIZE_L3 (2 * MAX_NUMNODES)
@@ -829,12 +829,12 @@ static void init_reap_node(int cpu)
829 829
830static void next_reap_node(void) 830static void next_reap_node(void)
831{ 831{
832 int node = __get_cpu_var(slab_reap_node); 832 int node = __this_cpu_read(slab_reap_node);
833 833
834 node = next_node(node, node_online_map); 834 node = next_node(node, node_online_map);
835 if (unlikely(node >= MAX_NUMNODES)) 835 if (unlikely(node >= MAX_NUMNODES))
836 node = first_node(node_online_map); 836 node = first_node(node_online_map);
837 __get_cpu_var(slab_reap_node) = node; 837 __this_cpu_write(slab_reap_node, node);
838} 838}
839 839
840#else 840#else
@@ -1012,7 +1012,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
1012 */ 1012 */
1013static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) 1013static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1014{ 1014{
1015 int node = __get_cpu_var(slab_reap_node); 1015 int node = __this_cpu_read(slab_reap_node);
1016 1016
1017 if (l3->alien) { 1017 if (l3->alien) {
1018 struct array_cache *ac = l3->alien[node]; 1018 struct array_cache *ac = l3->alien[node];
@@ -1293,7 +1293,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1293 * anything expensive but will only modify reap_work 1293 * anything expensive but will only modify reap_work
1294 * and reschedule the timer. 1294 * and reschedule the timer.
1295 */ 1295 */
1296 cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); 1296 cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
1297 /* Now the cache_reaper is guaranteed to be not running. */ 1297 /* Now the cache_reaper is guaranteed to be not running. */
1298 per_cpu(slab_reap_work, cpu).work.func = NULL; 1298 per_cpu(slab_reap_work, cpu).work.func = NULL;
1299 break; 1299 break;
@@ -2781,7 +2781,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2781/* 2781/*
2782 * Map pages beginning at addr to the given cache and slab. This is required 2782 * Map pages beginning at addr to the given cache and slab. This is required
2783 * for the slab allocator to be able to lookup the cache and slab of a 2783 * for the slab allocator to be able to lookup the cache and slab of a
2784 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. 2784 * virtual address for kfree, ksize, and slab debugging.
2785 */ 2785 */
2786static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, 2786static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2787 void *addr) 2787 void *addr)
@@ -3653,42 +3653,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3653EXPORT_SYMBOL(kmem_cache_alloc); 3653EXPORT_SYMBOL(kmem_cache_alloc);
3654 3654
3655#ifdef CONFIG_TRACING 3655#ifdef CONFIG_TRACING
3656void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) 3656void *
3657kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
3657{ 3658{
3658 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3659 void *ret;
3659}
3660EXPORT_SYMBOL(kmem_cache_alloc_notrace);
3661#endif
3662 3660
3663/** 3661 ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3664 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
3665 * @cachep: the cache we're checking against
3666 * @ptr: pointer to validate
3667 *
3668 * This verifies that the untrusted pointer looks sane;
3669 * it is _not_ a guarantee that the pointer is actually
3670 * part of the slab cache in question, but it at least
3671 * validates that the pointer can be dereferenced and
3672 * looks half-way sane.
3673 *
3674 * Currently only used for dentry validation.
3675 */
3676int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3677{
3678 unsigned long size = cachep->buffer_size;
3679 struct page *page;
3680 3662
3681 if (unlikely(!kern_ptr_validate(ptr, size))) 3663 trace_kmalloc(_RET_IP_, ret,
3682 goto out; 3664 size, slab_buffer_size(cachep), flags);
3683 page = virt_to_page(ptr); 3665 return ret;
3684 if (unlikely(!PageSlab(page)))
3685 goto out;
3686 if (unlikely(page_get_cache(page) != cachep))
3687 goto out;
3688 return 1;
3689out:
3690 return 0;
3691} 3666}
3667EXPORT_SYMBOL(kmem_cache_alloc_trace);
3668#endif
3692 3669
3693#ifdef CONFIG_NUMA 3670#ifdef CONFIG_NUMA
3694void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3671void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
@@ -3705,31 +3682,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3705EXPORT_SYMBOL(kmem_cache_alloc_node); 3682EXPORT_SYMBOL(kmem_cache_alloc_node);
3706 3683
3707#ifdef CONFIG_TRACING 3684#ifdef CONFIG_TRACING
3708void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, 3685void *kmem_cache_alloc_node_trace(size_t size,
3709 gfp_t flags, 3686 struct kmem_cache *cachep,
3710 int nodeid) 3687 gfp_t flags,
3688 int nodeid)
3711{ 3689{
3712 return __cache_alloc_node(cachep, flags, nodeid, 3690 void *ret;
3691
3692 ret = __cache_alloc_node(cachep, flags, nodeid,
3713 __builtin_return_address(0)); 3693 __builtin_return_address(0));
3694 trace_kmalloc_node(_RET_IP_, ret,
3695 size, slab_buffer_size(cachep),
3696 flags, nodeid);
3697 return ret;
3714} 3698}
3715EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 3699EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3716#endif 3700#endif
3717 3701
3718static __always_inline void * 3702static __always_inline void *
3719__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) 3703__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3720{ 3704{
3721 struct kmem_cache *cachep; 3705 struct kmem_cache *cachep;
3722 void *ret;
3723 3706
3724 cachep = kmem_find_general_cachep(size, flags); 3707 cachep = kmem_find_general_cachep(size, flags);
3725 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3708 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3726 return cachep; 3709 return cachep;
3727 ret = kmem_cache_alloc_node_notrace(cachep, flags, node); 3710 return kmem_cache_alloc_node_trace(size, cachep, flags, node);
3728
3729 trace_kmalloc_node((unsigned long) caller, ret,
3730 size, cachep->buffer_size, flags, node);
3731
3732 return ret;
3733} 3711}
3734 3712
3735#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3713#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
@@ -4075,7 +4053,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4075 * necessary. Note that the l3 listlock also protects the array_cache 4053 * necessary. Note that the l3 listlock also protects the array_cache
4076 * if drain_array() is used on the shared array. 4054 * if drain_array() is used on the shared array.
4077 */ 4055 */
4078void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 4056static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
4079 struct array_cache *ac, int force, int node) 4057 struct array_cache *ac, int force, int node)
4080{ 4058{
4081 int tofree; 4059 int tofree;
@@ -4339,7 +4317,7 @@ static const struct seq_operations slabinfo_op = {
4339 * @count: data length 4317 * @count: data length
4340 * @ppos: unused 4318 * @ppos: unused
4341 */ 4319 */
4342ssize_t slabinfo_write(struct file *file, const char __user * buffer, 4320static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4343 size_t count, loff_t *ppos) 4321 size_t count, loff_t *ppos)
4344{ 4322{
4345 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4323 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
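kmem_cache_alloc_trace() above folds the old _notrace variant and the tracepoint calls scattered in its callers into one wrapper: allocate, then emit trace_kmalloc() with the requested size, the cache's object size and the returned pointer. A userspace sketch of that shape, with malloc() standing in for slab_alloc() and fprintf() standing in for the tracepoint; alloc_trace()/trace_alloc() are illustrative names only.

#include <stdio.h>
#include <stdlib.h>

static void trace_alloc(const void *ret, size_t requested, size_t object)
{
	fprintf(stderr, "alloc: ptr=%p requested=%zu object=%zu\n",
		ret, requested, object);
}

static void *alloc_trace(size_t requested, size_t object_size)
{
	void *ret = malloc(object_size);	/* stand-in for slab_alloc() */

	trace_alloc(ret, requested, object_size);
	return ret;
}

int main(void)
{
	void *p = alloc_trace(13, 16);	/* 13 bytes served from a 16-byte cache */

	free(p);
	return 0;
}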
diff --git a/mm/slob.c b/mm/slob.c
index 617b6d6c42c7..3588eaaef726 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -678,11 +678,6 @@ int kmem_cache_shrink(struct kmem_cache *d)
678} 678}
679EXPORT_SYMBOL(kmem_cache_shrink); 679EXPORT_SYMBOL(kmem_cache_shrink);
680 680
681int kmem_ptr_validate(struct kmem_cache *a, const void *b)
682{
683 return 0;
684}
685
686static unsigned int slob_ready __read_mostly; 681static unsigned int slob_ready __read_mostly;
687 682
688int slab_is_available(void) 683int slab_is_available(void)
diff --git a/mm/slub.c b/mm/slub.c
index 981fb730aa04..e15aa7f193c9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -28,6 +28,8 @@
28#include <linux/math64.h> 28#include <linux/math64.h>
29#include <linux/fault-inject.h> 29#include <linux/fault-inject.h>
30 30
31#include <trace/events/kmem.h>
32
31/* 33/*
32 * Lock order: 34 * Lock order:
33 * 1. slab_lock(page) 35 * 1. slab_lock(page)
@@ -1774,11 +1776,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1774EXPORT_SYMBOL(kmem_cache_alloc); 1776EXPORT_SYMBOL(kmem_cache_alloc);
1775 1777
1776#ifdef CONFIG_TRACING 1778#ifdef CONFIG_TRACING
1777void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) 1779void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
1780{
1781 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
1782 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
1783 return ret;
1784}
1785EXPORT_SYMBOL(kmem_cache_alloc_trace);
1786
1787void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
1778{ 1788{
1779 return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 1789 void *ret = kmalloc_order(size, flags, order);
1790 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
1791 return ret;
1780} 1792}
1781EXPORT_SYMBOL(kmem_cache_alloc_notrace); 1793EXPORT_SYMBOL(kmalloc_order_trace);
1782#endif 1794#endif
1783 1795
1784#ifdef CONFIG_NUMA 1796#ifdef CONFIG_NUMA
@@ -1794,13 +1806,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1794EXPORT_SYMBOL(kmem_cache_alloc_node); 1806EXPORT_SYMBOL(kmem_cache_alloc_node);
1795 1807
1796#ifdef CONFIG_TRACING 1808#ifdef CONFIG_TRACING
1797void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, 1809void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
1798 gfp_t gfpflags, 1810 gfp_t gfpflags,
1799 int node) 1811 int node, size_t size)
1800{ 1812{
1801 return slab_alloc(s, gfpflags, node, _RET_IP_); 1813 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
1814
1815 trace_kmalloc_node(_RET_IP_, ret,
1816 size, s->size, gfpflags, node);
1817 return ret;
1802} 1818}
1803EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 1819EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
1804#endif 1820#endif
1805#endif 1821#endif
1806 1822
@@ -1917,17 +1933,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
1917} 1933}
1918EXPORT_SYMBOL(kmem_cache_free); 1934EXPORT_SYMBOL(kmem_cache_free);
1919 1935
1920/* Figure out on which slab page the object resides */
1921static struct page *get_object_page(const void *x)
1922{
1923 struct page *page = virt_to_head_page(x);
1924
1925 if (!PageSlab(page))
1926 return NULL;
1927
1928 return page;
1929}
1930
1931/* 1936/*
1932 * Object placement in a slab is made very easy because we always start at 1937 * Object placement in a slab is made very easy because we always start at
1933 * offset 0. If we tune the size of the object to the alignment then we can 1938 * offset 0. If we tune the size of the object to the alignment then we can
@@ -2386,35 +2391,6 @@ error:
2386} 2391}
2387 2392
2388/* 2393/*
2389 * Check if a given pointer is valid
2390 */
2391int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2392{
2393 struct page *page;
2394
2395 if (!kern_ptr_validate(object, s->size))
2396 return 0;
2397
2398 page = get_object_page(object);
2399
2400 if (!page || s != page->slab)
2401 /* No slab or wrong slab */
2402 return 0;
2403
2404 if (!check_valid_pointer(s, page, object))
2405 return 0;
2406
2407 /*
2408 * We could also check if the object is on the slabs freelist.
2409 * But this would be too expensive and it seems that the main
2410 * purpose of kmem_ptr_valid() is to check if the object belongs
2411 * to a certain slab.
2412 */
2413 return 1;
2414}
2415EXPORT_SYMBOL(kmem_ptr_validate);
2416
2417/*
2418 * Determine the size of a slab object 2394 * Determine the size of a slab object
2419 */ 2395 */
2420unsigned int kmem_cache_size(struct kmem_cache *s) 2396unsigned int kmem_cache_size(struct kmem_cache *s)
@@ -3401,13 +3377,13 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
3401 3377
3402 for_each_free_object(p, s, page->freelist) { 3378 for_each_free_object(p, s, page->freelist) {
3403 set_bit(slab_index(p, s, addr), map); 3379 set_bit(slab_index(p, s, addr), map);
3404 if (!check_object(s, page, p, 0)) 3380 if (!check_object(s, page, p, SLUB_RED_INACTIVE))
3405 return 0; 3381 return 0;
3406 } 3382 }
3407 3383
3408 for_each_object(p, s, addr, page->objects) 3384 for_each_object(p, s, addr, page->objects)
3409 if (!test_bit(slab_index(p, s, addr), map)) 3385 if (!test_bit(slab_index(p, s, addr), map))
3410 if (!check_object(s, page, p, 1)) 3386 if (!check_object(s, page, p, SLUB_RED_ACTIVE))
3411 return 0; 3387 return 0;
3412 return 1; 3388 return 1;
3413} 3389}
@@ -3660,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
3660 len += sprintf(buf + len, "%7ld ", l->count); 3636 len += sprintf(buf + len, "%7ld ", l->count);
3661 3637
3662 if (l->addr) 3638 if (l->addr)
3663 len += sprint_symbol(buf + len, (unsigned long)l->addr); 3639 len += sprintf(buf + len, "%pS", (void *)l->addr);
3664 else 3640 else
3665 len += sprintf(buf + len, "<not-available>"); 3641 len += sprintf(buf + len, "<not-available>");
3666 3642
@@ -3821,7 +3797,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3821 } 3797 }
3822 } 3798 }
3823 3799
3824 down_read(&slub_lock); 3800 lock_memory_hotplug();
3825#ifdef CONFIG_SLUB_DEBUG 3801#ifdef CONFIG_SLUB_DEBUG
3826 if (flags & SO_ALL) { 3802 if (flags & SO_ALL) {
3827 for_each_node_state(node, N_NORMAL_MEMORY) { 3803 for_each_node_state(node, N_NORMAL_MEMORY) {
@@ -3862,7 +3838,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3862 x += sprintf(buf + x, " N%d=%lu", 3838 x += sprintf(buf + x, " N%d=%lu",
3863 node, nodes[node]); 3839 node, nodes[node]);
3864#endif 3840#endif
3865 up_read(&slub_lock); 3841 unlock_memory_hotplug();
3866 kfree(nodes); 3842 kfree(nodes);
3867 return x + sprintf(buf + x, "\n"); 3843 return x + sprintf(buf + x, "\n");
3868} 3844}
@@ -3970,12 +3946,9 @@ SLAB_ATTR(min_partial);
3970 3946
3971static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3947static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3972{ 3948{
3973 if (s->ctor) { 3949 if (!s->ctor)
3974 int n = sprint_symbol(buf, (unsigned long)s->ctor); 3950 return 0;
3975 3951 return sprintf(buf, "%pS\n", s->ctor);
3976 return n + sprintf(buf + n, "\n");
3977 }
3978 return 0;
3979} 3952}
3980SLAB_ATTR_RO(ctor); 3953SLAB_ATTR_RO(ctor);
3981 3954
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 29d6cbffb283..64b984091edb 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * However, virtual mappings need a page table and TLBs. Many Linux 10 * However, virtual mappings need a page table and TLBs. Many Linux
11 * architectures already map their physical space using 1-1 mappings 11 * architectures already map their physical space using 1-1 mappings
12 * via TLBs. For those arches the virtual memmory map is essentially 12 * via TLBs. For those arches the virtual memory map is essentially
13 * for free if we use the same page size as the 1-1 mappings. In that 13 * for free if we use the same page size as the 1-1 mappings. In that
14 * case the overhead consists of a few additional pages that are 14 * case the overhead consists of a few additional pages that are
15 * allocated to create a view of memory for vmemmap. 15 * allocated to create a view of memory for vmemmap.
diff --git a/mm/sparse.c b/mm/sparse.c
index 95ac219af379..93250207c5cf 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
671static void free_map_bootmem(struct page *page, unsigned long nr_pages) 671static void free_map_bootmem(struct page *page, unsigned long nr_pages)
672{ 672{
673 unsigned long maps_section_nr, removing_section_nr, i; 673 unsigned long maps_section_nr, removing_section_nr, i;
674 int magic; 674 unsigned long magic;
675 675
676 for (i = 0; i < nr_pages; i++, page++) { 676 for (i = 0; i < nr_pages; i++, page++) {
677 magic = atomic_read(&page->_mapcount); 677 magic = (unsigned long) page->lru.next;
678 678
679 BUG_ON(magic == NODE_INFO); 679 BUG_ON(magic == NODE_INFO);
680 680
diff --git a/mm/swap.c b/mm/swap.c
index 3f4854205b16..c02f93611a84 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -56,17 +56,97 @@ static void __page_cache_release(struct page *page)
56 del_page_from_lru(zone, page); 56 del_page_from_lru(zone, page);
57 spin_unlock_irqrestore(&zone->lru_lock, flags); 57 spin_unlock_irqrestore(&zone->lru_lock, flags);
58 } 58 }
59}
60
61static void __put_single_page(struct page *page)
62{
63 __page_cache_release(page);
59 free_hot_cold_page(page, 0); 64 free_hot_cold_page(page, 0);
60} 65}
61 66
62static void put_compound_page(struct page *page) 67static void __put_compound_page(struct page *page)
63{ 68{
64 page = compound_head(page); 69 compound_page_dtor *dtor;
65 if (put_page_testzero(page)) {
66 compound_page_dtor *dtor;
67 70
68 dtor = get_compound_page_dtor(page); 71 __page_cache_release(page);
69 (*dtor)(page); 72 dtor = get_compound_page_dtor(page);
73 (*dtor)(page);
74}
75
76static void put_compound_page(struct page *page)
77{
78 if (unlikely(PageTail(page))) {
79 /* __split_huge_page_refcount can run under us */
80 struct page *page_head = page->first_page;
81 smp_rmb();
82 /*
83 * If PageTail is still set after smp_rmb() we can be sure
84 * that the page->first_page we read wasn't a dangling pointer.
85 * See __split_huge_page_refcount() smp_wmb().
86 */
87 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
88 unsigned long flags;
89 /*
90 * Verify that our page_head wasn't converted
 92 * to a regular page before we got a
92 * reference on it.
93 */
94 if (unlikely(!PageHead(page_head))) {
95 /* PageHead is cleared after PageTail */
96 smp_rmb();
97 VM_BUG_ON(PageTail(page));
98 goto out_put_head;
99 }
100 /*
101 * Only run compound_lock on a valid PageHead,
102 * after having it pinned with
103 * get_page_unless_zero() above.
104 */
105 smp_mb();
106 /* page_head wasn't a dangling pointer */
107 flags = compound_lock_irqsave(page_head);
108 if (unlikely(!PageTail(page))) {
109 /* __split_huge_page_refcount run before us */
110 compound_unlock_irqrestore(page_head, flags);
111 VM_BUG_ON(PageHead(page_head));
112 out_put_head:
113 if (put_page_testzero(page_head))
114 __put_single_page(page_head);
115 out_put_single:
116 if (put_page_testzero(page))
117 __put_single_page(page);
118 return;
119 }
120 VM_BUG_ON(page_head != page->first_page);
121 /*
122 * We can release the refcount taken by
123 * get_page_unless_zero now that
124 * split_huge_page_refcount is blocked on the
125 * compound_lock.
126 */
127 if (put_page_testzero(page_head))
128 VM_BUG_ON(1);
129 /* __split_huge_page_refcount will wait now */
130 VM_BUG_ON(atomic_read(&page->_count) <= 0);
131 atomic_dec(&page->_count);
132 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
133 compound_unlock_irqrestore(page_head, flags);
134 if (put_page_testzero(page_head)) {
135 if (PageHead(page_head))
136 __put_compound_page(page_head);
137 else
138 __put_single_page(page_head);
139 }
140 } else {
141 /* page_head is a dangling pointer */
142 VM_BUG_ON(PageTail(page));
143 goto out_put_single;
144 }
145 } else if (put_page_testzero(page)) {
146 if (PageHead(page))
147 __put_compound_page(page);
148 else
149 __put_single_page(page);
70 } 150 }
71} 151}
72 152
@@ -75,7 +155,7 @@ void put_page(struct page *page)
75 if (unlikely(PageCompound(page))) 155 if (unlikely(PageCompound(page)))
76 put_compound_page(page); 156 put_compound_page(page);
77 else if (put_page_testzero(page)) 157 else if (put_page_testzero(page))
78 __page_cache_release(page); 158 __put_single_page(page);
79} 159}
80EXPORT_SYMBOL(put_page); 160EXPORT_SYMBOL(put_page);
81 161
@@ -399,6 +479,43 @@ void __pagevec_release(struct pagevec *pvec)
399 479
400EXPORT_SYMBOL(__pagevec_release); 480EXPORT_SYMBOL(__pagevec_release);
401 481
482/* used by __split_huge_page_refcount() */
483void lru_add_page_tail(struct zone* zone,
484 struct page *page, struct page *page_tail)
485{
486 int active;
487 enum lru_list lru;
488 const int file = 0;
489 struct list_head *head;
490
491 VM_BUG_ON(!PageHead(page));
492 VM_BUG_ON(PageCompound(page_tail));
493 VM_BUG_ON(PageLRU(page_tail));
494 VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
495
496 SetPageLRU(page_tail);
497
498 if (page_evictable(page_tail, NULL)) {
499 if (PageActive(page)) {
500 SetPageActive(page_tail);
501 active = 1;
502 lru = LRU_ACTIVE_ANON;
503 } else {
504 active = 0;
505 lru = LRU_INACTIVE_ANON;
506 }
507 update_page_reclaim_stat(zone, page_tail, file, active);
508 if (likely(PageLRU(page)))
509 head = page->lru.prev;
510 else
511 head = &zone->lru[lru].list;
512 __add_page_to_lru_list(zone, page_tail, lru, head);
513 } else {
514 SetPageUnevictable(page_tail);
515 add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
516 }
517}
518
402/* 519/*
403 * Add the passed pages to the LRU, then drop the caller's refcount 520 * Add the passed pages to the LRU, then drop the caller's refcount
404 * on them. Reinitialises the caller's pagevec. 521 * on them. Reinitialises the caller's pagevec.
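put_compound_page() above pins a possibly-stale page_head with get_page_unless_zero() and only then re-checks PageHead()/PageTail(), dropping the speculative reference if __split_huge_page_refcount() won the race. A minimal C11 sketch of that get-then-revalidate pattern; struct obj, get_unless_zero() and is_head are illustrative, and the kernel's memory barriers and compound_lock are omitted.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	atomic_int  refcount;
	atomic_bool is_head;	/* state that can change under us */
};

static bool get_unless_zero(struct obj *o)
{
	int c = atomic_load(&o->refcount);

	while (c != 0)
		if (atomic_compare_exchange_weak(&o->refcount, &c, c + 1))
			return true;	/* pinned */
	return false;			/* already headed to zero */
}

static bool get_head(struct obj *o)
{
	if (!get_unless_zero(o))
		return false;
	if (!atomic_load(&o->is_head)) {
		atomic_fetch_sub(&o->refcount, 1);	/* raced: undo the pin */
		return false;
	}
	return true;
}

int main(void)
{
	struct obj o;

	atomic_init(&o.refcount, 1);
	atomic_init(&o.is_head, true);
	printf("%s\n", get_head(&o) ? "pinned a head" : "lost the race");
	return 0;
}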
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e10f5833167f..5c8cfabbc9bc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -157,6 +157,12 @@ int add_to_swap(struct page *page)
157 if (!entry.val) 157 if (!entry.val)
158 return 0; 158 return 0;
159 159
160 if (unlikely(PageTransHuge(page)))
161 if (unlikely(split_huge_page(page))) {
162 swapcache_free(entry, NULL);
163 return 0;
164 }
165
160 /* 166 /*
161 * Radix-tree node allocations from PF_MEMALLOC contexts could 167 * Radix-tree node allocations from PF_MEMALLOC contexts could
162 * completely exhaust the page allocator. __GFP_NOMEMALLOC 168 * completely exhaust the page allocator. __GFP_NOMEMALLOC
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 67ddaaf98c74..07a458d72fa8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -964,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
964 pmd = pmd_offset(pud, addr); 964 pmd = pmd_offset(pud, addr);
965 do { 965 do {
966 next = pmd_addr_end(addr, end); 966 next = pmd_addr_end(addr, end);
967 if (unlikely(pmd_trans_huge(*pmd)))
968 continue;
967 if (pmd_none_or_clear_bad(pmd)) 969 if (pmd_none_or_clear_bad(pmd))
968 continue; 970 continue;
969 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 971 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
@@ -1677,7 +1679,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1677 if (S_ISBLK(inode->i_mode)) { 1679 if (S_ISBLK(inode->i_mode)) {
1678 struct block_device *bdev = I_BDEV(inode); 1680 struct block_device *bdev = I_BDEV(inode);
1679 set_blocksize(bdev, p->old_block_size); 1681 set_blocksize(bdev, p->old_block_size);
1680 bd_release(bdev); 1682 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1681 } else { 1683 } else {
1682 mutex_lock(&inode->i_mutex); 1684 mutex_lock(&inode->i_mutex);
1683 inode->i_flags &= ~S_SWAPFILE; 1685 inode->i_flags &= ~S_SWAPFILE;
@@ -1939,7 +1941,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1939 error = -EINVAL; 1941 error = -EINVAL;
1940 if (S_ISBLK(inode->i_mode)) { 1942 if (S_ISBLK(inode->i_mode)) {
1941 bdev = I_BDEV(inode); 1943 bdev = I_BDEV(inode);
1942 error = bd_claim(bdev, sys_swapon); 1944 error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1945 sys_swapon);
1943 if (error < 0) { 1946 if (error < 0) {
1944 bdev = NULL; 1947 bdev = NULL;
1945 error = -EINVAL; 1948 error = -EINVAL;
@@ -2136,7 +2139,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2136bad_swap: 2139bad_swap:
2137 if (bdev) { 2140 if (bdev) {
2138 set_blocksize(bdev, p->old_block_size); 2141 set_blocksize(bdev, p->old_block_size);
2139 bd_release(bdev); 2142 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2140 } 2143 }
2141 destroy_swap_extents(p); 2144 destroy_swap_extents(p);
2142 swap_cgroup_swapoff(type); 2145 swap_cgroup_swapoff(type);
diff --git a/mm/truncate.c b/mm/truncate.c
index ba887bff48c5..49feb46e77b8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -390,6 +390,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
390 __remove_from_page_cache(page); 390 __remove_from_page_cache(page);
391 spin_unlock_irq(&mapping->tree_lock); 391 spin_unlock_irq(&mapping->tree_lock);
392 mem_cgroup_uncharge_cache_page(page); 392 mem_cgroup_uncharge_cache_page(page);
393
394 if (mapping->a_ops->freepage)
395 mapping->a_ops->freepage(page);
396
393 page_cache_release(page); /* pagecache ref */ 397 page_cache_release(page); /* pagecache ref */
394 return 1; 398 return 1;
395failed: 399failed:
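The hunk above lets a filesystem hook page release via an optional a_ops->freepage() callback, invoked only after the page has been removed from the page cache and only if the filesystem set it. A plain-C sketch of that optional-hook pattern; address_space_ops_demo, release_page() and demo_freepage() are made-up names.

#include <stdio.h>

struct address_space_ops_demo {
	void (*freepage)(void *page);	/* optional, may be NULL */
};

static void release_page(const struct address_space_ops_demo *a_ops, void *page)
{
	/* ... remove the page from the cache here ... */

	if (a_ops->freepage)
		a_ops->freepage(page);	/* filesystem-specific cleanup */
}

static void demo_freepage(void *page)
{
	printf("freepage(%p)\n", page);
}

int main(void)
{
	struct address_space_ops_demo ops = { .freepage = demo_freepage };
	int page;

	release_page(&ops, &page);	/* hook present: called */
	ops.freepage = NULL;
	release_page(&ops, &page);	/* hook absent: silently skipped */
	return 0;
}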
@@ -545,13 +549,12 @@ EXPORT_SYMBOL(truncate_pagecache);
545 * @inode: inode 549 * @inode: inode
546 * @newsize: new file size 550 * @newsize: new file size
547 * 551 *
548 * truncate_setsize updastes i_size update and performs pagecache 552 * truncate_setsize updates i_size and performs pagecache truncation (if
549 * truncation (if necessary) for a file size updates. It will be 553 * necessary) to @newsize. It will be typically be called from the filesystem's
550 * typically be called from the filesystem's setattr function when 554 * setattr function when ATTR_SIZE is passed in.
551 * ATTR_SIZE is passed in.
552 * 555 *
553 * Must be called with inode_mutex held and after all filesystem 556 * Must be called with inode_mutex held and before all filesystem specific
554 * specific block truncation has been performed. 557 * block truncation has been performed.
555 */ 558 */
556void truncate_setsize(struct inode *inode, loff_t newsize) 559void truncate_setsize(struct inode *inode, loff_t newsize)
557{ 560{
diff --git a/mm/util.c b/mm/util.c
index 73dac81e9f78..f126975ef23e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -186,27 +186,6 @@ void kzfree(const void *p)
186} 186}
187EXPORT_SYMBOL(kzfree); 187EXPORT_SYMBOL(kzfree);
188 188
189int kern_ptr_validate(const void *ptr, unsigned long size)
190{
191 unsigned long addr = (unsigned long)ptr;
192 unsigned long min_addr = PAGE_OFFSET;
193 unsigned long align_mask = sizeof(void *) - 1;
194
195 if (unlikely(addr < min_addr))
196 goto out;
197 if (unlikely(addr > (unsigned long)high_memory - size))
198 goto out;
199 if (unlikely(addr & align_mask))
200 goto out;
201 if (unlikely(!kern_addr_valid(addr)))
202 goto out;
203 if (unlikely(!kern_addr_valid(addr + size - 1)))
204 goto out;
205 return 1;
206out:
207 return 0;
208}
209
210/* 189/*
211 * strndup_user - duplicate an existing string from user space 190 * strndup_user - duplicate an existing string from user space
212 * @s: The string to duplicate 191 * @s: The string to duplicate
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a3d66b3dc5cb..f9b166732e70 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,8 +31,6 @@
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h> 32#include <asm/shmparam.h>
33 33
34bool vmap_lazy_unmap __read_mostly = true;
35
36/*** Page table manipulation functions ***/ 34/*** Page table manipulation functions ***/
37 35
38static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 36static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
@@ -503,9 +501,6 @@ static unsigned long lazy_max_pages(void)
503{ 501{
504 unsigned int log; 502 unsigned int log;
505 503
506 if (!vmap_lazy_unmap)
507 return 0;
508
509 log = fls(num_online_cpus()); 504 log = fls(num_online_cpus());
510 505
511 return log * (32UL * 1024 * 1024 / PAGE_SIZE); 506 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
@@ -566,7 +561,6 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
566 if (va->va_end > *end) 561 if (va->va_end > *end)
567 *end = va->va_end; 562 *end = va->va_end;
568 nr += (va->va_end - va->va_start) >> PAGE_SHIFT; 563 nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
569 unmap_vmap_area(va);
570 list_add_tail(&va->purge_list, &valist); 564 list_add_tail(&va->purge_list, &valist);
571 va->flags |= VM_LAZY_FREEING; 565 va->flags |= VM_LAZY_FREEING;
572 va->flags &= ~VM_LAZY_FREE; 566 va->flags &= ~VM_LAZY_FREE;
@@ -611,10 +605,11 @@ static void purge_vmap_area_lazy(void)
611} 605}
612 606
613/* 607/*
614 * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been 608 * Free a vmap area, caller ensuring that the area has been unmapped
615 * called for the correct range previously. 609 * and flush_cache_vunmap had been called for the correct range
610 * previously.
616 */ 611 */
617static void free_unmap_vmap_area_noflush(struct vmap_area *va) 612static void free_vmap_area_noflush(struct vmap_area *va)
618{ 613{
619 va->flags |= VM_LAZY_FREE; 614 va->flags |= VM_LAZY_FREE;
620 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); 615 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
@@ -623,6 +618,16 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va)
623} 618}
624 619
625/* 620/*
621 * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
622 * called for the correct range previously.
623 */
624static void free_unmap_vmap_area_noflush(struct vmap_area *va)
625{
626 unmap_vmap_area(va);
627 free_vmap_area_noflush(va);
628}
629
630/*
626 * Free and unmap a vmap area 631 * Free and unmap a vmap area
627 */ 632 */
628static void free_unmap_vmap_area(struct vmap_area *va) 633static void free_unmap_vmap_area(struct vmap_area *va)
@@ -743,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
743 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, 748 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
744 VMALLOC_START, VMALLOC_END, 749 VMALLOC_START, VMALLOC_END,
745 node, gfp_mask); 750 node, gfp_mask);
746 if (unlikely(IS_ERR(va))) { 751 if (IS_ERR(va)) {
747 kfree(vb); 752 kfree(vb);
748 return ERR_CAST(va); 753 return ERR_CAST(va);
749 } 754 }
@@ -798,7 +803,7 @@ static void free_vmap_block(struct vmap_block *vb)
798 spin_unlock(&vmap_block_tree_lock); 803 spin_unlock(&vmap_block_tree_lock);
799 BUG_ON(tmp != vb); 804 BUG_ON(tmp != vb);
800 805
801 free_unmap_vmap_area_noflush(vb->va); 806 free_vmap_area_noflush(vb->va);
802 call_rcu(&vb->rcu_head, rcu_free_vb); 807 call_rcu(&vb->rcu_head, rcu_free_vb);
803} 808}
804 809
@@ -936,6 +941,8 @@ static void vb_free(const void *addr, unsigned long size)
936 rcu_read_unlock(); 941 rcu_read_unlock();
937 BUG_ON(!vb); 942 BUG_ON(!vb);
938 943
944 vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
945
939 spin_lock(&vb->lock); 946 spin_lock(&vb->lock);
940 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); 947 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
941 948
@@ -988,7 +995,6 @@ void vm_unmap_aliases(void)
988 995
989 s = vb->va->va_start + (i << PAGE_SHIFT); 996 s = vb->va->va_start + (i << PAGE_SHIFT);
990 e = vb->va->va_start + (j << PAGE_SHIFT); 997 e = vb->va->va_start + (j << PAGE_SHIFT);
991 vunmap_page_range(s, e);
992 flush = 1; 998 flush = 1;
993 999
994 if (s < start) 1000 if (s < start)
@@ -1169,6 +1175,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
1169{ 1175{
1170 vunmap_page_range(addr, addr + size); 1176 vunmap_page_range(addr, addr + size);
1171} 1177}
1178EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
1172 1179
1173/** 1180/**
1174 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB 1181 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
@@ -1309,13 +1316,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1309 -1, GFP_KERNEL, caller); 1316 -1, GFP_KERNEL, caller);
1310} 1317}
1311 1318
1312struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1313 int node, gfp_t gfp_mask)
1314{
1315 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1316 node, gfp_mask, __builtin_return_address(0));
1317}
1318
1319static struct vm_struct *find_vm_area(const void *addr) 1319static struct vm_struct *find_vm_area(const void *addr)
1320{ 1320{
1321 struct vmap_area *va; 1321 struct vmap_area *va;
@@ -1531,25 +1531,12 @@ fail:
1531 return NULL; 1531 return NULL;
1532} 1532}
1533 1533
1534void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1535{
1536 void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
1537 __builtin_return_address(0));
1538
1539 /*
1540 * A ref_count = 3 is needed because the vm_struct and vmap_area
1541 * structures allocated in the __get_vm_area_node() function contain
1542 * references to the virtual address of the vmalloc'ed block.
1543 */
1544 kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
1545
1546 return addr;
1547}
1548
1549/** 1534/**
1550 * __vmalloc_node - allocate virtually contiguous memory 1535 * __vmalloc_node_range - allocate virtually contiguous memory
1551 * @size: allocation size 1536 * @size: allocation size
1552 * @align: desired alignment 1537 * @align: desired alignment
1538 * @start: vm area range start
1539 * @end: vm area range end
1553 * @gfp_mask: flags for the page level allocator 1540 * @gfp_mask: flags for the page level allocator
1554 * @prot: protection mask for the allocated pages 1541 * @prot: protection mask for the allocated pages
1555 * @node: node to use for allocation or -1 1542 * @node: node to use for allocation or -1
@@ -1559,9 +1546,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1559 * allocator with @gfp_mask flags. Map them into contiguous 1546 * allocator with @gfp_mask flags. Map them into contiguous
1560 * kernel virtual space, using a pagetable protection of @prot. 1547 * kernel virtual space, using a pagetable protection of @prot.
1561 */ 1548 */
1562static void *__vmalloc_node(unsigned long size, unsigned long align, 1549void *__vmalloc_node_range(unsigned long size, unsigned long align,
1563 gfp_t gfp_mask, pgprot_t prot, 1550 unsigned long start, unsigned long end, gfp_t gfp_mask,
1564 int node, void *caller) 1551 pgprot_t prot, int node, void *caller)
1565{ 1552{
1566 struct vm_struct *area; 1553 struct vm_struct *area;
1567 void *addr; 1554 void *addr;
@@ -1571,8 +1558,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1571 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1558 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1572 return NULL; 1559 return NULL;
1573 1560
1574 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, 1561 area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
1575 VMALLOC_END, node, gfp_mask, caller); 1562 gfp_mask, caller);
1576 1563
1577 if (!area) 1564 if (!area)
1578 return NULL; 1565 return NULL;
@@ -1589,6 +1576,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1589 return addr; 1576 return addr;
1590} 1577}
1591 1578
1579/**
1580 * __vmalloc_node - allocate virtually contiguous memory
1581 * @size: allocation size
1582 * @align: desired alignment
1583 * @gfp_mask: flags for the page level allocator
1584 * @prot: protection mask for the allocated pages
1585 * @node: node to use for allocation or -1
1586 * @caller: caller's return address
1587 *
1588 * Allocate enough pages to cover @size from the page level
1589 * allocator with @gfp_mask flags. Map them into contiguous
1590 * kernel virtual space, using a pagetable protection of @prot.
1591 */
1592static void *__vmalloc_node(unsigned long size, unsigned long align,
1593 gfp_t gfp_mask, pgprot_t prot,
1594 int node, void *caller)
1595{
1596 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1597 gfp_mask, prot, node, caller);
1598}
1599
1592void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1600void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1593{ 1601{
1594 return __vmalloc_node(size, 1, gfp_mask, prot, -1, 1602 return __vmalloc_node(size, 1, gfp_mask, prot, -1,
@@ -2197,17 +2205,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2197 * @sizes: array containing size of each area 2205 * @sizes: array containing size of each area
2198 * @nr_vms: the number of areas to allocate 2206 * @nr_vms: the number of areas to allocate
2199 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 2207 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
2200 * @gfp_mask: allocation mask
2201 * 2208 *
2202 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 2209 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
2203 * vm_structs on success, %NULL on failure 2210 * vm_structs on success, %NULL on failure
2204 * 2211 *
2205 * Percpu allocator wants to use congruent vm areas so that it can 2212 * Percpu allocator wants to use congruent vm areas so that it can
2206 * maintain the offsets among percpu areas. This function allocates 2213 * maintain the offsets among percpu areas. This function allocates
2207 * congruent vmalloc areas for it. These areas tend to be scattered 2214 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
2208 * pretty far, distance between two areas easily going up to 2215 * be scattered pretty far, distance between two areas easily going up
2209 * gigabytes. To avoid interacting with regular vmallocs, these areas 2216 * to gigabytes. To avoid interacting with regular vmallocs, these
2210 * are allocated from top. 2217 * areas are allocated from top.
2211 * 2218 *
2212 * Despite its complicated look, this allocator is rather simple. It 2219 * Despite its complicated look, this allocator is rather simple. It
2213 * does everything top-down and scans areas from the end looking for 2220 * does everything top-down and scans areas from the end looking for
@@ -2218,7 +2225,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2218 */ 2225 */
2219struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 2226struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2220 const size_t *sizes, int nr_vms, 2227 const size_t *sizes, int nr_vms,
2221 size_t align, gfp_t gfp_mask) 2228 size_t align)
2222{ 2229{
2223 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 2230 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
2224 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 2231 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2228,8 +2235,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2228 unsigned long base, start, end, last_end; 2235 unsigned long base, start, end, last_end;
2229 bool purged = false; 2236 bool purged = false;
2230 2237
2231 gfp_mask &= GFP_RECLAIM_MASK;
2232
2233 /* verify parameters and allocate data structures */ 2238 /* verify parameters and allocate data structures */
2234 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); 2239 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
2235 for (last_area = 0, area = 0; area < nr_vms; area++) { 2240 for (last_area = 0, area = 0; area < nr_vms; area++) {
@@ -2262,14 +2267,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2262 return NULL; 2267 return NULL;
2263 } 2268 }
2264 2269
2265 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); 2270 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
2266 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); 2271 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
2267 if (!vas || !vms) 2272 if (!vas || !vms)
2268 goto err_free; 2273 goto err_free;
2269 2274
2270 for (area = 0; area < nr_vms; area++) { 2275 for (area = 0; area < nr_vms; area++) {
2271 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); 2276 vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
2272 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); 2277 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
2273 if (!vas[area] || !vms[area]) 2278 if (!vas[area] || !vms[area])
2274 goto err_free; 2279 goto err_free;
2275 } 2280 }
@@ -2450,13 +2455,8 @@ static int s_show(struct seq_file *m, void *p)
2450 seq_printf(m, "0x%p-0x%p %7ld", 2455 seq_printf(m, "0x%p-0x%p %7ld",
2451 v->addr, v->addr + v->size, v->size); 2456 v->addr, v->addr + v->size, v->size);
2452 2457
2453 if (v->caller) { 2458 if (v->caller)
2454 char buff[KSYM_SYMBOL_LEN]; 2459 seq_printf(m, " %pS", v->caller);
2455
2456 seq_putc(m, ' ');
2457 sprint_symbol(buff, (unsigned long)v->caller);
2458 seq_puts(m, buff);
2459 }
2460 2460
2461 if (v->nr_pages) 2461 if (v->nr_pages)
2462 seq_printf(m, " pages=%d", v->nr_pages); 2462 seq_printf(m, " pages=%d", v->nr_pages);
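__vmalloc_node() above is reduced to a thin wrapper around the new __vmalloc_node_range(), which takes the virtual [start, end) range as explicit parameters instead of hard-coding VMALLOC_START/VMALLOC_END. The same refactor shape in a standalone toy; alloc_in_range()/alloc_default() and the trivial placement logic are illustrative only.

#include <stdio.h>

#define DEFAULT_START	0x1000UL
#define DEFAULT_END	0x10000UL

/* general form: the caller supplies the address range to allocate from */
static unsigned long alloc_in_range(unsigned long size,
				    unsigned long start, unsigned long end)
{
	if (size == 0 || size > end - start)
		return 0;
	return start;	/* real code would search the range for free space */
}

/* old entry point kept as a wrapper that passes the default range */
static unsigned long alloc_default(unsigned long size)
{
	return alloc_in_range(size, DEFAULT_START, DEFAULT_END);
}

int main(void)
{
	printf("default: 0x%lx\n", alloc_default(0x100));
	printf("custom:  0x%lx\n", alloc_in_range(0x100, 0x20000UL, 0x30000UL));
	return 0;
}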
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d31d7ce52c0e..17497d0cd8b9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
32#include <linux/topology.h> 32#include <linux/topology.h>
33#include <linux/cpu.h> 33#include <linux/cpu.h>
34#include <linux/cpuset.h> 34#include <linux/cpuset.h>
35#include <linux/compaction.h>
35#include <linux/notifier.h> 36#include <linux/notifier.h>
36#include <linux/rwsem.h> 37#include <linux/rwsem.h>
37#include <linux/delay.h> 38#include <linux/delay.h>
@@ -51,11 +52,23 @@
51#define CREATE_TRACE_POINTS 52#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h> 53#include <trace/events/vmscan.h>
53 54
54enum lumpy_mode { 55/*
55 LUMPY_MODE_NONE, 56 * reclaim_mode determines how the inactive list is shrunk
56 LUMPY_MODE_ASYNC, 57 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
57 LUMPY_MODE_SYNC, 58 * RECLAIM_MODE_ASYNC: Do not block
58}; 59 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
60 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
61 * page from the LRU and reclaim all pages within a
62 * naturally aligned range
63 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
64 * order-0 pages and then compact the zone
65 */
66typedef unsigned __bitwise__ reclaim_mode_t;
67#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
68#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
69#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
70#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
71#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
59 72
60struct scan_control { 73struct scan_control {
61 /* Incremented by the number of inactive pages that were scanned */ 74 /* Incremented by the number of inactive pages that were scanned */
@@ -88,7 +101,7 @@ struct scan_control {
88 * Intend to reclaim enough continuous memory rather than reclaim 101 * Intend to reclaim enough continuous memory rather than reclaim
89 * enough amount of memory. i.e, mode for high order allocation. 102 * enough amount of memory. i.e, mode for high order allocation.
90 */ 103 */
91 enum lumpy_mode lumpy_reclaim_mode; 104 reclaim_mode_t reclaim_mode;
92 105
93 /* Which cgroup do we reclaim from */ 106 /* Which cgroup do we reclaim from */
94 struct mem_cgroup *mem_cgroup; 107 struct mem_cgroup *mem_cgroup;
@@ -271,34 +284,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
271 return ret; 284 return ret;
272} 285}
273 286
274static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, 287static void set_reclaim_mode(int priority, struct scan_control *sc,
275 bool sync) 288 bool sync)
276{ 289{
277 enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; 290 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
278 291
279 /* 292 /*
280 * Some reclaim have alredy been failed. No worth to try synchronous 293 * Initially assume we are entering either lumpy reclaim or
 281 * lumpy reclaim. 294 * reclaim/compaction. Depending on the order, we will either set the
295 * sync mode or just reclaim order-0 pages later.
282 */ 296 */
283 if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) 297 if (COMPACTION_BUILD)
284 return; 298 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
299 else
300 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
285 301
286 /* 302 /*
287 * If we need a large contiguous chunk of memory, or have 303 * Avoid using lumpy reclaim or reclaim/compaction if possible by
 288 * trouble getting a small set of contiguous pages, we 304 * restricting when it's set to either costly allocations or when
289 * will reclaim both active and inactive pages. 305 * under memory pressure
290 */ 306 */
291 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 307 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
292 sc->lumpy_reclaim_mode = mode; 308 sc->reclaim_mode |= syncmode;
293 else if (sc->order && priority < DEF_PRIORITY - 2) 309 else if (sc->order && priority < DEF_PRIORITY - 2)
294 sc->lumpy_reclaim_mode = mode; 310 sc->reclaim_mode |= syncmode;
295 else 311 else
296 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; 312 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
297} 313}
298 314
299static void disable_lumpy_reclaim_mode(struct scan_control *sc) 315static void reset_reclaim_mode(struct scan_control *sc)
300{ 316{
301 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; 317 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
302} 318}
303 319
304static inline int is_page_cache_freeable(struct page *page) 320static inline int is_page_cache_freeable(struct page *page)
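
set_reclaim_mode() above picks reclaim/compaction or lumpy reclaim based on the build, and only adds the sync bit for costly orders or once reclaim has already failed at a few priority levels. A standalone restatement of that decision, a sketch only: the constants 3 and 12 stand in for PAGE_ALLOC_COSTLY_ORDER and DEF_PRIORITY so it compiles on its own.

#include <stdbool.h>
#include <stdio.h>

#define COSTLY_ORDER  3                  /* stand-in for PAGE_ALLOC_COSTLY_ORDER */
#define DEF_PRIORITY  12

enum {
        MODE_SINGLE = 0x01, MODE_ASYNC = 0x02, MODE_SYNC = 0x04,
        MODE_LUMPY = 0x08, MODE_COMPACTION = 0x10,
};

static unsigned pick_mode(bool compaction_build, int order, int priority, bool sync)
{
        unsigned mode = compaction_build ? MODE_COMPACTION : MODE_LUMPY;
        unsigned syncmode = sync ? MODE_SYNC : MODE_ASYNC;

        if (order > COSTLY_ORDER)
                mode |= syncmode;                /* costly allocation */
        else if (order && priority < DEF_PRIORITY - 2)
                mode |= syncmode;                /* smaller order, but under pressure */
        else
                mode = MODE_SINGLE | MODE_ASYNC; /* plain order-0 reclaim */
        return mode;
}

int main(void)
{
        printf("order-9, first pass: %#x\n", pick_mode(true, 9, DEF_PRIORITY, false));
        printf("order-2, deep pass:  %#x\n", pick_mode(true, 2, DEF_PRIORITY - 4, true));
        printf("order-0:             %#x\n", pick_mode(true, 0, DEF_PRIORITY, false));
        return 0;
}
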
@@ -429,7 +445,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
429 * first attempt to free a range of pages fails. 445 * first attempt to free a range of pages fails.
430 */ 446 */
431 if (PageWriteback(page) && 447 if (PageWriteback(page) &&
432 sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) 448 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
433 wait_on_page_writeback(page); 449 wait_on_page_writeback(page);
434 450
435 if (!PageWriteback(page)) { 451 if (!PageWriteback(page)) {
@@ -437,7 +453,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
437 ClearPageReclaim(page); 453 ClearPageReclaim(page);
438 } 454 }
439 trace_mm_vmscan_writepage(page, 455 trace_mm_vmscan_writepage(page,
440 trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); 456 trace_reclaim_flags(page, sc->reclaim_mode));
441 inc_zone_page_state(page, NR_VMSCAN_WRITE); 457 inc_zone_page_state(page, NR_VMSCAN_WRITE);
442 return PAGE_SUCCESS; 458 return PAGE_SUCCESS;
443 } 459 }
@@ -494,9 +510,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
494 spin_unlock_irq(&mapping->tree_lock); 510 spin_unlock_irq(&mapping->tree_lock);
495 swapcache_free(swap, page); 511 swapcache_free(swap, page);
496 } else { 512 } else {
513 void (*freepage)(struct page *);
514
515 freepage = mapping->a_ops->freepage;
516
497 __remove_from_page_cache(page); 517 __remove_from_page_cache(page);
498 spin_unlock_irq(&mapping->tree_lock); 518 spin_unlock_irq(&mapping->tree_lock);
499 mem_cgroup_uncharge_cache_page(page); 519 mem_cgroup_uncharge_cache_page(page);
520
521 if (freepage != NULL)
522 freepage(page);
500 } 523 }
501 524
502 return 1; 525 return 1;
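
The __remove_mapping() change above samples mapping->a_ops->freepage while the mapping is still pinned under tree_lock and only invokes it after the page is gone and the lock has been dropped. A hedged userspace analogue of that capture-then-call pattern; every name here is invented and a pthread mutex stands in for the spinlock:

#include <pthread.h>
#include <stdio.h>

struct ops { void (*freepage)(void *); };
struct mapping {
        pthread_mutex_t lock;
        struct ops *a_ops;
};

static void my_free(void *page) { printf("releasing %p\n", page); }

static void remove_page(struct mapping *m, void *page)
{
        void (*freepage)(void *);

        pthread_mutex_lock(&m->lock);
        freepage = m->a_ops->freepage;  /* capture while still locked/valid */
        /* ... detach the page from the mapping here ... */
        pthread_mutex_unlock(&m->lock);

        if (freepage)
                freepage(page);         /* heavy work happens outside the lock */
}

int main(void)
{
        struct ops o = { .freepage = my_free };
        struct mapping m = { .lock = PTHREAD_MUTEX_INITIALIZER, .a_ops = &o };
        int dummy;

        remove_page(&m, &dummy);
        return 0;
}
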
@@ -615,7 +638,7 @@ static enum page_references page_check_references(struct page *page,
615 referenced_page = TestClearPageReferenced(page); 638 referenced_page = TestClearPageReferenced(page);
616 639
617 /* Lumpy reclaim - ignore references */ 640 /* Lumpy reclaim - ignore references */
618 if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) 641 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
619 return PAGEREF_RECLAIM; 642 return PAGEREF_RECLAIM;
620 643
621 /* 644 /*
@@ -732,7 +755,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
732 * for any page for which writeback has already 755 * for any page for which writeback has already
733 * started. 756 * started.
734 */ 757 */
735 if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && 758 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
736 may_enter_fs) 759 may_enter_fs)
737 wait_on_page_writeback(page); 760 wait_on_page_writeback(page);
738 else { 761 else {
@@ -888,7 +911,7 @@ cull_mlocked:
888 try_to_free_swap(page); 911 try_to_free_swap(page);
889 unlock_page(page); 912 unlock_page(page);
890 putback_lru_page(page); 913 putback_lru_page(page);
891 disable_lumpy_reclaim_mode(sc); 914 reset_reclaim_mode(sc);
892 continue; 915 continue;
893 916
894activate_locked: 917activate_locked:
@@ -901,7 +924,7 @@ activate_locked:
901keep_locked: 924keep_locked:
902 unlock_page(page); 925 unlock_page(page);
903keep: 926keep:
904 disable_lumpy_reclaim_mode(sc); 927 reset_reclaim_mode(sc);
905keep_lumpy: 928keep_lumpy:
906 list_add(&page->lru, &ret_pages); 929 list_add(&page->lru, &ret_pages);
907 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 930 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
@@ -1021,7 +1044,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1021 case 0: 1044 case 0:
1022 list_move(&page->lru, dst); 1045 list_move(&page->lru, dst);
1023 mem_cgroup_del_lru(page); 1046 mem_cgroup_del_lru(page);
1024 nr_taken++; 1047 nr_taken += hpage_nr_pages(page);
1025 break; 1048 break;
1026 1049
1027 case -EBUSY: 1050 case -EBUSY:
@@ -1079,7 +1102,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1079 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1102 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1080 list_move(&cursor_page->lru, dst); 1103 list_move(&cursor_page->lru, dst);
1081 mem_cgroup_del_lru(cursor_page); 1104 mem_cgroup_del_lru(cursor_page);
1082 nr_taken++; 1105 nr_taken += hpage_nr_pages(page);
1083 nr_lumpy_taken++; 1106 nr_lumpy_taken++;
1084 if (PageDirty(cursor_page)) 1107 if (PageDirty(cursor_page))
1085 nr_lumpy_dirty++; 1108 nr_lumpy_dirty++;
@@ -1134,14 +1157,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1134 struct page *page; 1157 struct page *page;
1135 1158
1136 list_for_each_entry(page, page_list, lru) { 1159 list_for_each_entry(page, page_list, lru) {
1160 int numpages = hpage_nr_pages(page);
1137 lru = page_lru_base_type(page); 1161 lru = page_lru_base_type(page);
1138 if (PageActive(page)) { 1162 if (PageActive(page)) {
1139 lru += LRU_ACTIVE; 1163 lru += LRU_ACTIVE;
1140 ClearPageActive(page); 1164 ClearPageActive(page);
1141 nr_active++; 1165 nr_active += numpages;
1142 } 1166 }
1143 if (count) 1167 if (count)
1144 count[lru]++; 1168 count[lru] += numpages;
1145 } 1169 }
1146 1170
1147 return nr_active; 1171 return nr_active;
@@ -1251,7 +1275,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1251 add_page_to_lru_list(zone, page, lru); 1275 add_page_to_lru_list(zone, page, lru);
1252 if (is_active_lru(lru)) { 1276 if (is_active_lru(lru)) {
1253 int file = is_file_lru(lru); 1277 int file = is_file_lru(lru);
1254 reclaim_stat->recent_rotated[file]++; 1278 int numpages = hpage_nr_pages(page);
1279 reclaim_stat->recent_rotated[file] += numpages;
1255 } 1280 }
1256 if (!pagevec_add(&pvec, page)) { 1281 if (!pagevec_add(&pvec, page)) {
1257 spin_unlock_irq(&zone->lru_lock); 1282 spin_unlock_irq(&zone->lru_lock);
@@ -1317,7 +1342,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1317 return false; 1342 return false;
1318 1343
1319 /* Only stall on lumpy reclaim */ 1344 /* Only stall on lumpy reclaim */
1320 if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) 1345 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1321 return false; 1346 return false;
1322 1347
1323 /* If we have relaimed everything on the isolated list, no stall */ 1348 /* If we have relaimed everything on the isolated list, no stall */
@@ -1361,15 +1386,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1361 return SWAP_CLUSTER_MAX; 1386 return SWAP_CLUSTER_MAX;
1362 } 1387 }
1363 1388
1364 set_lumpy_reclaim_mode(priority, sc, false); 1389 set_reclaim_mode(priority, sc, false);
1365 lru_add_drain(); 1390 lru_add_drain();
1366 spin_lock_irq(&zone->lru_lock); 1391 spin_lock_irq(&zone->lru_lock);
1367 1392
1368 if (scanning_global_lru(sc)) { 1393 if (scanning_global_lru(sc)) {
1369 nr_taken = isolate_pages_global(nr_to_scan, 1394 nr_taken = isolate_pages_global(nr_to_scan,
1370 &page_list, &nr_scanned, sc->order, 1395 &page_list, &nr_scanned, sc->order,
1371 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? 1396 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1372 ISOLATE_INACTIVE : ISOLATE_BOTH, 1397 ISOLATE_BOTH : ISOLATE_INACTIVE,
1373 zone, 0, file); 1398 zone, 0, file);
1374 zone->pages_scanned += nr_scanned; 1399 zone->pages_scanned += nr_scanned;
1375 if (current_is_kswapd()) 1400 if (current_is_kswapd())
@@ -1381,8 +1406,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1381 } else { 1406 } else {
1382 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1407 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1383 &page_list, &nr_scanned, sc->order, 1408 &page_list, &nr_scanned, sc->order,
1384 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? 1409 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1385 ISOLATE_INACTIVE : ISOLATE_BOTH, 1410 ISOLATE_BOTH : ISOLATE_INACTIVE,
1386 zone, sc->mem_cgroup, 1411 zone, sc->mem_cgroup,
1387 0, file); 1412 0, file);
1388 /* 1413 /*
@@ -1404,7 +1429,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1404 1429
1405 /* Check if we should syncronously wait for writeback */ 1430 /* Check if we should syncronously wait for writeback */
1406 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1431 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1407 set_lumpy_reclaim_mode(priority, sc, true); 1432 set_reclaim_mode(priority, sc, true);
1408 nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1433 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1409 } 1434 }
1410 1435
@@ -1419,7 +1444,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1419 zone_idx(zone), 1444 zone_idx(zone),
1420 nr_scanned, nr_reclaimed, 1445 nr_scanned, nr_reclaimed,
1421 priority, 1446 priority,
1422 trace_shrink_flags(file, sc->lumpy_reclaim_mode)); 1447 trace_shrink_flags(file, sc->reclaim_mode));
1423 return nr_reclaimed; 1448 return nr_reclaimed;
1424} 1449}
1425 1450
@@ -1459,7 +1484,7 @@ static void move_active_pages_to_lru(struct zone *zone,
1459 1484
1460 list_move(&page->lru, &zone->lru[lru].list); 1485 list_move(&page->lru, &zone->lru[lru].list);
1461 mem_cgroup_add_lru_list(page, lru); 1486 mem_cgroup_add_lru_list(page, lru);
1462 pgmoved++; 1487 pgmoved += hpage_nr_pages(page);
1463 1488
1464 if (!pagevec_add(&pvec, page) || list_empty(list)) { 1489 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1465 spin_unlock_irq(&zone->lru_lock); 1490 spin_unlock_irq(&zone->lru_lock);
@@ -1527,7 +1552,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1527 } 1552 }
1528 1553
1529 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1554 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1530 nr_rotated++; 1555 nr_rotated += hpage_nr_pages(page);
1531 /* 1556 /*
1532 * Identify referenced, file-backed active pages and 1557 * Identify referenced, file-backed active pages and
1533 * give them one more trip around the active list. So 1558 * give them one more trip around the active list. So
@@ -1798,6 +1823,57 @@ out:
1798} 1823}
1799 1824
1800/* 1825/*
1826 * Reclaim/compaction depends on a number of pages being freed. To avoid
1827 * disruption to the system, a small number of order-0 pages continue to be
1828 * rotated and reclaimed in the normal fashion. However, by the time we get
1829 * back to the allocator and call try_to_compact_zone(), we ensure that
1830 * there are enough free pages for it to be likely successful
1831 */
1832static inline bool should_continue_reclaim(struct zone *zone,
1833 unsigned long nr_reclaimed,
1834 unsigned long nr_scanned,
1835 struct scan_control *sc)
1836{
1837 unsigned long pages_for_compaction;
1838 unsigned long inactive_lru_pages;
1839
1840 /* If not in reclaim/compaction mode, stop */
1841 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1842 return false;
1843
1844 /*
1845 * If we failed to reclaim and have scanned the full list, stop.
1846 * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
1847 * faster but obviously would be less likely to succeed
1848 * allocation. If this is desirable, use GFP_REPEAT to decide
1849 * if both reclaimed and scanned should be checked or just
1850 * reclaimed
1851 */
1852 if (!nr_reclaimed && !nr_scanned)
1853 return false;
1854
1855 /*
1856 * If we have not reclaimed enough pages for compaction and the
1857 * inactive lists are large enough, continue reclaiming
1858 */
1859 pages_for_compaction = (2UL << sc->order);
1860 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
1861 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1862 if (sc->nr_reclaimed < pages_for_compaction &&
1863 inactive_lru_pages > pages_for_compaction)
1864 return true;
1865
1866 /* If compaction would go ahead or the allocation would succeed, stop */
1867 switch (compaction_suitable(zone, sc->order)) {
1868 case COMPACT_PARTIAL:
1869 case COMPACT_CONTINUE:
1870 return false;
1871 default:
1872 return true;
1873 }
1874}
1875
1876/*
1801 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1877 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1802 */ 1878 */
1803static void shrink_zone(int priority, struct zone *zone, 1879static void shrink_zone(int priority, struct zone *zone,
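
To put a number on the pages_for_compaction target in should_continue_reclaim() above: assuming 4KiB base pages, an order-9 request (a 2MiB transparent huge page on x86) needs 2 << 9 = 1024 order-0 pages, roughly 4MiB, reclaimed before the loop is willing to hand the zone over to compaction. A trivial sketch of that arithmetic:

#include <stdio.h>

int main(void)
{
        unsigned int order = 9;                              /* 2MiB THP on x86 */
        unsigned long pages_for_compaction = 2UL << order;   /* 1024 pages */

        printf("order-%u target: %lu pages (%lu KiB)\n",
               order, pages_for_compaction, pages_for_compaction * 4);
        /* Reclaim keeps looping while fewer pages than this have been
         * reclaimed and the inactive LRU lists still exceed this size. */
        return 0;
}
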
@@ -1806,9 +1882,12 @@ static void shrink_zone(int priority, struct zone *zone,
1806 unsigned long nr[NR_LRU_LISTS]; 1882 unsigned long nr[NR_LRU_LISTS];
1807 unsigned long nr_to_scan; 1883 unsigned long nr_to_scan;
1808 enum lru_list l; 1884 enum lru_list l;
1809 unsigned long nr_reclaimed = sc->nr_reclaimed; 1885 unsigned long nr_reclaimed, nr_scanned;
1810 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1886 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1811 1887
1888restart:
1889 nr_reclaimed = 0;
1890 nr_scanned = sc->nr_scanned;
1812 get_scan_count(zone, sc, nr, priority); 1891 get_scan_count(zone, sc, nr, priority);
1813 1892
1814 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1893 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1834,8 +1913,7 @@ static void shrink_zone(int priority, struct zone *zone,
1834 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 1913 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1835 break; 1914 break;
1836 } 1915 }
1837 1916 sc->nr_reclaimed += nr_reclaimed;
1838 sc->nr_reclaimed = nr_reclaimed;
1839 1917
1840 /* 1918 /*
1841 * Even if we did not try to evict anon pages at all, we want to 1919 * Even if we did not try to evict anon pages at all, we want to
@@ -1844,6 +1922,11 @@ static void shrink_zone(int priority, struct zone *zone,
1844 if (inactive_anon_is_low(zone, sc)) 1922 if (inactive_anon_is_low(zone, sc))
1845 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1923 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1846 1924
1925 /* reclaim/compaction might need reclaim to continue */
1926 if (should_continue_reclaim(zone, nr_reclaimed,
1927 sc->nr_scanned - nr_scanned, sc))
1928 goto restart;
1929
1847 throttle_vm_writeout(sc->gfp_mask); 1930 throttle_vm_writeout(sc->gfp_mask);
1848} 1931}
1849 1932
@@ -2000,7 +2083,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2000 struct zone *preferred_zone; 2083 struct zone *preferred_zone;
2001 2084
2002 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), 2085 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2003 NULL, &preferred_zone); 2086 &cpuset_current_mems_allowed,
2087 &preferred_zone);
2004 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); 2088 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2005 } 2089 }
2006 } 2090 }
@@ -2117,38 +2201,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2117} 2201}
2118#endif 2202#endif
2119 2203
2204/*
2205 * pgdat_balanced is used when checking if a node is balanced for high-order
2206 * allocations. Only zones that meet watermarks and are in a zone allowed
2207 * by the callers classzone_idx are added to balanced_pages. The total of
2208 * balanced pages must be at least 25% of the zones allowed by classzone_idx
2209 * for the node to be considered balanced. Forcing all zones to be balanced
2210 * for high orders can cause excessive reclaim when there are imbalanced zones.
2211 * The choice of 25% is due to
2212 * o a 16M DMA zone that is balanced will not balance a zone on any
2213 * reasonable sized machine
2214 * o On all other machines, the top zone must be at least a reasonable
 2215 * percentage of the middle zones. For example, on 32-bit x86, highmem
 2216 * would need to be at least 256M for it to balance a whole node.
2217 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2218 * to balance a node on its own. These seemed like reasonable ratios.
2219 */
2220static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2221 int classzone_idx)
2222{
2223 unsigned long present_pages = 0;
2224 int i;
2225
2226 for (i = 0; i <= classzone_idx; i++)
2227 present_pages += pgdat->node_zones[i].present_pages;
2228
2229 return balanced_pages > (present_pages >> 2);
2230}
2231
2120/* is kswapd sleeping prematurely? */ 2232/* is kswapd sleeping prematurely? */
2121static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) 2233static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2234 int classzone_idx)
2122{ 2235{
2123 int i; 2236 int i;
2237 unsigned long balanced = 0;
2238 bool all_zones_ok = true;
2124 2239
2125 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2240 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2126 if (remaining) 2241 if (remaining)
2127 return 1; 2242 return true;
2128 2243
2129 /* If after HZ/10, a zone is below the high mark, it's premature */ 2244 /* Check the watermark levels */
2130 for (i = 0; i < pgdat->nr_zones; i++) { 2245 for (i = 0; i < pgdat->nr_zones; i++) {
2131 struct zone *zone = pgdat->node_zones + i; 2246 struct zone *zone = pgdat->node_zones + i;
2132 2247
2133 if (!populated_zone(zone)) 2248 if (!populated_zone(zone))
2134 continue; 2249 continue;
2135 2250
2136 if (zone->all_unreclaimable) 2251 /*
2252 * balance_pgdat() skips over all_unreclaimable after
2253 * DEF_PRIORITY. Effectively, it considers them balanced so
2254 * they must be considered balanced here as well if kswapd
2255 * is to sleep
2256 */
2257 if (zone->all_unreclaimable) {
2258 balanced += zone->present_pages;
2137 continue; 2259 continue;
2260 }
2138 2261
2139 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 2262 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2140 0, 0)) 2263 classzone_idx, 0))
2141 return 1; 2264 all_zones_ok = false;
2265 else
2266 balanced += zone->present_pages;
2142 } 2267 }
2143 2268
2144 return 0; 2269 /*
2270 * For high-order requests, the balanced zones must contain at least
 2271 * 25% of the node's pages for kswapd to sleep. For order-0, all zones
2272 * must be balanced
2273 */
2274 if (order)
2275 return pgdat_balanced(pgdat, balanced, classzone_idx);
2276 else
2277 return !all_zones_ok;
2145} 2278}
2146 2279
2147/* 2280/*
2148 * For kswapd, balance_pgdat() will work across all this node's zones until 2281 * For kswapd, balance_pgdat() will work across all this node's zones until
2149 * they are all at high_wmark_pages(zone). 2282 * they are all at high_wmark_pages(zone).
2150 * 2283 *
2151 * Returns the number of pages which were actually freed. 2284 * Returns the final order kswapd was reclaiming at
2152 * 2285 *
2153 * There is special handling here for zones which are full of pinned pages. 2286 * There is special handling here for zones which are full of pinned pages.
2154 * This can happen if the pages are all mlocked, or if they are all used by 2287 * This can happen if the pages are all mlocked, or if they are all used by
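
pgdat_balanced() above treats a node as balanced for high-order reclaim once the zones that meet their watermarks hold more than a quarter of the pages usable for the caller's classzone_idx, rather than demanding that every zone be balanced. A worked example with invented zone sizes for a 4GiB x86-64 layout (4KiB pages):

#include <stdio.h>

int main(void)
{
        /* present pages per zone up to the caller's classzone_idx */
        unsigned long dma    = 4096;     /* ~16MiB */
        unsigned long dma32  = 786432;   /* ~3GiB  */
        unsigned long normal = 262144;   /* ~1GiB  */
        unsigned long present = dma + dma32 + normal;

        /* suppose only DMA32 currently meets its high watermark */
        unsigned long balanced = dma32;

        printf("balanced=%lu threshold=%lu -> node is %s\n",
               balanced, present >> 2,
               balanced > (present >> 2) ? "balanced" : "not balanced");
        return 0;
}
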
@@ -2165,11 +2298,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
2165 * interoperates with the page allocator fallback scheme to ensure that aging 2298 * interoperates with the page allocator fallback scheme to ensure that aging
2166 * of pages is balanced across the zones. 2299 * of pages is balanced across the zones.
2167 */ 2300 */
2168static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 2301static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2302 int *classzone_idx)
2169{ 2303{
2170 int all_zones_ok; 2304 int all_zones_ok;
2305 unsigned long balanced;
2171 int priority; 2306 int priority;
2172 int i; 2307 int i;
2308 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2173 unsigned long total_scanned; 2309 unsigned long total_scanned;
2174 struct reclaim_state *reclaim_state = current->reclaim_state; 2310 struct reclaim_state *reclaim_state = current->reclaim_state;
2175 struct scan_control sc = { 2311 struct scan_control sc = {
@@ -2192,7 +2328,6 @@ loop_again:
2192 count_vm_event(PAGEOUTRUN); 2328 count_vm_event(PAGEOUTRUN);
2193 2329
2194 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2330 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2195 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2196 unsigned long lru_pages = 0; 2331 unsigned long lru_pages = 0;
2197 int has_under_min_watermark_zone = 0; 2332 int has_under_min_watermark_zone = 0;
2198 2333
@@ -2201,6 +2336,7 @@ loop_again:
2201 disable_swap_token(); 2336 disable_swap_token();
2202 2337
2203 all_zones_ok = 1; 2338 all_zones_ok = 1;
2339 balanced = 0;
2204 2340
2205 /* 2341 /*
2206 * Scan in the highmem->dma direction for the highest 2342 * Scan in the highmem->dma direction for the highest
@@ -2223,9 +2359,10 @@ loop_again:
2223 shrink_active_list(SWAP_CLUSTER_MAX, zone, 2359 shrink_active_list(SWAP_CLUSTER_MAX, zone,
2224 &sc, priority, 0); 2360 &sc, priority, 0);
2225 2361
2226 if (!zone_watermark_ok(zone, order, 2362 if (!zone_watermark_ok_safe(zone, order,
2227 high_wmark_pages(zone), 0, 0)) { 2363 high_wmark_pages(zone), 0, 0)) {
2228 end_zone = i; 2364 end_zone = i;
2365 *classzone_idx = i;
2229 break; 2366 break;
2230 } 2367 }
2231 } 2368 }
@@ -2248,6 +2385,7 @@ loop_again:
2248 * cause too much scanning of the lower zones. 2385 * cause too much scanning of the lower zones.
2249 */ 2386 */
2250 for (i = 0; i <= end_zone; i++) { 2387 for (i = 0; i <= end_zone; i++) {
2388 int compaction;
2251 struct zone *zone = pgdat->node_zones + i; 2389 struct zone *zone = pgdat->node_zones + i;
2252 int nr_slab; 2390 int nr_slab;
2253 2391
@@ -2269,7 +2407,7 @@ loop_again:
2269 * We put equal pressure on every zone, unless one 2407 * We put equal pressure on every zone, unless one
2270 * zone has way too many pages free already. 2408 * zone has way too many pages free already.
2271 */ 2409 */
2272 if (!zone_watermark_ok(zone, order, 2410 if (!zone_watermark_ok_safe(zone, order,
2273 8*high_wmark_pages(zone), end_zone, 0)) 2411 8*high_wmark_pages(zone), end_zone, 0))
2274 shrink_zone(priority, zone, &sc); 2412 shrink_zone(priority, zone, &sc);
2275 reclaim_state->reclaimed_slab = 0; 2413 reclaim_state->reclaimed_slab = 0;
@@ -2277,9 +2415,26 @@ loop_again:
2277 lru_pages); 2415 lru_pages);
2278 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2416 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2279 total_scanned += sc.nr_scanned; 2417 total_scanned += sc.nr_scanned;
2418
2419 compaction = 0;
2420 if (order &&
2421 zone_watermark_ok(zone, 0,
2422 high_wmark_pages(zone),
2423 end_zone, 0) &&
2424 !zone_watermark_ok(zone, order,
2425 high_wmark_pages(zone),
2426 end_zone, 0)) {
2427 compact_zone_order(zone,
2428 order,
2429 sc.gfp_mask, false,
2430 COMPACT_MODE_KSWAPD);
2431 compaction = 1;
2432 }
2433
2280 if (zone->all_unreclaimable) 2434 if (zone->all_unreclaimable)
2281 continue; 2435 continue;
2282 if (nr_slab == 0 && !zone_reclaimable(zone)) 2436 if (!compaction && nr_slab == 0 &&
2437 !zone_reclaimable(zone))
2283 zone->all_unreclaimable = 1; 2438 zone->all_unreclaimable = 1;
2284 /* 2439 /*
2285 * If we've done a decent amount of scanning and 2440 * If we've done a decent amount of scanning and
@@ -2290,7 +2445,7 @@ loop_again:
2290 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2445 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2291 sc.may_writepage = 1; 2446 sc.may_writepage = 1;
2292 2447
2293 if (!zone_watermark_ok(zone, order, 2448 if (!zone_watermark_ok_safe(zone, order,
2294 high_wmark_pages(zone), end_zone, 0)) { 2449 high_wmark_pages(zone), end_zone, 0)) {
2295 all_zones_ok = 0; 2450 all_zones_ok = 0;
2296 /* 2451 /*
@@ -2298,7 +2453,7 @@ loop_again:
2298 * means that we have a GFP_ATOMIC allocation 2453 * means that we have a GFP_ATOMIC allocation
2299 * failure risk. Hurry up! 2454 * failure risk. Hurry up!
2300 */ 2455 */
2301 if (!zone_watermark_ok(zone, order, 2456 if (!zone_watermark_ok_safe(zone, order,
2302 min_wmark_pages(zone), end_zone, 0)) 2457 min_wmark_pages(zone), end_zone, 0))
2303 has_under_min_watermark_zone = 1; 2458 has_under_min_watermark_zone = 1;
2304 } else { 2459 } else {
@@ -2310,10 +2465,12 @@ loop_again:
2310 * spectulatively avoid congestion waits 2465 * spectulatively avoid congestion waits
2311 */ 2466 */
2312 zone_clear_flag(zone, ZONE_CONGESTED); 2467 zone_clear_flag(zone, ZONE_CONGESTED);
2468 if (i <= *classzone_idx)
2469 balanced += zone->present_pages;
2313 } 2470 }
2314 2471
2315 } 2472 }
2316 if (all_zones_ok) 2473 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2317 break; /* kswapd: all done */ 2474 break; /* kswapd: all done */
2318 /* 2475 /*
2319 * OK, kswapd is getting into trouble. Take a nap, then take 2476 * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2336,7 +2493,13 @@ loop_again:
2336 break; 2493 break;
2337 } 2494 }
2338out: 2495out:
2339 if (!all_zones_ok) { 2496
2497 /*
2498 * order-0: All zones must meet high watermark for a balanced node
2499 * high-order: Balanced zones must make up at least 25% of the node
2500 * for the node to be balanced
2501 */
2502 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2340 cond_resched(); 2503 cond_resched();
2341 2504
2342 try_to_freeze(); 2505 try_to_freeze();
@@ -2361,7 +2524,88 @@ out:
2361 goto loop_again; 2524 goto loop_again;
2362 } 2525 }
2363 2526
2364 return sc.nr_reclaimed; 2527 /*
2528 * If kswapd was reclaiming at a higher order, it has the option of
2529 * sleeping without all zones being balanced. Before it does, it must
2530 * ensure that the watermarks for order-0 on *all* zones are met and
2531 * that the congestion flags are cleared. The congestion flag must
2532 * be cleared as kswapd is the only mechanism that clears the flag
2533 * and it is potentially going to sleep here.
2534 */
2535 if (order) {
2536 for (i = 0; i <= end_zone; i++) {
2537 struct zone *zone = pgdat->node_zones + i;
2538
2539 if (!populated_zone(zone))
2540 continue;
2541
2542 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2543 continue;
2544
2545 /* Confirm the zone is balanced for order-0 */
2546 if (!zone_watermark_ok(zone, 0,
2547 high_wmark_pages(zone), 0, 0)) {
2548 order = sc.order = 0;
2549 goto loop_again;
2550 }
2551
2552 /* If balanced, clear the congested flag */
2553 zone_clear_flag(zone, ZONE_CONGESTED);
2554 }
2555 }
2556
2557 /*
2558 * Return the order we were reclaiming at so sleeping_prematurely()
2559 * makes a decision on the order we were last reclaiming at. However,
2560 * if another caller entered the allocator slow path while kswapd
2561 * was awake, order will remain at the higher level
2562 */
2563 *classzone_idx = end_zone;
2564 return order;
2565}
2566
2567static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2568{
2569 long remaining = 0;
2570 DEFINE_WAIT(wait);
2571
2572 if (freezing(current) || kthread_should_stop())
2573 return;
2574
2575 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2576
2577 /* Try to sleep for a short interval */
2578 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2579 remaining = schedule_timeout(HZ/10);
2580 finish_wait(&pgdat->kswapd_wait, &wait);
2581 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2582 }
2583
2584 /*
2585 * After a short sleep, check if it was a premature sleep. If not, then
2586 * go fully to sleep until explicitly woken up.
2587 */
2588 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2589 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2590
2591 /*
2592 * vmstat counters are not perfectly accurate and the estimated
2593 * value for counters such as NR_FREE_PAGES can deviate from the
2594 * true value by nr_online_cpus * threshold. To avoid the zone
2595 * watermarks being breached while under pressure, we reduce the
2596 * per-cpu vmstat threshold while kswapd is awake and restore
2597 * them before going back to sleep.
2598 */
2599 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2600 schedule();
2601 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2602 } else {
2603 if (remaining)
2604 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2605 else
2606 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2607 }
2608 finish_wait(&pgdat->kswapd_wait, &wait);
2365} 2609}
2366 2610
2367/* 2611/*
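
kswapd_try_to_sleep() above naps for HZ/10 first and only commits to an indefinite sleep if the premature-sleep check still passes, lowering the per-cpu vmstat thresholds for the duration of the real sleep. A rough userspace analogue of just the two-stage wait, with a condition variable in place of the waitqueue and a plain flag in place of sleeping_prematurely(); the vmstat threshold juggling is left out:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake = PTHREAD_COND_INITIALIZER;
static bool work_pending;                 /* stand-in for "sleep is premature" */

static void *waker(void *arg)
{
        (void)arg;
        usleep(300 * 1000);               /* a "direct reclaimer" shows up later */
        pthread_mutex_lock(&lock);
        work_pending = true;
        pthread_cond_signal(&wake);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;
        struct timespec deadline;

        pthread_create(&t, NULL, waker, NULL);

        pthread_mutex_lock(&lock);
        if (!work_pending) {              /* stage 1: short timed nap (HZ/10) */
                clock_gettime(CLOCK_REALTIME, &deadline);
                deadline.tv_nsec += 100 * 1000 * 1000;
                if (deadline.tv_nsec >= 1000000000L) {
                        deadline.tv_sec++;
                        deadline.tv_nsec -= 1000000000L;
                }
                pthread_cond_timedwait(&wake, &lock, &deadline);
        }
        while (!work_pending)             /* stage 2: sleep until woken */
                pthread_cond_wait(&wake, &lock);
        pthread_mutex_unlock(&lock);

        puts("woken with work pending");
        pthread_join(t, NULL);
        return 0;
}
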
@@ -2380,9 +2624,10 @@ out:
2380static int kswapd(void *p) 2624static int kswapd(void *p)
2381{ 2625{
2382 unsigned long order; 2626 unsigned long order;
2627 int classzone_idx;
2383 pg_data_t *pgdat = (pg_data_t*)p; 2628 pg_data_t *pgdat = (pg_data_t*)p;
2384 struct task_struct *tsk = current; 2629 struct task_struct *tsk = current;
2385 DEFINE_WAIT(wait); 2630
2386 struct reclaim_state reclaim_state = { 2631 struct reclaim_state reclaim_state = {
2387 .reclaimed_slab = 0, 2632 .reclaimed_slab = 0,
2388 }; 2633 };
@@ -2410,49 +2655,30 @@ static int kswapd(void *p)
2410 set_freezable(); 2655 set_freezable();
2411 2656
2412 order = 0; 2657 order = 0;
2658 classzone_idx = MAX_NR_ZONES - 1;
2413 for ( ; ; ) { 2659 for ( ; ; ) {
2414 unsigned long new_order; 2660 unsigned long new_order;
2661 int new_classzone_idx;
2415 int ret; 2662 int ret;
2416 2663
2417 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2418 new_order = pgdat->kswapd_max_order; 2664 new_order = pgdat->kswapd_max_order;
2665 new_classzone_idx = pgdat->classzone_idx;
2419 pgdat->kswapd_max_order = 0; 2666 pgdat->kswapd_max_order = 0;
2420 if (order < new_order) { 2667 pgdat->classzone_idx = MAX_NR_ZONES - 1;
2668 if (order < new_order || classzone_idx > new_classzone_idx) {
2421 /* 2669 /*
2422 * Don't sleep if someone wants a larger 'order' 2670 * Don't sleep if someone wants a larger 'order'
 2423 * allocation 2671 * allocation or has tighter zone constraints
2424 */ 2672 */
2425 order = new_order; 2673 order = new_order;
2674 classzone_idx = new_classzone_idx;
2426 } else { 2675 } else {
2427 if (!freezing(current) && !kthread_should_stop()) { 2676 kswapd_try_to_sleep(pgdat, order, classzone_idx);
2428 long remaining = 0;
2429
2430 /* Try to sleep for a short interval */
2431 if (!sleeping_prematurely(pgdat, order, remaining)) {
2432 remaining = schedule_timeout(HZ/10);
2433 finish_wait(&pgdat->kswapd_wait, &wait);
2434 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2435 }
2436
2437 /*
2438 * After a short sleep, check if it was a
2439 * premature sleep. If not, then go fully
2440 * to sleep until explicitly woken up
2441 */
2442 if (!sleeping_prematurely(pgdat, order, remaining)) {
2443 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2444 schedule();
2445 } else {
2446 if (remaining)
2447 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2448 else
2449 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2450 }
2451 }
2452
2453 order = pgdat->kswapd_max_order; 2677 order = pgdat->kswapd_max_order;
2678 classzone_idx = pgdat->classzone_idx;
2679 pgdat->kswapd_max_order = 0;
2680 pgdat->classzone_idx = MAX_NR_ZONES - 1;
2454 } 2681 }
2455 finish_wait(&pgdat->kswapd_wait, &wait);
2456 2682
2457 ret = try_to_freeze(); 2683 ret = try_to_freeze();
2458 if (kthread_should_stop()) 2684 if (kthread_should_stop())
@@ -2464,7 +2690,7 @@ static int kswapd(void *p)
2464 */ 2690 */
2465 if (!ret) { 2691 if (!ret) {
2466 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2692 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2467 balance_pgdat(pgdat, order); 2693 order = balance_pgdat(pgdat, order, &classzone_idx);
2468 } 2694 }
2469 } 2695 }
2470 return 0; 2696 return 0;
@@ -2473,23 +2699,26 @@ static int kswapd(void *p)
2473/* 2699/*
2474 * A zone is low on free memory, so wake its kswapd task to service it. 2700 * A zone is low on free memory, so wake its kswapd task to service it.
2475 */ 2701 */
2476void wakeup_kswapd(struct zone *zone, int order) 2702void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
2477{ 2703{
2478 pg_data_t *pgdat; 2704 pg_data_t *pgdat;
2479 2705
2480 if (!populated_zone(zone)) 2706 if (!populated_zone(zone))
2481 return; 2707 return;
2482 2708
2483 pgdat = zone->zone_pgdat;
2484 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2485 return;
2486 if (pgdat->kswapd_max_order < order)
2487 pgdat->kswapd_max_order = order;
2488 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2489 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2709 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2490 return; 2710 return;
2711 pgdat = zone->zone_pgdat;
2712 if (pgdat->kswapd_max_order < order) {
2713 pgdat->kswapd_max_order = order;
2714 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
2715 }
2491 if (!waitqueue_active(&pgdat->kswapd_wait)) 2716 if (!waitqueue_active(&pgdat->kswapd_wait))
2492 return; 2717 return;
2718 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
2719 return;
2720
2721 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2493 wake_up_interruptible(&pgdat->kswapd_wait); 2722 wake_up_interruptible(&pgdat->kswapd_wait);
2494} 2723}
2495 2724
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 42eac4d33216..0c3b5048773e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat);
83 83
84#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
85 85
86static int calculate_threshold(struct zone *zone) 86int calculate_pressure_threshold(struct zone *zone)
87{
88 int threshold;
89 int watermark_distance;
90
91 /*
92 * As vmstats are not up to date, there is drift between the estimated
93 * and real values. For high thresholds and a high number of CPUs, it
94 * is possible for the min watermark to be breached while the estimated
95 * value looks fine. The pressure threshold is a reduced value such
96 * that even the maximum amount of drift will not accidentally breach
97 * the min watermark
98 */
99 watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
100 threshold = max(1, (int)(watermark_distance / num_online_cpus()));
101
102 /*
103 * Maximum threshold is 125
104 */
105 threshold = min(125, threshold);
106
107 return threshold;
108}
109
110int calculate_normal_threshold(struct zone *zone)
87{ 111{
88 int threshold; 112 int threshold;
89 int mem; /* memory in 128 MB units */ 113 int mem; /* memory in 128 MB units */
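
calculate_pressure_threshold() above sizes the per-cpu stat threshold from the low-min watermark gap rather than from zone size, so that estimation drift cannot quietly push a zone under its min watermark while kswapd is awake. A worked example with an invented gap of 4096 pages (low = 12288, min = 8192):

#include <stdio.h>

static int pressure_threshold(int watermark_distance, int ncpus)
{
        int threshold = watermark_distance / ncpus;

        if (threshold < 1)
                threshold = 1;
        if (threshold > 125)
                threshold = 125;
        return threshold;
}

int main(void)
{
        int gap = 12288 - 8192;
        int ncpus;

        for (ncpus = 4; ncpus <= 64; ncpus *= 2)
                printf("%2d cpus -> threshold %3d, worst-case drift %5d pages\n",
                       ncpus, pressure_threshold(gap, ncpus),
                       ncpus * pressure_threshold(gap, ncpus));
        /* For these configurations the total drift stays within the 4096-page
         * gap, which is exactly what the reduced threshold is meant to ensure. */
        return 0;
}
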
@@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void)
142 for_each_populated_zone(zone) { 166 for_each_populated_zone(zone) {
143 unsigned long max_drift, tolerate_drift; 167 unsigned long max_drift, tolerate_drift;
144 168
145 threshold = calculate_threshold(zone); 169 threshold = calculate_normal_threshold(zone);
146 170
147 for_each_online_cpu(cpu) 171 for_each_online_cpu(cpu)
148 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 172 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -161,42 +185,50 @@ static void refresh_zone_stat_thresholds(void)
161 } 185 }
162} 186}
163 187
188void set_pgdat_percpu_threshold(pg_data_t *pgdat,
189 int (*calculate_pressure)(struct zone *))
190{
191 struct zone *zone;
192 int cpu;
193 int threshold;
194 int i;
195
196 for (i = 0; i < pgdat->nr_zones; i++) {
197 zone = &pgdat->node_zones[i];
198 if (!zone->percpu_drift_mark)
199 continue;
200
201 threshold = (*calculate_pressure)(zone);
202 for_each_possible_cpu(cpu)
203 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
204 = threshold;
205 }
206}
207
164/* 208/*
165 * For use when we know that interrupts are disabled. 209 * For use when we know that interrupts are disabled.
166 */ 210 */
167void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 211void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
168 int delta) 212 int delta)
169{ 213{
170 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 214 struct per_cpu_pageset __percpu *pcp = zone->pageset;
171 215 s8 __percpu *p = pcp->vm_stat_diff + item;
172 s8 *p = pcp->vm_stat_diff + item;
173 long x; 216 long x;
217 long t;
174 218
175 x = delta + *p; 219 x = delta + __this_cpu_read(*p);
176 220
177 if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { 221 t = __this_cpu_read(pcp->stat_threshold);
222
223 if (unlikely(x > t || x < -t)) {
178 zone_page_state_add(x, zone, item); 224 zone_page_state_add(x, zone, item);
179 x = 0; 225 x = 0;
180 } 226 }
181 *p = x; 227 __this_cpu_write(*p, x);
182} 228}
183EXPORT_SYMBOL(__mod_zone_page_state); 229EXPORT_SYMBOL(__mod_zone_page_state);
184 230
185/* 231/*
186 * For an unknown interrupt state
187 */
188void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
189 int delta)
190{
191 unsigned long flags;
192
193 local_irq_save(flags);
194 __mod_zone_page_state(zone, item, delta);
195 local_irq_restore(flags);
196}
197EXPORT_SYMBOL(mod_zone_page_state);
198
199/*
200 * Optimized increment and decrement functions. 232 * Optimized increment and decrement functions.
201 * 233 *
202 * These are only for a single page and therefore can take a struct page * 234 * These are only for a single page and therefore can take a struct page *
@@ -221,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state);
221 */ 253 */
222void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 254void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
223{ 255{
224 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 256 struct per_cpu_pageset __percpu *pcp = zone->pageset;
225 s8 *p = pcp->vm_stat_diff + item; 257 s8 __percpu *p = pcp->vm_stat_diff + item;
226 258 s8 v, t;
227 (*p)++;
228 259
229 if (unlikely(*p > pcp->stat_threshold)) { 260 v = __this_cpu_inc_return(*p);
230 int overstep = pcp->stat_threshold / 2; 261 t = __this_cpu_read(pcp->stat_threshold);
262 if (unlikely(v > t)) {
263 s8 overstep = t >> 1;
231 264
232 zone_page_state_add(*p + overstep, zone, item); 265 zone_page_state_add(v + overstep, zone, item);
233 *p = -overstep; 266 __this_cpu_write(*p, -overstep);
234 } 267 }
235} 268}
236 269
@@ -242,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state);
242 275
243void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 276void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
244{ 277{
245 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 278 struct per_cpu_pageset __percpu *pcp = zone->pageset;
246 s8 *p = pcp->vm_stat_diff + item; 279 s8 __percpu *p = pcp->vm_stat_diff + item;
280 s8 v, t;
247 281
248 (*p)--; 282 v = __this_cpu_dec_return(*p);
283 t = __this_cpu_read(pcp->stat_threshold);
284 if (unlikely(v < - t)) {
285 s8 overstep = t >> 1;
249 286
250 if (unlikely(*p < - pcp->stat_threshold)) { 287 zone_page_state_add(v - overstep, zone, item);
251 int overstep = pcp->stat_threshold / 2; 288 __this_cpu_write(*p, overstep);
252
253 zone_page_state_add(*p - overstep, zone, item);
254 *p = overstep;
255 } 289 }
256} 290}
257 291
@@ -261,6 +295,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
261} 295}
262EXPORT_SYMBOL(__dec_zone_page_state); 296EXPORT_SYMBOL(__dec_zone_page_state);
263 297
298#ifdef CONFIG_CMPXCHG_LOCAL
299/*
300 * If we have cmpxchg_local support then we do not need to incur the overhead
301 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
302 *
303 * mod_state() modifies the zone counter state through atomic per cpu
304 * operations.
305 *
306 * Overstep mode specifies how overstep should handled:
307 * 0 No overstepping
308 * 1 Overstepping half of threshold
309 * -1 Overstepping minus half of threshold
310*/
311static inline void mod_state(struct zone *zone,
312 enum zone_stat_item item, int delta, int overstep_mode)
313{
314 struct per_cpu_pageset __percpu *pcp = zone->pageset;
315 s8 __percpu *p = pcp->vm_stat_diff + item;
316 long o, n, t, z;
317
318 do {
319 z = 0; /* overflow to zone counters */
320
321 /*
322 * The fetching of the stat_threshold is racy. We may apply
 323 * a counter threshold to the wrong cpu if we get
324 * rescheduled while executing here. However, the following
325 * will apply the threshold again and therefore bring the
326 * counter under the threshold.
327 */
328 t = this_cpu_read(pcp->stat_threshold);
329
330 o = this_cpu_read(*p);
331 n = delta + o;
332
333 if (n > t || n < -t) {
334 int os = overstep_mode * (t >> 1) ;
335
336 /* Overflow must be added to zone counters */
337 z = n + os;
338 n = -os;
339 }
340 } while (this_cpu_cmpxchg(*p, o, n) != o);
341
342 if (z)
343 zone_page_state_add(z, zone, item);
344}
345
346void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
347 int delta)
348{
349 mod_state(zone, item, delta, 0);
350}
351EXPORT_SYMBOL(mod_zone_page_state);
352
353void inc_zone_state(struct zone *zone, enum zone_stat_item item)
354{
355 mod_state(zone, item, 1, 1);
356}
357
358void inc_zone_page_state(struct page *page, enum zone_stat_item item)
359{
360 mod_state(page_zone(page), item, 1, 1);
361}
362EXPORT_SYMBOL(inc_zone_page_state);
363
364void dec_zone_page_state(struct page *page, enum zone_stat_item item)
365{
366 mod_state(page_zone(page), item, -1, -1);
367}
368EXPORT_SYMBOL(dec_zone_page_state);
369#else
370/*
371 * Use interrupt disable to serialize counter updates
372 */
373void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
374 int delta)
375{
376 unsigned long flags;
377
378 local_irq_save(flags);
379 __mod_zone_page_state(zone, item, delta);
380 local_irq_restore(flags);
381}
382EXPORT_SYMBOL(mod_zone_page_state);
383
264void inc_zone_state(struct zone *zone, enum zone_stat_item item) 384void inc_zone_state(struct zone *zone, enum zone_stat_item item)
265{ 385{
266 unsigned long flags; 386 unsigned long flags;
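
mod_state() above retries a this_cpu_cmpxchg() loop instead of disabling interrupts, folding any excursion past the threshold back into the global zone counter. A loose userspace analogue using C11 atomics; a single shared atomic stands in for the per-cpu diff and the half-threshold overstep bias is dropped to keep the sketch short:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic long global_count;        /* plays the role of the zone counter */
static _Atomic long cpu_diff;            /* plays the role of the per-cpu diff */
static const long threshold = 125;

static void mod_counter(long delta)
{
        long o, n, z;

        do {
                z = 0;
                o = atomic_load(&cpu_diff);
                n = o + delta;
                if (n > threshold || n < -threshold) {
                        z = n;           /* overflow goes to the global counter */
                        n = 0;           /* (no overstep bias in this sketch)   */
                }
        } while (!atomic_compare_exchange_weak(&cpu_diff, &o, n));

        if (z)
                atomic_fetch_add(&global_count, z);
}

int main(void)
{
        for (int i = 0; i < 1000; i++)
                mod_counter(1);
        printf("global=%ld diff=%ld\n",
               atomic_load(&global_count), atomic_load(&cpu_diff));
        return 0;
}
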
@@ -291,6 +411,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
291 local_irq_restore(flags); 411 local_irq_restore(flags);
292} 412}
293EXPORT_SYMBOL(dec_zone_page_state); 413EXPORT_SYMBOL(dec_zone_page_state);
414#endif
294 415
295/* 416/*
296 * Update the zone counters for one cpu. 417 * Update the zone counters for one cpu.
@@ -750,8 +871,6 @@ static const char * const vmstat_text[] = {
750 "nr_shmem", 871 "nr_shmem",
751 "nr_dirtied", 872 "nr_dirtied",
752 "nr_written", 873 "nr_written",
753 "nr_dirty_threshold",
754 "nr_dirty_background_threshold",
755 874
756#ifdef CONFIG_NUMA 875#ifdef CONFIG_NUMA
757 "numa_hit", 876 "numa_hit",
@@ -761,6 +880,9 @@ static const char * const vmstat_text[] = {
761 "numa_local", 880 "numa_local",
762 "numa_other", 881 "numa_other",
763#endif 882#endif
883 "nr_anon_transparent_hugepages",
884 "nr_dirty_threshold",
885 "nr_dirty_background_threshold",
764 886
765#ifdef CONFIG_VM_EVENT_COUNTERS 887#ifdef CONFIG_VM_EVENT_COUNTERS
766 "pgpgin", 888 "pgpgin",
@@ -834,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
834 "\n scanned %lu" 956 "\n scanned %lu"
835 "\n spanned %lu" 957 "\n spanned %lu"
836 "\n present %lu", 958 "\n present %lu",
837 zone_nr_free_pages(zone), 959 zone_page_state(zone, NR_FREE_PAGES),
838 min_wmark_pages(zone), 960 min_wmark_pages(zone),
839 low_wmark_pages(zone), 961 low_wmark_pages(zone),
840 high_wmark_pages(zone), 962 high_wmark_pages(zone),
@@ -1033,7 +1155,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
1033 break; 1155 break;
1034 case CPU_DOWN_PREPARE: 1156 case CPU_DOWN_PREPARE:
1035 case CPU_DOWN_PREPARE_FROZEN: 1157 case CPU_DOWN_PREPARE_FROZEN:
1036 cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); 1158 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
1037 per_cpu(vmstat_work, cpu).work.func = NULL; 1159 per_cpu(vmstat_work, cpu).work.func = NULL;
1038 break; 1160 break;
1039 case CPU_DOWN_FAILED: 1161 case CPU_DOWN_FAILED: