Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 38
-rw-r--r--  mm/Makefile | 3
-rw-r--r--  mm/compaction.c | 174
-rw-r--r--  mm/dmapool.c | 16
-rw-r--r--  mm/filemap.c | 20
-rw-r--r--  mm/huge_memory.c | 2346
-rw-r--r--  mm/hugetlb.c | 111
-rw-r--r--  mm/internal.h | 16
-rw-r--r--  mm/ksm.c | 81
-rw-r--r--  mm/madvise.c | 10
-rw-r--r--  mm/memcontrol.c | 258
-rw-r--r--  mm/memory-failure.c | 22
-rw-r--r--  mm/memory.c | 336
-rw-r--r--  mm/memory_hotplug.c | 17
-rw-r--r--  mm/mempolicy.c | 23
-rw-r--r--  mm/migrate.c | 123
-rw-r--r--  mm/mincore.c | 7
-rw-r--r--  mm/mlock.c | 163
-rw-r--r--  mm/mmap.c | 17
-rw-r--r--  mm/mmu_notifier.c | 20
-rw-r--r--  mm/mmzone.c | 21
-rw-r--r--  mm/mprotect.c | 20
-rw-r--r--  mm/mremap.c | 9
-rw-r--r--  mm/nommu.c | 6
-rw-r--r--  mm/page-writeback.c | 9
-rw-r--r--  mm/page_alloc.c | 165
-rw-r--r--  mm/pagewalk.c | 1
-rw-r--r--  mm/percpu-vm.c | 2
-rw-r--r--  mm/percpu.c | 10
-rw-r--r--  mm/pgtable-generic.c | 123
-rw-r--r--  mm/rmap.c | 93
-rw-r--r--  mm/shmem.c | 9
-rw-r--r--  mm/slab.c | 76
-rw-r--r--  mm/slob.c | 5
-rw-r--r--  mm/slub.c | 81
-rw-r--r--  mm/sparse-vmemmap.c | 2
-rw-r--r--  mm/sparse.c | 4
-rw-r--r--  mm/swap.c | 322
-rw-r--r--  mm/swap_state.c | 6
-rw-r--r--  mm/swapfile.c | 9
-rw-r--r--  mm/util.c | 21
-rw-r--r--  mm/vmalloc.c | 89
-rw-r--r--  mm/vmscan.c | 432
-rw-r--r--  mm/vmstat.c | 202
44 files changed, 4508 insertions, 1010 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c2c8a4a11898..3ad483bdf505 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -302,6 +302,44 @@ config NOMMU_INITIAL_TRIM_EXCESS
302 302
303 See Documentation/nommu-mmap.txt for more information. 303 See Documentation/nommu-mmap.txt for more information.
304 304
305config TRANSPARENT_HUGEPAGE
306 bool "Transparent Hugepage Support"
307 depends on X86 && MMU
308 select COMPACTION
309 help
310	  Transparent Hugepage support allows the kernel to use huge pages
311	  and huge TLB entries transparently for applications whenever
312	  possible. This feature can improve performance for certain
313	  applications by speeding up page faults during memory
314	  allocation, by reducing the number of TLB misses and by speeding
315	  up page table walks.
316
317	  If memory is constrained, e.g. on embedded systems, you may want to say N.
318
319choice
320 prompt "Transparent Hugepage Support sysfs defaults"
321 depends on TRANSPARENT_HUGEPAGE
322 default TRANSPARENT_HUGEPAGE_ALWAYS
323 help
324 Selects the sysfs defaults for Transparent Hugepage Support.
325
326 config TRANSPARENT_HUGEPAGE_ALWAYS
327 bool "always"
328 help
329	  Enabling Transparent Hugepage always can increase the
330	  memory footprint of applications without a guaranteed
331	  benefit, but it will work automatically for all applications.
332
333 config TRANSPARENT_HUGEPAGE_MADVISE
334 bool "madvise"
335 help
336	  Enabling Transparent Hugepage madvise will only provide a
337	  performance benefit to the applications that use
338	  madvise(MADV_HUGEPAGE), but it won't risk increasing the
339	  memory footprint of applications without a guaranteed
340	  benefit.
341endchoice
342
305# 343#
306# UP and nommu archs use km based percpu allocator 344# UP and nommu archs use km based percpu allocator
307# 345#
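Note: the Kconfig choice above only picks the compile-time default; the effective policy is visible (and changeable) at runtime through sysfs. A minimal userspace sketch, not part of this patch, that prints the current setting; the path comes from the "transparent_hugepage" kobject and "enabled" attribute added in mm/huge_memory.c below, and the bracketed word marks the active mode, e.g. "[always] madvise never".

#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/sys/kernel/mm/transparent_hugepage/enabled", "r");

	if (!f) {
		perror("transparent_hugepage/enabled");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		fputs(line, stdout);	/* e.g. "[always] madvise never" */
	fclose(f);
	return 0;
}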
diff --git a/mm/Makefile b/mm/Makefile
index f73f75a29f82..2b1b575ae712 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,7 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o 8 vmalloc.o pagewalk.o pgtable-generic.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o \ 11 maccess.o page_alloc.o page-writeback.o \
@@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
37obj-$(CONFIG_FS_XIP) += filemap_xip.o 37obj-$(CONFIG_FS_XIP) += filemap_xip.o
38obj-$(CONFIG_MIGRATION) += migrate.o 38obj-$(CONFIG_MIGRATION) += migrate.o
39obj-$(CONFIG_QUICKLIST) += quicklist.o 39obj-$(CONFIG_QUICKLIST) += quicklist.o
40obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
40obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 41obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
41obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 42obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
42obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 43obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
diff --git a/mm/compaction.c b/mm/compaction.c
index 1a8894eadf72..6d592a021072 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,9 @@
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include "internal.h" 17#include "internal.h"
18 18
19#define CREATE_TRACE_POINTS
20#include <trace/events/compaction.h>
21
19/* 22/*
20 * compact_control is used to track pages being migrated and the free pages 23 * compact_control is used to track pages being migrated and the free pages
21 * they are being migrated to during memory compaction. The free_pfn starts 24 * they are being migrated to during memory compaction. The free_pfn starts
@@ -30,6 +33,7 @@ struct compact_control {
30 unsigned long nr_migratepages; /* Number of pages to migrate */ 33 unsigned long nr_migratepages; /* Number of pages to migrate */
31 unsigned long free_pfn; /* isolate_freepages search base */ 34 unsigned long free_pfn; /* isolate_freepages search base */
32 unsigned long migrate_pfn; /* isolate_migratepages search base */ 35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */
33 37
34 /* Account for isolated anon and file pages */ 38 /* Account for isolated anon and file pages */
35 unsigned long nr_anon; 39 unsigned long nr_anon;
@@ -38,6 +42,8 @@ struct compact_control {
38 unsigned int order; /* order a direct compactor needs */ 42 unsigned int order; /* order a direct compactor needs */
39 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 43 int migratetype; /* MOVABLE, RECLAIMABLE etc */
40 struct zone *zone; 44 struct zone *zone;
45
46 int compact_mode;
41}; 47};
42 48
43static unsigned long release_freepages(struct list_head *freelist) 49static unsigned long release_freepages(struct list_head *freelist)
@@ -60,7 +66,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
60 struct list_head *freelist) 66 struct list_head *freelist)
61{ 67{
62 unsigned long zone_end_pfn, end_pfn; 68 unsigned long zone_end_pfn, end_pfn;
63 int total_isolated = 0; 69 int nr_scanned = 0, total_isolated = 0;
64 struct page *cursor; 70 struct page *cursor;
65 71
66 /* Get the last PFN we should scan for free pages at */ 72 /* Get the last PFN we should scan for free pages at */
@@ -81,6 +87,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
81 87
82 if (!pfn_valid_within(blockpfn)) 88 if (!pfn_valid_within(blockpfn))
83 continue; 89 continue;
90 nr_scanned++;
84 91
85 if (!PageBuddy(page)) 92 if (!PageBuddy(page))
86 continue; 93 continue;
@@ -100,6 +107,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
100 } 107 }
101 } 108 }
102 109
110 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
103 return total_isolated; 111 return total_isolated;
104} 112}
105 113
@@ -234,6 +242,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
234 struct compact_control *cc) 242 struct compact_control *cc)
235{ 243{
236 unsigned long low_pfn, end_pfn; 244 unsigned long low_pfn, end_pfn;
245 unsigned long last_pageblock_nr = 0, pageblock_nr;
246 unsigned long nr_scanned = 0, nr_isolated = 0;
237 struct list_head *migratelist = &cc->migratepages; 247 struct list_head *migratelist = &cc->migratepages;
238 248
239 /* Do not scan outside zone boundaries */ 249 /* Do not scan outside zone boundaries */
@@ -266,20 +276,51 @@ static unsigned long isolate_migratepages(struct zone *zone,
266 struct page *page; 276 struct page *page;
267 if (!pfn_valid_within(low_pfn)) 277 if (!pfn_valid_within(low_pfn))
268 continue; 278 continue;
279 nr_scanned++;
269 280
270 /* Get the page and skip if free */ 281 /* Get the page and skip if free */
271 page = pfn_to_page(low_pfn); 282 page = pfn_to_page(low_pfn);
272 if (PageBuddy(page)) 283 if (PageBuddy(page))
273 continue; 284 continue;
274 285
286 /*
287 * For async migration, also only scan in MOVABLE blocks. Async
288 * migration is optimistic to see if the minimum amount of work
289 * satisfies the allocation
290 */
291 pageblock_nr = low_pfn >> pageblock_order;
292 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
293 get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
294 low_pfn += pageblock_nr_pages;
295 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
296 last_pageblock_nr = pageblock_nr;
297 continue;
298 }
299
300 if (!PageLRU(page))
301 continue;
302
303 /*
304 * PageLRU is set, and lru_lock excludes isolation,
305 * splitting and collapsing (collapsing has already
306 * happened if PageLRU is set).
307 */
308 if (PageTransHuge(page)) {
309 low_pfn += (1 << compound_order(page)) - 1;
310 continue;
311 }
312
275 /* Try isolate the page */ 313 /* Try isolate the page */
276 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) 314 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
277 continue; 315 continue;
278 316
317 VM_BUG_ON(PageTransCompound(page));
318
279 /* Successfully isolated */ 319 /* Successfully isolated */
280 del_page_from_lru_list(zone, page, page_lru(page)); 320 del_page_from_lru_list(zone, page, page_lru(page));
281 list_add(&page->lru, migratelist); 321 list_add(&page->lru, migratelist);
282 cc->nr_migratepages++; 322 cc->nr_migratepages++;
323 nr_isolated++;
283 324
284 /* Avoid isolating too much */ 325 /* Avoid isolating too much */
285 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) 326 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
@@ -291,6 +332,8 @@ static unsigned long isolate_migratepages(struct zone *zone,
291 spin_unlock_irq(&zone->lru_lock); 332 spin_unlock_irq(&zone->lru_lock);
292 cc->migrate_pfn = low_pfn; 333 cc->migrate_pfn = low_pfn;
293 334
335 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
336
294 return cc->nr_migratepages; 337 return cc->nr_migratepages;
295} 338}
296 339
@@ -341,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc)
341} 384}
342 385
343static int compact_finished(struct zone *zone, 386static int compact_finished(struct zone *zone,
344 struct compact_control *cc) 387 struct compact_control *cc)
345{ 388{
346 unsigned int order; 389 unsigned int order;
347 unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); 390 unsigned long watermark;
348 391
349 if (fatal_signal_pending(current)) 392 if (fatal_signal_pending(current))
350 return COMPACT_PARTIAL; 393 return COMPACT_PARTIAL;
@@ -354,12 +397,27 @@ static int compact_finished(struct zone *zone,
354 return COMPACT_COMPLETE; 397 return COMPACT_COMPLETE;
355 398
356 /* Compaction run is not finished if the watermark is not met */ 399 /* Compaction run is not finished if the watermark is not met */
400 if (cc->compact_mode != COMPACT_MODE_KSWAPD)
401 watermark = low_wmark_pages(zone);
402 else
403 watermark = high_wmark_pages(zone);
404 watermark += (1 << cc->order);
405
357 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) 406 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
358 return COMPACT_CONTINUE; 407 return COMPACT_CONTINUE;
359 408
360 if (cc->order == -1) 409 if (cc->order == -1)
361 return COMPACT_CONTINUE; 410 return COMPACT_CONTINUE;
362 411
412 /*
413 * Generating only one page of the right order is not enough
414 * for kswapd, we must continue until we're above the high
415 * watermark as a pool for high order GFP_ATOMIC allocations
416 * too.
417 */
418 if (cc->compact_mode == COMPACT_MODE_KSWAPD)
419 return COMPACT_CONTINUE;
420
363 /* Direct compactor: Is a suitable page free? */ 421 /* Direct compactor: Is a suitable page free? */
364 for (order = cc->order; order < MAX_ORDER; order++) { 422 for (order = cc->order; order < MAX_ORDER; order++) {
365 /* Job done if page is free of the right migratetype */ 423 /* Job done if page is free of the right migratetype */
@@ -374,10 +432,62 @@ static int compact_finished(struct zone *zone,
374 return COMPACT_CONTINUE; 432 return COMPACT_CONTINUE;
375} 433}
376 434
435/*
436 * compaction_suitable: Is this suitable to run compaction on this zone now?
437 * Returns
438 * COMPACT_SKIPPED - If there are too few free pages for compaction
439 * COMPACT_PARTIAL - If the allocation would succeed without compaction
440 * COMPACT_CONTINUE - If compaction should run now
441 */
442unsigned long compaction_suitable(struct zone *zone, int order)
443{
444 int fragindex;
445 unsigned long watermark;
446
447 /*
448 * Watermarks for order-0 must be met for compaction. Note the 2UL.
449 * This is because during migration, copies of pages need to be
450 * allocated and for a short time, the footprint is higher
451 */
452 watermark = low_wmark_pages(zone) + (2UL << order);
453 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
454 return COMPACT_SKIPPED;
455
456 /*
457 * fragmentation index determines if allocation failures are due to
458 * low memory or external fragmentation
459 *
 460 * index of -1 implies allocations might succeed depending on watermarks
461 * index towards 0 implies failure is due to lack of memory
462 * index towards 1000 implies failure is due to fragmentation
463 *
464 * Only compact if a failure would be due to fragmentation.
465 */
466 fragindex = fragmentation_index(zone, order);
467 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
468 return COMPACT_SKIPPED;
469
470 if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
471 return COMPACT_PARTIAL;
472
473 return COMPACT_CONTINUE;
474}
475
377static int compact_zone(struct zone *zone, struct compact_control *cc) 476static int compact_zone(struct zone *zone, struct compact_control *cc)
378{ 477{
379 int ret; 478 int ret;
380 479
480 ret = compaction_suitable(zone, cc->order);
481 switch (ret) {
482 case COMPACT_PARTIAL:
483 case COMPACT_SKIPPED:
484 /* Compaction is likely to fail */
485 return ret;
486 case COMPACT_CONTINUE:
487 /* Fall through to compaction */
488 ;
489 }
490
381 /* Setup to move all movable pages to the end of the zone */ 491 /* Setup to move all movable pages to the end of the zone */
382 cc->migrate_pfn = zone->zone_start_pfn; 492 cc->migrate_pfn = zone->zone_start_pfn;
383 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; 493 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
@@ -393,7 +503,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
393 503
394 nr_migrate = cc->nr_migratepages; 504 nr_migrate = cc->nr_migratepages;
395 migrate_pages(&cc->migratepages, compaction_alloc, 505 migrate_pages(&cc->migratepages, compaction_alloc,
396 (unsigned long)cc, 0); 506 (unsigned long)cc, false,
507 cc->sync);
397 update_nr_listpages(cc); 508 update_nr_listpages(cc);
398 nr_remaining = cc->nr_migratepages; 509 nr_remaining = cc->nr_migratepages;
399 510
@@ -401,6 +512,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
401 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); 512 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
402 if (nr_remaining) 513 if (nr_remaining)
403 count_vm_events(COMPACTPAGEFAILED, nr_remaining); 514 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
515 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
516 nr_remaining);
404 517
405 /* Release LRU pages not migrated */ 518 /* Release LRU pages not migrated */
406 if (!list_empty(&cc->migratepages)) { 519 if (!list_empty(&cc->migratepages)) {
@@ -417,8 +530,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
417 return ret; 530 return ret;
418} 531}
419 532
420static unsigned long compact_zone_order(struct zone *zone, 533unsigned long compact_zone_order(struct zone *zone,
421 int order, gfp_t gfp_mask) 534 int order, gfp_t gfp_mask,
535 bool sync,
536 int compact_mode)
422{ 537{
423 struct compact_control cc = { 538 struct compact_control cc = {
424 .nr_freepages = 0, 539 .nr_freepages = 0,
@@ -426,6 +541,8 @@ static unsigned long compact_zone_order(struct zone *zone,
426 .order = order, 541 .order = order,
427 .migratetype = allocflags_to_migratetype(gfp_mask), 542 .migratetype = allocflags_to_migratetype(gfp_mask),
428 .zone = zone, 543 .zone = zone,
544 .sync = sync,
545 .compact_mode = compact_mode,
429 }; 546 };
430 INIT_LIST_HEAD(&cc.freepages); 547 INIT_LIST_HEAD(&cc.freepages);
431 INIT_LIST_HEAD(&cc.migratepages); 548 INIT_LIST_HEAD(&cc.migratepages);
@@ -441,16 +558,17 @@ int sysctl_extfrag_threshold = 500;
441 * @order: The order of the current allocation 558 * @order: The order of the current allocation
442 * @gfp_mask: The GFP mask of the current allocation 559 * @gfp_mask: The GFP mask of the current allocation
443 * @nodemask: The allowed nodes to allocate from 560 * @nodemask: The allowed nodes to allocate from
561 * @sync: Whether migration is synchronous or not
444 * 562 *
445 * This is the main entry point for direct page compaction. 563 * This is the main entry point for direct page compaction.
446 */ 564 */
447unsigned long try_to_compact_pages(struct zonelist *zonelist, 565unsigned long try_to_compact_pages(struct zonelist *zonelist,
448 int order, gfp_t gfp_mask, nodemask_t *nodemask) 566 int order, gfp_t gfp_mask, nodemask_t *nodemask,
567 bool sync)
449{ 568{
450 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 569 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
451 int may_enter_fs = gfp_mask & __GFP_FS; 570 int may_enter_fs = gfp_mask & __GFP_FS;
452 int may_perform_io = gfp_mask & __GFP_IO; 571 int may_perform_io = gfp_mask & __GFP_IO;
453 unsigned long watermark;
454 struct zoneref *z; 572 struct zoneref *z;
455 struct zone *zone; 573 struct zone *zone;
456 int rc = COMPACT_SKIPPED; 574 int rc = COMPACT_SKIPPED;
@@ -460,7 +578,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
460 * made because an assumption is made that the page allocator can satisfy 578 * made because an assumption is made that the page allocator can satisfy
461 * the "cheaper" orders without taking special steps 579 * the "cheaper" orders without taking special steps
462 */ 580 */
463 if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) 581 if (!order || !may_enter_fs || !may_perform_io)
464 return rc; 582 return rc;
465 583
466 count_vm_event(COMPACTSTALL); 584 count_vm_event(COMPACTSTALL);
@@ -468,43 +586,14 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
468 /* Compact each zone in the list */ 586 /* Compact each zone in the list */
469 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 587 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
470 nodemask) { 588 nodemask) {
471 int fragindex;
472 int status; 589 int status;
473 590
474 /* 591 status = compact_zone_order(zone, order, gfp_mask, sync,
475 * Watermarks for order-0 must be met for compaction. Note 592 COMPACT_MODE_DIRECT_RECLAIM);
476 * the 2UL. This is because during migration, copies of
477 * pages need to be allocated and for a short time, the
478 * footprint is higher
479 */
480 watermark = low_wmark_pages(zone) + (2UL << order);
481 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
482 continue;
483
484 /*
485 * fragmentation index determines if allocation failures are
486 * due to low memory or external fragmentation
487 *
488 * index of -1 implies allocations might succeed depending
489 * on watermarks
490 * index towards 0 implies failure is due to lack of memory
491 * index towards 1000 implies failure is due to fragmentation
492 *
493 * Only compact if a failure would be due to fragmentation.
494 */
495 fragindex = fragmentation_index(zone, order);
496 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
497 continue;
498
499 if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
500 rc = COMPACT_PARTIAL;
501 break;
502 }
503
504 status = compact_zone_order(zone, order, gfp_mask);
505 rc = max(status, rc); 593 rc = max(status, rc);
506 594
507 if (zone_watermark_ok(zone, order, watermark, 0, 0)) 595 /* If a normal allocation would succeed, stop compacting */
596 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
508 break; 597 break;
509 } 598 }
510 599
@@ -531,6 +620,7 @@ static int compact_node(int nid)
531 .nr_freepages = 0, 620 .nr_freepages = 0,
532 .nr_migratepages = 0, 621 .nr_migratepages = 0,
533 .order = -1, 622 .order = -1,
623 .compact_mode = COMPACT_MODE_DIRECT_RECLAIM,
534 }; 624 };
535 625
536 zone = &pgdat->node_zones[zoneid]; 626 zone = &pgdat->node_zones[zoneid];
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 4df2de77e069..03bf3bb4519a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
324 if (mem_flags & __GFP_WAIT) { 324 if (mem_flags & __GFP_WAIT) {
325 DECLARE_WAITQUEUE(wait, current); 325 DECLARE_WAITQUEUE(wait, current);
326 326
327 __set_current_state(TASK_INTERRUPTIBLE); 327 __set_current_state(TASK_UNINTERRUPTIBLE);
328 __add_wait_queue(&pool->waitq, &wait); 328 __add_wait_queue(&pool->waitq, &wait);
329 spin_unlock_irqrestore(&pool->lock, flags); 329 spin_unlock_irqrestore(&pool->lock, flags);
330 330
@@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc);
355 355
356static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) 356static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
357{ 357{
358 unsigned long flags;
359 struct dma_page *page; 358 struct dma_page *page;
360 359
361 spin_lock_irqsave(&pool->lock, flags);
362 list_for_each_entry(page, &pool->page_list, page_list) { 360 list_for_each_entry(page, &pool->page_list, page_list) {
363 if (dma < page->dma) 361 if (dma < page->dma)
364 continue; 362 continue;
365 if (dma < (page->dma + pool->allocation)) 363 if (dma < (page->dma + pool->allocation))
366 goto done; 364 return page;
367 } 365 }
368 page = NULL; 366 return NULL;
369 done:
370 spin_unlock_irqrestore(&pool->lock, flags);
371 return page;
372} 367}
373 368
374/** 369/**
@@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
386 unsigned long flags; 381 unsigned long flags;
387 unsigned int offset; 382 unsigned int offset;
388 383
384 spin_lock_irqsave(&pool->lock, flags);
389 page = pool_find_page(pool, dma); 385 page = pool_find_page(pool, dma);
390 if (!page) { 386 if (!page) {
387 spin_unlock_irqrestore(&pool->lock, flags);
391 if (pool->dev) 388 if (pool->dev)
392 dev_err(pool->dev, 389 dev_err(pool->dev,
393 "dma_pool_free %s, %p/%lx (bad dma)\n", 390 "dma_pool_free %s, %p/%lx (bad dma)\n",
@@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
401 offset = vaddr - page->vaddr; 398 offset = vaddr - page->vaddr;
402#ifdef DMAPOOL_DEBUG 399#ifdef DMAPOOL_DEBUG
403 if ((dma - page->dma) != offset) { 400 if ((dma - page->dma) != offset) {
401 spin_unlock_irqrestore(&pool->lock, flags);
404 if (pool->dev) 402 if (pool->dev)
405 dev_err(pool->dev, 403 dev_err(pool->dev,
406 "dma_pool_free %s, %p (bad vaddr)/%Lx\n", 404 "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
@@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
418 chain = *(int *)(page->vaddr + chain); 416 chain = *(int *)(page->vaddr + chain);
419 continue; 417 continue;
420 } 418 }
419 spin_unlock_irqrestore(&pool->lock, flags);
421 if (pool->dev) 420 if (pool->dev)
422 dev_err(pool->dev, "dma_pool_free %s, dma %Lx " 421 dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
423 "already free\n", pool->name, 422 "already free\n", pool->name,
@@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
432 memset(vaddr, POOL_POISON_FREED, pool->size); 431 memset(vaddr, POOL_POISON_FREED, pool->size);
433#endif 432#endif
434 433
435 spin_lock_irqsave(&pool->lock, flags);
436 page->in_use--; 434 page->in_use--;
437 *(int *)vaddr = page->offset; 435 *(int *)vaddr = page->offset;
438 page->offset = offset; 436 page->offset = offset;
diff --git a/mm/filemap.c b/mm/filemap.c
index 6b9aee20f242..83a45d35468b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -102,9 +102,6 @@
102 * ->inode_lock (zap_pte_range->set_page_dirty) 102 * ->inode_lock (zap_pte_range->set_page_dirty)
103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
104 * 104 *
105 * ->task->proc_lock
106 * ->dcache_lock (proc_pid_lookup)
107 *
108 * (code doesn't rely on that order, so you could switch it around) 105 * (code doesn't rely on that order, so you could switch it around)
109 * ->tasklist_lock (memory_failure, collect_procs_ao) 106 * ->tasklist_lock (memory_failure, collect_procs_ao)
110 * ->i_mmap_lock 107 * ->i_mmap_lock
@@ -301,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
301 continue; 298 continue;
302 299
303 wait_on_page_writeback(page); 300 wait_on_page_writeback(page);
304 if (PageError(page)) 301 if (TestClearPageError(page))
305 ret = -EIO; 302 ret = -EIO;
306 } 303 }
307 pagevec_release(&pvec); 304 pagevec_release(&pvec);
@@ -840,9 +837,6 @@ repeat:
840 if (radix_tree_deref_retry(page)) 837 if (radix_tree_deref_retry(page))
841 goto restart; 838 goto restart;
842 839
843 if (page->mapping == NULL || page->index != index)
844 break;
845
846 if (!page_cache_get_speculative(page)) 840 if (!page_cache_get_speculative(page))
847 goto repeat; 841 goto repeat;
848 842
@@ -852,6 +846,16 @@ repeat:
852 goto repeat; 846 goto repeat;
853 } 847 }
854 848
849 /*
850 * must check mapping and index after taking the ref.
851 * otherwise we can get both false positives and false
852 * negatives, which is just confusing to the caller.
853 */
854 if (page->mapping == NULL || page->index != index) {
855 page_cache_release(page);
856 break;
857 }
858
855 pages[ret] = page; 859 pages[ret] = page;
856 ret++; 860 ret++;
857 index++; 861 index++;
@@ -2223,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
2223 gfp_notmask = __GFP_FS; 2227 gfp_notmask = __GFP_FS;
2224repeat: 2228repeat:
2225 page = find_lock_page(mapping, index); 2229 page = find_lock_page(mapping, index);
2226 if (likely(page)) 2230 if (page)
2227 return page; 2231 return page;
2228 2232
2229 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); 2233 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
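Note: the find_get_pages() fix above moves the mapping/index check to after page_cache_get_speculative(), because the page can be freed and reused between the radix-tree lookup and the reference grab. A stand-alone sketch of that lockless pattern with hypothetical types; C11 atomics stand in for the page refcount.

#include <stdatomic.h>
#include <stdbool.h>

struct object {
	atomic_int refcount;	/* 0 means the object is free for reuse */
	void *owner;		/* plays the role of page->mapping */
	unsigned long index;	/* plays the role of page->index */
};

/* like get_page_unless_zero(): only take a reference if one is already held */
static bool ref_get_unless_zero(atomic_int *ref)
{
	int old = atomic_load(ref);

	while (old != 0)
		if (atomic_compare_exchange_weak(ref, &old, old + 1))
			return true;
	return false;
}

/* speculative lookup step: grab a reference, then re-validate identity */
static bool get_if_still(struct object *obj, void *owner, unsigned long index)
{
	if (!ref_get_unless_zero(&obj->refcount))
		return false;				/* already freed: caller retries */
	if (obj->owner != owner || obj->index != index) {
		atomic_fetch_sub(&obj->refcount, 1);	/* wrong object: drop and retry */
		return false;
	}
	return true;
}

int main(void)
{
	struct object obj = { .refcount = 2, .owner = &obj, .index = 7 };

	return get_if_still(&obj, &obj, 7) ? 0 : 1;
}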
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000000..004c9c2aac78
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2346 @@
1/*
2 * Copyright (C) 2009 Red Hat, Inc.
3 *
4 * This work is licensed under the terms of the GNU GPL, version 2. See
5 * the COPYING file in the top-level directory.
6 */
7
8#include <linux/mm.h>
9#include <linux/sched.h>
10#include <linux/highmem.h>
11#include <linux/hugetlb.h>
12#include <linux/mmu_notifier.h>
13#include <linux/rmap.h>
14#include <linux/swap.h>
15#include <linux/mm_inline.h>
16#include <linux/kthread.h>
17#include <linux/khugepaged.h>
18#include <linux/freezer.h>
19#include <linux/mman.h>
20#include <asm/tlb.h>
21#include <asm/pgalloc.h>
22#include "internal.h"
23
24/*
25 * By default transparent hugepage support is enabled for all mappings
26 * and khugepaged scans all mappings. Defrag is only invoked by
27 * khugepaged hugepage allocations and by page faults inside
28 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
29 * allocations.
30 */
31unsigned long transparent_hugepage_flags __read_mostly =
32#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
33 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
34#endif
35#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
36 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
37#endif
38 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
39 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
40
 41/* default scan 8*512 pte (or vmas) every 10 seconds */
42static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
43static unsigned int khugepaged_pages_collapsed;
44static unsigned int khugepaged_full_scans;
45static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
46/* during fragmentation poll the hugepage allocator once every minute */
47static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
48static struct task_struct *khugepaged_thread __read_mostly;
49static DEFINE_MUTEX(khugepaged_mutex);
50static DEFINE_SPINLOCK(khugepaged_mm_lock);
51static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
52/*
53 * default collapse hugepages if there is at least one pte mapped like
54 * it would have happened if the vma was large enough during page
55 * fault.
56 */
57static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
58
59static int khugepaged(void *none);
60static int mm_slots_hash_init(void);
61static int khugepaged_slab_init(void);
62static void khugepaged_slab_free(void);
63
64#define MM_SLOTS_HASH_HEADS 1024
65static struct hlist_head *mm_slots_hash __read_mostly;
66static struct kmem_cache *mm_slot_cache __read_mostly;
67
68/**
69 * struct mm_slot - hash lookup from mm to mm_slot
70 * @hash: hash collision list
71 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
72 * @mm: the mm that this information is valid for
73 */
74struct mm_slot {
75 struct hlist_node hash;
76 struct list_head mm_node;
77 struct mm_struct *mm;
78};
79
80/**
81 * struct khugepaged_scan - cursor for scanning
82 * @mm_head: the head of the mm list to scan
83 * @mm_slot: the current mm_slot we are scanning
84 * @address: the next address inside that to be scanned
85 *
86 * There is only the one khugepaged_scan instance of this cursor structure.
87 */
88struct khugepaged_scan {
89 struct list_head mm_head;
90 struct mm_slot *mm_slot;
91 unsigned long address;
92} khugepaged_scan = {
93 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
94};
95
96
97static int set_recommended_min_free_kbytes(void)
98{
99 struct zone *zone;
100 int nr_zones = 0;
101 unsigned long recommended_min;
102 extern int min_free_kbytes;
103
104 if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
105 &transparent_hugepage_flags) &&
106 !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
107 &transparent_hugepage_flags))
108 return 0;
109
110 for_each_populated_zone(zone)
111 nr_zones++;
112
113 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
114 recommended_min = pageblock_nr_pages * nr_zones * 2;
115
116 /*
117 * Make sure that on average at least two pageblocks are almost free
118 * of another type, one for a migratetype to fall back to and a
119 * second to avoid subsequent fallbacks of other types There are 3
120 * MIGRATE_TYPES we care about.
121 */
122 recommended_min += pageblock_nr_pages * nr_zones *
123 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
124
125 /* don't ever allow to reserve more than 5% of the lowmem */
126 recommended_min = min(recommended_min,
127 (unsigned long) nr_free_buffer_pages() / 20);
128 recommended_min <<= (PAGE_SHIFT-10);
129
130 if (recommended_min > min_free_kbytes)
131 min_free_kbytes = recommended_min;
132 setup_per_zone_wmarks();
133 return 0;
134}
135late_initcall(set_recommended_min_free_kbytes);
136
137static int start_khugepaged(void)
138{
139 int err = 0;
140 if (khugepaged_enabled()) {
141 int wakeup;
142 if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
143 err = -ENOMEM;
144 goto out;
145 }
146 mutex_lock(&khugepaged_mutex);
147 if (!khugepaged_thread)
148 khugepaged_thread = kthread_run(khugepaged, NULL,
149 "khugepaged");
150 if (unlikely(IS_ERR(khugepaged_thread))) {
151 printk(KERN_ERR
152 "khugepaged: kthread_run(khugepaged) failed\n");
153 err = PTR_ERR(khugepaged_thread);
154 khugepaged_thread = NULL;
155 }
156 wakeup = !list_empty(&khugepaged_scan.mm_head);
157 mutex_unlock(&khugepaged_mutex);
158 if (wakeup)
159 wake_up_interruptible(&khugepaged_wait);
160
161 set_recommended_min_free_kbytes();
162 } else
163 /* wakeup to exit */
164 wake_up_interruptible(&khugepaged_wait);
165out:
166 return err;
167}
168
169#ifdef CONFIG_SYSFS
170
171static ssize_t double_flag_show(struct kobject *kobj,
172 struct kobj_attribute *attr, char *buf,
173 enum transparent_hugepage_flag enabled,
174 enum transparent_hugepage_flag req_madv)
175{
176 if (test_bit(enabled, &transparent_hugepage_flags)) {
177 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
178 return sprintf(buf, "[always] madvise never\n");
179 } else if (test_bit(req_madv, &transparent_hugepage_flags))
180 return sprintf(buf, "always [madvise] never\n");
181 else
182 return sprintf(buf, "always madvise [never]\n");
183}
184static ssize_t double_flag_store(struct kobject *kobj,
185 struct kobj_attribute *attr,
186 const char *buf, size_t count,
187 enum transparent_hugepage_flag enabled,
188 enum transparent_hugepage_flag req_madv)
189{
190 if (!memcmp("always", buf,
191 min(sizeof("always")-1, count))) {
192 set_bit(enabled, &transparent_hugepage_flags);
193 clear_bit(req_madv, &transparent_hugepage_flags);
194 } else if (!memcmp("madvise", buf,
195 min(sizeof("madvise")-1, count))) {
196 clear_bit(enabled, &transparent_hugepage_flags);
197 set_bit(req_madv, &transparent_hugepage_flags);
198 } else if (!memcmp("never", buf,
199 min(sizeof("never")-1, count))) {
200 clear_bit(enabled, &transparent_hugepage_flags);
201 clear_bit(req_madv, &transparent_hugepage_flags);
202 } else
203 return -EINVAL;
204
205 return count;
206}
207
208static ssize_t enabled_show(struct kobject *kobj,
209 struct kobj_attribute *attr, char *buf)
210{
211 return double_flag_show(kobj, attr, buf,
212 TRANSPARENT_HUGEPAGE_FLAG,
213 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
214}
215static ssize_t enabled_store(struct kobject *kobj,
216 struct kobj_attribute *attr,
217 const char *buf, size_t count)
218{
219 ssize_t ret;
220
221 ret = double_flag_store(kobj, attr, buf, count,
222 TRANSPARENT_HUGEPAGE_FLAG,
223 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
224
225 if (ret > 0) {
226 int err = start_khugepaged();
227 if (err)
228 ret = err;
229 }
230
231 if (ret > 0 &&
232 (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
233 &transparent_hugepage_flags) ||
234 test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
235 &transparent_hugepage_flags)))
236 set_recommended_min_free_kbytes();
237
238 return ret;
239}
240static struct kobj_attribute enabled_attr =
241 __ATTR(enabled, 0644, enabled_show, enabled_store);
242
243static ssize_t single_flag_show(struct kobject *kobj,
244 struct kobj_attribute *attr, char *buf,
245 enum transparent_hugepage_flag flag)
246{
247 if (test_bit(flag, &transparent_hugepage_flags))
248 return sprintf(buf, "[yes] no\n");
249 else
250 return sprintf(buf, "yes [no]\n");
251}
252static ssize_t single_flag_store(struct kobject *kobj,
253 struct kobj_attribute *attr,
254 const char *buf, size_t count,
255 enum transparent_hugepage_flag flag)
256{
257 if (!memcmp("yes", buf,
258 min(sizeof("yes")-1, count))) {
259 set_bit(flag, &transparent_hugepage_flags);
260 } else if (!memcmp("no", buf,
261 min(sizeof("no")-1, count))) {
262 clear_bit(flag, &transparent_hugepage_flags);
263 } else
264 return -EINVAL;
265
266 return count;
267}
268
269/*
270 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
271 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
272 * memory just to allocate one more hugepage.
273 */
274static ssize_t defrag_show(struct kobject *kobj,
275 struct kobj_attribute *attr, char *buf)
276{
277 return double_flag_show(kobj, attr, buf,
278 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
279 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
280}
281static ssize_t defrag_store(struct kobject *kobj,
282 struct kobj_attribute *attr,
283 const char *buf, size_t count)
284{
285 return double_flag_store(kobj, attr, buf, count,
286 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
287 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
288}
289static struct kobj_attribute defrag_attr =
290 __ATTR(defrag, 0644, defrag_show, defrag_store);
291
292#ifdef CONFIG_DEBUG_VM
293static ssize_t debug_cow_show(struct kobject *kobj,
294 struct kobj_attribute *attr, char *buf)
295{
296 return single_flag_show(kobj, attr, buf,
297 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
298}
299static ssize_t debug_cow_store(struct kobject *kobj,
300 struct kobj_attribute *attr,
301 const char *buf, size_t count)
302{
303 return single_flag_store(kobj, attr, buf, count,
304 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
305}
306static struct kobj_attribute debug_cow_attr =
307 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
308#endif /* CONFIG_DEBUG_VM */
309
310static struct attribute *hugepage_attr[] = {
311 &enabled_attr.attr,
312 &defrag_attr.attr,
313#ifdef CONFIG_DEBUG_VM
314 &debug_cow_attr.attr,
315#endif
316 NULL,
317};
318
319static struct attribute_group hugepage_attr_group = {
320 .attrs = hugepage_attr,
321};
322
323static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
324 struct kobj_attribute *attr,
325 char *buf)
326{
327 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
328}
329
330static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
331 struct kobj_attribute *attr,
332 const char *buf, size_t count)
333{
334 unsigned long msecs;
335 int err;
336
337 err = strict_strtoul(buf, 10, &msecs);
338 if (err || msecs > UINT_MAX)
339 return -EINVAL;
340
341 khugepaged_scan_sleep_millisecs = msecs;
342 wake_up_interruptible(&khugepaged_wait);
343
344 return count;
345}
346static struct kobj_attribute scan_sleep_millisecs_attr =
347 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
348 scan_sleep_millisecs_store);
349
350static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
351 struct kobj_attribute *attr,
352 char *buf)
353{
354 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
355}
356
357static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
358 struct kobj_attribute *attr,
359 const char *buf, size_t count)
360{
361 unsigned long msecs;
362 int err;
363
364 err = strict_strtoul(buf, 10, &msecs);
365 if (err || msecs > UINT_MAX)
366 return -EINVAL;
367
368 khugepaged_alloc_sleep_millisecs = msecs;
369 wake_up_interruptible(&khugepaged_wait);
370
371 return count;
372}
373static struct kobj_attribute alloc_sleep_millisecs_attr =
374 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
375 alloc_sleep_millisecs_store);
376
377static ssize_t pages_to_scan_show(struct kobject *kobj,
378 struct kobj_attribute *attr,
379 char *buf)
380{
381 return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
382}
383static ssize_t pages_to_scan_store(struct kobject *kobj,
384 struct kobj_attribute *attr,
385 const char *buf, size_t count)
386{
387 int err;
388 unsigned long pages;
389
390 err = strict_strtoul(buf, 10, &pages);
391 if (err || !pages || pages > UINT_MAX)
392 return -EINVAL;
393
394 khugepaged_pages_to_scan = pages;
395
396 return count;
397}
398static struct kobj_attribute pages_to_scan_attr =
399 __ATTR(pages_to_scan, 0644, pages_to_scan_show,
400 pages_to_scan_store);
401
402static ssize_t pages_collapsed_show(struct kobject *kobj,
403 struct kobj_attribute *attr,
404 char *buf)
405{
406 return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
407}
408static struct kobj_attribute pages_collapsed_attr =
409 __ATTR_RO(pages_collapsed);
410
411static ssize_t full_scans_show(struct kobject *kobj,
412 struct kobj_attribute *attr,
413 char *buf)
414{
415 return sprintf(buf, "%u\n", khugepaged_full_scans);
416}
417static struct kobj_attribute full_scans_attr =
418 __ATTR_RO(full_scans);
419
420static ssize_t khugepaged_defrag_show(struct kobject *kobj,
421 struct kobj_attribute *attr, char *buf)
422{
423 return single_flag_show(kobj, attr, buf,
424 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
425}
426static ssize_t khugepaged_defrag_store(struct kobject *kobj,
427 struct kobj_attribute *attr,
428 const char *buf, size_t count)
429{
430 return single_flag_store(kobj, attr, buf, count,
431 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
432}
433static struct kobj_attribute khugepaged_defrag_attr =
434 __ATTR(defrag, 0644, khugepaged_defrag_show,
435 khugepaged_defrag_store);
436
437/*
438 * max_ptes_none controls if khugepaged should collapse hugepages over
439 * any unmapped ptes in turn potentially increasing the memory
440 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
441 * reduce the available free memory in the system as it
442 * runs. Increasing max_ptes_none will instead potentially reduce the
443 * free memory in the system during the khugepaged scan.
444 */
445static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
446 struct kobj_attribute *attr,
447 char *buf)
448{
449 return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
450}
451static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
452 struct kobj_attribute *attr,
453 const char *buf, size_t count)
454{
455 int err;
456 unsigned long max_ptes_none;
457
458 err = strict_strtoul(buf, 10, &max_ptes_none);
459 if (err || max_ptes_none > HPAGE_PMD_NR-1)
460 return -EINVAL;
461
462 khugepaged_max_ptes_none = max_ptes_none;
463
464 return count;
465}
466static struct kobj_attribute khugepaged_max_ptes_none_attr =
467 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
468 khugepaged_max_ptes_none_store);
469
470static struct attribute *khugepaged_attr[] = {
471 &khugepaged_defrag_attr.attr,
472 &khugepaged_max_ptes_none_attr.attr,
473 &pages_to_scan_attr.attr,
474 &pages_collapsed_attr.attr,
475 &full_scans_attr.attr,
476 &scan_sleep_millisecs_attr.attr,
477 &alloc_sleep_millisecs_attr.attr,
478 NULL,
479};
480
481static struct attribute_group khugepaged_attr_group = {
482 .attrs = khugepaged_attr,
483 .name = "khugepaged",
484};
485#endif /* CONFIG_SYSFS */
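Note: once hugepage_init() below registers khugepaged_attr_group, the attributes above appear under /sys/kernel/mm/transparent_hugepage/khugepaged/ (the group name plus the attribute names from the array). A small userspace sketch, not kernel code, that dumps the current tunables.

#include <stdio.h>

int main(void)
{
	static const char *names[] = {
		"defrag", "max_ptes_none", "pages_to_scan", "pages_collapsed",
		"full_scans", "scan_sleep_millisecs", "alloc_sleep_millisecs",
	};
	char path[128], val[64];
	size_t i;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/kernel/mm/transparent_hugepage/khugepaged/%s",
			 names[i]);
		f = fopen(path, "r");
		if (!f)
			continue;	/* file absent: THP not built in or older kernel */
		if (fgets(val, sizeof(val), f))
			printf("%s: %s", names[i], val);
		fclose(f);
	}
	return 0;
}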
486
487static int __init hugepage_init(void)
488{
489 int err;
490#ifdef CONFIG_SYSFS
491 static struct kobject *hugepage_kobj;
492#endif
493
494 err = -EINVAL;
495 if (!has_transparent_hugepage()) {
496 transparent_hugepage_flags = 0;
497 goto out;
498 }
499
500#ifdef CONFIG_SYSFS
501 err = -ENOMEM;
502 hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
503 if (unlikely(!hugepage_kobj)) {
504 printk(KERN_ERR "hugepage: failed kobject create\n");
505 goto out;
506 }
507
508 err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
509 if (err) {
 510 printk(KERN_ERR "hugepage: failed to register hugepage group\n");
511 goto out;
512 }
513
514 err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
515 if (err) {
 516 printk(KERN_ERR "hugepage: failed to register hugepage group\n");
517 goto out;
518 }
519#endif
520
521 err = khugepaged_slab_init();
522 if (err)
523 goto out;
524
525 err = mm_slots_hash_init();
526 if (err) {
527 khugepaged_slab_free();
528 goto out;
529 }
530
531 /*
532 * By default disable transparent hugepages on smaller systems,
533 * where the extra memory used could hurt more than TLB overhead
534 * is likely to save. The admin can still enable it through /sys.
535 */
536 if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
537 transparent_hugepage_flags = 0;
538
539 start_khugepaged();
540
541 set_recommended_min_free_kbytes();
542
543out:
544 return err;
545}
546module_init(hugepage_init)
547
548static int __init setup_transparent_hugepage(char *str)
549{
550 int ret = 0;
551 if (!str)
552 goto out;
553 if (!strcmp(str, "always")) {
554 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
555 &transparent_hugepage_flags);
556 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
557 &transparent_hugepage_flags);
558 ret = 1;
559 } else if (!strcmp(str, "madvise")) {
560 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
561 &transparent_hugepage_flags);
562 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
563 &transparent_hugepage_flags);
564 ret = 1;
565 } else if (!strcmp(str, "never")) {
566 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
567 &transparent_hugepage_flags);
568 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
569 &transparent_hugepage_flags);
570 ret = 1;
571 }
572out:
573 if (!ret)
574 printk(KERN_WARNING
575 "transparent_hugepage= cannot parse, ignored\n");
576 return ret;
577}
578__setup("transparent_hugepage=", setup_transparent_hugepage);
579
580static void prepare_pmd_huge_pte(pgtable_t pgtable,
581 struct mm_struct *mm)
582{
583 assert_spin_locked(&mm->page_table_lock);
584
585 /* FIFO */
586 if (!mm->pmd_huge_pte)
587 INIT_LIST_HEAD(&pgtable->lru);
588 else
589 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
590 mm->pmd_huge_pte = pgtable;
591}
592
593static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
594{
595 if (likely(vma->vm_flags & VM_WRITE))
596 pmd = pmd_mkwrite(pmd);
597 return pmd;
598}
599
600static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
601 struct vm_area_struct *vma,
602 unsigned long haddr, pmd_t *pmd,
603 struct page *page)
604{
605 int ret = 0;
606 pgtable_t pgtable;
607
608 VM_BUG_ON(!PageCompound(page));
609 pgtable = pte_alloc_one(mm, haddr);
610 if (unlikely(!pgtable)) {
611 mem_cgroup_uncharge_page(page);
612 put_page(page);
613 return VM_FAULT_OOM;
614 }
615
616 clear_huge_page(page, haddr, HPAGE_PMD_NR);
617 __SetPageUptodate(page);
618
619 spin_lock(&mm->page_table_lock);
620 if (unlikely(!pmd_none(*pmd))) {
621 spin_unlock(&mm->page_table_lock);
622 mem_cgroup_uncharge_page(page);
623 put_page(page);
624 pte_free(mm, pgtable);
625 } else {
626 pmd_t entry;
627 entry = mk_pmd(page, vma->vm_page_prot);
628 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
629 entry = pmd_mkhuge(entry);
630 /*
631 * The spinlocking to take the lru_lock inside
632 * page_add_new_anon_rmap() acts as a full memory
633 * barrier to be sure clear_huge_page writes become
634 * visible after the set_pmd_at() write.
635 */
636 page_add_new_anon_rmap(page, vma, haddr);
637 set_pmd_at(mm, haddr, pmd, entry);
638 prepare_pmd_huge_pte(pgtable, mm);
639 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
640 spin_unlock(&mm->page_table_lock);
641 }
642
643 return ret;
644}
645
646static inline gfp_t alloc_hugepage_gfpmask(int defrag)
647{
648 return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT);
649}
650
651static inline struct page *alloc_hugepage_vma(int defrag,
652 struct vm_area_struct *vma,
653 unsigned long haddr)
654{
655 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag),
656 HPAGE_PMD_ORDER, vma, haddr);
657}
658
659#ifndef CONFIG_NUMA
660static inline struct page *alloc_hugepage(int defrag)
661{
662 return alloc_pages(alloc_hugepage_gfpmask(defrag),
663 HPAGE_PMD_ORDER);
664}
665#endif
666
667int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
668 unsigned long address, pmd_t *pmd,
669 unsigned int flags)
670{
671 struct page *page;
672 unsigned long haddr = address & HPAGE_PMD_MASK;
673 pte_t *pte;
674
675 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
676 if (unlikely(anon_vma_prepare(vma)))
677 return VM_FAULT_OOM;
678 if (unlikely(khugepaged_enter(vma)))
679 return VM_FAULT_OOM;
680 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
681 vma, haddr);
682 if (unlikely(!page))
683 goto out;
684 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
685 put_page(page);
686 goto out;
687 }
688
689 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
690 }
691out:
692 /*
693 * Use __pte_alloc instead of pte_alloc_map, because we can't
694 * run pte_offset_map on the pmd, if an huge pmd could
695 * materialize from under us from a different thread.
696 */
697 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
698 return VM_FAULT_OOM;
699 /* if an huge pmd materialized from under us just retry later */
700 if (unlikely(pmd_trans_huge(*pmd)))
701 return 0;
702 /*
703 * A regular pmd is established and it can't morph into a huge pmd
704 * from under us anymore at this point because we hold the mmap_sem
705 * read mode and khugepaged takes it in write mode. So now it's
706 * safe to run pte_offset_map().
707 */
708 pte = pte_offset_map(pmd, address);
709 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
710}
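Note: do_huge_pmd_anonymous_page() above only installs a huge pmd when a whole PMD-sized range fits inside the vma. A userspace sketch, not part of this patch, of a mapping that gives the fault path that chance: it over-allocates so at least one PMD-aligned 2 MiB range falls inside the region and marks it with MADV_HUGEPAGE for the madvise policy; the fallback define is only for building against pre-THP headers.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* added by this patch series */
#endif

int main(void)
{
	size_t len = 4UL << 20;		/* 4 MiB: contains a PMD-aligned 2 MiB range */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");	/* may fail on older kernels */
	memset(p, 0x5a, len);	/* first touch takes the anonymous fault path */
	printf("touched %zu bytes at %p\n", len, p);
	return 0;
}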
711
712int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
713 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
714 struct vm_area_struct *vma)
715{
716 struct page *src_page;
717 pmd_t pmd;
718 pgtable_t pgtable;
719 int ret;
720
721 ret = -ENOMEM;
722 pgtable = pte_alloc_one(dst_mm, addr);
723 if (unlikely(!pgtable))
724 goto out;
725
726 spin_lock(&dst_mm->page_table_lock);
727 spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
728
729 ret = -EAGAIN;
730 pmd = *src_pmd;
731 if (unlikely(!pmd_trans_huge(pmd))) {
732 pte_free(dst_mm, pgtable);
733 goto out_unlock;
734 }
735 if (unlikely(pmd_trans_splitting(pmd))) {
736 /* split huge page running from under us */
737 spin_unlock(&src_mm->page_table_lock);
738 spin_unlock(&dst_mm->page_table_lock);
739 pte_free(dst_mm, pgtable);
740
741 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
742 goto out;
743 }
744 src_page = pmd_page(pmd);
745 VM_BUG_ON(!PageHead(src_page));
746 get_page(src_page);
747 page_dup_rmap(src_page);
748 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
749
750 pmdp_set_wrprotect(src_mm, addr, src_pmd);
751 pmd = pmd_mkold(pmd_wrprotect(pmd));
752 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
753 prepare_pmd_huge_pte(pgtable, dst_mm);
754
755 ret = 0;
756out_unlock:
757 spin_unlock(&src_mm->page_table_lock);
758 spin_unlock(&dst_mm->page_table_lock);
759out:
760 return ret;
761}
762
763/* no "address" argument so destroys page coloring of some arch */
764pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
765{
766 pgtable_t pgtable;
767
768 assert_spin_locked(&mm->page_table_lock);
769
770 /* FIFO */
771 pgtable = mm->pmd_huge_pte;
772 if (list_empty(&pgtable->lru))
773 mm->pmd_huge_pte = NULL;
774 else {
775 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
776 struct page, lru);
777 list_del(&pgtable->lru);
778 }
779 return pgtable;
780}
781
782static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
783 struct vm_area_struct *vma,
784 unsigned long address,
785 pmd_t *pmd, pmd_t orig_pmd,
786 struct page *page,
787 unsigned long haddr)
788{
789 pgtable_t pgtable;
790 pmd_t _pmd;
791 int ret = 0, i;
792 struct page **pages;
793
794 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
795 GFP_KERNEL);
796 if (unlikely(!pages)) {
797 ret |= VM_FAULT_OOM;
798 goto out;
799 }
800
801 for (i = 0; i < HPAGE_PMD_NR; i++) {
802 pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
803 vma, address);
804 if (unlikely(!pages[i] ||
805 mem_cgroup_newpage_charge(pages[i], mm,
806 GFP_KERNEL))) {
807 if (pages[i])
808 put_page(pages[i]);
809 mem_cgroup_uncharge_start();
810 while (--i >= 0) {
811 mem_cgroup_uncharge_page(pages[i]);
812 put_page(pages[i]);
813 }
814 mem_cgroup_uncharge_end();
815 kfree(pages);
816 ret |= VM_FAULT_OOM;
817 goto out;
818 }
819 }
820
821 for (i = 0; i < HPAGE_PMD_NR; i++) {
822 copy_user_highpage(pages[i], page + i,
 823 haddr + PAGE_SIZE*i, vma);
824 __SetPageUptodate(pages[i]);
825 cond_resched();
826 }
827
828 spin_lock(&mm->page_table_lock);
829 if (unlikely(!pmd_same(*pmd, orig_pmd)))
830 goto out_free_pages;
831 VM_BUG_ON(!PageHead(page));
832
833 pmdp_clear_flush_notify(vma, haddr, pmd);
834 /* leave pmd empty until pte is filled */
835
836 pgtable = get_pmd_huge_pte(mm);
837 pmd_populate(mm, &_pmd, pgtable);
838
839 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
840 pte_t *pte, entry;
841 entry = mk_pte(pages[i], vma->vm_page_prot);
842 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
843 page_add_new_anon_rmap(pages[i], vma, haddr);
844 pte = pte_offset_map(&_pmd, haddr);
845 VM_BUG_ON(!pte_none(*pte));
846 set_pte_at(mm, haddr, pte, entry);
847 pte_unmap(pte);
848 }
849 kfree(pages);
850
851 mm->nr_ptes++;
852 smp_wmb(); /* make pte visible before pmd */
853 pmd_populate(mm, pmd, pgtable);
854 page_remove_rmap(page);
855 spin_unlock(&mm->page_table_lock);
856
857 ret |= VM_FAULT_WRITE;
858 put_page(page);
859
860out:
861 return ret;
862
863out_free_pages:
864 spin_unlock(&mm->page_table_lock);
865 mem_cgroup_uncharge_start();
866 for (i = 0; i < HPAGE_PMD_NR; i++) {
867 mem_cgroup_uncharge_page(pages[i]);
868 put_page(pages[i]);
869 }
870 mem_cgroup_uncharge_end();
871 kfree(pages);
872 goto out;
873}
874
875int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
876 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
877{
878 int ret = 0;
879 struct page *page, *new_page;
880 unsigned long haddr;
881
882 VM_BUG_ON(!vma->anon_vma);
883 spin_lock(&mm->page_table_lock);
884 if (unlikely(!pmd_same(*pmd, orig_pmd)))
885 goto out_unlock;
886
887 page = pmd_page(orig_pmd);
888 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
889 haddr = address & HPAGE_PMD_MASK;
890 if (page_mapcount(page) == 1) {
891 pmd_t entry;
892 entry = pmd_mkyoung(orig_pmd);
893 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
894 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
895 update_mmu_cache(vma, address, entry);
896 ret |= VM_FAULT_WRITE;
897 goto out_unlock;
898 }
899 get_page(page);
900 spin_unlock(&mm->page_table_lock);
901
902 if (transparent_hugepage_enabled(vma) &&
903 !transparent_hugepage_debug_cow())
904 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
905 vma, haddr);
906 else
907 new_page = NULL;
908
909 if (unlikely(!new_page)) {
910 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
911 pmd, orig_pmd, page, haddr);
912 put_page(page);
913 goto out;
914 }
915
916 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
917 put_page(new_page);
918 put_page(page);
919 ret |= VM_FAULT_OOM;
920 goto out;
921 }
922
923 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
924 __SetPageUptodate(new_page);
925
926 spin_lock(&mm->page_table_lock);
927 put_page(page);
928 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
929 mem_cgroup_uncharge_page(new_page);
930 put_page(new_page);
931 } else {
932 pmd_t entry;
933 VM_BUG_ON(!PageHead(page));
934 entry = mk_pmd(new_page, vma->vm_page_prot);
935 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
936 entry = pmd_mkhuge(entry);
937 pmdp_clear_flush_notify(vma, haddr, pmd);
938 page_add_new_anon_rmap(new_page, vma, haddr);
939 set_pmd_at(mm, haddr, pmd, entry);
940 update_mmu_cache(vma, address, entry);
941 page_remove_rmap(page);
942 put_page(page);
943 ret |= VM_FAULT_WRITE;
944 }
945out_unlock:
946 spin_unlock(&mm->page_table_lock);
947out:
948 return ret;
949}
950
951struct page *follow_trans_huge_pmd(struct mm_struct *mm,
952 unsigned long addr,
953 pmd_t *pmd,
954 unsigned int flags)
955{
956 struct page *page = NULL;
957
958 assert_spin_locked(&mm->page_table_lock);
959
960 if (flags & FOLL_WRITE && !pmd_write(*pmd))
961 goto out;
962
963 page = pmd_page(*pmd);
964 VM_BUG_ON(!PageHead(page));
965 if (flags & FOLL_TOUCH) {
966 pmd_t _pmd;
967 /*
968 * We should set the dirty bit only for FOLL_WRITE but
969 * for now the dirty bit in the pmd is meaningless.
970 * And if the dirty bit will become meaningful and
971 * we'll only set it with FOLL_WRITE, an atomic
972 * set_bit will be required on the pmd to set the
973 * young bit, instead of the current set_pmd_at.
974 */
975 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
976 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
977 }
978 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
979 VM_BUG_ON(!PageCompound(page));
980 if (flags & FOLL_GET)
981 get_page(page);
982
983out:
984 return page;
985}
986
987int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
988 pmd_t *pmd)
989{
990 int ret = 0;
991
992 spin_lock(&tlb->mm->page_table_lock);
993 if (likely(pmd_trans_huge(*pmd))) {
994 if (unlikely(pmd_trans_splitting(*pmd))) {
995 spin_unlock(&tlb->mm->page_table_lock);
996 wait_split_huge_page(vma->anon_vma,
997 pmd);
998 } else {
999 struct page *page;
1000 pgtable_t pgtable;
1001 pgtable = get_pmd_huge_pte(tlb->mm);
1002 page = pmd_page(*pmd);
1003 pmd_clear(pmd);
1004 page_remove_rmap(page);
1005 VM_BUG_ON(page_mapcount(page) < 0);
1006 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1007 VM_BUG_ON(!PageHead(page));
1008 spin_unlock(&tlb->mm->page_table_lock);
1009 tlb_remove_page(tlb, page);
1010 pte_free(tlb->mm, pgtable);
1011 ret = 1;
1012 }
1013 } else
1014 spin_unlock(&tlb->mm->page_table_lock);
1015
1016 return ret;
1017}
1018
1019int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1020 unsigned long addr, unsigned long end,
1021 unsigned char *vec)
1022{
1023 int ret = 0;
1024
1025 spin_lock(&vma->vm_mm->page_table_lock);
1026 if (likely(pmd_trans_huge(*pmd))) {
1027 ret = !pmd_trans_splitting(*pmd);
1028 spin_unlock(&vma->vm_mm->page_table_lock);
1029 if (unlikely(!ret))
1030 wait_split_huge_page(vma->anon_vma, pmd);
1031 else {
1032 /*
1033 * All logical pages in the range are present
1034 * if backed by a huge page.
1035 */
1036 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1037 }
1038 } else
1039 spin_unlock(&vma->vm_mm->page_table_lock);
1040
1041 return ret;
1042}
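
Because a huge pmd maps one aligned HPAGE_PMD_SIZE range, mincore can mark every covered byte of the output vector in a single memset, as above. A user-space model of the length calculation, assuming 4 KiB pages and one 2 MiB huge page (illustrative only):

/* Illustrative sketch only -- not kernel code. */
#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT	12

int main(void)
{
	unsigned long addr = 0x700000000000UL;		/* made-up aligned start */
	unsigned long end  = addr + (2UL << 20);	/* one 2 MiB huge page */
	unsigned char vec[512];

	memset(vec, 1, (end - addr) >> PAGE_SHIFT);	/* all 512 subpages present */
	printf("%lu pages reported present\n", (end - addr) >> PAGE_SHIFT);
	return 0;
}
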
1043
1044int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1045 unsigned long addr, pgprot_t newprot)
1046{
1047 struct mm_struct *mm = vma->vm_mm;
1048 int ret = 0;
1049
1050 spin_lock(&mm->page_table_lock);
1051 if (likely(pmd_trans_huge(*pmd))) {
1052 if (unlikely(pmd_trans_splitting(*pmd))) {
1053 spin_unlock(&mm->page_table_lock);
1054 wait_split_huge_page(vma->anon_vma, pmd);
1055 } else {
1056 pmd_t entry;
1057
1058 entry = pmdp_get_and_clear(mm, addr, pmd);
1059 entry = pmd_modify(entry, newprot);
1060 set_pmd_at(mm, addr, pmd, entry);
1061 spin_unlock(&vma->vm_mm->page_table_lock);
1062 flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
1063 ret = 1;
1064 }
1065 } else
1066 spin_unlock(&vma->vm_mm->page_table_lock);
1067
1068 return ret;
1069}
1070
1071pmd_t *page_check_address_pmd(struct page *page,
1072 struct mm_struct *mm,
1073 unsigned long address,
1074 enum page_check_address_pmd_flag flag)
1075{
1076 pgd_t *pgd;
1077 pud_t *pud;
1078 pmd_t *pmd, *ret = NULL;
1079
1080 if (address & ~HPAGE_PMD_MASK)
1081 goto out;
1082
1083 pgd = pgd_offset(mm, address);
1084 if (!pgd_present(*pgd))
1085 goto out;
1086
1087 pud = pud_offset(pgd, address);
1088 if (!pud_present(*pud))
1089 goto out;
1090
1091 pmd = pmd_offset(pud, address);
1092 if (pmd_none(*pmd))
1093 goto out;
1094 if (pmd_page(*pmd) != page)
1095 goto out;
1096 /*
1097 * split_vma() may create temporary aliased mappings. There is
1098 * no risk as long as all huge pmds are found and have their
1099 * splitting bit set before __split_huge_page_refcount
1100 * runs. Finding the same huge pmd more than once during the
1101 * same rmap walk is not a problem.
1102 */
1103 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1104 pmd_trans_splitting(*pmd))
1105 goto out;
1106 if (pmd_trans_huge(*pmd)) {
1107 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1108 !pmd_trans_splitting(*pmd));
1109 ret = pmd;
1110 }
1111out:
1112 return ret;
1113}
1114
1115static int __split_huge_page_splitting(struct page *page,
1116 struct vm_area_struct *vma,
1117 unsigned long address)
1118{
1119 struct mm_struct *mm = vma->vm_mm;
1120 pmd_t *pmd;
1121 int ret = 0;
1122
1123 spin_lock(&mm->page_table_lock);
1124 pmd = page_check_address_pmd(page, mm, address,
1125 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
1126 if (pmd) {
1127 /*
1128 * We can't temporarily set the pmd to null in order
1129 * to split it; the pmd must remain marked huge at all
1130 * times or the VM won't take the pmd_trans_huge paths
1131 * and it won't wait on the anon_vma->root->lock to
1132 * serialize against split_huge_page*.
1133 */
1134 pmdp_splitting_flush_notify(vma, address, pmd);
1135 ret = 1;
1136 }
1137 spin_unlock(&mm->page_table_lock);
1138
1139 return ret;
1140}
1141
1142static void __split_huge_page_refcount(struct page *page)
1143{
1144 int i;
1145 unsigned long head_index = page->index;
1146 struct zone *zone = page_zone(page);
1147 int zonestat;
1148
1149 /* prevent PageLRU from going away from under us, and freeze lru stats */
1150 spin_lock_irq(&zone->lru_lock);
1151 compound_lock(page);
1152
1153 for (i = 1; i < HPAGE_PMD_NR; i++) {
1154 struct page *page_tail = page + i;
1155
1156 /* tail_page->_count cannot change */
1157 atomic_sub(atomic_read(&page_tail->_count), &page->_count);
1158 BUG_ON(page_count(page) <= 0);
1159 atomic_add(page_mapcount(page) + 1, &page_tail->_count);
1160 BUG_ON(atomic_read(&page_tail->_count) <= 0);
1161
1162 /* after clearing PageTail the gup refcount can be released */
1163 smp_mb();
1164
1165 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1166 page_tail->flags |= (page->flags &
1167 ((1L << PG_referenced) |
1168 (1L << PG_swapbacked) |
1169 (1L << PG_mlocked) |
1170 (1L << PG_uptodate)));
1171 page_tail->flags |= (1L << PG_dirty);
1172
1173 /*
1174 * 1) clear PageTail before overwriting first_page
1175 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1176 */
1177 smp_wmb();
1178
1179 /*
1180 * __split_huge_page_splitting() already set the
1181 * splitting bit in all pmds that could map this
1182 * hugepage; that ensures no CPU can alter the
1183 * mapcount on the head page. The mapcount is only
1184 * accounted in the head page and it has to be
1185 * transferred to all tail pages in the code below. So
1186 * for this code to be safe, the mapcount can't change
1187 * during the split. But that doesn't mean userland can't
1188 * keep changing and reading the page contents while
1189 * we transfer the mapcount, so the pmd splitting
1190 * status is achieved setting a reserved bit in the
1191 * pmd, not by clearing the present bit.
1192 */
1193 BUG_ON(page_mapcount(page_tail));
1194 page_tail->_mapcount = page->_mapcount;
1195
1196 BUG_ON(page_tail->mapping);
1197 page_tail->mapping = page->mapping;
1198
1199 page_tail->index = ++head_index;
1200
1201 BUG_ON(!PageAnon(page_tail));
1202 BUG_ON(!PageUptodate(page_tail));
1203 BUG_ON(!PageDirty(page_tail));
1204 BUG_ON(!PageSwapBacked(page_tail));
1205
1206 lru_add_page_tail(zone, page, page_tail);
1207 }
1208
1209 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1210 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1211
1212 /*
1213 * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
1214 * so adjust those appropriately if this page is on the LRU.
1215 */
1216 if (PageLRU(page)) {
1217 zonestat = NR_LRU_BASE + page_lru(page);
1218 __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
1219 }
1220
1221 ClearPageCompound(page);
1222 compound_unlock(page);
1223 spin_unlock_irq(&zone->lru_lock);
1224
1225 for (i = 1; i < HPAGE_PMD_NR; i++) {
1226 struct page *page_tail = page + i;
1227 BUG_ON(page_count(page_tail) <= 0);
1228 /*
1229 * Tail pages may be freed if there wasn't any mapping,
1230 * for example if add_to_swap() is running on a lru page that
1231 * had its mapping zapped. And freeing these pages
1232 * requires taking the lru_lock so we do the put_page
1233 * of the tail pages after the split is complete.
1234 */
1235 put_page(page_tail);
1236 }
1237
1238 /*
1239 * Only the head page (now become a regular page) is required
1240 * to be pinned by the caller.
1241 */
1242 BUG_ON(page_count(page) <= 0);
1243}
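
The atomic_sub()/atomic_add() pair above redistributes the head page's pins: speculative or gup references already sitting on a tail are subtracted from the head, and each tail then gains mapcount + 1 references (its mappings plus the reference dropped by the put_page() loop at the end). A small user-space model of that bookkeeping, with invented example counts (illustrative only, not part of the patch):

/* Illustrative sketch only -- the counts are made up. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	int head_count = 10;	/* head refcount before the split (example) */
	int mapcount   = 4;	/* head mapcount, inherited by every tail */
	int tail_count = 2;	/* gup/speculative refs already on this tail */

	head_count -= tail_count;	/* mirrors the atomic_sub() above */
	tail_count += mapcount + 1;	/* mirrors the atomic_add() above */

	assert(head_count > 0 && tail_count > 0);
	printf("head=%d tail=%d\n", head_count, tail_count);
	return 0;
}
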
1244
1245static int __split_huge_page_map(struct page *page,
1246 struct vm_area_struct *vma,
1247 unsigned long address)
1248{
1249 struct mm_struct *mm = vma->vm_mm;
1250 pmd_t *pmd, _pmd;
1251 int ret = 0, i;
1252 pgtable_t pgtable;
1253 unsigned long haddr;
1254
1255 spin_lock(&mm->page_table_lock);
1256 pmd = page_check_address_pmd(page, mm, address,
1257 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1258 if (pmd) {
1259 pgtable = get_pmd_huge_pte(mm);
1260 pmd_populate(mm, &_pmd, pgtable);
1261
1262 for (i = 0, haddr = address; i < HPAGE_PMD_NR;
1263 i++, haddr += PAGE_SIZE) {
1264 pte_t *pte, entry;
1265 BUG_ON(PageCompound(page+i));
1266 entry = mk_pte(page + i, vma->vm_page_prot);
1267 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1268 if (!pmd_write(*pmd))
1269 entry = pte_wrprotect(entry);
1270 else
1271 BUG_ON(page_mapcount(page) != 1);
1272 if (!pmd_young(*pmd))
1273 entry = pte_mkold(entry);
1274 pte = pte_offset_map(&_pmd, haddr);
1275 BUG_ON(!pte_none(*pte));
1276 set_pte_at(mm, haddr, pte, entry);
1277 pte_unmap(pte);
1278 }
1279
1280 mm->nr_ptes++;
1281 smp_wmb(); /* make pte visible before pmd */
1282 /*
1283 * Up to this point the pmd is present and huge and
1284 * userland has full access to the hugepage
1285 * during the split (which happens in place). If we
1286 * overwrite the pmd with the not-huge version
1287 * pointing to the pte here (which of course we could
1288 * if all CPUs were bug free), userland could trigger
1289 * a small page size TLB miss on the small sized TLB
1290 * while the hugepage TLB entry is still established
1291 * in the huge TLB. Some CPUs don't like that. See
1292 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
1293 * Erratum 383 on page 93. Intel should be safe, but it
1294 * also warns that it's only safe if the permission
1295 * and cache attributes of the two entries loaded in
1296 * the two TLBs are identical (which should be the case
1297 * here). But it is generally safer to never allow
1298 * small and huge TLB entries for the same virtual
1299 * address to be loaded simultaneously. So instead of
1300 * doing "pmd_populate(); flush_tlb_range();" we first
1301 * mark the current pmd notpresent (atomically because
1302 * here the pmd_trans_huge and pmd_trans_splitting
1303 * must remain set at all times on the pmd until the
1304 * split is complete for this pmd), then we flush the
1305 * SMP TLB and finally we write the non-huge version
1306 * of the pmd entry with pmd_populate.
1307 */
1308 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
1309 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1310 pmd_populate(mm, pmd, pgtable);
1311 ret = 1;
1312 }
1313 spin_unlock(&mm->page_table_lock);
1314
1315 return ret;
1316}
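
The long comment above boils down to an ordering constraint: the huge translation must be invalidated on all CPUs before the regular pte page becomes visible, so that no CPU ever holds a huge and a small TLB entry for the same address at once. A stub sketch of that sequence (illustrative only; nothing here is kernel API):

/* Illustrative sketch only -- not kernel code. */
#include <stdio.h>

static void mark_pmd_notpresent(void)
{
	puts("1. pmd marked not-present, still trans_huge + trans_splitting");
}

static void flush_smp_tlb(void)
{
	puts("2. flush_tlb_range() over the 2 MiB range on all CPUs");
}

static void populate_regular_pmd(void)
{
	puts("3. pmd_populate() points the pmd at the regular pte page");
}

int main(void)
{
	mark_pmd_notpresent();
	flush_smp_tlb();
	populate_regular_pmd();
	return 0;
}
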
1317
1318/* must be called with anon_vma->root->lock held */
1319static void __split_huge_page(struct page *page,
1320 struct anon_vma *anon_vma)
1321{
1322 int mapcount, mapcount2;
1323 struct anon_vma_chain *avc;
1324
1325 BUG_ON(!PageHead(page));
1326 BUG_ON(PageTail(page));
1327
1328 mapcount = 0;
1329 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1330 struct vm_area_struct *vma = avc->vma;
1331 unsigned long addr = vma_address(page, vma);
1332 BUG_ON(is_vma_temporary_stack(vma));
1333 if (addr == -EFAULT)
1334 continue;
1335 mapcount += __split_huge_page_splitting(page, vma, addr);
1336 }
1337 /*
1338 * It is critical that new vmas are added to the tail of the
1339 * anon_vma list. This guarantees that if copy_huge_pmd() runs
1340 * and establishes a child pmd before
1341 * __split_huge_page_splitting() freezes the parent pmd (so if
1342 * we fail to prevent copy_huge_pmd() from running until the
1343 * whole __split_huge_page() is complete), we will still see
1344 * the newly established pmd of the child later during the
1345 * walk, to be able to set it as pmd_trans_splitting too.
1346 */
1347 if (mapcount != page_mapcount(page))
1348 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1349 mapcount, page_mapcount(page));
1350 BUG_ON(mapcount != page_mapcount(page));
1351
1352 __split_huge_page_refcount(page);
1353
1354 mapcount2 = 0;
1355 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1356 struct vm_area_struct *vma = avc->vma;
1357 unsigned long addr = vma_address(page, vma);
1358 BUG_ON(is_vma_temporary_stack(vma));
1359 if (addr == -EFAULT)
1360 continue;
1361 mapcount2 += __split_huge_page_map(page, vma, addr);
1362 }
1363 if (mapcount != mapcount2)
1364 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
1365 mapcount, mapcount2, page_mapcount(page));
1366 BUG_ON(mapcount != mapcount2);
1367}
1368
1369int split_huge_page(struct page *page)
1370{
1371 struct anon_vma *anon_vma;
1372 int ret = 1;
1373
1374 BUG_ON(!PageAnon(page));
1375 anon_vma = page_lock_anon_vma(page);
1376 if (!anon_vma)
1377 goto out;
1378 ret = 0;
1379 if (!PageCompound(page))
1380 goto out_unlock;
1381
1382 BUG_ON(!PageSwapBacked(page));
1383 __split_huge_page(page, anon_vma);
1384
1385 BUG_ON(PageCompound(page));
1386out_unlock:
1387 page_unlock_anon_vma(anon_vma);
1388out:
1389 return ret;
1390}
1391
1392int hugepage_madvise(struct vm_area_struct *vma,
1393 unsigned long *vm_flags, int advice)
1394{
1395 switch (advice) {
1396 case MADV_HUGEPAGE:
1397 /*
1398 * Be somewhat over-protective like KSM for now!
1399 */
1400 if (*vm_flags & (VM_HUGEPAGE |
1401 VM_SHARED | VM_MAYSHARE |
1402 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1403 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1404 VM_MIXEDMAP | VM_SAO))
1405 return -EINVAL;
1406 *vm_flags &= ~VM_NOHUGEPAGE;
1407 *vm_flags |= VM_HUGEPAGE;
1408 /*
1409 * If the vma becomes good for khugepaged to scan,
1410 * register it here without waiting for a page fault
1411 * that may not happen any time soon.
1412 */
1413 if (unlikely(khugepaged_enter_vma_merge(vma)))
1414 return -ENOMEM;
1415 break;
1416 case MADV_NOHUGEPAGE:
1417 /*
1418 * Be somewhat over-protective like KSM for now!
1419 */
1420 if (*vm_flags & (VM_NOHUGEPAGE |
1421 VM_SHARED | VM_MAYSHARE |
1422 VM_PFNMAP | VM_IO | VM_DONTEXPAND |
1423 VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE |
1424 VM_MIXEDMAP | VM_SAO))
1425 return -EINVAL;
1426 *vm_flags &= ~VM_HUGEPAGE;
1427 *vm_flags |= VM_NOHUGEPAGE;
1428 /*
1429 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
1430 * this vma even if we leave the mm registered in khugepaged if
1431 * it got registered before VM_NOHUGEPAGE was set.
1432 */
1433 break;
1434 }
1435
1436 return 0;
1437}
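
From user space the two new advice values are used like any other madvise() hint. A hedged usage sketch follows; the fallback #defines are an assumption in case the libc headers predate this patch (14/15 match asm-generic/mman-common.h):

/* Illustrative sketch only -- not part of the patch. */
#include <sys/mman.h>
#include <stdio.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE	14	/* assumed fallback, see note above */
#endif
#ifndef MADV_NOHUGEPAGE
#define MADV_NOHUGEPAGE	15	/* assumed fallback, see note above */
#endif

int main(void)
{
	size_t len = 16UL << 20;	/* 16 MiB anonymous region */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* With the sysfs default set to "madvise", only regions flagged
	 * this way are eligible for huge pages and khugepaged. */
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");

	munmap(p, len);
	return 0;
}
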
1438
1439static int __init khugepaged_slab_init(void)
1440{
1441 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1442 sizeof(struct mm_slot),
1443 __alignof__(struct mm_slot), 0, NULL);
1444 if (!mm_slot_cache)
1445 return -ENOMEM;
1446
1447 return 0;
1448}
1449
1450static void __init khugepaged_slab_free(void)
1451{
1452 kmem_cache_destroy(mm_slot_cache);
1453 mm_slot_cache = NULL;
1454}
1455
1456static inline struct mm_slot *alloc_mm_slot(void)
1457{
1458 if (!mm_slot_cache) /* initialization failed */
1459 return NULL;
1460 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1461}
1462
1463static inline void free_mm_slot(struct mm_slot *mm_slot)
1464{
1465 kmem_cache_free(mm_slot_cache, mm_slot);
1466}
1467
1468static int __init mm_slots_hash_init(void)
1469{
1470 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1471 GFP_KERNEL);
1472 if (!mm_slots_hash)
1473 return -ENOMEM;
1474 return 0;
1475}
1476
1477#if 0
1478static void __init mm_slots_hash_free(void)
1479{
1480 kfree(mm_slots_hash);
1481 mm_slots_hash = NULL;
1482}
1483#endif
1484
1485static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1486{
1487 struct mm_slot *mm_slot;
1488 struct hlist_head *bucket;
1489 struct hlist_node *node;
1490
1491 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1492 % MM_SLOTS_HASH_HEADS];
1493 hlist_for_each_entry(mm_slot, node, bucket, hash) {
1494 if (mm == mm_slot->mm)
1495 return mm_slot;
1496 }
1497 return NULL;
1498}
1499
1500static void insert_to_mm_slots_hash(struct mm_struct *mm,
1501 struct mm_slot *mm_slot)
1502{
1503 struct hlist_head *bucket;
1504
1505 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1506 % MM_SLOTS_HASH_HEADS];
1507 mm_slot->mm = mm;
1508 hlist_add_head(&mm_slot->hash, bucket);
1509}
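
The bucket index above divides the mm pointer by sizeof(struct mm_struct) before taking the modulus, which strips the slab-alignment bits that would otherwise leave most buckets unused. A user-space model with made-up values (MM_SLOTS_HASH_HEADS and the structure size are assumptions for the example):

/* Illustrative sketch only -- not kernel code. */
#include <stdio.h>

#define MM_SLOTS_HASH_HEADS	1024	/* assumed for the example */
#define SIZEOF_MM_STRUCT	896	/* made-up sizeof(struct mm_struct) */

int main(void)
{
	unsigned long mm = 0xffff880123456000UL;	/* made-up slab pointer */
	unsigned long bucket =
		(mm / SIZEOF_MM_STRUCT) % MM_SLOTS_HASH_HEADS;

	printf("mm %#lx hashes to bucket %lu\n", mm, bucket);
	return 0;
}
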
1510
1511static inline int khugepaged_test_exit(struct mm_struct *mm)
1512{
1513 return atomic_read(&mm->mm_users) == 0;
1514}
1515
1516int __khugepaged_enter(struct mm_struct *mm)
1517{
1518 struct mm_slot *mm_slot;
1519 int wakeup;
1520
1521 mm_slot = alloc_mm_slot();
1522 if (!mm_slot)
1523 return -ENOMEM;
1524
1525 /* __khugepaged_exit() must not run from under us */
1526 VM_BUG_ON(khugepaged_test_exit(mm));
1527 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1528 free_mm_slot(mm_slot);
1529 return 0;
1530 }
1531
1532 spin_lock(&khugepaged_mm_lock);
1533 insert_to_mm_slots_hash(mm, mm_slot);
1534 /*
1535 * Insert just behind the scanning cursor, to let the area settle
1536 * down a little.
1537 */
1538 wakeup = list_empty(&khugepaged_scan.mm_head);
1539 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1540 spin_unlock(&khugepaged_mm_lock);
1541
1542 atomic_inc(&mm->mm_count);
1543 if (wakeup)
1544 wake_up_interruptible(&khugepaged_wait);
1545
1546 return 0;
1547}
1548
1549int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1550{
1551 unsigned long hstart, hend;
1552 if (!vma->anon_vma)
1553 /*
1554 * Not yet faulted in so we will register later in the
1555 * page fault if needed.
1556 */
1557 return 0;
1558 if (vma->vm_file || vma->vm_ops)
1559 /* khugepaged not yet working on file or special mappings */
1560 return 0;
1561 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1562 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1563 hend = vma->vm_end & HPAGE_PMD_MASK;
1564 if (hstart < hend)
1565 return khugepaged_enter(vma);
1566 return 0;
1567}
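
The hstart/hend computation rounds the vma start up and the end down to HPAGE_PMD_SIZE boundaries; khugepaged registers the vma only if at least one aligned range fits entirely inside it. A user-space check of that rounding, with a made-up vma and 2 MiB huge pages assumed (illustrative only):

/* Illustrative sketch only -- not kernel code. */
#include <stdio.h>

#define HPAGE_PMD_SIZE	(2UL << 20)
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long vm_start = 0x600000123000UL;	/* made-up, unaligned */
	unsigned long vm_end   = vm_start + (8UL << 20);

	unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
	unsigned long hend   = vm_end & HPAGE_PMD_MASK;

	printf("hstart=%#lx hend=%#lx eligible=%d\n",
	       hstart, hend, hstart < hend);
	return 0;
}
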
1568
1569void __khugepaged_exit(struct mm_struct *mm)
1570{
1571 struct mm_slot *mm_slot;
1572 int free = 0;
1573
1574 spin_lock(&khugepaged_mm_lock);
1575 mm_slot = get_mm_slot(mm);
1576 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
1577 hlist_del(&mm_slot->hash);
1578 list_del(&mm_slot->mm_node);
1579 free = 1;
1580 }
1581
1582 if (free) {
1583 spin_unlock(&khugepaged_mm_lock);
1584 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1585 free_mm_slot(mm_slot);
1586 mmdrop(mm);
1587 } else if (mm_slot) {
1588 spin_unlock(&khugepaged_mm_lock);
1589 /*
1590 * This is required to serialize against
1591 * khugepaged_test_exit() (which is guaranteed to run
1592 * under mmap_sem read mode). Stop here (once we
1593 * return, all pagetables will be destroyed) until
1594 * khugepaged has finished working on the pagetables
1595 * under the mmap_sem.
1596 */
1597 down_write(&mm->mmap_sem);
1598 up_write(&mm->mmap_sem);
1599 } else
1600 spin_unlock(&khugepaged_mm_lock);
1601}
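
The otherwise empty down_write()/up_write() pair above is a drain barrier: it cannot complete until every khugepaged walker that took mmap_sem for reading has let go. A rough user-space analogue using a pthread rwlock (an analogy for illustration, not the kernel primitive):

/* Illustrative sketch only -- build with -pthread. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t mmap_sem = PTHREAD_RWLOCK_INITIALIZER;

static void *scanner(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&mmap_sem);	/* khugepaged walking the mm */
	/* ... scan page tables ... */
	pthread_rwlock_unlock(&mmap_sem);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, scanner, NULL);

	/* Taking and dropping the lock for writing guarantees any reader
	 * that entered before us has finished. */
	pthread_rwlock_wrlock(&mmap_sem);
	pthread_rwlock_unlock(&mmap_sem);
	puts("no scanner can still be inside the page tables");

	pthread_join(t, NULL);
	return 0;
}
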
1602
1603static void release_pte_page(struct page *page)
1604{
1605 /* 0 stands for page_is_file_cache(page) == false */
1606 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
1607 unlock_page(page);
1608 putback_lru_page(page);
1609}
1610
1611static void release_pte_pages(pte_t *pte, pte_t *_pte)
1612{
1613 while (--_pte >= pte) {
1614 pte_t pteval = *_pte;
1615 if (!pte_none(pteval))
1616 release_pte_page(pte_page(pteval));
1617 }
1618}
1619
1620static void release_all_pte_pages(pte_t *pte)
1621{
1622 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1623}
1624
1625static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1626 unsigned long address,
1627 pte_t *pte)
1628{
1629 struct page *page;
1630 pte_t *_pte;
1631 int referenced = 0, isolated = 0, none = 0;
1632 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1633 _pte++, address += PAGE_SIZE) {
1634 pte_t pteval = *_pte;
1635 if (pte_none(pteval)) {
1636 if (++none <= khugepaged_max_ptes_none)
1637 continue;
1638 else {
1639 release_pte_pages(pte, _pte);
1640 goto out;
1641 }
1642 }
1643 if (!pte_present(pteval) || !pte_write(pteval)) {
1644 release_pte_pages(pte, _pte);
1645 goto out;
1646 }
1647 page = vm_normal_page(vma, address, pteval);
1648 if (unlikely(!page)) {
1649 release_pte_pages(pte, _pte);
1650 goto out;
1651 }
1652 VM_BUG_ON(PageCompound(page));
1653 BUG_ON(!PageAnon(page));
1654 VM_BUG_ON(!PageSwapBacked(page));
1655
1656 /* cannot use mapcount: can't collapse if there's a gup pin */
1657 if (page_count(page) != 1) {
1658 release_pte_pages(pte, _pte);
1659 goto out;
1660 }
1661 /*
1662 * We can do it before isolate_lru_page because the
1663 * page can't be freed from under us. NOTE: PG_lock
1664 * is needed to serialize against split_huge_page
1665 * when invoked from the VM.
1666 */
1667 if (!trylock_page(page)) {
1668 release_pte_pages(pte, _pte);
1669 goto out;
1670 }
1671 /*
1672 * Isolate the page to avoid collapsing an hugepage
1673 * currently in use by the VM.
1674 */
1675 if (isolate_lru_page(page)) {
1676 unlock_page(page);
1677 release_pte_pages(pte, _pte);
1678 goto out;
1679 }
1680 /* 0 stands for page_is_file_cache(page) == false */
1681 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
1682 VM_BUG_ON(!PageLocked(page));
1683 VM_BUG_ON(PageLRU(page));
1684
1685 /* If no mapped pte is young, don't collapse the page */
1686 if (pte_young(pteval) || PageReferenced(page) ||
1687 mmu_notifier_test_young(vma->vm_mm, address))
1688 referenced = 1;
1689 }
1690 if (unlikely(!referenced))
1691 release_all_pte_pages(pte);
1692 else
1693 isolated = 1;
1694out:
1695 return isolated;
1696}
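
Whether the isolate pass gives up depends on how many empty ptes it has tolerated so far versus khugepaged_max_ptes_none. A user-space model of that cutoff (HPAGE_PMD_NR - 1 is the assumed default limit; illustrative only):

/* Illustrative sketch only -- not kernel code. */
#include <stdio.h>

#define HPAGE_PMD_NR	512

int main(void)
{
	int max_ptes_none = HPAGE_PMD_NR - 1;	/* assumed default limit */
	int mapped = 1;				/* ptes actually populated */
	int none = HPAGE_PMD_NR - mapped;

	/* With the assumed default, a single populated pte already makes
	 * the 2 MiB range a collapse candidate. */
	printf("collapse allowed: %d\n", none <= max_ptes_none);
	return 0;
}
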
1697
1698static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1699 struct vm_area_struct *vma,
1700 unsigned long address,
1701 spinlock_t *ptl)
1702{
1703 pte_t *_pte;
1704 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
1705 pte_t pteval = *_pte;
1706 struct page *src_page;
1707
1708 if (pte_none(pteval)) {
1709 clear_user_highpage(page, address);
1710 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
1711 } else {
1712 src_page = pte_page(pteval);
1713 copy_user_highpage(page, src_page, address, vma);
1714 VM_BUG_ON(page_mapcount(src_page) != 1);
1715 VM_BUG_ON(page_count(src_page) != 2);
1716 release_pte_page(src_page);
1717 /*
1718 * ptl mostly unnecessary, but preempt has to
1719 * be disabled to update the per-cpu stats
1720 * inside page_remove_rmap().
1721 */
1722 spin_lock(ptl);
1723 /*
1724 * paravirt calls inside pte_clear here are
1725 * superfluous.
1726 */
1727 pte_clear(vma->vm_mm, address, _pte);
1728 page_remove_rmap(src_page);
1729 spin_unlock(ptl);
1730 free_page_and_swap_cache(src_page);
1731 }
1732
1733 address += PAGE_SIZE;
1734 page++;
1735 }
1736}
1737
1738static void collapse_huge_page(struct mm_struct *mm,
1739 unsigned long address,
1740 struct page **hpage,
1741 struct vm_area_struct *vma)
1742{
1743 pgd_t *pgd;
1744 pud_t *pud;
1745 pmd_t *pmd, _pmd;
1746 pte_t *pte;
1747 pgtable_t pgtable;
1748 struct page *new_page;
1749 spinlock_t *ptl;
1750 int isolated;
1751 unsigned long hstart, hend;
1752
1753 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1754#ifndef CONFIG_NUMA
1755 VM_BUG_ON(!*hpage);
1756 new_page = *hpage;
1757#else
1758 VM_BUG_ON(*hpage);
1759 /*
1760 * Allocate the page while the vma is still valid and under
1761 * the mmap_sem read mode so there is no memory allocation
1762 * later when we take the mmap_sem in write mode. This is more
1763 * friendly behavior (OTOH it may actually hide bugs) towards
1764 * filesystems in userland with daemons allocating memory in
1765 * the userland I/O paths. Allocating memory with the
1766 * mmap_sem in read mode is also a good idea to allow greater
1767 * scalability.
1768 */
1769 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
1770 if (unlikely(!new_page)) {
1771 up_read(&mm->mmap_sem);
1772 *hpage = ERR_PTR(-ENOMEM);
1773 return;
1774 }
1775#endif
1776 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1777 up_read(&mm->mmap_sem);
1778 put_page(new_page);
1779 return;
1780 }
1781
1782 /* after allocating the hugepage upgrade to mmap_sem write mode */
1783 up_read(&mm->mmap_sem);
1784
1785 /*
1786 * Prevent all access to pagetables with the exception of
1787 * gup_fast later handled by the ptep_clear_flush and the VM
1788 * handled by the anon_vma lock + PG_lock.
1789 */
1790 down_write(&mm->mmap_sem);
1791 if (unlikely(khugepaged_test_exit(mm)))
1792 goto out;
1793
1794 vma = find_vma(mm, address);
1795 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1796 hend = vma->vm_end & HPAGE_PMD_MASK;
1797 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1798 goto out;
1799
1800 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1801 (vma->vm_flags & VM_NOHUGEPAGE))
1802 goto out;
1803
1804 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
1805 if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
1806 goto out;
1807 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
1808
1809 pgd = pgd_offset(mm, address);
1810 if (!pgd_present(*pgd))
1811 goto out;
1812
1813 pud = pud_offset(pgd, address);
1814 if (!pud_present(*pud))
1815 goto out;
1816
1817 pmd = pmd_offset(pud, address);
1818 /* pmd can't go away or become huge under us */
1819 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1820 goto out;
1821
1822 anon_vma_lock(vma->anon_vma);
1823
1824 pte = pte_offset_map(pmd, address);
1825 ptl = pte_lockptr(mm, pmd);
1826
1827 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1828 /*
1829 * After this gup_fast can't run anymore. This also removes
1830 * any huge TLB entry from the CPU so we won't allow
1831 * huge and small TLB entries for the same virtual address
1832 * to avoid the risk of CPU bugs in that area.
1833 */
1834 _pmd = pmdp_clear_flush_notify(vma, address, pmd);
1835 spin_unlock(&mm->page_table_lock);
1836
1837 spin_lock(ptl);
1838 isolated = __collapse_huge_page_isolate(vma, address, pte);
1839 spin_unlock(ptl);
1840 pte_unmap(pte);
1841
1842 if (unlikely(!isolated)) {
1843 spin_lock(&mm->page_table_lock);
1844 BUG_ON(!pmd_none(*pmd));
1845 set_pmd_at(mm, address, pmd, _pmd);
1846 spin_unlock(&mm->page_table_lock);
1847 anon_vma_unlock(vma->anon_vma);
1848 mem_cgroup_uncharge_page(new_page);
1849 goto out;
1850 }
1851
1852 /*
1853 * All pages are isolated and locked so anon_vma rmap
1854 * can't run anymore.
1855 */
1856 anon_vma_unlock(vma->anon_vma);
1857
1858 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
1859 __SetPageUptodate(new_page);
1860 pgtable = pmd_pgtable(_pmd);
1861 VM_BUG_ON(page_count(pgtable) != 1);
1862 VM_BUG_ON(page_mapcount(pgtable) != 0);
1863
1864 _pmd = mk_pmd(new_page, vma->vm_page_prot);
1865 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1866 _pmd = pmd_mkhuge(_pmd);
1867
1868 /*
1869 * spin_lock() below is not the equivalent of smp_wmb(), so
1870 * this is needed to keep the copy_huge_page writes from becoming
1871 * visible after the set_pmd_at() write.
1872 */
1873 smp_wmb();
1874
1875 spin_lock(&mm->page_table_lock);
1876 BUG_ON(!pmd_none(*pmd));
1877 page_add_new_anon_rmap(new_page, vma, address);
1878 set_pmd_at(mm, address, pmd, _pmd);
1879 update_mmu_cache(vma, address, entry);
1880 prepare_pmd_huge_pte(pgtable, mm);
1881 mm->nr_ptes--;
1882 spin_unlock(&mm->page_table_lock);
1883
1884#ifndef CONFIG_NUMA
1885 *hpage = NULL;
1886#endif
1887 khugepaged_pages_collapsed++;
1888out_up_write:
1889 up_write(&mm->mmap_sem);
1890 return;
1891
1892out:
1893#ifdef CONFIG_NUMA
1894 put_page(new_page);
1895#endif
1896 goto out_up_write;
1897}
1898
1899static int khugepaged_scan_pmd(struct mm_struct *mm,
1900 struct vm_area_struct *vma,
1901 unsigned long address,
1902 struct page **hpage)
1903{
1904 pgd_t *pgd;
1905 pud_t *pud;
1906 pmd_t *pmd;
1907 pte_t *pte, *_pte;
1908 int ret = 0, referenced = 0, none = 0;
1909 struct page *page;
1910 unsigned long _address;
1911 spinlock_t *ptl;
1912
1913 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1914
1915 pgd = pgd_offset(mm, address);
1916 if (!pgd_present(*pgd))
1917 goto out;
1918
1919 pud = pud_offset(pgd, address);
1920 if (!pud_present(*pud))
1921 goto out;
1922
1923 pmd = pmd_offset(pud, address);
1924 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1925 goto out;
1926
1927 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1928 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
1929 _pte++, _address += PAGE_SIZE) {
1930 pte_t pteval = *_pte;
1931 if (pte_none(pteval)) {
1932 if (++none <= khugepaged_max_ptes_none)
1933 continue;
1934 else
1935 goto out_unmap;
1936 }
1937 if (!pte_present(pteval) || !pte_write(pteval))
1938 goto out_unmap;
1939 page = vm_normal_page(vma, _address, pteval);
1940 if (unlikely(!page))
1941 goto out_unmap;
1942 VM_BUG_ON(PageCompound(page));
1943 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
1944 goto out_unmap;
1945 /* cannot use mapcount: can't collapse if there's a gup pin */
1946 if (page_count(page) != 1)
1947 goto out_unmap;
1948 if (pte_young(pteval) || PageReferenced(page) ||
1949 mmu_notifier_test_young(vma->vm_mm, address))
1950 referenced = 1;
1951 }
1952 if (referenced)
1953 ret = 1;
1954out_unmap:
1955 pte_unmap_unlock(pte, ptl);
1956 if (ret)
1957 /* collapse_huge_page will return with the mmap_sem released */
1958 collapse_huge_page(mm, address, hpage, vma);
1959out:
1960 return ret;
1961}
1962
1963static void collect_mm_slot(struct mm_slot *mm_slot)
1964{
1965 struct mm_struct *mm = mm_slot->mm;
1966
1967 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
1968
1969 if (khugepaged_test_exit(mm)) {
1970 /* free mm_slot */
1971 hlist_del(&mm_slot->hash);
1972 list_del(&mm_slot->mm_node);
1973
1974 /*
1975 * Not strictly needed because the mm exited already.
1976 *
1977 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1978 */
1979
1980 /* khugepaged_mm_lock actually not necessary for the below */
1981 free_mm_slot(mm_slot);
1982 mmdrop(mm);
1983 }
1984}
1985
1986static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
1987 struct page **hpage)
1988{
1989 struct mm_slot *mm_slot;
1990 struct mm_struct *mm;
1991 struct vm_area_struct *vma;
1992 int progress = 0;
1993
1994 VM_BUG_ON(!pages);
1995 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
1996
1997 if (khugepaged_scan.mm_slot)
1998 mm_slot = khugepaged_scan.mm_slot;
1999 else {
2000 mm_slot = list_entry(khugepaged_scan.mm_head.next,
2001 struct mm_slot, mm_node);
2002 khugepaged_scan.address = 0;
2003 khugepaged_scan.mm_slot = mm_slot;
2004 }
2005 spin_unlock(&khugepaged_mm_lock);
2006
2007 mm = mm_slot->mm;
2008 down_read(&mm->mmap_sem);
2009 if (unlikely(khugepaged_test_exit(mm)))
2010 vma = NULL;
2011 else
2012 vma = find_vma(mm, khugepaged_scan.address);
2013
2014 progress++;
2015 for (; vma; vma = vma->vm_next) {
2016 unsigned long hstart, hend;
2017
2018 cond_resched();
2019 if (unlikely(khugepaged_test_exit(mm))) {
2020 progress++;
2021 break;
2022 }
2023
2024 if ((!(vma->vm_flags & VM_HUGEPAGE) &&
2025 !khugepaged_always()) ||
2026 (vma->vm_flags & VM_NOHUGEPAGE)) {
2027 progress++;
2028 continue;
2029 }
2030
2031 /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
2032 if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
2033 khugepaged_scan.address = vma->vm_end;
2034 progress++;
2035 continue;
2036 }
2037 VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
2038
2039 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2040 hend = vma->vm_end & HPAGE_PMD_MASK;
2041 if (hstart >= hend) {
2042 progress++;
2043 continue;
2044 }
2045 if (khugepaged_scan.address < hstart)
2046 khugepaged_scan.address = hstart;
2047 if (khugepaged_scan.address > hend) {
2048 khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
2049 progress++;
2050 continue;
2051 }
2052 BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2053
2054 while (khugepaged_scan.address < hend) {
2055 int ret;
2056 cond_resched();
2057 if (unlikely(khugepaged_test_exit(mm)))
2058 goto breakouterloop;
2059
2060 VM_BUG_ON(khugepaged_scan.address < hstart ||
2061 khugepaged_scan.address + HPAGE_PMD_SIZE >
2062 hend);
2063 ret = khugepaged_scan_pmd(mm, vma,
2064 khugepaged_scan.address,
2065 hpage);
2066 /* move to next address */
2067 khugepaged_scan.address += HPAGE_PMD_SIZE;
2068 progress += HPAGE_PMD_NR;
2069 if (ret)
2070 /* we released mmap_sem so break loop */
2071 goto breakouterloop_mmap_sem;
2072 if (progress >= pages)
2073 goto breakouterloop;
2074 }
2075 }
2076breakouterloop:
2077 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2078breakouterloop_mmap_sem:
2079
2080 spin_lock(&khugepaged_mm_lock);
2081 BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2082 /*
2083 * Release the current mm_slot if this mm is about to die, or
2084 * if we scanned all vmas of this mm.
2085 */
2086 if (khugepaged_test_exit(mm) || !vma) {
2087 /*
2088 * Make sure that if mm_users is reaching zero while
2089 * khugepaged runs here, khugepaged_exit will find
2090 * mm_slot not pointing to the exiting mm.
2091 */
2092 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2093 khugepaged_scan.mm_slot = list_entry(
2094 mm_slot->mm_node.next,
2095 struct mm_slot, mm_node);
2096 khugepaged_scan.address = 0;
2097 } else {
2098 khugepaged_scan.mm_slot = NULL;
2099 khugepaged_full_scans++;
2100 }
2101
2102 collect_mm_slot(mm_slot);
2103 }
2104
2105 return progress;
2106}
2107
2108static int khugepaged_has_work(void)
2109{
2110 return !list_empty(&khugepaged_scan.mm_head) &&
2111 khugepaged_enabled();
2112}
2113
2114static int khugepaged_wait_event(void)
2115{
2116 return !list_empty(&khugepaged_scan.mm_head) ||
2117 !khugepaged_enabled();
2118}
2119
2120static void khugepaged_do_scan(struct page **hpage)
2121{
2122 unsigned int progress = 0, pass_through_head = 0;
2123 unsigned int pages = khugepaged_pages_to_scan;
2124
2125 barrier(); /* write khugepaged_pages_to_scan to local stack */
2126
2127 while (progress < pages) {
2128 cond_resched();
2129
2130#ifndef CONFIG_NUMA
2131 if (!*hpage) {
2132 *hpage = alloc_hugepage(khugepaged_defrag());
2133 if (unlikely(!*hpage))
2134 break;
2135 }
2136#else
2137 if (IS_ERR(*hpage))
2138 break;
2139#endif
2140
2141 if (unlikely(kthread_should_stop() || freezing(current)))
2142 break;
2143
2144 spin_lock(&khugepaged_mm_lock);
2145 if (!khugepaged_scan.mm_slot)
2146 pass_through_head++;
2147 if (khugepaged_has_work() &&
2148 pass_through_head < 2)
2149 progress += khugepaged_scan_mm_slot(pages - progress,
2150 hpage);
2151 else
2152 progress = pages;
2153 spin_unlock(&khugepaged_mm_lock);
2154 }
2155}
2156
2157static void khugepaged_alloc_sleep(void)
2158{
2159 DEFINE_WAIT(wait);
2160 add_wait_queue(&khugepaged_wait, &wait);
2161 schedule_timeout_interruptible(
2162 msecs_to_jiffies(
2163 khugepaged_alloc_sleep_millisecs));
2164 remove_wait_queue(&khugepaged_wait, &wait);
2165}
2166
2167#ifndef CONFIG_NUMA
2168static struct page *khugepaged_alloc_hugepage(void)
2169{
2170 struct page *hpage;
2171
2172 do {
2173 hpage = alloc_hugepage(khugepaged_defrag());
2174 if (!hpage)
2175 khugepaged_alloc_sleep();
2176 } while (unlikely(!hpage) &&
2177 likely(khugepaged_enabled()));
2178 return hpage;
2179}
2180#endif
2181
2182static void khugepaged_loop(void)
2183{
2184 struct page *hpage;
2185
2186#ifdef CONFIG_NUMA
2187 hpage = NULL;
2188#endif
2189 while (likely(khugepaged_enabled())) {
2190#ifndef CONFIG_NUMA
2191 hpage = khugepaged_alloc_hugepage();
2192 if (unlikely(!hpage))
2193 break;
2194#else
2195 if (IS_ERR(hpage)) {
2196 khugepaged_alloc_sleep();
2197 hpage = NULL;
2198 }
2199#endif
2200
2201 khugepaged_do_scan(&hpage);
2202#ifndef CONFIG_NUMA
2203 if (hpage)
2204 put_page(hpage);
2205#endif
2206 try_to_freeze();
2207 if (unlikely(kthread_should_stop()))
2208 break;
2209 if (khugepaged_has_work()) {
2210 DEFINE_WAIT(wait);
2211 if (!khugepaged_scan_sleep_millisecs)
2212 continue;
2213 add_wait_queue(&khugepaged_wait, &wait);
2214 schedule_timeout_interruptible(
2215 msecs_to_jiffies(
2216 khugepaged_scan_sleep_millisecs));
2217 remove_wait_queue(&khugepaged_wait, &wait);
2218 } else if (khugepaged_enabled())
2219 wait_event_freezable(khugepaged_wait,
2220 khugepaged_wait_event());
2221 }
2222}
2223
2224static int khugepaged(void *none)
2225{
2226 struct mm_slot *mm_slot;
2227
2228 set_freezable();
2229 set_user_nice(current, 19);
2230
2231 /* serialize with start_khugepaged() */
2232 mutex_lock(&khugepaged_mutex);
2233
2234 for (;;) {
2235 mutex_unlock(&khugepaged_mutex);
2236 BUG_ON(khugepaged_thread != current);
2237 khugepaged_loop();
2238 BUG_ON(khugepaged_thread != current);
2239
2240 mutex_lock(&khugepaged_mutex);
2241 if (!khugepaged_enabled())
2242 break;
2243 if (unlikely(kthread_should_stop()))
2244 break;
2245 }
2246
2247 spin_lock(&khugepaged_mm_lock);
2248 mm_slot = khugepaged_scan.mm_slot;
2249 khugepaged_scan.mm_slot = NULL;
2250 if (mm_slot)
2251 collect_mm_slot(mm_slot);
2252 spin_unlock(&khugepaged_mm_lock);
2253
2254 khugepaged_thread = NULL;
2255 mutex_unlock(&khugepaged_mutex);
2256
2257 return 0;
2258}
2259
2260void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2261{
2262 struct page *page;
2263
2264 spin_lock(&mm->page_table_lock);
2265 if (unlikely(!pmd_trans_huge(*pmd))) {
2266 spin_unlock(&mm->page_table_lock);
2267 return;
2268 }
2269 page = pmd_page(*pmd);
2270 VM_BUG_ON(!page_count(page));
2271 get_page(page);
2272 spin_unlock(&mm->page_table_lock);
2273
2274 split_huge_page(page);
2275
2276 put_page(page);
2277 BUG_ON(pmd_trans_huge(*pmd));
2278}
2279
2280static void split_huge_page_address(struct mm_struct *mm,
2281 unsigned long address)
2282{
2283 pgd_t *pgd;
2284 pud_t *pud;
2285 pmd_t *pmd;
2286
2287 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2288
2289 pgd = pgd_offset(mm, address);
2290 if (!pgd_present(*pgd))
2291 return;
2292
2293 pud = pud_offset(pgd, address);
2294 if (!pud_present(*pud))
2295 return;
2296
2297 pmd = pmd_offset(pud, address);
2298 if (!pmd_present(*pmd))
2299 return;
2300 /*
2301 * Caller holds the mmap_sem write mode, so a huge pmd cannot
2302 * materialize from under us.
2303 */
2304 split_huge_page_pmd(mm, pmd);
2305}
2306
2307void __vma_adjust_trans_huge(struct vm_area_struct *vma,
2308 unsigned long start,
2309 unsigned long end,
2310 long adjust_next)
2311{
2312 /*
2313 * If the new start address isn't hpage aligned and it could
2314 * previously contain an hugepage: check if we need to split
2315 * an huge pmd.
2316 */
2317 if (start & ~HPAGE_PMD_MASK &&
2318 (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2319 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2320 split_huge_page_address(vma->vm_mm, start);
2321
2322 /*
2323 * If the new end address isn't hpage aligned and it could
2324 * previously contain an hugepage: check if we need to split
2325 * an huge pmd.
2326 */
2327 if (end & ~HPAGE_PMD_MASK &&
2328 (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2329 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2330 split_huge_page_address(vma->vm_mm, end);
2331
2332 /*
2333 * If we're also updating the vma->vm_next->vm_start, if the new
2334 * vm_next->vm_start isn't page aligned and it could previously
2335 * contain an hugepage: check if we need to split an huge pmd.
2336 */
2337 if (adjust_next > 0) {
2338 struct vm_area_struct *next = vma->vm_next;
2339 unsigned long nstart = next->vm_start;
2340 nstart += adjust_next << PAGE_SHIFT;
2341 if (nstart & ~HPAGE_PMD_MASK &&
2342 (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2343 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2344 split_huge_page_address(next->vm_mm, nstart);
2345 }
2346}
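
Each of the three checks above asks the same question: does the new boundary fall strictly inside a huge-page-aligned range that the vma fully covers? A user-space version of the test for a made-up layout, assuming 2 MiB huge pages (illustrative only):

/* Illustrative sketch only -- not kernel code. */
#include <stdio.h>

#define HPAGE_PMD_SIZE	(2UL << 20)
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

int main(void)
{
	unsigned long vm_start = 0x600000000000UL;	/* made-up vma */
	unsigned long vm_end   = vm_start + (4UL << 20);
	unsigned long start    = vm_start + 0x123000;	/* new, unaligned start */

	int needs_split = (start & ~HPAGE_PMD_MASK) &&
			  (start & HPAGE_PMD_MASK) >= vm_start &&
			  (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vm_end;

	printf("split required: %d\n", needs_split);	/* 1 for this layout */
	return 0;
}
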
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 85855240933d..bb0b7c128015 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma)
394 return 0; 394 return 0;
395} 395}
396 396
397static void clear_gigantic_page(struct page *page,
398 unsigned long addr, unsigned long sz)
399{
400 int i;
401 struct page *p = page;
402
403 might_sleep();
404 for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
405 cond_resched();
406 clear_user_highpage(p, addr + i * PAGE_SIZE);
407 }
408}
409static void clear_huge_page(struct page *page,
410 unsigned long addr, unsigned long sz)
411{
412 int i;
413
414 if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
415 clear_gigantic_page(page, addr, sz);
416 return;
417 }
418
419 might_sleep();
420 for (i = 0; i < sz/PAGE_SIZE; i++) {
421 cond_resched();
422 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
423 }
424}
425
426static void copy_user_gigantic_page(struct page *dst, struct page *src,
427 unsigned long addr, struct vm_area_struct *vma)
428{
429 int i;
430 struct hstate *h = hstate_vma(vma);
431 struct page *dst_base = dst;
432 struct page *src_base = src;
433
434 for (i = 0; i < pages_per_huge_page(h); ) {
435 cond_resched();
436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
437
438 i++;
439 dst = mem_map_next(dst, dst_base, i);
440 src = mem_map_next(src, src_base, i);
441 }
442}
443
444static void copy_user_huge_page(struct page *dst, struct page *src,
445 unsigned long addr, struct vm_area_struct *vma)
446{
447 int i;
448 struct hstate *h = hstate_vma(vma);
449
450 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
451 copy_user_gigantic_page(dst, src, addr, vma);
452 return;
453 }
454
455 might_sleep();
456 for (i = 0; i < pages_per_huge_page(h); i++) {
457 cond_resched();
458 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
459 }
460}
461
462static void copy_gigantic_page(struct page *dst, struct page *src) 397static void copy_gigantic_page(struct page *dst, struct page *src)
463{ 398{
464 int i; 399 int i;
@@ -1428,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1428 1363
1429 return sprintf(buf, "%lu\n", nr_huge_pages); 1364 return sprintf(buf, "%lu\n", nr_huge_pages);
1430} 1365}
1366
1431static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 1367static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1432 struct kobject *kobj, struct kobj_attribute *attr, 1368 struct kobject *kobj, struct kobj_attribute *attr,
1433 const char *buf, size_t len) 1369 const char *buf, size_t len)
@@ -1440,9 +1376,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1440 1376
1441 err = strict_strtoul(buf, 10, &count); 1377 err = strict_strtoul(buf, 10, &count);
1442 if (err) 1378 if (err)
1443 return 0; 1379 goto out;
1444 1380
1445 h = kobj_to_hstate(kobj, &nid); 1381 h = kobj_to_hstate(kobj, &nid);
1382 if (h->order >= MAX_ORDER) {
1383 err = -EINVAL;
1384 goto out;
1385 }
1386
1446 if (nid == NUMA_NO_NODE) { 1387 if (nid == NUMA_NO_NODE) {
1447 /* 1388 /*
1448 * global hstate attribute 1389 * global hstate attribute
@@ -1468,6 +1409,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1468 NODEMASK_FREE(nodes_allowed); 1409 NODEMASK_FREE(nodes_allowed);
1469 1410
1470 return len; 1411 return len;
1412out:
1413 NODEMASK_FREE(nodes_allowed);
1414 return err;
1471} 1415}
1472 1416
1473static ssize_t nr_hugepages_show(struct kobject *kobj, 1417static ssize_t nr_hugepages_show(struct kobject *kobj,
@@ -1510,6 +1454,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1510 struct hstate *h = kobj_to_hstate(kobj, NULL); 1454 struct hstate *h = kobj_to_hstate(kobj, NULL);
1511 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 1455 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1512} 1456}
1457
1513static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 1458static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1514 struct kobj_attribute *attr, const char *buf, size_t count) 1459 struct kobj_attribute *attr, const char *buf, size_t count)
1515{ 1460{
@@ -1517,9 +1462,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1517 unsigned long input; 1462 unsigned long input;
1518 struct hstate *h = kobj_to_hstate(kobj, NULL); 1463 struct hstate *h = kobj_to_hstate(kobj, NULL);
1519 1464
1465 if (h->order >= MAX_ORDER)
1466 return -EINVAL;
1467
1520 err = strict_strtoul(buf, 10, &input); 1468 err = strict_strtoul(buf, 10, &input);
1521 if (err) 1469 if (err)
1522 return 0; 1470 return err;
1523 1471
1524 spin_lock(&hugetlb_lock); 1472 spin_lock(&hugetlb_lock);
1525 h->nr_overcommit_huge_pages = input; 1473 h->nr_overcommit_huge_pages = input;
@@ -1922,13 +1870,19 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1922{ 1870{
1923 struct hstate *h = &default_hstate; 1871 struct hstate *h = &default_hstate;
1924 unsigned long tmp; 1872 unsigned long tmp;
1873 int ret;
1925 1874
1926 if (!write) 1875 if (!write)
1927 tmp = h->max_huge_pages; 1876 tmp = h->max_huge_pages;
1928 1877
1878 if (write && h->order >= MAX_ORDER)
1879 return -EINVAL;
1880
1929 table->data = &tmp; 1881 table->data = &tmp;
1930 table->maxlen = sizeof(unsigned long); 1882 table->maxlen = sizeof(unsigned long);
1931 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1883 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1884 if (ret)
1885 goto out;
1932 1886
1933 if (write) { 1887 if (write) {
1934 NODEMASK_ALLOC(nodemask_t, nodes_allowed, 1888 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
@@ -1943,8 +1897,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1943 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 1897 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1944 NODEMASK_FREE(nodes_allowed); 1898 NODEMASK_FREE(nodes_allowed);
1945 } 1899 }
1946 1900out:
1947 return 0; 1901 return ret;
1948} 1902}
1949 1903
1950int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1904int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -1982,21 +1936,27 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1982{ 1936{
1983 struct hstate *h = &default_hstate; 1937 struct hstate *h = &default_hstate;
1984 unsigned long tmp; 1938 unsigned long tmp;
1939 int ret;
1985 1940
1986 if (!write) 1941 if (!write)
1987 tmp = h->nr_overcommit_huge_pages; 1942 tmp = h->nr_overcommit_huge_pages;
1988 1943
1944 if (write && h->order >= MAX_ORDER)
1945 return -EINVAL;
1946
1989 table->data = &tmp; 1947 table->data = &tmp;
1990 table->maxlen = sizeof(unsigned long); 1948 table->maxlen = sizeof(unsigned long);
1991 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1949 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1950 if (ret)
1951 goto out;
1992 1952
1993 if (write) { 1953 if (write) {
1994 spin_lock(&hugetlb_lock); 1954 spin_lock(&hugetlb_lock);
1995 h->nr_overcommit_huge_pages = tmp; 1955 h->nr_overcommit_huge_pages = tmp;
1996 spin_unlock(&hugetlb_lock); 1956 spin_unlock(&hugetlb_lock);
1997 } 1957 }
1998 1958out:
1999 return 0; 1959 return ret;
2000} 1960}
2001 1961
2002#endif /* CONFIG_SYSCTL */ 1962#endif /* CONFIG_SYSCTL */
@@ -2454,7 +2414,8 @@ retry_avoidcopy:
2454 return VM_FAULT_OOM; 2414 return VM_FAULT_OOM;
2455 } 2415 }
2456 2416
2457 copy_user_huge_page(new_page, old_page, address, vma); 2417 copy_user_huge_page(new_page, old_page, address, vma,
2418 pages_per_huge_page(h));
2458 __SetPageUptodate(new_page); 2419 __SetPageUptodate(new_page);
2459 2420
2460 /* 2421 /*
@@ -2558,7 +2519,7 @@ retry:
2558 ret = -PTR_ERR(page); 2519 ret = -PTR_ERR(page);
2559 goto out; 2520 goto out;
2560 } 2521 }
2561 clear_huge_page(page, address, huge_page_size(h)); 2522 clear_huge_page(page, address, pages_per_huge_page(h));
2562 __SetPageUptodate(page); 2523 __SetPageUptodate(page);
2563 2524
2564 if (vma->vm_flags & VM_MAYSHARE) { 2525 if (vma->vm_flags & VM_MAYSHARE) {
diff --git a/mm/internal.h b/mm/internal.h
index dedb0aff673f..4c98630f0f77 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -39,6 +39,15 @@ static inline void __put_page(struct page *page)
39 39
40extern unsigned long highest_memmap_pfn; 40extern unsigned long highest_memmap_pfn;
41 41
42#ifdef CONFIG_SMP
43extern int putback_active_lru_page(struct zone *zone, struct page *page);
44#else
45static inline int putback_active_lru_page(struct zone *zone, struct page *page)
46{
47 return 0;
48}
49#endif
50
42/* 51/*
43 * in mm/vmscan.c: 52 * in mm/vmscan.c:
44 */ 53 */
@@ -134,6 +143,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
134 } 143 }
135} 144}
136 145
146#ifdef CONFIG_TRANSPARENT_HUGEPAGE
147extern unsigned long vma_address(struct page *page,
148 struct vm_area_struct *vma);
149#endif
137#else /* !CONFIG_MMU */ 150#else /* !CONFIG_MMU */
138static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 151static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
139{ 152{
@@ -243,7 +256,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
243 256
244int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 257int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
245 unsigned long start, int len, unsigned int foll_flags, 258 unsigned long start, int len, unsigned int foll_flags,
246 struct page **pages, struct vm_area_struct **vmas); 259 struct page **pages, struct vm_area_struct **vmas,
260 int *nonblocking);
247 261
248#define ZONE_RECLAIM_NOSCAN -2 262#define ZONE_RECLAIM_NOSCAN -2
249#define ZONE_RECLAIM_FULL -1 263#define ZONE_RECLAIM_FULL -1
diff --git a/mm/ksm.c b/mm/ksm.c
index 43bc893470b4..c2b2a94f9d67 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -34,6 +34,7 @@
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/freezer.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
39#include "internal.h" 40#include "internal.h"
@@ -411,6 +412,20 @@ out:
411 up_read(&mm->mmap_sem); 412 up_read(&mm->mmap_sem);
412} 413}
413 414
415static struct page *page_trans_compound_anon(struct page *page)
416{
417 if (PageTransCompound(page)) {
418 struct page *head = compound_trans_head(page);
419 /*
419 * head may actually be split and freed from under
421 * us but it's ok here.
422 */
423 if (PageAnon(head))
424 return head;
425 }
426 return NULL;
427}
428
414static struct page *get_mergeable_page(struct rmap_item *rmap_item) 429static struct page *get_mergeable_page(struct rmap_item *rmap_item)
415{ 430{
416 struct mm_struct *mm = rmap_item->mm; 431 struct mm_struct *mm = rmap_item->mm;
@@ -430,7 +445,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
430 page = follow_page(vma, addr, FOLL_GET); 445 page = follow_page(vma, addr, FOLL_GET);
431 if (IS_ERR_OR_NULL(page)) 446 if (IS_ERR_OR_NULL(page))
432 goto out; 447 goto out;
433 if (PageAnon(page)) { 448 if (PageAnon(page) || page_trans_compound_anon(page)) {
434 flush_anon_page(vma, page, addr); 449 flush_anon_page(vma, page, addr);
435 flush_dcache_page(page); 450 flush_dcache_page(page);
436 } else { 451 } else {
@@ -708,6 +723,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
708 if (addr == -EFAULT) 723 if (addr == -EFAULT)
709 goto out; 724 goto out;
710 725
726 BUG_ON(PageTransCompound(page));
711 ptep = page_check_address(page, mm, addr, &ptl, 0); 727 ptep = page_check_address(page, mm, addr, &ptl, 0);
712 if (!ptep) 728 if (!ptep)
713 goto out; 729 goto out;
@@ -783,6 +799,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
783 goto out; 799 goto out;
784 800
785 pmd = pmd_offset(pud, addr); 801 pmd = pmd_offset(pud, addr);
802 BUG_ON(pmd_trans_huge(*pmd));
786 if (!pmd_present(*pmd)) 803 if (!pmd_present(*pmd))
787 goto out; 804 goto out;
788 805
@@ -800,6 +817,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
800 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 817 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
801 818
802 page_remove_rmap(page); 819 page_remove_rmap(page);
820 if (!page_mapped(page))
821 try_to_free_swap(page);
803 put_page(page); 822 put_page(page);
804 823
805 pte_unmap_unlock(ptep, ptl); 824 pte_unmap_unlock(ptep, ptl);
@@ -808,6 +827,33 @@ out:
808 return err; 827 return err;
809} 828}
810 829
830static int page_trans_compound_anon_split(struct page *page)
831{
832 int ret = 0;
833 struct page *transhuge_head = page_trans_compound_anon(page);
834 if (transhuge_head) {
835 /* Get the reference on the head to split it. */
836 if (get_page_unless_zero(transhuge_head)) {
837 /*
838 * Recheck we got the reference while the head
839 * was still anonymous.
840 */
841 if (PageAnon(transhuge_head))
842 ret = split_huge_page(transhuge_head);
843 else
844 /*
845 * Retry later if split_huge_page ran
846 * from under us.
847 */
848 ret = 1;
849 put_page(transhuge_head);
850 } else
851 /* Retry later if split_huge_page ran from under us. */
852 ret = 1;
853 }
854 return ret;
855}
856
811/* 857/*
812 * try_to_merge_one_page - take two pages and merge them into one 858 * try_to_merge_one_page - take two pages and merge them into one
813 * @vma: the vma that holds the pte pointing to page 859 * @vma: the vma that holds the pte pointing to page
@@ -828,6 +874,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
828 874
829 if (!(vma->vm_flags & VM_MERGEABLE)) 875 if (!(vma->vm_flags & VM_MERGEABLE))
830 goto out; 876 goto out;
877 if (PageTransCompound(page) && page_trans_compound_anon_split(page))
878 goto out;
879 BUG_ON(PageTransCompound(page));
831 if (!PageAnon(page)) 880 if (!PageAnon(page))
832 goto out; 881 goto out;
833 882
@@ -1247,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1247 1296
1248 slot = ksm_scan.mm_slot; 1297 slot = ksm_scan.mm_slot;
1249 if (slot == &ksm_mm_head) { 1298 if (slot == &ksm_mm_head) {
1299 /*
1300 * A number of pages can hang around indefinitely on per-cpu
1301 * pagevecs, raised page count preventing write_protect_page
1302 * from merging them. Though it doesn't really matter much,
1303 * it is puzzling to see some stuck in pages_volatile until
1304 * other activity jostles them out, and they also prevented
1305 * LTP's KSM test from succeeding deterministically; so drain
1306 * them here (here rather than on entry to ksm_do_scan(),
1307 * so we don't IPI too often when pages_to_scan is set low).
1308 */
1309 lru_add_drain_all();
1310
1250 root_unstable_tree = RB_ROOT; 1311 root_unstable_tree = RB_ROOT;
1251 1312
1252 spin_lock(&ksm_mmlist_lock); 1313 spin_lock(&ksm_mmlist_lock);
@@ -1277,7 +1338,13 @@ next_mm:
1277 if (ksm_test_exit(mm)) 1338 if (ksm_test_exit(mm))
1278 break; 1339 break;
1279 *page = follow_page(vma, ksm_scan.address, FOLL_GET); 1340 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1280 if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { 1341 if (IS_ERR_OR_NULL(*page)) {
1342 ksm_scan.address += PAGE_SIZE;
1343 cond_resched();
1344 continue;
1345 }
1346 if (PageAnon(*page) ||
1347 page_trans_compound_anon(*page)) {
1281 flush_anon_page(vma, *page, ksm_scan.address); 1348 flush_anon_page(vma, *page, ksm_scan.address);
1282 flush_dcache_page(*page); 1349 flush_dcache_page(*page);
1283 rmap_item = get_next_rmap_item(slot, 1350 rmap_item = get_next_rmap_item(slot,
@@ -1291,8 +1358,7 @@ next_mm:
1291 up_read(&mm->mmap_sem); 1358 up_read(&mm->mmap_sem);
1292 return rmap_item; 1359 return rmap_item;
1293 } 1360 }
1294 if (!IS_ERR_OR_NULL(*page)) 1361 put_page(*page);
1295 put_page(*page);
1296 ksm_scan.address += PAGE_SIZE; 1362 ksm_scan.address += PAGE_SIZE;
1297 cond_resched(); 1363 cond_resched();
1298 } 1364 }
@@ -1352,7 +1418,7 @@ static void ksm_do_scan(unsigned int scan_npages)
1352 struct rmap_item *rmap_item; 1418 struct rmap_item *rmap_item;
1353 struct page *uninitialized_var(page); 1419 struct page *uninitialized_var(page);
1354 1420
1355 while (scan_npages--) { 1421 while (scan_npages-- && likely(!freezing(current))) {
1356 cond_resched(); 1422 cond_resched();
1357 rmap_item = scan_get_next_rmap_item(&page); 1423 rmap_item = scan_get_next_rmap_item(&page);
1358 if (!rmap_item) 1424 if (!rmap_item)
@@ -1370,6 +1436,7 @@ static int ksmd_should_run(void)
1370 1436
1371static int ksm_scan_thread(void *nothing) 1437static int ksm_scan_thread(void *nothing)
1372{ 1438{
1439 set_freezable();
1373 set_user_nice(current, 5); 1440 set_user_nice(current, 5);
1374 1441
1375 while (!kthread_should_stop()) { 1442 while (!kthread_should_stop()) {
@@ -1378,11 +1445,13 @@ static int ksm_scan_thread(void *nothing)
1378 ksm_do_scan(ksm_thread_pages_to_scan); 1445 ksm_do_scan(ksm_thread_pages_to_scan);
1379 mutex_unlock(&ksm_thread_mutex); 1446 mutex_unlock(&ksm_thread_mutex);
1380 1447
1448 try_to_freeze();
1449
1381 if (ksmd_should_run()) { 1450 if (ksmd_should_run()) {
1382 schedule_timeout_interruptible( 1451 schedule_timeout_interruptible(
1383 msecs_to_jiffies(ksm_thread_sleep_millisecs)); 1452 msecs_to_jiffies(ksm_thread_sleep_millisecs));
1384 } else { 1453 } else {
1385 wait_event_interruptible(ksm_thread_wait, 1454 wait_event_freezable(ksm_thread_wait,
1386 ksmd_should_run() || kthread_should_stop()); 1455 ksmd_should_run() || kthread_should_stop());
1387 } 1456 }
1388 } 1457 }
diff --git a/mm/madvise.c b/mm/madvise.c
index 319528b8db74..2221491ed503 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
71 if (error) 71 if (error)
72 goto out; 72 goto out;
73 break; 73 break;
74 case MADV_HUGEPAGE:
75 case MADV_NOHUGEPAGE:
76 error = hugepage_madvise(vma, &new_flags, behavior);
77 if (error)
78 goto out;
79 break;
74 } 80 }
75 81
76 if (new_flags == vma->vm_flags) { 82 if (new_flags == vma->vm_flags) {
@@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior)
283 case MADV_MERGEABLE: 289 case MADV_MERGEABLE:
284 case MADV_UNMERGEABLE: 290 case MADV_UNMERGEABLE:
285#endif 291#endif
292#ifdef CONFIG_TRANSPARENT_HUGEPAGE
293 case MADV_HUGEPAGE:
294 case MADV_NOHUGEPAGE:
295#endif
286 return 1; 296 return 1;
287 297
288 default: 298 default:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 00bb8a64d028..8ab841031436 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -292,7 +292,6 @@ static struct move_charge_struct {
292 unsigned long moved_charge; 292 unsigned long moved_charge;
293 unsigned long moved_swap; 293 unsigned long moved_swap;
294 struct task_struct *moving_task; /* a task moving charges */ 294 struct task_struct *moving_task; /* a task moving charges */
295 struct mm_struct *mm;
296 wait_queue_head_t waitq; /* a waitq for other context */ 295 wait_queue_head_t waitq; /* a waitq for other context */
297} mc = { 296} mc = {
298 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 297 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
@@ -821,7 +820,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
821 return; 820 return;
822 VM_BUG_ON(list_empty(&pc->lru)); 821 VM_BUG_ON(list_empty(&pc->lru));
823 list_del_init(&pc->lru); 822 list_del_init(&pc->lru);
824 return;
825} 823}
826 824
827void mem_cgroup_del_lru(struct page *page) 825void mem_cgroup_del_lru(struct page *page)
@@ -1087,7 +1085,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1087 case 0: 1085 case 0:
1088 list_move(&page->lru, dst); 1086 list_move(&page->lru, dst);
1089 mem_cgroup_del_lru(page); 1087 mem_cgroup_del_lru(page);
1090 nr_taken++; 1088 nr_taken += hpage_nr_pages(page);
1091 break; 1089 break;
1092 case -EBUSY: 1090 case -EBUSY:
1093 /* we don't affect global LRU but rotate in our LRU */ 1091 /* we don't affect global LRU but rotate in our LRU */
@@ -1312,8 +1310,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1312 u64 limit; 1310 u64 limit;
1313 u64 memsw; 1311 u64 memsw;
1314 1312
1315 limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + 1313 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1316 total_swap_pages; 1314 limit += total_swap_pages << PAGE_SHIFT;
1315
1317 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1316 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1318 /* 1317 /*
1319 * If memsw is finite and limits the amount of swap space available 1318 * If memsw is finite and limits the amount of swap space available
@@ -1600,11 +1599,13 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1600 * possibility of race condition. If there is, we take a lock. 1599 * possibility of race condition. If there is, we take a lock.
1601 */ 1600 */
1602 1601
1603static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) 1602void mem_cgroup_update_page_stat(struct page *page,
1603 enum mem_cgroup_page_stat_item idx, int val)
1604{ 1604{
1605 struct mem_cgroup *mem; 1605 struct mem_cgroup *mem;
1606 struct page_cgroup *pc = lookup_page_cgroup(page); 1606 struct page_cgroup *pc = lookup_page_cgroup(page);
1607 bool need_unlock = false; 1607 bool need_unlock = false;
1608 unsigned long uninitialized_var(flags);
1608 1609
1609 if (unlikely(!pc)) 1610 if (unlikely(!pc))
1610 return; 1611 return;
@@ -1616,37 +1617,34 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
1616 /* pc->mem_cgroup is unstable ? */ 1617 /* pc->mem_cgroup is unstable ? */
1617 if (unlikely(mem_cgroup_stealed(mem))) { 1618 if (unlikely(mem_cgroup_stealed(mem))) {
1618 /* take a lock against to access pc->mem_cgroup */ 1619 /* take a lock against to access pc->mem_cgroup */
1619 lock_page_cgroup(pc); 1620 move_lock_page_cgroup(pc, &flags);
1620 need_unlock = true; 1621 need_unlock = true;
1621 mem = pc->mem_cgroup; 1622 mem = pc->mem_cgroup;
1622 if (!mem || !PageCgroupUsed(pc)) 1623 if (!mem || !PageCgroupUsed(pc))
1623 goto out; 1624 goto out;
1624 } 1625 }
1625 1626
1626 this_cpu_add(mem->stat->count[idx], val);
1627
1628 switch (idx) { 1627 switch (idx) {
1629 case MEM_CGROUP_STAT_FILE_MAPPED: 1628 case MEMCG_NR_FILE_MAPPED:
1630 if (val > 0) 1629 if (val > 0)
1631 SetPageCgroupFileMapped(pc); 1630 SetPageCgroupFileMapped(pc);
1632 else if (!page_mapped(page)) 1631 else if (!page_mapped(page))
1633 ClearPageCgroupFileMapped(pc); 1632 ClearPageCgroupFileMapped(pc);
1633 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1634 break; 1634 break;
1635 default: 1635 default:
1636 BUG(); 1636 BUG();
1637 } 1637 }
1638 1638
1639 this_cpu_add(mem->stat->count[idx], val);
1640
1639out: 1641out:
1640 if (unlikely(need_unlock)) 1642 if (unlikely(need_unlock))
1641 unlock_page_cgroup(pc); 1643 move_unlock_page_cgroup(pc, &flags);
1642 rcu_read_unlock(); 1644 rcu_read_unlock();
1643 return; 1645 return;
1644} 1646}
1645 1647EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1646void mem_cgroup_update_file_mapped(struct page *page, int val)
1647{
1648 mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
1649}
1650 1648
1651/* 1649/*
1652 * size of first charge trial. "32" comes from vmscan.c's magic value. 1650 * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -1887,12 +1885,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1887 * oom-killer can be invoked. 1885 * oom-killer can be invoked.
1888 */ 1886 */
1889static int __mem_cgroup_try_charge(struct mm_struct *mm, 1887static int __mem_cgroup_try_charge(struct mm_struct *mm,
1890 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) 1888 gfp_t gfp_mask,
1889 struct mem_cgroup **memcg, bool oom,
1890 int page_size)
1891{ 1891{
1892 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 1892 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1893 struct mem_cgroup *mem = NULL; 1893 struct mem_cgroup *mem = NULL;
1894 int ret; 1894 int ret;
1895 int csize = CHARGE_SIZE; 1895 int csize = max(CHARGE_SIZE, (unsigned long) page_size);
1896 1896
1897 /* 1897 /*
 1898 * Unlike global VM's OOM-kill, we're not in memory shortage 1898 * Unlike global VM's OOM-kill, we're not in memory shortage
@@ -1917,7 +1917,7 @@ again:
1917 VM_BUG_ON(css_is_removed(&mem->css)); 1917 VM_BUG_ON(css_is_removed(&mem->css));
1918 if (mem_cgroup_is_root(mem)) 1918 if (mem_cgroup_is_root(mem))
1919 goto done; 1919 goto done;
1920 if (consume_stock(mem)) 1920 if (page_size == PAGE_SIZE && consume_stock(mem))
1921 goto done; 1921 goto done;
1922 css_get(&mem->css); 1922 css_get(&mem->css);
1923 } else { 1923 } else {
@@ -1940,7 +1940,7 @@ again:
1940 rcu_read_unlock(); 1940 rcu_read_unlock();
1941 goto done; 1941 goto done;
1942 } 1942 }
1943 if (consume_stock(mem)) { 1943 if (page_size == PAGE_SIZE && consume_stock(mem)) {
1944 /* 1944 /*
 1945 * It seems dangerous to access memcg without css_get(). 1945 * It seems dangerous to access memcg without css_get().
 1946 * But considering how consume_stock works, it's not 1946 * But considering how consume_stock works, it's not
@@ -1981,7 +1981,7 @@ again:
1981 case CHARGE_OK: 1981 case CHARGE_OK:
1982 break; 1982 break;
1983 case CHARGE_RETRY: /* not in OOM situation but retry */ 1983 case CHARGE_RETRY: /* not in OOM situation but retry */
1984 csize = PAGE_SIZE; 1984 csize = page_size;
1985 css_put(&mem->css); 1985 css_put(&mem->css);
1986 mem = NULL; 1986 mem = NULL;
1987 goto again; 1987 goto again;
@@ -2002,8 +2002,8 @@ again:
2002 } 2002 }
2003 } while (ret != CHARGE_OK); 2003 } while (ret != CHARGE_OK);
2004 2004
2005 if (csize > PAGE_SIZE) 2005 if (csize > page_size)
2006 refill_stock(mem, csize - PAGE_SIZE); 2006 refill_stock(mem, csize - page_size);
2007 css_put(&mem->css); 2007 css_put(&mem->css);
2008done: 2008done:
2009 *memcg = mem; 2009 *memcg = mem;
@@ -2031,9 +2031,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2031 } 2031 }
2032} 2032}
2033 2033
2034static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) 2034static void mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2035 int page_size)
2035{ 2036{
2036 __mem_cgroup_cancel_charge(mem, 1); 2037 __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT);
2037} 2038}
2038 2039
2039/* 2040/*
@@ -2087,22 +2088,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2087 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be 2088 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
2088 * USED state. If already USED, uncharge and return. 2089 * USED state. If already USED, uncharge and return.
2089 */ 2090 */
2090 2091static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem,
2091static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2092 struct page_cgroup *pc,
2092 struct page_cgroup *pc, 2093 enum charge_type ctype)
2093 enum charge_type ctype)
2094{ 2094{
2095 /* try_charge() can return NULL to *memcg, taking care of it. */
2096 if (!mem)
2097 return;
2098
2099 lock_page_cgroup(pc);
2100 if (unlikely(PageCgroupUsed(pc))) {
2101 unlock_page_cgroup(pc);
2102 mem_cgroup_cancel_charge(mem);
2103 return;
2104 }
2105
2106 pc->mem_cgroup = mem; 2095 pc->mem_cgroup = mem;
2107 /* 2096 /*
2108 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2097 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -2127,6 +2116,33 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2127 } 2116 }
2128 2117
2129 mem_cgroup_charge_statistics(mem, pc, true); 2118 mem_cgroup_charge_statistics(mem, pc, true);
2119}
2120
2121static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2122 struct page_cgroup *pc,
2123 enum charge_type ctype,
2124 int page_size)
2125{
2126 int i;
2127 int count = page_size >> PAGE_SHIFT;
2128
2129 /* try_charge() can return NULL to *memcg, taking care of it. */
2130 if (!mem)
2131 return;
2132
2133 lock_page_cgroup(pc);
2134 if (unlikely(PageCgroupUsed(pc))) {
2135 unlock_page_cgroup(pc);
2136 mem_cgroup_cancel_charge(mem, page_size);
2137 return;
2138 }
2139
2140 /*
 2141 * we don't need page_cgroup_lock for tail pages, because they are not
2142 * accessed by any other context at this point.
2143 */
2144 for (i = 0; i < count; i++)
2145 ____mem_cgroup_commit_charge(mem, pc + i, ctype);
2130 2146
2131 unlock_page_cgroup(pc); 2147 unlock_page_cgroup(pc);
2132 /* 2148 /*
@@ -2173,7 +2189,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
2173 mem_cgroup_charge_statistics(from, pc, false); 2189 mem_cgroup_charge_statistics(from, pc, false);
2174 if (uncharge) 2190 if (uncharge)
2175 /* This is not "cancel", but cancel_charge does all we need. */ 2191 /* This is not "cancel", but cancel_charge does all we need. */
2176 mem_cgroup_cancel_charge(from); 2192 mem_cgroup_cancel_charge(from, PAGE_SIZE);
2177 2193
2178 /* caller should have done css_get */ 2194 /* caller should have done css_get */
2179 pc->mem_cgroup = to; 2195 pc->mem_cgroup = to;
@@ -2195,9 +2211,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
2195 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 2211 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
2196{ 2212{
2197 int ret = -EINVAL; 2213 int ret = -EINVAL;
2214 unsigned long flags;
2215
2198 lock_page_cgroup(pc); 2216 lock_page_cgroup(pc);
2199 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { 2217 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
2218 move_lock_page_cgroup(pc, &flags);
2200 __mem_cgroup_move_account(pc, from, to, uncharge); 2219 __mem_cgroup_move_account(pc, from, to, uncharge);
2220 move_unlock_page_cgroup(pc, &flags);
2201 ret = 0; 2221 ret = 0;
2202 } 2222 }
2203 unlock_page_cgroup(pc); 2223 unlock_page_cgroup(pc);
@@ -2234,13 +2254,14 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
2234 goto put; 2254 goto put;
2235 2255
2236 parent = mem_cgroup_from_cont(pcg); 2256 parent = mem_cgroup_from_cont(pcg);
2237 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 2257 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false,
2258 PAGE_SIZE);
2238 if (ret || !parent) 2259 if (ret || !parent)
2239 goto put_back; 2260 goto put_back;
2240 2261
2241 ret = mem_cgroup_move_account(pc, child, parent, true); 2262 ret = mem_cgroup_move_account(pc, child, parent, true);
2242 if (ret) 2263 if (ret)
2243 mem_cgroup_cancel_charge(parent); 2264 mem_cgroup_cancel_charge(parent, PAGE_SIZE);
2244put_back: 2265put_back:
2245 putback_lru_page(page); 2266 putback_lru_page(page);
2246put: 2267put:
@@ -2261,6 +2282,12 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2261 struct mem_cgroup *mem = NULL; 2282 struct mem_cgroup *mem = NULL;
2262 struct page_cgroup *pc; 2283 struct page_cgroup *pc;
2263 int ret; 2284 int ret;
2285 int page_size = PAGE_SIZE;
2286
2287 if (PageTransHuge(page)) {
2288 page_size <<= compound_order(page);
2289 VM_BUG_ON(!PageTransHuge(page));
2290 }
2264 2291
2265 pc = lookup_page_cgroup(page); 2292 pc = lookup_page_cgroup(page);
2266 /* can happen at boot */ 2293 /* can happen at boot */
@@ -2268,11 +2295,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2268 return 0; 2295 return 0;
2269 prefetchw(pc); 2296 prefetchw(pc);
2270 2297
2271 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 2298 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
2272 if (ret || !mem) 2299 if (ret || !mem)
2273 return ret; 2300 return ret;
2274 2301
2275 __mem_cgroup_commit_charge(mem, pc, ctype); 2302 __mem_cgroup_commit_charge(mem, pc, ctype, page_size);
2276 return 0; 2303 return 0;
2277} 2304}
2278 2305
@@ -2281,8 +2308,6 @@ int mem_cgroup_newpage_charge(struct page *page,
2281{ 2308{
2282 if (mem_cgroup_disabled()) 2309 if (mem_cgroup_disabled())
2283 return 0; 2310 return 0;
2284 if (PageCompound(page))
2285 return 0;
2286 /* 2311 /*
2287 * If already mapped, we don't have to account. 2312 * If already mapped, we don't have to account.
2288 * If page cache, page->mapping has address_space. 2313 * If page cache, page->mapping has address_space.
@@ -2388,13 +2413,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2388 if (!mem) 2413 if (!mem)
2389 goto charge_cur_mm; 2414 goto charge_cur_mm;
2390 *ptr = mem; 2415 *ptr = mem;
2391 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 2416 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE);
2392 css_put(&mem->css); 2417 css_put(&mem->css);
2393 return ret; 2418 return ret;
2394charge_cur_mm: 2419charge_cur_mm:
2395 if (unlikely(!mm)) 2420 if (unlikely(!mm))
2396 mm = &init_mm; 2421 mm = &init_mm;
2397 return __mem_cgroup_try_charge(mm, mask, ptr, true); 2422 return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE);
2398} 2423}
2399 2424
2400static void 2425static void
@@ -2410,7 +2435,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2410 cgroup_exclude_rmdir(&ptr->css); 2435 cgroup_exclude_rmdir(&ptr->css);
2411 pc = lookup_page_cgroup(page); 2436 pc = lookup_page_cgroup(page);
2412 mem_cgroup_lru_del_before_commit_swapcache(page); 2437 mem_cgroup_lru_del_before_commit_swapcache(page);
2413 __mem_cgroup_commit_charge(ptr, pc, ctype); 2438 __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE);
2414 mem_cgroup_lru_add_after_commit_swapcache(page); 2439 mem_cgroup_lru_add_after_commit_swapcache(page);
2415 /* 2440 /*
2416 * Now swap is on-memory. This means this page may be 2441 * Now swap is on-memory. This means this page may be
@@ -2459,11 +2484,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2459 return; 2484 return;
2460 if (!mem) 2485 if (!mem)
2461 return; 2486 return;
2462 mem_cgroup_cancel_charge(mem); 2487 mem_cgroup_cancel_charge(mem, PAGE_SIZE);
2463} 2488}
2464 2489
2465static void 2490static void
2466__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) 2491__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype,
2492 int page_size)
2467{ 2493{
2468 struct memcg_batch_info *batch = NULL; 2494 struct memcg_batch_info *batch = NULL;
2469 bool uncharge_memsw = true; 2495 bool uncharge_memsw = true;
@@ -2490,6 +2516,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2490 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2516 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2491 goto direct_uncharge; 2517 goto direct_uncharge;
2492 2518
2519 if (page_size != PAGE_SIZE)
2520 goto direct_uncharge;
2521
2493 /* 2522 /*
2494 * In typical case, batch->memcg == mem. This means we can 2523 * In typical case, batch->memcg == mem. This means we can
2495 * merge a series of uncharges to an uncharge of res_counter. 2524 * merge a series of uncharges to an uncharge of res_counter.
@@ -2503,9 +2532,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2503 batch->memsw_bytes += PAGE_SIZE; 2532 batch->memsw_bytes += PAGE_SIZE;
2504 return; 2533 return;
2505direct_uncharge: 2534direct_uncharge:
2506 res_counter_uncharge(&mem->res, PAGE_SIZE); 2535 res_counter_uncharge(&mem->res, page_size);
2507 if (uncharge_memsw) 2536 if (uncharge_memsw)
2508 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 2537 res_counter_uncharge(&mem->memsw, page_size);
2509 if (unlikely(batch->memcg != mem)) 2538 if (unlikely(batch->memcg != mem))
2510 memcg_oom_recover(mem); 2539 memcg_oom_recover(mem);
2511 return; 2540 return;
@@ -2517,8 +2546,11 @@ direct_uncharge:
2517static struct mem_cgroup * 2546static struct mem_cgroup *
2518__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2547__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2519{ 2548{
2549 int i;
2550 int count;
2520 struct page_cgroup *pc; 2551 struct page_cgroup *pc;
2521 struct mem_cgroup *mem = NULL; 2552 struct mem_cgroup *mem = NULL;
2553 int page_size = PAGE_SIZE;
2522 2554
2523 if (mem_cgroup_disabled()) 2555 if (mem_cgroup_disabled())
2524 return NULL; 2556 return NULL;
@@ -2526,6 +2558,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2526 if (PageSwapCache(page)) 2558 if (PageSwapCache(page))
2527 return NULL; 2559 return NULL;
2528 2560
2561 if (PageTransHuge(page)) {
2562 page_size <<= compound_order(page);
2563 VM_BUG_ON(!PageTransHuge(page));
2564 }
2565
2566 count = page_size >> PAGE_SHIFT;
2529 /* 2567 /*
2530 * Check if our page_cgroup is valid 2568 * Check if our page_cgroup is valid
2531 */ 2569 */
@@ -2558,7 +2596,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2558 break; 2596 break;
2559 } 2597 }
2560 2598
2561 mem_cgroup_charge_statistics(mem, pc, false); 2599 for (i = 0; i < count; i++)
2600 mem_cgroup_charge_statistics(mem, pc + i, false);
2562 2601
2563 ClearPageCgroupUsed(pc); 2602 ClearPageCgroupUsed(pc);
2564 /* 2603 /*
@@ -2579,7 +2618,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2579 mem_cgroup_get(mem); 2618 mem_cgroup_get(mem);
2580 } 2619 }
2581 if (!mem_cgroup_is_root(mem)) 2620 if (!mem_cgroup_is_root(mem))
2582 __do_uncharge(mem, ctype); 2621 __do_uncharge(mem, ctype, page_size);
2583 2622
2584 return mem; 2623 return mem;
2585 2624
@@ -2774,6 +2813,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2774 enum charge_type ctype; 2813 enum charge_type ctype;
2775 int ret = 0; 2814 int ret = 0;
2776 2815
2816 VM_BUG_ON(PageTransHuge(page));
2777 if (mem_cgroup_disabled()) 2817 if (mem_cgroup_disabled())
2778 return 0; 2818 return 0;
2779 2819
@@ -2823,7 +2863,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2823 return 0; 2863 return 0;
2824 2864
2825 *ptr = mem; 2865 *ptr = mem;
2826 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); 2866 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE);
2827 css_put(&mem->css);/* drop extra refcnt */ 2867 css_put(&mem->css);/* drop extra refcnt */
2828 if (ret || *ptr == NULL) { 2868 if (ret || *ptr == NULL) {
2829 if (PageAnon(page)) { 2869 if (PageAnon(page)) {
@@ -2850,13 +2890,13 @@ int mem_cgroup_prepare_migration(struct page *page,
2850 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 2890 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2851 else 2891 else
2852 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 2892 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2853 __mem_cgroup_commit_charge(mem, pc, ctype); 2893 __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE);
2854 return ret; 2894 return ret;
2855} 2895}
2856 2896
2857/* remove redundant charge if migration failed*/ 2897/* remove redundant charge if migration failed*/
2858void mem_cgroup_end_migration(struct mem_cgroup *mem, 2898void mem_cgroup_end_migration(struct mem_cgroup *mem,
2859 struct page *oldpage, struct page *newpage) 2899 struct page *oldpage, struct page *newpage, bool migration_ok)
2860{ 2900{
2861 struct page *used, *unused; 2901 struct page *used, *unused;
2862 struct page_cgroup *pc; 2902 struct page_cgroup *pc;
@@ -2865,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
2865 return; 2905 return;
2866 /* blocks rmdir() */ 2906 /* blocks rmdir() */
2867 cgroup_exclude_rmdir(&mem->css); 2907 cgroup_exclude_rmdir(&mem->css);
2868 /* at migration success, oldpage->mapping is NULL. */ 2908 if (!migration_ok) {
2869 if (oldpage->mapping) {
2870 used = oldpage; 2909 used = oldpage;
2871 unused = newpage; 2910 unused = newpage;
2872 } else { 2911 } else {
@@ -4176,13 +4215,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4176 */ 4215 */
4177 if (!node_state(node, N_NORMAL_MEMORY)) 4216 if (!node_state(node, N_NORMAL_MEMORY))
4178 tmp = -1; 4217 tmp = -1;
4179 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4218 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4180 if (!pn) 4219 if (!pn)
4181 return 1; 4220 return 1;
4182 4221
4183 mem->info.nodeinfo[node] = pn; 4222 mem->info.nodeinfo[node] = pn;
4184 memset(pn, 0, sizeof(*pn));
4185
4186 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4223 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4187 mz = &pn->zoneinfo[zone]; 4224 mz = &pn->zoneinfo[zone];
4188 for_each_lru(l) 4225 for_each_lru(l)
@@ -4206,14 +4243,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
4206 4243
4207 /* Can be very big if MAX_NUMNODES is very big */ 4244 /* Can be very big if MAX_NUMNODES is very big */
4208 if (size < PAGE_SIZE) 4245 if (size < PAGE_SIZE)
4209 mem = kmalloc(size, GFP_KERNEL); 4246 mem = kzalloc(size, GFP_KERNEL);
4210 else 4247 else
4211 mem = vmalloc(size); 4248 mem = vzalloc(size);
4212 4249
4213 if (!mem) 4250 if (!mem)
4214 return NULL; 4251 return NULL;
4215 4252
4216 memset(mem, 0, size);
4217 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4253 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4218 if (!mem->stat) 4254 if (!mem->stat)
4219 goto out_free; 4255 goto out_free;
@@ -4461,7 +4497,8 @@ one_by_one:
4461 batch_count = PRECHARGE_COUNT_AT_ONCE; 4497 batch_count = PRECHARGE_COUNT_AT_ONCE;
4462 cond_resched(); 4498 cond_resched();
4463 } 4499 }
4464 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 4500 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
4501 PAGE_SIZE);
4465 if (ret || !mem) 4502 if (ret || !mem)
4466 /* mem_cgroup_clear_mc() will do uncharge later */ 4503 /* mem_cgroup_clear_mc() will do uncharge later */
4467 return -ENOMEM; 4504 return -ENOMEM;
@@ -4623,6 +4660,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4623 pte_t *pte; 4660 pte_t *pte;
4624 spinlock_t *ptl; 4661 spinlock_t *ptl;
4625 4662
4663 VM_BUG_ON(pmd_trans_huge(*pmd));
4626 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4664 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4627 for (; addr != end; pte++, addr += PAGE_SIZE) 4665 for (; addr != end; pte++, addr += PAGE_SIZE)
4628 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 4666 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4638,7 +4676,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4638 unsigned long precharge; 4676 unsigned long precharge;
4639 struct vm_area_struct *vma; 4677 struct vm_area_struct *vma;
4640 4678
4641 /* We've already held the mmap_sem */ 4679 down_read(&mm->mmap_sem);
4642 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4680 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4643 struct mm_walk mem_cgroup_count_precharge_walk = { 4681 struct mm_walk mem_cgroup_count_precharge_walk = {
4644 .pmd_entry = mem_cgroup_count_precharge_pte_range, 4682 .pmd_entry = mem_cgroup_count_precharge_pte_range,
@@ -4650,6 +4688,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4650 walk_page_range(vma->vm_start, vma->vm_end, 4688 walk_page_range(vma->vm_start, vma->vm_end,
4651 &mem_cgroup_count_precharge_walk); 4689 &mem_cgroup_count_precharge_walk);
4652 } 4690 }
4691 up_read(&mm->mmap_sem);
4653 4692
4654 precharge = mc.precharge; 4693 precharge = mc.precharge;
4655 mc.precharge = 0; 4694 mc.precharge = 0;
@@ -4659,10 +4698,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4659 4698
4660static int mem_cgroup_precharge_mc(struct mm_struct *mm) 4699static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4661{ 4700{
4662 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); 4701 unsigned long precharge = mem_cgroup_count_precharge(mm);
4702
4703 VM_BUG_ON(mc.moving_task);
4704 mc.moving_task = current;
4705 return mem_cgroup_do_precharge(precharge);
4663} 4706}
4664 4707
4665static void mem_cgroup_clear_mc(void) 4708/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
4709static void __mem_cgroup_clear_mc(void)
4666{ 4710{
4667 struct mem_cgroup *from = mc.from; 4711 struct mem_cgroup *from = mc.from;
4668 struct mem_cgroup *to = mc.to; 4712 struct mem_cgroup *to = mc.to;
@@ -4697,23 +4741,28 @@ static void mem_cgroup_clear_mc(void)
4697 PAGE_SIZE * mc.moved_swap); 4741 PAGE_SIZE * mc.moved_swap);
4698 } 4742 }
4699 /* we've already done mem_cgroup_get(mc.to) */ 4743 /* we've already done mem_cgroup_get(mc.to) */
4700
4701 mc.moved_swap = 0; 4744 mc.moved_swap = 0;
4702 } 4745 }
4703 if (mc.mm) { 4746 memcg_oom_recover(from);
4704 up_read(&mc.mm->mmap_sem); 4747 memcg_oom_recover(to);
4705 mmput(mc.mm); 4748 wake_up_all(&mc.waitq);
4706 } 4749}
4750
4751static void mem_cgroup_clear_mc(void)
4752{
4753 struct mem_cgroup *from = mc.from;
4754
4755 /*
4756 * we must clear moving_task before waking up waiters at the end of
4757 * task migration.
4758 */
4759 mc.moving_task = NULL;
4760 __mem_cgroup_clear_mc();
4707 spin_lock(&mc.lock); 4761 spin_lock(&mc.lock);
4708 mc.from = NULL; 4762 mc.from = NULL;
4709 mc.to = NULL; 4763 mc.to = NULL;
4710 spin_unlock(&mc.lock); 4764 spin_unlock(&mc.lock);
4711 mc.moving_task = NULL;
4712 mc.mm = NULL;
4713 mem_cgroup_end_move(from); 4765 mem_cgroup_end_move(from);
4714 memcg_oom_recover(from);
4715 memcg_oom_recover(to);
4716 wake_up_all(&mc.waitq);
4717} 4766}
4718 4767
4719static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 4768static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
@@ -4735,38 +4784,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4735 return 0; 4784 return 0;
 4736 /* We move charges only when we move an owner of the mm */ 4785 /* We move charges only when we move an owner of the mm */
4737 if (mm->owner == p) { 4786 if (mm->owner == p) {
4738 /*
4739 * We do all the move charge works under one mmap_sem to
4740 * avoid deadlock with down_write(&mmap_sem)
4741 * -> try_charge() -> if (mc.moving_task) -> sleep.
4742 */
4743 down_read(&mm->mmap_sem);
4744
4745 VM_BUG_ON(mc.from); 4787 VM_BUG_ON(mc.from);
4746 VM_BUG_ON(mc.to); 4788 VM_BUG_ON(mc.to);
4747 VM_BUG_ON(mc.precharge); 4789 VM_BUG_ON(mc.precharge);
4748 VM_BUG_ON(mc.moved_charge); 4790 VM_BUG_ON(mc.moved_charge);
4749 VM_BUG_ON(mc.moved_swap); 4791 VM_BUG_ON(mc.moved_swap);
4750 VM_BUG_ON(mc.moving_task);
4751 VM_BUG_ON(mc.mm);
4752
4753 mem_cgroup_start_move(from); 4792 mem_cgroup_start_move(from);
4754 spin_lock(&mc.lock); 4793 spin_lock(&mc.lock);
4755 mc.from = from; 4794 mc.from = from;
4756 mc.to = mem; 4795 mc.to = mem;
4757 mc.precharge = 0;
4758 mc.moved_charge = 0;
4759 mc.moved_swap = 0;
4760 spin_unlock(&mc.lock); 4796 spin_unlock(&mc.lock);
4761 mc.moving_task = current; 4797 /* We set mc.moving_task later */
4762 mc.mm = mm;
4763 4798
4764 ret = mem_cgroup_precharge_mc(mm); 4799 ret = mem_cgroup_precharge_mc(mm);
4765 if (ret) 4800 if (ret)
4766 mem_cgroup_clear_mc(); 4801 mem_cgroup_clear_mc();
4767 /* We call up_read() and mmput() in clear_mc(). */ 4802 }
4768 } else 4803 mmput(mm);
4769 mmput(mm);
4770 } 4804 }
4771 return ret; 4805 return ret;
4772} 4806}
@@ -4789,6 +4823,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4789 spinlock_t *ptl; 4823 spinlock_t *ptl;
4790 4824
4791retry: 4825retry:
4826 VM_BUG_ON(pmd_trans_huge(*pmd));
4792 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 4827 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4793 for (; addr != end; addr += PAGE_SIZE) { 4828 for (; addr != end; addr += PAGE_SIZE) {
4794 pte_t ptent = *(pte++); 4829 pte_t ptent = *(pte++);
@@ -4854,7 +4889,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4854 struct vm_area_struct *vma; 4889 struct vm_area_struct *vma;
4855 4890
4856 lru_add_drain_all(); 4891 lru_add_drain_all();
4857 /* We've already held the mmap_sem */ 4892retry:
4893 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
4894 /*
 4895 * Someone holding the mmap_sem might be waiting in
4896 * waitq. So we cancel all extra charges, wake up all waiters,
4897 * and retry. Because we cancel precharges, we might not be able
4898 * to move enough charges, but moving charge is a best-effort
4899 * feature anyway, so it wouldn't be a big problem.
4900 */
4901 __mem_cgroup_clear_mc();
4902 cond_resched();
4903 goto retry;
4904 }
4858 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4905 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4859 int ret; 4906 int ret;
4860 struct mm_walk mem_cgroup_move_charge_walk = { 4907 struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4873,6 +4920,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4873 */ 4920 */
4874 break; 4921 break;
4875 } 4922 }
4923 up_read(&mm->mmap_sem);
4876} 4924}
4877 4925
4878static void mem_cgroup_move_task(struct cgroup_subsys *ss, 4926static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -4881,11 +4929,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4881 struct task_struct *p, 4929 struct task_struct *p,
4882 bool threadgroup) 4930 bool threadgroup)
4883{ 4931{
4884 if (!mc.mm) 4932 struct mm_struct *mm;
4933
4934 if (!mc.to)
4885 /* no need to move charge */ 4935 /* no need to move charge */
4886 return; 4936 return;
4887 4937
4888 mem_cgroup_move_charge(mc.mm); 4938 mm = get_task_mm(p);
4939 if (mm) {
4940 mem_cgroup_move_charge(mm);
4941 mmput(mm);
4942 }
4889 mem_cgroup_clear_mc(); 4943 mem_cgroup_clear_mc();
4890} 4944}
4891#else /* !CONFIG_MMU */ 4945#else /* !CONFIG_MMU */
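
The common thread in the memcontrol.c changes is that the charge, commit and uncharge paths now take a page_size, so a THP is accounted against the res_counter in a single operation instead of once per 4 KB page. Roughly, the size is derived from the compound page as below (a condensed sketch reusing the names from the hunks above, not the full charge path):

	int page_size = PAGE_SIZE;

	if (PageTransHuge(page))			/* THP head page */
		page_size <<= compound_order(page);	/* e.g. 4 KB << 9 = 2 MB on x86 */

	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size);
	if (!ret && mem)
		__mem_cgroup_commit_charge(mem, pc, ctype, page_size);
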
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 46ab2c044b0e..548fbd70f026 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -203,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
203#ifdef __ARCH_SI_TRAPNO 203#ifdef __ARCH_SI_TRAPNO
204 si.si_trapno = trapno; 204 si.si_trapno = trapno;
205#endif 205#endif
206 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; 206 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
207 /* 207 /*
208 * Don't use force here, it's convenient if the signal 208 * Don't use force here, it's convenient if the signal
209 * can be temporarily blocked. 209 * can be temporarily blocked.
@@ -386,6 +386,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
386 struct task_struct *tsk; 386 struct task_struct *tsk;
387 struct anon_vma *av; 387 struct anon_vma *av;
388 388
389 if (!PageHuge(page) && unlikely(split_huge_page(page)))
390 return;
389 read_lock(&tasklist_lock); 391 read_lock(&tasklist_lock);
390 av = page_lock_anon_vma(page); 392 av = page_lock_anon_vma(page);
391 if (av == NULL) /* Not actually mapped anymore */ 393 if (av == NULL) /* Not actually mapped anymore */
@@ -928,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
928static void set_page_hwpoison_huge_page(struct page *hpage) 930static void set_page_hwpoison_huge_page(struct page *hpage)
929{ 931{
930 int i; 932 int i;
931 int nr_pages = 1 << compound_order(hpage); 933 int nr_pages = 1 << compound_trans_order(hpage);
932 for (i = 0; i < nr_pages; i++) 934 for (i = 0; i < nr_pages; i++)
933 SetPageHWPoison(hpage + i); 935 SetPageHWPoison(hpage + i);
934} 936}
@@ -936,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
936static void clear_page_hwpoison_huge_page(struct page *hpage) 938static void clear_page_hwpoison_huge_page(struct page *hpage)
937{ 939{
938 int i; 940 int i;
939 int nr_pages = 1 << compound_order(hpage); 941 int nr_pages = 1 << compound_trans_order(hpage);
940 for (i = 0; i < nr_pages; i++) 942 for (i = 0; i < nr_pages; i++)
941 ClearPageHWPoison(hpage + i); 943 ClearPageHWPoison(hpage + i);
942} 944}
@@ -966,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
966 return 0; 968 return 0;
967 } 969 }
968 970
969 nr_pages = 1 << compound_order(hpage); 971 nr_pages = 1 << compound_trans_order(hpage);
970 atomic_long_add(nr_pages, &mce_bad_pages); 972 atomic_long_add(nr_pages, &mce_bad_pages);
971 973
972 /* 974 /*
@@ -1164,7 +1166,7 @@ int unpoison_memory(unsigned long pfn)
1164 return 0; 1166 return 0;
1165 } 1167 }
1166 1168
1167 nr_pages = 1 << compound_order(page); 1169 nr_pages = 1 << compound_trans_order(page);
1168 1170
1169 if (!get_page_unless_zero(page)) { 1171 if (!get_page_unless_zero(page)) {
1170 /* 1172 /*
@@ -1290,9 +1292,10 @@ static int soft_offline_huge_page(struct page *page, int flags)
1290 /* Keep page count to indicate a given hugepage is isolated. */ 1292 /* Keep page count to indicate a given hugepage is isolated. */
1291 1293
1292 list_add(&hpage->lru, &pagelist); 1294 list_add(&hpage->lru, &pagelist);
1293 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1295 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
1296 true);
1294 if (ret) { 1297 if (ret) {
1295 putback_lru_pages(&pagelist); 1298 putback_lru_pages(&pagelist);
1296 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1299 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1297 pfn, ret, page->flags); 1300 pfn, ret, page->flags);
1298 if (ret > 0) 1301 if (ret > 0)
@@ -1301,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1301 } 1304 }
1302done: 1305done:
1303 if (!PageHWPoison(hpage)) 1306 if (!PageHWPoison(hpage))
1304 atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); 1307 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
1305 set_page_hwpoison_huge_page(hpage); 1308 set_page_hwpoison_huge_page(hpage);
1306 dequeue_hwpoisoned_huge_page(hpage); 1309 dequeue_hwpoisoned_huge_page(hpage);
1307 /* keep elevated page count for bad page */ 1310 /* keep elevated page count for bad page */
@@ -1413,7 +1416,8 @@ int soft_offline_page(struct page *page, int flags)
1413 LIST_HEAD(pagelist); 1416 LIST_HEAD(pagelist);
1414 1417
1415 list_add(&page->lru, &pagelist); 1418 list_add(&page->lru, &pagelist);
1416 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1419 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1420 0, true);
1417 if (ret) { 1421 if (ret) {
1418 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1422 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1419 pfn, ret, page->flags); 1423 pfn, ret, page->flags);
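
memory-failure.c switches from compound_order() to compound_trans_order() wherever it sizes the poisoned range, since the page may be a THP that can be split while hwpoison inspects it; the accounting it feeds stays a plain page count, e.g. (a short sketch restating the calls from the hunks above):

	int nr_pages = 1 << compound_trans_order(compound_head(p));

	atomic_long_add(nr_pages, &mce_bad_pages);	/* 512 pages for a 2 MB THP */
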
diff --git a/mm/memory.c b/mm/memory.c
index 02e48aa0ed13..31250faff390 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
394 } 394 }
395} 395}
396 396
397int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 397int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
398 pmd_t *pmd, unsigned long address)
398{ 399{
399 pgtable_t new = pte_alloc_one(mm, address); 400 pgtable_t new = pte_alloc_one(mm, address);
401 int wait_split_huge_page;
400 if (!new) 402 if (!new)
401 return -ENOMEM; 403 return -ENOMEM;
402 404
@@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
416 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 418 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
417 419
418 spin_lock(&mm->page_table_lock); 420 spin_lock(&mm->page_table_lock);
419 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 421 wait_split_huge_page = 0;
422 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
420 mm->nr_ptes++; 423 mm->nr_ptes++;
421 pmd_populate(mm, pmd, new); 424 pmd_populate(mm, pmd, new);
422 new = NULL; 425 new = NULL;
423 } 426 } else if (unlikely(pmd_trans_splitting(*pmd)))
427 wait_split_huge_page = 1;
424 spin_unlock(&mm->page_table_lock); 428 spin_unlock(&mm->page_table_lock);
425 if (new) 429 if (new)
426 pte_free(mm, new); 430 pte_free(mm, new);
431 if (wait_split_huge_page)
432 wait_split_huge_page(vma->anon_vma, pmd);
427 return 0; 433 return 0;
428} 434}
429 435
@@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
436 smp_wmb(); /* See comment in __pte_alloc */ 442 smp_wmb(); /* See comment in __pte_alloc */
437 443
438 spin_lock(&init_mm.page_table_lock); 444 spin_lock(&init_mm.page_table_lock);
439 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 445 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
440 pmd_populate_kernel(&init_mm, pmd, new); 446 pmd_populate_kernel(&init_mm, pmd, new);
441 new = NULL; 447 new = NULL;
442 } 448 } else
449 VM_BUG_ON(pmd_trans_splitting(*pmd));
443 spin_unlock(&init_mm.page_table_lock); 450 spin_unlock(&init_mm.page_table_lock);
444 if (new) 451 if (new)
445 pte_free_kernel(&init_mm, new); 452 pte_free_kernel(&init_mm, new);
@@ -719,9 +726,9 @@ out_set_pte:
719 return 0; 726 return 0;
720} 727}
721 728
722static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 729int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
723 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 730 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
724 unsigned long addr, unsigned long end) 731 unsigned long addr, unsigned long end)
725{ 732{
726 pte_t *orig_src_pte, *orig_dst_pte; 733 pte_t *orig_src_pte, *orig_dst_pte;
727 pte_t *src_pte, *dst_pte; 734 pte_t *src_pte, *dst_pte;
@@ -795,6 +802,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
795 src_pmd = pmd_offset(src_pud, addr); 802 src_pmd = pmd_offset(src_pud, addr);
796 do { 803 do {
797 next = pmd_addr_end(addr, end); 804 next = pmd_addr_end(addr, end);
805 if (pmd_trans_huge(*src_pmd)) {
806 int err;
807 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
808 err = copy_huge_pmd(dst_mm, src_mm,
809 dst_pmd, src_pmd, addr, vma);
810 if (err == -ENOMEM)
811 return -ENOMEM;
812 if (!err)
813 continue;
814 /* fall through */
815 }
798 if (pmd_none_or_clear_bad(src_pmd)) 816 if (pmd_none_or_clear_bad(src_pmd))
799 continue; 817 continue;
800 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, 818 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -997,6 +1015,16 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
997 pmd = pmd_offset(pud, addr); 1015 pmd = pmd_offset(pud, addr);
998 do { 1016 do {
999 next = pmd_addr_end(addr, end); 1017 next = pmd_addr_end(addr, end);
1018 if (pmd_trans_huge(*pmd)) {
1019 if (next-addr != HPAGE_PMD_SIZE) {
1020 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1021 split_huge_page_pmd(vma->vm_mm, pmd);
1022 } else if (zap_huge_pmd(tlb, vma, pmd)) {
1023 (*zap_work)--;
1024 continue;
1025 }
1026 /* fall through */
1027 }
1000 if (pmd_none_or_clear_bad(pmd)) { 1028 if (pmd_none_or_clear_bad(pmd)) {
1001 (*zap_work)--; 1029 (*zap_work)--;
1002 continue; 1030 continue;
@@ -1262,7 +1290,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1262 pud = pud_offset(pgd, address); 1290 pud = pud_offset(pgd, address);
1263 if (pud_none(*pud)) 1291 if (pud_none(*pud))
1264 goto no_page_table; 1292 goto no_page_table;
1265 if (pud_huge(*pud)) { 1293 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1266 BUG_ON(flags & FOLL_GET); 1294 BUG_ON(flags & FOLL_GET);
1267 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 1295 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1268 goto out; 1296 goto out;
@@ -1273,11 +1301,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1273 pmd = pmd_offset(pud, address); 1301 pmd = pmd_offset(pud, address);
1274 if (pmd_none(*pmd)) 1302 if (pmd_none(*pmd))
1275 goto no_page_table; 1303 goto no_page_table;
1276 if (pmd_huge(*pmd)) { 1304 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1277 BUG_ON(flags & FOLL_GET); 1305 BUG_ON(flags & FOLL_GET);
1278 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1306 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1279 goto out; 1307 goto out;
1280 } 1308 }
1309 if (pmd_trans_huge(*pmd)) {
1310 if (flags & FOLL_SPLIT) {
1311 split_huge_page_pmd(mm, pmd);
1312 goto split_fallthrough;
1313 }
1314 spin_lock(&mm->page_table_lock);
1315 if (likely(pmd_trans_huge(*pmd))) {
1316 if (unlikely(pmd_trans_splitting(*pmd))) {
1317 spin_unlock(&mm->page_table_lock);
1318 wait_split_huge_page(vma->anon_vma, pmd);
1319 } else {
1320 page = follow_trans_huge_pmd(mm, address,
1321 pmd, flags);
1322 spin_unlock(&mm->page_table_lock);
1323 goto out;
1324 }
1325 } else
1326 spin_unlock(&mm->page_table_lock);
1327 /* fall through */
1328 }
1329split_fallthrough:
1281 if (unlikely(pmd_bad(*pmd))) 1330 if (unlikely(pmd_bad(*pmd)))
1282 goto no_page_table; 1331 goto no_page_table;
1283 1332
@@ -1310,6 +1359,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1310 */ 1359 */
1311 mark_page_accessed(page); 1360 mark_page_accessed(page);
1312 } 1361 }
1362 if (flags & FOLL_MLOCK) {
1363 /*
1364 * The preliminary mapping check is mainly to avoid the
1365 * pointless overhead of lock_page on the ZERO_PAGE
1366 * which might bounce very badly if there is contention.
1367 *
1368 * If the page is already locked, we don't need to
1369 * handle it now - vmscan will handle it later if and
1370 * when it attempts to reclaim the page.
1371 */
1372 if (page->mapping && trylock_page(page)) {
1373 lru_add_drain(); /* push cached pages to LRU */
1374 /*
1375 * Because we lock page here and migration is
1376 * blocked by the pte's page reference, we need
1377 * only check for file-cache page truncation.
1378 */
1379 if (page->mapping)
1380 mlock_vma_page(page);
1381 unlock_page(page);
1382 }
1383 }
1313unlock: 1384unlock:
1314 pte_unmap_unlock(ptep, ptl); 1385 pte_unmap_unlock(ptep, ptl);
1315out: 1386out:
@@ -1341,7 +1412,8 @@ no_page_table:
1341 1412
1342int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1413int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1343 unsigned long start, int nr_pages, unsigned int gup_flags, 1414 unsigned long start, int nr_pages, unsigned int gup_flags,
1344 struct page **pages, struct vm_area_struct **vmas) 1415 struct page **pages, struct vm_area_struct **vmas,
1416 int *nonblocking)
1345{ 1417{
1346 int i; 1418 int i;
1347 unsigned long vm_flags; 1419 unsigned long vm_flags;
@@ -1386,6 +1458,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1386 pmd = pmd_offset(pud, pg); 1458 pmd = pmd_offset(pud, pg);
1387 if (pmd_none(*pmd)) 1459 if (pmd_none(*pmd))
1388 return i ? : -EFAULT; 1460 return i ? : -EFAULT;
1461 VM_BUG_ON(pmd_trans_huge(*pmd));
1389 pte = pte_offset_map(pmd, pg); 1462 pte = pte_offset_map(pmd, pg);
1390 if (pte_none(*pte)) { 1463 if (pte_none(*pte)) {
1391 pte_unmap(pte); 1464 pte_unmap(pte);
@@ -1441,10 +1514,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1441 cond_resched(); 1514 cond_resched();
1442 while (!(page = follow_page(vma, start, foll_flags))) { 1515 while (!(page = follow_page(vma, start, foll_flags))) {
1443 int ret; 1516 int ret;
1517 unsigned int fault_flags = 0;
1518
1519 if (foll_flags & FOLL_WRITE)
1520 fault_flags |= FAULT_FLAG_WRITE;
1521 if (nonblocking)
1522 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1444 1523
1445 ret = handle_mm_fault(mm, vma, start, 1524 ret = handle_mm_fault(mm, vma, start,
1446 (foll_flags & FOLL_WRITE) ? 1525 fault_flags);
1447 FAULT_FLAG_WRITE : 0);
1448 1526
1449 if (ret & VM_FAULT_ERROR) { 1527 if (ret & VM_FAULT_ERROR) {
1450 if (ret & VM_FAULT_OOM) 1528 if (ret & VM_FAULT_OOM)
@@ -1460,6 +1538,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1460 else 1538 else
1461 tsk->min_flt++; 1539 tsk->min_flt++;
1462 1540
1541 if (ret & VM_FAULT_RETRY) {
1542 *nonblocking = 0;
1543 return i;
1544 }
1545
1463 /* 1546 /*
1464 * The VM_FAULT_WRITE bit tells us that 1547 * The VM_FAULT_WRITE bit tells us that
1465 * do_wp_page has broken COW when necessary, 1548 * do_wp_page has broken COW when necessary,
@@ -1559,7 +1642,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1559 if (force) 1642 if (force)
1560 flags |= FOLL_FORCE; 1643 flags |= FOLL_FORCE;
1561 1644
1562 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 1645 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1646 NULL);
1563} 1647}
1564EXPORT_SYMBOL(get_user_pages); 1648EXPORT_SYMBOL(get_user_pages);
1565 1649
@@ -1584,7 +1668,8 @@ struct page *get_dump_page(unsigned long addr)
1584 struct page *page; 1668 struct page *page;
1585 1669
1586 if (__get_user_pages(current, current->mm, addr, 1, 1670 if (__get_user_pages(current, current->mm, addr, 1,
1587 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) 1671 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1672 NULL) < 1)
1588 return NULL; 1673 return NULL;
1589 flush_cache_page(vma, addr, page_to_pfn(page)); 1674 flush_cache_page(vma, addr, page_to_pfn(page));
1590 return page; 1675 return page;
@@ -1598,8 +1683,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1598 pud_t * pud = pud_alloc(mm, pgd, addr); 1683 pud_t * pud = pud_alloc(mm, pgd, addr);
1599 if (pud) { 1684 if (pud) {
1600 pmd_t * pmd = pmd_alloc(mm, pud, addr); 1685 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1601 if (pmd) 1686 if (pmd) {
1687 VM_BUG_ON(pmd_trans_huge(*pmd));
1602 return pte_alloc_map_lock(mm, pmd, addr, ptl); 1688 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1689 }
1603 } 1690 }
1604 return NULL; 1691 return NULL;
1605} 1692}
@@ -1818,6 +1905,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1818 pmd = pmd_alloc(mm, pud, addr); 1905 pmd = pmd_alloc(mm, pud, addr);
1819 if (!pmd) 1906 if (!pmd)
1820 return -ENOMEM; 1907 return -ENOMEM;
1908 VM_BUG_ON(pmd_trans_huge(*pmd));
1821 do { 1909 do {
1822 next = pmd_addr_end(addr, end); 1910 next = pmd_addr_end(addr, end);
1823 if (remap_pte_range(mm, pmd, addr, next, 1911 if (remap_pte_range(mm, pmd, addr, next,
@@ -2048,19 +2136,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2048 return same; 2136 return same;
2049} 2137}
2050 2138
2051/*
2052 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
2053 * servicing faults for write access. In the normal case, do always want
2054 * pte_mkwrite. But get_user_pages can cause write faults for mappings
2055 * that do not have writing enabled, when used by access_process_vm.
2056 */
2057static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
2058{
2059 if (likely(vma->vm_flags & VM_WRITE))
2060 pte = pte_mkwrite(pte);
2061 return pte;
2062}
2063
2064static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2139static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2065{ 2140{
2066 /* 2141 /*
@@ -2112,7 +2187,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2112{ 2187{
2113 struct page *old_page, *new_page; 2188 struct page *old_page, *new_page;
2114 pte_t entry; 2189 pte_t entry;
2115 int reuse = 0, ret = 0; 2190 int ret = 0;
2116 int page_mkwrite = 0; 2191 int page_mkwrite = 0;
2117 struct page *dirty_page = NULL; 2192 struct page *dirty_page = NULL;
2118 2193
@@ -2149,14 +2224,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2149 } 2224 }
2150 page_cache_release(old_page); 2225 page_cache_release(old_page);
2151 } 2226 }
2152 reuse = reuse_swap_page(old_page); 2227 if (reuse_swap_page(old_page)) {
2153 if (reuse)
2154 /* 2228 /*
2155 * The page is all ours. Move it to our anon_vma so 2229 * The page is all ours. Move it to our anon_vma so
2156 * the rmap code will not search our parent or siblings. 2230 * the rmap code will not search our parent or siblings.
2157 * Protected against the rmap code by the page lock. 2231 * Protected against the rmap code by the page lock.
2158 */ 2232 */
2159 page_move_anon_rmap(old_page, vma, address); 2233 page_move_anon_rmap(old_page, vma, address);
2234 unlock_page(old_page);
2235 goto reuse;
2236 }
2160 unlock_page(old_page); 2237 unlock_page(old_page);
2161 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2238 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2162 (VM_WRITE|VM_SHARED))) { 2239 (VM_WRITE|VM_SHARED))) {
@@ -2220,18 +2297,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2220 } 2297 }
2221 dirty_page = old_page; 2298 dirty_page = old_page;
2222 get_page(dirty_page); 2299 get_page(dirty_page);
2223 reuse = 1;
2224 }
2225 2300
2226 if (reuse) {
2227reuse: 2301reuse:
2228 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2302 flush_cache_page(vma, address, pte_pfn(orig_pte));
2229 entry = pte_mkyoung(orig_pte); 2303 entry = pte_mkyoung(orig_pte);
2230 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2304 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2231 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2305 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2232 update_mmu_cache(vma, address, page_table); 2306 update_mmu_cache(vma, address, page_table);
2307 pte_unmap_unlock(page_table, ptl);
2233 ret |= VM_FAULT_WRITE; 2308 ret |= VM_FAULT_WRITE;
2234 goto unlock; 2309
2310 if (!dirty_page)
2311 return ret;
2312
2313 /*
2314 * Yes, Virginia, this is actually required to prevent a race
2315 * with clear_page_dirty_for_io() from clearing the page dirty
2316 * bit after it clear all dirty ptes, but before a racing
2317 * do_wp_page installs a dirty pte.
2318 *
2319 * do_no_page is protected similarly.
2320 */
2321 if (!page_mkwrite) {
2322 wait_on_page_locked(dirty_page);
2323 set_page_dirty_balance(dirty_page, page_mkwrite);
2324 }
2325 put_page(dirty_page);
2326 if (page_mkwrite) {
2327 struct address_space *mapping = dirty_page->mapping;
2328
2329 set_page_dirty(dirty_page);
2330 unlock_page(dirty_page);
2331 page_cache_release(dirty_page);
2332 if (mapping) {
2333 /*
2334 * Some device drivers do not set page.mapping
2335 * but still dirty their pages
2336 */
2337 balance_dirty_pages_ratelimited(mapping);
2338 }
2339 }
2340
2341 /* file_update_time outside page_lock */
2342 if (vma->vm_file)
2343 file_update_time(vma->vm_file);
2344
2345 return ret;
2235 } 2346 }
2236 2347
2237 /* 2348 /*
@@ -2337,39 +2448,6 @@ gotten:
2337 page_cache_release(old_page); 2448 page_cache_release(old_page);
2338unlock: 2449unlock:
2339 pte_unmap_unlock(page_table, ptl); 2450 pte_unmap_unlock(page_table, ptl);
2340 if (dirty_page) {
2341 /*
2342 * Yes, Virginia, this is actually required to prevent a race
2343 * with clear_page_dirty_for_io() from clearing the page dirty
2344 * bit after it clear all dirty ptes, but before a racing
2345 * do_wp_page installs a dirty pte.
2346 *
2347 * do_no_page is protected similarly.
2348 */
2349 if (!page_mkwrite) {
2350 wait_on_page_locked(dirty_page);
2351 set_page_dirty_balance(dirty_page, page_mkwrite);
2352 }
2353 put_page(dirty_page);
2354 if (page_mkwrite) {
2355 struct address_space *mapping = dirty_page->mapping;
2356
2357 set_page_dirty(dirty_page);
2358 unlock_page(dirty_page);
2359 page_cache_release(dirty_page);
2360 if (mapping) {
2361 /*
2362 * Some device drivers do not set page.mapping
2363 * but still dirty their pages
2364 */
2365 balance_dirty_pages_ratelimited(mapping);
2366 }
2367 }
2368
2369 /* file_update_time outside page_lock */
2370 if (vma->vm_file)
2371 file_update_time(vma->vm_file);
2372 }
2373 return ret; 2451 return ret;
2374oom_free_new: 2452oom_free_new:
2375 page_cache_release(new_page); 2453 page_cache_release(new_page);
@@ -3147,9 +3225,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3147 * but allow concurrent faults), and pte mapped but not yet locked. 3225 * but allow concurrent faults), and pte mapped but not yet locked.
3148 * We return with mmap_sem still held, but pte unmapped and unlocked. 3226 * We return with mmap_sem still held, but pte unmapped and unlocked.
3149 */ 3227 */
3150static inline int handle_pte_fault(struct mm_struct *mm, 3228int handle_pte_fault(struct mm_struct *mm,
3151 struct vm_area_struct *vma, unsigned long address, 3229 struct vm_area_struct *vma, unsigned long address,
3152 pte_t *pte, pmd_t *pmd, unsigned int flags) 3230 pte_t *pte, pmd_t *pmd, unsigned int flags)
3153{ 3231{
3154 pte_t entry; 3232 pte_t entry;
3155 spinlock_t *ptl; 3233 spinlock_t *ptl;
@@ -3228,9 +3306,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3228 pmd = pmd_alloc(mm, pud, address); 3306 pmd = pmd_alloc(mm, pud, address);
3229 if (!pmd) 3307 if (!pmd)
3230 return VM_FAULT_OOM; 3308 return VM_FAULT_OOM;
3231 pte = pte_alloc_map(mm, pmd, address); 3309 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3232 if (!pte) 3310 if (!vma->vm_ops)
3311 return do_huge_pmd_anonymous_page(mm, vma, address,
3312 pmd, flags);
3313 } else {
3314 pmd_t orig_pmd = *pmd;
3315 barrier();
3316 if (pmd_trans_huge(orig_pmd)) {
3317 if (flags & FAULT_FLAG_WRITE &&
3318 !pmd_write(orig_pmd) &&
3319 !pmd_trans_splitting(orig_pmd))
3320 return do_huge_pmd_wp_page(mm, vma, address,
3321 pmd, orig_pmd);
3322 return 0;
3323 }
3324 }
3325
3326 /*
3327 * Use __pte_alloc instead of pte_alloc_map, because we can't
 3328 * run pte_offset_map on the pmd, if a huge pmd could
 3329 * materialize under us from a different thread.
3330 */
3331 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
3233 return VM_FAULT_OOM; 3332 return VM_FAULT_OOM;
 3333 /* if a huge pmd materialized under us, just retry later */
3334 if (unlikely(pmd_trans_huge(*pmd)))
3335 return 0;
3336 /*
3337 * A regular pmd is established and it can't morph into a huge pmd
3338 * from under us anymore at this point because we hold the mmap_sem
 3339 * in read mode and khugepaged takes it in write mode. So now it's
3340 * safe to run pte_offset_map().
3341 */
3342 pte = pte_offset_map(pmd, address);
3234 3343
3235 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3344 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3236} 3345}
@@ -3296,7 +3405,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
3296 vma = find_vma(current->mm, addr); 3405 vma = find_vma(current->mm, addr);
3297 if (!vma) 3406 if (!vma)
3298 return -ENOMEM; 3407 return -ENOMEM;
3299 write = (vma->vm_flags & VM_WRITE) != 0; 3408 /*
3409 * We want to touch writable mappings with a write fault in order
3410 * to break COW, except for shared mappings because these don't COW
3411 * and we would not want to dirty them for nothing.
3412 */
3413 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3300 BUG_ON(addr >= end); 3414 BUG_ON(addr >= end);
3301 BUG_ON(end > vma->vm_end); 3415 BUG_ON(end > vma->vm_end);
3302 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 3416 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3368,6 +3482,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address,
3368 goto out; 3482 goto out;
3369 3483
3370 pmd = pmd_offset(pud, address); 3484 pmd = pmd_offset(pud, address);
3485 VM_BUG_ON(pmd_trans_huge(*pmd));
3371 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 3486 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3372 goto out; 3487 goto out;
3373 3488
@@ -3608,3 +3723,74 @@ void might_fault(void)
3608} 3723}
3609EXPORT_SYMBOL(might_fault); 3724EXPORT_SYMBOL(might_fault);
3610#endif 3725#endif
3726
3727#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3728static void clear_gigantic_page(struct page *page,
3729 unsigned long addr,
3730 unsigned int pages_per_huge_page)
3731{
3732 int i;
3733 struct page *p = page;
3734
3735 might_sleep();
3736 for (i = 0; i < pages_per_huge_page;
3737 i++, p = mem_map_next(p, page, i)) {
3738 cond_resched();
3739 clear_user_highpage(p, addr + i * PAGE_SIZE);
3740 }
3741}
3742void clear_huge_page(struct page *page,
3743 unsigned long addr, unsigned int pages_per_huge_page)
3744{
3745 int i;
3746
3747 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3748 clear_gigantic_page(page, addr, pages_per_huge_page);
3749 return;
3750 }
3751
3752 might_sleep();
3753 for (i = 0; i < pages_per_huge_page; i++) {
3754 cond_resched();
3755 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3756 }
3757}
3758
3759static void copy_user_gigantic_page(struct page *dst, struct page *src,
3760 unsigned long addr,
3761 struct vm_area_struct *vma,
3762 unsigned int pages_per_huge_page)
3763{
3764 int i;
3765 struct page *dst_base = dst;
3766 struct page *src_base = src;
3767
3768 for (i = 0; i < pages_per_huge_page; ) {
3769 cond_resched();
3770 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3771
3772 i++;
3773 dst = mem_map_next(dst, dst_base, i);
3774 src = mem_map_next(src, src_base, i);
3775 }
3776}
3777
3778void copy_user_huge_page(struct page *dst, struct page *src,
3779 unsigned long addr, struct vm_area_struct *vma,
3780 unsigned int pages_per_huge_page)
3781{
3782 int i;
3783
3784 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3785 copy_user_gigantic_page(dst, src, addr, vma,
3786 pages_per_huge_page);
3787 return;
3788 }
3789
3790 might_sleep();
3791 for (i = 0; i < pages_per_huge_page; i++) {
3792 cond_resched();
3793 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
3794 }
3795}
3796#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
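The make_pages_present() hunk above selects a write fault only for private writable mappings with a single mask test. A tiny standalone check of that idiom; the DEMO_* flag values are made up and merely stand in for VM_WRITE and VM_SHARED:

#include <stdio.h>

#define DEMO_VM_WRITE  0x1
#define DEMO_VM_SHARED 0x2

/* true only for private (non-shared) writable mappings */
static int want_write_fault(unsigned long vm_flags)
{
        return (vm_flags & (DEMO_VM_WRITE | DEMO_VM_SHARED)) == DEMO_VM_WRITE;
}

int main(void)
{
        printf("private read-only : %d\n", want_write_fault(0));
        printf("private writable  : %d\n", want_write_fault(DEMO_VM_WRITE));
        printf("shared writable   : %d\n",
               want_write_fault(DEMO_VM_WRITE | DEMO_VM_SHARED));
        return 0;
}

It prints 0, 1, 0: only private writable mappings get the COW-breaking write fault, shared mappings are made present without being dirtied.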
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2c6523af5473..e92f04749fcb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -82,9 +82,10 @@ static void release_memory_resource(struct resource *res)
82 82
83#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 83#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
84#ifndef CONFIG_SPARSEMEM_VMEMMAP 84#ifndef CONFIG_SPARSEMEM_VMEMMAP
85static void get_page_bootmem(unsigned long info, struct page *page, int type) 85static void get_page_bootmem(unsigned long info, struct page *page,
86 unsigned long type)
86{ 87{
87 atomic_set(&page->_mapcount, type); 88 page->lru.next = (struct list_head *) type;
88 SetPagePrivate(page); 89 SetPagePrivate(page);
89 set_page_private(page, info); 90 set_page_private(page, info);
90 atomic_inc(&page->_count); 91 atomic_inc(&page->_count);
@@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
94 * so use __ref to tell modpost not to generate a warning */ 95 * so use __ref to tell modpost not to generate a warning */
95void __ref put_page_bootmem(struct page *page) 96void __ref put_page_bootmem(struct page *page)
96{ 97{
97 int type; 98 unsigned long type;
98 99
99 type = atomic_read(&page->_mapcount); 100 type = (unsigned long) page->lru.next;
100 BUG_ON(type >= -1); 101 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
102 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
101 103
102 if (atomic_dec_return(&page->_count) == 1) { 104 if (atomic_dec_return(&page->_count) == 1) {
103 ClearPagePrivate(page); 105 ClearPagePrivate(page);
104 set_page_private(page, 0); 106 set_page_private(page, 0);
105 reset_page_mapcount(page); 107 INIT_LIST_HEAD(&page->lru);
106 __free_pages_bootmem(page, 0); 108 __free_pages_bootmem(page, 0);
107 } 109 }
108 110
@@ -733,7 +735,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
733 goto out; 735 goto out;
734 } 736 }
735 /* this function returns # of failed pages */ 737 /* this function returns # of failed pages */
736 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); 738 ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
739 true, true);
737 if (ret) 740 if (ret)
738 putback_lru_pages(&source); 741 putback_lru_pages(&source);
739 } 742 }
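get_page_bootmem()/put_page_bootmem() above move the bootmem type from page->_mapcount into the otherwise unused page->lru.next pointer. A userspace sketch of parking a small integer tag in a spare pointer field; the struct and helpers are invented for the demo:

#include <stdio.h>

struct demo_page {
        struct demo_page *lru_next;     /* stands in for page->lru.next */
};

static void set_bootmem_type(struct demo_page *p, unsigned long type)
{
        p->lru_next = (struct demo_page *)type;    /* tag stored as a fake pointer */
}

static unsigned long get_bootmem_type(struct demo_page *p)
{
        return (unsigned long)p->lru_next;         /* cast back to the tag */
}

int main(void)
{
        struct demo_page page = { 0 };

        set_bootmem_type(&page, 3);                /* e.g. a SECTION_INFO-like tag */
        printf("stored type = %lu\n", get_bootmem_type(&page));
        return 0;
}

The trick is only valid while the structure is off every list, which is exactly the window the bootmem code uses.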
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 11ff260fb282..368fc9d23610 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
514 pmd = pmd_offset(pud, addr); 514 pmd = pmd_offset(pud, addr);
515 do { 515 do {
516 next = pmd_addr_end(addr, end); 516 next = pmd_addr_end(addr, end);
517 split_huge_page_pmd(vma->vm_mm, pmd);
517 if (pmd_none_or_clear_bad(pmd)) 518 if (pmd_none_or_clear_bad(pmd))
518 continue; 519 continue;
519 if (check_pte_range(vma, pmd, addr, next, nodes, 520 if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -935,7 +936,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
935 return PTR_ERR(vma); 936 return PTR_ERR(vma);
936 937
937 if (!list_empty(&pagelist)) { 938 if (!list_empty(&pagelist)) {
938 err = migrate_pages(&pagelist, new_node_page, dest, 0); 939 err = migrate_pages(&pagelist, new_node_page, dest,
940 false, true);
939 if (err) 941 if (err)
940 putback_lru_pages(&pagelist); 942 putback_lru_pages(&pagelist);
941 } 943 }
@@ -1155,7 +1157,8 @@ static long do_mbind(unsigned long start, unsigned long len,
1155 1157
1156 if (!list_empty(&pagelist)) { 1158 if (!list_empty(&pagelist)) {
1157 nr_failed = migrate_pages(&pagelist, new_vma_page, 1159 nr_failed = migrate_pages(&pagelist, new_vma_page,
1158 (unsigned long)vma, 0); 1160 (unsigned long)vma,
1161 false, true);
1159 if (nr_failed) 1162 if (nr_failed)
1160 putback_lru_pages(&pagelist); 1163 putback_lru_pages(&pagelist);
1161 } 1164 }
@@ -1308,16 +1311,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1308 1311
1309 /* Find the mm_struct */ 1312 /* Find the mm_struct */
1310 rcu_read_lock(); 1313 rcu_read_lock();
1311 read_lock(&tasklist_lock);
1312 task = pid ? find_task_by_vpid(pid) : current; 1314 task = pid ? find_task_by_vpid(pid) : current;
1313 if (!task) { 1315 if (!task) {
1314 read_unlock(&tasklist_lock);
1315 rcu_read_unlock(); 1316 rcu_read_unlock();
1316 err = -ESRCH; 1317 err = -ESRCH;
1317 goto out; 1318 goto out;
1318 } 1319 }
1319 mm = get_task_mm(task); 1320 mm = get_task_mm(task);
1320 read_unlock(&tasklist_lock);
1321 rcu_read_unlock(); 1321 rcu_read_unlock();
1322 1322
1323 err = -EINVAL; 1323 err = -EINVAL;
@@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1796} 1796}
1797 1797
1798/** 1798/**
1799 * alloc_page_vma - Allocate a page for a VMA. 1799 * alloc_pages_vma - Allocate a page for a VMA.
1800 * 1800 *
1801 * @gfp: 1801 * @gfp:
1802 * %GFP_USER user allocation. 1802 * %GFP_USER user allocation.
@@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1805 * %GFP_FS allocation should not call back into a file system. 1805 * %GFP_FS allocation should not call back into a file system.
1806 * %GFP_ATOMIC don't sleep. 1806 * %GFP_ATOMIC don't sleep.
1807 * 1807 *
1808 * @order:Order of the GFP allocation.
1808 * @vma: Pointer to VMA or NULL if not available. 1809 * @vma: Pointer to VMA or NULL if not available.
1809 * @addr: Virtual Address of the allocation. Must be inside the VMA. 1810 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1810 * 1811 *
@@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1818 * Should be called with the mmap_sem of the vma held. 1819 * Should be called with the mmap_sem of the vma held.
1819 */ 1820 */
1820struct page * 1821struct page *
1821alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) 1822alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1823 unsigned long addr)
1822{ 1824{
1823 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1825 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1824 struct zonelist *zl; 1826 struct zonelist *zl;
@@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1830 1832
1831 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1833 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1832 mpol_cond_put(pol); 1834 mpol_cond_put(pol);
1833 page = alloc_page_interleave(gfp, 0, nid); 1835 page = alloc_page_interleave(gfp, order, nid);
1834 put_mems_allowed(); 1836 put_mems_allowed();
1835 return page; 1837 return page;
1836 } 1838 }
@@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1839 /* 1841 /*
1840 * slow path: ref counted shared policy 1842 * slow path: ref counted shared policy
1841 */ 1843 */
1842 struct page *page = __alloc_pages_nodemask(gfp, 0, 1844 struct page *page = __alloc_pages_nodemask(gfp, order,
1843 zl, policy_nodemask(gfp, pol)); 1845 zl, policy_nodemask(gfp, pol));
1844 __mpol_put(pol); 1846 __mpol_put(pol);
1845 put_mems_allowed(); 1847 put_mems_allowed();
@@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1848 /* 1850 /*
1849 * fast path: default or task policy 1851 * fast path: default or task policy
1850 */ 1852 */
1851 page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); 1853 page = __alloc_pages_nodemask(gfp, order, zl,
1854 policy_nodemask(gfp, pol));
1852 put_mems_allowed(); 1855 put_mems_allowed();
1853 return page; 1856 return page;
1854} 1857}
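alloc_page_vma() grows into alloc_pages_vma() with an order argument that is forwarded to both the interleave and the nodemask allocation paths. A hedged userspace stand-in showing how an order-0 convenience wrapper can sit on top of the order-aware call; aligned_alloc plays the role of the page allocator and all names are illustrative:

#include <stdio.h>
#include <stdlib.h>

#define DEMO_PAGE_SIZE 4096UL

static void *demo_alloc_pages(int order)
{
        size_t bytes = DEMO_PAGE_SIZE << order;    /* 2^order contiguous pages */

        return aligned_alloc(DEMO_PAGE_SIZE, bytes);
}

static void *demo_alloc_page(void)
{
        return demo_alloc_pages(0);                /* old single-page interface */
}

int main(void)
{
        void *one  = demo_alloc_page();
        void *huge = demo_alloc_pages(9);          /* 2 MB worth of pages */

        printf("order 0 -> %lu bytes, order 9 -> %lu bytes\n",
               DEMO_PAGE_SIZE, DEMO_PAGE_SIZE << 9);
        free(one);
        free(huge);
        return 0;
}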
diff --git a/mm/migrate.c b/mm/migrate.c
index 6ae8a66a7045..46fe8cc13d67 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -113,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
113 goto out; 113 goto out;
114 114
115 pmd = pmd_offset(pud, addr); 115 pmd = pmd_offset(pud, addr);
116 if (pmd_trans_huge(*pmd))
117 goto out;
116 if (!pmd_present(*pmd)) 118 if (!pmd_present(*pmd))
117 goto out; 119 goto out;
118 120
@@ -246,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
246 248
247 expected_count = 2 + page_has_private(page); 249 expected_count = 2 + page_has_private(page);
248 if (page_count(page) != expected_count || 250 if (page_count(page) != expected_count ||
249 (struct page *)radix_tree_deref_slot(pslot) != page) { 251 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
250 spin_unlock_irq(&mapping->tree_lock); 252 spin_unlock_irq(&mapping->tree_lock);
251 return -EAGAIN; 253 return -EAGAIN;
252 } 254 }
@@ -318,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
318 320
319 expected_count = 2 + page_has_private(page); 321 expected_count = 2 + page_has_private(page);
320 if (page_count(page) != expected_count || 322 if (page_count(page) != expected_count ||
321 (struct page *)radix_tree_deref_slot(pslot) != page) { 323 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
322 spin_unlock_irq(&mapping->tree_lock); 324 spin_unlock_irq(&mapping->tree_lock);
323 return -EAGAIN; 325 return -EAGAIN;
324 } 326 }
@@ -614,13 +616,12 @@ static int move_to_new_page(struct page *newpage, struct page *page,
614 * to the newly allocated page in newpage. 616 * to the newly allocated page in newpage.
615 */ 617 */
616static int unmap_and_move(new_page_t get_new_page, unsigned long private, 618static int unmap_and_move(new_page_t get_new_page, unsigned long private,
617 struct page *page, int force, int offlining) 619 struct page *page, int force, bool offlining, bool sync)
618{ 620{
619 int rc = 0; 621 int rc = 0;
620 int *result = NULL; 622 int *result = NULL;
621 struct page *newpage = get_new_page(page, private, &result); 623 struct page *newpage = get_new_page(page, private, &result);
622 int remap_swapcache = 1; 624 int remap_swapcache = 1;
623 int rcu_locked = 0;
624 int charge = 0; 625 int charge = 0;
625 struct mem_cgroup *mem = NULL; 626 struct mem_cgroup *mem = NULL;
626 struct anon_vma *anon_vma = NULL; 627 struct anon_vma *anon_vma = NULL;
@@ -632,6 +633,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
632 /* page was freed from under us. So we are done. */ 633 /* page was freed from under us. So we are done. */
633 goto move_newpage; 634 goto move_newpage;
634 } 635 }
636 if (unlikely(PageTransHuge(page)))
637 if (unlikely(split_huge_page(page)))
638 goto move_newpage;
635 639
636 /* prepare cgroup just returns 0 or -ENOMEM */ 640 /* prepare cgroup just returns 0 or -ENOMEM */
637 rc = -EAGAIN; 641 rc = -EAGAIN;
@@ -639,6 +643,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
639 if (!trylock_page(page)) { 643 if (!trylock_page(page)) {
640 if (!force) 644 if (!force)
641 goto move_newpage; 645 goto move_newpage;
646
647 /*
648 * It's not safe for direct compaction to call lock_page.
649 * For example, during page readahead pages are added locked
650 * to the LRU. Later, when the IO completes the pages are
651 * marked uptodate and unlocked. However, the queueing
652 * could be merging multiple pages for one bio (e.g.
653 * mpage_readpages). If an allocation happens for the
654 * second or third page, the process can end up locking
655 * the same page twice and deadlocking. Rather than
656 * trying to be clever about what pages can be locked,
657 * avoid the use of lock_page for direct compaction
658 * altogether.
659 */
660 if (current->flags & PF_MEMALLOC)
661 goto move_newpage;
662
642 lock_page(page); 663 lock_page(page);
643 } 664 }
644 665
@@ -665,27 +686,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
665 BUG_ON(charge); 686 BUG_ON(charge);
666 687
667 if (PageWriteback(page)) { 688 if (PageWriteback(page)) {
668 if (!force) 689 if (!force || !sync)
669 goto uncharge; 690 goto uncharge;
670 wait_on_page_writeback(page); 691 wait_on_page_writeback(page);
671 } 692 }
672 /* 693 /*
673 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, 694 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
674 * we cannot notice that anon_vma is freed while we migrate a page. 695 * we cannot notice that anon_vma is freed while we migrate a page.
675 * This rcu_read_lock() delays freeing anon_vma pointer until the end 696 * This get_anon_vma() delays freeing anon_vma pointer until the end
676 * of migration. File cache pages are no problem because of page_lock() 697 * of migration. File cache pages are no problem because of page_lock()
677 * File Caches may use write_page() or lock_page() in migration, then, 698 * File Caches may use write_page() or lock_page() in migration, then,
678 * just care Anon page here. 699 * just care Anon page here.
679 */ 700 */
680 if (PageAnon(page)) { 701 if (PageAnon(page)) {
681 rcu_read_lock(); 702 /*
682 rcu_locked = 1; 703 * Only page_lock_anon_vma() understands the subtleties of
683 704 * getting a hold on an anon_vma from outside one of its mms.
684 /* Determine how to safely use anon_vma */ 705 */
685 if (!page_mapped(page)) { 706 anon_vma = page_lock_anon_vma(page);
686 if (!PageSwapCache(page)) 707 if (anon_vma) {
687 goto rcu_unlock; 708 /*
688 709 * Take a reference count on the anon_vma if the
710 * page is mapped so that it is guaranteed to
711 * exist when the page is remapped later
712 */
713 get_anon_vma(anon_vma);
714 page_unlock_anon_vma(anon_vma);
715 } else if (PageSwapCache(page)) {
689 /* 716 /*
690 * We cannot be sure that the anon_vma of an unmapped 717 * We cannot be sure that the anon_vma of an unmapped
691 * swapcache page is safe to use because we don't 718 * swapcache page is safe to use because we don't
@@ -700,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
700 */ 727 */
701 remap_swapcache = 0; 728 remap_swapcache = 0;
702 } else { 729 } else {
703 /* 730 goto uncharge;
704 * Take a reference count on the anon_vma if the
705 * page is mapped so that it is guaranteed to
706 * exist when the page is remapped later
707 */
708 anon_vma = page_anon_vma(page);
709 get_anon_vma(anon_vma);
710 } 731 }
711 } 732 }
712 733
@@ -723,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
723 * free the metadata, so the page can be freed. 744 * free the metadata, so the page can be freed.
724 */ 745 */
725 if (!page->mapping) { 746 if (!page->mapping) {
726 if (!PageAnon(page) && page_has_private(page)) { 747 VM_BUG_ON(PageAnon(page));
727 /* 748 if (page_has_private(page)) {
728 * Go direct to try_to_free_buffers() here because
729 * a) that's what try_to_release_page() would do anyway
730 * b) we may be under rcu_read_lock() here, so we can't
731 * use GFP_KERNEL which is what try_to_release_page()
732 * needs to be effective.
733 */
734 try_to_free_buffers(page); 749 try_to_free_buffers(page);
735 goto rcu_unlock; 750 goto uncharge;
736 } 751 }
737 goto skip_unmap; 752 goto skip_unmap;
738 } 753 }
@@ -746,17 +761,14 @@ skip_unmap:
746 761
747 if (rc && remap_swapcache) 762 if (rc && remap_swapcache)
748 remove_migration_ptes(page, page); 763 remove_migration_ptes(page, page);
749rcu_unlock:
750 764
751 /* Drop an anon_vma reference if we took one */ 765 /* Drop an anon_vma reference if we took one */
752 if (anon_vma) 766 if (anon_vma)
753 drop_anon_vma(anon_vma); 767 drop_anon_vma(anon_vma);
754 768
755 if (rcu_locked)
756 rcu_read_unlock();
757uncharge: 769uncharge:
758 if (!charge) 770 if (!charge)
759 mem_cgroup_end_migration(mem, page, newpage); 771 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
760unlock: 772unlock:
761 unlock_page(page); 773 unlock_page(page);
762 774
@@ -810,12 +822,11 @@ move_newpage:
810 */ 822 */
811static int unmap_and_move_huge_page(new_page_t get_new_page, 823static int unmap_and_move_huge_page(new_page_t get_new_page,
812 unsigned long private, struct page *hpage, 824 unsigned long private, struct page *hpage,
813 int force, int offlining) 825 int force, bool offlining, bool sync)
814{ 826{
815 int rc = 0; 827 int rc = 0;
816 int *result = NULL; 828 int *result = NULL;
817 struct page *new_hpage = get_new_page(hpage, private, &result); 829 struct page *new_hpage = get_new_page(hpage, private, &result);
818 int rcu_locked = 0;
819 struct anon_vma *anon_vma = NULL; 830 struct anon_vma *anon_vma = NULL;
820 831
821 if (!new_hpage) 832 if (!new_hpage)
@@ -824,18 +835,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
824 rc = -EAGAIN; 835 rc = -EAGAIN;
825 836
826 if (!trylock_page(hpage)) { 837 if (!trylock_page(hpage)) {
827 if (!force) 838 if (!force || !sync)
828 goto out; 839 goto out;
829 lock_page(hpage); 840 lock_page(hpage);
830 } 841 }
831 842
832 if (PageAnon(hpage)) { 843 if (PageAnon(hpage)) {
833 rcu_read_lock(); 844 anon_vma = page_lock_anon_vma(hpage);
834 rcu_locked = 1; 845 if (anon_vma) {
835 846 get_anon_vma(anon_vma);
836 if (page_mapped(hpage)) { 847 page_unlock_anon_vma(anon_vma);
837 anon_vma = page_anon_vma(hpage);
838 atomic_inc(&anon_vma->external_refcount);
839 } 848 }
840 } 849 }
841 850
@@ -847,16 +856,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
847 if (rc) 856 if (rc)
848 remove_migration_ptes(hpage, hpage); 857 remove_migration_ptes(hpage, hpage);
849 858
850 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, 859 if (anon_vma)
851 &anon_vma->lock)) { 860 drop_anon_vma(anon_vma);
852 int empty = list_empty(&anon_vma->head);
853 spin_unlock(&anon_vma->lock);
854 if (empty)
855 anon_vma_free(anon_vma);
856 }
857
858 if (rcu_locked)
859 rcu_read_unlock();
860out: 861out:
861 unlock_page(hpage); 862 unlock_page(hpage);
862 863
@@ -892,7 +893,8 @@ out:
892 * Return: Number of pages not migrated or error code. 893 * Return: Number of pages not migrated or error code.
893 */ 894 */
894int migrate_pages(struct list_head *from, 895int migrate_pages(struct list_head *from,
895 new_page_t get_new_page, unsigned long private, int offlining) 896 new_page_t get_new_page, unsigned long private, bool offlining,
897 bool sync)
896{ 898{
897 int retry = 1; 899 int retry = 1;
898 int nr_failed = 0; 900 int nr_failed = 0;
@@ -912,7 +914,8 @@ int migrate_pages(struct list_head *from,
912 cond_resched(); 914 cond_resched();
913 915
914 rc = unmap_and_move(get_new_page, private, 916 rc = unmap_and_move(get_new_page, private,
915 page, pass > 2, offlining); 917 page, pass > 2, offlining,
918 sync);
916 919
917 switch(rc) { 920 switch(rc) {
918 case -ENOMEM: 921 case -ENOMEM:
@@ -941,7 +944,8 @@ out:
941} 944}
942 945
943int migrate_huge_pages(struct list_head *from, 946int migrate_huge_pages(struct list_head *from,
944 new_page_t get_new_page, unsigned long private, int offlining) 947 new_page_t get_new_page, unsigned long private, bool offlining,
948 bool sync)
945{ 949{
946 int retry = 1; 950 int retry = 1;
947 int nr_failed = 0; 951 int nr_failed = 0;
@@ -957,7 +961,8 @@ int migrate_huge_pages(struct list_head *from,
957 cond_resched(); 961 cond_resched();
958 962
959 rc = unmap_and_move_huge_page(get_new_page, 963 rc = unmap_and_move_huge_page(get_new_page,
960 private, page, pass > 2, offlining); 964 private, page, pass > 2, offlining,
965 sync);
961 966
962 switch(rc) { 967 switch(rc) {
963 case -ENOMEM: 968 case -ENOMEM:
@@ -1042,7 +1047,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1042 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) 1047 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
1043 goto set_status; 1048 goto set_status;
1044 1049
1045 page = follow_page(vma, pp->addr, FOLL_GET); 1050 page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
1046 1051
1047 err = PTR_ERR(page); 1052 err = PTR_ERR(page);
1048 if (IS_ERR(page)) 1053 if (IS_ERR(page))
@@ -1090,7 +1095,7 @@ set_status:
1090 err = 0; 1095 err = 0;
1091 if (!list_empty(&pagelist)) { 1096 if (!list_empty(&pagelist)) {
1092 err = migrate_pages(&pagelist, new_page_node, 1097 err = migrate_pages(&pagelist, new_page_node,
1093 (unsigned long)pm, 0); 1098 (unsigned long)pm, 0, true);
1094 if (err) 1099 if (err)
1095 putback_lru_pages(&pagelist); 1100 putback_lru_pages(&pagelist);
1096 } 1101 }
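The migrate.c changes above make the page-lock policy explicit: trylock first, block only when the caller both insists (force) and is allowed to sleep for it (sync), and never from a PF_MEMALLOC context such as direct compaction. A compact pthread sketch of that decision tree, with an invented in_reclaim flag standing in for PF_MEMALLOC (build with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

static bool try_lock_for_migration(bool force, bool sync, bool in_reclaim)
{
        if (pthread_mutex_trylock(&page_lock) == 0)
                return true;                    /* got it without blocking */
        if (!force || !sync)
                return false;                   /* async or gentle pass: give up */
        if (in_reclaim)
                return false;                   /* avoid the readahead-style deadlock */
        pthread_mutex_lock(&page_lock);         /* last resort: block for the lock */
        return true;
}

int main(void)
{
        if (try_lock_for_migration(true, true, false)) {
                puts("locked the page");
                pthread_mutex_unlock(&page_lock);
        }
        return 0;
}

Only the lock decision is modeled; the real code additionally takes the anon_vma reference and uncharges the memcg on the bail-out paths.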
diff --git a/mm/mincore.c b/mm/mincore.c
index 9ac42dc6d7b6..a4e6b9d75c76 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
154 pmd = pmd_offset(pud, addr); 154 pmd = pmd_offset(pud, addr);
155 do { 155 do {
156 next = pmd_addr_end(addr, end); 156 next = pmd_addr_end(addr, end);
157 if (pmd_trans_huge(*pmd)) {
158 if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
159 vec += (next - addr) >> PAGE_SHIFT;
160 continue;
161 }
162 /* fall through */
163 }
157 if (pmd_none_or_clear_bad(pmd)) 164 if (pmd_none_or_clear_bad(pmd))
158 mincore_unmapped_range(vma, addr, next, vec); 165 mincore_unmapped_range(vma, addr, next, vec);
159 else 166 else
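Whether mincore_pmd_range() answers for a whole huge pmd at once or walks the individual ptes, userspace still receives one residency byte per small page. A plain mincore(2) demo on an anonymous mapping where only the first four pages have been touched:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psize = sysconf(_SC_PAGESIZE);
        size_t len = 16 * psize;
        unsigned char vec[16];
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        memset(p, 0, 4 * psize);        /* touch only the first 4 pages */
        if (mincore(p, len, vec) == 0) {
                for (int i = 0; i < 16; i++)
                        printf("%d", vec[i] & 1);   /* 1 = resident */
                putchar('\n');
        }
        munmap(p, len);
        return 0;
}

Typical output is 1111000000000000: the touched pages are resident, the rest of the anonymous range is not.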
diff --git a/mm/mlock.c b/mm/mlock.c
index b70919ce4f72..13e81ee8be9d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -155,13 +155,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
155 * vma->vm_mm->mmap_sem must be held for at least read. 155 * vma->vm_mm->mmap_sem must be held for at least read.
156 */ 156 */
157static long __mlock_vma_pages_range(struct vm_area_struct *vma, 157static long __mlock_vma_pages_range(struct vm_area_struct *vma,
158 unsigned long start, unsigned long end) 158 unsigned long start, unsigned long end,
159 int *nonblocking)
159{ 160{
160 struct mm_struct *mm = vma->vm_mm; 161 struct mm_struct *mm = vma->vm_mm;
161 unsigned long addr = start; 162 unsigned long addr = start;
162 struct page *pages[16]; /* 16 gives a reasonable batch */
163 int nr_pages = (end - start) / PAGE_SIZE; 163 int nr_pages = (end - start) / PAGE_SIZE;
164 int ret = 0;
165 int gup_flags; 164 int gup_flags;
166 165
167 VM_BUG_ON(start & ~PAGE_MASK); 166 VM_BUG_ON(start & ~PAGE_MASK);
@@ -170,73 +169,26 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
170 VM_BUG_ON(end > vma->vm_end); 169 VM_BUG_ON(end > vma->vm_end);
171 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 170 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
172 171
173 gup_flags = FOLL_TOUCH | FOLL_GET; 172 gup_flags = FOLL_TOUCH;
174 if (vma->vm_flags & VM_WRITE) 173 /*
174 * We want to touch writable mappings with a write fault in order
175 * to break COW, except for shared mappings because these don't COW
176 * and we would not want to dirty them for nothing.
177 */
178 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
175 gup_flags |= FOLL_WRITE; 179 gup_flags |= FOLL_WRITE;
176 180
181 if (vma->vm_flags & VM_LOCKED)
182 gup_flags |= FOLL_MLOCK;
183
177 /* We don't try to access the guard page of a stack vma */ 184 /* We don't try to access the guard page of a stack vma */
178 if (stack_guard_page(vma, start)) { 185 if (stack_guard_page(vma, start)) {
179 addr += PAGE_SIZE; 186 addr += PAGE_SIZE;
180 nr_pages--; 187 nr_pages--;
181 } 188 }
182 189
183 while (nr_pages > 0) { 190 return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
184 int i; 191 NULL, NULL, nonblocking);
185
186 cond_resched();
187
188 /*
189 * get_user_pages makes pages present if we are
190 * setting mlock. and this extra reference count will
191 * disable migration of this page. However, page may
192 * still be truncated out from under us.
193 */
194 ret = __get_user_pages(current, mm, addr,
195 min_t(int, nr_pages, ARRAY_SIZE(pages)),
196 gup_flags, pages, NULL);
197 /*
198 * This can happen for, e.g., VM_NONLINEAR regions before
199 * a page has been allocated and mapped at a given offset,
200 * or for addresses that map beyond end of a file.
201 * We'll mlock the pages if/when they get faulted in.
202 */
203 if (ret < 0)
204 break;
205
206 lru_add_drain(); /* push cached pages to LRU */
207
208 for (i = 0; i < ret; i++) {
209 struct page *page = pages[i];
210
211 if (page->mapping) {
212 /*
213 * That preliminary check is mainly to avoid
214 * the pointless overhead of lock_page on the
215 * ZERO_PAGE: which might bounce very badly if
216 * there is contention. However, we're still
217 * dirtying its cacheline with get/put_page:
218 * we'll add another __get_user_pages flag to
219 * avoid it if that case turns out to matter.
220 */
221 lock_page(page);
222 /*
223 * Because we lock page here and migration is
224 * blocked by the elevated reference, we need
225 * only check for file-cache page truncation.
226 */
227 if (page->mapping)
228 mlock_vma_page(page);
229 unlock_page(page);
230 }
231 put_page(page); /* ref from get_user_pages() */
232 }
233
234 addr += ret * PAGE_SIZE;
235 nr_pages -= ret;
236 ret = 0;
237 }
238
239 return ret; /* 0 or negative error code */
240} 192}
241 193
242/* 194/*
@@ -280,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
280 is_vm_hugetlb_page(vma) || 232 is_vm_hugetlb_page(vma) ||
281 vma == get_gate_vma(current))) { 233 vma == get_gate_vma(current))) {
282 234
283 __mlock_vma_pages_range(vma, start, end); 235 __mlock_vma_pages_range(vma, start, end, NULL);
284 236
285 /* Hide errors from mmap() and other callers */ 237 /* Hide errors from mmap() and other callers */
286 return 0; 238 return 0;
@@ -372,18 +324,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
372 int ret = 0; 324 int ret = 0;
373 int lock = newflags & VM_LOCKED; 325 int lock = newflags & VM_LOCKED;
374 326
375 if (newflags == vma->vm_flags || 327 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
376 (vma->vm_flags & (VM_IO | VM_PFNMAP))) 328 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))
377 goto out; /* don't set VM_LOCKED, don't count */ 329 goto out; /* don't set VM_LOCKED, don't count */
378 330
379 if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
380 is_vm_hugetlb_page(vma) ||
381 vma == get_gate_vma(current)) {
382 if (lock)
383 make_pages_present(start, end);
384 goto out; /* don't set VM_LOCKED, don't count */
385 }
386
387 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 331 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
388 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, 332 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
389 vma->vm_file, pgoff, vma_policy(vma)); 333 vma->vm_file, pgoff, vma_policy(vma));
@@ -419,14 +363,10 @@ success:
419 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 363 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
420 */ 364 */
421 365
422 if (lock) { 366 if (lock)
423 vma->vm_flags = newflags; 367 vma->vm_flags = newflags;
424 ret = __mlock_vma_pages_range(vma, start, end); 368 else
425 if (ret < 0)
426 ret = __mlock_posix_error_return(ret);
427 } else {
428 munlock_vma_pages_range(vma, start, end); 369 munlock_vma_pages_range(vma, start, end);
429 }
430 370
431out: 371out:
432 *prev = vma; 372 *prev = vma;
@@ -439,7 +379,8 @@ static int do_mlock(unsigned long start, size_t len, int on)
439 struct vm_area_struct * vma, * prev; 379 struct vm_area_struct * vma, * prev;
440 int error; 380 int error;
441 381
442 len = PAGE_ALIGN(len); 382 VM_BUG_ON(start & ~PAGE_MASK);
383 VM_BUG_ON(len != PAGE_ALIGN(len));
443 end = start + len; 384 end = start + len;
444 if (end < start) 385 if (end < start)
445 return -EINVAL; 386 return -EINVAL;
@@ -482,6 +423,62 @@ static int do_mlock(unsigned long start, size_t len, int on)
482 return error; 423 return error;
483} 424}
484 425
426static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
427{
428 struct mm_struct *mm = current->mm;
429 unsigned long end, nstart, nend;
430 struct vm_area_struct *vma = NULL;
431 int locked = 0;
432 int ret = 0;
433
434 VM_BUG_ON(start & ~PAGE_MASK);
435 VM_BUG_ON(len != PAGE_ALIGN(len));
436 end = start + len;
437
438 for (nstart = start; nstart < end; nstart = nend) {
439 /*
440 * We want to fault in pages for [nstart; end) address range.
441 * Find first corresponding VMA.
442 */
443 if (!locked) {
444 locked = 1;
445 down_read(&mm->mmap_sem);
446 vma = find_vma(mm, nstart);
447 } else if (nstart >= vma->vm_end)
448 vma = vma->vm_next;
449 if (!vma || vma->vm_start >= end)
450 break;
451 /*
452 * Set [nstart; nend) to intersection of desired address
453 * range with the first VMA. Also, skip undesirable VMA types.
454 */
455 nend = min(end, vma->vm_end);
456 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
457 continue;
458 if (nstart < vma->vm_start)
459 nstart = vma->vm_start;
460 /*
461 * Now fault in a range of pages. __mlock_vma_pages_range()
462 * double checks the vma flags, so that it won't mlock pages
463 * if the vma was already munlocked.
464 */
465 ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
466 if (ret < 0) {
467 if (ignore_errors) {
468 ret = 0;
469 continue; /* continue at next VMA */
470 }
471 ret = __mlock_posix_error_return(ret);
472 break;
473 }
474 nend = nstart + ret * PAGE_SIZE;
475 ret = 0;
476 }
477 if (locked)
478 up_read(&mm->mmap_sem);
479 return ret; /* 0 or negative error code */
480}
481
485SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) 482SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
486{ 483{
487 unsigned long locked; 484 unsigned long locked;
@@ -507,6 +504,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
507 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 504 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
508 error = do_mlock(start, len, 1); 505 error = do_mlock(start, len, 1);
509 up_write(&current->mm->mmap_sem); 506 up_write(&current->mm->mmap_sem);
507 if (!error)
508 error = do_mlock_pages(start, len, 0);
510 return error; 509 return error;
511} 510}
512 511
@@ -571,6 +570,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
571 capable(CAP_IPC_LOCK)) 570 capable(CAP_IPC_LOCK))
572 ret = do_mlockall(flags); 571 ret = do_mlockall(flags);
573 up_write(&current->mm->mmap_sem); 572 up_write(&current->mm->mmap_sem);
573 if (!ret && (flags & MCL_CURRENT)) {
574 /* Ignore errors */
575 do_mlock_pages(0, TASK_SIZE, 1);
576 }
574out: 577out:
575 return ret; 578 return ret;
576} 579}
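With do_mlock_pages() the fault-in step runs after mmap_sem is dropped from write mode, but the userspace contract is unchanged: once mlock() returns successfully the range is resident. A small demo pairing mlock(2) with mincore(2); it may need a raised RLIMIT_MEMLOCK to succeed:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psize = sysconf(_SC_PAGESIZE);
        size_t len = 8 * psize;
        unsigned char vec[8];
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        if (mlock(p, len) != 0) {
                perror("mlock");        /* RLIMIT_MEMLOCK too low? */
                return 1;
        }
        mincore(p, len, vec);
        for (int i = 0; i < 8; i++)
                printf("%d", vec[i] & 1);   /* expect all 1s after mlock */
        putchar('\n');
        munlock(p, len);
        munmap(p, len);
        return 0;
}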
diff --git a/mm/mmap.c b/mm/mmap.c
index 50a4aa0255a0..2ec8eb5a9cdd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
29#include <linux/mmu_notifier.h> 29#include <linux/mmu_notifier.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/cacheflush.h> 35#include <asm/cacheflush.h>
@@ -253,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
253 down_write(&mm->mmap_sem); 254 down_write(&mm->mmap_sem);
254 255
255#ifdef CONFIG_COMPAT_BRK 256#ifdef CONFIG_COMPAT_BRK
256 min_brk = mm->end_code; 257 /*
258 * CONFIG_COMPAT_BRK can still be overridden by setting
259 * randomize_va_space to 2, which will still cause mm->start_brk
260 * to be arbitrarily shifted
261 */
262 if (mm->start_brk > PAGE_ALIGN(mm->end_data))
263 min_brk = mm->start_brk;
264 else
265 min_brk = mm->end_data;
257#else 266#else
258 min_brk = mm->start_brk; 267 min_brk = mm->start_brk;
259#endif 268#endif
@@ -588,6 +597,8 @@ again: remove_next = 1 + (end > next->vm_end);
588 } 597 }
589 } 598 }
590 599
600 vma_adjust_trans_huge(vma, start, end, adjust_next);
601
591 /* 602 /*
592 * When changing only vma->vm_end, we don't really need anon_vma 603 * When changing only vma->vm_end, we don't really need anon_vma
593 * lock. This is a fairly rare case by itself, but the anon_vma 604 * lock. This is a fairly rare case by itself, but the anon_vma
@@ -815,6 +826,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
815 end, prev->vm_pgoff, NULL); 826 end, prev->vm_pgoff, NULL);
816 if (err) 827 if (err)
817 return NULL; 828 return NULL;
829 khugepaged_enter_vma_merge(prev);
818 return prev; 830 return prev;
819 } 831 }
820 832
@@ -833,6 +845,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
833 next->vm_pgoff - pglen, NULL); 845 next->vm_pgoff - pglen, NULL);
834 if (err) 846 if (err)
835 return NULL; 847 return NULL;
848 khugepaged_enter_vma_merge(area);
836 return area; 849 return area;
837 } 850 }
838 851
@@ -1761,6 +1774,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1761 } 1774 }
1762 } 1775 }
1763 vma_unlock_anon_vma(vma); 1776 vma_unlock_anon_vma(vma);
1777 khugepaged_enter_vma_merge(vma);
1764 return error; 1778 return error;
1765} 1779}
1766#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1780#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1808,6 +1822,7 @@ static int expand_downwards(struct vm_area_struct *vma,
1808 } 1822 }
1809 } 1823 }
1810 vma_unlock_anon_vma(vma); 1824 vma_unlock_anon_vma(vma);
1825 khugepaged_enter_vma_merge(vma);
1811 return error; 1826 return error;
1812} 1827}
1813 1828
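The brk() hunk above picks min_brk depending on whether CONFIG_COMPAT_BRK is in effect and whether the heap start was still randomized above the data segment. A standalone restatement of just that choice, with a made-up page-align helper and example addresses:

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL
#define demo_page_align(x) (((x) + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1))

static unsigned long pick_min_brk(unsigned long start_brk,
                                  unsigned long end_data,
                                  int compat_brk)
{
        if (!compat_brk)
                return start_brk;
        /* COMPAT_BRK, but the heap start was still shifted above end_data */
        if (start_brk > demo_page_align(end_data))
                return start_brk;
        return end_data;
}

int main(void)
{
        printf("%#lx\n", pick_min_brk(0x605000, 0x601234, 1));  /* randomized heap */
        printf("%#lx\n", pick_min_brk(0x602000, 0x601234, 1));  /* classic layout  */
        return 0;
}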
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 438951d366f2..8d032de4088e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
100 return young; 100 return young;
101} 101}
102 102
103int __mmu_notifier_test_young(struct mm_struct *mm,
104 unsigned long address)
105{
106 struct mmu_notifier *mn;
107 struct hlist_node *n;
108 int young = 0;
109
110 rcu_read_lock();
111 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
112 if (mn->ops->test_young) {
113 young = mn->ops->test_young(mn, mm, address);
114 if (young)
115 break;
116 }
117 }
118 rcu_read_unlock();
119
120 return young;
121}
122
103void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, 123void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
104 pte_t pte) 124 pte_t pte)
105{ 125{
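__mmu_notifier_test_young() above polls every registered notifier and stops at the first one that reports the address as young. The same early-exit walk over a plain callback table, as an illustrative sketch (the real list is RCU protected and per-mm):

#include <stdio.h>

typedef int (*test_young_fn)(unsigned long address);

static int never_young(unsigned long addr)  { (void)addr; return 0; }
static int always_young(unsigned long addr) { (void)addr; return 1; }

static test_young_fn notifiers[] = { never_young, always_young, NULL };

static int test_young(unsigned long address)
{
        int young = 0;

        for (int i = 0; notifiers[i]; i++) {
                young = notifiers[i](address);
                if (young)
                        break;          /* one positive answer is enough */
        }
        return young;
}

int main(void)
{
        printf("young = %d\n", test_young(0x1000));
        return 0;
}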
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c855..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
87 return 1; 87 return 1;
88} 88}
89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
90
91#ifdef CONFIG_SMP
92/* Called when a more accurate view of NR_FREE_PAGES is needed */
93unsigned long zone_nr_free_pages(struct zone *zone)
94{
95 unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
96
97 /*
98 * While kswapd is awake, it is considered the zone is under some
99 * memory pressure. Under pressure, there is a risk that
100 * per-cpu-counter-drift will allow the min watermark to be breached
101 * potentially causing a live-lock. While kswapd is awake and
102 * free pages are low, get a better estimate for free pages
103 */
104 if (nr_free_pages < zone->percpu_drift_mark &&
105 !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
106 return zone_page_state_snapshot(zone, NR_FREE_PAGES);
107
108 return nr_free_pages;
109}
110#endif /* CONFIG_SMP */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4c5133873097..5a688a2756be 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
78 pte_unmap_unlock(pte - 1, ptl); 78 pte_unmap_unlock(pte - 1, ptl);
79} 79}
80 80
81static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, 81static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
82 unsigned long addr, unsigned long end, pgprot_t newprot, 82 unsigned long addr, unsigned long end, pgprot_t newprot,
83 int dirty_accountable) 83 int dirty_accountable)
84{ 84{
@@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
88 pmd = pmd_offset(pud, addr); 88 pmd = pmd_offset(pud, addr);
89 do { 89 do {
90 next = pmd_addr_end(addr, end); 90 next = pmd_addr_end(addr, end);
91 if (pmd_trans_huge(*pmd)) {
92 if (next - addr != HPAGE_PMD_SIZE)
93 split_huge_page_pmd(vma->vm_mm, pmd);
94 else if (change_huge_pmd(vma, pmd, addr, newprot))
95 continue;
96 /* fall through */
97 }
91 if (pmd_none_or_clear_bad(pmd)) 98 if (pmd_none_or_clear_bad(pmd))
92 continue; 99 continue;
93 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); 100 change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
101 dirty_accountable);
94 } while (pmd++, addr = next, addr != end); 102 } while (pmd++, addr = next, addr != end);
95} 103}
96 104
97static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, 105static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
98 unsigned long addr, unsigned long end, pgprot_t newprot, 106 unsigned long addr, unsigned long end, pgprot_t newprot,
99 int dirty_accountable) 107 int dirty_accountable)
100{ 108{
@@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
106 next = pud_addr_end(addr, end); 114 next = pud_addr_end(addr, end);
107 if (pud_none_or_clear_bad(pud)) 115 if (pud_none_or_clear_bad(pud))
108 continue; 116 continue;
109 change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); 117 change_pmd_range(vma, pud, addr, next, newprot,
118 dirty_accountable);
110 } while (pud++, addr = next, addr != end); 119 } while (pud++, addr = next, addr != end);
111} 120}
112 121
@@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma,
126 next = pgd_addr_end(addr, end); 135 next = pgd_addr_end(addr, end);
127 if (pgd_none_or_clear_bad(pgd)) 136 if (pgd_none_or_clear_bad(pgd))
128 continue; 137 continue;
129 change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); 138 change_pud_range(vma, pgd, addr, next, newprot,
139 dirty_accountable);
130 } while (pgd++, addr = next, addr != end); 140 } while (pgd++, addr = next, addr != end);
131 flush_tlb_range(vma, start, end); 141 flush_tlb_range(vma, start, end);
132} 142}
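change_pmd_range() above splits a huge pmd whenever the mprotect range does not cover exactly HPAGE_PMD_SIZE. From userspace that case is just an ordinary partial-range mprotect(2), as in this demo; whether a huge pmd was backing the page and had to be split underneath is invisible to the caller:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psize = sysconf(_SC_PAGESIZE);
        size_t len = 4UL << 20;                 /* 4 MB, THP-eligible size */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        p[0] = 1;                               /* fault the region in */
        if (mprotect(p + psize, psize, PROT_READ) != 0)
                perror("mprotect");
        else
                puts("one page downgraded to read-only");
        munmap(p, len);
        return 0;
}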
diff --git a/mm/mremap.c b/mm/mremap.c
index 563fbdd6293a..9925b6391b80 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
41 return NULL; 41 return NULL;
42 42
43 pmd = pmd_offset(pud, addr); 43 pmd = pmd_offset(pud, addr);
44 split_huge_page_pmd(mm, pmd);
44 if (pmd_none_or_clear_bad(pmd)) 45 if (pmd_none_or_clear_bad(pmd))
45 return NULL; 46 return NULL;
46 47
47 return pmd; 48 return pmd;
48} 49}
49 50
50static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) 51static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
52 unsigned long addr)
51{ 53{
52 pgd_t *pgd; 54 pgd_t *pgd;
53 pud_t *pud; 55 pud_t *pud;
@@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
62 if (!pmd) 64 if (!pmd)
63 return NULL; 65 return NULL;
64 66
65 if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) 67 VM_BUG_ON(pmd_trans_huge(*pmd));
68 if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
66 return NULL; 69 return NULL;
67 70
68 return pmd; 71 return pmd;
@@ -147,7 +150,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
147 old_pmd = get_old_pmd(vma->vm_mm, old_addr); 150 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
148 if (!old_pmd) 151 if (!old_pmd)
149 continue; 152 continue;
150 new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); 153 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
151 if (!new_pmd) 154 if (!new_pmd)
152 break; 155 break;
153 next = (new_addr + PMD_SIZE) & PMD_MASK; 156 next = (new_addr + PMD_SIZE) & PMD_MASK;
diff --git a/mm/nommu.c b/mm/nommu.c
index ef4045d010d5..f59e1424d3db 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp)
127 127
128int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 128int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
129 unsigned long start, int nr_pages, unsigned int foll_flags, 129 unsigned long start, int nr_pages, unsigned int foll_flags,
130 struct page **pages, struct vm_area_struct **vmas) 130 struct page **pages, struct vm_area_struct **vmas,
131 int *retry)
131{ 132{
132 struct vm_area_struct *vma; 133 struct vm_area_struct *vma;
133 unsigned long vm_flags; 134 unsigned long vm_flags;
@@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
185 if (force) 186 if (force)
186 flags |= FOLL_FORCE; 187 flags |= FOLL_FORCE;
187 188
188 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 189 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
190 NULL);
189} 191}
190EXPORT_SYMBOL(get_user_pages); 192EXPORT_SYMBOL(get_user_pages);
191 193
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b4edfe7ce06c..2cb01f6ec5d0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -404,15 +404,18 @@ unsigned long determine_dirtyable_memory(void)
404 * - vm.dirty_background_ratio or vm.dirty_background_bytes 404 * - vm.dirty_background_ratio or vm.dirty_background_bytes
405 * - vm.dirty_ratio or vm.dirty_bytes 405 * - vm.dirty_ratio or vm.dirty_bytes
406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and 406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
407 * runtime tasks. 407 * real-time tasks.
408 */ 408 */
409void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) 409void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
410{ 410{
411 unsigned long background; 411 unsigned long background;
412 unsigned long dirty; 412 unsigned long dirty;
413 unsigned long available_memory = determine_dirtyable_memory(); 413 unsigned long uninitialized_var(available_memory);
414 struct task_struct *tsk; 414 struct task_struct *tsk;
415 415
416 if (!vm_dirty_bytes || !dirty_background_bytes)
417 available_memory = determine_dirtyable_memory();
418
416 if (vm_dirty_bytes) 419 if (vm_dirty_bytes)
417 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 420 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
418 else 421 else
@@ -1103,7 +1106,7 @@ EXPORT_SYMBOL(write_one_page);
1103int __set_page_dirty_no_writeback(struct page *page) 1106int __set_page_dirty_no_writeback(struct page *page)
1104{ 1107{
1105 if (!PageDirty(page)) 1108 if (!PageDirty(page))
1106 SetPageDirty(page); 1109 return !TestSetPageDirty(page);
1107 return 0; 1110 return 0;
1108} 1111}
1109 1112
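global_dirty_limits() above now computes the dirtyable-memory estimate only when at least one of the dirty limits is ratio based rather than byte based. A back-of-the-envelope model of the limit selection itself, with illustrative numbers:

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL

static unsigned long dirty_limit_pages(unsigned long dirty_bytes,
                                       unsigned long dirty_ratio,
                                       unsigned long dirtyable_pages)
{
        if (dirty_bytes)        /* an explicit byte value wins */
                return (dirty_bytes + DEMO_PAGE_SIZE - 1) / DEMO_PAGE_SIZE;
        return dirtyable_pages * dirty_ratio / 100;
}

int main(void)
{
        unsigned long dirtyable = (1UL << 30) / DEMO_PAGE_SIZE;  /* ~1 GB */

        printf("ratio limit : %lu pages\n", dirty_limit_pages(0, 20, dirtyable));
        printf("byte limit  : %lu pages\n",
               dirty_limit_pages(64UL << 20, 20, dirtyable));
        return 0;
}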
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ff7e15872398..90c1439549fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order)
357 } 357 }
358} 358}
359 359
360/* update __split_huge_page_refcount if you change this function */
360static int destroy_compound_page(struct page *page, unsigned long order) 361static int destroy_compound_page(struct page *page, unsigned long order)
361{ 362{
362 int i; 363 int i;
@@ -426,18 +427,10 @@ static inline void rmv_page_order(struct page *page)
426 * 427 *
427 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 428 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
428 */ 429 */
429static inline struct page *
430__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
431{
432 unsigned long buddy_idx = page_idx ^ (1 << order);
433
434 return page + (buddy_idx - page_idx);
435}
436
437static inline unsigned long 430static inline unsigned long
438__find_combined_index(unsigned long page_idx, unsigned int order) 431__find_buddy_index(unsigned long page_idx, unsigned int order)
439{ 432{
440 return (page_idx & ~(1 << order)); 433 return page_idx ^ (1 << order);
441} 434}
442 435
443/* 436/*
@@ -448,8 +441,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
448 * (c) a page and its buddy have the same order && 441 * (c) a page and its buddy have the same order &&
449 * (d) a page and its buddy are in the same zone. 442 * (d) a page and its buddy are in the same zone.
450 * 443 *
451 * For recording whether a page is in the buddy system, we use PG_buddy. 444 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
452 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 445 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
453 * 446 *
454 * For recording page's order, we use page_private(page). 447 * For recording page's order, we use page_private(page).
455 */ 448 */
@@ -482,7 +475,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
482 * as necessary, plus some accounting needed to play nicely with other 475 * as necessary, plus some accounting needed to play nicely with other
483 * parts of the VM system. 476 * parts of the VM system.
484 * At each level, we keep a list of pages, which are heads of continuous 477 * At each level, we keep a list of pages, which are heads of continuous
485 * free pages of length of (1 << order) and marked with PG_buddy. Page's 478 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
486 * order is recorded in page_private(page) field. 479 * order is recorded in page_private(page) field.
487 * So when we are allocating or freeing one, we can derive the state of the 480 * So when we are allocating or freeing one, we can derive the state of the
488 * other. That is, if we allocate a small block, and both were 481 * other. That is, if we allocate a small block, and both were
@@ -499,6 +492,7 @@ static inline void __free_one_page(struct page *page,
499{ 492{
500 unsigned long page_idx; 493 unsigned long page_idx;
501 unsigned long combined_idx; 494 unsigned long combined_idx;
495 unsigned long uninitialized_var(buddy_idx);
502 struct page *buddy; 496 struct page *buddy;
503 497
504 if (unlikely(PageCompound(page))) 498 if (unlikely(PageCompound(page)))
@@ -513,7 +507,8 @@ static inline void __free_one_page(struct page *page,
513 VM_BUG_ON(bad_range(zone, page)); 507 VM_BUG_ON(bad_range(zone, page));
514 508
515 while (order < MAX_ORDER-1) { 509 while (order < MAX_ORDER-1) {
516 buddy = __page_find_buddy(page, page_idx, order); 510 buddy_idx = __find_buddy_index(page_idx, order);
511 buddy = page + (buddy_idx - page_idx);
517 if (!page_is_buddy(page, buddy, order)) 512 if (!page_is_buddy(page, buddy, order))
518 break; 513 break;
519 514
@@ -521,7 +516,7 @@ static inline void __free_one_page(struct page *page,
521 list_del(&buddy->lru); 516 list_del(&buddy->lru);
522 zone->free_area[order].nr_free--; 517 zone->free_area[order].nr_free--;
523 rmv_page_order(buddy); 518 rmv_page_order(buddy);
524 combined_idx = __find_combined_index(page_idx, order); 519 combined_idx = buddy_idx & page_idx;
525 page = page + (combined_idx - page_idx); 520 page = page + (combined_idx - page_idx);
526 page_idx = combined_idx; 521 page_idx = combined_idx;
527 order++; 522 order++;
@@ -538,9 +533,10 @@ static inline void __free_one_page(struct page *page,
538 */ 533 */
539 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { 534 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
540 struct page *higher_page, *higher_buddy; 535 struct page *higher_page, *higher_buddy;
541 combined_idx = __find_combined_index(page_idx, order); 536 combined_idx = buddy_idx & page_idx;
542 higher_page = page + combined_idx - page_idx; 537 higher_page = page + (combined_idx - page_idx);
543 higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); 538 buddy_idx = __find_buddy_index(combined_idx, order + 1);
539 higher_buddy = page + (buddy_idx - combined_idx);
544 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 540 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
545 list_add_tail(&page->lru, 541 list_add_tail(&page->lru,
546 &zone->free_area[order].free_list[migratetype]); 542 &zone->free_area[order].free_list[migratetype]);
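__find_buddy_index() above replaces the pointer-returning helper with pure index arithmetic: XOR with (1 << order) names the buddy, and AND of the pair is the first index of the merged block. A standalone check of that identity:

#include <stdio.h>

int main(void)
{
        unsigned long page_idx = 12;            /* 0b1100 */

        for (unsigned int order = 0; order < 4; order++) {
                unsigned long buddy_idx = page_idx ^ (1UL << order);
                unsigned long combined  = buddy_idx & page_idx;

                printf("order %u: buddy of %lu is %lu, merged block starts at %lu\n",
                       order, page_idx, buddy_idx, combined);
        }
        return 0;
}

For page 12 this prints buddies 13, 14, 8, 4 with merge start indexes 12, 12, 8, 4, i.e. the page index with bit 'order' cleared.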
@@ -651,13 +647,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
651 trace_mm_page_free_direct(page, order); 647 trace_mm_page_free_direct(page, order);
652 kmemcheck_free_shadow(page, order); 648 kmemcheck_free_shadow(page, order);
653 649
654 for (i = 0; i < (1 << order); i++) { 650 if (PageAnon(page))
655 struct page *pg = page + i; 651 page->mapping = NULL;
656 652 for (i = 0; i < (1 << order); i++)
657 if (PageAnon(pg)) 653 bad += free_pages_check(page + i);
658 pg->mapping = NULL;
659 bad += free_pages_check(pg);
660 }
661 if (bad) 654 if (bad)
662 return false; 655 return false;
663 656
@@ -1460,24 +1453,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1460#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1453#endif /* CONFIG_FAIL_PAGE_ALLOC */
1461 1454
1462/* 1455/*
1463 * Return 1 if free pages are above 'mark'. This takes into account the order 1456 * Return true if free pages are above 'mark'. This takes into account the order
1464 * of the allocation. 1457 * of the allocation.
1465 */ 1458 */
1466int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1459static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1467 int classzone_idx, int alloc_flags) 1460 int classzone_idx, int alloc_flags, long free_pages)
1468{ 1461{
1469 /* free_pages may go negative - that's OK */ 1462 /* free_pages may go negative - that's OK */
1470 long min = mark; 1463 long min = mark;
1471 long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
1472 int o; 1464 int o;
1473 1465
1466 free_pages -= (1 << order) + 1;
1474 if (alloc_flags & ALLOC_HIGH) 1467 if (alloc_flags & ALLOC_HIGH)
1475 min -= min / 2; 1468 min -= min / 2;
1476 if (alloc_flags & ALLOC_HARDER) 1469 if (alloc_flags & ALLOC_HARDER)
1477 min -= min / 4; 1470 min -= min / 4;
1478 1471
1479 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1472 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1480 return 0; 1473 return false;
1481 for (o = 0; o < order; o++) { 1474 for (o = 0; o < order; o++) {
1482 /* At the next order, this order's pages become unavailable */ 1475 /* At the next order, this order's pages become unavailable */
1483 free_pages -= z->free_area[o].nr_free << o; 1476 free_pages -= z->free_area[o].nr_free << o;
@@ -1486,9 +1479,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1486 min >>= 1; 1479 min >>= 1;
1487 1480
1488 if (free_pages <= min) 1481 if (free_pages <= min)
1489 return 0; 1482 return false;
1490 } 1483 }
1491 return 1; 1484 return true;
1485}
1486
1487bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1488 int classzone_idx, int alloc_flags)
1489{
1490 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1491 zone_page_state(z, NR_FREE_PAGES));
1492}
1493
1494bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1495 int classzone_idx, int alloc_flags)
1496{
1497 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1498
1499 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1500 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1501
1502 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1503 free_pages);
1492} 1504}
1493 1505
1494#ifdef CONFIG_NUMA 1506#ifdef CONFIG_NUMA
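zone_watermark_ok() is split above into a __zone_watermark_ok() core that takes the free-page count as an argument, plus a _safe variant that falls back to the drift-corrected snapshot near the percpu drift mark. A condensed model of the core per-order check over made-up free counts:

#include <stdbool.h>
#include <stdio.h>

static bool watermark_ok(int order, long mark, long lowmem_reserve,
                         long free_pages, const long *nr_free_per_order)
{
        long min = mark;

        free_pages -= (1L << order) + 1;        /* charge the request, as in the patch */
        if (free_pages <= min + lowmem_reserve)
                return false;
        for (int o = 0; o < order; o++) {
                /* blocks of lower orders cannot satisfy this request */
                free_pages -= nr_free_per_order[o] << o;
                min >>= 1;                      /* relax the minimum per order */
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        const long nr_free[4] = { 300, 40, 10, 2 };

        printf("order-2 request ok? %d\n",
               watermark_ok(2, 128, 0, 800, nr_free));
        return 0;
}

The ALLOC_HIGH/ALLOC_HARDER adjustments and the snapshot fallback of zone_watermark_ok_safe() are left out; only the per-order accounting is modeled.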
@@ -1793,15 +1805,18 @@ static struct page *
1793__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1805__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1794 struct zonelist *zonelist, enum zone_type high_zoneidx, 1806 struct zonelist *zonelist, enum zone_type high_zoneidx,
1795 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1807 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1796 int migratetype, unsigned long *did_some_progress) 1808 int migratetype, unsigned long *did_some_progress,
1809 bool sync_migration)
1797{ 1810{
1798 struct page *page; 1811 struct page *page;
1799 1812
1800 if (!order || compaction_deferred(preferred_zone)) 1813 if (!order || compaction_deferred(preferred_zone))
1801 return NULL; 1814 return NULL;
1802 1815
1816 current->flags |= PF_MEMALLOC;
1803 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 1817 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1804 nodemask); 1818 nodemask, sync_migration);
1819 current->flags &= ~PF_MEMALLOC;
1805 if (*did_some_progress != COMPACT_SKIPPED) { 1820 if (*did_some_progress != COMPACT_SKIPPED) {
1806 1821
1807 /* Page migration frees to the PCP lists but we want merging */ 1822 /* Page migration frees to the PCP lists but we want merging */
@@ -1837,7 +1852,8 @@ static inline struct page *
1837__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1852__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1838 struct zonelist *zonelist, enum zone_type high_zoneidx, 1853 struct zonelist *zonelist, enum zone_type high_zoneidx,
1839 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1854 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1840 int migratetype, unsigned long *did_some_progress) 1855 int migratetype, unsigned long *did_some_progress,
1856 bool sync_migration)
1841{ 1857{
1842 return NULL; 1858 return NULL;
1843} 1859}
@@ -1852,23 +1868,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1852{ 1868{
1853 struct page *page = NULL; 1869 struct page *page = NULL;
1854 struct reclaim_state reclaim_state; 1870 struct reclaim_state reclaim_state;
1855 struct task_struct *p = current;
1856 bool drained = false; 1871 bool drained = false;
1857 1872
1858 cond_resched(); 1873 cond_resched();
1859 1874
1860 /* We now go into synchronous reclaim */ 1875 /* We now go into synchronous reclaim */
1861 cpuset_memory_pressure_bump(); 1876 cpuset_memory_pressure_bump();
1862 p->flags |= PF_MEMALLOC; 1877 current->flags |= PF_MEMALLOC;
1863 lockdep_set_current_reclaim_state(gfp_mask); 1878 lockdep_set_current_reclaim_state(gfp_mask);
1864 reclaim_state.reclaimed_slab = 0; 1879 reclaim_state.reclaimed_slab = 0;
1865 p->reclaim_state = &reclaim_state; 1880 current->reclaim_state = &reclaim_state;
1866 1881
1867 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 1882 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1868 1883
1869 p->reclaim_state = NULL; 1884 current->reclaim_state = NULL;
1870 lockdep_clear_current_reclaim_state(); 1885 lockdep_clear_current_reclaim_state();
1871 p->flags &= ~PF_MEMALLOC; 1886 current->flags &= ~PF_MEMALLOC;
1872 1887
1873 cond_resched(); 1888 cond_resched();
1874 1889
@@ -1920,19 +1935,19 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1920 1935
1921static inline 1936static inline
1922void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 1937void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1923 enum zone_type high_zoneidx) 1938 enum zone_type high_zoneidx,
1939 enum zone_type classzone_idx)
1924{ 1940{
1925 struct zoneref *z; 1941 struct zoneref *z;
1926 struct zone *zone; 1942 struct zone *zone;
1927 1943
1928 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1944 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1929 wakeup_kswapd(zone, order); 1945 wakeup_kswapd(zone, order, classzone_idx);
1930} 1946}
1931 1947
1932static inline int 1948static inline int
1933gfp_to_alloc_flags(gfp_t gfp_mask) 1949gfp_to_alloc_flags(gfp_t gfp_mask)
1934{ 1950{
1935 struct task_struct *p = current;
1936 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 1951 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1937 const gfp_t wait = gfp_mask & __GFP_WAIT; 1952 const gfp_t wait = gfp_mask & __GFP_WAIT;
1938 1953
@@ -1948,18 +1963,23 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1948 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); 1963 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1949 1964
1950 if (!wait) { 1965 if (!wait) {
1951 alloc_flags |= ALLOC_HARDER; 1966 /*
1967 * Not worth trying to allocate harder for
1968 * __GFP_NOMEMALLOC even if it can't schedule.
1969 */
1970 if (!(gfp_mask & __GFP_NOMEMALLOC))
1971 alloc_flags |= ALLOC_HARDER;
1952 /* 1972 /*
1953 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1973 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1954 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1974 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1955 */ 1975 */
1956 alloc_flags &= ~ALLOC_CPUSET; 1976 alloc_flags &= ~ALLOC_CPUSET;
1957 } else if (unlikely(rt_task(p)) && !in_interrupt()) 1977 } else if (unlikely(rt_task(current)) && !in_interrupt())
1958 alloc_flags |= ALLOC_HARDER; 1978 alloc_flags |= ALLOC_HARDER;
1959 1979
1960 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 1980 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1961 if (!in_interrupt() && 1981 if (!in_interrupt() &&
1962 ((p->flags & PF_MEMALLOC) || 1982 ((current->flags & PF_MEMALLOC) ||
1963 unlikely(test_thread_flag(TIF_MEMDIE)))) 1983 unlikely(test_thread_flag(TIF_MEMDIE))))
1964 alloc_flags |= ALLOC_NO_WATERMARKS; 1984 alloc_flags |= ALLOC_NO_WATERMARKS;
1965 } 1985 }
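
Two hypothetical atomic-context callers, shown only to illustrate the distinction the hunk above introduces (neither function is part of this patch):

#include <linux/slab.h>

/* Ordinary atomic allocation: the slow path may set ALLOC_HARDER and
 * dig deeper into the reserves on this caller's behalf. */
static void *grab_atomic_buffer(size_t len)
{
	return kmalloc(len, GFP_ATOMIC);
}

/* Atomic allocation that explicitly refuses the emergency reserves:
 * after this change the allocator no longer tries harder for it either. */
static void *grab_atomic_buffer_no_reserves(size_t len)
{
	return kmalloc(len, GFP_ATOMIC | __GFP_NOMEMALLOC);
}
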
@@ -1978,7 +1998,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1978 int alloc_flags; 1998 int alloc_flags;
1979 unsigned long pages_reclaimed = 0; 1999 unsigned long pages_reclaimed = 0;
1980 unsigned long did_some_progress; 2000 unsigned long did_some_progress;
1981 struct task_struct *p = current; 2001 bool sync_migration = false;
1982 2002
1983 /* 2003 /*
1984 * In the slowpath, we sanity check order to avoid ever trying to 2004 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2003,7 +2023,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2003 goto nopage; 2023 goto nopage;
2004 2024
2005restart: 2025restart:
2006 wake_all_kswapd(order, zonelist, high_zoneidx); 2026 if (!(gfp_mask & __GFP_NO_KSWAPD))
2027 wake_all_kswapd(order, zonelist, high_zoneidx,
2028 zone_idx(preferred_zone));
2007 2029
2008 /* 2030 /*
2009 * OK, we're below the kswapd watermark and have kicked background 2031 * OK, we're below the kswapd watermark and have kicked background
@@ -2034,21 +2056,26 @@ rebalance:
2034 goto nopage; 2056 goto nopage;
2035 2057
2036 /* Avoid recursion of direct reclaim */ 2058 /* Avoid recursion of direct reclaim */
2037 if (p->flags & PF_MEMALLOC) 2059 if (current->flags & PF_MEMALLOC)
2038 goto nopage; 2060 goto nopage;
2039 2061
2040 /* Avoid allocations with no watermarks from looping endlessly */ 2062 /* Avoid allocations with no watermarks from looping endlessly */
2041 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2063 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2042 goto nopage; 2064 goto nopage;
2043 2065
2044 /* Try direct compaction */ 2066 /*
2067 * Try direct compaction. The first pass is asynchronous. Subsequent
2068 * attempts after direct reclaim are synchronous
2069 */
2045 page = __alloc_pages_direct_compact(gfp_mask, order, 2070 page = __alloc_pages_direct_compact(gfp_mask, order,
2046 zonelist, high_zoneidx, 2071 zonelist, high_zoneidx,
2047 nodemask, 2072 nodemask,
2048 alloc_flags, preferred_zone, 2073 alloc_flags, preferred_zone,
2049 migratetype, &did_some_progress); 2074 migratetype, &did_some_progress,
2075 sync_migration);
2050 if (page) 2076 if (page)
2051 goto got_pg; 2077 goto got_pg;
2078 sync_migration = true;
2052 2079
2053 /* Try direct reclaim and then allocating */ 2080 /* Try direct reclaim and then allocating */
2054 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2081 page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2102,13 +2129,27 @@ rebalance:
2102 /* Wait for some write requests to complete then retry */ 2129 /* Wait for some write requests to complete then retry */
2103 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); 2130 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2104 goto rebalance; 2131 goto rebalance;
2132 } else {
2133 /*
 2134 * High-order allocations do not necessarily loop after
 2135 * direct reclaim, and reclaim/compaction depends on compaction
 2136 * being called after reclaim, so call it directly here if necessary.
2137 */
2138 page = __alloc_pages_direct_compact(gfp_mask, order,
2139 zonelist, high_zoneidx,
2140 nodemask,
2141 alloc_flags, preferred_zone,
2142 migratetype, &did_some_progress,
2143 sync_migration);
2144 if (page)
2145 goto got_pg;
2105 } 2146 }
2106 2147
2107nopage: 2148nopage:
2108 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 2149 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
2109 printk(KERN_WARNING "%s: page allocation failure." 2150 printk(KERN_WARNING "%s: page allocation failure."
2110 " order:%d, mode:0x%x\n", 2151 " order:%d, mode:0x%x\n",
2111 p->comm, order, gfp_mask); 2152 current->comm, order, gfp_mask);
2112 dump_stack(); 2153 dump_stack();
2113 show_mem(); 2154 show_mem();
2114 } 2155 }
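
A toy model of the ordering established by the two rebalance hunks above, with the allocator internals stubbed out behind function pointers; COMPACT_ASYNC/COMPACT_SYNC are invented names and the code only illustrates when synchronous migration is used, not the real control flow:

enum compact_mode { COMPACT_ASYNC, COMPACT_SYNC };

static void *slowpath_model(void *(*compact)(enum compact_mode),
			    void *(*reclaim_then_alloc)(void),
			    int will_loop)
{
	/* the first compaction pass never waits on page migration */
	void *page = compact(COMPACT_ASYNC);

	if (page)
		return page;

	/* direct reclaim, then retry the allocation */
	page = reclaim_then_alloc();
	if (page || will_loop)
		return page;	/* retries come back with sync_migration == true */

	/* costly orders that will not loop get one synchronous pass here */
	return compact(COMPACT_SYNC);
}
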
@@ -2442,7 +2483,7 @@ void show_free_areas(void)
2442 " all_unreclaimable? %s" 2483 " all_unreclaimable? %s"
2443 "\n", 2484 "\n",
2444 zone->name, 2485 zone->name,
2445 K(zone_nr_free_pages(zone)), 2486 K(zone_page_state(zone, NR_FREE_PAGES)),
2446 K(min_wmark_pages(zone)), 2487 K(min_wmark_pages(zone)),
2447 K(low_wmark_pages(zone)), 2488 K(low_wmark_pages(zone)),
2448 K(high_wmark_pages(zone)), 2489 K(high_wmark_pages(zone)),
@@ -2585,9 +2626,16 @@ static int __parse_numa_zonelist_order(char *s)
2585 2626
2586static __init int setup_numa_zonelist_order(char *s) 2627static __init int setup_numa_zonelist_order(char *s)
2587{ 2628{
2588 if (s) 2629 int ret;
2589 return __parse_numa_zonelist_order(s); 2630
2590 return 0; 2631 if (!s)
2632 return 0;
2633
2634 ret = __parse_numa_zonelist_order(s);
2635 if (ret == 0)
2636 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
2637
2638 return ret;
2591} 2639}
2592early_param("numa_zonelist_order", setup_numa_zonelist_order); 2640early_param("numa_zonelist_order", setup_numa_zonelist_order);
2593 2641
@@ -4014,7 +4062,7 @@ static void __init setup_usemap(struct pglist_data *pgdat,
4014 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 4062 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
4015} 4063}
4016#else 4064#else
4017static void inline setup_usemap(struct pglist_data *pgdat, 4065static inline void setup_usemap(struct pglist_data *pgdat,
4018 struct zone *zone, unsigned long zonesize) {} 4066 struct zone *zone, unsigned long zonesize) {}
4019#endif /* CONFIG_SPARSEMEM */ 4067#endif /* CONFIG_SPARSEMEM */
4020 4068
@@ -5517,7 +5565,6 @@ static struct trace_print_flags pageflag_names[] = {
5517 {1UL << PG_swapcache, "swapcache" }, 5565 {1UL << PG_swapcache, "swapcache" },
5518 {1UL << PG_mappedtodisk, "mappedtodisk" }, 5566 {1UL << PG_mappedtodisk, "mappedtodisk" },
5519 {1UL << PG_reclaim, "reclaim" }, 5567 {1UL << PG_reclaim, "reclaim" },
5520 {1UL << PG_buddy, "buddy" },
5521 {1UL << PG_swapbacked, "swapbacked" }, 5568 {1UL << PG_swapbacked, "swapbacked" },
5522 {1UL << PG_unevictable, "unevictable" }, 5569 {1UL << PG_unevictable, "unevictable" },
5523#ifdef CONFIG_MMU 5570#ifdef CONFIG_MMU
@@ -5565,7 +5612,7 @@ void dump_page(struct page *page)
5565{ 5612{
5566 printk(KERN_ALERT 5613 printk(KERN_ALERT
5567 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 5614 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5568 page, page_count(page), page_mapcount(page), 5615 page, atomic_read(&page->_count), page_mapcount(page),
5569 page->mapping, page->index); 5616 page->mapping, page->index);
5570 dump_page_flags(page->flags); 5617 dump_page_flags(page->flags);
5571} 5618}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 38cc58b8b2b0..7cfa6ae02303 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
34 pmd = pmd_offset(pud, addr); 34 pmd = pmd_offset(pud, addr);
35 do { 35 do {
36 next = pmd_addr_end(addr, end); 36 next = pmd_addr_end(addr, end);
37 split_huge_page_pmd(walk->mm, pmd);
37 if (pmd_none_or_clear_bad(pmd)) { 38 if (pmd_none_or_clear_bad(pmd)) {
38 if (walk->pte_hole) 39 if (walk->pte_hole)
39 err = walk->pte_hole(addr, next, walk); 40 err = walk->pte_hole(addr, next, walk);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 7d9c1d0ebd3f..ea534960a04b 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
421 return NULL; 421 return NULL;
422 422
423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, 423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
424 pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); 424 pcpu_nr_groups, pcpu_atom_size);
425 if (!vms) { 425 if (!vms) {
426 pcpu_free_chunk(chunk); 426 pcpu_free_chunk(chunk);
427 return NULL; 427 return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index 02ba91230b99..3f930018aa60 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -258,7 +258,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
258 258
259/* 259/*
260 * (Un)populated page region iterators. Iterate over (un)populated 260 * (Un)populated page region iterators. Iterate over (un)populated
261 * page regions betwen @start and @end in @chunk. @rs and @re should 261 * page regions between @start and @end in @chunk. @rs and @re should
262 * be integer variables and will be set to start and end page index of 262 * be integer variables and will be set to start and end page index of
263 * the current region. 263 * the current region.
264 */ 264 */
@@ -293,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size)
293 293
294 if (size <= PAGE_SIZE) 294 if (size <= PAGE_SIZE)
295 return kzalloc(size, GFP_KERNEL); 295 return kzalloc(size, GFP_KERNEL);
296 else { 296 else
297 void *ptr = vmalloc(size); 297 return vzalloc(size);
298 if (ptr)
299 memset(ptr, 0, size);
300 return ptr;
301 }
302} 298}
303 299
304/** 300/**
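
For reference, a stand-alone helper equivalent to the open-coded sequence that vzalloc() replaces in pcpu_mem_alloc() above (sketch only; the real vzalloc() lives in mm/vmalloc.c):

#include <linux/vmalloc.h>
#include <linux/string.h>

static void *vzalloc_open_coded(unsigned long size)
{
	void *ptr = vmalloc(size);

	if (ptr)
		memset(ptr, 0, size);
	return ptr;
}
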
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
new file mode 100644
index 000000000000..d030548047e2
--- /dev/null
+++ b/mm/pgtable-generic.c
@@ -0,0 +1,123 @@
1/*
2 * mm/pgtable-generic.c
3 *
4 * Generic pgtable methods declared in asm-generic/pgtable.h
5 *
6 * Copyright (C) 2010 Linus Torvalds
7 */
8
9#include <asm/tlb.h>
10#include <asm-generic/pgtable.h>
11
12#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
13/*
14 * Only sets the access flags (dirty, accessed, and
15 * writable). Furthermore, we know it always gets set to a "more
16 * permissive" setting, which allows most architectures to optimize
17 * this. We return whether the PTE actually changed, which in turn
 18 * instructs the caller to do things like update_mmu_cache. This
19 * used to be done in the caller, but sparc needs minor faults to
20 * force that call on sun4c so we changed this macro slightly
21 */
22int ptep_set_access_flags(struct vm_area_struct *vma,
23 unsigned long address, pte_t *ptep,
24 pte_t entry, int dirty)
25{
26 int changed = !pte_same(*ptep, entry);
27 if (changed) {
28 set_pte_at(vma->vm_mm, address, ptep, entry);
29 flush_tlb_page(vma, address);
30 }
31 return changed;
32}
33#endif
34
35#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
36int pmdp_set_access_flags(struct vm_area_struct *vma,
37 unsigned long address, pmd_t *pmdp,
38 pmd_t entry, int dirty)
39{
40#ifdef CONFIG_TRANSPARENT_HUGEPAGE
41 int changed = !pmd_same(*pmdp, entry);
42 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
43 if (changed) {
44 set_pmd_at(vma->vm_mm, address, pmdp, entry);
45 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
46 }
47 return changed;
48#else /* CONFIG_TRANSPARENT_HUGEPAGE */
49 BUG();
50 return 0;
51#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
52}
53#endif
54
55#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
56int ptep_clear_flush_young(struct vm_area_struct *vma,
57 unsigned long address, pte_t *ptep)
58{
59 int young;
60 young = ptep_test_and_clear_young(vma, address, ptep);
61 if (young)
62 flush_tlb_page(vma, address);
63 return young;
64}
65#endif
66
67#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
68int pmdp_clear_flush_young(struct vm_area_struct *vma,
69 unsigned long address, pmd_t *pmdp)
70{
71 int young;
72#ifndef CONFIG_TRANSPARENT_HUGEPAGE
73 BUG();
74#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
75 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
76 young = pmdp_test_and_clear_young(vma, address, pmdp);
77 if (young)
78 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
79 return young;
80}
81#endif
82
83#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
84pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
85 pte_t *ptep)
86{
87 pte_t pte;
88 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
89 flush_tlb_page(vma, address);
90 return pte;
91}
92#endif
93
94#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
95pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
96 pmd_t *pmdp)
97{
98 pmd_t pmd;
99#ifndef CONFIG_TRANSPARENT_HUGEPAGE
100 BUG();
101#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
102 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
103 pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
104 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
105 return pmd;
106}
107#endif
108
109#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
110pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
111 pmd_t *pmdp)
112{
113#ifdef CONFIG_TRANSPARENT_HUGEPAGE
114 pmd_t pmd = pmd_mksplitting(*pmdp);
115 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
116 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
117 /* tlb flush only to serialize against gup-fast */
118 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
119#else /* CONFIG_TRANSPARENT_HUGEPAGE */
120 BUG();
121#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
122}
123#endif
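
The #ifndef guards in the new file implement the usual opt-out convention: an architecture with its own, faster implementation defines the matching __HAVE_ARCH_* macro in its asm/pgtable.h and provides the function itself, so the generic body above is compiled out. A sketch for an imaginary architecture (the path and declaration are illustrative; only the macro naming is the real convention):

/* arch/example/include/asm/pgtable.h */
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
				 unsigned long address, pte_t *ptep,
				 pte_t entry, int dirty);
/* ...the architecture then defines ptep_set_access_flags() in its own
 * mm code, flushing only what its TLB actually requires. */
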
diff --git a/mm/rmap.c b/mm/rmap.c
index 1a8bf76bfd03..f21f4a1d6a1c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -94,7 +94,7 @@ static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
94 * anonymous pages mapped into it with that anon_vma. 94 * anonymous pages mapped into it with that anon_vma.
95 * 95 *
96 * The common case will be that we already have one, but if 96 * The common case will be that we already have one, but if
97 * if not we either need to find an adjacent mapping that we 97 * not we either need to find an adjacent mapping that we
98 * can re-use the anon_vma from (very common when the only 98 * can re-use the anon_vma from (very common when the only
99 * reason for splitting a vma has been mprotect()), or we 99 * reason for splitting a vma has been mprotect()), or we
100 * allocate a new one. 100 * allocate a new one.
@@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
177 list_add(&avc->same_vma, &vma->anon_vma_chain); 177 list_add(&avc->same_vma, &vma->anon_vma_chain);
178 178
179 anon_vma_lock(anon_vma); 179 anon_vma_lock(anon_vma);
180 /*
181 * It's critical to add new vmas to the tail of the anon_vma,
182 * see comment in huge_memory.c:__split_huge_page().
183 */
180 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 184 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
181 anon_vma_unlock(anon_vma); 185 anon_vma_unlock(anon_vma);
182} 186}
@@ -360,7 +364,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
360 * Returns virtual address or -EFAULT if page's index/offset is not 364 * Returns virtual address or -EFAULT if page's index/offset is not
361 * within the range mapped the @vma. 365 * within the range mapped the @vma.
362 */ 366 */
363static inline unsigned long 367inline unsigned long
364vma_address(struct page *page, struct vm_area_struct *vma) 368vma_address(struct page *page, struct vm_area_struct *vma)
365{ 369{
366 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 370 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -435,6 +439,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
435 pmd = pmd_offset(pud, address); 439 pmd = pmd_offset(pud, address);
436 if (!pmd_present(*pmd)) 440 if (!pmd_present(*pmd))
437 return NULL; 441 return NULL;
442 if (pmd_trans_huge(*pmd))
443 return NULL;
438 444
439 pte = pte_offset_map(pmd, address); 445 pte = pte_offset_map(pmd, address);
440 /* Make a quick check before getting the lock */ 446 /* Make a quick check before getting the lock */
@@ -489,35 +495,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
489 unsigned long *vm_flags) 495 unsigned long *vm_flags)
490{ 496{
491 struct mm_struct *mm = vma->vm_mm; 497 struct mm_struct *mm = vma->vm_mm;
492 pte_t *pte;
493 spinlock_t *ptl;
494 int referenced = 0; 498 int referenced = 0;
495 499
496 pte = page_check_address(page, mm, address, &ptl, 0);
497 if (!pte)
498 goto out;
499
500 /* 500 /*
501 * Don't want to elevate referenced for mlocked page that gets this far, 501 * Don't want to elevate referenced for mlocked page that gets this far,
502 * in order that it progresses to try_to_unmap and is moved to the 502 * in order that it progresses to try_to_unmap and is moved to the
503 * unevictable list. 503 * unevictable list.
504 */ 504 */
505 if (vma->vm_flags & VM_LOCKED) { 505 if (vma->vm_flags & VM_LOCKED) {
506 *mapcount = 1; /* break early from loop */ 506 *mapcount = 0; /* break early from loop */
507 *vm_flags |= VM_LOCKED; 507 *vm_flags |= VM_LOCKED;
508 goto out_unmap; 508 goto out;
509 }
510
511 if (ptep_clear_flush_young_notify(vma, address, pte)) {
512 /*
513 * Don't treat a reference through a sequentially read
514 * mapping as such. If the page has been used in
515 * another mapping, we will catch it; if this other
516 * mapping is already gone, the unmap path will have
517 * set PG_referenced or activated the page.
518 */
519 if (likely(!VM_SequentialReadHint(vma)))
520 referenced++;
521 } 509 }
522 510
523 /* Pretend the page is referenced if the task has the 511 /* Pretend the page is referenced if the task has the
@@ -526,9 +514,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
526 rwsem_is_locked(&mm->mmap_sem)) 514 rwsem_is_locked(&mm->mmap_sem))
527 referenced++; 515 referenced++;
528 516
529out_unmap: 517 if (unlikely(PageTransHuge(page))) {
518 pmd_t *pmd;
519
520 spin_lock(&mm->page_table_lock);
521 pmd = page_check_address_pmd(page, mm, address,
522 PAGE_CHECK_ADDRESS_PMD_FLAG);
523 if (pmd && !pmd_trans_splitting(*pmd) &&
524 pmdp_clear_flush_young_notify(vma, address, pmd))
525 referenced++;
526 spin_unlock(&mm->page_table_lock);
527 } else {
528 pte_t *pte;
529 spinlock_t *ptl;
530
531 pte = page_check_address(page, mm, address, &ptl, 0);
532 if (!pte)
533 goto out;
534
535 if (ptep_clear_flush_young_notify(vma, address, pte)) {
536 /*
537 * Don't treat a reference through a sequentially read
538 * mapping as such. If the page has been used in
539 * another mapping, we will catch it; if this other
540 * mapping is already gone, the unmap path will have
541 * set PG_referenced or activated the page.
542 */
543 if (likely(!VM_SequentialReadHint(vma)))
544 referenced++;
545 }
546 pte_unmap_unlock(pte, ptl);
547 }
548
530 (*mapcount)--; 549 (*mapcount)--;
531 pte_unmap_unlock(pte, ptl);
532 550
533 if (referenced) 551 if (referenced)
534 *vm_flags |= vma->vm_flags; 552 *vm_flags |= vma->vm_flags;
@@ -864,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page,
864 struct vm_area_struct *vma, unsigned long address, int exclusive) 882 struct vm_area_struct *vma, unsigned long address, int exclusive)
865{ 883{
866 int first = atomic_inc_and_test(&page->_mapcount); 884 int first = atomic_inc_and_test(&page->_mapcount);
867 if (first) 885 if (first) {
868 __inc_zone_page_state(page, NR_ANON_PAGES); 886 if (!PageTransHuge(page))
887 __inc_zone_page_state(page, NR_ANON_PAGES);
888 else
889 __inc_zone_page_state(page,
890 NR_ANON_TRANSPARENT_HUGEPAGES);
891 }
869 if (unlikely(PageKsm(page))) 892 if (unlikely(PageKsm(page)))
870 return; 893 return;
871 894
@@ -893,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page,
893 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 916 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
894 SetPageSwapBacked(page); 917 SetPageSwapBacked(page);
895 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 918 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
896 __inc_zone_page_state(page, NR_ANON_PAGES); 919 if (!PageTransHuge(page))
920 __inc_zone_page_state(page, NR_ANON_PAGES);
921 else
922 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
897 __page_set_anon_rmap(page, vma, address, 1); 923 __page_set_anon_rmap(page, vma, address, 1);
898 if (page_evictable(page, vma)) 924 if (page_evictable(page, vma))
899 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 925 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -911,7 +937,7 @@ void page_add_file_rmap(struct page *page)
911{ 937{
912 if (atomic_inc_and_test(&page->_mapcount)) { 938 if (atomic_inc_and_test(&page->_mapcount)) {
913 __inc_zone_page_state(page, NR_FILE_MAPPED); 939 __inc_zone_page_state(page, NR_FILE_MAPPED);
914 mem_cgroup_update_file_mapped(page, 1); 940 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
915 } 941 }
916} 942}
917 943
@@ -946,10 +972,14 @@ void page_remove_rmap(struct page *page)
946 return; 972 return;
947 if (PageAnon(page)) { 973 if (PageAnon(page)) {
948 mem_cgroup_uncharge_page(page); 974 mem_cgroup_uncharge_page(page);
949 __dec_zone_page_state(page, NR_ANON_PAGES); 975 if (!PageTransHuge(page))
976 __dec_zone_page_state(page, NR_ANON_PAGES);
977 else
978 __dec_zone_page_state(page,
979 NR_ANON_TRANSPARENT_HUGEPAGES);
950 } else { 980 } else {
951 __dec_zone_page_state(page, NR_FILE_MAPPED); 981 __dec_zone_page_state(page, NR_FILE_MAPPED);
952 mem_cgroup_update_file_mapped(page, -1); 982 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
953 } 983 }
954 /* 984 /*
955 * It would be tidy to reset the PageAnon mapping here, 985 * It would be tidy to reset the PageAnon mapping here,
@@ -1202,7 +1232,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1202 return ret; 1232 return ret;
1203} 1233}
1204 1234
1205static bool is_vma_temporary_stack(struct vm_area_struct *vma) 1235bool is_vma_temporary_stack(struct vm_area_struct *vma)
1206{ 1236{
1207 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1237 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1208 1238
@@ -1400,6 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1400 int ret; 1430 int ret;
1401 1431
1402 BUG_ON(!PageLocked(page)); 1432 BUG_ON(!PageLocked(page));
1433 VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
1403 1434
1404 if (unlikely(PageKsm(page))) 1435 if (unlikely(PageKsm(page)))
1405 ret = try_to_unmap_ksm(page, flags); 1436 ret = try_to_unmap_ksm(page, flags);
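
The new NR_ANON_TRANSPARENT_HUGEPAGES counter is bumped once per compound page rather than per subpage, which is why only the anon rmap add/remove sites change. A minimal sketch of reading it from other kernel code, assuming the counter is added to enum zone_stat_item elsewhere in this series (global_page_state() is the existing vmstat accessor; the function below is purely illustrative):

#include <linux/kernel.h>
#include <linux/vmstat.h>

static void report_thp_usage(void)
{
	unsigned long thps = global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES);

	pr_info("anon transparent hugepages currently mapped: %lu\n", thps);
}
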
diff --git a/mm/shmem.c b/mm/shmem.c
index 47fdeeb9d636..5ee67c990602 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2415,13 +2415,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
2415 return &p->vfs_inode; 2415 return &p->vfs_inode;
2416} 2416}
2417 2417
2418static void shmem_i_callback(struct rcu_head *head)
2419{
2420 struct inode *inode = container_of(head, struct inode, i_rcu);
2421 INIT_LIST_HEAD(&inode->i_dentry);
2422 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2423}
2424
2418static void shmem_destroy_inode(struct inode *inode) 2425static void shmem_destroy_inode(struct inode *inode)
2419{ 2426{
2420 if ((inode->i_mode & S_IFMT) == S_IFREG) { 2427 if ((inode->i_mode & S_IFMT) == S_IFREG) {
2421 /* only struct inode is valid if it's an inline symlink */ 2428 /* only struct inode is valid if it's an inline symlink */
2422 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2429 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2423 } 2430 }
2424 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2431 call_rcu(&inode->i_rcu, shmem_i_callback);
2425} 2432}
2426 2433
2427static void init_once(void *foo) 2434static void init_once(void *foo)
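
The shmem hunk above switches to RCU-delayed freeing of the inode via the i_rcu head, matching how other filesystems with private inode caches free theirs in this kernel. The same shape for a hypothetical filesystem, with foo_inode_info, foo_inode_cachep and FOO_I() as made-up stand-ins for the filesystem's private inode type, cache and container accessor:

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>

struct foo_inode_info {
	/* ...filesystem-private fields... */
	struct inode vfs_inode;
};

static struct kmem_cache *foo_inode_cachep;

static inline struct foo_inode_info *FOO_I(struct inode *inode)
{
	return container_of(inode, struct foo_inode_info, vfs_inode);
}

static void foo_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	INIT_LIST_HEAD(&inode->i_dentry);
	kmem_cache_free(foo_inode_cachep, FOO_I(inode));
}

static void foo_destroy_inode(struct inode *inode)
{
	/* the actual free happens only after an RCU grace period */
	call_rcu(&inode->i_rcu, foo_i_callback);
}
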
diff --git a/mm/slab.c b/mm/slab.c
index b1e40dafbab3..264037449f08 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -829,12 +829,12 @@ static void init_reap_node(int cpu)
829 829
830static void next_reap_node(void) 830static void next_reap_node(void)
831{ 831{
832 int node = __get_cpu_var(slab_reap_node); 832 int node = __this_cpu_read(slab_reap_node);
833 833
834 node = next_node(node, node_online_map); 834 node = next_node(node, node_online_map);
835 if (unlikely(node >= MAX_NUMNODES)) 835 if (unlikely(node >= MAX_NUMNODES))
836 node = first_node(node_online_map); 836 node = first_node(node_online_map);
837 __get_cpu_var(slab_reap_node) = node; 837 __this_cpu_write(slab_reap_node, node);
838} 838}
839 839
840#else 840#else
@@ -1012,7 +1012,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
1012 */ 1012 */
1013static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) 1013static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1014{ 1014{
1015 int node = __get_cpu_var(slab_reap_node); 1015 int node = __this_cpu_read(slab_reap_node);
1016 1016
1017 if (l3->alien) { 1017 if (l3->alien) {
1018 struct array_cache *ac = l3->alien[node]; 1018 struct array_cache *ac = l3->alien[node];
@@ -1293,7 +1293,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1293 * anything expensive but will only modify reap_work 1293 * anything expensive but will only modify reap_work
1294 * and reschedule the timer. 1294 * and reschedule the timer.
1295 */ 1295 */
1296 cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); 1296 cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
1297 /* Now the cache_reaper is guaranteed to be not running. */ 1297 /* Now the cache_reaper is guaranteed to be not running. */
1298 per_cpu(slab_reap_work, cpu).work.func = NULL; 1298 per_cpu(slab_reap_work, cpu).work.func = NULL;
1299 break; 1299 break;
@@ -2781,7 +2781,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2781/* 2781/*
2782 * Map pages beginning at addr to the given cache and slab. This is required 2782 * Map pages beginning at addr to the given cache and slab. This is required
2783 * for the slab allocator to be able to lookup the cache and slab of a 2783 * for the slab allocator to be able to lookup the cache and slab of a
2784 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. 2784 * virtual address for kfree, ksize, and slab debugging.
2785 */ 2785 */
2786static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, 2786static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2787 void *addr) 2787 void *addr)
@@ -3653,42 +3653,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3653EXPORT_SYMBOL(kmem_cache_alloc); 3653EXPORT_SYMBOL(kmem_cache_alloc);
3654 3654
3655#ifdef CONFIG_TRACING 3655#ifdef CONFIG_TRACING
3656void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) 3656void *
3657kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
3657{ 3658{
3658 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3659 void *ret;
3659}
3660EXPORT_SYMBOL(kmem_cache_alloc_notrace);
3661#endif
3662 3660
3663/** 3661 ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3664 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
3665 * @cachep: the cache we're checking against
3666 * @ptr: pointer to validate
3667 *
3668 * This verifies that the untrusted pointer looks sane;
3669 * it is _not_ a guarantee that the pointer is actually
3670 * part of the slab cache in question, but it at least
3671 * validates that the pointer can be dereferenced and
3672 * looks half-way sane.
3673 *
3674 * Currently only used for dentry validation.
3675 */
3676int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3677{
3678 unsigned long size = cachep->buffer_size;
3679 struct page *page;
3680 3662
3681 if (unlikely(!kern_ptr_validate(ptr, size))) 3663 trace_kmalloc(_RET_IP_, ret,
3682 goto out; 3664 size, slab_buffer_size(cachep), flags);
3683 page = virt_to_page(ptr); 3665 return ret;
3684 if (unlikely(!PageSlab(page)))
3685 goto out;
3686 if (unlikely(page_get_cache(page) != cachep))
3687 goto out;
3688 return 1;
3689out:
3690 return 0;
3691} 3666}
3667EXPORT_SYMBOL(kmem_cache_alloc_trace);
3668#endif
3692 3669
3693#ifdef CONFIG_NUMA 3670#ifdef CONFIG_NUMA
3694void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3671void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
@@ -3705,31 +3682,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3705EXPORT_SYMBOL(kmem_cache_alloc_node); 3682EXPORT_SYMBOL(kmem_cache_alloc_node);
3706 3683
3707#ifdef CONFIG_TRACING 3684#ifdef CONFIG_TRACING
3708void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, 3685void *kmem_cache_alloc_node_trace(size_t size,
3709 gfp_t flags, 3686 struct kmem_cache *cachep,
3710 int nodeid) 3687 gfp_t flags,
3688 int nodeid)
3711{ 3689{
3712 return __cache_alloc_node(cachep, flags, nodeid, 3690 void *ret;
3691
3692 ret = __cache_alloc_node(cachep, flags, nodeid,
3713 __builtin_return_address(0)); 3693 __builtin_return_address(0));
3694 trace_kmalloc_node(_RET_IP_, ret,
3695 size, slab_buffer_size(cachep),
3696 flags, nodeid);
3697 return ret;
3714} 3698}
3715EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 3699EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3716#endif 3700#endif
3717 3701
3718static __always_inline void * 3702static __always_inline void *
3719__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) 3703__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3720{ 3704{
3721 struct kmem_cache *cachep; 3705 struct kmem_cache *cachep;
3722 void *ret;
3723 3706
3724 cachep = kmem_find_general_cachep(size, flags); 3707 cachep = kmem_find_general_cachep(size, flags);
3725 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3708 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3726 return cachep; 3709 return cachep;
3727 ret = kmem_cache_alloc_node_notrace(cachep, flags, node); 3710 return kmem_cache_alloc_node_trace(size, cachep, flags, node);
3728
3729 trace_kmalloc_node((unsigned long) caller, ret,
3730 size, cachep->buffer_size, flags, node);
3731
3732 return ret;
3733} 3711}
3734 3712
3735#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3713#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
diff --git a/mm/slob.c b/mm/slob.c
index 617b6d6c42c7..3588eaaef726 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -678,11 +678,6 @@ int kmem_cache_shrink(struct kmem_cache *d)
678} 678}
679EXPORT_SYMBOL(kmem_cache_shrink); 679EXPORT_SYMBOL(kmem_cache_shrink);
680 680
681int kmem_ptr_validate(struct kmem_cache *a, const void *b)
682{
683 return 0;
684}
685
686static unsigned int slob_ready __read_mostly; 681static unsigned int slob_ready __read_mostly;
687 682
688int slab_is_available(void) 683int slab_is_available(void)
diff --git a/mm/slub.c b/mm/slub.c
index bec0e355fbad..c7ef0070dd86 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -28,6 +28,8 @@
28#include <linux/math64.h> 28#include <linux/math64.h>
29#include <linux/fault-inject.h> 29#include <linux/fault-inject.h>
30 30
31#include <trace/events/kmem.h>
32
31/* 33/*
32 * Lock order: 34 * Lock order:
33 * 1. slab_lock(page) 35 * 1. slab_lock(page)
@@ -1774,11 +1776,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1774EXPORT_SYMBOL(kmem_cache_alloc); 1776EXPORT_SYMBOL(kmem_cache_alloc);
1775 1777
1776#ifdef CONFIG_TRACING 1778#ifdef CONFIG_TRACING
1777void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) 1779void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
1780{
1781 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
1782 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
1783 return ret;
1784}
1785EXPORT_SYMBOL(kmem_cache_alloc_trace);
1786
1787void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
1778{ 1788{
1779 return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 1789 void *ret = kmalloc_order(size, flags, order);
1790 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
1791 return ret;
1780} 1792}
1781EXPORT_SYMBOL(kmem_cache_alloc_notrace); 1793EXPORT_SYMBOL(kmalloc_order_trace);
1782#endif 1794#endif
1783 1795
1784#ifdef CONFIG_NUMA 1796#ifdef CONFIG_NUMA
@@ -1794,13 +1806,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1794EXPORT_SYMBOL(kmem_cache_alloc_node); 1806EXPORT_SYMBOL(kmem_cache_alloc_node);
1795 1807
1796#ifdef CONFIG_TRACING 1808#ifdef CONFIG_TRACING
1797void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, 1809void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
1798 gfp_t gfpflags, 1810 gfp_t gfpflags,
1799 int node) 1811 int node, size_t size)
1800{ 1812{
1801 return slab_alloc(s, gfpflags, node, _RET_IP_); 1813 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
1814
1815 trace_kmalloc_node(_RET_IP_, ret,
1816 size, s->size, gfpflags, node);
1817 return ret;
1802} 1818}
1803EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 1819EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
1804#endif 1820#endif
1805#endif 1821#endif
1806 1822
@@ -1917,17 +1933,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
1917} 1933}
1918EXPORT_SYMBOL(kmem_cache_free); 1934EXPORT_SYMBOL(kmem_cache_free);
1919 1935
1920/* Figure out on which slab page the object resides */
1921static struct page *get_object_page(const void *x)
1922{
1923 struct page *page = virt_to_head_page(x);
1924
1925 if (!PageSlab(page))
1926 return NULL;
1927
1928 return page;
1929}
1930
1931/* 1936/*
1932 * Object placement in a slab is made very easy because we always start at 1937 * Object placement in a slab is made very easy because we always start at
1933 * offset 0. If we tune the size of the object to the alignment then we can 1938 * offset 0. If we tune the size of the object to the alignment then we can
@@ -2386,35 +2391,6 @@ error:
2386} 2391}
2387 2392
2388/* 2393/*
2389 * Check if a given pointer is valid
2390 */
2391int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2392{
2393 struct page *page;
2394
2395 if (!kern_ptr_validate(object, s->size))
2396 return 0;
2397
2398 page = get_object_page(object);
2399
2400 if (!page || s != page->slab)
2401 /* No slab or wrong slab */
2402 return 0;
2403
2404 if (!check_valid_pointer(s, page, object))
2405 return 0;
2406
2407 /*
2408 * We could also check if the object is on the slabs freelist.
2409 * But this would be too expensive and it seems that the main
2410 * purpose of kmem_ptr_valid() is to check if the object belongs
2411 * to a certain slab.
2412 */
2413 return 1;
2414}
2415EXPORT_SYMBOL(kmem_ptr_validate);
2416
2417/*
2418 * Determine the size of a slab object 2394 * Determine the size of a slab object
2419 */ 2395 */
2420unsigned int kmem_cache_size(struct kmem_cache *s) 2396unsigned int kmem_cache_size(struct kmem_cache *s)
@@ -3660,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
3660 len += sprintf(buf + len, "%7ld ", l->count); 3636 len += sprintf(buf + len, "%7ld ", l->count);
3661 3637
3662 if (l->addr) 3638 if (l->addr)
3663 len += sprint_symbol(buf + len, (unsigned long)l->addr); 3639 len += sprintf(buf + len, "%pS", (void *)l->addr);
3664 else 3640 else
3665 len += sprintf(buf + len, "<not-available>"); 3641 len += sprintf(buf + len, "<not-available>");
3666 3642
@@ -3970,12 +3946,9 @@ SLAB_ATTR(min_partial);
3970 3946
3971static ssize_t ctor_show(struct kmem_cache *s, char *buf) 3947static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3972{ 3948{
3973 if (s->ctor) { 3949 if (!s->ctor)
3974 int n = sprint_symbol(buf, (unsigned long)s->ctor); 3950 return 0;
3975 3951 return sprintf(buf, "%pS\n", s->ctor);
3976 return n + sprintf(buf + n, "\n");
3977 }
3978 return 0;
3979} 3952}
3980SLAB_ATTR_RO(ctor); 3953SLAB_ATTR_RO(ctor);
3981 3954
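
Both slub hunks above lean on the %pS printk extension, which formats a kernel text address as symbol+offset and so replaces the explicit sprint_symbol() calls into a buffer. A minimal, unrelated example of the same idiom:

#include <linux/kernel.h>

static void report_callback(void (*fn)(void))
{
	/* resolves the address of whatever callback was registered */
	pr_info("registered callback: %pS\n", fn);
}
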
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 29d6cbffb283..64b984091edb 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * However, virtual mappings need a page table and TLBs. Many Linux 10 * However, virtual mappings need a page table and TLBs. Many Linux
11 * architectures already map their physical space using 1-1 mappings 11 * architectures already map their physical space using 1-1 mappings
12 * via TLBs. For those arches the virtual memmory map is essentially 12 * via TLBs. For those arches the virtual memory map is essentially
13 * for free if we use the same page size as the 1-1 mappings. In that 13 * for free if we use the same page size as the 1-1 mappings. In that
14 * case the overhead consists of a few additional pages that are 14 * case the overhead consists of a few additional pages that are
15 * allocated to create a view of memory for vmemmap. 15 * allocated to create a view of memory for vmemmap.
diff --git a/mm/sparse.c b/mm/sparse.c
index 95ac219af379..93250207c5cf 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
671static void free_map_bootmem(struct page *page, unsigned long nr_pages) 671static void free_map_bootmem(struct page *page, unsigned long nr_pages)
672{ 672{
673 unsigned long maps_section_nr, removing_section_nr, i; 673 unsigned long maps_section_nr, removing_section_nr, i;
674 int magic; 674 unsigned long magic;
675 675
676 for (i = 0; i < nr_pages; i++, page++) { 676 for (i = 0; i < nr_pages; i++, page++) {
677 magic = atomic_read(&page->_mapcount); 677 magic = (unsigned long) page->lru.next;
678 678
679 BUG_ON(magic == NODE_INFO); 679 BUG_ON(magic == NODE_INFO);
680 680
diff --git a/mm/swap.c b/mm/swap.c
index 3f4854205b16..bbc1ce9f9460 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -56,17 +56,97 @@ static void __page_cache_release(struct page *page)
56 del_page_from_lru(zone, page); 56 del_page_from_lru(zone, page);
57 spin_unlock_irqrestore(&zone->lru_lock, flags); 57 spin_unlock_irqrestore(&zone->lru_lock, flags);
58 } 58 }
59}
60
61static void __put_single_page(struct page *page)
62{
63 __page_cache_release(page);
59 free_hot_cold_page(page, 0); 64 free_hot_cold_page(page, 0);
60} 65}
61 66
62static void put_compound_page(struct page *page) 67static void __put_compound_page(struct page *page)
63{ 68{
64 page = compound_head(page); 69 compound_page_dtor *dtor;
65 if (put_page_testzero(page)) {
66 compound_page_dtor *dtor;
67 70
68 dtor = get_compound_page_dtor(page); 71 __page_cache_release(page);
69 (*dtor)(page); 72 dtor = get_compound_page_dtor(page);
73 (*dtor)(page);
74}
75
76static void put_compound_page(struct page *page)
77{
78 if (unlikely(PageTail(page))) {
79 /* __split_huge_page_refcount can run under us */
80 struct page *page_head = page->first_page;
81 smp_rmb();
82 /*
83 * If PageTail is still set after smp_rmb() we can be sure
84 * that the page->first_page we read wasn't a dangling pointer.
85 * See __split_huge_page_refcount() smp_wmb().
86 */
87 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
88 unsigned long flags;
89 /*
90 * Verify that our page_head wasn't converted
 91 * to a regular page before we got a
92 * reference on it.
93 */
94 if (unlikely(!PageHead(page_head))) {
95 /* PageHead is cleared after PageTail */
96 smp_rmb();
97 VM_BUG_ON(PageTail(page));
98 goto out_put_head;
99 }
100 /*
101 * Only run compound_lock on a valid PageHead,
102 * after having it pinned with
103 * get_page_unless_zero() above.
104 */
105 smp_mb();
106 /* page_head wasn't a dangling pointer */
107 flags = compound_lock_irqsave(page_head);
108 if (unlikely(!PageTail(page))) {
109 /* __split_huge_page_refcount run before us */
110 compound_unlock_irqrestore(page_head, flags);
111 VM_BUG_ON(PageHead(page_head));
112 out_put_head:
113 if (put_page_testzero(page_head))
114 __put_single_page(page_head);
115 out_put_single:
116 if (put_page_testzero(page))
117 __put_single_page(page);
118 return;
119 }
120 VM_BUG_ON(page_head != page->first_page);
121 /*
122 * We can release the refcount taken by
123 * get_page_unless_zero now that
124 * split_huge_page_refcount is blocked on the
125 * compound_lock.
126 */
127 if (put_page_testzero(page_head))
128 VM_BUG_ON(1);
129 /* __split_huge_page_refcount will wait now */
130 VM_BUG_ON(atomic_read(&page->_count) <= 0);
131 atomic_dec(&page->_count);
132 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
133 compound_unlock_irqrestore(page_head, flags);
134 if (put_page_testzero(page_head)) {
135 if (PageHead(page_head))
136 __put_compound_page(page_head);
137 else
138 __put_single_page(page_head);
139 }
140 } else {
141 /* page_head is a dangling pointer */
142 VM_BUG_ON(PageTail(page));
143 goto out_put_single;
144 }
145 } else if (put_page_testzero(page)) {
146 if (PageHead(page))
147 __put_compound_page(page);
148 else
149 __put_single_page(page);
70 } 150 }
71} 151}
72 152
@@ -75,7 +155,7 @@ void put_page(struct page *page)
75 if (unlikely(PageCompound(page))) 155 if (unlikely(PageCompound(page)))
76 put_compound_page(page); 156 put_compound_page(page);
77 else if (put_page_testzero(page)) 157 else if (put_page_testzero(page))
78 __page_cache_release(page); 158 __put_single_page(page);
79} 159}
80EXPORT_SYMBOL(put_page); 160EXPORT_SYMBOL(put_page);
81 161
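
The point of the refcounting dance above is that ordinary users of get_page()/put_page() need no THP awareness: a reference taken on what later turns out to be a tail page is still dropped correctly even while __split_huge_page_refcount() runs concurrently. An illustrative caller (not from this patch):

#include <linux/mm.h>

/* e.g. releasing pages previously pinned with get_user_pages(); the
 * caller does not care whether any of them are THP tail pages. */
static void release_pinned_pages(struct page **pages, int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		put_page(pages[i]);
}
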
@@ -98,15 +178,13 @@ void put_pages_list(struct list_head *pages)
98} 178}
99EXPORT_SYMBOL(put_pages_list); 179EXPORT_SYMBOL(put_pages_list);
100 180
101/* 181static void pagevec_lru_move_fn(struct pagevec *pvec,
102 * pagevec_move_tail() must be called with IRQ disabled. 182 void (*move_fn)(struct page *page, void *arg),
103 * Otherwise this may cause nasty races. 183 void *arg)
104 */
105static void pagevec_move_tail(struct pagevec *pvec)
106{ 184{
107 int i; 185 int i;
108 int pgmoved = 0;
109 struct zone *zone = NULL; 186 struct zone *zone = NULL;
187 unsigned long flags = 0;
110 188
111 for (i = 0; i < pagevec_count(pvec); i++) { 189 for (i = 0; i < pagevec_count(pvec); i++) {
112 struct page *page = pvec->pages[i]; 190 struct page *page = pvec->pages[i];
@@ -114,29 +192,49 @@ static void pagevec_move_tail(struct pagevec *pvec)
114 192
115 if (pagezone != zone) { 193 if (pagezone != zone) {
116 if (zone) 194 if (zone)
117 spin_unlock(&zone->lru_lock); 195 spin_unlock_irqrestore(&zone->lru_lock, flags);
118 zone = pagezone; 196 zone = pagezone;
119 spin_lock(&zone->lru_lock); 197 spin_lock_irqsave(&zone->lru_lock, flags);
120 }
121 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
122 int lru = page_lru_base_type(page);
123 list_move_tail(&page->lru, &zone->lru[lru].list);
124 pgmoved++;
125 } 198 }
199
200 (*move_fn)(page, arg);
126 } 201 }
127 if (zone) 202 if (zone)
128 spin_unlock(&zone->lru_lock); 203 spin_unlock_irqrestore(&zone->lru_lock, flags);
129 __count_vm_events(PGROTATED, pgmoved); 204 release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
130 release_pages(pvec->pages, pvec->nr, pvec->cold);
131 pagevec_reinit(pvec); 205 pagevec_reinit(pvec);
132} 206}
133 207
208static void pagevec_move_tail_fn(struct page *page, void *arg)
209{
210 int *pgmoved = arg;
211 struct zone *zone = page_zone(page);
212
213 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
214 int lru = page_lru_base_type(page);
215 list_move_tail(&page->lru, &zone->lru[lru].list);
216 (*pgmoved)++;
217 }
218}
219
220/*
221 * pagevec_move_tail() must be called with IRQ disabled.
222 * Otherwise this may cause nasty races.
223 */
224static void pagevec_move_tail(struct pagevec *pvec)
225{
226 int pgmoved = 0;
227
228 pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
229 __count_vm_events(PGROTATED, pgmoved);
230}
231
134/* 232/*
135 * Writeback is about to end against a page which has been marked for immediate 233 * Writeback is about to end against a page which has been marked for immediate
136 * reclaim. If it still appears to be reclaimable, move it to the tail of the 234 * reclaim. If it still appears to be reclaimable, move it to the tail of the
137 * inactive list. 235 * inactive list.
138 */ 236 */
139void rotate_reclaimable_page(struct page *page) 237void rotate_reclaimable_page(struct page *page)
140{ 238{
141 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && 239 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
142 !PageUnevictable(page) && PageLRU(page)) { 240 !PageUnevictable(page) && PageLRU(page)) {
@@ -173,27 +271,94 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page,
173} 271}
174 272
175/* 273/*
176 * FIXME: speed this up? 274 * A page will go to active list either by activate_page or putback_lru_page.
 275 * In the activate_page case, the page does not yet have the active bit set.
 276 * It may not be on an LRU list because it was isolated before it got a chance
 277 * to be moved to the active list; the window is small because a pagevec only
 278 * stores a few pages. In that case we do nothing for the page.
 279 * In the putback_lru_page case, the page is not on an LRU list but already
 280 * has the active bit set.
177 */ 281 */
178void activate_page(struct page *page) 282static void __activate_page(struct page *page, void *arg)
179{ 283{
180 struct zone *zone = page_zone(page); 284 struct zone *zone = page_zone(page);
285 int file = page_is_file_cache(page);
286 int lru = page_lru_base_type(page);
287 bool putback = !PageLRU(page);
181 288
182 spin_lock_irq(&zone->lru_lock); 289 /* The page is isolated before it's moved to active list */
183 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 290 if (!PageLRU(page) && !PageActive(page))
184 int file = page_is_file_cache(page); 291 return;
185 int lru = page_lru_base_type(page); 292 if ((PageLRU(page) && PageActive(page)) || PageUnevictable(page))
293 return;
294
295 if (!putback)
186 del_page_from_lru_list(zone, page, lru); 296 del_page_from_lru_list(zone, page, lru);
297 else
298 SetPageLRU(page);
187 299
188 SetPageActive(page); 300 SetPageActive(page);
189 lru += LRU_ACTIVE; 301 lru += LRU_ACTIVE;
190 add_page_to_lru_list(zone, page, lru); 302 add_page_to_lru_list(zone, page, lru);
191 __count_vm_event(PGACTIVATE);
192 303
193 update_page_reclaim_stat(zone, page, file, 1); 304 if (putback)
305 return;
306 __count_vm_event(PGACTIVATE);
307 update_page_reclaim_stat(zone, page, file, 1);
308}
309
310#ifdef CONFIG_SMP
311static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
312
313static void activate_page_drain(int cpu)
314{
315 struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
316
317 if (pagevec_count(pvec))
318 pagevec_lru_move_fn(pvec, __activate_page, NULL);
319}
320
321void activate_page(struct page *page)
322{
323 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
324 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
325
326 page_cache_get(page);
327 if (!pagevec_add(pvec, page))
328 pagevec_lru_move_fn(pvec, __activate_page, NULL);
329 put_cpu_var(activate_page_pvecs);
330 }
331}
332
333/* Caller should hold zone->lru_lock */
334int putback_active_lru_page(struct zone *zone, struct page *page)
335{
336 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
337
338 if (!pagevec_add(pvec, page)) {
339 spin_unlock_irq(&zone->lru_lock);
340 pagevec_lru_move_fn(pvec, __activate_page, NULL);
341 spin_lock_irq(&zone->lru_lock);
194 } 342 }
343 put_cpu_var(activate_page_pvecs);
344 return 1;
345}
346
347#else
348static inline void activate_page_drain(int cpu)
349{
350}
351
352void activate_page(struct page *page)
353{
354 struct zone *zone = page_zone(page);
355
356 spin_lock_irq(&zone->lru_lock);
357 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page))
358 __activate_page(page, NULL);
195 spin_unlock_irq(&zone->lru_lock); 359 spin_unlock_irq(&zone->lru_lock);
196} 360}
361#endif
197 362
198/* 363/*
199 * Mark a page as having seen activity. 364 * Mark a page as having seen activity.
@@ -292,6 +457,7 @@ static void drain_cpu_pagevecs(int cpu)
292 pagevec_move_tail(pvec); 457 pagevec_move_tail(pvec);
293 local_irq_restore(flags); 458 local_irq_restore(flags);
294 } 459 }
460 activate_page_drain(cpu);
295} 461}
296 462
297void lru_add_drain(void) 463void lru_add_drain(void)
@@ -399,44 +565,70 @@ void __pagevec_release(struct pagevec *pvec)
399 565
400EXPORT_SYMBOL(__pagevec_release); 566EXPORT_SYMBOL(__pagevec_release);
401 567
568/* used by __split_huge_page_refcount() */
569void lru_add_page_tail(struct zone* zone,
570 struct page *page, struct page *page_tail)
571{
572 int active;
573 enum lru_list lru;
574 const int file = 0;
575 struct list_head *head;
576
577 VM_BUG_ON(!PageHead(page));
578 VM_BUG_ON(PageCompound(page_tail));
579 VM_BUG_ON(PageLRU(page_tail));
580 VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
581
582 SetPageLRU(page_tail);
583
584 if (page_evictable(page_tail, NULL)) {
585 if (PageActive(page)) {
586 SetPageActive(page_tail);
587 active = 1;
588 lru = LRU_ACTIVE_ANON;
589 } else {
590 active = 0;
591 lru = LRU_INACTIVE_ANON;
592 }
593 update_page_reclaim_stat(zone, page_tail, file, active);
594 if (likely(PageLRU(page)))
595 head = page->lru.prev;
596 else
597 head = &zone->lru[lru].list;
598 __add_page_to_lru_list(zone, page_tail, lru, head);
599 } else {
600 SetPageUnevictable(page_tail);
601 add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
602 }
603}
604
605static void ____pagevec_lru_add_fn(struct page *page, void *arg)
606{
607 enum lru_list lru = (enum lru_list)arg;
608 struct zone *zone = page_zone(page);
609 int file = is_file_lru(lru);
610 int active = is_active_lru(lru);
611
612 VM_BUG_ON(PageActive(page));
613 VM_BUG_ON(PageUnevictable(page));
614 VM_BUG_ON(PageLRU(page));
615
616 SetPageLRU(page);
617 if (active)
618 SetPageActive(page);
619 update_page_reclaim_stat(zone, page, file, active);
620 add_page_to_lru_list(zone, page, lru);
621}
622
402/* 623/*
403 * Add the passed pages to the LRU, then drop the caller's refcount 624 * Add the passed pages to the LRU, then drop the caller's refcount
404 * on them. Reinitialises the caller's pagevec. 625 * on them. Reinitialises the caller's pagevec.
405 */ 626 */
406void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 627void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
407{ 628{
408 int i;
409 struct zone *zone = NULL;
410
411 VM_BUG_ON(is_unevictable_lru(lru)); 629 VM_BUG_ON(is_unevictable_lru(lru));
412 630
413 for (i = 0; i < pagevec_count(pvec); i++) { 631 pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru);
414 struct page *page = pvec->pages[i];
415 struct zone *pagezone = page_zone(page);
416 int file;
417 int active;
418
419 if (pagezone != zone) {
420 if (zone)
421 spin_unlock_irq(&zone->lru_lock);
422 zone = pagezone;
423 spin_lock_irq(&zone->lru_lock);
424 }
425 VM_BUG_ON(PageActive(page));
426 VM_BUG_ON(PageUnevictable(page));
427 VM_BUG_ON(PageLRU(page));
428 SetPageLRU(page);
429 active = is_active_lru(lru);
430 file = is_file_lru(lru);
431 if (active)
432 SetPageActive(page);
433 update_page_reclaim_stat(zone, page, file, active);
434 add_page_to_lru_list(zone, page, lru);
435 }
436 if (zone)
437 spin_unlock_irq(&zone->lru_lock);
438 release_pages(pvec->pages, pvec->nr, pvec->cold);
439 pagevec_reinit(pvec);
440} 632}
441 633
442EXPORT_SYMBOL(____pagevec_lru_add); 634EXPORT_SYMBOL(____pagevec_lru_add);
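
The new pagevec_lru_move_fn() factors out the "walk a pagevec, taking each zone's lru_lock only when the zone changes" boilerplate, so the per-page work is supplied as a callback; pagevec_move_tail(), activate_page_drain() and ____pagevec_lru_add() above are all expressed through it. A sketch of what a further batched LRU operation would look like (example_move_fn/example_drain are invented names, and such a helper would have to live in mm/swap.c since pagevec_lru_move_fn() is static there):

static void example_move_fn(struct page *page, void *arg)
{
	int *moved = arg;

	/* zone->lru_lock for this page is already held by the caller */
	if (PageLRU(page) && !PageUnevictable(page)) {
		struct zone *zone = page_zone(page);

		list_move(&page->lru, &zone->lru[page_lru(page)].list);
		(*moved)++;
	}
}

static void example_drain(struct pagevec *pvec)
{
	int moved = 0;

	pagevec_lru_move_fn(pvec, example_move_fn, &moved);
}
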
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e10f5833167f..5c8cfabbc9bc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -157,6 +157,12 @@ int add_to_swap(struct page *page)
157 if (!entry.val) 157 if (!entry.val)
158 return 0; 158 return 0;
159 159
160 if (unlikely(PageTransHuge(page)))
161 if (unlikely(split_huge_page(page))) {
162 swapcache_free(entry, NULL);
163 return 0;
164 }
165
160 /* 166 /*
161 * Radix-tree node allocations from PF_MEMALLOC contexts could 167 * Radix-tree node allocations from PF_MEMALLOC contexts could
162 * completely exhaust the page allocator. __GFP_NOMEMALLOC 168 * completely exhaust the page allocator. __GFP_NOMEMALLOC
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 67ddaaf98c74..07a458d72fa8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -964,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
964 pmd = pmd_offset(pud, addr); 964 pmd = pmd_offset(pud, addr);
965 do { 965 do {
966 next = pmd_addr_end(addr, end); 966 next = pmd_addr_end(addr, end);
967 if (unlikely(pmd_trans_huge(*pmd)))
968 continue;
967 if (pmd_none_or_clear_bad(pmd)) 969 if (pmd_none_or_clear_bad(pmd))
968 continue; 970 continue;
969 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 971 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
@@ -1677,7 +1679,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1677 if (S_ISBLK(inode->i_mode)) { 1679 if (S_ISBLK(inode->i_mode)) {
1678 struct block_device *bdev = I_BDEV(inode); 1680 struct block_device *bdev = I_BDEV(inode);
1679 set_blocksize(bdev, p->old_block_size); 1681 set_blocksize(bdev, p->old_block_size);
1680 bd_release(bdev); 1682 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1681 } else { 1683 } else {
1682 mutex_lock(&inode->i_mutex); 1684 mutex_lock(&inode->i_mutex);
1683 inode->i_flags &= ~S_SWAPFILE; 1685 inode->i_flags &= ~S_SWAPFILE;
@@ -1939,7 +1941,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1939 error = -EINVAL; 1941 error = -EINVAL;
1940 if (S_ISBLK(inode->i_mode)) { 1942 if (S_ISBLK(inode->i_mode)) {
1941 bdev = I_BDEV(inode); 1943 bdev = I_BDEV(inode);
1942 error = bd_claim(bdev, sys_swapon); 1944 error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1945 sys_swapon);
1943 if (error < 0) { 1946 if (error < 0) {
1944 bdev = NULL; 1947 bdev = NULL;
1945 error = -EINVAL; 1948 error = -EINVAL;
@@ -2136,7 +2139,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2136bad_swap: 2139bad_swap:
2137 if (bdev) { 2140 if (bdev) {
2138 set_blocksize(bdev, p->old_block_size); 2141 set_blocksize(bdev, p->old_block_size);
2139 bd_release(bdev); 2142 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2140 } 2143 }
2141 destroy_swap_extents(p); 2144 destroy_swap_extents(p);
2142 swap_cgroup_swapoff(type); 2145 swap_cgroup_swapoff(type);
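
The swapon/swapoff hunks replace the old bd_claim()/bd_release() pair with claiming the block device exclusively at open time. The shape of the new API as used above, wrapped in two invented helpers (the holder cookie passed to blkdev_get() is what identifies the exclusive claimant):

#include <linux/fs.h>
#include <linux/blkdev.h>

static int claim_bdev_exclusive(struct block_device *bdev, void *holder)
{
	return blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, holder);
}

static void release_bdev_exclusive(struct block_device *bdev)
{
	/* FMODE_EXCL must be passed again so the exclusive claim is dropped */
	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
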
diff --git a/mm/util.c b/mm/util.c
index 73dac81e9f78..f126975ef23e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -186,27 +186,6 @@ void kzfree(const void *p)
186} 186}
187EXPORT_SYMBOL(kzfree); 187EXPORT_SYMBOL(kzfree);
188 188
189int kern_ptr_validate(const void *ptr, unsigned long size)
190{
191 unsigned long addr = (unsigned long)ptr;
192 unsigned long min_addr = PAGE_OFFSET;
193 unsigned long align_mask = sizeof(void *) - 1;
194
195 if (unlikely(addr < min_addr))
196 goto out;
197 if (unlikely(addr > (unsigned long)high_memory - size))
198 goto out;
199 if (unlikely(addr & align_mask))
200 goto out;
201 if (unlikely(!kern_addr_valid(addr)))
202 goto out;
203 if (unlikely(!kern_addr_valid(addr + size - 1)))
204 goto out;
205 return 1;
206out:
207 return 0;
208}
209
210/* 189/*
211 * strndup_user - duplicate an existing string from user space 190 * strndup_user - duplicate an existing string from user space
212 * @s: The string to duplicate 191 * @s: The string to duplicate
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 816f074fb4e1..f9b166732e70 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -748,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
748 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, 748 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
749 VMALLOC_START, VMALLOC_END, 749 VMALLOC_START, VMALLOC_END,
750 node, gfp_mask); 750 node, gfp_mask);
751 if (unlikely(IS_ERR(va))) { 751 if (IS_ERR(va)) {
752 kfree(vb); 752 kfree(vb);
753 return ERR_CAST(va); 753 return ERR_CAST(va);
754 } 754 }
@@ -1316,13 +1316,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1316 -1, GFP_KERNEL, caller); 1316 -1, GFP_KERNEL, caller);
1317} 1317}
1318 1318
1319struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1320 int node, gfp_t gfp_mask)
1321{
1322 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1323 node, gfp_mask, __builtin_return_address(0));
1324}
1325
1326static struct vm_struct *find_vm_area(const void *addr) 1319static struct vm_struct *find_vm_area(const void *addr)
1327{ 1320{
1328 struct vmap_area *va; 1321 struct vmap_area *va;
@@ -1538,25 +1531,12 @@ fail:
1538 return NULL; 1531 return NULL;
1539} 1532}
1540 1533
1541void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1542{
1543 void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
1544 __builtin_return_address(0));
1545
1546 /*
1547 * A ref_count = 3 is needed because the vm_struct and vmap_area
1548 * structures allocated in the __get_vm_area_node() function contain
1549 * references to the virtual address of the vmalloc'ed block.
1550 */
1551 kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
1552
1553 return addr;
1554}
1555
1556/** 1534/**
1557 * __vmalloc_node - allocate virtually contiguous memory 1535 * __vmalloc_node_range - allocate virtually contiguous memory
1558 * @size: allocation size 1536 * @size: allocation size
1559 * @align: desired alignment 1537 * @align: desired alignment
1538 * @start: vm area range start
1539 * @end: vm area range end
1560 * @gfp_mask: flags for the page level allocator 1540 * @gfp_mask: flags for the page level allocator
1561 * @prot: protection mask for the allocated pages 1541 * @prot: protection mask for the allocated pages
1562 * @node: node to use for allocation or -1 1542 * @node: node to use for allocation or -1
@@ -1566,9 +1546,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1566 * allocator with @gfp_mask flags. Map them into contiguous 1546 * allocator with @gfp_mask flags. Map them into contiguous
1567 * kernel virtual space, using a pagetable protection of @prot. 1547 * kernel virtual space, using a pagetable protection of @prot.
1568 */ 1548 */
1569static void *__vmalloc_node(unsigned long size, unsigned long align, 1549void *__vmalloc_node_range(unsigned long size, unsigned long align,
1570 gfp_t gfp_mask, pgprot_t prot, 1550 unsigned long start, unsigned long end, gfp_t gfp_mask,
1571 int node, void *caller) 1551 pgprot_t prot, int node, void *caller)
1572{ 1552{
1573 struct vm_struct *area; 1553 struct vm_struct *area;
1574 void *addr; 1554 void *addr;
@@ -1578,8 +1558,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1578 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1558 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1579 return NULL; 1559 return NULL;
1580 1560
1581 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, 1561 area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
1582 VMALLOC_END, node, gfp_mask, caller); 1562 gfp_mask, caller);
1583 1563
1584 if (!area) 1564 if (!area)
1585 return NULL; 1565 return NULL;
@@ -1596,6 +1576,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1596 return addr; 1576 return addr;
1597} 1577}
1598 1578
1579/**
1580 * __vmalloc_node - allocate virtually contiguous memory
1581 * @size: allocation size
1582 * @align: desired alignment
1583 * @gfp_mask: flags for the page level allocator
1584 * @prot: protection mask for the allocated pages
1585 * @node: node to use for allocation or -1
1586 * @caller: caller's return address
1587 *
1588 * Allocate enough pages to cover @size from the page level
1589 * allocator with @gfp_mask flags. Map them into contiguous
1590 * kernel virtual space, using a pagetable protection of @prot.
1591 */
1592static void *__vmalloc_node(unsigned long size, unsigned long align,
1593 gfp_t gfp_mask, pgprot_t prot,
1594 int node, void *caller)
1595{
1596 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1597 gfp_mask, prot, node, caller);
1598}
1599
1599void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1600void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1600{ 1601{
1601 return __vmalloc_node(size, 1, gfp_mask, prot, -1, 1602 return __vmalloc_node(size, 1, gfp_mask, prot, -1,
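With the hunks above, __vmalloc_node() is reduced to a wrapper around the new __vmalloc_node_range(), which takes the virtual range as explicit start/end parameters. As a hedged sketch, this is how an architecture's module allocator might target its own VA window with the new helper (MODULES_VADDR/MODULES_END and the exec protection are assumptions about the arch, not part of this patch):

#include <linux/vmalloc.h>
#include <linux/gfp.h>
#include <asm/pgtable.h>

/* Sketch: back 'size' bytes with pages and map them into the module
 * address range rather than the generic vmalloc range. */
static void *alloc_in_module_range(unsigned long size)
{
        return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
                                    GFP_KERNEL, PAGE_KERNEL_EXEC, -1,
                                    __builtin_return_address(0));
}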
@@ -2204,17 +2205,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2204 * @sizes: array containing size of each area 2205 * @sizes: array containing size of each area
2205 * @nr_vms: the number of areas to allocate 2206 * @nr_vms: the number of areas to allocate
2206 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 2207 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
2207 * @gfp_mask: allocation mask
2208 * 2208 *
2209 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 2209 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
2210 * vm_structs on success, %NULL on failure 2210 * vm_structs on success, %NULL on failure
2211 * 2211 *
2212 * Percpu allocator wants to use congruent vm areas so that it can 2212 * Percpu allocator wants to use congruent vm areas so that it can
2213 * maintain the offsets among percpu areas. This function allocates 2213 * maintain the offsets among percpu areas. This function allocates
2214 * congruent vmalloc areas for it. These areas tend to be scattered 2214 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
2215 * pretty far, distance between two areas easily going up to 2215 * be scattered pretty far, distance between two areas easily going up
2216 * gigabytes. To avoid interacting with regular vmallocs, these areas 2216 * to gigabytes. To avoid interacting with regular vmallocs, these
2217 * are allocated from top. 2217 * areas are allocated from top.
2218 * 2218 *
2219 * Despite its complicated look, this allocator is rather simple. It 2219 * Despite its complicated look, this allocator is rather simple. It
2220 * does everything top-down and scans areas from the end looking for 2220 * does everything top-down and scans areas from the end looking for
@@ -2225,7 +2225,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2225 */ 2225 */
2226struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 2226struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2227 const size_t *sizes, int nr_vms, 2227 const size_t *sizes, int nr_vms,
2228 size_t align, gfp_t gfp_mask) 2228 size_t align)
2229{ 2229{
2230 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 2230 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
2231 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 2231 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2235,8 +2235,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2235 unsigned long base, start, end, last_end; 2235 unsigned long base, start, end, last_end;
2236 bool purged = false; 2236 bool purged = false;
2237 2237
2238 gfp_mask &= GFP_RECLAIM_MASK;
2239
2240 /* verify parameters and allocate data structures */ 2238 /* verify parameters and allocate data structures */
2241 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); 2239 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
2242 for (last_area = 0, area = 0; area < nr_vms; area++) { 2240 for (last_area = 0, area = 0; area < nr_vms; area++) {
@@ -2269,14 +2267,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2269 return NULL; 2267 return NULL;
2270 } 2268 }
2271 2269
2272 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); 2270 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
2273 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); 2271 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
2274 if (!vas || !vms) 2272 if (!vas || !vms)
2275 goto err_free; 2273 goto err_free;
2276 2274
2277 for (area = 0; area < nr_vms; area++) { 2275 for (area = 0; area < nr_vms; area++) {
2278 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); 2276 vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
2279 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); 2277 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
2280 if (!vas[area] || !vms[area]) 2278 if (!vas[area] || !vms[area])
2281 goto err_free; 2279 goto err_free;
2282 } 2280 }
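The pcpu_get_vm_areas() hunks drop the gfp_mask argument entirely; the bookkeeping structures are now always allocated with GFP_KERNEL. A hedged sketch of a call under the new four-argument signature (the offsets and sizes are made up for illustration; the real caller is the percpu first-chunk setup code):

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Sketch: request two congruent areas, 64K each, 1M apart, page aligned. */
static struct vm_struct **grab_congruent_areas(void)
{
        static const unsigned long offsets[] = { 0, 1UL << 20 };
        static const size_t sizes[] = { 1UL << 16, 1UL << 16 };

        return pcpu_get_vm_areas(offsets, sizes, 2, PAGE_SIZE);
}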
@@ -2457,13 +2455,8 @@ static int s_show(struct seq_file *m, void *p)
2457 seq_printf(m, "0x%p-0x%p %7ld", 2455 seq_printf(m, "0x%p-0x%p %7ld",
2458 v->addr, v->addr + v->size, v->size); 2456 v->addr, v->addr + v->size, v->size);
2459 2457
2460 if (v->caller) { 2458 if (v->caller)
2461 char buff[KSYM_SYMBOL_LEN]; 2459 seq_printf(m, " %pS", v->caller);
2462
2463 seq_putc(m, ' ');
2464 sprint_symbol(buff, (unsigned long)v->caller);
2465 seq_puts(m, buff);
2466 }
2467 2460
2468 if (v->nr_pages) 2461 if (v->nr_pages)
2469 seq_printf(m, " pages=%d", v->nr_pages); 2462 seq_printf(m, " pages=%d", v->nr_pages);
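The s_show() change above relies on the %pS vsprintf extension, which resolves a text address to symbol+offset and so replaces the manual sprint_symbol() into a stack buffer. A generic illustration of the same format specifier (not code from this patch):

#include <linux/kernel.h>

/* Sketch: print the caller of this function as symbol+offset/size,
 * e.g. "some_function+0x1a/0x90". */
static void report_caller(void)
{
        printk(KERN_DEBUG "called from %pS\n", __builtin_return_address(0));
}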
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9ca587c69274..99999a9b2b0b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
32#include <linux/topology.h> 32#include <linux/topology.h>
33#include <linux/cpu.h> 33#include <linux/cpu.h>
34#include <linux/cpuset.h> 34#include <linux/cpuset.h>
35#include <linux/compaction.h>
35#include <linux/notifier.h> 36#include <linux/notifier.h>
36#include <linux/rwsem.h> 37#include <linux/rwsem.h>
37#include <linux/delay.h> 38#include <linux/delay.h>
@@ -40,6 +41,7 @@
40#include <linux/memcontrol.h> 41#include <linux/memcontrol.h>
41#include <linux/delayacct.h> 42#include <linux/delayacct.h>
42#include <linux/sysctl.h> 43#include <linux/sysctl.h>
44#include <linux/compaction.h>
43 45
44#include <asm/tlbflush.h> 46#include <asm/tlbflush.h>
45#include <asm/div64.h> 47#include <asm/div64.h>
@@ -51,11 +53,23 @@
51#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h> 54#include <trace/events/vmscan.h>
53 55
54enum lumpy_mode { 56/*
55 LUMPY_MODE_NONE, 57 * reclaim_mode determines how the inactive list is shrunk
56 LUMPY_MODE_ASYNC, 58 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
57 LUMPY_MODE_SYNC, 59 * RECLAIM_MODE_ASYNC: Do not block
58}; 60 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
61 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
62 * page from the LRU and reclaim all pages within a
63 * naturally aligned range
64 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
65 * order-0 pages and then compact the zone
66 */
67typedef unsigned __bitwise__ reclaim_mode_t;
68#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
69#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
70#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
71#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
72#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
59 73
60struct scan_control { 74struct scan_control {
61 /* Incremented by the number of inactive pages that were scanned */ 75 /* Incremented by the number of inactive pages that were scanned */
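The reclaim_mode_t flags above replace the three-state lumpy_mode enum with a bitmask, so "how to reclaim" (single, lumpy, or compaction) and "whether to block" (async vs. sync) can be combined independently; __bitwise__/__force are sparse annotations only and have no runtime effect. A small user-space sketch of the same flag arithmetic (names shortened, purely illustrative):

#include <assert.h>

#define MODE_SINGLE     0x01u
#define MODE_ASYNC      0x02u
#define MODE_SYNC       0x04u
#define MODE_LUMPY      0x08u
#define MODE_COMPACTION 0x10u

int main(void)
{
        unsigned mode = MODE_COMPACTION;   /* set_reclaim_mode() with COMPACTION_BUILD */

        mode |= MODE_SYNC;                 /* costly order: allow blocking */
        assert(mode & MODE_COMPACTION);
        assert(!(mode & MODE_LUMPY));      /* lumpy and compaction are alternatives */

        mode = MODE_SINGLE | MODE_ASYNC;   /* reset_reclaim_mode() */
        assert(!(mode & MODE_SYNC));
        return 0;
}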
@@ -88,7 +102,7 @@ struct scan_control {
88 * Intend to reclaim enough continuous memory rather than reclaim 102 * Intend to reclaim enough continuous memory rather than reclaim
89 * enough amount of memory. i.e, mode for high order allocation. 103 * enough amount of memory. i.e, mode for high order allocation.
90 */ 104 */
91 enum lumpy_mode lumpy_reclaim_mode; 105 reclaim_mode_t reclaim_mode;
92 106
93 /* Which cgroup do we reclaim from */ 107 /* Which cgroup do we reclaim from */
94 struct mem_cgroup *mem_cgroup; 108 struct mem_cgroup *mem_cgroup;
@@ -271,34 +285,37 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
271 return ret; 285 return ret;
272} 286}
273 287
274static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, 288static void set_reclaim_mode(int priority, struct scan_control *sc,
275 bool sync) 289 bool sync)
276{ 290{
277 enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; 291 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
278 292
279 /* 293 /*
280 * Some reclaim have alredy been failed. No worth to try synchronous 294 * Initially assume we are entering either lumpy reclaim or
 281 * lumpy reclaim. 295 * reclaim/compaction. Depending on the order, we will either set the
296 * sync mode or just reclaim order-0 pages later.
282 */ 297 */
283 if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) 298 if (COMPACTION_BUILD)
284 return; 299 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
300 else
301 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
285 302
286 /* 303 /*
287 * If we need a large contiguous chunk of memory, or have 304 * Avoid using lumpy reclaim or reclaim/compaction if possible by
 288 * trouble getting a small set of contiguous pages, we 305 * restricting when it is set to either costly allocations or when
289 * will reclaim both active and inactive pages. 306 * under memory pressure
290 */ 307 */
291 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 308 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
292 sc->lumpy_reclaim_mode = mode; 309 sc->reclaim_mode |= syncmode;
293 else if (sc->order && priority < DEF_PRIORITY - 2) 310 else if (sc->order && priority < DEF_PRIORITY - 2)
294 sc->lumpy_reclaim_mode = mode; 311 sc->reclaim_mode |= syncmode;
295 else 312 else
296 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; 313 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
297} 314}
298 315
299static void disable_lumpy_reclaim_mode(struct scan_control *sc) 316static void reset_reclaim_mode(struct scan_control *sc)
300{ 317{
301 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; 318 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
302} 319}
303 320
304static inline int is_page_cache_freeable(struct page *page) 321static inline int is_page_cache_freeable(struct page *page)
@@ -429,7 +446,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
429 * first attempt to free a range of pages fails. 446 * first attempt to free a range of pages fails.
430 */ 447 */
431 if (PageWriteback(page) && 448 if (PageWriteback(page) &&
432 sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) 449 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
433 wait_on_page_writeback(page); 450 wait_on_page_writeback(page);
434 451
435 if (!PageWriteback(page)) { 452 if (!PageWriteback(page)) {
@@ -437,7 +454,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
437 ClearPageReclaim(page); 454 ClearPageReclaim(page);
438 } 455 }
439 trace_mm_vmscan_writepage(page, 456 trace_mm_vmscan_writepage(page,
440 trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); 457 trace_reclaim_flags(page, sc->reclaim_mode));
441 inc_zone_page_state(page, NR_VMSCAN_WRITE); 458 inc_zone_page_state(page, NR_VMSCAN_WRITE);
442 return PAGE_SUCCESS; 459 return PAGE_SUCCESS;
443 } 460 }
@@ -622,7 +639,7 @@ static enum page_references page_check_references(struct page *page,
622 referenced_page = TestClearPageReferenced(page); 639 referenced_page = TestClearPageReferenced(page);
623 640
624 /* Lumpy reclaim - ignore references */ 641 /* Lumpy reclaim - ignore references */
625 if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) 642 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
626 return PAGEREF_RECLAIM; 643 return PAGEREF_RECLAIM;
627 644
628 /* 645 /*
@@ -739,7 +756,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
739 * for any page for which writeback has already 756 * for any page for which writeback has already
740 * started. 757 * started.
741 */ 758 */
742 if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && 759 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
743 may_enter_fs) 760 may_enter_fs)
744 wait_on_page_writeback(page); 761 wait_on_page_writeback(page);
745 else { 762 else {
@@ -895,7 +912,7 @@ cull_mlocked:
895 try_to_free_swap(page); 912 try_to_free_swap(page);
896 unlock_page(page); 913 unlock_page(page);
897 putback_lru_page(page); 914 putback_lru_page(page);
898 disable_lumpy_reclaim_mode(sc); 915 reset_reclaim_mode(sc);
899 continue; 916 continue;
900 917
901activate_locked: 918activate_locked:
@@ -908,7 +925,7 @@ activate_locked:
908keep_locked: 925keep_locked:
909 unlock_page(page); 926 unlock_page(page);
910keep: 927keep:
911 disable_lumpy_reclaim_mode(sc); 928 reset_reclaim_mode(sc);
912keep_lumpy: 929keep_lumpy:
913 list_add(&page->lru, &ret_pages); 930 list_add(&page->lru, &ret_pages);
914 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 931 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
@@ -1028,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1028 case 0: 1045 case 0:
1029 list_move(&page->lru, dst); 1046 list_move(&page->lru, dst);
1030 mem_cgroup_del_lru(page); 1047 mem_cgroup_del_lru(page);
1031 nr_taken++; 1048 nr_taken += hpage_nr_pages(page);
1032 break; 1049 break;
1033 1050
1034 case -EBUSY: 1051 case -EBUSY:
@@ -1086,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1086 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1103 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1087 list_move(&cursor_page->lru, dst); 1104 list_move(&cursor_page->lru, dst);
1088 mem_cgroup_del_lru(cursor_page); 1105 mem_cgroup_del_lru(cursor_page);
1089 nr_taken++; 1106 nr_taken += hpage_nr_pages(page);
1090 nr_lumpy_taken++; 1107 nr_lumpy_taken++;
1091 if (PageDirty(cursor_page)) 1108 if (PageDirty(cursor_page))
1092 nr_lumpy_dirty++; 1109 nr_lumpy_dirty++;
@@ -1141,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1141 struct page *page; 1158 struct page *page;
1142 1159
1143 list_for_each_entry(page, page_list, lru) { 1160 list_for_each_entry(page, page_list, lru) {
1161 int numpages = hpage_nr_pages(page);
1144 lru = page_lru_base_type(page); 1162 lru = page_lru_base_type(page);
1145 if (PageActive(page)) { 1163 if (PageActive(page)) {
1146 lru += LRU_ACTIVE; 1164 lru += LRU_ACTIVE;
1147 ClearPageActive(page); 1165 ClearPageActive(page);
1148 nr_active++; 1166 nr_active += numpages;
1149 } 1167 }
1150 if (count) 1168 if (count)
1151 count[lru]++; 1169 count[lru] += numpages;
1152 } 1170 }
1153 1171
1154 return nr_active; 1172 return nr_active;
@@ -1253,13 +1271,16 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1253 spin_lock_irq(&zone->lru_lock); 1271 spin_lock_irq(&zone->lru_lock);
1254 continue; 1272 continue;
1255 } 1273 }
1256 SetPageLRU(page);
1257 lru = page_lru(page); 1274 lru = page_lru(page);
1258 add_page_to_lru_list(zone, page, lru);
1259 if (is_active_lru(lru)) { 1275 if (is_active_lru(lru)) {
1260 int file = is_file_lru(lru); 1276 int file = is_file_lru(lru);
1261 reclaim_stat->recent_rotated[file]++; 1277 int numpages = hpage_nr_pages(page);
1278 reclaim_stat->recent_rotated[file] += numpages;
1279 if (putback_active_lru_page(zone, page))
1280 continue;
1262 } 1281 }
1282 SetPageLRU(page);
1283 add_page_to_lru_list(zone, page, lru);
1263 if (!pagevec_add(&pvec, page)) { 1284 if (!pagevec_add(&pvec, page)) {
1264 spin_unlock_irq(&zone->lru_lock); 1285 spin_unlock_irq(&zone->lru_lock);
1265 __pagevec_release(&pvec); 1286 __pagevec_release(&pvec);
@@ -1324,7 +1345,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1324 return false; 1345 return false;
1325 1346
1326 /* Only stall on lumpy reclaim */ 1347 /* Only stall on lumpy reclaim */
1327 if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) 1348 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1328 return false; 1349 return false;
1329 1350
 1330 /* If we have reclaimed everything on the isolated list, no stall */ 1351
@@ -1368,15 +1389,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1368 return SWAP_CLUSTER_MAX; 1389 return SWAP_CLUSTER_MAX;
1369 } 1390 }
1370 1391
1371 set_lumpy_reclaim_mode(priority, sc, false); 1392 set_reclaim_mode(priority, sc, false);
1372 lru_add_drain(); 1393 lru_add_drain();
1373 spin_lock_irq(&zone->lru_lock); 1394 spin_lock_irq(&zone->lru_lock);
1374 1395
1375 if (scanning_global_lru(sc)) { 1396 if (scanning_global_lru(sc)) {
1376 nr_taken = isolate_pages_global(nr_to_scan, 1397 nr_taken = isolate_pages_global(nr_to_scan,
1377 &page_list, &nr_scanned, sc->order, 1398 &page_list, &nr_scanned, sc->order,
1378 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? 1399 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1379 ISOLATE_INACTIVE : ISOLATE_BOTH, 1400 ISOLATE_BOTH : ISOLATE_INACTIVE,
1380 zone, 0, file); 1401 zone, 0, file);
1381 zone->pages_scanned += nr_scanned; 1402 zone->pages_scanned += nr_scanned;
1382 if (current_is_kswapd()) 1403 if (current_is_kswapd())
@@ -1388,8 +1409,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1388 } else { 1409 } else {
1389 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1410 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1390 &page_list, &nr_scanned, sc->order, 1411 &page_list, &nr_scanned, sc->order,
1391 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? 1412 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1392 ISOLATE_INACTIVE : ISOLATE_BOTH, 1413 ISOLATE_BOTH : ISOLATE_INACTIVE,
1393 zone, sc->mem_cgroup, 1414 zone, sc->mem_cgroup,
1394 0, file); 1415 0, file);
1395 /* 1416 /*
@@ -1411,7 +1432,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1411 1432
 1412 /* Check if we should synchronously wait for writeback */ 1433
1413 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1434 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1414 set_lumpy_reclaim_mode(priority, sc, true); 1435 set_reclaim_mode(priority, sc, true);
1415 nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1436 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1416 } 1437 }
1417 1438
@@ -1426,7 +1447,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1426 zone_idx(zone), 1447 zone_idx(zone),
1427 nr_scanned, nr_reclaimed, 1448 nr_scanned, nr_reclaimed,
1428 priority, 1449 priority,
1429 trace_shrink_flags(file, sc->lumpy_reclaim_mode)); 1450 trace_shrink_flags(file, sc->reclaim_mode));
1430 return nr_reclaimed; 1451 return nr_reclaimed;
1431} 1452}
1432 1453
@@ -1466,7 +1487,7 @@ static void move_active_pages_to_lru(struct zone *zone,
1466 1487
1467 list_move(&page->lru, &zone->lru[lru].list); 1488 list_move(&page->lru, &zone->lru[lru].list);
1468 mem_cgroup_add_lru_list(page, lru); 1489 mem_cgroup_add_lru_list(page, lru);
1469 pgmoved++; 1490 pgmoved += hpage_nr_pages(page);
1470 1491
1471 if (!pagevec_add(&pvec, page) || list_empty(list)) { 1492 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1472 spin_unlock_irq(&zone->lru_lock); 1493 spin_unlock_irq(&zone->lru_lock);
@@ -1534,7 +1555,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1534 } 1555 }
1535 1556
1536 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1557 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1537 nr_rotated++; 1558 nr_rotated += hpage_nr_pages(page);
1538 /* 1559 /*
1539 * Identify referenced, file-backed active pages and 1560 * Identify referenced, file-backed active pages and
1540 * give them one more trip around the active list. So 1561 * give them one more trip around the active list. So
@@ -1805,6 +1826,57 @@ out:
1805} 1826}
1806 1827
1807/* 1828/*
1829 * Reclaim/compaction depends on a number of pages being freed. To avoid
1830 * disruption to the system, a small number of order-0 pages continue to be
1831 * rotated and reclaimed in the normal fashion. However, by the time we get
1832 * back to the allocator and call try_to_compact_zone(), we ensure that
1833 * there are enough free pages for it to be likely successful
1834 */
1835static inline bool should_continue_reclaim(struct zone *zone,
1836 unsigned long nr_reclaimed,
1837 unsigned long nr_scanned,
1838 struct scan_control *sc)
1839{
1840 unsigned long pages_for_compaction;
1841 unsigned long inactive_lru_pages;
1842
1843 /* If not in reclaim/compaction mode, stop */
1844 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1845 return false;
1846
1847 /*
1848 * If we failed to reclaim and have scanned the full list, stop.
1849 * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
 1850 * faster but obviously would be less likely to satisfy the
 1851 * allocation. If this is desirable, use __GFP_REPEAT to decide
 1852 * whether both reclaimed and scanned should be checked or just
 1853 * reclaimed
1854 */
1855 if (!nr_reclaimed && !nr_scanned)
1856 return false;
1857
1858 /*
1859 * If we have not reclaimed enough pages for compaction and the
1860 * inactive lists are large enough, continue reclaiming
1861 */
1862 pages_for_compaction = (2UL << sc->order);
1863 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
1864 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1865 if (sc->nr_reclaimed < pages_for_compaction &&
1866 inactive_lru_pages > pages_for_compaction)
1867 return true;
1868
1869 /* If compaction would go ahead or the allocation would succeed, stop */
1870 switch (compaction_suitable(zone, sc->order)) {
1871 case COMPACT_PARTIAL:
1872 case COMPACT_CONTINUE:
1873 return false;
1874 default:
1875 return true;
1876 }
1877}
1878
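The pages_for_compaction target in should_continue_reclaim() is 2UL << sc->order, i.e. twice the requested block. A quick check of the numbers for a THP-sized request (a worked sketch assuming 4K base pages):

#include <stdio.h>

int main(void)
{
        unsigned int order = 9;                     /* 2MB huge page on x86 with 4K pages */
        unsigned long pages_for_compaction = 2UL << order;

        printf("order-%u: keep reclaiming until %lu pages (%lu KB) are free\n",
               order, pages_for_compaction, pages_for_compaction * 4);
        /* prints: order-9: keep reclaiming until 1024 pages (4096 KB) are free */
        return 0;
}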
1879/*
1808 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1880 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1809 */ 1881 */
1810static void shrink_zone(int priority, struct zone *zone, 1882static void shrink_zone(int priority, struct zone *zone,
@@ -1813,9 +1885,12 @@ static void shrink_zone(int priority, struct zone *zone,
1813 unsigned long nr[NR_LRU_LISTS]; 1885 unsigned long nr[NR_LRU_LISTS];
1814 unsigned long nr_to_scan; 1886 unsigned long nr_to_scan;
1815 enum lru_list l; 1887 enum lru_list l;
1816 unsigned long nr_reclaimed = sc->nr_reclaimed; 1888 unsigned long nr_reclaimed;
1817 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1889 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1890 unsigned long nr_scanned = sc->nr_scanned;
1818 1891
1892restart:
1893 nr_reclaimed = 0;
1819 get_scan_count(zone, sc, nr, priority); 1894 get_scan_count(zone, sc, nr, priority);
1820 1895
1821 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1896 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1841,8 +1916,7 @@ static void shrink_zone(int priority, struct zone *zone,
1841 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 1916 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1842 break; 1917 break;
1843 } 1918 }
1844 1919 sc->nr_reclaimed += nr_reclaimed;
1845 sc->nr_reclaimed = nr_reclaimed;
1846 1920
1847 /* 1921 /*
1848 * Even if we did not try to evict anon pages at all, we want to 1922 * Even if we did not try to evict anon pages at all, we want to
@@ -1851,6 +1925,11 @@ static void shrink_zone(int priority, struct zone *zone,
1851 if (inactive_anon_is_low(zone, sc)) 1925 if (inactive_anon_is_low(zone, sc))
1852 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1926 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1853 1927
1928 /* reclaim/compaction might need reclaim to continue */
1929 if (should_continue_reclaim(zone, nr_reclaimed,
1930 sc->nr_scanned - nr_scanned, sc))
1931 goto restart;
1932
1854 throttle_vm_writeout(sc->gfp_mask); 1933 throttle_vm_writeout(sc->gfp_mask);
1855} 1934}
1856 1935
@@ -2124,38 +2203,87 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2124} 2203}
2125#endif 2204#endif
2126 2205
2206/*
2207 * pgdat_balanced is used when checking if a node is balanced for high-order
2208 * allocations. Only zones that meet watermarks and are in a zone allowed
 2209 * by the caller's classzone_idx are added to balanced_pages. The total of
2210 * balanced pages must be at least 25% of the zones allowed by classzone_idx
2211 * for the node to be considered balanced. Forcing all zones to be balanced
2212 * for high orders can cause excessive reclaim when there are imbalanced zones.
2213 * The choice of 25% is due to
2214 * o a 16M DMA zone that is balanced will not balance a zone on any
 2215 * reasonably sized machine
 2216 * o On all other machines, the top zone must be at least a reasonable
 2217 * percentage of the middle zones. For example, on 32-bit x86, highmem
 2218 * would need to be at least 256M for it to balance a whole node.
2219 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2220 * to balance a node on its own. These seemed like reasonable ratios.
2221 */
2222static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2223 int classzone_idx)
2224{
2225 unsigned long present_pages = 0;
2226 int i;
2227
2228 for (i = 0; i <= classzone_idx; i++)
2229 present_pages += pgdat->node_zones[i].present_pages;
2230
2231 return balanced_pages > (present_pages >> 2);
2232}
2233
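pgdat_balanced() sums present_pages for every zone up to classzone_idx and requires the balanced zones to exceed a quarter of that total. A small user-space sketch of the same shift-based check with made-up zone sizes (illustrative only):

#include <stdbool.h>
#include <stdio.h>

/* Same 25% test as pgdat_balanced(). */
static bool balanced_enough(unsigned long balanced_pages,
                            const unsigned long *zone_pages, int classzone_idx)
{
        unsigned long present_pages = 0;
        int i;

        for (i = 0; i <= classzone_idx; i++)
                present_pages += zone_pages[i];
        return balanced_pages > (present_pages >> 2);
}

int main(void)
{
        /* Hypothetical 32-bit layout in 4K pages: 16M DMA, 880M Normal, 3G HighMem. */
        unsigned long zones[] = { 4096, 225280, 786432 };

        /* DMA + Normal balanced: 229376 <= 1015808/4, so not enough. */
        printf("%d\n", balanced_enough(229376, zones, 2));     /* 0 */
        /* HighMem balanced on its own: 786432 > 253952, node counts as balanced. */
        printf("%d\n", balanced_enough(786432, zones, 2));     /* 1 */
        return 0;
}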
2127/* is kswapd sleeping prematurely? */ 2234/* is kswapd sleeping prematurely? */
2128static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) 2235static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2236 int classzone_idx)
2129{ 2237{
2130 int i; 2238 int i;
2239 unsigned long balanced = 0;
2240 bool all_zones_ok = true;
2131 2241
2132 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2242 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2133 if (remaining) 2243 if (remaining)
2134 return 1; 2244 return true;
2135 2245
2136 /* If after HZ/10, a zone is below the high mark, it's premature */ 2246 /* Check the watermark levels */
2137 for (i = 0; i < pgdat->nr_zones; i++) { 2247 for (i = 0; i < pgdat->nr_zones; i++) {
2138 struct zone *zone = pgdat->node_zones + i; 2248 struct zone *zone = pgdat->node_zones + i;
2139 2249
2140 if (!populated_zone(zone)) 2250 if (!populated_zone(zone))
2141 continue; 2251 continue;
2142 2252
2143 if (zone->all_unreclaimable) 2253 /*
2254 * balance_pgdat() skips over all_unreclaimable after
2255 * DEF_PRIORITY. Effectively, it considers them balanced so
2256 * they must be considered balanced here as well if kswapd
2257 * is to sleep
2258 */
2259 if (zone->all_unreclaimable) {
2260 balanced += zone->present_pages;
2144 continue; 2261 continue;
2262 }
2145 2263
2146 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 2264 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2147 0, 0)) 2265 classzone_idx, 0))
2148 return 1; 2266 all_zones_ok = false;
2267 else
2268 balanced += zone->present_pages;
2149 } 2269 }
2150 2270
2151 return 0; 2271 /*
2272 * For high-order requests, the balanced zones must contain at least
 2273 * 25% of the node's pages for kswapd to sleep. For order-0, all zones
2274 * must be balanced
2275 */
2276 if (order)
2277 return pgdat_balanced(pgdat, balanced, classzone_idx);
2278 else
2279 return !all_zones_ok;
2152} 2280}
2153 2281
2154/* 2282/*
2155 * For kswapd, balance_pgdat() will work across all this node's zones until 2283 * For kswapd, balance_pgdat() will work across all this node's zones until
2156 * they are all at high_wmark_pages(zone). 2284 * they are all at high_wmark_pages(zone).
2157 * 2285 *
2158 * Returns the number of pages which were actually freed. 2286 * Returns the final order kswapd was reclaiming at
2159 * 2287 *
2160 * There is special handling here for zones which are full of pinned pages. 2288 * There is special handling here for zones which are full of pinned pages.
2161 * This can happen if the pages are all mlocked, or if they are all used by 2289 * This can happen if the pages are all mlocked, or if they are all used by
@@ -2172,11 +2300,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
2172 * interoperates with the page allocator fallback scheme to ensure that aging 2300 * interoperates with the page allocator fallback scheme to ensure that aging
2173 * of pages is balanced across the zones. 2301 * of pages is balanced across the zones.
2174 */ 2302 */
2175static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 2303static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2304 int *classzone_idx)
2176{ 2305{
2177 int all_zones_ok; 2306 int all_zones_ok;
2307 unsigned long balanced;
2178 int priority; 2308 int priority;
2179 int i; 2309 int i;
2310 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2180 unsigned long total_scanned; 2311 unsigned long total_scanned;
2181 struct reclaim_state *reclaim_state = current->reclaim_state; 2312 struct reclaim_state *reclaim_state = current->reclaim_state;
2182 struct scan_control sc = { 2313 struct scan_control sc = {
@@ -2199,7 +2330,6 @@ loop_again:
2199 count_vm_event(PAGEOUTRUN); 2330 count_vm_event(PAGEOUTRUN);
2200 2331
2201 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2332 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2202 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2203 unsigned long lru_pages = 0; 2333 unsigned long lru_pages = 0;
2204 int has_under_min_watermark_zone = 0; 2334 int has_under_min_watermark_zone = 0;
2205 2335
@@ -2208,6 +2338,7 @@ loop_again:
2208 disable_swap_token(); 2338 disable_swap_token();
2209 2339
2210 all_zones_ok = 1; 2340 all_zones_ok = 1;
2341 balanced = 0;
2211 2342
2212 /* 2343 /*
2213 * Scan in the highmem->dma direction for the highest 2344 * Scan in the highmem->dma direction for the highest
@@ -2230,9 +2361,10 @@ loop_again:
2230 shrink_active_list(SWAP_CLUSTER_MAX, zone, 2361 shrink_active_list(SWAP_CLUSTER_MAX, zone,
2231 &sc, priority, 0); 2362 &sc, priority, 0);
2232 2363
2233 if (!zone_watermark_ok(zone, order, 2364 if (!zone_watermark_ok_safe(zone, order,
2234 high_wmark_pages(zone), 0, 0)) { 2365 high_wmark_pages(zone), 0, 0)) {
2235 end_zone = i; 2366 end_zone = i;
2367 *classzone_idx = i;
2236 break; 2368 break;
2237 } 2369 }
2238 } 2370 }
@@ -2255,6 +2387,7 @@ loop_again:
2255 * cause too much scanning of the lower zones. 2387 * cause too much scanning of the lower zones.
2256 */ 2388 */
2257 for (i = 0; i <= end_zone; i++) { 2389 for (i = 0; i <= end_zone; i++) {
2390 int compaction;
2258 struct zone *zone = pgdat->node_zones + i; 2391 struct zone *zone = pgdat->node_zones + i;
2259 int nr_slab; 2392 int nr_slab;
2260 2393
@@ -2276,7 +2409,7 @@ loop_again:
2276 * We put equal pressure on every zone, unless one 2409 * We put equal pressure on every zone, unless one
2277 * zone has way too many pages free already. 2410 * zone has way too many pages free already.
2278 */ 2411 */
2279 if (!zone_watermark_ok(zone, order, 2412 if (!zone_watermark_ok_safe(zone, order,
2280 8*high_wmark_pages(zone), end_zone, 0)) 2413 8*high_wmark_pages(zone), end_zone, 0))
2281 shrink_zone(priority, zone, &sc); 2414 shrink_zone(priority, zone, &sc);
2282 reclaim_state->reclaimed_slab = 0; 2415 reclaim_state->reclaimed_slab = 0;
@@ -2284,9 +2417,26 @@ loop_again:
2284 lru_pages); 2417 lru_pages);
2285 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2418 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2286 total_scanned += sc.nr_scanned; 2419 total_scanned += sc.nr_scanned;
2420
2421 compaction = 0;
2422 if (order &&
2423 zone_watermark_ok(zone, 0,
2424 high_wmark_pages(zone),
2425 end_zone, 0) &&
2426 !zone_watermark_ok(zone, order,
2427 high_wmark_pages(zone),
2428 end_zone, 0)) {
2429 compact_zone_order(zone,
2430 order,
2431 sc.gfp_mask, false,
2432 COMPACT_MODE_KSWAPD);
2433 compaction = 1;
2434 }
2435
2287 if (zone->all_unreclaimable) 2436 if (zone->all_unreclaimable)
2288 continue; 2437 continue;
2289 if (nr_slab == 0 && !zone_reclaimable(zone)) 2438 if (!compaction && nr_slab == 0 &&
2439 !zone_reclaimable(zone))
2290 zone->all_unreclaimable = 1; 2440 zone->all_unreclaimable = 1;
2291 /* 2441 /*
2292 * If we've done a decent amount of scanning and 2442 * If we've done a decent amount of scanning and
@@ -2297,7 +2447,7 @@ loop_again:
2297 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2447 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2298 sc.may_writepage = 1; 2448 sc.may_writepage = 1;
2299 2449
2300 if (!zone_watermark_ok(zone, order, 2450 if (!zone_watermark_ok_safe(zone, order,
2301 high_wmark_pages(zone), end_zone, 0)) { 2451 high_wmark_pages(zone), end_zone, 0)) {
2302 all_zones_ok = 0; 2452 all_zones_ok = 0;
2303 /* 2453 /*
@@ -2305,7 +2455,7 @@ loop_again:
2305 * means that we have a GFP_ATOMIC allocation 2455 * means that we have a GFP_ATOMIC allocation
2306 * failure risk. Hurry up! 2456 * failure risk. Hurry up!
2307 */ 2457 */
2308 if (!zone_watermark_ok(zone, order, 2458 if (!zone_watermark_ok_safe(zone, order,
2309 min_wmark_pages(zone), end_zone, 0)) 2459 min_wmark_pages(zone), end_zone, 0))
2310 has_under_min_watermark_zone = 1; 2460 has_under_min_watermark_zone = 1;
2311 } else { 2461 } else {
@@ -2317,10 +2467,12 @@ loop_again:
 2318 * speculatively avoid congestion waits 2468
2318 */ 2468 */
2319 zone_clear_flag(zone, ZONE_CONGESTED); 2469 zone_clear_flag(zone, ZONE_CONGESTED);
2470 if (i <= *classzone_idx)
2471 balanced += zone->present_pages;
2320 } 2472 }
2321 2473
2322 } 2474 }
2323 if (all_zones_ok) 2475 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2324 break; /* kswapd: all done */ 2476 break; /* kswapd: all done */
2325 /* 2477 /*
2326 * OK, kswapd is getting into trouble. Take a nap, then take 2478 * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2343,7 +2495,13 @@ loop_again:
2343 break; 2495 break;
2344 } 2496 }
2345out: 2497out:
2346 if (!all_zones_ok) { 2498
2499 /*
2500 * order-0: All zones must meet high watermark for a balanced node
2501 * high-order: Balanced zones must make up at least 25% of the node
2502 * for the node to be balanced
2503 */
2504 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2347 cond_resched(); 2505 cond_resched();
2348 2506
2349 try_to_freeze(); 2507 try_to_freeze();
@@ -2368,7 +2526,88 @@ out:
2368 goto loop_again; 2526 goto loop_again;
2369 } 2527 }
2370 2528
2371 return sc.nr_reclaimed; 2529 /*
2530 * If kswapd was reclaiming at a higher order, it has the option of
2531 * sleeping without all zones being balanced. Before it does, it must
2532 * ensure that the watermarks for order-0 on *all* zones are met and
2533 * that the congestion flags are cleared. The congestion flag must
2534 * be cleared as kswapd is the only mechanism that clears the flag
2535 * and it is potentially going to sleep here.
2536 */
2537 if (order) {
2538 for (i = 0; i <= end_zone; i++) {
2539 struct zone *zone = pgdat->node_zones + i;
2540
2541 if (!populated_zone(zone))
2542 continue;
2543
2544 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2545 continue;
2546
2547 /* Confirm the zone is balanced for order-0 */
2548 if (!zone_watermark_ok(zone, 0,
2549 high_wmark_pages(zone), 0, 0)) {
2550 order = sc.order = 0;
2551 goto loop_again;
2552 }
2553
2554 /* If balanced, clear the congested flag */
2555 zone_clear_flag(zone, ZONE_CONGESTED);
2556 }
2557 }
2558
2559 /*
2560 * Return the order we were reclaiming at so sleeping_prematurely()
2561 * makes a decision on the order we were last reclaiming at. However,
2562 * if another caller entered the allocator slow path while kswapd
2563 * was awake, order will remain at the higher level
2564 */
2565 *classzone_idx = end_zone;
2566 return order;
2567}
2568
2569static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2570{
2571 long remaining = 0;
2572 DEFINE_WAIT(wait);
2573
2574 if (freezing(current) || kthread_should_stop())
2575 return;
2576
2577 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2578
2579 /* Try to sleep for a short interval */
2580 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2581 remaining = schedule_timeout(HZ/10);
2582 finish_wait(&pgdat->kswapd_wait, &wait);
2583 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2584 }
2585
2586 /*
2587 * After a short sleep, check if it was a premature sleep. If not, then
2588 * go fully to sleep until explicitly woken up.
2589 */
2590 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2591 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2592
2593 /*
2594 * vmstat counters are not perfectly accurate and the estimated
2595 * value for counters such as NR_FREE_PAGES can deviate from the
2596 * true value by nr_online_cpus * threshold. To avoid the zone
2597 * watermarks being breached while under pressure, we reduce the
2598 * per-cpu vmstat threshold while kswapd is awake and restore
2599 * them before going back to sleep.
2600 */
2601 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2602 schedule();
2603 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2604 } else {
2605 if (remaining)
2606 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2607 else
2608 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2609 }
2610 finish_wait(&pgdat->kswapd_wait, &wait);
2372} 2611}
2373 2612
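kswapd_try_to_sleep() above follows the standard prepare_to_wait()/schedule()/finish_wait() idiom, with the extra twist of lowering the per-cpu vmstat thresholds around the deep sleep. Stripped of the kswapd specifics, the underlying wait-queue pattern looks roughly like this (a generic sketch, not code from this patch):

#include <linux/types.h>
#include <linux/wait.h>
#include <linux/sched.h>

/* Sketch: sleep on 'wq' until 'condition' becomes true or we are woken. */
static void sleep_until(wait_queue_head_t *wq, bool (*condition)(void))
{
        DEFINE_WAIT(wait);

        prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
        if (!condition())
                schedule();             /* woken by wake_up_interruptible(wq) */
        finish_wait(wq, &wait);
}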
2374/* 2613/*
@@ -2387,9 +2626,10 @@ out:
2387static int kswapd(void *p) 2626static int kswapd(void *p)
2388{ 2627{
2389 unsigned long order; 2628 unsigned long order;
2629 int classzone_idx;
2390 pg_data_t *pgdat = (pg_data_t*)p; 2630 pg_data_t *pgdat = (pg_data_t*)p;
2391 struct task_struct *tsk = current; 2631 struct task_struct *tsk = current;
2392 DEFINE_WAIT(wait); 2632
2393 struct reclaim_state reclaim_state = { 2633 struct reclaim_state reclaim_state = {
2394 .reclaimed_slab = 0, 2634 .reclaimed_slab = 0,
2395 }; 2635 };
@@ -2417,49 +2657,30 @@ static int kswapd(void *p)
2417 set_freezable(); 2657 set_freezable();
2418 2658
2419 order = 0; 2659 order = 0;
2660 classzone_idx = MAX_NR_ZONES - 1;
2420 for ( ; ; ) { 2661 for ( ; ; ) {
2421 unsigned long new_order; 2662 unsigned long new_order;
2663 int new_classzone_idx;
2422 int ret; 2664 int ret;
2423 2665
2424 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2425 new_order = pgdat->kswapd_max_order; 2666 new_order = pgdat->kswapd_max_order;
2667 new_classzone_idx = pgdat->classzone_idx;
2426 pgdat->kswapd_max_order = 0; 2668 pgdat->kswapd_max_order = 0;
2427 if (order < new_order) { 2669 pgdat->classzone_idx = MAX_NR_ZONES - 1;
2670 if (order < new_order || classzone_idx > new_classzone_idx) {
2428 /* 2671 /*
2429 * Don't sleep if someone wants a larger 'order' 2672 * Don't sleep if someone wants a larger 'order'
 2431 * allocation 2673 * allocation or has tighter zone constraints
2431 */ 2674 */
2432 order = new_order; 2675 order = new_order;
2676 classzone_idx = new_classzone_idx;
2433 } else { 2677 } else {
2434 if (!freezing(current) && !kthread_should_stop()) { 2678 kswapd_try_to_sleep(pgdat, order, classzone_idx);
2435 long remaining = 0;
2436
2437 /* Try to sleep for a short interval */
2438 if (!sleeping_prematurely(pgdat, order, remaining)) {
2439 remaining = schedule_timeout(HZ/10);
2440 finish_wait(&pgdat->kswapd_wait, &wait);
2441 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2442 }
2443
2444 /*
2445 * After a short sleep, check if it was a
2446 * premature sleep. If not, then go fully
2447 * to sleep until explicitly woken up
2448 */
2449 if (!sleeping_prematurely(pgdat, order, remaining)) {
2450 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2451 schedule();
2452 } else {
2453 if (remaining)
2454 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2455 else
2456 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2457 }
2458 }
2459
2460 order = pgdat->kswapd_max_order; 2679 order = pgdat->kswapd_max_order;
2680 classzone_idx = pgdat->classzone_idx;
2681 pgdat->kswapd_max_order = 0;
2682 pgdat->classzone_idx = MAX_NR_ZONES - 1;
2461 } 2683 }
2462 finish_wait(&pgdat->kswapd_wait, &wait);
2463 2684
2464 ret = try_to_freeze(); 2685 ret = try_to_freeze();
2465 if (kthread_should_stop()) 2686 if (kthread_should_stop())
@@ -2471,7 +2692,7 @@ static int kswapd(void *p)
2471 */ 2692 */
2472 if (!ret) { 2693 if (!ret) {
2473 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2694 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2474 balance_pgdat(pgdat, order); 2695 order = balance_pgdat(pgdat, order, &classzone_idx);
2475 } 2696 }
2476 } 2697 }
2477 return 0; 2698 return 0;
@@ -2480,23 +2701,26 @@ static int kswapd(void *p)
2480/* 2701/*
2481 * A zone is low on free memory, so wake its kswapd task to service it. 2702 * A zone is low on free memory, so wake its kswapd task to service it.
2482 */ 2703 */
2483void wakeup_kswapd(struct zone *zone, int order) 2704void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
2484{ 2705{
2485 pg_data_t *pgdat; 2706 pg_data_t *pgdat;
2486 2707
2487 if (!populated_zone(zone)) 2708 if (!populated_zone(zone))
2488 return; 2709 return;
2489 2710
2490 pgdat = zone->zone_pgdat;
2491 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2492 return;
2493 if (pgdat->kswapd_max_order < order)
2494 pgdat->kswapd_max_order = order;
2495 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2496 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2711 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2497 return; 2712 return;
2713 pgdat = zone->zone_pgdat;
2714 if (pgdat->kswapd_max_order < order) {
2715 pgdat->kswapd_max_order = order;
2716 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
2717 }
2498 if (!waitqueue_active(&pgdat->kswapd_wait)) 2718 if (!waitqueue_active(&pgdat->kswapd_wait))
2499 return; 2719 return;
2720 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
2721 return;
2722
2723 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2500 wake_up_interruptible(&pgdat->kswapd_wait); 2724 wake_up_interruptible(&pgdat->kswapd_wait);
2501} 2725}
2502 2726
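wakeup_kswapd() now takes the classzone index so kswapd knows which zones the waker actually cares about. A hedged sketch of how a caller in the allocator slow path might wake all relevant nodes (zonelist iteration as in the page allocator; simplified, not code from this patch):

#include <linux/mmzone.h>
#include <linux/swap.h>

/* Sketch: wake kswapd for every zone usable by this allocation. */
static void wake_kswapds(struct zonelist *zonelist, int order,
                         enum zone_type high_zoneidx)
{
        struct zoneref *z;
        struct zone *zone;

        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
                wakeup_kswapd(zone, order, high_zoneidx);
}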
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8f62f17ee1c7..0c3b5048773e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -83,7 +83,31 @@ EXPORT_SYMBOL(vm_stat);
83 83
84#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
85 85
86static int calculate_threshold(struct zone *zone) 86int calculate_pressure_threshold(struct zone *zone)
87{
88 int threshold;
89 int watermark_distance;
90
91 /*
92 * As vmstats are not up to date, there is drift between the estimated
93 * and real values. For high thresholds and a high number of CPUs, it
94 * is possible for the min watermark to be breached while the estimated
95 * value looks fine. The pressure threshold is a reduced value such
96 * that even the maximum amount of drift will not accidentally breach
97 * the min watermark
98 */
99 watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
100 threshold = max(1, (int)(watermark_distance / num_online_cpus()));
101
102 /*
103 * Maximum threshold is 125
104 */
105 threshold = min(125, threshold);
106
107 return threshold;
108}
109
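calculate_pressure_threshold() sizes the per-cpu drift so that all CPUs together cannot eat the low-to-min watermark gap. A small worked example of the same arithmetic (user-space sketch, made-up watermark numbers):

#include <stdio.h>

static int pressure_threshold(long low_wmark, long min_wmark, int online_cpus)
{
        int watermark_distance = (int)(low_wmark - min_wmark);
        int threshold = watermark_distance / online_cpus;

        if (threshold < 1)
                threshold = 1;
        return threshold < 125 ? threshold : 125;   /* same cap as the kernel code */
}

int main(void)
{
        /* low=4000 pages, min=3000 pages, 16 CPUs -> per-cpu threshold 62 */
        printf("%d\n", pressure_threshold(4000, 3000, 16));
        /* a huge gap on a small box still caps at 125 */
        printf("%d\n", pressure_threshold(60000, 20000, 4));
        return 0;
}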
110int calculate_normal_threshold(struct zone *zone)
87{ 111{
88 int threshold; 112 int threshold;
89 int mem; /* memory in 128 MB units */ 113 int mem; /* memory in 128 MB units */
@@ -142,7 +166,7 @@ static void refresh_zone_stat_thresholds(void)
142 for_each_populated_zone(zone) { 166 for_each_populated_zone(zone) {
143 unsigned long max_drift, tolerate_drift; 167 unsigned long max_drift, tolerate_drift;
144 168
145 threshold = calculate_threshold(zone); 169 threshold = calculate_normal_threshold(zone);
146 170
147 for_each_online_cpu(cpu) 171 for_each_online_cpu(cpu)
148 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 172 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -161,42 +185,50 @@ static void refresh_zone_stat_thresholds(void)
161 } 185 }
162} 186}
163 187
188void set_pgdat_percpu_threshold(pg_data_t *pgdat,
189 int (*calculate_pressure)(struct zone *))
190{
191 struct zone *zone;
192 int cpu;
193 int threshold;
194 int i;
195
196 for (i = 0; i < pgdat->nr_zones; i++) {
197 zone = &pgdat->node_zones[i];
198 if (!zone->percpu_drift_mark)
199 continue;
200
201 threshold = (*calculate_pressure)(zone);
202 for_each_possible_cpu(cpu)
203 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
204 = threshold;
205 }
206}
207
164/* 208/*
165 * For use when we know that interrupts are disabled. 209 * For use when we know that interrupts are disabled.
166 */ 210 */
167void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 211void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
168 int delta) 212 int delta)
169{ 213{
170 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 214 struct per_cpu_pageset __percpu *pcp = zone->pageset;
171 215 s8 __percpu *p = pcp->vm_stat_diff + item;
172 s8 *p = pcp->vm_stat_diff + item;
173 long x; 216 long x;
217 long t;
218
219 x = delta + __this_cpu_read(*p);
174 220
175 x = delta + *p; 221 t = __this_cpu_read(pcp->stat_threshold);
176 222
177 if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { 223 if (unlikely(x > t || x < -t)) {
178 zone_page_state_add(x, zone, item); 224 zone_page_state_add(x, zone, item);
179 x = 0; 225 x = 0;
180 } 226 }
181 *p = x; 227 __this_cpu_write(*p, x);
182} 228}
183EXPORT_SYMBOL(__mod_zone_page_state); 229EXPORT_SYMBOL(__mod_zone_page_state);
184 230
185/* 231/*
186 * For an unknown interrupt state
187 */
188void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
189 int delta)
190{
191 unsigned long flags;
192
193 local_irq_save(flags);
194 __mod_zone_page_state(zone, item, delta);
195 local_irq_restore(flags);
196}
197EXPORT_SYMBOL(mod_zone_page_state);
198
199/*
200 * Optimized increment and decrement functions. 232 * Optimized increment and decrement functions.
201 * 233 *
202 * These are only for a single page and therefore can take a struct page * 234 * These are only for a single page and therefore can take a struct page *
@@ -221,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state);
221 */ 253 */
222void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 254void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
223{ 255{
224 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 256 struct per_cpu_pageset __percpu *pcp = zone->pageset;
225 s8 *p = pcp->vm_stat_diff + item; 257 s8 __percpu *p = pcp->vm_stat_diff + item;
258 s8 v, t;
226 259
227 (*p)++; 260 v = __this_cpu_inc_return(*p);
261 t = __this_cpu_read(pcp->stat_threshold);
262 if (unlikely(v > t)) {
263 s8 overstep = t >> 1;
228 264
229 if (unlikely(*p > pcp->stat_threshold)) { 265 zone_page_state_add(v + overstep, zone, item);
230 int overstep = pcp->stat_threshold / 2; 266 __this_cpu_write(*p, -overstep);
231
232 zone_page_state_add(*p + overstep, zone, item);
233 *p = -overstep;
234 } 267 }
235} 268}
236 269
@@ -242,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state);
242 275
243void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 276void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
244{ 277{
245 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 278 struct per_cpu_pageset __percpu *pcp = zone->pageset;
246 s8 *p = pcp->vm_stat_diff + item; 279 s8 __percpu *p = pcp->vm_stat_diff + item;
247 280 s8 v, t;
248 (*p)--;
249 281
250 if (unlikely(*p < - pcp->stat_threshold)) { 282 v = __this_cpu_dec_return(*p);
251 int overstep = pcp->stat_threshold / 2; 283 t = __this_cpu_read(pcp->stat_threshold);
284 if (unlikely(v < - t)) {
285 s8 overstep = t >> 1;
252 286
253 zone_page_state_add(*p - overstep, zone, item); 287 zone_page_state_add(v - overstep, zone, item);
254 *p = overstep; 288 __this_cpu_write(*p, overstep);
255 } 289 }
256} 290}
257 291
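The rewritten __inc_zone_state()/__dec_zone_state() above use the raw __this_cpu_*() operations, which are only safe because their callers already run with interrupts (and hence preemption) disabled. A compact, hedged sketch of the same read/threshold/fold pattern on a standalone per-cpu counter (names are illustrative):

#include <linux/percpu.h>
#include <linux/types.h>
#include <asm/atomic.h>

static DEFINE_PER_CPU(s8, local_diff);
static atomic_t global_count = ATOMIC_INIT(0);

/* Sketch: a per-cpu differential counter folded into a shared atomic once
 * it crosses 'threshold'.  Like __inc_zone_state(), this relies on the
 * caller having preemption (or interrupts) disabled around the update. */
static void count_one(s8 threshold)
{
        s8 v = __this_cpu_inc_return(local_diff);

        if (unlikely(v > threshold)) {
                s8 overstep = threshold >> 1;

                atomic_add(v + overstep, &global_count);
                __this_cpu_write(local_diff, -overstep);
        }
}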
@@ -261,6 +295,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
261} 295}
262EXPORT_SYMBOL(__dec_zone_page_state); 296EXPORT_SYMBOL(__dec_zone_page_state);
263 297
298#ifdef CONFIG_CMPXCHG_LOCAL
299/*
300 * If we have cmpxchg_local support then we do not need to incur the overhead
301 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
302 *
303 * mod_state() modifies the zone counter state through atomic per cpu
304 * operations.
305 *
 306 * Overstep mode specifies how overstep should be handled:
307 * 0 No overstepping
308 * 1 Overstepping half of threshold
309 * -1 Overstepping minus half of threshold
310*/
311static inline void mod_state(struct zone *zone,
312 enum zone_stat_item item, int delta, int overstep_mode)
313{
314 struct per_cpu_pageset __percpu *pcp = zone->pageset;
315 s8 __percpu *p = pcp->vm_stat_diff + item;
316 long o, n, t, z;
317
318 do {
319 z = 0; /* overflow to zone counters */
320
321 /*
322 * The fetching of the stat_threshold is racy. We may apply
 323 * a counter threshold to the wrong cpu if we get
324 * rescheduled while executing here. However, the following
325 * will apply the threshold again and therefore bring the
326 * counter under the threshold.
327 */
328 t = this_cpu_read(pcp->stat_threshold);
329
330 o = this_cpu_read(*p);
331 n = delta + o;
332
333 if (n > t || n < -t) {
334 int os = overstep_mode * (t >> 1) ;
335
336 /* Overflow must be added to zone counters */
337 z = n + os;
338 n = -os;
339 }
340 } while (this_cpu_cmpxchg(*p, o, n) != o);
341
342 if (z)
343 zone_page_state_add(z, zone, item);
344}
345
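mod_state() retries with this_cpu_cmpxchg() until the per-cpu diff is updated atomically with respect to migration on that CPU; anything over the threshold is folded into the zone counter with a half-threshold overstep so the next few updates stay local. One iteration with concrete numbers (a worked sketch; overstep_mode 1 as used by the increment paths):

#include <stdio.h>

int main(void)
{
        long t = 125;           /* stat_threshold read for this cpu */
        long o = 120;           /* current per-cpu diff */
        long delta = 10;        /* requested update */
        long n = delta + o;     /* 130: crosses the threshold */
        long z = 0;

        if (n > t || n < -t) {
                long os = 1 * (t >> 1);   /* overstep_mode * (t >> 1) = 62 */

                z = n + os;               /* 192 pages folded into the zone counter */
                n = -os;                  /* per-cpu diff restarts at -62 */
        }
        printf("fold %ld into zone counter, per-cpu diff becomes %ld\n", z, n);
        return 0;
}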
346void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
347 int delta)
348{
349 mod_state(zone, item, delta, 0);
350}
351EXPORT_SYMBOL(mod_zone_page_state);
352
353void inc_zone_state(struct zone *zone, enum zone_stat_item item)
354{
355 mod_state(zone, item, 1, 1);
356}
357
358void inc_zone_page_state(struct page *page, enum zone_stat_item item)
359{
360 mod_state(page_zone(page), item, 1, 1);
361}
362EXPORT_SYMBOL(inc_zone_page_state);
363
364void dec_zone_page_state(struct page *page, enum zone_stat_item item)
365{
366 mod_state(page_zone(page), item, -1, -1);
367}
368EXPORT_SYMBOL(dec_zone_page_state);
369#else
370/*
371 * Use interrupt disable to serialize counter updates
372 */
373void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
374 int delta)
375{
376 unsigned long flags;
377
378 local_irq_save(flags);
379 __mod_zone_page_state(zone, item, delta);
380 local_irq_restore(flags);
381}
382EXPORT_SYMBOL(mod_zone_page_state);
383
264void inc_zone_state(struct zone *zone, enum zone_stat_item item) 384void inc_zone_state(struct zone *zone, enum zone_stat_item item)
265{ 385{
266 unsigned long flags; 386 unsigned long flags;
@@ -291,6 +411,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
291 local_irq_restore(flags); 411 local_irq_restore(flags);
292} 412}
293EXPORT_SYMBOL(dec_zone_page_state); 413EXPORT_SYMBOL(dec_zone_page_state);
414#endif
294 415
295/* 416/*
296 * Update the zone counters for one cpu. 417 * Update the zone counters for one cpu.
@@ -759,6 +880,7 @@ static const char * const vmstat_text[] = {
759 "numa_local", 880 "numa_local",
760 "numa_other", 881 "numa_other",
761#endif 882#endif
883 "nr_anon_transparent_hugepages",
762 "nr_dirty_threshold", 884 "nr_dirty_threshold",
763 "nr_dirty_background_threshold", 885 "nr_dirty_background_threshold",
764 886
@@ -834,7 +956,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
834 "\n scanned %lu" 956 "\n scanned %lu"
835 "\n spanned %lu" 957 "\n spanned %lu"
836 "\n present %lu", 958 "\n present %lu",
837 zone_nr_free_pages(zone), 959 zone_page_state(zone, NR_FREE_PAGES),
838 min_wmark_pages(zone), 960 min_wmark_pages(zone),
839 low_wmark_pages(zone), 961 low_wmark_pages(zone),
840 high_wmark_pages(zone), 962 high_wmark_pages(zone),
@@ -1033,7 +1155,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
1033 break; 1155 break;
1034 case CPU_DOWN_PREPARE: 1156 case CPU_DOWN_PREPARE:
1035 case CPU_DOWN_PREPARE_FROZEN: 1157 case CPU_DOWN_PREPARE_FROZEN:
1036 cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); 1158 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
1037 per_cpu(vmstat_work, cpu).work.func = NULL; 1159 per_cpu(vmstat_work, cpu).work.func = NULL;
1038 break; 1160 break;
1039 case CPU_DOWN_FAILED: 1161 case CPU_DOWN_FAILED: