author     Jiri Kosina <jkosina@suse.cz>   2012-06-29 08:45:58 -0400
committer  Jiri Kosina <jkosina@suse.cz>   2012-06-29 08:45:58 -0400
commit     59f91e5dd0504dc0ebfaa0b6f3a55e6931f96266
tree       b913718405d44a921905ac71044fbde410256865
parent     57bdfdd80077addf518a9b90c4a66890efc4f70e
parent     89abfab133ef1f5902abafb744df72793213ac19
Merge branch 'master' into for-next

Conflicts:
	include/linux/mmzone.h

Synced with Linus' tree so that trivial patch can be applied on top of
up-to-date code properly.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>

Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              10
-rw-r--r--  mm/Makefile              9
-rw-r--r--  mm/bootmem.c           134
-rw-r--r--  mm/compaction.c        142
-rw-r--r--  mm/filemap.c            39
-rw-r--r--  mm/huge_memory.c        21
-rw-r--r--  mm/hugetlb.c            32
-rw-r--r--  mm/internal.h           14
-rw-r--r--  mm/madvise.c            15
-rw-r--r--  mm/memblock.c           42
-rw-r--r--  mm/memcontrol.c        127
-rw-r--r--  mm/memory-failure.c      8
-rw-r--r--  mm/memory.c             20
-rw-r--r--  mm/memory_hotplug.c     14
-rw-r--r--  mm/mempolicy.c          36
-rw-r--r--  mm/mmap.c               53
-rw-r--r--  mm/nobootmem.c         112
-rw-r--r--  mm/oom_kill.c           44
-rw-r--r--  mm/page_alloc.c         78
-rw-r--r--  mm/readahead.c          40
-rw-r--r--  mm/rmap.c                6
-rw-r--r--  mm/shmem.c             513
-rw-r--r--  mm/sparse.c             25
-rw-r--r--  mm/swap.c               51
-rw-r--r--  mm/swapfile.c           33
-rw-r--r--  mm/thrash.c            155
-rw-r--r--  mm/truncate.c           25
-rw-r--r--  mm/vmalloc.c             7
-rw-r--r--  mm/vmscan.c            306
-rw-r--r--  mm/vmstat.c             10
30 files changed, 1137 insertions, 984 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 39220026c797..b2176374b98e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -349,6 +349,16 @@ choice
349 benefit. 349 benefit.
350endchoice 350endchoice
351 351
352config CROSS_MEMORY_ATTACH
353 bool "Cross Memory Support"
354 depends on MMU
355 default y
356 help
357 Enabling this option adds the system calls process_vm_readv and
358 process_vm_writev which allow a process with the correct privileges
 359 to directly read from or write to another process's address space.
360 See the man page for more details.
361
352# 362#
353# UP and nommu archs use km based percpu allocator 363# UP and nommu archs use km based percpu allocator
354# 364#
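
The new CROSS_MEMORY_ATTACH option gates the process_vm_readv()/process_vm_writev()
system calls described in the help text above. A minimal userspace sketch of what the
option enables follows; the target pid and remote address are placeholders, and the
call additionally requires CONFIG_CROSS_MEMORY_ATTACH=y plus ptrace-level permission
on the target process.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/types.h>
#include <sys/uio.h>

int main(void)
{
	char buf[64];
	struct iovec local  = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct iovec remote = { .iov_base = (void *)0x400000,	/* hypothetical remote address */
				.iov_len  = sizeof(buf) };
	pid_t target = 1234;					/* hypothetical target pid */
	ssize_t n;

	n = process_vm_readv(target, &local, 1, &remote, 1, 0);	/* flags must be 0 */
	if (n < 0)
		perror("process_vm_readv");
	else
		printf("copied %zd bytes from pid %d\n", n, (int)target);
	return 0;
}
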
diff --git a/mm/Makefile b/mm/Makefile
index 8aada89efbbb..a156285ce88d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,8 +5,11 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o pgtable-generic.o \ 8 vmalloc.o pagewalk.o pgtable-generic.o
9 process_vm_access.o 9
10ifdef CONFIG_CROSS_MEMORY_ATTACH
11mmu-$(CONFIG_MMU) += process_vm_access.o
12endif
10 13
11obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ 14obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
12 maccess.o page_alloc.o page-writeback.o \ 15 maccess.o page_alloc.o page-writeback.o \
@@ -25,7 +28,7 @@ endif
25obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 28obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
26 29
27obj-$(CONFIG_BOUNCE) += bounce.o 30obj-$(CONFIG_BOUNCE) += bounce.o
28obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 31obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
29obj-$(CONFIG_HAS_DMA) += dmapool.o 32obj-$(CONFIG_HAS_DMA) += dmapool.o
30obj-$(CONFIG_HUGETLBFS) += hugetlb.o 33obj-$(CONFIG_HUGETLBFS) += hugetlb.o
31obj-$(CONFIG_NUMA) += mempolicy.o 34obj-$(CONFIG_NUMA) += mempolicy.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0131170c9d54..ec4fcb7a56c8 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -77,16 +77,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages)
77 */ 77 */
78static void __init link_bootmem(bootmem_data_t *bdata) 78static void __init link_bootmem(bootmem_data_t *bdata)
79{ 79{
80 struct list_head *iter; 80 bootmem_data_t *ent;
81 81
82 list_for_each(iter, &bdata_list) { 82 list_for_each_entry(ent, &bdata_list, list) {
83 bootmem_data_t *ent; 83 if (bdata->node_min_pfn < ent->node_min_pfn) {
84 84 list_add_tail(&bdata->list, &ent->list);
85 ent = list_entry(iter, bootmem_data_t, list); 85 return;
86 if (bdata->node_min_pfn < ent->node_min_pfn) 86 }
87 break;
88 } 87 }
89 list_add_tail(&bdata->list, iter); 88
89 list_add_tail(&bdata->list, &bdata_list);
90} 90}
91 91
92/* 92/*
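
The rewritten link_bootmem() above is the standard way to keep a list_head list sorted
on insert: walk it with list_for_each_entry(), add the new node in front of the first
larger entry, and fall back to a tail insert. A minimal sketch of the same idiom with a
made-up struct (kernel list API, not part of this patch):

#include <linux/list.h>

struct item {
	unsigned long key;
	struct list_head list;
};

/* Keep 'head' sorted by ascending key: insert before the first entry
 * with a larger key, otherwise append at the tail. */
static void sorted_insert(struct list_head *head, struct item *new)
{
	struct item *ent;

	list_for_each_entry(ent, head, list) {
		if (new->key < ent->key) {
			list_add_tail(&new->list, &ent->list);
			return;
		}
	}
	list_add_tail(&new->list, head);
}
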
@@ -203,7 +203,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
203 } else { 203 } else {
204 unsigned long off = 0; 204 unsigned long off = 0;
205 205
206 while (vec && off < BITS_PER_LONG) { 206 vec >>= start & (BITS_PER_LONG - 1);
207 while (vec) {
207 if (vec & 1) { 208 if (vec & 1) {
208 page = pfn_to_page(start + off); 209 page = pfn_to_page(start + off);
209 __free_pages_bootmem(page, 0); 210 __free_pages_bootmem(page, 0);
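
The free_all_bootmem_core() hunk above pre-shifts the bitmap word so the walk starts at
the bit corresponding to 'start', letting the loop stop as soon as no set bits remain
instead of always scanning BITS_PER_LONG positions. The same walk in isolation, plain C
with illustrative values (here 'start' is already a bit index within the word):

#include <stdio.h>

int main(void)
{
	unsigned long vec = 0xb4;          /* bits 2, 4, 5 and 7 set */
	unsigned long start = 2, off = 0;  /* begin the walk at bit 2 */

	vec >>= start;                     /* skip the bits below 'start' */
	while (vec) {                      /* stop once nothing is left */
		if (vec & 1)
			printf("bit %lu is set\n", start + off);
		vec >>= 1;
		off++;
	}
	return 0;
}
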
@@ -467,7 +468,7 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
467 return ALIGN(base + off, align) - base; 468 return ALIGN(base + off, align) - base;
468} 469}
469 470
470static void * __init alloc_bootmem_core(struct bootmem_data *bdata, 471static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
471 unsigned long size, unsigned long align, 472 unsigned long size, unsigned long align,
472 unsigned long goal, unsigned long limit) 473 unsigned long goal, unsigned long limit)
473{ 474{
@@ -588,14 +589,14 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
588 p_bdata = bootmem_arch_preferred_node(bdata, size, align, 589 p_bdata = bootmem_arch_preferred_node(bdata, size, align,
589 goal, limit); 590 goal, limit);
590 if (p_bdata) 591 if (p_bdata)
591 return alloc_bootmem_core(p_bdata, size, align, 592 return alloc_bootmem_bdata(p_bdata, size, align,
592 goal, limit); 593 goal, limit);
593 } 594 }
594#endif 595#endif
595 return NULL; 596 return NULL;
596} 597}
597 598
598static void * __init ___alloc_bootmem_nopanic(unsigned long size, 599static void * __init alloc_bootmem_core(unsigned long size,
599 unsigned long align, 600 unsigned long align,
600 unsigned long goal, 601 unsigned long goal,
601 unsigned long limit) 602 unsigned long limit)
@@ -603,7 +604,6 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
603 bootmem_data_t *bdata; 604 bootmem_data_t *bdata;
604 void *region; 605 void *region;
605 606
606restart:
607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); 607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
608 if (region) 608 if (region)
609 return region; 609 return region;
@@ -614,11 +614,25 @@ restart:
614 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) 614 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
615 break; 615 break;
616 616
617 region = alloc_bootmem_core(bdata, size, align, goal, limit); 617 region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
618 if (region) 618 if (region)
619 return region; 619 return region;
620 } 620 }
621 621
622 return NULL;
623}
624
625static void * __init ___alloc_bootmem_nopanic(unsigned long size,
626 unsigned long align,
627 unsigned long goal,
628 unsigned long limit)
629{
630 void *ptr;
631
632restart:
633 ptr = alloc_bootmem_core(size, align, goal, limit);
634 if (ptr)
635 return ptr;
622 if (goal) { 636 if (goal) {
623 goal = 0; 637 goal = 0;
624 goto restart; 638 goto restart;
@@ -684,21 +698,56 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
684 return ___alloc_bootmem(size, align, goal, limit); 698 return ___alloc_bootmem(size, align, goal, limit);
685} 699}
686 700
687static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, 701static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
688 unsigned long size, unsigned long align, 702 unsigned long size, unsigned long align,
689 unsigned long goal, unsigned long limit) 703 unsigned long goal, unsigned long limit)
690{ 704{
691 void *ptr; 705 void *ptr;
692 706
693 ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); 707again:
708 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
709 align, goal, limit);
694 if (ptr) 710 if (ptr)
695 return ptr; 711 return ptr;
696 712
697 ptr = alloc_bootmem_core(bdata, size, align, goal, limit); 713 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
698 if (ptr) 714 if (ptr)
699 return ptr; 715 return ptr;
700 716
701 return ___alloc_bootmem(size, align, goal, limit); 717 ptr = alloc_bootmem_core(size, align, goal, limit);
718 if (ptr)
719 return ptr;
720
721 if (goal) {
722 goal = 0;
723 goto again;
724 }
725
726 return NULL;
727}
728
729void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
730 unsigned long align, unsigned long goal)
731{
732 if (WARN_ON_ONCE(slab_is_available()))
733 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
734
735 return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
736}
737
738void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
739 unsigned long align, unsigned long goal,
740 unsigned long limit)
741{
742 void *ptr;
743
744 ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
745 if (ptr)
746 return ptr;
747
748 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
749 panic("Out of memory");
750 return NULL;
702} 751}
703 752
704/** 753/**
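
Both ___alloc_bootmem_nopanic() and the new ___alloc_bootmem_node_nopanic() above share
the same fallback shape: try the allocation with the caller's goal address and, if that
fails, retry once with the goal cleared before giving up. Stripped of the bootmem
details, the control flow looks like this; try_alloc() is a hypothetical stand-in for
alloc_bootmem_core(), so the snippet is a sketch rather than patch code.

static void *alloc_with_goal_fallback(unsigned long size, unsigned long align,
				      unsigned long goal, unsigned long limit)
{
	void *ptr;

restart:
	ptr = try_alloc(size, align, goal, limit);  /* stand-in for the real core allocator */
	if (ptr)
		return ptr;

	if (goal) {
		goal = 0;	/* drop the placement hint and retry once */
		goto restart;
	}

	return NULL;
}
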
@@ -722,7 +771,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
722 if (WARN_ON_ONCE(slab_is_available())) 771 if (WARN_ON_ONCE(slab_is_available()))
723 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 772 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
724 773
725 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); 774 return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
726} 775}
727 776
728void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 777void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -743,7 +792,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
743 unsigned long new_goal; 792 unsigned long new_goal;
744 793
745 new_goal = MAX_DMA32_PFN << PAGE_SHIFT; 794 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
746 ptr = alloc_bootmem_core(pgdat->bdata, size, align, 795 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
747 new_goal, 0); 796 new_goal, 0);
748 if (ptr) 797 if (ptr)
749 return ptr; 798 return ptr;
@@ -754,47 +803,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
754 803
755} 804}
756 805
757#ifdef CONFIG_SPARSEMEM
758/**
759 * alloc_bootmem_section - allocate boot memory from a specific section
760 * @size: size of the request in bytes
761 * @section_nr: sparse map section to allocate from
762 *
763 * Return NULL on failure.
764 */
765void * __init alloc_bootmem_section(unsigned long size,
766 unsigned long section_nr)
767{
768 bootmem_data_t *bdata;
769 unsigned long pfn, goal;
770
771 pfn = section_nr_to_pfn(section_nr);
772 goal = pfn << PAGE_SHIFT;
773 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
774
775 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
776}
777#endif
778
779void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
780 unsigned long align, unsigned long goal)
781{
782 void *ptr;
783
784 if (WARN_ON_ONCE(slab_is_available()))
785 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
786
787 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
788 if (ptr)
789 return ptr;
790
791 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
792 if (ptr)
793 return ptr;
794
795 return __alloc_bootmem_nopanic(size, align, goal);
796}
797
798#ifndef ARCH_LOW_ADDRESS_LIMIT 806#ifndef ARCH_LOW_ADDRESS_LIMIT
799#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 807#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
800#endif 808#endif
@@ -839,6 +847,6 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
839 if (WARN_ON_ONCE(slab_is_available())) 847 if (WARN_ON_ONCE(slab_is_available()))
840 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 848 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
841 849
842 return ___alloc_bootmem_node(pgdat->bdata, size, align, 850 return ___alloc_bootmem_node(pgdat, size, align,
843 goal, ARCH_LOW_ADDRESS_LIMIT); 851 goal, ARCH_LOW_ADDRESS_LIMIT);
844} 852}
diff --git a/mm/compaction.c b/mm/compaction.c
index da7d35ea5103..840ee288e296 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -235,7 +235,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
235 */ 235 */
236 while (unlikely(too_many_isolated(zone))) { 236 while (unlikely(too_many_isolated(zone))) {
237 /* async migration should just abort */ 237 /* async migration should just abort */
238 if (!cc->sync) 238 if (cc->mode != COMPACT_SYNC)
239 return 0; 239 return 0;
240 240
241 congestion_wait(BLK_RW_ASYNC, HZ/10); 241 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -303,7 +303,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
303 * satisfies the allocation 303 * satisfies the allocation
304 */ 304 */
305 pageblock_nr = low_pfn >> pageblock_order; 305 pageblock_nr = low_pfn >> pageblock_order;
306 if (!cc->sync && last_pageblock_nr != pageblock_nr && 306 if (cc->mode != COMPACT_SYNC &&
307 last_pageblock_nr != pageblock_nr &&
307 !migrate_async_suitable(get_pageblock_migratetype(page))) { 308 !migrate_async_suitable(get_pageblock_migratetype(page))) {
308 low_pfn += pageblock_nr_pages; 309 low_pfn += pageblock_nr_pages;
309 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; 310 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
@@ -324,7 +325,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
324 continue; 325 continue;
325 } 326 }
326 327
327 if (!cc->sync) 328 if (cc->mode != COMPACT_SYNC)
328 mode |= ISOLATE_ASYNC_MIGRATE; 329 mode |= ISOLATE_ASYNC_MIGRATE;
329 330
330 /* Try isolate the page */ 331 /* Try isolate the page */
@@ -357,27 +358,90 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
357 358
358#endif /* CONFIG_COMPACTION || CONFIG_CMA */ 359#endif /* CONFIG_COMPACTION || CONFIG_CMA */
359#ifdef CONFIG_COMPACTION 360#ifdef CONFIG_COMPACTION
361/*
362 * Returns true if MIGRATE_UNMOVABLE pageblock was successfully
363 * converted to MIGRATE_MOVABLE type, false otherwise.
364 */
365static bool rescue_unmovable_pageblock(struct page *page)
366{
367 unsigned long pfn, start_pfn, end_pfn;
368 struct page *start_page, *end_page;
369
370 pfn = page_to_pfn(page);
371 start_pfn = pfn & ~(pageblock_nr_pages - 1);
372 end_pfn = start_pfn + pageblock_nr_pages;
373
374 start_page = pfn_to_page(start_pfn);
375 end_page = pfn_to_page(end_pfn);
376
377 /* Do not deal with pageblocks that overlap zones */
378 if (page_zone(start_page) != page_zone(end_page))
379 return false;
380
381 for (page = start_page, pfn = start_pfn; page < end_page; pfn++,
382 page++) {
383 if (!pfn_valid_within(pfn))
384 continue;
385
386 if (PageBuddy(page)) {
387 int order = page_order(page);
388
389 pfn += (1 << order) - 1;
390 page += (1 << order) - 1;
391
392 continue;
393 } else if (page_count(page) == 0 || PageLRU(page))
394 continue;
395
396 return false;
397 }
398
399 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
400 move_freepages_block(page_zone(page), page, MIGRATE_MOVABLE);
401 return true;
402}
360 403
361/* Returns true if the page is within a block suitable for migration to */ 404enum smt_result {
362static bool suitable_migration_target(struct page *page) 405 GOOD_AS_MIGRATION_TARGET,
406 FAIL_UNMOVABLE_TARGET,
407 FAIL_BAD_TARGET,
408};
409
410/*
411 * Returns GOOD_AS_MIGRATION_TARGET if the page is within a block
412 * suitable for migration to, FAIL_UNMOVABLE_TARGET if the page
413 * is within a MIGRATE_UNMOVABLE block, FAIL_BAD_TARGET otherwise.
414 */
415static enum smt_result suitable_migration_target(struct page *page,
416 struct compact_control *cc)
363{ 417{
364 418
365 int migratetype = get_pageblock_migratetype(page); 419 int migratetype = get_pageblock_migratetype(page);
366 420
367 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ 421 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
368 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) 422 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
369 return false; 423 return FAIL_BAD_TARGET;
370 424
371 /* If the page is a large free page, then allow migration */ 425 /* If the page is a large free page, then allow migration */
372 if (PageBuddy(page) && page_order(page) >= pageblock_order) 426 if (PageBuddy(page) && page_order(page) >= pageblock_order)
373 return true; 427 return GOOD_AS_MIGRATION_TARGET;
374 428
375 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ 429 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
376 if (migrate_async_suitable(migratetype)) 430 if (cc->mode != COMPACT_ASYNC_UNMOVABLE &&
377 return true; 431 migrate_async_suitable(migratetype))
432 return GOOD_AS_MIGRATION_TARGET;
433
434 if (cc->mode == COMPACT_ASYNC_MOVABLE &&
435 migratetype == MIGRATE_UNMOVABLE)
436 return FAIL_UNMOVABLE_TARGET;
437
438 if (cc->mode != COMPACT_ASYNC_MOVABLE &&
439 migratetype == MIGRATE_UNMOVABLE &&
440 rescue_unmovable_pageblock(page))
441 return GOOD_AS_MIGRATION_TARGET;
378 442
379 /* Otherwise skip the block */ 443 /* Otherwise skip the block */
380 return false; 444 return FAIL_BAD_TARGET;
381} 445}
382 446
383/* 447/*
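
rescue_unmovable_pageblock() above derives the boundaries of the containing pageblock
from an arbitrary pfn with the usual power-of-two mask. In isolation, assuming a
hypothetical pageblock_nr_pages of 512 (2 MiB pageblocks of 4 KiB pages):

#include <stdio.h>

int main(void)
{
	unsigned long pageblock_nr_pages = 512;	/* assumed 2 MiB / 4 KiB; must be a power of two */
	unsigned long pfn = 123456;

	/* Round down to the first pfn of the containing pageblock; the
	 * end boundary is exactly one pageblock later (exclusive). */
	unsigned long start_pfn = pfn & ~(pageblock_nr_pages - 1);
	unsigned long end_pfn = start_pfn + pageblock_nr_pages;

	printf("pfn %lu lives in pageblock [%lu, %lu)\n", pfn, start_pfn, end_pfn);
	return 0;
}
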
@@ -411,6 +475,13 @@ static void isolate_freepages(struct zone *zone,
411 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 475 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
412 476
413 /* 477 /*
478 * isolate_freepages() may be called more than once during
479 * compact_zone_order() run and we want only the most recent
480 * count.
481 */
482 cc->nr_pageblocks_skipped = 0;
483
484 /*
414 * Isolate free pages until enough are available to migrate the 485 * Isolate free pages until enough are available to migrate the
415 * pages on cc->migratepages. We stop searching if the migrate 486 * pages on cc->migratepages. We stop searching if the migrate
416 * and free page scanners meet or enough free pages are isolated. 487 * and free page scanners meet or enough free pages are isolated.
@@ -418,6 +489,7 @@ static void isolate_freepages(struct zone *zone,
418 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; 489 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
419 pfn -= pageblock_nr_pages) { 490 pfn -= pageblock_nr_pages) {
420 unsigned long isolated; 491 unsigned long isolated;
492 enum smt_result ret;
421 493
422 if (!pfn_valid(pfn)) 494 if (!pfn_valid(pfn))
423 continue; 495 continue;
@@ -434,9 +506,12 @@ static void isolate_freepages(struct zone *zone,
434 continue; 506 continue;
435 507
436 /* Check the block is suitable for migration */ 508 /* Check the block is suitable for migration */
437 if (!suitable_migration_target(page)) 509 ret = suitable_migration_target(page, cc);
510 if (ret != GOOD_AS_MIGRATION_TARGET) {
511 if (ret == FAIL_UNMOVABLE_TARGET)
512 cc->nr_pageblocks_skipped++;
438 continue; 513 continue;
439 514 }
440 /* 515 /*
441 * Found a block suitable for isolating free pages from. Now 516 * Found a block suitable for isolating free pages from. Now
442 * we disabled interrupts, double check things are ok and 517 * we disabled interrupts, double check things are ok and
@@ -445,12 +520,14 @@ static void isolate_freepages(struct zone *zone,
445 */ 520 */
446 isolated = 0; 521 isolated = 0;
447 spin_lock_irqsave(&zone->lock, flags); 522 spin_lock_irqsave(&zone->lock, flags);
448 if (suitable_migration_target(page)) { 523 ret = suitable_migration_target(page, cc);
524 if (ret == GOOD_AS_MIGRATION_TARGET) {
449 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); 525 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
450 isolated = isolate_freepages_block(pfn, end_pfn, 526 isolated = isolate_freepages_block(pfn, end_pfn,
451 freelist, false); 527 freelist, false);
452 nr_freepages += isolated; 528 nr_freepages += isolated;
453 } 529 } else if (ret == FAIL_UNMOVABLE_TARGET)
530 cc->nr_pageblocks_skipped++;
454 spin_unlock_irqrestore(&zone->lock, flags); 531 spin_unlock_irqrestore(&zone->lock, flags);
455 532
456 /* 533 /*
@@ -682,8 +759,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
682 759
683 nr_migrate = cc->nr_migratepages; 760 nr_migrate = cc->nr_migratepages;
684 err = migrate_pages(&cc->migratepages, compaction_alloc, 761 err = migrate_pages(&cc->migratepages, compaction_alloc,
685 (unsigned long)cc, false, 762 (unsigned long)&cc->freepages, false,
686 cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); 763 (cc->mode == COMPACT_SYNC) ? MIGRATE_SYNC_LIGHT
764 : MIGRATE_ASYNC);
687 update_nr_listpages(cc); 765 update_nr_listpages(cc);
688 nr_remaining = cc->nr_migratepages; 766 nr_remaining = cc->nr_migratepages;
689 767
@@ -712,7 +790,8 @@ out:
712 790
713static unsigned long compact_zone_order(struct zone *zone, 791static unsigned long compact_zone_order(struct zone *zone,
714 int order, gfp_t gfp_mask, 792 int order, gfp_t gfp_mask,
715 bool sync) 793 enum compact_mode mode,
794 unsigned long *nr_pageblocks_skipped)
716{ 795{
717 struct compact_control cc = { 796 struct compact_control cc = {
718 .nr_freepages = 0, 797 .nr_freepages = 0,
@@ -720,12 +799,17 @@ static unsigned long compact_zone_order(struct zone *zone,
720 .order = order, 799 .order = order,
721 .migratetype = allocflags_to_migratetype(gfp_mask), 800 .migratetype = allocflags_to_migratetype(gfp_mask),
722 .zone = zone, 801 .zone = zone,
723 .sync = sync, 802 .mode = mode,
724 }; 803 };
804 unsigned long rc;
805
725 INIT_LIST_HEAD(&cc.freepages); 806 INIT_LIST_HEAD(&cc.freepages);
726 INIT_LIST_HEAD(&cc.migratepages); 807 INIT_LIST_HEAD(&cc.migratepages);
727 808
728 return compact_zone(zone, &cc); 809 rc = compact_zone(zone, &cc);
810 *nr_pageblocks_skipped = cc.nr_pageblocks_skipped;
811
812 return rc;
729} 813}
730 814
731int sysctl_extfrag_threshold = 500; 815int sysctl_extfrag_threshold = 500;
@@ -750,6 +834,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
750 struct zoneref *z; 834 struct zoneref *z;
751 struct zone *zone; 835 struct zone *zone;
752 int rc = COMPACT_SKIPPED; 836 int rc = COMPACT_SKIPPED;
837 unsigned long nr_pageblocks_skipped;
838 enum compact_mode mode;
753 839
754 /* 840 /*
755 * Check whether it is worth even starting compaction. The order check is 841 * Check whether it is worth even starting compaction. The order check is
@@ -766,12 +852,22 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
766 nodemask) { 852 nodemask) {
767 int status; 853 int status;
768 854
769 status = compact_zone_order(zone, order, gfp_mask, sync); 855 mode = sync ? COMPACT_SYNC : COMPACT_ASYNC_MOVABLE;
856retry:
857 status = compact_zone_order(zone, order, gfp_mask, mode,
858 &nr_pageblocks_skipped);
770 rc = max(status, rc); 859 rc = max(status, rc);
771 860
772 /* If a normal allocation would succeed, stop compacting */ 861 /* If a normal allocation would succeed, stop compacting */
773 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) 862 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
774 break; 863 break;
864
865 if (rc == COMPACT_COMPLETE && mode == COMPACT_ASYNC_MOVABLE) {
866 if (nr_pageblocks_skipped) {
867 mode = COMPACT_ASYNC_UNMOVABLE;
868 goto retry;
869 }
870 }
775 } 871 }
776 872
777 return rc; 873 return rc;
@@ -805,7 +901,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
805 if (ok && cc->order > zone->compact_order_failed) 901 if (ok && cc->order > zone->compact_order_failed)
806 zone->compact_order_failed = cc->order + 1; 902 zone->compact_order_failed = cc->order + 1;
807 /* Currently async compaction is never deferred. */ 903 /* Currently async compaction is never deferred. */
808 else if (!ok && cc->sync) 904 else if (!ok && cc->mode == COMPACT_SYNC)
809 defer_compaction(zone, cc->order); 905 defer_compaction(zone, cc->order);
810 } 906 }
811 907
@@ -820,7 +916,7 @@ int compact_pgdat(pg_data_t *pgdat, int order)
820{ 916{
821 struct compact_control cc = { 917 struct compact_control cc = {
822 .order = order, 918 .order = order,
823 .sync = false, 919 .mode = COMPACT_ASYNC_MOVABLE,
824 }; 920 };
825 921
826 return __compact_pgdat(pgdat, &cc); 922 return __compact_pgdat(pgdat, &cc);
@@ -830,7 +926,7 @@ static int compact_node(int nid)
830{ 926{
831 struct compact_control cc = { 927 struct compact_control cc = {
832 .order = -1, 928 .order = -1,
833 .sync = true, 929 .mode = COMPACT_SYNC,
834 }; 930 };
835 931
836 return __compact_pgdat(NODE_DATA(nid), &cc); 932 return __compact_pgdat(NODE_DATA(nid), &cc);
diff --git a/mm/filemap.c b/mm/filemap.c
index 79c4b2b0b14e..64b48f934b89 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,7 +29,6 @@
29#include <linux/pagevec.h> 29#include <linux/pagevec.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/syscalls.h>
33#include <linux/cpuset.h> 32#include <linux/cpuset.h>
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 33#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 34#include <linux/memcontrol.h>
@@ -1478,44 +1477,6 @@ out:
1478} 1477}
1479EXPORT_SYMBOL(generic_file_aio_read); 1478EXPORT_SYMBOL(generic_file_aio_read);
1480 1479
1481static ssize_t
1482do_readahead(struct address_space *mapping, struct file *filp,
1483 pgoff_t index, unsigned long nr)
1484{
1485 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1486 return -EINVAL;
1487
1488 force_page_cache_readahead(mapping, filp, index, nr);
1489 return 0;
1490}
1491
1492SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
1493{
1494 ssize_t ret;
1495 struct file *file;
1496
1497 ret = -EBADF;
1498 file = fget(fd);
1499 if (file) {
1500 if (file->f_mode & FMODE_READ) {
1501 struct address_space *mapping = file->f_mapping;
1502 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1503 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1504 unsigned long len = end - start + 1;
1505 ret = do_readahead(mapping, file, start, len);
1506 }
1507 fput(file);
1508 }
1509 return ret;
1510}
1511#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
1512asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
1513{
1514 return SYSC_readahead((int) fd, offset, (size_t) count);
1515}
1516SYSCALL_ALIAS(sys_readahead, SyS_readahead);
1517#endif
1518
1519#ifdef CONFIG_MMU 1480#ifdef CONFIG_MMU
1520/** 1481/**
1521 * page_cache_read - adds requested page to the page cache if not already there 1482 * page_cache_read - adds requested page to the page cache if not already there
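
The block removed above is the kernel-side implementation of the readahead(2) system
call, which is taken out of filemap.c here (mm/readahead.c shows 40 changed lines in
the diffstat, consistent with the syscall moving there). From userspace the call is
unchanged; a minimal sketch, with a placeholder file name:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/example.dat", O_RDONLY);	/* hypothetical input file */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Ask the kernel to start populating the page cache for the first 1 MiB. */
	if (readahead(fd, 0, 1 << 20) != 0)
		perror("readahead");
	close(fd);
	return 0;
}
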
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f0e5306eeb55..d0def42c121b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -636,16 +636,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
636 unsigned long haddr, pmd_t *pmd, 636 unsigned long haddr, pmd_t *pmd,
637 struct page *page) 637 struct page *page)
638{ 638{
639 int ret = 0;
640 pgtable_t pgtable; 639 pgtable_t pgtable;
641 640
642 VM_BUG_ON(!PageCompound(page)); 641 VM_BUG_ON(!PageCompound(page));
643 pgtable = pte_alloc_one(mm, haddr); 642 pgtable = pte_alloc_one(mm, haddr);
644 if (unlikely(!pgtable)) { 643 if (unlikely(!pgtable))
645 mem_cgroup_uncharge_page(page);
646 put_page(page);
647 return VM_FAULT_OOM; 644 return VM_FAULT_OOM;
648 }
649 645
650 clear_huge_page(page, haddr, HPAGE_PMD_NR); 646 clear_huge_page(page, haddr, HPAGE_PMD_NR);
651 __SetPageUptodate(page); 647 __SetPageUptodate(page);
@@ -675,7 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
675 spin_unlock(&mm->page_table_lock); 671 spin_unlock(&mm->page_table_lock);
676 } 672 }
677 673
678 return ret; 674 return 0;
679} 675}
680 676
681static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) 677static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
@@ -724,8 +720,14 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
724 put_page(page); 720 put_page(page);
725 goto out; 721 goto out;
726 } 722 }
723 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
724 page))) {
725 mem_cgroup_uncharge_page(page);
726 put_page(page);
727 goto out;
728 }
727 729
728 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); 730 return 0;
729 } 731 }
730out: 732out:
731 /* 733 /*
@@ -950,6 +952,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
950 count_vm_event(THP_FAULT_FALLBACK); 952 count_vm_event(THP_FAULT_FALLBACK);
951 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 953 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
952 pmd, orig_pmd, page, haddr); 954 pmd, orig_pmd, page, haddr);
955 if (ret & VM_FAULT_OOM)
956 split_huge_page(page);
953 put_page(page); 957 put_page(page);
954 goto out; 958 goto out;
955 } 959 }
@@ -957,6 +961,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
957 961
958 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 962 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
959 put_page(new_page); 963 put_page(new_page);
964 split_huge_page(page);
960 put_page(page); 965 put_page(page);
961 ret |= VM_FAULT_OOM; 966 ret |= VM_FAULT_OOM;
962 goto out; 967 goto out;
@@ -968,8 +973,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
968 spin_lock(&mm->page_table_lock); 973 spin_lock(&mm->page_table_lock);
969 put_page(page); 974 put_page(page);
970 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 975 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
976 spin_unlock(&mm->page_table_lock);
971 mem_cgroup_uncharge_page(new_page); 977 mem_cgroup_uncharge_page(new_page);
972 put_page(new_page); 978 put_page(new_page);
979 goto out;
973 } else { 980 } else {
974 pmd_t entry; 981 pmd_t entry;
975 VM_BUG_ON(!PageHead(page)); 982 VM_BUG_ON(!PageHead(page));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4e28416c47fb..285a81e87ec8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -273,8 +273,8 @@ static long region_count(struct list_head *head, long f, long t)
273 273
274 /* Locate each segment we overlap with, and count that overlap. */ 274 /* Locate each segment we overlap with, and count that overlap. */
275 list_for_each_entry(rg, head, link) { 275 list_for_each_entry(rg, head, link) {
276 int seg_from; 276 long seg_from;
277 int seg_to; 277 long seg_to;
278 278
279 if (rg->to <= f) 279 if (rg->to <= f)
280 continue; 280 continue;
@@ -2157,6 +2157,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2157 kref_get(&reservations->refs); 2157 kref_get(&reservations->refs);
2158} 2158}
2159 2159
2160static void resv_map_put(struct vm_area_struct *vma)
2161{
2162 struct resv_map *reservations = vma_resv_map(vma);
2163
2164 if (!reservations)
2165 return;
2166 kref_put(&reservations->refs, resv_map_release);
2167}
2168
2160static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2169static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2161{ 2170{
2162 struct hstate *h = hstate_vma(vma); 2171 struct hstate *h = hstate_vma(vma);
@@ -2173,7 +2182,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2173 reserve = (end - start) - 2182 reserve = (end - start) -
2174 region_count(&reservations->regions, start, end); 2183 region_count(&reservations->regions, start, end);
2175 2184
2176 kref_put(&reservations->refs, resv_map_release); 2185 resv_map_put(vma);
2177 2186
2178 if (reserve) { 2187 if (reserve) {
2179 hugetlb_acct_memory(h, -reserve); 2188 hugetlb_acct_memory(h, -reserve);
@@ -2991,12 +3000,16 @@ int hugetlb_reserve_pages(struct inode *inode,
2991 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 3000 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2992 } 3001 }
2993 3002
2994 if (chg < 0) 3003 if (chg < 0) {
2995 return chg; 3004 ret = chg;
3005 goto out_err;
3006 }
2996 3007
2997 /* There must be enough pages in the subpool for the mapping */ 3008 /* There must be enough pages in the subpool for the mapping */
2998 if (hugepage_subpool_get_pages(spool, chg)) 3009 if (hugepage_subpool_get_pages(spool, chg)) {
2999 return -ENOSPC; 3010 ret = -ENOSPC;
3011 goto out_err;
3012 }
3000 3013
3001 /* 3014 /*
3002 * Check enough hugepages are available for the reservation. 3015 * Check enough hugepages are available for the reservation.
@@ -3005,7 +3018,7 @@ int hugetlb_reserve_pages(struct inode *inode,
3005 ret = hugetlb_acct_memory(h, chg); 3018 ret = hugetlb_acct_memory(h, chg);
3006 if (ret < 0) { 3019 if (ret < 0) {
3007 hugepage_subpool_put_pages(spool, chg); 3020 hugepage_subpool_put_pages(spool, chg);
3008 return ret; 3021 goto out_err;
3009 } 3022 }
3010 3023
3011 /* 3024 /*
@@ -3022,6 +3035,9 @@ int hugetlb_reserve_pages(struct inode *inode,
3022 if (!vma || vma->vm_flags & VM_MAYSHARE) 3035 if (!vma || vma->vm_flags & VM_MAYSHARE)
3023 region_add(&inode->i_mapping->private_list, from, to); 3036 region_add(&inode->i_mapping->private_list, from, to);
3024 return 0; 3037 return 0;
3038out_err:
3039 resv_map_put(vma);
3040 return ret;
3025} 3041}
3026 3042
3027void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 3043void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
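
The hugetlb_reserve_pages() change above converts several early returns into a single
out_err label that drops the reservation map reference via the new resv_map_put()
helper. The same single-exit cleanup shape in a self-contained userspace form; the path
and buffer size below are arbitrary.

#include <stdio.h>
#include <stdlib.h>

static int process_file(const char *path)
{
	FILE *f;
	char *buf = NULL;
	int ret = -1;

	f = fopen(path, "r");			/* resource acquired up front */
	if (!f)
		return -1;

	buf = malloc(4096);
	if (!buf)
		goto out_err;			/* every failure funnels here */

	if (fread(buf, 1, 4096, f) == 0 && ferror(f))
		goto out_err;

	free(buf);
	fclose(f);
	return 0;

out_err:
	free(buf);				/* free(NULL) is a no-op */
	fclose(f);				/* single place to release what was taken */
	return ret;
}

int main(void)
{
	return process_file("/etc/hostname") ? 1 : 0;	/* hypothetical input */
}
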
diff --git a/mm/internal.h b/mm/internal.h
index aee4761cf9a9..4194ab9dc19b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -94,6 +94,9 @@ extern void putback_lru_page(struct page *page);
94/* 94/*
95 * in mm/page_alloc.c 95 * in mm/page_alloc.c
96 */ 96 */
97extern void set_pageblock_migratetype(struct page *page, int migratetype);
98extern int move_freepages_block(struct zone *zone, struct page *page,
99 int migratetype);
97extern void __free_pages_bootmem(struct page *page, unsigned int order); 100extern void __free_pages_bootmem(struct page *page, unsigned int order);
98extern void prep_compound_page(struct page *page, unsigned long order); 101extern void prep_compound_page(struct page *page, unsigned long order);
99#ifdef CONFIG_MEMORY_FAILURE 102#ifdef CONFIG_MEMORY_FAILURE
@@ -101,6 +104,7 @@ extern bool is_free_buddy_page(struct page *page);
101#endif 104#endif
102 105
103#if defined CONFIG_COMPACTION || defined CONFIG_CMA 106#if defined CONFIG_COMPACTION || defined CONFIG_CMA
107#include <linux/compaction.h>
104 108
105/* 109/*
106 * in mm/compaction.c 110 * in mm/compaction.c
@@ -119,11 +123,14 @@ struct compact_control {
119 unsigned long nr_migratepages; /* Number of pages to migrate */ 123 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */ 124 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long migrate_pfn; /* isolate_migratepages search base */ 125 unsigned long migrate_pfn; /* isolate_migratepages search base */
122 bool sync; /* Synchronous migration */ 126 enum compact_mode mode; /* Compaction mode */
123 127
124 int order; /* order a direct compactor needs */ 128 int order; /* order a direct compactor needs */
125 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 129 int migratetype; /* MOVABLE, RECLAIMABLE etc */
126 struct zone *zone; 130 struct zone *zone;
131
132 /* Number of UNMOVABLE destination pageblocks skipped during scan */
133 unsigned long nr_pageblocks_skipped;
127}; 134};
128 135
129unsigned long 136unsigned long
@@ -164,7 +171,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
164 * to determine if it's being mapped into a LOCKED vma. 171 * to determine if it's being mapped into a LOCKED vma.
165 * If so, mark page as mlocked. 172 * If so, mark page as mlocked.
166 */ 173 */
167static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) 174static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
175 struct page *page)
168{ 176{
169 VM_BUG_ON(PageLRU(page)); 177 VM_BUG_ON(PageLRU(page));
170 178
@@ -222,7 +230,7 @@ extern unsigned long vma_address(struct page *page,
222 struct vm_area_struct *vma); 230 struct vm_area_struct *vma);
223#endif 231#endif
224#else /* !CONFIG_MMU */ 232#else /* !CONFIG_MMU */
225static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 233static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
226{ 234{
227 return 0; 235 return 0;
228} 236}
diff --git a/mm/madvise.c b/mm/madvise.c
index 1ccbba5b6674..deff1b64a08c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,8 +11,10 @@
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/page-isolation.h> 12#include <linux/page-isolation.h>
13#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
14#include <linux/falloc.h>
14#include <linux/sched.h> 15#include <linux/sched.h>
15#include <linux/ksm.h> 16#include <linux/ksm.h>
17#include <linux/fs.h>
16 18
17/* 19/*
18 * Any behaviour which results in changes to the vma->vm_flags needs to 20 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -200,8 +202,7 @@ static long madvise_remove(struct vm_area_struct *vma,
200 struct vm_area_struct **prev, 202 struct vm_area_struct **prev,
201 unsigned long start, unsigned long end) 203 unsigned long start, unsigned long end)
202{ 204{
203 struct address_space *mapping; 205 loff_t offset;
204 loff_t offset, endoff;
205 int error; 206 int error;
206 207
207 *prev = NULL; /* tell sys_madvise we drop mmap_sem */ 208 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
@@ -217,16 +218,14 @@ static long madvise_remove(struct vm_area_struct *vma,
217 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) 218 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
218 return -EACCES; 219 return -EACCES;
219 220
220 mapping = vma->vm_file->f_mapping;
221
222 offset = (loff_t)(start - vma->vm_start) 221 offset = (loff_t)(start - vma->vm_start)
223 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 222 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
224 endoff = (loff_t)(end - vma->vm_start - 1)
225 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
226 223
227 /* vmtruncate_range needs to take i_mutex */ 224 /* filesystem's fallocate may need to take i_mutex */
228 up_read(&current->mm->mmap_sem); 225 up_read(&current->mm->mmap_sem);
229 error = vmtruncate_range(mapping->host, offset, endoff); 226 error = do_fallocate(vma->vm_file,
227 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
228 offset, end - start);
230 down_read(&current->mm->mmap_sem); 229 down_read(&current->mm->mmap_sem);
231 return error; 230 return error;
232} 231}
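
madvise_remove() now punches a hole through do_fallocate() instead of
vmtruncate_range(), so MADV_REMOVE issues roughly the same request a filesystem already
understands for FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE. The userspace counterpart,
as a small sketch with an arbitrary file name and offsets:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/sparse.dat", O_RDWR);	/* hypothetical file */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Punch a 1 MiB hole at offset 4 MiB without changing the file size,
	 * the same request madvise_remove() now issues internally. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      4 << 20, 1 << 20) != 0)
		perror("fallocate");
	close(fd);
	return 0;
}
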
diff --git a/mm/memblock.c b/mm/memblock.c
index a44eab3157f8..952123eba433 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -37,6 +37,8 @@ struct memblock memblock __initdata_memblock = {
37 37
38int memblock_debug __initdata_memblock; 38int memblock_debug __initdata_memblock;
39static int memblock_can_resize __initdata_memblock; 39static int memblock_can_resize __initdata_memblock;
40static int memblock_memory_in_slab __initdata_memblock = 0;
41static int memblock_reserved_in_slab __initdata_memblock = 0;
40 42
41/* inline so we don't get a warning when pr_debug is compiled out */ 43/* inline so we don't get a warning when pr_debug is compiled out */
42static inline const char *memblock_type_name(struct memblock_type *type) 44static inline const char *memblock_type_name(struct memblock_type *type)
@@ -187,6 +189,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
187 struct memblock_region *new_array, *old_array; 189 struct memblock_region *new_array, *old_array;
188 phys_addr_t old_size, new_size, addr; 190 phys_addr_t old_size, new_size, addr;
189 int use_slab = slab_is_available(); 191 int use_slab = slab_is_available();
192 int *in_slab;
190 193
191 /* We don't allow resizing until we know about the reserved regions 194 /* We don't allow resizing until we know about the reserved regions
192 * of memory that aren't suitable for allocation 195 * of memory that aren't suitable for allocation
@@ -198,6 +201,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
198 old_size = type->max * sizeof(struct memblock_region); 201 old_size = type->max * sizeof(struct memblock_region);
199 new_size = old_size << 1; 202 new_size = old_size << 1;
200 203
204 /* Retrieve the slab flag */
205 if (type == &memblock.memory)
206 in_slab = &memblock_memory_in_slab;
207 else
208 in_slab = &memblock_reserved_in_slab;
209
201 /* Try to find some space for it. 210 /* Try to find some space for it.
202 * 211 *
203 * WARNING: We assume that either slab_is_available() and we use it or 212 * WARNING: We assume that either slab_is_available() and we use it or
@@ -212,14 +221,15 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
212 if (use_slab) { 221 if (use_slab) {
213 new_array = kmalloc(new_size, GFP_KERNEL); 222 new_array = kmalloc(new_size, GFP_KERNEL);
214 addr = new_array ? __pa(new_array) : 0; 223 addr = new_array ? __pa(new_array) : 0;
215 } else 224 } else {
216 addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); 225 addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t));
226 new_array = addr ? __va(addr) : 0;
227 }
217 if (!addr) { 228 if (!addr) {
218 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", 229 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
219 memblock_type_name(type), type->max, type->max * 2); 230 memblock_type_name(type), type->max, type->max * 2);
220 return -1; 231 return -1;
221 } 232 }
222 new_array = __va(addr);
223 233
224 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", 234 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
225 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); 235 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
@@ -234,22 +244,24 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
234 type->regions = new_array; 244 type->regions = new_array;
235 type->max <<= 1; 245 type->max <<= 1;
236 246
237 /* If we use SLAB that's it, we are done */ 247 /* Free old array. We needn't free it if the array is the
238 if (use_slab) 248 * static one
239 return 0;
240
241 /* Add the new reserved region now. Should not fail ! */
242 BUG_ON(memblock_reserve(addr, new_size));
243
244 /* If the array wasn't our static init one, then free it. We only do
245 * that before SLAB is available as later on, we don't know whether
246 * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
247 * anyways
248 */ 249 */
249 if (old_array != memblock_memory_init_regions && 250 if (*in_slab)
250 old_array != memblock_reserved_init_regions) 251 kfree(old_array);
252 else if (old_array != memblock_memory_init_regions &&
253 old_array != memblock_reserved_init_regions)
251 memblock_free(__pa(old_array), old_size); 254 memblock_free(__pa(old_array), old_size);
252 255
256 /* Reserve the new array if that comes from the memblock.
257 * Otherwise, we needn't do it
258 */
259 if (!use_slab)
260 BUG_ON(memblock_reserve(addr, new_size));
261
262 /* Update slab flag */
263 *in_slab = use_slab;
264
253 return 0; 265 return 0;
254} 266}
255 267
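
memblock_double_array() now remembers, per array, whether the current backing store
came from the slab allocator so the old copy can be released with the matching free
routine, and the very first, statically allocated array is never freed. A userspace
sketch of the same bookkeeping, with a static initial array standing in for
memblock_memory_init_regions:

#include <stdlib.h>
#include <string.h>

static long init_regions[4];		/* plays the role of the static init array */
static long *regions = init_regions;
static size_t max_regions = 4;
static int regions_on_heap;		/* analogous to memblock_*_in_slab */

static int double_array(void)
{
	long *new_array = malloc(2 * max_regions * sizeof(*regions));

	if (!new_array)
		return -1;

	memcpy(new_array, regions, max_regions * sizeof(*regions));
	if (regions_on_heap)
		free(regions);		/* safe: the old copy came from malloc() */
	regions = new_array;
	max_regions *= 2;
	regions_on_heap = 1;		/* from now on the array lives on the heap */
	return 0;
}

int main(void)
{
	return double_array() ? 1 : 0;
}
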
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f342778a0c0a..00c8898dbb81 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -138,7 +138,6 @@ struct mem_cgroup_per_zone {
138 138
139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
140 140
141 struct zone_reclaim_stat reclaim_stat;
142 struct rb_node tree_node; /* RB tree node */ 141 struct rb_node tree_node; /* RB tree node */
143 unsigned long long usage_in_excess;/* Set to the value by which */ 142 unsigned long long usage_in_excess;/* Set to the value by which */
144 /* the soft limit is exceeded*/ 143 /* the soft limit is exceeded*/
@@ -1149,15 +1148,25 @@ struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
1149 * Checks whether given mem is same or in the root_mem_cgroup's 1148 * Checks whether given mem is same or in the root_mem_cgroup's
1150 * hierarchy subtree 1149 * hierarchy subtree
1151 */ 1150 */
1151bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1152 struct mem_cgroup *memcg)
1153{
1154 if (root_memcg == memcg)
1155 return true;
1156 if (!root_memcg->use_hierarchy)
1157 return false;
1158 return css_is_ancestor(&memcg->css, &root_memcg->css);
1159}
1160
1152static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1161static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1153 struct mem_cgroup *memcg) 1162 struct mem_cgroup *memcg)
1154{ 1163{
1155 if (root_memcg != memcg) { 1164 bool ret;
1156 return (root_memcg->use_hierarchy &&
1157 css_is_ancestor(&memcg->css, &root_memcg->css));
1158 }
1159 1165
1160 return true; 1166 rcu_read_lock();
1167 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1168 rcu_read_unlock();
1169 return ret;
1161} 1170}
1162 1171
1163int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) 1172int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
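
The hunk above splits the hierarchy check into a lockless
__mem_cgroup_same_or_subtree(), for callers already inside an RCU read-side section,
and a wrapper that takes rcu_read_lock() around it. A minimal sketch of that
double-underscore convention with a made-up structure; the __rcu parent pointer is an
assumption of the example, not something taken from this patch.

#include <linux/rcupdate.h>
#include <linux/types.h>

struct node {
	struct node __rcu *parent;	/* published/updated under RCU */
};

/* Lockless variant: the caller must already hold the RCU read lock. */
static bool __node_is_descendant(struct node *root, struct node *n)
{
	for (; n; n = rcu_dereference(n->parent))
		if (n == root)
			return true;
	return false;
}

/* Convenience wrapper that supplies the read-side critical section. */
static bool node_is_descendant(struct node *root, struct node *n)
{
	bool ret;

	rcu_read_lock();
	ret = __node_is_descendant(root, n);
	rcu_read_unlock();
	return ret;
}
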
@@ -1233,16 +1242,6 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
1233 return (active > inactive); 1242 return (active > inactive);
1234} 1243}
1235 1244
1236struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1237 struct zone *zone)
1238{
1239 int nid = zone_to_nid(zone);
1240 int zid = zone_idx(zone);
1241 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1242
1243 return &mz->reclaim_stat;
1244}
1245
1246struct zone_reclaim_stat * 1245struct zone_reclaim_stat *
1247mem_cgroup_get_reclaim_stat_from_page(struct page *page) 1246mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1248{ 1247{
@@ -1258,7 +1257,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1258 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1257 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1259 smp_rmb(); 1258 smp_rmb();
1260 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1259 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1261 return &mz->reclaim_stat; 1260 return &mz->lruvec.reclaim_stat;
1262} 1261}
1263 1262
1264#define mem_cgroup_from_res_counter(counter, member) \ 1263#define mem_cgroup_from_res_counter(counter, member) \
@@ -2845,24 +2844,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2845 */ 2844 */
2846 if (do_swap_account && PageSwapCache(page)) { 2845 if (do_swap_account && PageSwapCache(page)) {
2847 swp_entry_t ent = {.val = page_private(page)}; 2846 swp_entry_t ent = {.val = page_private(page)};
2848 struct mem_cgroup *swap_memcg; 2847 mem_cgroup_uncharge_swap(ent);
2849 unsigned short id;
2850
2851 id = swap_cgroup_record(ent, 0);
2852 rcu_read_lock();
2853 swap_memcg = mem_cgroup_lookup(id);
2854 if (swap_memcg) {
2855 /*
2856 * This recorded memcg can be obsolete one. So, avoid
2857 * calling css_tryget
2858 */
2859 if (!mem_cgroup_is_root(swap_memcg))
2860 res_counter_uncharge(&swap_memcg->memsw,
2861 PAGE_SIZE);
2862 mem_cgroup_swap_statistics(swap_memcg, false);
2863 mem_cgroup_put(swap_memcg);
2864 }
2865 rcu_read_unlock();
2866 } 2848 }
2867 /* 2849 /*
2868 * At swapin, we may charge account against cgroup which has no tasks. 2850 * At swapin, we may charge account against cgroup which has no tasks.
@@ -3155,7 +3137,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
3155 * @entry: swap entry to be moved 3137 * @entry: swap entry to be moved
3156 * @from: mem_cgroup which the entry is moved from 3138 * @from: mem_cgroup which the entry is moved from
3157 * @to: mem_cgroup which the entry is moved to 3139 * @to: mem_cgroup which the entry is moved to
3158 * @need_fixup: whether we should fixup res_counters and refcounts.
3159 * 3140 *
3160 * It succeeds only when the swap_cgroup's record for this entry is the same 3141 * It succeeds only when the swap_cgroup's record for this entry is the same
3161 * as the mem_cgroup's id of @from. 3142 * as the mem_cgroup's id of @from.
@@ -3166,7 +3147,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
3166 * both res and memsw, and called css_get(). 3147 * both res and memsw, and called css_get().
3167 */ 3148 */
3168static int mem_cgroup_move_swap_account(swp_entry_t entry, 3149static int mem_cgroup_move_swap_account(swp_entry_t entry,
3169 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3150 struct mem_cgroup *from, struct mem_cgroup *to)
3170{ 3151{
3171 unsigned short old_id, new_id; 3152 unsigned short old_id, new_id;
3172 3153
@@ -3185,24 +3166,13 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
3185 * swap-in, the refcount of @to might be decreased to 0. 3166 * swap-in, the refcount of @to might be decreased to 0.
3186 */ 3167 */
3187 mem_cgroup_get(to); 3168 mem_cgroup_get(to);
3188 if (need_fixup) {
3189 if (!mem_cgroup_is_root(from))
3190 res_counter_uncharge(&from->memsw, PAGE_SIZE);
3191 mem_cgroup_put(from);
3192 /*
3193 * we charged both to->res and to->memsw, so we should
3194 * uncharge to->res.
3195 */
3196 if (!mem_cgroup_is_root(to))
3197 res_counter_uncharge(&to->res, PAGE_SIZE);
3198 }
3199 return 0; 3169 return 0;
3200 } 3170 }
3201 return -EINVAL; 3171 return -EINVAL;
3202} 3172}
3203#else 3173#else
3204static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3174static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3205 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3175 struct mem_cgroup *from, struct mem_cgroup *to)
3206{ 3176{
3207 return -EINVAL; 3177 return -EINVAL;
3208} 3178}
@@ -3363,7 +3333,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3363void mem_cgroup_replace_page_cache(struct page *oldpage, 3333void mem_cgroup_replace_page_cache(struct page *oldpage,
3364 struct page *newpage) 3334 struct page *newpage)
3365{ 3335{
3366 struct mem_cgroup *memcg; 3336 struct mem_cgroup *memcg = NULL;
3367 struct page_cgroup *pc; 3337 struct page_cgroup *pc;
3368 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 3338 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3369 3339
@@ -3373,11 +3343,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3373 pc = lookup_page_cgroup(oldpage); 3343 pc = lookup_page_cgroup(oldpage);
3374 /* fix accounting on old pages */ 3344 /* fix accounting on old pages */
3375 lock_page_cgroup(pc); 3345 lock_page_cgroup(pc);
3376 memcg = pc->mem_cgroup; 3346 if (PageCgroupUsed(pc)) {
3377 mem_cgroup_charge_statistics(memcg, false, -1); 3347 memcg = pc->mem_cgroup;
3378 ClearPageCgroupUsed(pc); 3348 mem_cgroup_charge_statistics(memcg, false, -1);
3349 ClearPageCgroupUsed(pc);
3350 }
3379 unlock_page_cgroup(pc); 3351 unlock_page_cgroup(pc);
3380 3352
3353 /*
3354 * When called from shmem_replace_page(), in some cases the
3355 * oldpage has already been charged, and in some cases not.
3356 */
3357 if (!memcg)
3358 return;
3359
3381 if (PageSwapBacked(oldpage)) 3360 if (PageSwapBacked(oldpage))
3382 type = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3361 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3383 3362
@@ -4226,21 +4205,19 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4226 { 4205 {
4227 int nid, zid; 4206 int nid, zid;
4228 struct mem_cgroup_per_zone *mz; 4207 struct mem_cgroup_per_zone *mz;
4208 struct zone_reclaim_stat *rstat;
4229 unsigned long recent_rotated[2] = {0, 0}; 4209 unsigned long recent_rotated[2] = {0, 0};
4230 unsigned long recent_scanned[2] = {0, 0}; 4210 unsigned long recent_scanned[2] = {0, 0};
4231 4211
4232 for_each_online_node(nid) 4212 for_each_online_node(nid)
4233 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4213 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4234 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 4214 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
4215 rstat = &mz->lruvec.reclaim_stat;
4235 4216
4236 recent_rotated[0] += 4217 recent_rotated[0] += rstat->recent_rotated[0];
4237 mz->reclaim_stat.recent_rotated[0]; 4218 recent_rotated[1] += rstat->recent_rotated[1];
4238 recent_rotated[1] += 4219 recent_scanned[0] += rstat->recent_scanned[0];
4239 mz->reclaim_stat.recent_rotated[1]; 4220 recent_scanned[1] += rstat->recent_scanned[1];
4240 recent_scanned[0] +=
4241 mz->reclaim_stat.recent_scanned[0];
4242 recent_scanned[1] +=
4243 mz->reclaim_stat.recent_scanned[1];
4244 } 4221 }
4245 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 4222 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
4246 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 4223 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
@@ -5135,7 +5112,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5135 return NULL; 5112 return NULL;
5136 if (PageAnon(page)) { 5113 if (PageAnon(page)) {
5137 /* we don't move shared anon */ 5114 /* we don't move shared anon */
5138 if (!move_anon() || page_mapcount(page) > 2) 5115 if (!move_anon())
5139 return NULL; 5116 return NULL;
5140 } else if (!move_file()) 5117 } else if (!move_file())
5141 /* we ignore mapcount for file pages */ 5118 /* we ignore mapcount for file pages */
@@ -5146,26 +5123,32 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5146 return page; 5123 return page;
5147} 5124}
5148 5125
5126#ifdef CONFIG_SWAP
5149static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5127static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5150 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5128 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5151{ 5129{
5152 int usage_count;
5153 struct page *page = NULL; 5130 struct page *page = NULL;
5154 swp_entry_t ent = pte_to_swp_entry(ptent); 5131 swp_entry_t ent = pte_to_swp_entry(ptent);
5155 5132
5156 if (!move_anon() || non_swap_entry(ent)) 5133 if (!move_anon() || non_swap_entry(ent))
5157 return NULL; 5134 return NULL;
5158 usage_count = mem_cgroup_count_swap_user(ent, &page); 5135 /*
5159 if (usage_count > 1) { /* we don't move shared anon */ 5136 * Because lookup_swap_cache() updates some statistics counter,
5160 if (page) 5137 * we call find_get_page() with swapper_space directly.
5161 put_page(page); 5138 */
5162 return NULL; 5139 page = find_get_page(&swapper_space, ent.val);
5163 }
5164 if (do_swap_account) 5140 if (do_swap_account)
5165 entry->val = ent.val; 5141 entry->val = ent.val;
5166 5142
5167 return page; 5143 return page;
5168} 5144}
5145#else
5146static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5147 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5148{
5149 return NULL;
5150}
5151#endif
5169 5152
5170static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5153static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5171 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5154 unsigned long addr, pte_t ptent, swp_entry_t *entry)
@@ -5521,8 +5504,7 @@ put: /* get_mctgt_type() gets the page */
5521 break; 5504 break;
5522 case MC_TARGET_SWAP: 5505 case MC_TARGET_SWAP:
5523 ent = target.ent; 5506 ent = target.ent;
5524 if (!mem_cgroup_move_swap_account(ent, 5507 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5525 mc.from, mc.to, false)) {
5526 mc.precharge--; 5508 mc.precharge--;
5527 /* we fixup refcnts and charges later. */ 5509 /* we fixup refcnts and charges later. */
5528 mc.moved_swap++; 5510 mc.moved_swap++;
@@ -5598,7 +5580,6 @@ static void mem_cgroup_move_task(struct cgroup *cont,
5598 if (mm) { 5580 if (mm) {
5599 if (mc.to) 5581 if (mc.to)
5600 mem_cgroup_move_charge(mm); 5582 mem_cgroup_move_charge(mm);
5601 put_swap_token(mm);
5602 mmput(mm); 5583 mmput(mm);
5603 } 5584 }
5604 if (mc.to) 5585 if (mc.to)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c99ad4e6b88c..ab1e7145e290 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1388,16 +1388,16 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1388 */ 1388 */
1389 if (!get_page_unless_zero(compound_head(p))) { 1389 if (!get_page_unless_zero(compound_head(p))) {
1390 if (PageHuge(p)) { 1390 if (PageHuge(p)) {
1391 pr_info("get_any_page: %#lx free huge page\n", pfn); 1391 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1392 ret = dequeue_hwpoisoned_huge_page(compound_head(p)); 1392 ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1393 } else if (is_free_buddy_page(p)) { 1393 } else if (is_free_buddy_page(p)) {
1394 pr_info("get_any_page: %#lx free buddy page\n", pfn); 1394 pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1395 /* Set hwpoison bit while page is still isolated */ 1395 /* Set hwpoison bit while page is still isolated */
1396 SetPageHWPoison(p); 1396 SetPageHWPoison(p);
1397 ret = 0; 1397 ret = 0;
1398 } else { 1398 } else {
1399 pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1399 pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1400 pfn, p->flags); 1400 __func__, pfn, p->flags);
1401 ret = -EIO; 1401 ret = -EIO;
1402 } 1402 }
1403 } else { 1403 } else {
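The memory-failure.c hunk above replaces hard-coded function names in the pr_info() messages with the compiler-provided __func__ identifier, so the log prefix can never drift out of sync with the function name. A minimal userspace model of the same idiom, with printf() standing in for pr_info() (which only exists in the kernel):

#include <stdio.h>

static int get_any_page_demo(unsigned long pfn)
{
        /* __func__ expands to "get_any_page_demo"; renaming the function
         * automatically updates every message that prints it. */
        printf("%s: %#lx free buddy page\n", __func__, pfn);
        return 0;
}

int main(void)
{
        return get_any_page_demo(0x12345UL);
}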
diff --git a/mm/memory.c b/mm/memory.c
index e40f6759ba98..1b7dc662bf9f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2908,7 +2908,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2908 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2908 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2909 page = lookup_swap_cache(entry); 2909 page = lookup_swap_cache(entry);
2910 if (!page) { 2910 if (!page) {
2911 grab_swap_token(mm); /* Contend for token _before_ read-in */
2912 page = swapin_readahead(entry, 2911 page = swapin_readahead(entry,
2913 GFP_HIGHUSER_MOVABLE, vma, address); 2912 GFP_HIGHUSER_MOVABLE, vma, address);
2914 if (!page) { 2913 if (!page) {
@@ -2938,6 +2937,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2938 } 2937 }
2939 2938
2940 locked = lock_page_or_retry(page, mm, flags); 2939 locked = lock_page_or_retry(page, mm, flags);
2940
2941 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2941 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2942 if (!locked) { 2942 if (!locked) {
2943 ret |= VM_FAULT_RETRY; 2943 ret |= VM_FAULT_RETRY;
@@ -3486,6 +3486,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3486 if (unlikely(is_vm_hugetlb_page(vma))) 3486 if (unlikely(is_vm_hugetlb_page(vma)))
3487 return hugetlb_fault(mm, vma, address, flags); 3487 return hugetlb_fault(mm, vma, address, flags);
3488 3488
3489retry:
3489 pgd = pgd_offset(mm, address); 3490 pgd = pgd_offset(mm, address);
3490 pud = pud_alloc(mm, pgd, address); 3491 pud = pud_alloc(mm, pgd, address);
3491 if (!pud) 3492 if (!pud)
@@ -3499,13 +3500,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3499 pmd, flags); 3500 pmd, flags);
3500 } else { 3501 } else {
3501 pmd_t orig_pmd = *pmd; 3502 pmd_t orig_pmd = *pmd;
3503 int ret;
3504
3502 barrier(); 3505 barrier();
3503 if (pmd_trans_huge(orig_pmd)) { 3506 if (pmd_trans_huge(orig_pmd)) {
3504 if (flags & FAULT_FLAG_WRITE && 3507 if (flags & FAULT_FLAG_WRITE &&
3505 !pmd_write(orig_pmd) && 3508 !pmd_write(orig_pmd) &&
3506 !pmd_trans_splitting(orig_pmd)) 3509 !pmd_trans_splitting(orig_pmd)) {
3507 return do_huge_pmd_wp_page(mm, vma, address, 3510 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3508 pmd, orig_pmd); 3511 orig_pmd);
3512 /*
3513 * If COW results in an oom, the huge pmd will
3514 * have been split, so retry the fault on the
3515 * pte for a smaller charge.
3516 */
3517 if (unlikely(ret & VM_FAULT_OOM))
3518 goto retry;
3519 return ret;
3520 }
3509 return 0; 3521 return 0;
3510 } 3522 }
3511 } 3523 }
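The handle_mm_fault() change above adds a retry label so that a transparent-huge-page copy-on-write which fails with VM_FAULT_OOM is retried against the now-split ptes instead of being reported to the caller. A rough, self-contained model of that control flow; the names and return codes below are stand-ins, not the kernel's:

#include <stdio.h>

#define FAULT_OK        0
#define FAULT_OOM       1

/* Pretend the 2MB charge cannot be met: split the mapping, report OOM. */
static int huge_cow(int *split)
{
        *split = 1;
        return FAULT_OOM;
}

/* After the split, the per-page path only needs a 4KB charge. */
static int small_cow(void)
{
        return FAULT_OK;
}

static int handle_fault(void)
{
        int split = 0;
retry:
        if (!split) {
                if (huge_cow(&split) == FAULT_OOM)
                        goto retry;     /* pmd was split: retry at pte level */
                return FAULT_OK;
        }
        return small_cow();
}

int main(void)
{
        printf("fault result: %d\n", handle_fault());
        return 0;
}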
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fc898cb4fe8f..0d7e3ec8e0f3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -74,8 +74,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
74 res->end = start + size - 1; 74 res->end = start + size - 1;
75 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 75 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
76 if (request_resource(&iomem_resource, res) < 0) { 76 if (request_resource(&iomem_resource, res) < 0) {
77 printk("System RAM resource %llx - %llx cannot be added\n", 77 printk("System RAM resource %pR cannot be added\n", res);
78 (unsigned long long)res->start, (unsigned long long)res->end);
79 kfree(res); 78 kfree(res);
80 res = NULL; 79 res = NULL;
81 } 80 }
@@ -502,8 +501,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
502 online_pages_range); 501 online_pages_range);
503 if (ret) { 502 if (ret) {
504 mutex_unlock(&zonelists_mutex); 503 mutex_unlock(&zonelists_mutex);
505 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 504 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
506 nr_pages, pfn); 505 (unsigned long long) pfn << PAGE_SHIFT,
506 (((unsigned long long) pfn + nr_pages)
507 << PAGE_SHIFT) - 1);
507 memory_notify(MEM_CANCEL_ONLINE, &arg); 508 memory_notify(MEM_CANCEL_ONLINE, &arg);
508 unlock_memory_hotplug(); 509 unlock_memory_hotplug();
509 return ret; 510 return ret;
@@ -977,8 +978,9 @@ repeat:
977 return 0; 978 return 0;
978 979
979failed_removal: 980failed_removal:
980 printk(KERN_INFO "memory offlining %lx to %lx failed\n", 981 printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
981 start_pfn, end_pfn); 982 (unsigned long long) start_pfn << PAGE_SHIFT,
983 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
982 memory_notify(MEM_CANCEL_OFFLINE, &arg); 984 memory_notify(MEM_CANCEL_OFFLINE, &arg);
983 /* pushback to free area */ 985 /* pushback to free area */
984 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 986 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
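Both memory_hotplug.c messages above switch from printing raw pfn values to the kernel's usual "[mem %#010llx-%#010llx]" physical-address notation: the start is pfn << PAGE_SHIFT and the end is the last byte of the range, hence the trailing "- 1". A standalone program showing the same conversion; the 4KB PAGE_SHIFT is an assumption for the demo:

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4KB pages assumed */

static void print_range(unsigned long start_pfn, unsigned long nr_pages)
{
        unsigned long long start = (unsigned long long)start_pfn << PAGE_SHIFT;
        unsigned long long end = ((unsigned long long)(start_pfn + nr_pages)
                                  << PAGE_SHIFT) - 1;

        printf("[mem %#010llx-%#010llx]\n", start, end);
}

int main(void)
{
        /* pfn 0x100 for 256 pages -> [mem 0x00100000-0x001fffff] */
        print_range(0x100, 256);
        return 0;
}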
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 88f9422b92e7..f15c1b24ca18 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -390,7 +390,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
390{ 390{
391 if (!pol) 391 if (!pol)
392 return; 392 return;
393 if (!mpol_store_user_nodemask(pol) && step == 0 && 393 if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
394 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 394 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
395 return; 395 return;
396 396
@@ -950,8 +950,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
950 * 950 *
951 * Returns the number of page that could not be moved. 951 * Returns the number of page that could not be moved.
952 */ 952 */
953int do_migrate_pages(struct mm_struct *mm, 953int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
954 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 954 const nodemask_t *to, int flags)
955{ 955{
956 int busy = 0; 956 int busy = 0;
957 int err; 957 int err;
@@ -963,7 +963,7 @@ int do_migrate_pages(struct mm_struct *mm,
963 963
964 down_read(&mm->mmap_sem); 964 down_read(&mm->mmap_sem);
965 965
966 err = migrate_vmas(mm, from_nodes, to_nodes, flags); 966 err = migrate_vmas(mm, from, to, flags);
967 if (err) 967 if (err)
968 goto out; 968 goto out;
969 969
@@ -998,14 +998,34 @@ int do_migrate_pages(struct mm_struct *mm,
998 * moved to an empty node, then there is nothing left worth migrating. 998 * moved to an empty node, then there is nothing left worth migrating.
999 */ 999 */
1000 1000
1001 tmp = *from_nodes; 1001 tmp = *from;
1002 while (!nodes_empty(tmp)) { 1002 while (!nodes_empty(tmp)) {
1003 int s,d; 1003 int s,d;
1004 int source = -1; 1004 int source = -1;
1005 int dest = 0; 1005 int dest = 0;
1006 1006
1007 for_each_node_mask(s, tmp) { 1007 for_each_node_mask(s, tmp) {
1008 d = node_remap(s, *from_nodes, *to_nodes); 1008
1009 /*
1010 * do_migrate_pages() tries to maintain the relative
1011 * node relationship of the pages established between
1012 * threads and memory areas.
1013 *
1014 * However if the number of source nodes is not equal to
1015 * the number of destination nodes we can not preserve
1016 * this node relative relationship. In that case, skip
1017 * copying memory from a node that is in the destination
1018 * mask.
1019 *
1020 * Example: [2,3,4] -> [3,4,5] moves everything.
1021 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1022 */
1023
1024 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1025 (node_isset(s, *to)))
1026 continue;
1027
1028 d = node_remap(s, *from, *to);
1009 if (s == d) 1029 if (s == d)
1010 continue; 1030 continue;
1011 1031
@@ -1065,8 +1085,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
1065{ 1085{
1066} 1086}
1067 1087
1068int do_migrate_pages(struct mm_struct *mm, 1088int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1069 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 1089 const nodemask_t *to, int flags)
1070{ 1090{
1071 return -ENOSYS; 1091 return -ENOSYS;
1072} 1092}
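The new skip test in do_migrate_pages() above only fires when the source and destination masks have different weights; the comment's own example ([0-7] -> [3,4,5]) is easy to check with plain bit arithmetic. A hedged stand-alone version using unsigned long bitmasks in place of nodemask_t, with __builtin_popcountl() standing in for nodes_weight():

#include <stdio.h>

int main(void)
{
        unsigned long from = 0xffUL;                            /* nodes 0-7 */
        unsigned long to = (1UL << 3) | (1UL << 4) | (1UL << 5); /* 3, 4, 5  */
        int s;

        for (s = 0; s < 8; s++) {
                if (!(from & (1UL << s)))
                        continue;
                /* masks differ in weight and s is already a destination:
                 * skip it, exactly as the new check does */
                if (__builtin_popcountl(from) != __builtin_popcountl(to) &&
                    (to & (1UL << s)))
                        continue;
                printf("would migrate from node %d\n", s);
        }
        return 0;       /* prints nodes 0, 1, 2, 6 and 7 */
}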
diff --git a/mm/mmap.c b/mm/mmap.c
index e8dcfc7de866..4a9c2a391e28 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1639,33 +1639,34 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1639{ 1639{
1640 struct vm_area_struct *vma = NULL; 1640 struct vm_area_struct *vma = NULL;
1641 1641
1642 if (mm) { 1642 if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */
1643 /* Check the cache first. */ 1643 return NULL;
1644 /* (Cache hit rate is typically around 35%.) */ 1644
1645 vma = mm->mmap_cache; 1645 /* Check the cache first. */
1646 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { 1646 /* (Cache hit rate is typically around 35%.) */
1647 struct rb_node * rb_node; 1647 vma = mm->mmap_cache;
1648 1648 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
1649 rb_node = mm->mm_rb.rb_node; 1649 struct rb_node *rb_node;
1650 vma = NULL; 1650
1651 1651 rb_node = mm->mm_rb.rb_node;
1652 while (rb_node) { 1652 vma = NULL;
1653 struct vm_area_struct * vma_tmp; 1653
1654 1654 while (rb_node) {
1655 vma_tmp = rb_entry(rb_node, 1655 struct vm_area_struct *vma_tmp;
1656 struct vm_area_struct, vm_rb); 1656
1657 1657 vma_tmp = rb_entry(rb_node,
1658 if (vma_tmp->vm_end > addr) { 1658 struct vm_area_struct, vm_rb);
1659 vma = vma_tmp; 1659
1660 if (vma_tmp->vm_start <= addr) 1660 if (vma_tmp->vm_end > addr) {
1661 break; 1661 vma = vma_tmp;
1662 rb_node = rb_node->rb_left; 1662 if (vma_tmp->vm_start <= addr)
1663 } else 1663 break;
1664 rb_node = rb_node->rb_right; 1664 rb_node = rb_node->rb_left;
1665 } 1665 } else
1666 if (vma) 1666 rb_node = rb_node->rb_right;
1667 mm->mmap_cache = vma;
1668 } 1667 }
1668 if (vma)
1669 mm->mmap_cache = vma;
1669 } 1670 }
1670 return vma; 1671 return vma;
1671} 1672}
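The reindented find_vma() above still implements the same lookup: return the first VMA whose vm_end lies above the address, remember it in a one-slot cache, and leave it to the caller to check vm_start. A simplified model over a sorted array; the rbtree walk is replaced by a linear scan purely for brevity, and the names are illustrative:

#include <stdio.h>
#include <stddef.h>

struct vma { unsigned long start, end; };       /* [start, end) */

static const struct vma vmas[] = {
        { 0x1000, 0x2000 }, { 0x4000, 0x6000 }, { 0x9000, 0xa000 },
};
static const struct vma *cache;                 /* mmap_cache analogue */

static const struct vma *find_vma_demo(unsigned long addr)
{
        size_t i;

        if (cache && cache->end > addr && cache->start <= addr)
                return cache;                   /* cache hit */

        for (i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++) {
                if (vmas[i].end > addr) {       /* first vma ending above addr */
                        cache = &vmas[i];
                        return cache;           /* may still not contain addr */
                }
        }
        return NULL;
}

int main(void)
{
        const struct vma *v = find_vma_demo(0x3000);

        if (v)  /* returns [0x4000, 0x6000): the nearest vma above the gap */
                printf("found [%#lx, %#lx)\n", v->start, v->end);
        return 0;
}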
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 1983fb1c7026..d23415c001bc 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -274,86 +274,85 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
274 return ___alloc_bootmem(size, align, goal, limit); 274 return ___alloc_bootmem(size, align, goal, limit);
275} 275}
276 276
277/** 277static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
278 * __alloc_bootmem_node - allocate boot memory from a specific node 278 unsigned long size,
279 * @pgdat: node to allocate from 279 unsigned long align,
280 * @size: size of the request in bytes 280 unsigned long goal,
281 * @align: alignment of the region 281 unsigned long limit)
282 * @goal: preferred starting address of the region
283 *
284 * The goal is dropped if it can not be satisfied and the allocation will
285 * fall back to memory below @goal.
286 *
287 * Allocation may fall back to any node in the system if the specified node
288 * can not hold the requested memory.
289 *
290 * The function panics if the request can not be satisfied.
291 */
292void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
293 unsigned long align, unsigned long goal)
294{ 282{
295 void *ptr; 283 void *ptr;
296 284
297 if (WARN_ON_ONCE(slab_is_available()))
298 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
299
300again: 285again:
301 ptr = __alloc_memory_core_early(pgdat->node_id, size, align, 286 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
302 goal, -1ULL); 287 goal, limit);
303 if (ptr) 288 if (ptr)
304 return ptr; 289 return ptr;
305 290
306 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, 291 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
307 goal, -1ULL); 292 goal, limit);
308 if (!ptr && goal) { 293 if (ptr)
294 return ptr;
295
296 if (goal) {
309 goal = 0; 297 goal = 0;
310 goto again; 298 goto again;
311 } 299 }
312 return ptr; 300
301 return NULL;
313} 302}
314 303
315void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 304void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
316 unsigned long align, unsigned long goal) 305 unsigned long align, unsigned long goal)
317{ 306{
318 return __alloc_bootmem_node(pgdat, size, align, goal); 307 if (WARN_ON_ONCE(slab_is_available()))
308 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
309
310 return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
319} 311}
320 312
321#ifdef CONFIG_SPARSEMEM 313void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
322/** 314 unsigned long align, unsigned long goal,
323 * alloc_bootmem_section - allocate boot memory from a specific section 315 unsigned long limit)
324 * @size: size of the request in bytes
325 * @section_nr: sparse map section to allocate from
326 *
327 * Return NULL on failure.
328 */
329void * __init alloc_bootmem_section(unsigned long size,
330 unsigned long section_nr)
331{ 316{
332 unsigned long pfn, goal, limit; 317 void *ptr;
333 318
334 pfn = section_nr_to_pfn(section_nr); 319 ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
335 goal = pfn << PAGE_SHIFT; 320 if (ptr)
336 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; 321 return ptr;
337 322
338 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, 323 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
339 SMP_CACHE_BYTES, goal, limit); 324 panic("Out of memory");
325 return NULL;
340} 326}
341#endif
342 327
343void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, 328/**
329 * __alloc_bootmem_node - allocate boot memory from a specific node
330 * @pgdat: node to allocate from
331 * @size: size of the request in bytes
332 * @align: alignment of the region
333 * @goal: preferred starting address of the region
334 *
335 * The goal is dropped if it can not be satisfied and the allocation will
336 * fall back to memory below @goal.
337 *
338 * Allocation may fall back to any node in the system if the specified node
339 * can not hold the requested memory.
340 *
341 * The function panics if the request can not be satisfied.
342 */
343void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
344 unsigned long align, unsigned long goal) 344 unsigned long align, unsigned long goal)
345{ 345{
346 void *ptr;
347
348 if (WARN_ON_ONCE(slab_is_available())) 346 if (WARN_ON_ONCE(slab_is_available()))
349 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 347 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
350 348
351 ptr = __alloc_memory_core_early(pgdat->node_id, size, align, 349 return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
352 goal, -1ULL); 350}
353 if (ptr)
354 return ptr;
355 351
356 return __alloc_bootmem_nopanic(size, align, goal); 352void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
353 unsigned long align, unsigned long goal)
354{
355 return __alloc_bootmem_node(pgdat, size, align, goal);
357} 356}
358 357
359#ifndef ARCH_LOW_ADDRESS_LIMIT 358#ifndef ARCH_LOW_ADDRESS_LIMIT
@@ -397,16 +396,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
397void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 396void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
398 unsigned long align, unsigned long goal) 397 unsigned long align, unsigned long goal)
399{ 398{
400 void *ptr;
401
402 if (WARN_ON_ONCE(slab_is_available())) 399 if (WARN_ON_ONCE(slab_is_available()))
403 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 400 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
404 401
405 ptr = __alloc_memory_core_early(pgdat->node_id, size, align, 402 return ___alloc_bootmem_node(pgdat, size, align, goal,
406 goal, ARCH_LOW_ADDRESS_LIMIT); 403 ARCH_LOW_ADDRESS_LIMIT);
407 if (ptr)
408 return ptr;
409
410 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
411 goal, ARCH_LOW_ADDRESS_LIMIT);
412} 404}
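The nobootmem.c rework above folds the duplicated allocation paths into one helper with a fixed fallback order: preferred node at the requested goal, then any node, then both again with the goal dropped; only the panicking wrapper aborts when everything fails. A compact model of that ordering, where try_alloc() is a made-up stand-in for __alloc_memory_core_early():

#include <stdio.h>
#include <stddef.h>

#define ANY_NODE        -1

/* Stand-in allocator: pretend nothing satisfies the goal anywhere, and
 * memory only turns up once the goal is dropped and any node is allowed,
 * so the demo walks through every fallback step. */
static void *try_alloc(int nid, unsigned long goal)
{
        static char pool[64];

        if (nid == ANY_NODE && goal == 0)
                return pool;
        return NULL;
}

static void *alloc_nopanic(int nid, unsigned long goal)
{
        void *p;
again:
        p = try_alloc(nid, goal);               /* preferred node first */
        if (p)
                return p;
        p = try_alloc(ANY_NODE, goal);          /* then any node */
        if (p)
                return p;
        if (goal) {                             /* finally drop the goal */
                goal = 0;
                goto again;
        }
        return NULL;
}

int main(void)
{
        printf("got %p\n", alloc_nopanic(0, 0x1000000));
        return 0;
}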
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9f09a1fde9f9..ed0e19677360 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -180,10 +180,10 @@ static bool oom_unkillable_task(struct task_struct *p,
180 * predictable as possible. The goal is to return the highest value for the 180 * predictable as possible. The goal is to return the highest value for the
181 * task consuming the most memory to avoid subsequent oom failures. 181 * task consuming the most memory to avoid subsequent oom failures.
182 */ 182 */
183unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, 183unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
184 const nodemask_t *nodemask, unsigned long totalpages) 184 const nodemask_t *nodemask, unsigned long totalpages)
185{ 185{
186 long points; 186 unsigned long points;
187 187
188 if (oom_unkillable_task(p, memcg, nodemask)) 188 if (oom_unkillable_task(p, memcg, nodemask))
189 return 0; 189 return 0;
@@ -198,21 +198,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
198 } 198 }
199 199
200 /* 200 /*
201 * The memory controller may have a limit of 0 bytes, so avoid a divide
202 * by zero, if necessary.
203 */
204 if (!totalpages)
205 totalpages = 1;
206
207 /*
208 * The baseline for the badness score is the proportion of RAM that each 201 * The baseline for the badness score is the proportion of RAM that each
209 * task's rss, pagetable and swap space use. 202 * task's rss, pagetable and swap space use.
210 */ 203 */
211 points = get_mm_rss(p->mm) + p->mm->nr_ptes; 204 points = get_mm_rss(p->mm) + p->mm->nr_ptes +
212 points += get_mm_counter(p->mm, MM_SWAPENTS); 205 get_mm_counter(p->mm, MM_SWAPENTS);
213
214 points *= 1000;
215 points /= totalpages;
216 task_unlock(p); 206 task_unlock(p);
217 207
218 /* 208 /*
@@ -220,23 +210,20 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
220 * implementation used by LSMs. 210 * implementation used by LSMs.
221 */ 211 */
222 if (has_capability_noaudit(p, CAP_SYS_ADMIN)) 212 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
223 points -= 30; 213 points -= 30 * totalpages / 1000;
224 214
225 /* 215 /*
226 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may 216 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
227 * either completely disable oom killing or always prefer a certain 217 * either completely disable oom killing or always prefer a certain
228 * task. 218 * task.
229 */ 219 */
230 points += p->signal->oom_score_adj; 220 points += p->signal->oom_score_adj * totalpages / 1000;
231 221
232 /* 222 /*
233 * Never return 0 for an eligible task that may be killed since it's 223 * Never return 0 for an eligible task regardless of the root bonus and
234 * possible that no single user task uses more than 0.1% of memory and 224 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
235 * no single admin tasks uses more than 3.0%.
236 */ 225 */
237 if (points <= 0) 226 return points ? points : 1;
238 return 1;
239 return (points < 1000) ? points : 1000;
240} 227}
241 228
242/* 229/*
@@ -314,7 +301,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
314{ 301{
315 struct task_struct *g, *p; 302 struct task_struct *g, *p;
316 struct task_struct *chosen = NULL; 303 struct task_struct *chosen = NULL;
317 *ppoints = 0; 304 unsigned long chosen_points = 0;
318 305
319 do_each_thread(g, p) { 306 do_each_thread(g, p) {
320 unsigned int points; 307 unsigned int points;
@@ -354,7 +341,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
354 */ 341 */
355 if (p == current) { 342 if (p == current) {
356 chosen = p; 343 chosen = p;
357 *ppoints = 1000; 344 chosen_points = ULONG_MAX;
358 } else if (!force_kill) { 345 } else if (!force_kill) {
359 /* 346 /*
360 * If this task is not being ptraced on exit, 347 * If this task is not being ptraced on exit,
@@ -367,12 +354,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
367 } 354 }
368 355
369 points = oom_badness(p, memcg, nodemask, totalpages); 356 points = oom_badness(p, memcg, nodemask, totalpages);
370 if (points > *ppoints) { 357 if (points > chosen_points) {
371 chosen = p; 358 chosen = p;
372 *ppoints = points; 359 chosen_points = points;
373 } 360 }
374 } while_each_thread(g, p); 361 } while_each_thread(g, p);
375 362
363 *ppoints = chosen_points * 1000 / totalpages;
376 return chosen; 364 return chosen;
377} 365}
378 366
@@ -572,7 +560,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
572 } 560 }
573 561
574 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 562 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
575 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; 563 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
576 read_lock(&tasklist_lock); 564 read_lock(&tasklist_lock);
577 p = select_bad_process(&points, limit, memcg, NULL, false); 565 p = select_bad_process(&points, limit, memcg, NULL, false);
578 if (p && PTR_ERR(p) != -1UL) 566 if (p && PTR_ERR(p) != -1UL)
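After the oom_kill.c change above, oom_badness() works directly in pages instead of a 0-1000 scale: the score starts as rss + page tables + swap entries, CAP_SYS_ADMIN tasks get a discount of 3% of totalpages, oom_score_adj contributes oom_score_adj * totalpages / 1000, and only select_bad_process() normalizes the winner back to 0-1000 for reporting. The arithmetic, worked for a hypothetical 4GB machine (totalpages = 1048576 with 4KB pages; all task numbers invented):

#include <stdio.h>

int main(void)
{
        unsigned long totalpages = 1048576;     /* 4GB of 4KB pages, assumed */
        unsigned long rss = 200000, nr_ptes = 500, swapents = 30000;
        long oom_score_adj = 100;               /* mildly preferred victim */
        int has_cap_sys_admin = 1;
        unsigned long points;

        points = rss + nr_ptes + swapents;
        if (has_cap_sys_admin)
                points -= 30 * totalpages / 1000;       /* 3% of RAM bonus */
        points += oom_score_adj * totalpages / 1000;

        /* the scale userspace is used to, restored at selection time */
        printf("raw points: %lu, normalized: %lu / 1000\n",
               points, points * 1000 / totalpages);
        return 0;
}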
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bab8e3bc4202..8cbfc38e68ac 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -219,7 +219,7 @@ EXPORT_SYMBOL(nr_online_nodes);
219 219
220int page_group_by_mobility_disabled __read_mostly; 220int page_group_by_mobility_disabled __read_mostly;
221 221
222static void set_pageblock_migratetype(struct page *page, int migratetype) 222void set_pageblock_migratetype(struct page *page, int migratetype)
223{ 223{
224 224
225 if (unlikely(page_group_by_mobility_disabled)) 225 if (unlikely(page_group_by_mobility_disabled))
@@ -954,8 +954,8 @@ static int move_freepages(struct zone *zone,
954 return pages_moved; 954 return pages_moved;
955} 955}
956 956
957static int move_freepages_block(struct zone *zone, struct page *page, 957int move_freepages_block(struct zone *zone, struct page *page,
958 int migratetype) 958 int migratetype)
959{ 959{
960 unsigned long start_pfn, end_pfn; 960 unsigned long start_pfn, end_pfn;
961 struct page *start_page, *end_page; 961 struct page *start_page, *end_page;
@@ -4300,25 +4300,24 @@ static inline void setup_usemap(struct pglist_data *pgdat,
4300 4300
4301#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4301#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4302 4302
4303/* Return a sensible default order for the pageblock size. */
4304static inline int pageblock_default_order(void)
4305{
4306 if (HPAGE_SHIFT > PAGE_SHIFT)
4307 return HUGETLB_PAGE_ORDER;
4308
4309 return MAX_ORDER-1;
4310}
4311
4312/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4303/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4313static inline void __init set_pageblock_order(unsigned int order) 4304static inline void __init set_pageblock_order(void)
4314{ 4305{
4306 unsigned int order;
4307
4315 /* Check that pageblock_nr_pages has not already been setup */ 4308 /* Check that pageblock_nr_pages has not already been setup */
4316 if (pageblock_order) 4309 if (pageblock_order)
4317 return; 4310 return;
4318 4311
4312 if (HPAGE_SHIFT > PAGE_SHIFT)
4313 order = HUGETLB_PAGE_ORDER;
4314 else
4315 order = MAX_ORDER - 1;
4316
4319 /* 4317 /*
4320 * Assume the largest contiguous order of interest is a huge page. 4318 * Assume the largest contiguous order of interest is a huge page.
4321 * This value may be variable depending on boot parameters on IA64 4319 * This value may be variable depending on boot parameters on IA64 and
4320 * powerpc.
4322 */ 4321 */
4323 pageblock_order = order; 4322 pageblock_order = order;
4324} 4323}
@@ -4326,15 +4325,13 @@ static inline void __init set_pageblock_order(unsigned int order)
4326 4325
4327/* 4326/*
4328 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4327 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4329 * and pageblock_default_order() are unused as pageblock_order is set 4328 * is unused as pageblock_order is set at compile-time. See
4330 * at compile-time. See include/linux/pageblock-flags.h for the values of 4329 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4331 * pageblock_order based on the kernel config 4330 * the kernel config
4332 */ 4331 */
4333static inline int pageblock_default_order(unsigned int order) 4332static inline void set_pageblock_order(void)
4334{ 4333{
4335 return MAX_ORDER-1;
4336} 4334}
4337#define set_pageblock_order(x) do {} while (0)
4338 4335
4339#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4336#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4340 4337
@@ -4413,16 +4410,16 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4413 zone_pcp_init(zone); 4410 zone_pcp_init(zone);
4414 for_each_lru(lru) 4411 for_each_lru(lru)
4415 INIT_LIST_HEAD(&zone->lruvec.lists[lru]); 4412 INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
4416 zone->reclaim_stat.recent_rotated[0] = 0; 4413 zone->lruvec.reclaim_stat.recent_rotated[0] = 0;
4417 zone->reclaim_stat.recent_rotated[1] = 0; 4414 zone->lruvec.reclaim_stat.recent_rotated[1] = 0;
4418 zone->reclaim_stat.recent_scanned[0] = 0; 4415 zone->lruvec.reclaim_stat.recent_scanned[0] = 0;
4419 zone->reclaim_stat.recent_scanned[1] = 0; 4416 zone->lruvec.reclaim_stat.recent_scanned[1] = 0;
4420 zap_zone_vm_stats(zone); 4417 zap_zone_vm_stats(zone);
4421 zone->flags = 0; 4418 zone->flags = 0;
4422 if (!size) 4419 if (!size)
4423 continue; 4420 continue;
4424 4421
4425 set_pageblock_order(pageblock_default_order()); 4422 set_pageblock_order();
4426 setup_usemap(pgdat, zone, size); 4423 setup_usemap(pgdat, zone, size);
4427 ret = init_currently_empty_zone(zone, zone_start_pfn, 4424 ret = init_currently_empty_zone(zone, zone_start_pfn,
4428 size, MEMMAP_EARLY); 4425 size, MEMMAP_EARLY);
@@ -4815,7 +4812,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4815 find_zone_movable_pfns_for_nodes(); 4812 find_zone_movable_pfns_for_nodes();
4816 4813
4817 /* Print out the zone ranges */ 4814 /* Print out the zone ranges */
4818 printk("Zone PFN ranges:\n"); 4815 printk("Zone ranges:\n");
4819 for (i = 0; i < MAX_NR_ZONES; i++) { 4816 for (i = 0; i < MAX_NR_ZONES; i++) {
4820 if (i == ZONE_MOVABLE) 4817 if (i == ZONE_MOVABLE)
4821 continue; 4818 continue;
@@ -4824,22 +4821,25 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4824 arch_zone_highest_possible_pfn[i]) 4821 arch_zone_highest_possible_pfn[i])
4825 printk(KERN_CONT "empty\n"); 4822 printk(KERN_CONT "empty\n");
4826 else 4823 else
4827 printk(KERN_CONT "%0#10lx -> %0#10lx\n", 4824 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
4828 arch_zone_lowest_possible_pfn[i], 4825 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
4829 arch_zone_highest_possible_pfn[i]); 4826 (arch_zone_highest_possible_pfn[i]
4827 << PAGE_SHIFT) - 1);
4830 } 4828 }
4831 4829
4832 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 4830 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
4833 printk("Movable zone start PFN for each node\n"); 4831 printk("Movable zone start for each node\n");
4834 for (i = 0; i < MAX_NUMNODES; i++) { 4832 for (i = 0; i < MAX_NUMNODES; i++) {
4835 if (zone_movable_pfn[i]) 4833 if (zone_movable_pfn[i])
4836 printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); 4834 printk(" Node %d: %#010lx\n", i,
4835 zone_movable_pfn[i] << PAGE_SHIFT);
4837 } 4836 }
4838 4837
4839 /* Print out the early_node_map[] */ 4838 /* Print out the early_node_map[] */
4840 printk("Early memory PFN ranges\n"); 4839 printk("Early memory node ranges\n");
4841 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4840 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4842 printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); 4841 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
4842 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
4843 4843
4844 /* Initialise every node */ 4844 /* Initialise every node */
4845 mminit_verify_pageflags_layout(); 4845 mminit_verify_pageflags_layout();
@@ -5657,7 +5657,7 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
5657 .nr_migratepages = 0, 5657 .nr_migratepages = 0,
5658 .order = -1, 5658 .order = -1,
5659 .zone = page_zone(pfn_to_page(start)), 5659 .zone = page_zone(pfn_to_page(start)),
5660 .sync = true, 5660 .mode = COMPACT_SYNC,
5661 }; 5661 };
5662 INIT_LIST_HEAD(&cc.migratepages); 5662 INIT_LIST_HEAD(&cc.migratepages);
5663 5663
@@ -5938,7 +5938,7 @@ bool is_free_buddy_page(struct page *page)
5938} 5938}
5939#endif 5939#endif
5940 5940
5941static struct trace_print_flags pageflag_names[] = { 5941static const struct trace_print_flags pageflag_names[] = {
5942 {1UL << PG_locked, "locked" }, 5942 {1UL << PG_locked, "locked" },
5943 {1UL << PG_error, "error" }, 5943 {1UL << PG_error, "error" },
5944 {1UL << PG_referenced, "referenced" }, 5944 {1UL << PG_referenced, "referenced" },
@@ -5973,7 +5973,9 @@ static struct trace_print_flags pageflag_names[] = {
5973#ifdef CONFIG_MEMORY_FAILURE 5973#ifdef CONFIG_MEMORY_FAILURE
5974 {1UL << PG_hwpoison, "hwpoison" }, 5974 {1UL << PG_hwpoison, "hwpoison" },
5975#endif 5975#endif
5976 {-1UL, NULL }, 5976#ifdef CONFIG_TRANSPARENT_HUGEPAGE
5977 {1UL << PG_compound_lock, "compound_lock" },
5978#endif
5977}; 5979};
5978 5980
5979static void dump_page_flags(unsigned long flags) 5981static void dump_page_flags(unsigned long flags)
@@ -5982,12 +5984,14 @@ static void dump_page_flags(unsigned long flags)
5982 unsigned long mask; 5984 unsigned long mask;
5983 int i; 5985 int i;
5984 5986
5987 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
5988
5985 printk(KERN_ALERT "page flags: %#lx(", flags); 5989 printk(KERN_ALERT "page flags: %#lx(", flags);
5986 5990
5987 /* remove zone id */ 5991 /* remove zone id */
5988 flags &= (1UL << NR_PAGEFLAGS) - 1; 5992 flags &= (1UL << NR_PAGEFLAGS) - 1;
5989 5993
5990 for (i = 0; pageflag_names[i].name && flags; i++) { 5994 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
5991 5995
5992 mask = pageflag_names[i].mask; 5996 mask = pageflag_names[i].mask;
5993 if ((flags & mask) != mask) 5997 if ((flags & mask) != mask)
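Dropping the {-1UL, NULL} terminator from pageflag_names[] above works because the dump loop now bounds itself with ARRAY_SIZE(), and the new BUILD_BUG_ON() ties the table length to __NR_PAGEFLAGS at compile time. The same pattern in plain C11, with _Static_assert playing the role of BUILD_BUG_ON and an invented three-flag enum:

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

enum { PG_locked, PG_error, PG_referenced, NR_FLAGS };

static const struct { unsigned long mask; const char *name; } flag_names[] = {
        { 1UL << PG_locked,     "locked"     },
        { 1UL << PG_error,      "error"      },
        { 1UL << PG_referenced, "referenced" },
        /* no terminator: the size is known at compile time */
};

/* Fails the build if a flag is added without a matching name. */
_Static_assert(ARRAY_SIZE(flag_names) == NR_FLAGS, "flag table out of sync");

int main(void)
{
        unsigned long flags = (1UL << PG_locked) | (1UL << PG_referenced);
        size_t i;

        for (i = 0; i < ARRAY_SIZE(flag_names) && flags; i++) {
                if ((flags & flag_names[i].mask) != flag_names[i].mask)
                        continue;
                flags &= ~flag_names[i].mask;
                printf("%s ", flag_names[i].name);
        }
        printf("\n");
        return 0;
}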
diff --git a/mm/readahead.c b/mm/readahead.c
index cbcbb02f3e28..ea8f8fa21649 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,6 +17,8 @@
17#include <linux/task_io_accounting_ops.h> 17#include <linux/task_io_accounting_ops.h>
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/syscalls.h>
21#include <linux/file.h>
20 22
21/* 23/*
22 * Initialise a struct file's readahead state. Assumes that the caller has 24 * Initialise a struct file's readahead state. Assumes that the caller has
@@ -562,3 +564,41 @@ page_cache_async_readahead(struct address_space *mapping,
562 ondemand_readahead(mapping, ra, filp, true, offset, req_size); 564 ondemand_readahead(mapping, ra, filp, true, offset, req_size);
563} 565}
564EXPORT_SYMBOL_GPL(page_cache_async_readahead); 566EXPORT_SYMBOL_GPL(page_cache_async_readahead);
567
568static ssize_t
569do_readahead(struct address_space *mapping, struct file *filp,
570 pgoff_t index, unsigned long nr)
571{
572 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
573 return -EINVAL;
574
575 force_page_cache_readahead(mapping, filp, index, nr);
576 return 0;
577}
578
579SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
580{
581 ssize_t ret;
582 struct file *file;
583
584 ret = -EBADF;
585 file = fget(fd);
586 if (file) {
587 if (file->f_mode & FMODE_READ) {
588 struct address_space *mapping = file->f_mapping;
589 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
590 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
591 unsigned long len = end - start + 1;
592 ret = do_readahead(mapping, file, start, len);
593 }
594 fput(file);
595 }
596 return ret;
597}
598#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
599asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
600{
601 return SYSC_readahead((int) fd, offset, (size_t) count);
602}
603SYSCALL_ALIAS(sys_readahead, SyS_readahead);
604#endif
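With the hunk above, the readahead(2) syscall definition lives in mm/readahead.c; nothing changes for userspace, which keeps calling it through the glibc wrapper. A minimal caller (the wrapper needs _GNU_SOURCE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        int fd;

        if (argc < 2) {
                fprintf(stderr, "usage: %s <file>\n", argv[0]);
                return 1;
        }
        fd = open(argv[1], O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* ask the kernel to populate the page cache for the first 1MB */
        if (readahead(fd, 0, 1 << 20) != 0)
                perror("readahead");
        close(fd);
        return 0;
}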
diff --git a/mm/rmap.c b/mm/rmap.c
index 5b5ad584ffb7..0f3b7cda2a24 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -755,12 +755,6 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
755 pte_unmap_unlock(pte, ptl); 755 pte_unmap_unlock(pte, ptl);
756 } 756 }
757 757
758 /* Pretend the page is referenced if the task has the
759 swap token and is in the middle of a page fault. */
760 if (mm != current->mm && has_swap_token(mm) &&
761 rwsem_is_locked(&mm->mmap_sem))
762 referenced++;
763
764 (*mapcount)--; 758 (*mapcount)--;
765 759
766 if (referenced) 760 if (referenced)
diff --git a/mm/shmem.c b/mm/shmem.c
index be5af34a070d..d576b84d913c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt;
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/pagevec.h> 54#include <linux/pagevec.h>
55#include <linux/percpu_counter.h> 55#include <linux/percpu_counter.h>
56#include <linux/falloc.h>
56#include <linux/splice.h> 57#include <linux/splice.h>
57#include <linux/security.h> 58#include <linux/security.h>
58#include <linux/swapops.h> 59#include <linux/swapops.h>
@@ -83,12 +84,25 @@ struct shmem_xattr {
83 char value[0]; 84 char value[0];
84}; 85};
85 86
87/*
88 * shmem_fallocate and shmem_writepage communicate via inode->i_private
89 * (with i_mutex making sure that it has only one user at a time):
90 * we would prefer not to enlarge the shmem inode just for that.
91 */
92struct shmem_falloc {
93 pgoff_t start; /* start of range currently being fallocated */
94 pgoff_t next; /* the next page offset to be fallocated */
95 pgoff_t nr_falloced; /* how many new pages have been fallocated */
96 pgoff_t nr_unswapped; /* how often writepage refused to swap out */
97};
98
86/* Flag allocation requirements to shmem_getpage */ 99/* Flag allocation requirements to shmem_getpage */
87enum sgp_type { 100enum sgp_type {
88 SGP_READ, /* don't exceed i_size, don't allocate page */ 101 SGP_READ, /* don't exceed i_size, don't allocate page */
89 SGP_CACHE, /* don't exceed i_size, may allocate page */ 102 SGP_CACHE, /* don't exceed i_size, may allocate page */
90 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ 103 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
91 SGP_WRITE, /* may exceed i_size, may allocate page */ 104 SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
105 SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
92}; 106};
93 107
94#ifdef CONFIG_TMPFS 108#ifdef CONFIG_TMPFS
@@ -103,6 +117,9 @@ static unsigned long shmem_default_max_inodes(void)
103} 117}
104#endif 118#endif
105 119
120static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
121static int shmem_replace_page(struct page **pagep, gfp_t gfp,
122 struct shmem_inode_info *info, pgoff_t index);
106static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 123static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
107 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); 124 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
108 125
@@ -423,27 +440,31 @@ void shmem_unlock_mapping(struct address_space *mapping)
423 440
424/* 441/*
425 * Remove range of pages and swap entries from radix tree, and free them. 442 * Remove range of pages and swap entries from radix tree, and free them.
443 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
426 */ 444 */
427void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 445static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
446 bool unfalloc)
428{ 447{
429 struct address_space *mapping = inode->i_mapping; 448 struct address_space *mapping = inode->i_mapping;
430 struct shmem_inode_info *info = SHMEM_I(inode); 449 struct shmem_inode_info *info = SHMEM_I(inode);
431 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 450 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
432 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 451 pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
433 pgoff_t end = (lend >> PAGE_CACHE_SHIFT); 452 unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
453 unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
434 struct pagevec pvec; 454 struct pagevec pvec;
435 pgoff_t indices[PAGEVEC_SIZE]; 455 pgoff_t indices[PAGEVEC_SIZE];
436 long nr_swaps_freed = 0; 456 long nr_swaps_freed = 0;
437 pgoff_t index; 457 pgoff_t index;
438 int i; 458 int i;
439 459
440 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); 460 if (lend == -1)
461 end = -1; /* unsigned, so actually very big */
441 462
442 pagevec_init(&pvec, 0); 463 pagevec_init(&pvec, 0);
443 index = start; 464 index = start;
444 while (index <= end) { 465 while (index < end) {
445 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 466 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
446 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 467 min(end - index, (pgoff_t)PAGEVEC_SIZE),
447 pvec.pages, indices); 468 pvec.pages, indices);
448 if (!pvec.nr) 469 if (!pvec.nr)
449 break; 470 break;
@@ -452,10 +473,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
452 struct page *page = pvec.pages[i]; 473 struct page *page = pvec.pages[i];
453 474
454 index = indices[i]; 475 index = indices[i];
455 if (index > end) 476 if (index >= end)
456 break; 477 break;
457 478
458 if (radix_tree_exceptional_entry(page)) { 479 if (radix_tree_exceptional_entry(page)) {
480 if (unfalloc)
481 continue;
459 nr_swaps_freed += !shmem_free_swap(mapping, 482 nr_swaps_freed += !shmem_free_swap(mapping,
460 index, page); 483 index, page);
461 continue; 484 continue;
@@ -463,9 +486,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
463 486
464 if (!trylock_page(page)) 487 if (!trylock_page(page))
465 continue; 488 continue;
466 if (page->mapping == mapping) { 489 if (!unfalloc || !PageUptodate(page)) {
467 VM_BUG_ON(PageWriteback(page)); 490 if (page->mapping == mapping) {
468 truncate_inode_page(mapping, page); 491 VM_BUG_ON(PageWriteback(page));
492 truncate_inode_page(mapping, page);
493 }
469 } 494 }
470 unlock_page(page); 495 unlock_page(page);
471 } 496 }
@@ -476,30 +501,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
476 index++; 501 index++;
477 } 502 }
478 503
479 if (partial) { 504 if (partial_start) {
480 struct page *page = NULL; 505 struct page *page = NULL;
481 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); 506 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
482 if (page) { 507 if (page) {
483 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 508 unsigned int top = PAGE_CACHE_SIZE;
509 if (start > end) {
510 top = partial_end;
511 partial_end = 0;
512 }
513 zero_user_segment(page, partial_start, top);
514 set_page_dirty(page);
515 unlock_page(page);
516 page_cache_release(page);
517 }
518 }
519 if (partial_end) {
520 struct page *page = NULL;
521 shmem_getpage(inode, end, &page, SGP_READ, NULL);
522 if (page) {
523 zero_user_segment(page, 0, partial_end);
484 set_page_dirty(page); 524 set_page_dirty(page);
485 unlock_page(page); 525 unlock_page(page);
486 page_cache_release(page); 526 page_cache_release(page);
487 } 527 }
488 } 528 }
529 if (start >= end)
530 return;
489 531
490 index = start; 532 index = start;
491 for ( ; ; ) { 533 for ( ; ; ) {
492 cond_resched(); 534 cond_resched();
493 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 535 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
494 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 536 min(end - index, (pgoff_t)PAGEVEC_SIZE),
495 pvec.pages, indices); 537 pvec.pages, indices);
496 if (!pvec.nr) { 538 if (!pvec.nr) {
497 if (index == start) 539 if (index == start || unfalloc)
498 break; 540 break;
499 index = start; 541 index = start;
500 continue; 542 continue;
501 } 543 }
502 if (index == start && indices[0] > end) { 544 if ((index == start || unfalloc) && indices[0] >= end) {
503 shmem_deswap_pagevec(&pvec); 545 shmem_deswap_pagevec(&pvec);
504 pagevec_release(&pvec); 546 pagevec_release(&pvec);
505 break; 547 break;
@@ -509,19 +551,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
509 struct page *page = pvec.pages[i]; 551 struct page *page = pvec.pages[i];
510 552
511 index = indices[i]; 553 index = indices[i];
512 if (index > end) 554 if (index >= end)
513 break; 555 break;
514 556
515 if (radix_tree_exceptional_entry(page)) { 557 if (radix_tree_exceptional_entry(page)) {
558 if (unfalloc)
559 continue;
516 nr_swaps_freed += !shmem_free_swap(mapping, 560 nr_swaps_freed += !shmem_free_swap(mapping,
517 index, page); 561 index, page);
518 continue; 562 continue;
519 } 563 }
520 564
521 lock_page(page); 565 lock_page(page);
522 if (page->mapping == mapping) { 566 if (!unfalloc || !PageUptodate(page)) {
523 VM_BUG_ON(PageWriteback(page)); 567 if (page->mapping == mapping) {
524 truncate_inode_page(mapping, page); 568 VM_BUG_ON(PageWriteback(page));
569 truncate_inode_page(mapping, page);
570 }
525 } 571 }
526 unlock_page(page); 572 unlock_page(page);
527 } 573 }
@@ -535,7 +581,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
535 info->swapped -= nr_swaps_freed; 581 info->swapped -= nr_swaps_freed;
536 shmem_recalc_inode(inode); 582 shmem_recalc_inode(inode);
537 spin_unlock(&info->lock); 583 spin_unlock(&info->lock);
584}
538 585
586void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
587{
588 shmem_undo_range(inode, lstart, lend, false);
539 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 589 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
540} 590}
541EXPORT_SYMBOL_GPL(shmem_truncate_range); 591EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -604,12 +654,13 @@ static void shmem_evict_inode(struct inode *inode)
604 * If swap found in inode, free it and move page from swapcache to filecache. 654 * If swap found in inode, free it and move page from swapcache to filecache.
605 */ 655 */
606static int shmem_unuse_inode(struct shmem_inode_info *info, 656static int shmem_unuse_inode(struct shmem_inode_info *info,
607 swp_entry_t swap, struct page *page) 657 swp_entry_t swap, struct page **pagep)
608{ 658{
609 struct address_space *mapping = info->vfs_inode.i_mapping; 659 struct address_space *mapping = info->vfs_inode.i_mapping;
610 void *radswap; 660 void *radswap;
611 pgoff_t index; 661 pgoff_t index;
612 int error; 662 gfp_t gfp;
663 int error = 0;
613 664
614 radswap = swp_to_radix_entry(swap); 665 radswap = swp_to_radix_entry(swap);
615 index = radix_tree_locate_item(&mapping->page_tree, radswap); 666 index = radix_tree_locate_item(&mapping->page_tree, radswap);
@@ -625,22 +676,37 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
625 if (shmem_swaplist.next != &info->swaplist) 676 if (shmem_swaplist.next != &info->swaplist)
626 list_move_tail(&shmem_swaplist, &info->swaplist); 677 list_move_tail(&shmem_swaplist, &info->swaplist);
627 678
679 gfp = mapping_gfp_mask(mapping);
680 if (shmem_should_replace_page(*pagep, gfp)) {
681 mutex_unlock(&shmem_swaplist_mutex);
682 error = shmem_replace_page(pagep, gfp, info, index);
683 mutex_lock(&shmem_swaplist_mutex);
684 /*
685 * We needed to drop mutex to make that restrictive page
686 * allocation; but the inode might already be freed by now,
687 * and we cannot refer to inode or mapping or info to check.
688 * However, we do hold page lock on the PageSwapCache page,
689 * so can check if that still has our reference remaining.
690 */
691 if (!page_swapcount(*pagep))
692 error = -ENOENT;
693 }
694
628 /* 695 /*
629 * We rely on shmem_swaplist_mutex, not only to protect the swaplist, 696 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
630 * but also to hold up shmem_evict_inode(): so inode cannot be freed 697 * but also to hold up shmem_evict_inode(): so inode cannot be freed
631 * beneath us (pagelock doesn't help until the page is in pagecache). 698 * beneath us (pagelock doesn't help until the page is in pagecache).
632 */ 699 */
633 error = shmem_add_to_page_cache(page, mapping, index, 700 if (!error)
701 error = shmem_add_to_page_cache(*pagep, mapping, index,
634 GFP_NOWAIT, radswap); 702 GFP_NOWAIT, radswap);
635 /* which does mem_cgroup_uncharge_cache_page on error */
636
637 if (error != -ENOMEM) { 703 if (error != -ENOMEM) {
638 /* 704 /*
639 * Truncation and eviction use free_swap_and_cache(), which 705 * Truncation and eviction use free_swap_and_cache(), which
640 * only does trylock page: if we raced, best clean up here. 706 * only does trylock page: if we raced, best clean up here.
641 */ 707 */
642 delete_from_swap_cache(page); 708 delete_from_swap_cache(*pagep);
643 set_page_dirty(page); 709 set_page_dirty(*pagep);
644 if (!error) { 710 if (!error) {
645 spin_lock(&info->lock); 711 spin_lock(&info->lock);
646 info->swapped--; 712 info->swapped--;
@@ -660,7 +726,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
660 struct list_head *this, *next; 726 struct list_head *this, *next;
661 struct shmem_inode_info *info; 727 struct shmem_inode_info *info;
662 int found = 0; 728 int found = 0;
663 int error; 729 int error = 0;
730
731 /*
732 * There's a faint possibility that swap page was replaced before
733 * caller locked it: it will come back later with the right page.
734 */
735 if (unlikely(!PageSwapCache(page)))
736 goto out;
664 737
665 /* 738 /*
666 * Charge page using GFP_KERNEL while we can wait, before taking 739 * Charge page using GFP_KERNEL while we can wait, before taking
@@ -676,7 +749,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
676 list_for_each_safe(this, next, &shmem_swaplist) { 749 list_for_each_safe(this, next, &shmem_swaplist) {
677 info = list_entry(this, struct shmem_inode_info, swaplist); 750 info = list_entry(this, struct shmem_inode_info, swaplist);
678 if (info->swapped) 751 if (info->swapped)
679 found = shmem_unuse_inode(info, swap, page); 752 found = shmem_unuse_inode(info, swap, &page);
680 else 753 else
681 list_del_init(&info->swaplist); 754 list_del_init(&info->swaplist);
682 cond_resched(); 755 cond_resched();
@@ -685,8 +758,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
685 } 758 }
686 mutex_unlock(&shmem_swaplist_mutex); 759 mutex_unlock(&shmem_swaplist_mutex);
687 760
688 if (!found)
689 mem_cgroup_uncharge_cache_page(page);
690 if (found < 0) 761 if (found < 0)
691 error = found; 762 error = found;
692out: 763out:
@@ -727,6 +798,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
727 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ 798 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
728 goto redirty; 799 goto redirty;
729 } 800 }
801
802 /*
803 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
804 * value into swapfile.c, the only way we can correctly account for a
805 * fallocated page arriving here is now to initialize it and write it.
806 *
807 * That's okay for a page already fallocated earlier, but if we have
808 * not yet completed the fallocation, then (a) we want to keep track
809 * of this page in case we have to undo it, and (b) it may not be a
810 * good idea to continue anyway, once we're pushing into swap. So
811 * reactivate the page, and let shmem_fallocate() quit when too many.
812 */
813 if (!PageUptodate(page)) {
814 if (inode->i_private) {
815 struct shmem_falloc *shmem_falloc;
816 spin_lock(&inode->i_lock);
817 shmem_falloc = inode->i_private;
818 if (shmem_falloc &&
819 index >= shmem_falloc->start &&
820 index < shmem_falloc->next)
821 shmem_falloc->nr_unswapped++;
822 else
823 shmem_falloc = NULL;
824 spin_unlock(&inode->i_lock);
825 if (shmem_falloc)
826 goto redirty;
827 }
828 clear_highpage(page);
829 flush_dcache_page(page);
830 SetPageUptodate(page);
831 }
832
730 swap = get_swap_page(); 833 swap = get_swap_page();
731 if (!swap.val) 834 if (!swap.val)
732 goto redirty; 835 goto redirty;
@@ -856,6 +959,84 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
856#endif 959#endif
857 960
858/* 961/*
962 * When a page is moved from swapcache to shmem filecache (either by the
963 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
964 * shmem_unuse_inode()), it may have been read in earlier from swap, in
965 * ignorance of the mapping it belongs to. If that mapping has special
966 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
967 * we may need to copy to a suitable page before moving to filecache.
968 *
969 * In a future release, this may well be extended to respect cpuset and
970 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
971 * but for now it is a simple matter of zone.
972 */
973static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
974{
975 return page_zonenum(page) > gfp_zone(gfp);
976}
977
978static int shmem_replace_page(struct page **pagep, gfp_t gfp,
979 struct shmem_inode_info *info, pgoff_t index)
980{
981 struct page *oldpage, *newpage;
982 struct address_space *swap_mapping;
983 pgoff_t swap_index;
984 int error;
985
986 oldpage = *pagep;
987 swap_index = page_private(oldpage);
988 swap_mapping = page_mapping(oldpage);
989
990 /*
991 * We have arrived here because our zones are constrained, so don't
992 * limit chance of success by further cpuset and node constraints.
993 */
994 gfp &= ~GFP_CONSTRAINT_MASK;
995 newpage = shmem_alloc_page(gfp, info, index);
996 if (!newpage)
997 return -ENOMEM;
998 VM_BUG_ON(shmem_should_replace_page(newpage, gfp));
999
1000 *pagep = newpage;
1001 page_cache_get(newpage);
1002 copy_highpage(newpage, oldpage);
1003
1004 VM_BUG_ON(!PageLocked(oldpage));
1005 __set_page_locked(newpage);
1006 VM_BUG_ON(!PageUptodate(oldpage));
1007 SetPageUptodate(newpage);
1008 VM_BUG_ON(!PageSwapBacked(oldpage));
1009 SetPageSwapBacked(newpage);
1010 VM_BUG_ON(!swap_index);
1011 set_page_private(newpage, swap_index);
1012 VM_BUG_ON(!PageSwapCache(oldpage));
1013 SetPageSwapCache(newpage);
1014
1015 /*
1016 * Our caller will very soon move newpage out of swapcache, but it's
1017 * a nice clean interface for us to replace oldpage by newpage there.
1018 */
1019 spin_lock_irq(&swap_mapping->tree_lock);
1020 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1021 newpage);
1022 __inc_zone_page_state(newpage, NR_FILE_PAGES);
1023 __dec_zone_page_state(oldpage, NR_FILE_PAGES);
1024 spin_unlock_irq(&swap_mapping->tree_lock);
1025 BUG_ON(error);
1026
1027 mem_cgroup_replace_page_cache(oldpage, newpage);
1028 lru_cache_add_anon(newpage);
1029
1030 ClearPageSwapCache(oldpage);
1031 set_page_private(oldpage, 0);
1032
1033 unlock_page(oldpage);
1034 page_cache_release(oldpage);
1035 page_cache_release(oldpage);
1036 return 0;
1037}
1038
1039/*
859 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate 1040 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
860 * 1041 *
861 * If we allocate a new one we do not mark it dirty. That's up to the 1042 * If we allocate a new one we do not mark it dirty. That's up to the
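shmem_should_replace_page(), added in the hunk above, reduces "can this mapping keep the page it was swapped into?" to a zone comparison: if the page sits in a higher zone than the mapping's gfp mask permits (a highmem page for a mapping that needs memory below 4GB, as with the gma500 case mentioned in the comment), it must be copied before moving to the filecache. A toy model of that comparison; the zone ordering here is an assumption mirroring the kernel's low-to-high enum zone_type:

#include <stdio.h>

enum zone { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_HIGHMEM };

static int should_replace(enum zone page_zone, enum zone highest_allowed)
{
        return page_zone > highest_allowed;
}

int main(void)
{
        /* mapping limited to DMA32, but the page was read into highmem */
        printf("replace? %d\n", should_replace(ZONE_HIGHMEM, ZONE_DMA32));
        /* same mapping, page already below the limit */
        printf("replace? %d\n", should_replace(ZONE_DMA32, ZONE_DMA32));
        return 0;
}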
@@ -872,6 +1053,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
872 swp_entry_t swap; 1053 swp_entry_t swap;
873 int error; 1054 int error;
874 int once = 0; 1055 int once = 0;
1056 int alloced = 0;
875 1057
876 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) 1058 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
877 return -EFBIG; 1059 return -EFBIG;
@@ -883,19 +1065,21 @@ repeat:
883 page = NULL; 1065 page = NULL;
884 } 1066 }
885 1067
886 if (sgp != SGP_WRITE && 1068 if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
887 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1069 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
888 error = -EINVAL; 1070 error = -EINVAL;
889 goto failed; 1071 goto failed;
890 } 1072 }
891 1073
1074 /* fallocated page? */
1075 if (page && !PageUptodate(page)) {
1076 if (sgp != SGP_READ)
1077 goto clear;
1078 unlock_page(page);
1079 page_cache_release(page);
1080 page = NULL;
1081 }
892 if (page || (sgp == SGP_READ && !swap.val)) { 1082 if (page || (sgp == SGP_READ && !swap.val)) {
893 /*
894 * Once we can get the page lock, it must be uptodate:
895 * if there were an error in reading back from swap,
896 * the page would not be inserted into the filecache.
897 */
898 BUG_ON(page && !PageUptodate(page));
899 *pagep = page; 1083 *pagep = page;
900 return 0; 1084 return 0;
901 } 1085 }
@@ -923,19 +1107,20 @@ repeat:
923 1107
924 /* We have to do this with page locked to prevent races */ 1108 /* We have to do this with page locked to prevent races */
925 lock_page(page); 1109 lock_page(page);
1110 if (!PageSwapCache(page) || page->mapping) {
1111 error = -EEXIST; /* try again */
1112 goto failed;
1113 }
926 if (!PageUptodate(page)) { 1114 if (!PageUptodate(page)) {
927 error = -EIO; 1115 error = -EIO;
928 goto failed; 1116 goto failed;
929 } 1117 }
930 wait_on_page_writeback(page); 1118 wait_on_page_writeback(page);
931 1119
932 /* Someone may have already done it for us */ 1120 if (shmem_should_replace_page(page, gfp)) {
933 if (page->mapping) { 1121 error = shmem_replace_page(&page, gfp, info, index);
934 if (page->mapping == mapping && 1122 if (error)
935 page->index == index) 1123 goto failed;
936 goto done;
937 error = -EEXIST;
938 goto failed;
939 } 1124 }
940 1125
941 error = mem_cgroup_cache_charge(page, current->mm, 1126 error = mem_cgroup_cache_charge(page, current->mm,
@@ -991,19 +1176,36 @@ repeat:
991 inode->i_blocks += BLOCKS_PER_PAGE; 1176 inode->i_blocks += BLOCKS_PER_PAGE;
992 shmem_recalc_inode(inode); 1177 shmem_recalc_inode(inode);
993 spin_unlock(&info->lock); 1178 spin_unlock(&info->lock);
1179 alloced = true;
994 1180
995 clear_highpage(page); 1181 /*
996 flush_dcache_page(page); 1182 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
997 SetPageUptodate(page); 1183 */
1184 if (sgp == SGP_FALLOC)
1185 sgp = SGP_WRITE;
1186clear:
1187 /*
1188 * Let SGP_WRITE caller clear ends if write does not fill page;
1189 * but SGP_FALLOC on a page fallocated earlier must initialize
1190 * it now, lest undo on failure cancel our earlier guarantee.
1191 */
1192 if (sgp != SGP_WRITE) {
1193 clear_highpage(page);
1194 flush_dcache_page(page);
1195 SetPageUptodate(page);
1196 }
998 if (sgp == SGP_DIRTY) 1197 if (sgp == SGP_DIRTY)
999 set_page_dirty(page); 1198 set_page_dirty(page);
1000 } 1199 }
1001done: 1200
1002 /* Perhaps the file has been truncated since we checked */ 1201 /* Perhaps the file has been truncated since we checked */
1003 if (sgp != SGP_WRITE && 1202 if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
1004 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1203 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1005 error = -EINVAL; 1204 error = -EINVAL;
1006 goto trunc; 1205 if (alloced)
1206 goto trunc;
1207 else
1208 goto failed;
1007 } 1209 }
1008 *pagep = page; 1210 *pagep = page;
1009 return 0; 1211 return 0;
@@ -1012,6 +1214,7 @@ done:
1012 * Error recovery. 1214 * Error recovery.
1013 */ 1215 */
1014trunc: 1216trunc:
1217 info = SHMEM_I(inode);
1015 ClearPageDirty(page); 1218 ClearPageDirty(page);
1016 delete_from_page_cache(page); 1219 delete_from_page_cache(page);
1017 spin_lock(&info->lock); 1220 spin_lock(&info->lock);
@@ -1019,6 +1222,7 @@ trunc:
1019 inode->i_blocks -= BLOCKS_PER_PAGE; 1222 inode->i_blocks -= BLOCKS_PER_PAGE;
1020 spin_unlock(&info->lock); 1223 spin_unlock(&info->lock);
1021decused: 1224decused:
1225 sbinfo = SHMEM_SB(inode->i_sb);
1022 if (sbinfo->max_blocks) 1226 if (sbinfo->max_blocks)
1023 percpu_counter_add(&sbinfo->used_blocks, -1); 1227 percpu_counter_add(&sbinfo->used_blocks, -1);
1024unacct: 1228unacct:
@@ -1204,6 +1408,14 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1204 if (pos + copied > inode->i_size) 1408 if (pos + copied > inode->i_size)
1205 i_size_write(inode, pos + copied); 1409 i_size_write(inode, pos + copied);
1206 1410
1411 if (!PageUptodate(page)) {
1412 if (copied < PAGE_CACHE_SIZE) {
1413 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1414 zero_user_segments(page, 0, from,
1415 from + copied, PAGE_CACHE_SIZE);
1416 }
1417 SetPageUptodate(page);
1418 }
1207 set_page_dirty(page); 1419 set_page_dirty(page);
1208 unlock_page(page); 1420 unlock_page(page);
1209 page_cache_release(page); 1421 page_cache_release(page);
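The shmem_write_end() addition above only needs to zero the parts of a not-yet-Uptodate page that the write did not cover: [0, from) before the copied bytes and [from + copied, PAGE_CACHE_SIZE) after them. The index arithmetic, modeled on an ordinary buffer with an assumed 4KB page:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096          /* assumed page size for the demo */

int main(void)
{
        static unsigned char page[PAGE_SIZE];
        unsigned long pos = 100, copied = 50;
        unsigned int from = pos & (PAGE_SIZE - 1);      /* write offset in page */

        memset(page, 0xaa, sizeof(page));       /* pretend stale contents */
        memset(page + from, 0x55, copied);      /* bytes the write delivered */

        /* zero everything the write did not cover, as zero_user_segments()
         * does for [0, from) and [from + copied, PAGE_SIZE) */
        memset(page, 0, from);
        memset(page + from + copied, 0, PAGE_SIZE - (from + copied));

        printf("head=%#x data=%#x tail=%#x\n",
               page[0], page[from], page[from + copied]);
        return 0;
}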
@@ -1462,6 +1674,199 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1462 return error; 1674 return error;
1463} 1675}
1464 1676
1677/*
1678 * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
1679 */
1680static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
1681 pgoff_t index, pgoff_t end, int origin)
1682{
1683 struct page *page;
1684 struct pagevec pvec;
1685 pgoff_t indices[PAGEVEC_SIZE];
1686 bool done = false;
1687 int i;
1688
1689 pagevec_init(&pvec, 0);
1690 pvec.nr = 1; /* start small: we may be there already */
1691 while (!done) {
1692 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
1693 pvec.nr, pvec.pages, indices);
1694 if (!pvec.nr) {
1695 if (origin == SEEK_DATA)
1696 index = end;
1697 break;
1698 }
1699 for (i = 0; i < pvec.nr; i++, index++) {
1700 if (index < indices[i]) {
1701 if (origin == SEEK_HOLE) {
1702 done = true;
1703 break;
1704 }
1705 index = indices[i];
1706 }
1707 page = pvec.pages[i];
1708 if (page && !radix_tree_exceptional_entry(page)) {
1709 if (!PageUptodate(page))
1710 page = NULL;
1711 }
1712 if (index >= end ||
1713 (page && origin == SEEK_DATA) ||
1714 (!page && origin == SEEK_HOLE)) {
1715 done = true;
1716 break;
1717 }
1718 }
1719 shmem_deswap_pagevec(&pvec);
1720 pagevec_release(&pvec);
1721 pvec.nr = PAGEVEC_SIZE;
1722 cond_resched();
1723 }
1724 return index;
1725}
1726
1727static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin)
1728{
1729 struct address_space *mapping;
1730 struct inode *inode;
1731 pgoff_t start, end;
1732 loff_t new_offset;
1733
1734 if (origin != SEEK_DATA && origin != SEEK_HOLE)
1735 return generic_file_llseek_size(file, offset, origin,
1736 MAX_LFS_FILESIZE);
1737 mapping = file->f_mapping;
1738 inode = mapping->host;
1739 mutex_lock(&inode->i_mutex);
1740 /* We're holding i_mutex so we can access i_size directly */
1741
1742 if (offset < 0)
1743 offset = -EINVAL;
1744 else if (offset >= inode->i_size)
1745 offset = -ENXIO;
1746 else {
1747 start = offset >> PAGE_CACHE_SHIFT;
1748 end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1749 new_offset = shmem_seek_hole_data(mapping, start, end, origin);
1750 new_offset <<= PAGE_CACHE_SHIFT;
1751 if (new_offset > offset) {
1752 if (new_offset < inode->i_size)
1753 offset = new_offset;
1754 else if (origin == SEEK_DATA)
1755 offset = -ENXIO;
1756 else
1757 offset = inode->i_size;
1758 }
1759 }
1760
1761 if (offset >= 0 && offset != file->f_pos) {
1762 file->f_pos = offset;
1763 file->f_version = 0;
1764 }
1765 mutex_unlock(&inode->i_mutex);
1766 return offset;
1767}
1768
1769static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1770 loff_t len)
1771{
1772 struct inode *inode = file->f_path.dentry->d_inode;
1773 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1774 struct shmem_falloc shmem_falloc;
1775 pgoff_t start, index, end;
1776 int error;
1777
1778 mutex_lock(&inode->i_mutex);
1779
1780 if (mode & FALLOC_FL_PUNCH_HOLE) {
1781 struct address_space *mapping = file->f_mapping;
1782 loff_t unmap_start = round_up(offset, PAGE_SIZE);
1783 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
1784
1785 if ((u64)unmap_end > (u64)unmap_start)
1786 unmap_mapping_range(mapping, unmap_start,
1787 1 + unmap_end - unmap_start, 0);
1788 shmem_truncate_range(inode, offset, offset + len - 1);
1789 /* No need to unmap again: hole-punching leaves COWed pages */
1790 error = 0;
1791 goto out;
1792 }
1793
1794 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
1795 error = inode_newsize_ok(inode, offset + len);
1796 if (error)
1797 goto out;
1798
1799 start = offset >> PAGE_CACHE_SHIFT;
1800 end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1801 /* Try to avoid a swapstorm if len is impossible to satisfy */
1802 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
1803 error = -ENOSPC;
1804 goto out;
1805 }
1806
1807 shmem_falloc.start = start;
1808 shmem_falloc.next = start;
1809 shmem_falloc.nr_falloced = 0;
1810 shmem_falloc.nr_unswapped = 0;
1811 spin_lock(&inode->i_lock);
1812 inode->i_private = &shmem_falloc;
1813 spin_unlock(&inode->i_lock);
1814
1815 for (index = start; index < end; index++) {
1816 struct page *page;
1817
1818 /*
1819 * Good, the fallocate(2) manpage permits EINTR: we may have
1820 * been interrupted because we are using up too much memory.
1821 */
1822 if (signal_pending(current))
1823 error = -EINTR;
1824 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
1825 error = -ENOMEM;
1826 else
1827 error = shmem_getpage(inode, index, &page, SGP_FALLOC,
1828 NULL);
1829 if (error) {
1830 /* Remove the !PageUptodate pages we added */
1831 shmem_undo_range(inode,
1832 (loff_t)start << PAGE_CACHE_SHIFT,
1833 (loff_t)index << PAGE_CACHE_SHIFT, true);
1834 goto undone;
1835 }
1836
1837 /*
1838 * Inform shmem_writepage() how far we have reached.
1839 * No need for lock or barrier: we have the page lock.
1840 */
1841 shmem_falloc.next++;
1842 if (!PageUptodate(page))
1843 shmem_falloc.nr_falloced++;
1844
1845 /*
1846 * If !PageUptodate, leave it that way so that freeable pages
1847 * can be recognized if we need to rollback on error later.
1848 * But set_page_dirty so that memory pressure will swap rather
1849 * than free the pages we are allocating (and SGP_CACHE pages
1850 * might still be clean: we now need to mark those dirty too).
1851 */
1852 set_page_dirty(page);
1853 unlock_page(page);
1854 page_cache_release(page);
1855 cond_resched();
1856 }
1857
1858 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
1859 i_size_write(inode, offset + len);
1860 inode->i_ctime = CURRENT_TIME;
1861undone:
1862 spin_lock(&inode->i_lock);
1863 inode->i_private = NULL;
1864 spin_unlock(&inode->i_lock);
1865out:
1866 mutex_unlock(&inode->i_mutex);
1867 return error;
1868}
1869
1465static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1870static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1466{ 1871{
1467 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1872 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -1665,6 +2070,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1665 kaddr = kmap_atomic(page); 2070 kaddr = kmap_atomic(page);
1666 memcpy(kaddr, symname, len); 2071 memcpy(kaddr, symname, len);
1667 kunmap_atomic(kaddr); 2072 kunmap_atomic(kaddr);
2073 SetPageUptodate(page);
1668 set_page_dirty(page); 2074 set_page_dirty(page);
1669 unlock_page(page); 2075 unlock_page(page);
1670 page_cache_release(page); 2076 page_cache_release(page);
@@ -2270,6 +2676,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2270 } 2676 }
2271 } 2677 }
2272 sb->s_export_op = &shmem_export_ops; 2678 sb->s_export_op = &shmem_export_ops;
2679 sb->s_flags |= MS_NOSEC;
2273#else 2680#else
2274 sb->s_flags |= MS_NOUSER; 2681 sb->s_flags |= MS_NOUSER;
2275#endif 2682#endif
@@ -2364,7 +2771,7 @@ static const struct address_space_operations shmem_aops = {
2364static const struct file_operations shmem_file_operations = { 2771static const struct file_operations shmem_file_operations = {
2365 .mmap = shmem_mmap, 2772 .mmap = shmem_mmap,
2366#ifdef CONFIG_TMPFS 2773#ifdef CONFIG_TMPFS
2367 .llseek = generic_file_llseek, 2774 .llseek = shmem_file_llseek,
2368 .read = do_sync_read, 2775 .read = do_sync_read,
2369 .write = do_sync_write, 2776 .write = do_sync_write,
2370 .aio_read = shmem_file_aio_read, 2777 .aio_read = shmem_file_aio_read,
@@ -2372,12 +2779,12 @@ static const struct file_operations shmem_file_operations = {
2372 .fsync = noop_fsync, 2779 .fsync = noop_fsync,
2373 .splice_read = shmem_file_splice_read, 2780 .splice_read = shmem_file_splice_read,
2374 .splice_write = generic_file_splice_write, 2781 .splice_write = generic_file_splice_write,
2782 .fallocate = shmem_fallocate,
2375#endif 2783#endif
2376}; 2784};
2377 2785
2378static const struct inode_operations shmem_inode_operations = { 2786static const struct inode_operations shmem_inode_operations = {
2379 .setattr = shmem_setattr, 2787 .setattr = shmem_setattr,
2380 .truncate_range = shmem_truncate_range,
2381#ifdef CONFIG_TMPFS_XATTR 2788#ifdef CONFIG_TMPFS_XATTR
2382 .setxattr = shmem_setxattr, 2789 .setxattr = shmem_setxattr,
2383 .getxattr = shmem_getxattr, 2790 .getxattr = shmem_getxattr,
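Note: the shmem.c hunks above wire up shmem_file_llseek() and shmem_fallocate(), so tmpfs files gain SEEK_DATA/SEEK_HOLE and FALLOC_FL_PUNCH_HOLE support. A minimal userspace sketch of what that enables follows; the file path, sizes and missing error handling are illustrative assumptions, not part of the patch.

/* Sketch: exercise the new tmpfs llseek and fallocate paths. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	off_t off;
	int fd = open("/tmp/shmem-seek-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0)
		return 1;

	/* Preallocate 16 pages without changing i_size: the SGP_FALLOC path. */
	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 * 4096);

	/* Write one page of real data at page index 8, extending i_size. */
	memset(buf, 'x', sizeof(buf));
	pwrite(fd, buf, sizeof(buf), 8 * 4096L);

	/*
	 * shmem_seek_hole_data() walks the radix tree; pages that were
	 * fallocated but never written (!Uptodate) still count as holes.
	 */
	off = lseek(fd, 0, SEEK_DATA);
	printf("first data at %lld\n", (long long)off);
	off = lseek(fd, 8 * 4096L, SEEK_HOLE);
	printf("next hole at %lld\n", (long long)off);

	/* Punch the data back out via the new shmem_fallocate(). */
	fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		  8 * 4096L, 4096);

	close(fd);
	unlink("/tmp/shmem-seek-demo");
	return 0;
}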
diff --git a/mm/sparse.c b/mm/sparse.c
index a8bc7d364deb..6a4bf9160e85 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -273,10 +273,10 @@ static unsigned long *__kmalloc_section_usemap(void)
273#ifdef CONFIG_MEMORY_HOTREMOVE 273#ifdef CONFIG_MEMORY_HOTREMOVE
274static unsigned long * __init 274static unsigned long * __init
275sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 275sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
276 unsigned long count) 276 unsigned long size)
277{ 277{
278 unsigned long section_nr; 278 pg_data_t *host_pgdat;
279 279 unsigned long goal;
280 /* 280 /*
281 * A page may contain usemaps for other sections preventing the 281 * A page may contain usemaps for other sections preventing the
282 * page being freed and making a section unremovable while 282 * page being freed and making a section unremovable while
@@ -287,8 +287,10 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
287 * from the same section as the pgdat where possible to avoid 287 * from the same section as the pgdat where possible to avoid
288 * this problem. 288 * this problem.
289 */ 289 */
290 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); 290 goal = __pa(pgdat) & PAGE_SECTION_MASK;
291 return alloc_bootmem_section(usemap_size() * count, section_nr); 291 host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT));
292 return __alloc_bootmem_node_nopanic(host_pgdat, size,
293 SMP_CACHE_BYTES, goal);
292} 294}
293 295
294static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 296static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -332,9 +334,9 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
332#else 334#else
333static unsigned long * __init 335static unsigned long * __init
334sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 336sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
335 unsigned long count) 337 unsigned long size)
336{ 338{
337 return NULL; 339 return alloc_bootmem_node_nopanic(pgdat, size);
338} 340}
339 341
340static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 342static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -352,13 +354,10 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
352 int size = usemap_size(); 354 int size = usemap_size();
353 355
354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 356 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
355 usemap_count); 357 size * usemap_count);
356 if (!usemap) { 358 if (!usemap) {
357 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); 359 printk(KERN_WARNING "%s: allocation failed\n", __func__);
358 if (!usemap) { 360 return;
359 printk(KERN_WARNING "%s: allocation failed\n", __func__);
360 return;
361 }
362 } 361 }
363 362
364 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 363 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
diff --git a/mm/swap.c b/mm/swap.c
index 5c13f1338972..0503ad705e7c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -82,6 +82,25 @@ static void put_compound_page(struct page *page)
82 if (likely(page != page_head && 82 if (likely(page != page_head &&
83 get_page_unless_zero(page_head))) { 83 get_page_unless_zero(page_head))) {
84 unsigned long flags; 84 unsigned long flags;
85
86 /*
 87 * THP cannot break up slab pages so avoid taking
88 * compound_lock(). Slab performs non-atomic bit ops
89 * on page->flags for better performance. In particular
90 * slab_unlock() in slub used to be a hot path. It is
91 * still hot on arches that do not support
92 * this_cpu_cmpxchg_double().
93 */
94 if (PageSlab(page_head)) {
95 if (PageTail(page)) {
96 if (put_page_testzero(page_head))
97 VM_BUG_ON(1);
98
99 atomic_dec(&page->_mapcount);
100 goto skip_lock_tail;
101 } else
102 goto skip_lock;
103 }
85 /* 104 /*
86 * page_head wasn't a dangling pointer but it 105 * page_head wasn't a dangling pointer but it
87 * may not be a head page anymore by the time 106 * may not be a head page anymore by the time
@@ -92,10 +111,10 @@ static void put_compound_page(struct page *page)
92 if (unlikely(!PageTail(page))) { 111 if (unlikely(!PageTail(page))) {
93 /* __split_huge_page_refcount run before us */ 112 /* __split_huge_page_refcount run before us */
94 compound_unlock_irqrestore(page_head, flags); 113 compound_unlock_irqrestore(page_head, flags);
95 VM_BUG_ON(PageHead(page_head)); 114skip_lock:
96 if (put_page_testzero(page_head)) 115 if (put_page_testzero(page_head))
97 __put_single_page(page_head); 116 __put_single_page(page_head);
98 out_put_single: 117out_put_single:
99 if (put_page_testzero(page)) 118 if (put_page_testzero(page))
100 __put_single_page(page); 119 __put_single_page(page);
101 return; 120 return;
@@ -115,6 +134,8 @@ static void put_compound_page(struct page *page)
115 VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 134 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
116 VM_BUG_ON(atomic_read(&page->_count) != 0); 135 VM_BUG_ON(atomic_read(&page->_count) != 0);
117 compound_unlock_irqrestore(page_head, flags); 136 compound_unlock_irqrestore(page_head, flags);
137
138skip_lock_tail:
118 if (put_page_testzero(page_head)) { 139 if (put_page_testzero(page_head)) {
119 if (PageHead(page_head)) 140 if (PageHead(page_head))
120 __put_compound_page(page_head); 141 __put_compound_page(page_head);
@@ -162,6 +183,18 @@ bool __get_page_tail(struct page *page)
162 struct page *page_head = compound_trans_head(page); 183 struct page *page_head = compound_trans_head(page);
163 184
164 if (likely(page != page_head && get_page_unless_zero(page_head))) { 185 if (likely(page != page_head && get_page_unless_zero(page_head))) {
186
 187 /* See the comment in put_compound_page(). */
188 if (PageSlab(page_head)) {
189 if (likely(PageTail(page))) {
190 __get_page_tail_foll(page, false);
191 return true;
192 } else {
193 put_page(page_head);
194 return false;
195 }
196 }
197
165 /* 198 /*
166 * page_head wasn't a dangling pointer but it 199 * page_head wasn't a dangling pointer but it
167 * may not be a head page anymore by the time 200 * may not be a head page anymore by the time
@@ -279,21 +312,15 @@ void rotate_reclaimable_page(struct page *page)
279static void update_page_reclaim_stat(struct zone *zone, struct page *page, 312static void update_page_reclaim_stat(struct zone *zone, struct page *page,
280 int file, int rotated) 313 int file, int rotated)
281{ 314{
282 struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; 315 struct zone_reclaim_stat *reclaim_stat;
283 struct zone_reclaim_stat *memcg_reclaim_stat;
284 316
285 memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page); 317 reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);
318 if (!reclaim_stat)
319 reclaim_stat = &zone->lruvec.reclaim_stat;
286 320
287 reclaim_stat->recent_scanned[file]++; 321 reclaim_stat->recent_scanned[file]++;
288 if (rotated) 322 if (rotated)
289 reclaim_stat->recent_rotated[file]++; 323 reclaim_stat->recent_rotated[file]++;
290
291 if (!memcg_reclaim_stat)
292 return;
293
294 memcg_reclaim_stat->recent_scanned[file]++;
295 if (rotated)
296 memcg_reclaim_stat->recent_rotated[file]++;
297} 324}
298 325
299static void __activate_page(struct page *page, void *arg) 326static void __activate_page(struct page *page, void *arg)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fafc26d1b1dc..457b10baef59 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -601,7 +601,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
601 * This does not give an exact answer when swap count is continued, 601 * This does not give an exact answer when swap count is continued,
602 * but does include the high COUNT_CONTINUED flag to allow for that. 602 * but does include the high COUNT_CONTINUED flag to allow for that.
603 */ 603 */
604static inline int page_swapcount(struct page *page) 604int page_swapcount(struct page *page)
605{ 605{
606 int count = 0; 606 int count = 0;
607 struct swap_info_struct *p; 607 struct swap_info_struct *p;
@@ -717,37 +717,6 @@ int free_swap_and_cache(swp_entry_t entry)
717 return p != NULL; 717 return p != NULL;
718} 718}
719 719
720#ifdef CONFIG_CGROUP_MEM_RES_CTLR
721/**
722 * mem_cgroup_count_swap_user - count the user of a swap entry
723 * @ent: the swap entry to be checked
724 * @pagep: the pointer for the swap cache page of the entry to be stored
725 *
726 * Returns the number of the user of the swap entry. The number is valid only
727 * for swaps of anonymous pages.
728 * If the entry is found on swap cache, the page is stored to pagep with
729 * refcount of it being incremented.
730 */
731int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
732{
733 struct page *page;
734 struct swap_info_struct *p;
735 int count = 0;
736
737 page = find_get_page(&swapper_space, ent.val);
738 if (page)
739 count += page_mapcount(page);
740 p = swap_info_get(ent);
741 if (p) {
742 count += swap_count(p->swap_map[swp_offset(ent)]);
743 spin_unlock(&swap_lock);
744 }
745
746 *pagep = page;
747 return count;
748}
749#endif
750
751#ifdef CONFIG_HIBERNATION 720#ifdef CONFIG_HIBERNATION
752/* 721/*
753 * Find the swap type that corresponds to given device (if any). 722 * Find the swap type that corresponds to given device (if any).
diff --git a/mm/thrash.c b/mm/thrash.c
deleted file mode 100644
index 57ad495dbd54..000000000000
--- a/mm/thrash.c
+++ /dev/null
@@ -1,155 +0,0 @@
1/*
2 * mm/thrash.c
3 *
4 * Copyright (C) 2004, Red Hat, Inc.
5 * Copyright (C) 2004, Rik van Riel <riel@redhat.com>
6 * Released under the GPL, see the file COPYING for details.
7 *
8 * Simple token based thrashing protection, using the algorithm
9 * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
10 *
11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
12 * Improved algorithm to pass token:
13 * Each task has a priority which is incremented if it contended
14 * for the token in an interval less than its previous attempt.
15 * If the token is acquired, that task's priority is boosted to prevent
16 * the token from bouncing around too often and to let the task make
17 * some progress in its execution.
18 */
19
20#include <linux/jiffies.h>
21#include <linux/mm.h>
22#include <linux/sched.h>
23#include <linux/swap.h>
24#include <linux/memcontrol.h>
25
26#include <trace/events/vmscan.h>
27
28#define TOKEN_AGING_INTERVAL (0xFF)
29
30static DEFINE_SPINLOCK(swap_token_lock);
31struct mm_struct *swap_token_mm;
32static struct mem_cgroup *swap_token_memcg;
33
34#ifdef CONFIG_CGROUP_MEM_RES_CTLR
35static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
36{
37 struct mem_cgroup *memcg;
38
39 memcg = try_get_mem_cgroup_from_mm(mm);
40 if (memcg)
41 css_put(mem_cgroup_css(memcg));
42
43 return memcg;
44}
45#else
46static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
47{
48 return NULL;
49}
50#endif
51
52void grab_swap_token(struct mm_struct *mm)
53{
54 int current_interval;
55 unsigned int old_prio = mm->token_priority;
56 static unsigned int global_faults;
57 static unsigned int last_aging;
58
59 global_faults++;
60
61 current_interval = global_faults - mm->faultstamp;
62
63 if (!spin_trylock(&swap_token_lock))
64 return;
65
66 /* First come first served */
67 if (!swap_token_mm)
68 goto replace_token;
69
70 /*
71 * Usually, we don't need priority aging because long interval faults
72 * makes priority decrease quickly. But there is one exception. If the
73 * token owner task is sleeping, it never make long interval faults.
74 * Thus, we need a priority aging mechanism instead. The requirements
75 * of priority aging are
76 * 1) An aging interval is reasonable enough long. Too short aging
77 * interval makes quick swap token lost and decrease performance.
78 * 2) The swap token owner task have to get priority aging even if
79 * it's under sleep.
80 */
81 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
82 swap_token_mm->token_priority /= 2;
83 last_aging = global_faults;
84 }
85
86 if (mm == swap_token_mm) {
87 mm->token_priority += 2;
88 goto update_priority;
89 }
90
91 if (current_interval < mm->last_interval)
92 mm->token_priority++;
93 else {
94 if (likely(mm->token_priority > 0))
95 mm->token_priority--;
96 }
97
98 /* Check if we deserve the token */
99 if (mm->token_priority > swap_token_mm->token_priority)
100 goto replace_token;
101
102update_priority:
103 trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
104
105out:
106 mm->faultstamp = global_faults;
107 mm->last_interval = current_interval;
108 spin_unlock(&swap_token_lock);
109 return;
110
111replace_token:
112 mm->token_priority += 2;
113 trace_replace_swap_token(swap_token_mm, mm);
114 swap_token_mm = mm;
115 swap_token_memcg = swap_token_memcg_from_mm(mm);
116 last_aging = global_faults;
117 goto out;
118}
119
120/* Called on process exit. */
121void __put_swap_token(struct mm_struct *mm)
122{
123 spin_lock(&swap_token_lock);
124 if (likely(mm == swap_token_mm)) {
125 trace_put_swap_token(swap_token_mm);
126 swap_token_mm = NULL;
127 swap_token_memcg = NULL;
128 }
129 spin_unlock(&swap_token_lock);
130}
131
132static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
133{
134 if (!a)
135 return true;
136 if (!b)
137 return true;
138 if (a == b)
139 return true;
140 return false;
141}
142
143void disable_swap_token(struct mem_cgroup *memcg)
144{
145 /* memcg reclaim don't disable unrelated mm token. */
146 if (match_memcg(memcg, swap_token_memcg)) {
147 spin_lock(&swap_token_lock);
148 if (match_memcg(memcg, swap_token_memcg)) {
149 trace_disable_swap_token(swap_token_mm);
150 swap_token_mm = NULL;
151 swap_token_memcg = NULL;
152 }
153 spin_unlock(&swap_token_lock);
154 }
155}
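Note: for readers tracking the removal, the hand-off logic mm/thrash.c implemented can be condensed as the standalone toy below. It is a simplified sketch of the deleted grab_swap_token() shown above, using invented userspace types; it is not a reimplementation of the kernel interface.

#include <stdbool.h>

#define TOKEN_AGING_INTERVAL 0xFF

struct toy_mm {
	unsigned int token_priority;
	unsigned int faultstamp;
	unsigned int last_interval;
};

static struct toy_mm *token_holder;
static unsigned int global_faults, last_aging;

/* Called on a major fault by "mm"; returns true if mm holds the token. */
static bool toy_grab_swap_token(struct toy_mm *mm)
{
	unsigned int interval = ++global_faults - mm->faultstamp;
	bool take = false;

	if (!token_holder) {
		take = true;				/* first come, first served */
	} else {
		/* Age a sleeping holder so the token cannot be parked forever. */
		if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
			token_holder->token_priority /= 2;
			last_aging = global_faults;
		}
		if (mm == token_holder)
			mm->token_priority += 2;	/* keep boosting the holder */
		else if (interval < mm->last_interval)
			mm->token_priority++;		/* faulting harder than before */
		else if (mm->token_priority > 0)
			mm->token_priority--;

		if (mm != token_holder &&
		    mm->token_priority > token_holder->token_priority)
			take = true;
	}
	if (take) {
		mm->token_priority += 2;
		token_holder = mm;
		last_aging = global_faults;
	}
	mm->faultstamp = global_faults;
	mm->last_interval = interval;
	return mm == token_holder;
}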
diff --git a/mm/truncate.c b/mm/truncate.c
index 61a183b89df6..75801acdaac7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -602,31 +602,6 @@ int vmtruncate(struct inode *inode, loff_t newsize)
602} 602}
603EXPORT_SYMBOL(vmtruncate); 603EXPORT_SYMBOL(vmtruncate);
604 604
605int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
606{
607 struct address_space *mapping = inode->i_mapping;
608 loff_t holebegin = round_up(lstart, PAGE_SIZE);
609 loff_t holelen = 1 + lend - holebegin;
610
611 /*
612 * If the underlying filesystem is not going to provide
613 * a way to truncate a range of blocks (punch a hole) -
614 * we should return failure right now.
615 */
616 if (!inode->i_op->truncate_range)
617 return -ENOSYS;
618
619 mutex_lock(&inode->i_mutex);
620 inode_dio_wait(inode);
621 unmap_mapping_range(mapping, holebegin, holelen, 1);
622 inode->i_op->truncate_range(inode, lstart, lend);
623 /* unmap again to remove racily COWed private pages */
624 unmap_mapping_range(mapping, holebegin, holelen, 1);
625 mutex_unlock(&inode->i_mutex);
626
627 return 0;
628}
629
630/** 605/**
631 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched 606 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
632 * @inode: inode 607 * @inode: inode
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 94dff883b449..2aad49981b57 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1185,9 +1185,10 @@ void __init vmalloc_init(void)
1185 /* Import existing vmlist entries. */ 1185 /* Import existing vmlist entries. */
1186 for (tmp = vmlist; tmp; tmp = tmp->next) { 1186 for (tmp = vmlist; tmp; tmp = tmp->next) {
1187 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); 1187 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1188 va->flags = tmp->flags | VM_VM_AREA; 1188 va->flags = VM_VM_AREA;
1189 va->va_start = (unsigned long)tmp->addr; 1189 va->va_start = (unsigned long)tmp->addr;
1190 va->va_end = va->va_start + tmp->size; 1190 va->va_end = va->va_start + tmp->size;
1191 va->vm = tmp;
1191 __insert_vmap_area(va); 1192 __insert_vmap_area(va);
1192 } 1193 }
1193 1194
@@ -2375,8 +2376,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2375 return NULL; 2376 return NULL;
2376 } 2377 }
2377 2378
2378 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); 2379 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
2379 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); 2380 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
2380 if (!vas || !vms) 2381 if (!vas || !vms)
2381 goto err_free2; 2382 goto err_free2;
2382 2383
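Note: the kcalloc() conversion above is an overflow-hardening change: kcalloc(n, size, gfp) refuses allocations where n * size would wrap, which the open-coded kzalloc(sizeof(x) * n, gfp) did not. A userspace analogue of that check, for illustration only:

#include <stdlib.h>
#include <string.h>

/* Like calloc()/kcalloc(): reject a multiplication that would overflow. */
static void *checked_calloc(size_t n, size_t size)
{
	void *p;

	if (size && n > (size_t)-1 / size)
		return NULL;			/* n * size would wrap around */
	p = malloc(n * size);
	if (p)
		memset(p, 0, n * size);		/* zeroed, like kzalloc()/kcalloc() */
	return p;
}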
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3d1365c17868..8deb5f4da4d9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -53,24 +53,6 @@
53#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
54#include <trace/events/vmscan.h> 54#include <trace/events/vmscan.h>
55 55
56/*
57 * reclaim_mode determines how the inactive list is shrunk
58 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
59 * RECLAIM_MODE_ASYNC: Do not block
60 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
61 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
62 * page from the LRU and reclaim all pages within a
63 * naturally aligned range
64 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
65 * order-0 pages and then compact the zone
66 */
67typedef unsigned __bitwise__ reclaim_mode_t;
68#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
69#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
70#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
71#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
72#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
73
74struct scan_control { 56struct scan_control {
75 /* Incremented by the number of inactive pages that were scanned */ 57 /* Incremented by the number of inactive pages that were scanned */
76 unsigned long nr_scanned; 58 unsigned long nr_scanned;
@@ -97,12 +79,6 @@ struct scan_control {
97 int order; 79 int order;
98 80
99 /* 81 /*
100 * Intend to reclaim enough continuous memory rather than reclaim
101 * enough amount of memory. i.e, mode for high order allocation.
102 */
103 reclaim_mode_t reclaim_mode;
104
105 /*
106 * The memory cgroup that hit its limit and as a result is the 82 * The memory cgroup that hit its limit and as a result is the
107 * primary target of this reclaim invocation. 83 * primary target of this reclaim invocation.
108 */ 84 */
@@ -164,35 +140,22 @@ static bool global_reclaim(struct scan_control *sc)
164{ 140{
165 return !sc->target_mem_cgroup; 141 return !sc->target_mem_cgroup;
166} 142}
167
168static bool scanning_global_lru(struct mem_cgroup_zone *mz)
169{
170 return !mz->mem_cgroup;
171}
172#else 143#else
173static bool global_reclaim(struct scan_control *sc) 144static bool global_reclaim(struct scan_control *sc)
174{ 145{
175 return true; 146 return true;
176} 147}
177
178static bool scanning_global_lru(struct mem_cgroup_zone *mz)
179{
180 return true;
181}
182#endif 148#endif
183 149
184static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) 150static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz)
185{ 151{
186 if (!scanning_global_lru(mz)) 152 return &mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup)->reclaim_stat;
187 return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone);
188
189 return &mz->zone->reclaim_stat;
190} 153}
191 154
192static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, 155static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
193 enum lru_list lru) 156 enum lru_list lru)
194{ 157{
195 if (!scanning_global_lru(mz)) 158 if (!mem_cgroup_disabled())
196 return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, 159 return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
197 zone_to_nid(mz->zone), 160 zone_to_nid(mz->zone),
198 zone_idx(mz->zone), 161 zone_idx(mz->zone),
@@ -364,39 +327,6 @@ out:
364 return ret; 327 return ret;
365} 328}
366 329
367static void set_reclaim_mode(int priority, struct scan_control *sc,
368 bool sync)
369{
370 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
371
372 /*
373 * Initially assume we are entering either lumpy reclaim or
374 * reclaim/compaction.Depending on the order, we will either set the
375 * sync mode or just reclaim order-0 pages later.
376 */
377 if (COMPACTION_BUILD)
378 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
379 else
380 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
381
382 /*
383 * Avoid using lumpy reclaim or reclaim/compaction if possible by
384 * restricting when its set to either costly allocations or when
385 * under memory pressure
386 */
387 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
388 sc->reclaim_mode |= syncmode;
389 else if (sc->order && priority < DEF_PRIORITY - 2)
390 sc->reclaim_mode |= syncmode;
391 else
392 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
393}
394
395static void reset_reclaim_mode(struct scan_control *sc)
396{
397 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
398}
399
400static inline int is_page_cache_freeable(struct page *page) 330static inline int is_page_cache_freeable(struct page *page)
401{ 331{
402 /* 332 /*
@@ -416,10 +346,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
416 return 1; 346 return 1;
417 if (bdi == current->backing_dev_info) 347 if (bdi == current->backing_dev_info)
418 return 1; 348 return 1;
419
420 /* lumpy reclaim for hugepage often need a lot of write */
421 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
422 return 1;
423 return 0; 349 return 0;
424} 350}
425 351
@@ -523,8 +449,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
523 /* synchronous write or broken a_ops? */ 449 /* synchronous write or broken a_ops? */
524 ClearPageReclaim(page); 450 ClearPageReclaim(page);
525 } 451 }
526 trace_mm_vmscan_writepage(page, 452 trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
527 trace_reclaim_flags(page, sc->reclaim_mode));
528 inc_zone_page_state(page, NR_VMSCAN_WRITE); 453 inc_zone_page_state(page, NR_VMSCAN_WRITE);
529 return PAGE_SUCCESS; 454 return PAGE_SUCCESS;
530 } 455 }
@@ -707,13 +632,10 @@ static enum page_references page_check_references(struct page *page,
707 int referenced_ptes, referenced_page; 632 int referenced_ptes, referenced_page;
708 unsigned long vm_flags; 633 unsigned long vm_flags;
709 634
710 referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); 635 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
636 &vm_flags);
711 referenced_page = TestClearPageReferenced(page); 637 referenced_page = TestClearPageReferenced(page);
712 638
713 /* Lumpy reclaim - ignore references */
714 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
715 return PAGEREF_RECLAIM;
716
717 /* 639 /*
718 * Mlock lost the isolation race with us. Let try_to_unmap() 640 * Mlock lost the isolation race with us. Let try_to_unmap()
719 * move the page to the unevictable list. 641 * move the page to the unevictable list.
@@ -722,7 +644,7 @@ static enum page_references page_check_references(struct page *page,
722 return PAGEREF_RECLAIM; 644 return PAGEREF_RECLAIM;
723 645
724 if (referenced_ptes) { 646 if (referenced_ptes) {
725 if (PageAnon(page)) 647 if (PageSwapBacked(page))
726 return PAGEREF_ACTIVATE; 648 return PAGEREF_ACTIVATE;
727 /* 649 /*
728 * All mapped pages start out with page table 650 * All mapped pages start out with page table
@@ -813,19 +735,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
813 735
814 if (PageWriteback(page)) { 736 if (PageWriteback(page)) {
815 nr_writeback++; 737 nr_writeback++;
816 /* 738 unlock_page(page);
817 * Synchronous reclaim cannot queue pages for 739 goto keep;
818 * writeback due to the possibility of stack overflow
819 * but if it encounters a page under writeback, wait
820 * for the IO to complete.
821 */
822 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
823 may_enter_fs)
824 wait_on_page_writeback(page);
825 else {
826 unlock_page(page);
827 goto keep_lumpy;
828 }
829 } 740 }
830 741
831 references = page_check_references(page, mz, sc); 742 references = page_check_references(page, mz, sc);
@@ -908,7 +819,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
908 goto activate_locked; 819 goto activate_locked;
909 case PAGE_SUCCESS: 820 case PAGE_SUCCESS:
910 if (PageWriteback(page)) 821 if (PageWriteback(page))
911 goto keep_lumpy; 822 goto keep;
912 if (PageDirty(page)) 823 if (PageDirty(page))
913 goto keep; 824 goto keep;
914 825
@@ -994,7 +905,6 @@ cull_mlocked:
994 try_to_free_swap(page); 905 try_to_free_swap(page);
995 unlock_page(page); 906 unlock_page(page);
996 putback_lru_page(page); 907 putback_lru_page(page);
997 reset_reclaim_mode(sc);
998 continue; 908 continue;
999 909
1000activate_locked: 910activate_locked:
@@ -1007,8 +917,6 @@ activate_locked:
1007keep_locked: 917keep_locked:
1008 unlock_page(page); 918 unlock_page(page);
1009keep: 919keep:
1010 reset_reclaim_mode(sc);
1011keep_lumpy:
1012 list_add(&page->lru, &ret_pages); 920 list_add(&page->lru, &ret_pages);
1013 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 921 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
1014 } 922 }
@@ -1064,11 +972,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1064 if (!all_lru_mode && !!page_is_file_cache(page) != file) 972 if (!all_lru_mode && !!page_is_file_cache(page) != file)
1065 return ret; 973 return ret;
1066 974
1067 /* 975 /* Do not give back unevictable pages for compaction */
1068 * When this function is being called for lumpy reclaim, we
1069 * initially look into all LRU pages, active, inactive and
1070 * unevictable; only give shrink_page_list evictable pages.
1071 */
1072 if (PageUnevictable(page)) 976 if (PageUnevictable(page))
1073 return ret; 977 return ret;
1074 978
@@ -1153,9 +1057,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1153 struct lruvec *lruvec; 1057 struct lruvec *lruvec;
1154 struct list_head *src; 1058 struct list_head *src;
1155 unsigned long nr_taken = 0; 1059 unsigned long nr_taken = 0;
1156 unsigned long nr_lumpy_taken = 0;
1157 unsigned long nr_lumpy_dirty = 0;
1158 unsigned long nr_lumpy_failed = 0;
1159 unsigned long scan; 1060 unsigned long scan;
1160 int lru = LRU_BASE; 1061 int lru = LRU_BASE;
1161 1062
@@ -1168,10 +1069,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1168 1069
1169 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1070 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1170 struct page *page; 1071 struct page *page;
1171 unsigned long pfn;
1172 unsigned long end_pfn;
1173 unsigned long page_pfn;
1174 int zone_id;
1175 1072
1176 page = lru_to_page(src); 1073 page = lru_to_page(src);
1177 prefetchw_prev_lru_page(page, src, flags); 1074 prefetchw_prev_lru_page(page, src, flags);
@@ -1193,84 +1090,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1193 default: 1090 default:
1194 BUG(); 1091 BUG();
1195 } 1092 }
1196
1197 if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
1198 continue;
1199
1200 /*
1201 * Attempt to take all pages in the order aligned region
1202 * surrounding the tag page. Only take those pages of
1203 * the same active state as that tag page. We may safely
1204 * round the target page pfn down to the requested order
1205 * as the mem_map is guaranteed valid out to MAX_ORDER,
1206 * where that page is in a different zone we will detect
1207 * it from its zone id and abort this block scan.
1208 */
1209 zone_id = page_zone_id(page);
1210 page_pfn = page_to_pfn(page);
1211 pfn = page_pfn & ~((1 << sc->order) - 1);
1212 end_pfn = pfn + (1 << sc->order);
1213 for (; pfn < end_pfn; pfn++) {
1214 struct page *cursor_page;
1215
1216 /* The target page is in the block, ignore it. */
1217 if (unlikely(pfn == page_pfn))
1218 continue;
1219
1220 /* Avoid holes within the zone. */
1221 if (unlikely(!pfn_valid_within(pfn)))
1222 break;
1223
1224 cursor_page = pfn_to_page(pfn);
1225
1226 /* Check that we have not crossed a zone boundary. */
1227 if (unlikely(page_zone_id(cursor_page) != zone_id))
1228 break;
1229
1230 /*
1231 * If we don't have enough swap space, reclaiming of
1232 * anon page which don't already have a swap slot is
1233 * pointless.
1234 */
1235 if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
1236 !PageSwapCache(cursor_page))
1237 break;
1238
1239 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1240 unsigned int isolated_pages;
1241
1242 mem_cgroup_lru_del(cursor_page);
1243 list_move(&cursor_page->lru, dst);
1244 isolated_pages = hpage_nr_pages(cursor_page);
1245 nr_taken += isolated_pages;
1246 nr_lumpy_taken += isolated_pages;
1247 if (PageDirty(cursor_page))
1248 nr_lumpy_dirty += isolated_pages;
1249 scan++;
1250 pfn += isolated_pages - 1;
1251 } else {
1252 /*
1253 * Check if the page is freed already.
1254 *
1255 * We can't use page_count() as that
1256 * requires compound_head and we don't
1257 * have a pin on the page here. If a
1258 * page is tail, we may or may not
1259 * have isolated the head, so assume
1260 * it's not free, it'd be tricky to
1261 * track the head status without a
1262 * page pin.
1263 */
1264 if (!PageTail(cursor_page) &&
1265 !atomic_read(&cursor_page->_count))
1266 continue;
1267 break;
1268 }
1269 }
1270
1271 /* If we break out of the loop above, lumpy reclaim failed */
1272 if (pfn < end_pfn)
1273 nr_lumpy_failed++;
1274 } 1093 }
1275 1094
1276 *nr_scanned = scan; 1095 *nr_scanned = scan;
@@ -1278,7 +1097,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1278 trace_mm_vmscan_lru_isolate(sc->order, 1097 trace_mm_vmscan_lru_isolate(sc->order,
1279 nr_to_scan, scan, 1098 nr_to_scan, scan,
1280 nr_taken, 1099 nr_taken,
1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1282 mode, file); 1100 mode, file);
1283 return nr_taken; 1101 return nr_taken;
1284} 1102}
@@ -1454,47 +1272,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz,
1454} 1272}
1455 1273
1456/* 1274/*
1457 * Returns true if a direct reclaim should wait on pages under writeback.
1458 *
1459 * If we are direct reclaiming for contiguous pages and we do not reclaim
1460 * everything in the list, try again and wait for writeback IO to complete.
1461 * This will stall high-order allocations noticeably. Only do that when really
1462 * need to free the pages under high memory pressure.
1463 */
1464static inline bool should_reclaim_stall(unsigned long nr_taken,
1465 unsigned long nr_freed,
1466 int priority,
1467 struct scan_control *sc)
1468{
1469 int lumpy_stall_priority;
1470
1471 /* kswapd should not stall on sync IO */
1472 if (current_is_kswapd())
1473 return false;
1474
1475 /* Only stall on lumpy reclaim */
1476 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1477 return false;
1478
1479 /* If we have reclaimed everything on the isolated list, no stall */
1480 if (nr_freed == nr_taken)
1481 return false;
1482
1483 /*
1484 * For high-order allocations, there are two stall thresholds.
1485 * High-cost allocations stall immediately where as lower
1486 * order allocations such as stacks require the scanning
1487 * priority to be much higher before stalling.
1488 */
1489 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1490 lumpy_stall_priority = DEF_PRIORITY;
1491 else
1492 lumpy_stall_priority = DEF_PRIORITY / 3;
1493
1494 return priority <= lumpy_stall_priority;
1495}
1496
1497/*
1498 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1275 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1499 * of reclaimed pages 1276 * of reclaimed pages
1500 */ 1277 */
@@ -1522,10 +1299,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1522 return SWAP_CLUSTER_MAX; 1299 return SWAP_CLUSTER_MAX;
1523 } 1300 }
1524 1301
1525 set_reclaim_mode(priority, sc, false);
1526 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1527 isolate_mode |= ISOLATE_ACTIVE;
1528
1529 lru_add_drain(); 1302 lru_add_drain();
1530 1303
1531 if (!sc->may_unmap) 1304 if (!sc->may_unmap)
@@ -1556,13 +1329,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1556 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, 1329 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1557 &nr_dirty, &nr_writeback); 1330 &nr_dirty, &nr_writeback);
1558 1331
1559 /* Check if we should syncronously wait for writeback */
1560 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1561 set_reclaim_mode(priority, sc, true);
1562 nr_reclaimed += shrink_page_list(&page_list, mz, sc,
1563 priority, &nr_dirty, &nr_writeback);
1564 }
1565
1566 spin_lock_irq(&zone->lru_lock); 1332 spin_lock_irq(&zone->lru_lock);
1567 1333
1568 reclaim_stat->recent_scanned[0] += nr_anon; 1334 reclaim_stat->recent_scanned[0] += nr_anon;
@@ -1616,7 +1382,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1616 zone_idx(zone), 1382 zone_idx(zone),
1617 nr_scanned, nr_reclaimed, 1383 nr_scanned, nr_reclaimed,
1618 priority, 1384 priority,
1619 trace_shrink_flags(file, sc->reclaim_mode)); 1385 trace_shrink_flags(file));
1620 return nr_reclaimed; 1386 return nr_reclaimed;
1621} 1387}
1622 1388
@@ -1695,8 +1461,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
1695 1461
1696 lru_add_drain(); 1462 lru_add_drain();
1697 1463
1698 reset_reclaim_mode(sc);
1699
1700 if (!sc->may_unmap) 1464 if (!sc->may_unmap)
1701 isolate_mode |= ISOLATE_UNMAPPED; 1465 isolate_mode |= ISOLATE_UNMAPPED;
1702 if (!sc->may_writepage) 1466 if (!sc->may_writepage)
@@ -1737,7 +1501,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
1737 } 1501 }
1738 } 1502 }
1739 1503
1740 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { 1504 if (page_referenced(page, 0, sc->target_mem_cgroup,
1505 &vm_flags)) {
1741 nr_rotated += hpage_nr_pages(page); 1506 nr_rotated += hpage_nr_pages(page);
1742 /* 1507 /*
1743 * Identify referenced, file-backed active pages and 1508 * Identify referenced, file-backed active pages and
@@ -1811,7 +1576,7 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
1811 if (!total_swap_pages) 1576 if (!total_swap_pages)
1812 return 0; 1577 return 0;
1813 1578
1814 if (!scanning_global_lru(mz)) 1579 if (!mem_cgroup_disabled())
1815 return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, 1580 return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup,
1816 mz->zone); 1581 mz->zone);
1817 1582
@@ -1850,7 +1615,7 @@ static int inactive_file_is_low_global(struct zone *zone)
1850 */ 1615 */
1851static int inactive_file_is_low(struct mem_cgroup_zone *mz) 1616static int inactive_file_is_low(struct mem_cgroup_zone *mz)
1852{ 1617{
1853 if (!scanning_global_lru(mz)) 1618 if (!mem_cgroup_disabled())
1854 return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, 1619 return mem_cgroup_inactive_file_is_low(mz->mem_cgroup,
1855 mz->zone); 1620 mz->zone);
1856 1621
@@ -1984,10 +1749,10 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1984 * proportional to the fraction of recently scanned pages on 1749 * proportional to the fraction of recently scanned pages on
1985 * each list that were recently referenced and in active use. 1750 * each list that were recently referenced and in active use.
1986 */ 1751 */
1987 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); 1752 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
1988 ap /= reclaim_stat->recent_rotated[0] + 1; 1753 ap /= reclaim_stat->recent_rotated[0] + 1;
1989 1754
1990 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1755 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
1991 fp /= reclaim_stat->recent_rotated[1] + 1; 1756 fp /= reclaim_stat->recent_rotated[1] + 1;
1992 spin_unlock_irq(&mz->zone->lru_lock); 1757 spin_unlock_irq(&mz->zone->lru_lock);
1993 1758
@@ -2000,7 +1765,7 @@ out:
2000 unsigned long scan; 1765 unsigned long scan;
2001 1766
2002 scan = zone_nr_lru_pages(mz, lru); 1767 scan = zone_nr_lru_pages(mz, lru);
2003 if (priority || noswap) { 1768 if (priority || noswap || !vmscan_swappiness(mz, sc)) {
2004 scan >>= priority; 1769 scan >>= priority;
2005 if (!scan && force_scan) 1770 if (!scan && force_scan)
2006 scan = SWAP_CLUSTER_MAX; 1771 scan = SWAP_CLUSTER_MAX;
@@ -2010,23 +1775,35 @@ out:
2010 } 1775 }
2011} 1776}
2012 1777
1778/* Use reclaim/compaction for costly allocs or under memory pressure */
1779static bool in_reclaim_compaction(int priority, struct scan_control *sc)
1780{
1781 if (COMPACTION_BUILD && sc->order &&
1782 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
1783 priority < DEF_PRIORITY - 2))
1784 return true;
1785
1786 return false;
1787}
1788
2013/* 1789/*
2014 * Reclaim/compaction depends on a number of pages being freed. To avoid 1790 * Reclaim/compaction is used for high-order allocation requests. It reclaims
2015 * disruption to the system, a small number of order-0 pages continue to be 1791 * order-0 pages before compacting the zone. should_continue_reclaim() returns
2016 * rotated and reclaimed in the normal fashion. However, by the time we get 1792 * true if more pages should be reclaimed such that when the page allocator
2017 * back to the allocator and call try_to_compact_zone(), we ensure that 1793 * calls try_to_compact_zone() it will have enough free pages to succeed.
2018 * there are enough free pages for it to be likely successful 1794 * It will give up earlier than that if there is difficulty reclaiming pages.
2019 */ 1795 */
2020static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, 1796static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2021 unsigned long nr_reclaimed, 1797 unsigned long nr_reclaimed,
2022 unsigned long nr_scanned, 1798 unsigned long nr_scanned,
1799 int priority,
2023 struct scan_control *sc) 1800 struct scan_control *sc)
2024{ 1801{
2025 unsigned long pages_for_compaction; 1802 unsigned long pages_for_compaction;
2026 unsigned long inactive_lru_pages; 1803 unsigned long inactive_lru_pages;
2027 1804
2028 /* If not in reclaim/compaction mode, stop */ 1805 /* If not in reclaim/compaction mode, stop */
2029 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) 1806 if (!in_reclaim_compaction(priority, sc))
2030 return false; 1807 return false;
2031 1808
2032 /* Consider stopping depending on scan and reclaim activity */ 1809 /* Consider stopping depending on scan and reclaim activity */
@@ -2128,7 +1905,8 @@ restart:
2128 1905
2129 /* reclaim/compaction might need reclaim to continue */ 1906 /* reclaim/compaction might need reclaim to continue */
2130 if (should_continue_reclaim(mz, nr_reclaimed, 1907 if (should_continue_reclaim(mz, nr_reclaimed,
2131 sc->nr_scanned - nr_scanned, sc)) 1908 sc->nr_scanned - nr_scanned,
1909 priority, sc))
2132 goto restart; 1910 goto restart;
2133 1911
2134 throttle_vm_writeout(sc->gfp_mask); 1912 throttle_vm_writeout(sc->gfp_mask);
@@ -2353,8 +2131,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2353 2131
2354 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2132 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2355 sc->nr_scanned = 0; 2133 sc->nr_scanned = 0;
2356 if (!priority)
2357 disable_swap_token(sc->target_mem_cgroup);
2358 aborted_reclaim = shrink_zones(priority, zonelist, sc); 2134 aborted_reclaim = shrink_zones(priority, zonelist, sc);
2359 2135
2360 /* 2136 /*
@@ -2705,10 +2481,6 @@ loop_again:
2705 unsigned long lru_pages = 0; 2481 unsigned long lru_pages = 0;
2706 int has_under_min_watermark_zone = 0; 2482 int has_under_min_watermark_zone = 0;
2707 2483
2708 /* The swap token gets in the way of swapout... */
2709 if (!priority)
2710 disable_swap_token(NULL);
2711
2712 all_zones_ok = 1; 2484 all_zones_ok = 1;
2713 balanced = 0; 2485 balanced = 0;
2714 2486
@@ -3537,7 +3309,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
3537 if (mapping_unevictable(page_mapping(page))) 3309 if (mapping_unevictable(page_mapping(page)))
3538 return 0; 3310 return 0;
3539 3311
3540 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) 3312 if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
3541 return 0; 3313 return 0;
3542 3314
3543 return 1; 3315 return 1;
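Note: one subtle vmscan.c change above is in get_scan_count(): dropping the "+ 1" from anon_prio and file_prio, together with the new !vmscan_swappiness() test, lets a swappiness of 0 push the anon scan target all the way to zero. A standalone sketch of the proportional split, using made-up reclaim-stat numbers, is below.

#include <stdio.h>

int main(void)
{
	unsigned long swappiness = 60;			/* vm.swappiness */
	unsigned long anon_prio = swappiness;
	unsigned long file_prio = 200 - swappiness;
	unsigned long recent_scanned[2] = { 1000, 4000 };	/* [0]=anon [1]=file */
	unsigned long recent_rotated[2] = { 800, 500 };

	/* As in the patched code: no "+ 1" on the priorities themselves. */
	unsigned long ap = anon_prio * (recent_scanned[0] + 1) /
			   (recent_rotated[0] + 1);
	unsigned long fp = file_prio * (recent_scanned[1] + 1) /
			   (recent_rotated[1] + 1);

	/* Each LRU's scan target is scaled by its share of ap + fp + 1. */
	printf("anon share: %lu%%\n", 100 * ap / (ap + fp + 1));
	printf("file share: %lu%%\n", 100 * fp / (ap + fp + 1));
	return 0;
}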
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0dad31dc1618..1bbbbd9776ad 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1223,7 +1223,6 @@ module_init(setup_vmstat)
1223#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) 1223#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
1224#include <linux/debugfs.h> 1224#include <linux/debugfs.h>
1225 1225
1226static struct dentry *extfrag_debug_root;
1227 1226
1228/* 1227/*
1229 * Return an index indicating how much of the available free memory is 1228 * Return an index indicating how much of the available free memory is
@@ -1361,19 +1360,24 @@ static const struct file_operations extfrag_file_ops = {
1361 1360
1362static int __init extfrag_debug_init(void) 1361static int __init extfrag_debug_init(void)
1363{ 1362{
1363 struct dentry *extfrag_debug_root;
1364
1364 extfrag_debug_root = debugfs_create_dir("extfrag", NULL); 1365 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
1365 if (!extfrag_debug_root) 1366 if (!extfrag_debug_root)
1366 return -ENOMEM; 1367 return -ENOMEM;
1367 1368
1368 if (!debugfs_create_file("unusable_index", 0444, 1369 if (!debugfs_create_file("unusable_index", 0444,
1369 extfrag_debug_root, NULL, &unusable_file_ops)) 1370 extfrag_debug_root, NULL, &unusable_file_ops))
1370 return -ENOMEM; 1371 goto fail;
1371 1372
1372 if (!debugfs_create_file("extfrag_index", 0444, 1373 if (!debugfs_create_file("extfrag_index", 0444,
1373 extfrag_debug_root, NULL, &extfrag_file_ops)) 1374 extfrag_debug_root, NULL, &extfrag_file_ops))
1374 return -ENOMEM; 1375 goto fail;
1375 1376
1376 return 0; 1377 return 0;
1378fail:
1379 debugfs_remove_recursive(extfrag_debug_root);
1380 return -ENOMEM;
1377} 1381}
1378 1382
1379module_init(extfrag_debug_init); 1383module_init(extfrag_debug_init);
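Note: the extfrag_debug_init() hunk switches to a create-or-unwind pattern: the root dentry becomes a local and debugfs_remove_recursive() tears down whatever was created before a failure, instead of leaking a half-populated directory. The same pattern for a hypothetical module (names invented for illustration):

#include <linux/debugfs.h>
#include <linux/module.h>

static const struct file_operations demo_fops;	/* assume filled in elsewhere */

static int __init demo_debugfs_init(void)
{
	struct dentry *root = debugfs_create_dir("demo", NULL);

	if (!root)
		return -ENOMEM;

	if (!debugfs_create_file("stats", 0444, root, NULL, &demo_fops))
		goto fail;

	return 0;
fail:
	/* Remove the directory and anything created before the failure. */
	debugfs_remove_recursive(root);
	return -ENOMEM;
}
module_init(demo_debugfs_init);
MODULE_LICENSE("GPL");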