author      Jiri Kosina <jkosina@suse.cz>   2012-06-29 08:45:58 -0400
committer   Jiri Kosina <jkosina@suse.cz>   2012-06-29 08:45:58 -0400
commit      59f91e5dd0504dc0ebfaa0b6f3a55e6931f96266 (patch)
tree        b913718405d44a921905ac71044fbde410256865 /mm
parent      57bdfdd80077addf518a9b90c4a66890efc4f70e (diff)
parent      89abfab133ef1f5902abafb744df72793213ac19 (diff)
Merge branch 'master' into for-next
Conflicts:
include/linux/mmzone.h
Synced with Linus' tree so that trivial patch can be applied
on top of up-to-date code properly.
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig             10
-rw-r--r--   mm/Makefile             9
-rw-r--r--   mm/bootmem.c          134
-rw-r--r--   mm/compaction.c       142
-rw-r--r--   mm/filemap.c           39
-rw-r--r--   mm/huge_memory.c       21
-rw-r--r--   mm/hugetlb.c           32
-rw-r--r--   mm/internal.h          14
-rw-r--r--   mm/madvise.c           15
-rw-r--r--   mm/memblock.c          42
-rw-r--r--   mm/memcontrol.c       127
-rw-r--r--   mm/memory-failure.c     8
-rw-r--r--   mm/memory.c            20
-rw-r--r--   mm/memory_hotplug.c    14
-rw-r--r--   mm/mempolicy.c         36
-rw-r--r--   mm/mmap.c              53
-rw-r--r--   mm/nobootmem.c        112
-rw-r--r--   mm/oom_kill.c          44
-rw-r--r--   mm/page_alloc.c        78
-rw-r--r--   mm/readahead.c         40
-rw-r--r--   mm/rmap.c               6
-rw-r--r--   mm/shmem.c            513
-rw-r--r--   mm/sparse.c            25
-rw-r--r--   mm/swap.c              51
-rw-r--r--   mm/swapfile.c          33
-rw-r--r--   mm/thrash.c           155
-rw-r--r--   mm/truncate.c          25
-rw-r--r--   mm/vmalloc.c            7
-rw-r--r--   mm/vmscan.c           306
-rw-r--r--   mm/vmstat.c            10
30 files changed, 1137 insertions, 984 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 39220026c797..b2176374b98e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -349,6 +349,16 @@ choice | |||
349 | benefit. | 349 | benefit. |
350 | endchoice | 350 | endchoice |
351 | 351 | ||
352 | config CROSS_MEMORY_ATTACH | ||
353 | bool "Cross Memory Support" | ||
354 | depends on MMU | ||
355 | default y | ||
356 | help | ||
357 | Enabling this option adds the system calls process_vm_readv and | ||
358 | process_vm_writev which allow a process with the correct privileges | ||
359 | to directly read from or write to another process's address space. | ||
360 | See the man page for more details. | ||
361 | |||
352 | # | 362 | # |
353 | # UP and nommu archs use km based percpu allocator | 363 | # UP and nommu archs use km based percpu allocator |
354 | # | 364 | # |
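The help text above documents the interface that the new CROSS_MEMORY_ATTACH option gates: process_vm_readv() and process_vm_writev(), exposed by glibc 2.15+ through <sys/uio.h>, let a caller with ptrace-level permission on the target copy data directly between address spaces. As a rough userspace sketch (not part of this patch; the target pid and remote address are placeholders taken from argv), reading another process's memory looks like:

	#define _GNU_SOURCE
	#include <sys/types.h>
	#include <sys/uio.h>	/* process_vm_readv(), glibc >= 2.15 */
	#include <stdio.h>
	#include <stdlib.h>

	int main(int argc, char **argv)
	{
		char buf[256];

		if (argc < 3) {
			fprintf(stderr, "usage: %s <pid> <remote-hex-addr>\n", argv[0]);
			return 1;
		}

		/* Placeholder target: in practice the pid and address would come
		 * from a debugger, /proc/<pid>/maps, or similar. */
		pid_t pid = (pid_t)atoi(argv[1]);
		struct iovec local  = { .iov_base = buf, .iov_len = sizeof(buf) };
		struct iovec remote = { .iov_base = (void *)strtoull(argv[2], NULL, 16),
					.iov_len  = sizeof(buf) };

		/* One local iovec, one remote iovec; flags must be 0. */
		ssize_t n = process_vm_readv(pid, &local, 1, &remote, 1, 0);
		if (n < 0) {
			perror("process_vm_readv");
			return 1;
		}
		printf("copied %zd bytes from pid %d\n", n, (int)pid);
		return 0;
	}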
diff --git a/mm/Makefile b/mm/Makefile
index 8aada89efbbb..a156285ce88d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,8 +5,11 @@ | |||
5 | mmu-y := nommu.o | 5 | mmu-y := nommu.o |
6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | 6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ |
7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
8 | vmalloc.o pagewalk.o pgtable-generic.o \ | 8 | vmalloc.o pagewalk.o pgtable-generic.o |
9 | process_vm_access.o | 9 | |
10 | ifdef CONFIG_CROSS_MEMORY_ATTACH | ||
11 | mmu-$(CONFIG_MMU) += process_vm_access.o | ||
12 | endif | ||
10 | 13 | ||
11 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | 14 | obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ |
12 | maccess.o page_alloc.o page-writeback.o \ | 15 | maccess.o page_alloc.o page-writeback.o \ |
@@ -25,7 +28,7 @@ endif | |||
25 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | 28 | obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o |
26 | 29 | ||
27 | obj-$(CONFIG_BOUNCE) += bounce.o | 30 | obj-$(CONFIG_BOUNCE) += bounce.o |
28 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 31 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o |
29 | obj-$(CONFIG_HAS_DMA) += dmapool.o | 32 | obj-$(CONFIG_HAS_DMA) += dmapool.o |
30 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 33 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
31 | obj-$(CONFIG_NUMA) += mempolicy.o | 34 | obj-$(CONFIG_NUMA) += mempolicy.o |
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0131170c9d54..ec4fcb7a56c8 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -77,16 +77,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages) | |||
77 | */ | 77 | */ |
78 | static void __init link_bootmem(bootmem_data_t *bdata) | 78 | static void __init link_bootmem(bootmem_data_t *bdata) |
79 | { | 79 | { |
80 | struct list_head *iter; | 80 | bootmem_data_t *ent; |
81 | 81 | ||
82 | list_for_each(iter, &bdata_list) { | 82 | list_for_each_entry(ent, &bdata_list, list) { |
83 | bootmem_data_t *ent; | 83 | if (bdata->node_min_pfn < ent->node_min_pfn) { |
84 | 84 | list_add_tail(&bdata->list, &ent->list); | |
85 | ent = list_entry(iter, bootmem_data_t, list); | 85 | return; |
86 | if (bdata->node_min_pfn < ent->node_min_pfn) | 86 | } |
87 | break; | ||
88 | } | 87 | } |
89 | list_add_tail(&bdata->list, iter); | 88 | |
89 | list_add_tail(&bdata->list, &bdata_list); | ||
90 | } | 90 | } |
91 | 91 | ||
92 | /* | 92 | /* |
@@ -203,7 +203,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
203 | } else { | 203 | } else { |
204 | unsigned long off = 0; | 204 | unsigned long off = 0; |
205 | 205 | ||
206 | while (vec && off < BITS_PER_LONG) { | 206 | vec >>= start & (BITS_PER_LONG - 1); |
207 | while (vec) { | ||
207 | if (vec & 1) { | 208 | if (vec & 1) { |
208 | page = pfn_to_page(start + off); | 209 | page = pfn_to_page(start + off); |
209 | __free_pages_bootmem(page, 0); | 210 | __free_pages_bootmem(page, 0); |
@@ -467,7 +468,7 @@ static unsigned long __init align_off(struct bootmem_data *bdata, | |||
467 | return ALIGN(base + off, align) - base; | 468 | return ALIGN(base + off, align) - base; |
468 | } | 469 | } |
469 | 470 | ||
470 | static void * __init alloc_bootmem_core(struct bootmem_data *bdata, | 471 | static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata, |
471 | unsigned long size, unsigned long align, | 472 | unsigned long size, unsigned long align, |
472 | unsigned long goal, unsigned long limit) | 473 | unsigned long goal, unsigned long limit) |
473 | { | 474 | { |
@@ -588,14 +589,14 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, | |||
588 | p_bdata = bootmem_arch_preferred_node(bdata, size, align, | 589 | p_bdata = bootmem_arch_preferred_node(bdata, size, align, |
589 | goal, limit); | 590 | goal, limit); |
590 | if (p_bdata) | 591 | if (p_bdata) |
591 | return alloc_bootmem_core(p_bdata, size, align, | 592 | return alloc_bootmem_bdata(p_bdata, size, align, |
592 | goal, limit); | 593 | goal, limit); |
593 | } | 594 | } |
594 | #endif | 595 | #endif |
595 | return NULL; | 596 | return NULL; |
596 | } | 597 | } |
597 | 598 | ||
598 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, | 599 | static void * __init alloc_bootmem_core(unsigned long size, |
599 | unsigned long align, | 600 | unsigned long align, |
600 | unsigned long goal, | 601 | unsigned long goal, |
601 | unsigned long limit) | 602 | unsigned long limit) |
@@ -603,7 +604,6 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size, | |||
603 | bootmem_data_t *bdata; | 604 | bootmem_data_t *bdata; |
604 | void *region; | 605 | void *region; |
605 | 606 | ||
606 | restart: | ||
607 | region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); | 607 | region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); |
608 | if (region) | 608 | if (region) |
609 | return region; | 609 | return region; |
@@ -614,11 +614,25 @@ restart: | |||
614 | if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) | 614 | if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) |
615 | break; | 615 | break; |
616 | 616 | ||
617 | region = alloc_bootmem_core(bdata, size, align, goal, limit); | 617 | region = alloc_bootmem_bdata(bdata, size, align, goal, limit); |
618 | if (region) | 618 | if (region) |
619 | return region; | 619 | return region; |
620 | } | 620 | } |
621 | 621 | ||
622 | return NULL; | ||
623 | } | ||
624 | |||
625 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, | ||
626 | unsigned long align, | ||
627 | unsigned long goal, | ||
628 | unsigned long limit) | ||
629 | { | ||
630 | void *ptr; | ||
631 | |||
632 | restart: | ||
633 | ptr = alloc_bootmem_core(size, align, goal, limit); | ||
634 | if (ptr) | ||
635 | return ptr; | ||
622 | if (goal) { | 636 | if (goal) { |
623 | goal = 0; | 637 | goal = 0; |
624 | goto restart; | 638 | goto restart; |
@@ -684,21 +698,56 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
684 | return ___alloc_bootmem(size, align, goal, limit); | 698 | return ___alloc_bootmem(size, align, goal, limit); |
685 | } | 699 | } |
686 | 700 | ||
687 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | 701 | static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, |
688 | unsigned long size, unsigned long align, | 702 | unsigned long size, unsigned long align, |
689 | unsigned long goal, unsigned long limit) | 703 | unsigned long goal, unsigned long limit) |
690 | { | 704 | { |
691 | void *ptr; | 705 | void *ptr; |
692 | 706 | ||
693 | ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); | 707 | again: |
708 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, | ||
709 | align, goal, limit); | ||
694 | if (ptr) | 710 | if (ptr) |
695 | return ptr; | 711 | return ptr; |
696 | 712 | ||
697 | ptr = alloc_bootmem_core(bdata, size, align, goal, limit); | 713 | ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); |
698 | if (ptr) | 714 | if (ptr) |
699 | return ptr; | 715 | return ptr; |
700 | 716 | ||
701 | return ___alloc_bootmem(size, align, goal, limit); | 717 | ptr = alloc_bootmem_core(size, align, goal, limit); |
718 | if (ptr) | ||
719 | return ptr; | ||
720 | |||
721 | if (goal) { | ||
722 | goal = 0; | ||
723 | goto again; | ||
724 | } | ||
725 | |||
726 | return NULL; | ||
727 | } | ||
728 | |||
729 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | ||
730 | unsigned long align, unsigned long goal) | ||
731 | { | ||
732 | if (WARN_ON_ONCE(slab_is_available())) | ||
733 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
734 | |||
735 | return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); | ||
736 | } | ||
737 | |||
738 | void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | ||
739 | unsigned long align, unsigned long goal, | ||
740 | unsigned long limit) | ||
741 | { | ||
742 | void *ptr; | ||
743 | |||
744 | ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); | ||
745 | if (ptr) | ||
746 | return ptr; | ||
747 | |||
748 | printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); | ||
749 | panic("Out of memory"); | ||
750 | return NULL; | ||
702 | } | 751 | } |
703 | 752 | ||
704 | /** | 753 | /** |
@@ -722,7 +771,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | |||
722 | if (WARN_ON_ONCE(slab_is_available())) | 771 | if (WARN_ON_ONCE(slab_is_available())) |
723 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 772 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
724 | 773 | ||
725 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); | 774 | return ___alloc_bootmem_node(pgdat, size, align, goal, 0); |
726 | } | 775 | } |
727 | 776 | ||
728 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | 777 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, |
@@ -743,7 +792,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | |||
743 | unsigned long new_goal; | 792 | unsigned long new_goal; |
744 | 793 | ||
745 | new_goal = MAX_DMA32_PFN << PAGE_SHIFT; | 794 | new_goal = MAX_DMA32_PFN << PAGE_SHIFT; |
746 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, | 795 | ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, |
747 | new_goal, 0); | 796 | new_goal, 0); |
748 | if (ptr) | 797 | if (ptr) |
749 | return ptr; | 798 | return ptr; |
@@ -754,47 +803,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | |||
754 | 803 | ||
755 | } | 804 | } |
756 | 805 | ||
757 | #ifdef CONFIG_SPARSEMEM | ||
758 | /** | ||
759 | * alloc_bootmem_section - allocate boot memory from a specific section | ||
760 | * @size: size of the request in bytes | ||
761 | * @section_nr: sparse map section to allocate from | ||
762 | * | ||
763 | * Return NULL on failure. | ||
764 | */ | ||
765 | void * __init alloc_bootmem_section(unsigned long size, | ||
766 | unsigned long section_nr) | ||
767 | { | ||
768 | bootmem_data_t *bdata; | ||
769 | unsigned long pfn, goal; | ||
770 | |||
771 | pfn = section_nr_to_pfn(section_nr); | ||
772 | goal = pfn << PAGE_SHIFT; | ||
773 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; | ||
774 | |||
775 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0); | ||
776 | } | ||
777 | #endif | ||
778 | |||
779 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | ||
780 | unsigned long align, unsigned long goal) | ||
781 | { | ||
782 | void *ptr; | ||
783 | |||
784 | if (WARN_ON_ONCE(slab_is_available())) | ||
785 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
786 | |||
787 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); | ||
788 | if (ptr) | ||
789 | return ptr; | ||
790 | |||
791 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); | ||
792 | if (ptr) | ||
793 | return ptr; | ||
794 | |||
795 | return __alloc_bootmem_nopanic(size, align, goal); | ||
796 | } | ||
797 | |||
798 | #ifndef ARCH_LOW_ADDRESS_LIMIT | 806 | #ifndef ARCH_LOW_ADDRESS_LIMIT |
799 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL | 807 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL |
800 | #endif | 808 | #endif |
@@ -839,6 +847,6 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | |||
839 | if (WARN_ON_ONCE(slab_is_available())) | 847 | if (WARN_ON_ONCE(slab_is_available())) |
840 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 848 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
841 | 849 | ||
842 | return ___alloc_bootmem_node(pgdat->bdata, size, align, | 850 | return ___alloc_bootmem_node(pgdat, size, align, |
843 | goal, ARCH_LOW_ADDRESS_LIMIT); | 851 | goal, ARCH_LOW_ADDRESS_LIMIT); |
844 | } | 852 | } |
diff --git a/mm/compaction.c b/mm/compaction.c
index da7d35ea5103..840ee288e296 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -235,7 +235,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
235 | */ | 235 | */ |
236 | while (unlikely(too_many_isolated(zone))) { | 236 | while (unlikely(too_many_isolated(zone))) { |
237 | /* async migration should just abort */ | 237 | /* async migration should just abort */ |
238 | if (!cc->sync) | 238 | if (cc->mode != COMPACT_SYNC) |
239 | return 0; | 239 | return 0; |
240 | 240 | ||
241 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 241 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
@@ -303,7 +303,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
303 | * satisfies the allocation | 303 | * satisfies the allocation |
304 | */ | 304 | */ |
305 | pageblock_nr = low_pfn >> pageblock_order; | 305 | pageblock_nr = low_pfn >> pageblock_order; |
306 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | 306 | if (cc->mode != COMPACT_SYNC && |
307 | last_pageblock_nr != pageblock_nr && | ||
307 | !migrate_async_suitable(get_pageblock_migratetype(page))) { | 308 | !migrate_async_suitable(get_pageblock_migratetype(page))) { |
308 | low_pfn += pageblock_nr_pages; | 309 | low_pfn += pageblock_nr_pages; |
309 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | 310 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; |
@@ -324,7 +325,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
324 | continue; | 325 | continue; |
325 | } | 326 | } |
326 | 327 | ||
327 | if (!cc->sync) | 328 | if (cc->mode != COMPACT_SYNC) |
328 | mode |= ISOLATE_ASYNC_MIGRATE; | 329 | mode |= ISOLATE_ASYNC_MIGRATE; |
329 | 330 | ||
330 | /* Try isolate the page */ | 331 | /* Try isolate the page */ |
@@ -357,27 +358,90 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
357 | 358 | ||
358 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ | 359 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ |
359 | #ifdef CONFIG_COMPACTION | 360 | #ifdef CONFIG_COMPACTION |
361 | /* | ||
362 | * Returns true if MIGRATE_UNMOVABLE pageblock was successfully | ||
363 | * converted to MIGRATE_MOVABLE type, false otherwise. | ||
364 | */ | ||
365 | static bool rescue_unmovable_pageblock(struct page *page) | ||
366 | { | ||
367 | unsigned long pfn, start_pfn, end_pfn; | ||
368 | struct page *start_page, *end_page; | ||
369 | |||
370 | pfn = page_to_pfn(page); | ||
371 | start_pfn = pfn & ~(pageblock_nr_pages - 1); | ||
372 | end_pfn = start_pfn + pageblock_nr_pages; | ||
373 | |||
374 | start_page = pfn_to_page(start_pfn); | ||
375 | end_page = pfn_to_page(end_pfn); | ||
376 | |||
377 | /* Do not deal with pageblocks that overlap zones */ | ||
378 | if (page_zone(start_page) != page_zone(end_page)) | ||
379 | return false; | ||
380 | |||
381 | for (page = start_page, pfn = start_pfn; page < end_page; pfn++, | ||
382 | page++) { | ||
383 | if (!pfn_valid_within(pfn)) | ||
384 | continue; | ||
385 | |||
386 | if (PageBuddy(page)) { | ||
387 | int order = page_order(page); | ||
388 | |||
389 | pfn += (1 << order) - 1; | ||
390 | page += (1 << order) - 1; | ||
391 | |||
392 | continue; | ||
393 | } else if (page_count(page) == 0 || PageLRU(page)) | ||
394 | continue; | ||
395 | |||
396 | return false; | ||
397 | } | ||
398 | |||
399 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
400 | move_freepages_block(page_zone(page), page, MIGRATE_MOVABLE); | ||
401 | return true; | ||
402 | } | ||
360 | 403 | ||
361 | /* Returns true if the page is within a block suitable for migration to */ | 404 | enum smt_result { |
362 | static bool suitable_migration_target(struct page *page) | 405 | GOOD_AS_MIGRATION_TARGET, |
406 | FAIL_UNMOVABLE_TARGET, | ||
407 | FAIL_BAD_TARGET, | ||
408 | }; | ||
409 | |||
410 | /* | ||
411 | * Returns GOOD_AS_MIGRATION_TARGET if the page is within a block | ||
412 | * suitable for migration to, FAIL_UNMOVABLE_TARGET if the page | ||
413 | * is within a MIGRATE_UNMOVABLE block, FAIL_BAD_TARGET otherwise. | ||
414 | */ | ||
415 | static enum smt_result suitable_migration_target(struct page *page, | ||
416 | struct compact_control *cc) | ||
363 | { | 417 | { |
364 | 418 | ||
365 | int migratetype = get_pageblock_migratetype(page); | 419 | int migratetype = get_pageblock_migratetype(page); |
366 | 420 | ||
367 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | 421 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ |
368 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | 422 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) |
369 | return false; | 423 | return FAIL_BAD_TARGET; |
370 | 424 | ||
371 | /* If the page is a large free page, then allow migration */ | 425 | /* If the page is a large free page, then allow migration */ |
372 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | 426 | if (PageBuddy(page) && page_order(page) >= pageblock_order) |
373 | return true; | 427 | return GOOD_AS_MIGRATION_TARGET; |
374 | 428 | ||
375 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | 429 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ |
376 | if (migrate_async_suitable(migratetype)) | 430 | if (cc->mode != COMPACT_ASYNC_UNMOVABLE && |
377 | return true; | 431 | migrate_async_suitable(migratetype)) |
432 | return GOOD_AS_MIGRATION_TARGET; | ||
433 | |||
434 | if (cc->mode == COMPACT_ASYNC_MOVABLE && | ||
435 | migratetype == MIGRATE_UNMOVABLE) | ||
436 | return FAIL_UNMOVABLE_TARGET; | ||
437 | |||
438 | if (cc->mode != COMPACT_ASYNC_MOVABLE && | ||
439 | migratetype == MIGRATE_UNMOVABLE && | ||
440 | rescue_unmovable_pageblock(page)) | ||
441 | return GOOD_AS_MIGRATION_TARGET; | ||
378 | 442 | ||
379 | /* Otherwise skip the block */ | 443 | /* Otherwise skip the block */ |
380 | return false; | 444 | return FAIL_BAD_TARGET; |
381 | } | 445 | } |
382 | 446 | ||
383 | /* | 447 | /* |
@@ -411,6 +475,13 @@ static void isolate_freepages(struct zone *zone, | |||
411 | zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 475 | zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; |
412 | 476 | ||
413 | /* | 477 | /* |
478 | * isolate_freepages() may be called more than once during | ||
479 | * compact_zone_order() run and we want only the most recent | ||
480 | * count. | ||
481 | */ | ||
482 | cc->nr_pageblocks_skipped = 0; | ||
483 | |||
484 | /* | ||
414 | * Isolate free pages until enough are available to migrate the | 485 | * Isolate free pages until enough are available to migrate the |
415 | * pages on cc->migratepages. We stop searching if the migrate | 486 | * pages on cc->migratepages. We stop searching if the migrate |
416 | * and free page scanners meet or enough free pages are isolated. | 487 | * and free page scanners meet or enough free pages are isolated. |
@@ -418,6 +489,7 @@ static void isolate_freepages(struct zone *zone, | |||
418 | for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; | 489 | for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; |
419 | pfn -= pageblock_nr_pages) { | 490 | pfn -= pageblock_nr_pages) { |
420 | unsigned long isolated; | 491 | unsigned long isolated; |
492 | enum smt_result ret; | ||
421 | 493 | ||
422 | if (!pfn_valid(pfn)) | 494 | if (!pfn_valid(pfn)) |
423 | continue; | 495 | continue; |
@@ -434,9 +506,12 @@ static void isolate_freepages(struct zone *zone, | |||
434 | continue; | 506 | continue; |
435 | 507 | ||
436 | /* Check the block is suitable for migration */ | 508 | /* Check the block is suitable for migration */ |
437 | if (!suitable_migration_target(page)) | 509 | ret = suitable_migration_target(page, cc); |
510 | if (ret != GOOD_AS_MIGRATION_TARGET) { | ||
511 | if (ret == FAIL_UNMOVABLE_TARGET) | ||
512 | cc->nr_pageblocks_skipped++; | ||
438 | continue; | 513 | continue; |
439 | 514 | } | |
440 | /* | 515 | /* |
441 | * Found a block suitable for isolating free pages from. Now | 516 | * Found a block suitable for isolating free pages from. Now |
442 | * we disabled interrupts, double check things are ok and | 517 | * we disabled interrupts, double check things are ok and |
@@ -445,12 +520,14 @@ static void isolate_freepages(struct zone *zone, | |||
445 | */ | 520 | */ |
446 | isolated = 0; | 521 | isolated = 0; |
447 | spin_lock_irqsave(&zone->lock, flags); | 522 | spin_lock_irqsave(&zone->lock, flags); |
448 | if (suitable_migration_target(page)) { | 523 | ret = suitable_migration_target(page, cc); |
524 | if (ret == GOOD_AS_MIGRATION_TARGET) { | ||
449 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); | 525 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); |
450 | isolated = isolate_freepages_block(pfn, end_pfn, | 526 | isolated = isolate_freepages_block(pfn, end_pfn, |
451 | freelist, false); | 527 | freelist, false); |
452 | nr_freepages += isolated; | 528 | nr_freepages += isolated; |
453 | } | 529 | } else if (ret == FAIL_UNMOVABLE_TARGET) |
530 | cc->nr_pageblocks_skipped++; | ||
454 | spin_unlock_irqrestore(&zone->lock, flags); | 531 | spin_unlock_irqrestore(&zone->lock, flags); |
455 | 532 | ||
456 | /* | 533 | /* |
@@ -682,8 +759,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
682 | 759 | ||
683 | nr_migrate = cc->nr_migratepages; | 760 | nr_migrate = cc->nr_migratepages; |
684 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 761 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
685 | (unsigned long)cc, false, | 762 | (unsigned long)&cc->freepages, false, |
686 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); | 763 | (cc->mode == COMPACT_SYNC) ? MIGRATE_SYNC_LIGHT |
764 | : MIGRATE_ASYNC); | ||
687 | update_nr_listpages(cc); | 765 | update_nr_listpages(cc); |
688 | nr_remaining = cc->nr_migratepages; | 766 | nr_remaining = cc->nr_migratepages; |
689 | 767 | ||
@@ -712,7 +790,8 @@ out: | |||
712 | 790 | ||
713 | static unsigned long compact_zone_order(struct zone *zone, | 791 | static unsigned long compact_zone_order(struct zone *zone, |
714 | int order, gfp_t gfp_mask, | 792 | int order, gfp_t gfp_mask, |
715 | bool sync) | 793 | enum compact_mode mode, |
794 | unsigned long *nr_pageblocks_skipped) | ||
716 | { | 795 | { |
717 | struct compact_control cc = { | 796 | struct compact_control cc = { |
718 | .nr_freepages = 0, | 797 | .nr_freepages = 0, |
@@ -720,12 +799,17 @@ static unsigned long compact_zone_order(struct zone *zone, | |||
720 | .order = order, | 799 | .order = order, |
721 | .migratetype = allocflags_to_migratetype(gfp_mask), | 800 | .migratetype = allocflags_to_migratetype(gfp_mask), |
722 | .zone = zone, | 801 | .zone = zone, |
723 | .sync = sync, | 802 | .mode = mode, |
724 | }; | 803 | }; |
804 | unsigned long rc; | ||
805 | |||
725 | INIT_LIST_HEAD(&cc.freepages); | 806 | INIT_LIST_HEAD(&cc.freepages); |
726 | INIT_LIST_HEAD(&cc.migratepages); | 807 | INIT_LIST_HEAD(&cc.migratepages); |
727 | 808 | ||
728 | return compact_zone(zone, &cc); | 809 | rc = compact_zone(zone, &cc); |
810 | *nr_pageblocks_skipped = cc.nr_pageblocks_skipped; | ||
811 | |||
812 | return rc; | ||
729 | } | 813 | } |
730 | 814 | ||
731 | int sysctl_extfrag_threshold = 500; | 815 | int sysctl_extfrag_threshold = 500; |
@@ -750,6 +834,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
750 | struct zoneref *z; | 834 | struct zoneref *z; |
751 | struct zone *zone; | 835 | struct zone *zone; |
752 | int rc = COMPACT_SKIPPED; | 836 | int rc = COMPACT_SKIPPED; |
837 | unsigned long nr_pageblocks_skipped; | ||
838 | enum compact_mode mode; | ||
753 | 839 | ||
754 | /* | 840 | /* |
755 | * Check whether it is worth even starting compaction. The order check is | 841 | * Check whether it is worth even starting compaction. The order check is |
@@ -766,12 +852,22 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, | |||
766 | nodemask) { | 852 | nodemask) { |
767 | int status; | 853 | int status; |
768 | 854 | ||
769 | status = compact_zone_order(zone, order, gfp_mask, sync); | 855 | mode = sync ? COMPACT_SYNC : COMPACT_ASYNC_MOVABLE; |
856 | retry: | ||
857 | status = compact_zone_order(zone, order, gfp_mask, mode, | ||
858 | &nr_pageblocks_skipped); | ||
770 | rc = max(status, rc); | 859 | rc = max(status, rc); |
771 | 860 | ||
772 | /* If a normal allocation would succeed, stop compacting */ | 861 | /* If a normal allocation would succeed, stop compacting */ |
773 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | 862 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) |
774 | break; | 863 | break; |
864 | |||
865 | if (rc == COMPACT_COMPLETE && mode == COMPACT_ASYNC_MOVABLE) { | ||
866 | if (nr_pageblocks_skipped) { | ||
867 | mode = COMPACT_ASYNC_UNMOVABLE; | ||
868 | goto retry; | ||
869 | } | ||
870 | } | ||
775 | } | 871 | } |
776 | 872 | ||
777 | return rc; | 873 | return rc; |
@@ -805,7 +901,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
805 | if (ok && cc->order > zone->compact_order_failed) | 901 | if (ok && cc->order > zone->compact_order_failed) |
806 | zone->compact_order_failed = cc->order + 1; | 902 | zone->compact_order_failed = cc->order + 1; |
807 | /* Currently async compaction is never deferred. */ | 903 | /* Currently async compaction is never deferred. */ |
808 | else if (!ok && cc->sync) | 904 | else if (!ok && cc->mode == COMPACT_SYNC) |
809 | defer_compaction(zone, cc->order); | 905 | defer_compaction(zone, cc->order); |
810 | } | 906 | } |
811 | 907 | ||
@@ -820,7 +916,7 @@ int compact_pgdat(pg_data_t *pgdat, int order) | |||
820 | { | 916 | { |
821 | struct compact_control cc = { | 917 | struct compact_control cc = { |
822 | .order = order, | 918 | .order = order, |
823 | .sync = false, | 919 | .mode = COMPACT_ASYNC_MOVABLE, |
824 | }; | 920 | }; |
825 | 921 | ||
826 | return __compact_pgdat(pgdat, &cc); | 922 | return __compact_pgdat(pgdat, &cc); |
@@ -830,7 +926,7 @@ static int compact_node(int nid) | |||
830 | { | 926 | { |
831 | struct compact_control cc = { | 927 | struct compact_control cc = { |
832 | .order = -1, | 928 | .order = -1, |
833 | .sync = true, | 929 | .mode = COMPACT_SYNC, |
834 | }; | 930 | }; |
835 | 931 | ||
836 | return __compact_pgdat(NODE_DATA(nid), &cc); | 932 | return __compact_pgdat(NODE_DATA(nid), &cc); |
diff --git a/mm/filemap.c b/mm/filemap.c
index 79c4b2b0b14e..64b48f934b89 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,7 +29,6 @@ | |||
29 | #include <linux/pagevec.h> | 29 | #include <linux/pagevec.h> |
30 | #include <linux/blkdev.h> | 30 | #include <linux/blkdev.h> |
31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
32 | #include <linux/syscalls.h> | ||
33 | #include <linux/cpuset.h> | 32 | #include <linux/cpuset.h> |
34 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ | 33 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
35 | #include <linux/memcontrol.h> | 34 | #include <linux/memcontrol.h> |
@@ -1478,44 +1477,6 @@ out: | |||
1478 | } | 1477 | } |
1479 | EXPORT_SYMBOL(generic_file_aio_read); | 1478 | EXPORT_SYMBOL(generic_file_aio_read); |
1480 | 1479 | ||
1481 | static ssize_t | ||
1482 | do_readahead(struct address_space *mapping, struct file *filp, | ||
1483 | pgoff_t index, unsigned long nr) | ||
1484 | { | ||
1485 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | ||
1486 | return -EINVAL; | ||
1487 | |||
1488 | force_page_cache_readahead(mapping, filp, index, nr); | ||
1489 | return 0; | ||
1490 | } | ||
1491 | |||
1492 | SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) | ||
1493 | { | ||
1494 | ssize_t ret; | ||
1495 | struct file *file; | ||
1496 | |||
1497 | ret = -EBADF; | ||
1498 | file = fget(fd); | ||
1499 | if (file) { | ||
1500 | if (file->f_mode & FMODE_READ) { | ||
1501 | struct address_space *mapping = file->f_mapping; | ||
1502 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; | ||
1503 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; | ||
1504 | unsigned long len = end - start + 1; | ||
1505 | ret = do_readahead(mapping, file, start, len); | ||
1506 | } | ||
1507 | fput(file); | ||
1508 | } | ||
1509 | return ret; | ||
1510 | } | ||
1511 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS | ||
1512 | asmlinkage long SyS_readahead(long fd, loff_t offset, long count) | ||
1513 | { | ||
1514 | return SYSC_readahead((int) fd, offset, (size_t) count); | ||
1515 | } | ||
1516 | SYSCALL_ALIAS(sys_readahead, SyS_readahead); | ||
1517 | #endif | ||
1518 | |||
1519 | #ifdef CONFIG_MMU | 1480 | #ifdef CONFIG_MMU |
1520 | /** | 1481 | /** |
1521 | * page_cache_read - adds requested page to the page cache if not already there | 1482 | * page_cache_read - adds requested page to the page cache if not already there |
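The block removed from mm/filemap.c above is the readahead(2) syscall implementation; judging from the diffstat it is being re-homed in mm/readahead.c rather than dropped, so the userspace-visible behaviour should be unchanged. For reference, the syscall is reachable through the glibc wrapper declared in <fcntl.h> under _GNU_SOURCE; a minimal use (the file path is hypothetical) looks like:

	#define _GNU_SOURCE
	#include <fcntl.h>	/* readahead(), open() */
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* Hypothetical file path, used only for illustration. */
		int fd = open("/var/log/syslog", O_RDONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Ask the kernel to start populating the page cache with the
		 * first 1 MiB of the file; returns 0 on success. */
		if (readahead(fd, 0, 1 << 20) != 0)
			perror("readahead");
		close(fd);
		return 0;
	}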
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f0e5306eeb55..d0def42c121b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -636,16 +636,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
636 | unsigned long haddr, pmd_t *pmd, | 636 | unsigned long haddr, pmd_t *pmd, |
637 | struct page *page) | 637 | struct page *page) |
638 | { | 638 | { |
639 | int ret = 0; | ||
640 | pgtable_t pgtable; | 639 | pgtable_t pgtable; |
641 | 640 | ||
642 | VM_BUG_ON(!PageCompound(page)); | 641 | VM_BUG_ON(!PageCompound(page)); |
643 | pgtable = pte_alloc_one(mm, haddr); | 642 | pgtable = pte_alloc_one(mm, haddr); |
644 | if (unlikely(!pgtable)) { | 643 | if (unlikely(!pgtable)) |
645 | mem_cgroup_uncharge_page(page); | ||
646 | put_page(page); | ||
647 | return VM_FAULT_OOM; | 644 | return VM_FAULT_OOM; |
648 | } | ||
649 | 645 | ||
650 | clear_huge_page(page, haddr, HPAGE_PMD_NR); | 646 | clear_huge_page(page, haddr, HPAGE_PMD_NR); |
651 | __SetPageUptodate(page); | 647 | __SetPageUptodate(page); |
@@ -675,7 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
675 | spin_unlock(&mm->page_table_lock); | 671 | spin_unlock(&mm->page_table_lock); |
676 | } | 672 | } |
677 | 673 | ||
678 | return ret; | 674 | return 0; |
679 | } | 675 | } |
680 | 676 | ||
681 | static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) | 677 | static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) |
@@ -724,8 +720,14 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
724 | put_page(page); | 720 | put_page(page); |
725 | goto out; | 721 | goto out; |
726 | } | 722 | } |
723 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, | ||
724 | page))) { | ||
725 | mem_cgroup_uncharge_page(page); | ||
726 | put_page(page); | ||
727 | goto out; | ||
728 | } | ||
727 | 729 | ||
728 | return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); | 730 | return 0; |
729 | } | 731 | } |
730 | out: | 732 | out: |
731 | /* | 733 | /* |
@@ -950,6 +952,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
950 | count_vm_event(THP_FAULT_FALLBACK); | 952 | count_vm_event(THP_FAULT_FALLBACK); |
951 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | 953 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, |
952 | pmd, orig_pmd, page, haddr); | 954 | pmd, orig_pmd, page, haddr); |
955 | if (ret & VM_FAULT_OOM) | ||
956 | split_huge_page(page); | ||
953 | put_page(page); | 957 | put_page(page); |
954 | goto out; | 958 | goto out; |
955 | } | 959 | } |
@@ -957,6 +961,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
957 | 961 | ||
958 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 962 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { |
959 | put_page(new_page); | 963 | put_page(new_page); |
964 | split_huge_page(page); | ||
960 | put_page(page); | 965 | put_page(page); |
961 | ret |= VM_FAULT_OOM; | 966 | ret |= VM_FAULT_OOM; |
962 | goto out; | 967 | goto out; |
@@ -968,8 +973,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
968 | spin_lock(&mm->page_table_lock); | 973 | spin_lock(&mm->page_table_lock); |
969 | put_page(page); | 974 | put_page(page); |
970 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 975 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
976 | spin_unlock(&mm->page_table_lock); | ||
971 | mem_cgroup_uncharge_page(new_page); | 977 | mem_cgroup_uncharge_page(new_page); |
972 | put_page(new_page); | 978 | put_page(new_page); |
979 | goto out; | ||
973 | } else { | 980 | } else { |
974 | pmd_t entry; | 981 | pmd_t entry; |
975 | VM_BUG_ON(!PageHead(page)); | 982 | VM_BUG_ON(!PageHead(page)); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4e28416c47fb..285a81e87ec8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -273,8 +273,8 @@ static long region_count(struct list_head *head, long f, long t) | |||
273 | 273 | ||
274 | /* Locate each segment we overlap with, and count that overlap. */ | 274 | /* Locate each segment we overlap with, and count that overlap. */ |
275 | list_for_each_entry(rg, head, link) { | 275 | list_for_each_entry(rg, head, link) { |
276 | int seg_from; | 276 | long seg_from; |
277 | int seg_to; | 277 | long seg_to; |
278 | 278 | ||
279 | if (rg->to <= f) | 279 | if (rg->to <= f) |
280 | continue; | 280 | continue; |
@@ -2157,6 +2157,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) | |||
2157 | kref_get(&reservations->refs); | 2157 | kref_get(&reservations->refs); |
2158 | } | 2158 | } |
2159 | 2159 | ||
2160 | static void resv_map_put(struct vm_area_struct *vma) | ||
2161 | { | ||
2162 | struct resv_map *reservations = vma_resv_map(vma); | ||
2163 | |||
2164 | if (!reservations) | ||
2165 | return; | ||
2166 | kref_put(&reservations->refs, resv_map_release); | ||
2167 | } | ||
2168 | |||
2160 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) | 2169 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) |
2161 | { | 2170 | { |
2162 | struct hstate *h = hstate_vma(vma); | 2171 | struct hstate *h = hstate_vma(vma); |
@@ -2173,7 +2182,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
2173 | reserve = (end - start) - | 2182 | reserve = (end - start) - |
2174 | region_count(&reservations->regions, start, end); | 2183 | region_count(&reservations->regions, start, end); |
2175 | 2184 | ||
2176 | kref_put(&reservations->refs, resv_map_release); | 2185 | resv_map_put(vma); |
2177 | 2186 | ||
2178 | if (reserve) { | 2187 | if (reserve) { |
2179 | hugetlb_acct_memory(h, -reserve); | 2188 | hugetlb_acct_memory(h, -reserve); |
@@ -2991,12 +3000,16 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
2991 | set_vma_resv_flags(vma, HPAGE_RESV_OWNER); | 3000 | set_vma_resv_flags(vma, HPAGE_RESV_OWNER); |
2992 | } | 3001 | } |
2993 | 3002 | ||
2994 | if (chg < 0) | 3003 | if (chg < 0) { |
2995 | return chg; | 3004 | ret = chg; |
3005 | goto out_err; | ||
3006 | } | ||
2996 | 3007 | ||
2997 | /* There must be enough pages in the subpool for the mapping */ | 3008 | /* There must be enough pages in the subpool for the mapping */ |
2998 | if (hugepage_subpool_get_pages(spool, chg)) | 3009 | if (hugepage_subpool_get_pages(spool, chg)) { |
2999 | return -ENOSPC; | 3010 | ret = -ENOSPC; |
3011 | goto out_err; | ||
3012 | } | ||
3000 | 3013 | ||
3001 | /* | 3014 | /* |
3002 | * Check enough hugepages are available for the reservation. | 3015 | * Check enough hugepages are available for the reservation. |
@@ -3005,7 +3018,7 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
3005 | ret = hugetlb_acct_memory(h, chg); | 3018 | ret = hugetlb_acct_memory(h, chg); |
3006 | if (ret < 0) { | 3019 | if (ret < 0) { |
3007 | hugepage_subpool_put_pages(spool, chg); | 3020 | hugepage_subpool_put_pages(spool, chg); |
3008 | return ret; | 3021 | goto out_err; |
3009 | } | 3022 | } |
3010 | 3023 | ||
3011 | /* | 3024 | /* |
@@ -3022,6 +3035,9 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
3022 | if (!vma || vma->vm_flags & VM_MAYSHARE) | 3035 | if (!vma || vma->vm_flags & VM_MAYSHARE) |
3023 | region_add(&inode->i_mapping->private_list, from, to); | 3036 | region_add(&inode->i_mapping->private_list, from, to); |
3024 | return 0; | 3037 | return 0; |
3038 | out_err: | ||
3039 | resv_map_put(vma); | ||
3040 | return ret; | ||
3025 | } | 3041 | } |
3026 | 3042 | ||
3027 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | 3043 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) |
diff --git a/mm/internal.h b/mm/internal.h
index aee4761cf9a9..4194ab9dc19b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -94,6 +94,9 @@ extern void putback_lru_page(struct page *page); | |||
94 | /* | 94 | /* |
95 | * in mm/page_alloc.c | 95 | * in mm/page_alloc.c |
96 | */ | 96 | */ |
97 | extern void set_pageblock_migratetype(struct page *page, int migratetype); | ||
98 | extern int move_freepages_block(struct zone *zone, struct page *page, | ||
99 | int migratetype); | ||
97 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 100 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
98 | extern void prep_compound_page(struct page *page, unsigned long order); | 101 | extern void prep_compound_page(struct page *page, unsigned long order); |
99 | #ifdef CONFIG_MEMORY_FAILURE | 102 | #ifdef CONFIG_MEMORY_FAILURE |
@@ -101,6 +104,7 @@ extern bool is_free_buddy_page(struct page *page); | |||
101 | #endif | 104 | #endif |
102 | 105 | ||
103 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 106 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
107 | #include <linux/compaction.h> | ||
104 | 108 | ||
105 | /* | 109 | /* |
106 | * in mm/compaction.c | 110 | * in mm/compaction.c |
@@ -119,11 +123,14 @@ struct compact_control { | |||
119 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 123 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
120 | unsigned long free_pfn; /* isolate_freepages search base */ | 124 | unsigned long free_pfn; /* isolate_freepages search base */ |
121 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 125 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
122 | bool sync; /* Synchronous migration */ | 126 | enum compact_mode mode; /* Compaction mode */ |
123 | 127 | ||
124 | int order; /* order a direct compactor needs */ | 128 | int order; /* order a direct compactor needs */ |
125 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 129 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
126 | struct zone *zone; | 130 | struct zone *zone; |
131 | |||
132 | /* Number of UNMOVABLE destination pageblocks skipped during scan */ | ||
133 | unsigned long nr_pageblocks_skipped; | ||
127 | }; | 134 | }; |
128 | 135 | ||
129 | unsigned long | 136 | unsigned long |
@@ -164,7 +171,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
164 | * to determine if it's being mapped into a LOCKED vma. | 171 | * to determine if it's being mapped into a LOCKED vma. |
165 | * If so, mark page as mlocked. | 172 | * If so, mark page as mlocked. |
166 | */ | 173 | */ |
167 | static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) | 174 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, |
175 | struct page *page) | ||
168 | { | 176 | { |
169 | VM_BUG_ON(PageLRU(page)); | 177 | VM_BUG_ON(PageLRU(page)); |
170 | 178 | ||
@@ -222,7 +230,7 @@ extern unsigned long vma_address(struct page *page, | |||
222 | struct vm_area_struct *vma); | 230 | struct vm_area_struct *vma); |
223 | #endif | 231 | #endif |
224 | #else /* !CONFIG_MMU */ | 232 | #else /* !CONFIG_MMU */ |
225 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | 233 | static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) |
226 | { | 234 | { |
227 | return 0; | 235 | return 0; |
228 | } | 236 | } |
diff --git a/mm/madvise.c b/mm/madvise.c
index 1ccbba5b6674..deff1b64a08c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,8 +11,10 @@ | |||
11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
12 | #include <linux/page-isolation.h> | 12 | #include <linux/page-isolation.h> |
13 | #include <linux/hugetlb.h> | 13 | #include <linux/hugetlb.h> |
14 | #include <linux/falloc.h> | ||
14 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
15 | #include <linux/ksm.h> | 16 | #include <linux/ksm.h> |
17 | #include <linux/fs.h> | ||
16 | 18 | ||
17 | /* | 19 | /* |
18 | * Any behaviour which results in changes to the vma->vm_flags needs to | 20 | * Any behaviour which results in changes to the vma->vm_flags needs to |
@@ -200,8 +202,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
200 | struct vm_area_struct **prev, | 202 | struct vm_area_struct **prev, |
201 | unsigned long start, unsigned long end) | 203 | unsigned long start, unsigned long end) |
202 | { | 204 | { |
203 | struct address_space *mapping; | 205 | loff_t offset; |
204 | loff_t offset, endoff; | ||
205 | int error; | 206 | int error; |
206 | 207 | ||
207 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ | 208 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ |
@@ -217,16 +218,14 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
217 | if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) | 218 | if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) |
218 | return -EACCES; | 219 | return -EACCES; |
219 | 220 | ||
220 | mapping = vma->vm_file->f_mapping; | ||
221 | |||
222 | offset = (loff_t)(start - vma->vm_start) | 221 | offset = (loff_t)(start - vma->vm_start) |
223 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | 222 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
224 | endoff = (loff_t)(end - vma->vm_start - 1) | ||
225 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | ||
226 | 223 | ||
227 | /* vmtruncate_range needs to take i_mutex */ | 224 | /* filesystem's fallocate may need to take i_mutex */ |
228 | up_read(¤t->mm->mmap_sem); | 225 | up_read(¤t->mm->mmap_sem); |
229 | error = vmtruncate_range(mapping->host, offset, endoff); | 226 | error = do_fallocate(vma->vm_file, |
227 | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, | ||
228 | offset, end - start); | ||
230 | down_read(¤t->mm->mmap_sem); | 229 | down_read(¤t->mm->mmap_sem); |
231 | return error; | 230 | return error; |
232 | } | 231 | } |
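The madvise.c hunk above replaces the old vmtruncate_range() path for MADV_REMOVE with do_fallocate(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE), so discarding a range of a shared mapping is now routed through the filesystem's hole-punching code. A hedged userspace illustration of the equivalent operation on the backing file (the file name and offsets are made up, and the filesystem must support hole punching, e.g. ext4, XFS or tmpfs):

	#define _GNU_SOURCE
	#include <fcntl.h>	/* fallocate(); on older glibc the FALLOC_FL_*
				 * flags live in <linux/falloc.h> instead */
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("scratch.dat", O_RDWR | O_CREAT, 0600);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* Deallocate 64 KiB starting at offset 1 MiB without changing the
		 * file size, which is the effect MADV_REMOVE now asks the
		 * filesystem for. */
		if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			      1 << 20, 64 << 10) != 0)
			perror("fallocate");
		close(fd);
		return 0;
	}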
diff --git a/mm/memblock.c b/mm/memblock.c
index a44eab3157f8..952123eba433 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -37,6 +37,8 @@ struct memblock memblock __initdata_memblock = { | |||
37 | 37 | ||
38 | int memblock_debug __initdata_memblock; | 38 | int memblock_debug __initdata_memblock; |
39 | static int memblock_can_resize __initdata_memblock; | 39 | static int memblock_can_resize __initdata_memblock; |
40 | static int memblock_memory_in_slab __initdata_memblock = 0; | ||
41 | static int memblock_reserved_in_slab __initdata_memblock = 0; | ||
40 | 42 | ||
41 | /* inline so we don't get a warning when pr_debug is compiled out */ | 43 | /* inline so we don't get a warning when pr_debug is compiled out */ |
42 | static inline const char *memblock_type_name(struct memblock_type *type) | 44 | static inline const char *memblock_type_name(struct memblock_type *type) |
@@ -187,6 +189,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
187 | struct memblock_region *new_array, *old_array; | 189 | struct memblock_region *new_array, *old_array; |
188 | phys_addr_t old_size, new_size, addr; | 190 | phys_addr_t old_size, new_size, addr; |
189 | int use_slab = slab_is_available(); | 191 | int use_slab = slab_is_available(); |
192 | int *in_slab; | ||
190 | 193 | ||
191 | /* We don't allow resizing until we know about the reserved regions | 194 | /* We don't allow resizing until we know about the reserved regions |
192 | * of memory that aren't suitable for allocation | 195 | * of memory that aren't suitable for allocation |
@@ -198,6 +201,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
198 | old_size = type->max * sizeof(struct memblock_region); | 201 | old_size = type->max * sizeof(struct memblock_region); |
199 | new_size = old_size << 1; | 202 | new_size = old_size << 1; |
200 | 203 | ||
204 | /* Retrieve the slab flag */ | ||
205 | if (type == &memblock.memory) | ||
206 | in_slab = &memblock_memory_in_slab; | ||
207 | else | ||
208 | in_slab = &memblock_reserved_in_slab; | ||
209 | |||
201 | /* Try to find some space for it. | 210 | /* Try to find some space for it. |
202 | * | 211 | * |
203 | * WARNING: We assume that either slab_is_available() and we use it or | 212 | * WARNING: We assume that either slab_is_available() and we use it or |
@@ -212,14 +221,15 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
212 | if (use_slab) { | 221 | if (use_slab) { |
213 | new_array = kmalloc(new_size, GFP_KERNEL); | 222 | new_array = kmalloc(new_size, GFP_KERNEL); |
214 | addr = new_array ? __pa(new_array) : 0; | 223 | addr = new_array ? __pa(new_array) : 0; |
215 | } else | 224 | } else { |
216 | addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); | 225 | addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); |
226 | new_array = addr ? __va(addr) : 0; | ||
227 | } | ||
217 | if (!addr) { | 228 | if (!addr) { |
218 | pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", | 229 | pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", |
219 | memblock_type_name(type), type->max, type->max * 2); | 230 | memblock_type_name(type), type->max, type->max * 2); |
220 | return -1; | 231 | return -1; |
221 | } | 232 | } |
222 | new_array = __va(addr); | ||
223 | 233 | ||
224 | memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", | 234 | memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", |
225 | memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); | 235 | memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); |
@@ -234,22 +244,24 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
234 | type->regions = new_array; | 244 | type->regions = new_array; |
235 | type->max <<= 1; | 245 | type->max <<= 1; |
236 | 246 | ||
237 | /* If we use SLAB that's it, we are done */ | 247 | /* Free old array. We needn't free it if the array is the |
238 | if (use_slab) | 248 | * static one |
239 | return 0; | ||
240 | |||
241 | /* Add the new reserved region now. Should not fail ! */ | ||
242 | BUG_ON(memblock_reserve(addr, new_size)); | ||
243 | |||
244 | /* If the array wasn't our static init one, then free it. We only do | ||
245 | * that before SLAB is available as later on, we don't know whether | ||
246 | * to use kfree or free_bootmem_pages(). Shouldn't be a big deal | ||
247 | * anyways | ||
248 | */ | 249 | */ |
249 | if (old_array != memblock_memory_init_regions && | 250 | if (*in_slab) |
250 | old_array != memblock_reserved_init_regions) | 251 | kfree(old_array); |
252 | else if (old_array != memblock_memory_init_regions && | ||
253 | old_array != memblock_reserved_init_regions) | ||
251 | memblock_free(__pa(old_array), old_size); | 254 | memblock_free(__pa(old_array), old_size); |
252 | 255 | ||
256 | /* Reserve the new array if that comes from the memblock. | ||
257 | * Otherwise, we needn't do it | ||
258 | */ | ||
259 | if (!use_slab) | ||
260 | BUG_ON(memblock_reserve(addr, new_size)); | ||
261 | |||
262 | /* Update slab flag */ | ||
263 | *in_slab = use_slab; | ||
264 | |||
253 | return 0; | 265 | return 0; |
254 | } | 266 | } |
255 | 267 | ||
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f342778a0c0a..00c8898dbb81 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -138,7 +138,6 @@ struct mem_cgroup_per_zone { | |||
138 | 138 | ||
139 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; | 139 | struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; |
140 | 140 | ||
141 | struct zone_reclaim_stat reclaim_stat; | ||
142 | struct rb_node tree_node; /* RB tree node */ | 141 | struct rb_node tree_node; /* RB tree node */ |
143 | unsigned long long usage_in_excess;/* Set to the value by which */ | 142 | unsigned long long usage_in_excess;/* Set to the value by which */ |
144 | /* the soft limit is exceeded*/ | 143 | /* the soft limit is exceeded*/ |
@@ -1149,15 +1148,25 @@ struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone, | |||
1149 | * Checks whether given mem is same or in the root_mem_cgroup's | 1148 | * Checks whether given mem is same or in the root_mem_cgroup's |
1150 | * hierarchy subtree | 1149 | * hierarchy subtree |
1151 | */ | 1150 | */ |
1151 | bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | ||
1152 | struct mem_cgroup *memcg) | ||
1153 | { | ||
1154 | if (root_memcg == memcg) | ||
1155 | return true; | ||
1156 | if (!root_memcg->use_hierarchy) | ||
1157 | return false; | ||
1158 | return css_is_ancestor(&memcg->css, &root_memcg->css); | ||
1159 | } | ||
1160 | |||
1152 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | 1161 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, |
1153 | struct mem_cgroup *memcg) | 1162 | struct mem_cgroup *memcg) |
1154 | { | 1163 | { |
1155 | if (root_memcg != memcg) { | 1164 | bool ret; |
1156 | return (root_memcg->use_hierarchy && | ||
1157 | css_is_ancestor(&memcg->css, &root_memcg->css)); | ||
1158 | } | ||
1159 | 1165 | ||
1160 | return true; | 1166 | rcu_read_lock(); |
1167 | ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); | ||
1168 | rcu_read_unlock(); | ||
1169 | return ret; | ||
1161 | } | 1170 | } |
1162 | 1171 | ||
1163 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) | 1172 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) |
@@ -1233,16 +1242,6 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) | |||
1233 | return (active > inactive); | 1242 | return (active > inactive); |
1234 | } | 1243 | } |
1235 | 1244 | ||
1236 | struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, | ||
1237 | struct zone *zone) | ||
1238 | { | ||
1239 | int nid = zone_to_nid(zone); | ||
1240 | int zid = zone_idx(zone); | ||
1241 | struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); | ||
1242 | |||
1243 | return &mz->reclaim_stat; | ||
1244 | } | ||
1245 | |||
1246 | struct zone_reclaim_stat * | 1245 | struct zone_reclaim_stat * |
1247 | mem_cgroup_get_reclaim_stat_from_page(struct page *page) | 1246 | mem_cgroup_get_reclaim_stat_from_page(struct page *page) |
1248 | { | 1247 | { |
@@ -1258,7 +1257,7 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) | |||
1258 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ | 1257 | /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ |
1259 | smp_rmb(); | 1258 | smp_rmb(); |
1260 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); | 1259 | mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); |
1261 | return &mz->reclaim_stat; | 1260 | return &mz->lruvec.reclaim_stat; |
1262 | } | 1261 | } |
1263 | 1262 | ||
1264 | #define mem_cgroup_from_res_counter(counter, member) \ | 1263 | #define mem_cgroup_from_res_counter(counter, member) \ |
@@ -2845,24 +2844,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | |||
2845 | */ | 2844 | */ |
2846 | if (do_swap_account && PageSwapCache(page)) { | 2845 | if (do_swap_account && PageSwapCache(page)) { |
2847 | swp_entry_t ent = {.val = page_private(page)}; | 2846 | swp_entry_t ent = {.val = page_private(page)}; |
2848 | struct mem_cgroup *swap_memcg; | 2847 | mem_cgroup_uncharge_swap(ent); |
2849 | unsigned short id; | ||
2850 | |||
2851 | id = swap_cgroup_record(ent, 0); | ||
2852 | rcu_read_lock(); | ||
2853 | swap_memcg = mem_cgroup_lookup(id); | ||
2854 | if (swap_memcg) { | ||
2855 | /* | ||
2856 | * This recorded memcg can be obsolete one. So, avoid | ||
2857 | * calling css_tryget | ||
2858 | */ | ||
2859 | if (!mem_cgroup_is_root(swap_memcg)) | ||
2860 | res_counter_uncharge(&swap_memcg->memsw, | ||
2861 | PAGE_SIZE); | ||
2862 | mem_cgroup_swap_statistics(swap_memcg, false); | ||
2863 | mem_cgroup_put(swap_memcg); | ||
2864 | } | ||
2865 | rcu_read_unlock(); | ||
2866 | } | 2848 | } |
2867 | /* | 2849 | /* |
2868 | * At swapin, we may charge account against cgroup which has no tasks. | 2850 | * At swapin, we may charge account against cgroup which has no tasks. |
@@ -3155,7 +3137,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
3155 | * @entry: swap entry to be moved | 3137 | * @entry: swap entry to be moved |
3156 | * @from: mem_cgroup which the entry is moved from | 3138 | * @from: mem_cgroup which the entry is moved from |
3157 | * @to: mem_cgroup which the entry is moved to | 3139 | * @to: mem_cgroup which the entry is moved to |
3158 | * @need_fixup: whether we should fixup res_counters and refcounts. | ||
3159 | * | 3140 | * |
3160 | * It succeeds only when the swap_cgroup's record for this entry is the same | 3141 | * It succeeds only when the swap_cgroup's record for this entry is the same |
3161 | * as the mem_cgroup's id of @from. | 3142 | * as the mem_cgroup's id of @from. |
@@ -3166,7 +3147,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
3166 | * both res and memsw, and called css_get(). | 3147 | * both res and memsw, and called css_get(). |
3167 | */ | 3148 | */ |
3168 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | 3149 | static int mem_cgroup_move_swap_account(swp_entry_t entry, |
3169 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | 3150 | struct mem_cgroup *from, struct mem_cgroup *to) |
3170 | { | 3151 | { |
3171 | unsigned short old_id, new_id; | 3152 | unsigned short old_id, new_id; |
3172 | 3153 | ||
@@ -3185,24 +3166,13 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3185 | * swap-in, the refcount of @to might be decreased to 0. | 3166 | * swap-in, the refcount of @to might be decreased to 0. |
3186 | */ | 3167 | */ |
3187 | mem_cgroup_get(to); | 3168 | mem_cgroup_get(to); |
3188 | if (need_fixup) { | ||
3189 | if (!mem_cgroup_is_root(from)) | ||
3190 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
3191 | mem_cgroup_put(from); | ||
3192 | /* | ||
3193 | * we charged both to->res and to->memsw, so we should | ||
3194 | * uncharge to->res. | ||
3195 | */ | ||
3196 | if (!mem_cgroup_is_root(to)) | ||
3197 | res_counter_uncharge(&to->res, PAGE_SIZE); | ||
3198 | } | ||
3199 | return 0; | 3169 | return 0; |
3200 | } | 3170 | } |
3201 | return -EINVAL; | 3171 | return -EINVAL; |
3202 | } | 3172 | } |
3203 | #else | 3173 | #else |
3204 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | 3174 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, |
3205 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | 3175 | struct mem_cgroup *from, struct mem_cgroup *to) |
3206 | { | 3176 | { |
3207 | return -EINVAL; | 3177 | return -EINVAL; |
3208 | } | 3178 | } |
@@ -3363,7 +3333,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3363 | void mem_cgroup_replace_page_cache(struct page *oldpage, | 3333 | void mem_cgroup_replace_page_cache(struct page *oldpage, |
3364 | struct page *newpage) | 3334 | struct page *newpage) |
3365 | { | 3335 | { |
3366 | struct mem_cgroup *memcg; | 3336 | struct mem_cgroup *memcg = NULL; |
3367 | struct page_cgroup *pc; | 3337 | struct page_cgroup *pc; |
3368 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | 3338 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; |
3369 | 3339 | ||
@@ -3373,11 +3343,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
3373 | pc = lookup_page_cgroup(oldpage); | 3343 | pc = lookup_page_cgroup(oldpage); |
3374 | /* fix accounting on old pages */ | 3344 | /* fix accounting on old pages */ |
3375 | lock_page_cgroup(pc); | 3345 | lock_page_cgroup(pc); |
3376 | memcg = pc->mem_cgroup; | 3346 | if (PageCgroupUsed(pc)) { |
3377 | mem_cgroup_charge_statistics(memcg, false, -1); | 3347 | memcg = pc->mem_cgroup; |
3378 | ClearPageCgroupUsed(pc); | 3348 | mem_cgroup_charge_statistics(memcg, false, -1); |
3349 | ClearPageCgroupUsed(pc); | ||
3350 | } | ||
3379 | unlock_page_cgroup(pc); | 3351 | unlock_page_cgroup(pc); |
3380 | 3352 | ||
3353 | /* | ||
3354 | * When called from shmem_replace_page(), in some cases the | ||
3355 | * oldpage has already been charged, and in some cases not. | ||
3356 | */ | ||
3357 | if (!memcg) | ||
3358 | return; | ||
3359 | |||
3381 | if (PageSwapBacked(oldpage)) | 3360 | if (PageSwapBacked(oldpage)) |
3382 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3361 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; |
3383 | 3362 | ||
@@ -4226,21 +4205,19 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4226 | { | 4205 | { |
4227 | int nid, zid; | 4206 | int nid, zid; |
4228 | struct mem_cgroup_per_zone *mz; | 4207 | struct mem_cgroup_per_zone *mz; |
4208 | struct zone_reclaim_stat *rstat; | ||
4229 | unsigned long recent_rotated[2] = {0, 0}; | 4209 | unsigned long recent_rotated[2] = {0, 0}; |
4230 | unsigned long recent_scanned[2] = {0, 0}; | 4210 | unsigned long recent_scanned[2] = {0, 0}; |
4231 | 4211 | ||
4232 | for_each_online_node(nid) | 4212 | for_each_online_node(nid) |
4233 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 4213 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
4234 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); | 4214 | mz = mem_cgroup_zoneinfo(memcg, nid, zid); |
4215 | rstat = &mz->lruvec.reclaim_stat; | ||
4235 | 4216 | ||
4236 | recent_rotated[0] += | 4217 | recent_rotated[0] += rstat->recent_rotated[0]; |
4237 | mz->reclaim_stat.recent_rotated[0]; | 4218 | recent_rotated[1] += rstat->recent_rotated[1]; |
4238 | recent_rotated[1] += | 4219 | recent_scanned[0] += rstat->recent_scanned[0]; |
4239 | mz->reclaim_stat.recent_rotated[1]; | 4220 | recent_scanned[1] += rstat->recent_scanned[1]; |
4240 | recent_scanned[0] += | ||
4241 | mz->reclaim_stat.recent_scanned[0]; | ||
4242 | recent_scanned[1] += | ||
4243 | mz->reclaim_stat.recent_scanned[1]; | ||
4244 | } | 4221 | } |
4245 | cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); | 4222 | cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); |
4246 | cb->fill(cb, "recent_rotated_file", recent_rotated[1]); | 4223 | cb->fill(cb, "recent_rotated_file", recent_rotated[1]); |
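The hunk above follows reclaim_stat into each zone's lruvec, so mem_control_stat_show() now grabs one rstat pointer per zone and sums the rotated/scanned counters from there. A minimal user-space sketch of that aggregation pattern; the node/zone counts and sample numbers are invented and the arrays merely stand in for the kernel's per-zone memcg structures.

#include <stdio.h>

#define NR_NODES 2
#define MAX_NR_ZONES 3

struct zone_reclaim_stat {
        unsigned long recent_rotated[2];        /* [0] = anon, [1] = file */
        unsigned long recent_scanned[2];
};

/* toy per-node, per-zone stats standing in for mem_cgroup_per_zone */
static struct zone_reclaim_stat stats[NR_NODES][MAX_NR_ZONES] = {
        { {{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}, {{0, 1}, {2, 3}} },
        { {{2, 2}, {2, 2}}, {{1, 1}, {1, 1}}, {{9, 9}, {9, 9}} },
};

int main(void)
{
        unsigned long recent_rotated[2] = {0, 0};
        unsigned long recent_scanned[2] = {0, 0};
        int nid, zid;

        for (nid = 0; nid < NR_NODES; nid++)
                for (zid = 0; zid < MAX_NR_ZONES; zid++) {
                        struct zone_reclaim_stat *rstat = &stats[nid][zid];

                        recent_rotated[0] += rstat->recent_rotated[0];
                        recent_rotated[1] += rstat->recent_rotated[1];
                        recent_scanned[0] += rstat->recent_scanned[0];
                        recent_scanned[1] += rstat->recent_scanned[1];
                }
        printf("recent_rotated_anon %lu\n", recent_rotated[0]);
        printf("recent_rotated_file %lu\n", recent_rotated[1]);
        printf("recent_scanned_anon %lu\n", recent_scanned[0]);
        printf("recent_scanned_file %lu\n", recent_scanned[1]);
        return 0;
}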
@@ -5135,7 +5112,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | |||
5135 | return NULL; | 5112 | return NULL; |
5136 | if (PageAnon(page)) { | 5113 | if (PageAnon(page)) { |
5137 | /* we don't move shared anon */ | 5114 | /* we don't move shared anon */ |
5138 | if (!move_anon() || page_mapcount(page) > 2) | 5115 | if (!move_anon()) |
5139 | return NULL; | 5116 | return NULL; |
5140 | } else if (!move_file()) | 5117 | } else if (!move_file()) |
5141 | /* we ignore mapcount for file pages */ | 5118 | /* we ignore mapcount for file pages */ |
@@ -5146,26 +5123,32 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | |||
5146 | return page; | 5123 | return page; |
5147 | } | 5124 | } |
5148 | 5125 | ||
5126 | #ifdef CONFIG_SWAP | ||
5149 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | 5127 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, |
5150 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | 5128 | unsigned long addr, pte_t ptent, swp_entry_t *entry) |
5151 | { | 5129 | { |
5152 | int usage_count; | ||
5153 | struct page *page = NULL; | 5130 | struct page *page = NULL; |
5154 | swp_entry_t ent = pte_to_swp_entry(ptent); | 5131 | swp_entry_t ent = pte_to_swp_entry(ptent); |
5155 | 5132 | ||
5156 | if (!move_anon() || non_swap_entry(ent)) | 5133 | if (!move_anon() || non_swap_entry(ent)) |
5157 | return NULL; | 5134 | return NULL; |
5158 | usage_count = mem_cgroup_count_swap_user(ent, &page); | 5135 | /* |
5159 | if (usage_count > 1) { /* we don't move shared anon */ | 5136 | * Because lookup_swap_cache() updates some statistics counter, |
5160 | if (page) | 5137 | * we call find_get_page() with swapper_space directly. |
5161 | put_page(page); | 5138 | */ |
5162 | return NULL; | 5139 | page = find_get_page(&swapper_space, ent.val); |
5163 | } | ||
5164 | if (do_swap_account) | 5140 | if (do_swap_account) |
5165 | entry->val = ent.val; | 5141 | entry->val = ent.val; |
5166 | 5142 | ||
5167 | return page; | 5143 | return page; |
5168 | } | 5144 | } |
5145 | #else | ||
5146 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | ||
5147 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | ||
5148 | { | ||
5149 | return NULL; | ||
5150 | } | ||
5151 | #endif | ||
5169 | 5152 | ||
5170 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | 5153 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, |
5171 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | 5154 | unsigned long addr, pte_t ptent, swp_entry_t *entry) |
@@ -5521,8 +5504,7 @@ put: /* get_mctgt_type() gets the page */ | |||
5521 | break; | 5504 | break; |
5522 | case MC_TARGET_SWAP: | 5505 | case MC_TARGET_SWAP: |
5523 | ent = target.ent; | 5506 | ent = target.ent; |
5524 | if (!mem_cgroup_move_swap_account(ent, | 5507 | if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { |
5525 | mc.from, mc.to, false)) { | ||
5526 | mc.precharge--; | 5508 | mc.precharge--; |
5527 | /* we fixup refcnts and charges later. */ | 5509 | /* we fixup refcnts and charges later. */ |
5528 | mc.moved_swap++; | 5510 | mc.moved_swap++; |
@@ -5598,7 +5580,6 @@ static void mem_cgroup_move_task(struct cgroup *cont, | |||
5598 | if (mm) { | 5580 | if (mm) { |
5599 | if (mc.to) | 5581 | if (mc.to) |
5600 | mem_cgroup_move_charge(mm); | 5582 | mem_cgroup_move_charge(mm); |
5601 | put_swap_token(mm); | ||
5602 | mmput(mm); | 5583 | mmput(mm); |
5603 | } | 5584 | } |
5604 | if (mc.to) | 5585 | if (mc.to) |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c99ad4e6b88c..ab1e7145e290 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -1388,16 +1388,16 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) | |||
1388 | */ | 1388 | */ |
1389 | if (!get_page_unless_zero(compound_head(p))) { | 1389 | if (!get_page_unless_zero(compound_head(p))) { |
1390 | if (PageHuge(p)) { | 1390 | if (PageHuge(p)) { |
1391 | pr_info("get_any_page: %#lx free huge page\n", pfn); | 1391 | pr_info("%s: %#lx free huge page\n", __func__, pfn); |
1392 | ret = dequeue_hwpoisoned_huge_page(compound_head(p)); | 1392 | ret = dequeue_hwpoisoned_huge_page(compound_head(p)); |
1393 | } else if (is_free_buddy_page(p)) { | 1393 | } else if (is_free_buddy_page(p)) { |
1394 | pr_info("get_any_page: %#lx free buddy page\n", pfn); | 1394 | pr_info("%s: %#lx free buddy page\n", __func__, pfn); |
1395 | /* Set hwpoison bit while page is still isolated */ | 1395 | /* Set hwpoison bit while page is still isolated */ |
1396 | SetPageHWPoison(p); | 1396 | SetPageHWPoison(p); |
1397 | ret = 0; | 1397 | ret = 0; |
1398 | } else { | 1398 | } else { |
1399 | pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", | 1399 | pr_info("%s: %#lx: unknown zero refcount page type %lx\n", |
1400 | pfn, p->flags); | 1400 | __func__, pfn, p->flags); |
1401 | ret = -EIO; | 1401 | ret = -EIO; |
1402 | } | 1402 | } |
1403 | } else { | 1403 | } else { |
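The messages above switch from a hard-coded "get_any_page:" prefix to "%s" with __func__, so the prefix cannot go stale if the function is ever renamed. A tiny standalone illustration of that idiom (the pfn value is arbitrary):

#include <stdio.h>

static void get_any_page(unsigned long pfn)
{
        /* __func__ expands to the enclosing function's name */
        printf("%s: %#lx free buddy page\n", __func__, pfn);
}

int main(void)
{
        get_any_page(0x1234);
        return 0;
}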
diff --git a/mm/memory.c b/mm/memory.c index e40f6759ba98..1b7dc662bf9f 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -2908,7 +2908,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2908 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2908 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
2909 | page = lookup_swap_cache(entry); | 2909 | page = lookup_swap_cache(entry); |
2910 | if (!page) { | 2910 | if (!page) { |
2911 | grab_swap_token(mm); /* Contend for token _before_ read-in */ | ||
2912 | page = swapin_readahead(entry, | 2911 | page = swapin_readahead(entry, |
2913 | GFP_HIGHUSER_MOVABLE, vma, address); | 2912 | GFP_HIGHUSER_MOVABLE, vma, address); |
2914 | if (!page) { | 2913 | if (!page) { |
@@ -2938,6 +2937,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2938 | } | 2937 | } |
2939 | 2938 | ||
2940 | locked = lock_page_or_retry(page, mm, flags); | 2939 | locked = lock_page_or_retry(page, mm, flags); |
2940 | |||
2941 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2941 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
2942 | if (!locked) { | 2942 | if (!locked) { |
2943 | ret |= VM_FAULT_RETRY; | 2943 | ret |= VM_FAULT_RETRY; |
@@ -3486,6 +3486,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3486 | if (unlikely(is_vm_hugetlb_page(vma))) | 3486 | if (unlikely(is_vm_hugetlb_page(vma))) |
3487 | return hugetlb_fault(mm, vma, address, flags); | 3487 | return hugetlb_fault(mm, vma, address, flags); |
3488 | 3488 | ||
3489 | retry: | ||
3489 | pgd = pgd_offset(mm, address); | 3490 | pgd = pgd_offset(mm, address); |
3490 | pud = pud_alloc(mm, pgd, address); | 3491 | pud = pud_alloc(mm, pgd, address); |
3491 | if (!pud) | 3492 | if (!pud) |
@@ -3499,13 +3500,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3499 | pmd, flags); | 3500 | pmd, flags); |
3500 | } else { | 3501 | } else { |
3501 | pmd_t orig_pmd = *pmd; | 3502 | pmd_t orig_pmd = *pmd; |
3503 | int ret; | ||
3504 | |||
3502 | barrier(); | 3505 | barrier(); |
3503 | if (pmd_trans_huge(orig_pmd)) { | 3506 | if (pmd_trans_huge(orig_pmd)) { |
3504 | if (flags & FAULT_FLAG_WRITE && | 3507 | if (flags & FAULT_FLAG_WRITE && |
3505 | !pmd_write(orig_pmd) && | 3508 | !pmd_write(orig_pmd) && |
3506 | !pmd_trans_splitting(orig_pmd)) | 3509 | !pmd_trans_splitting(orig_pmd)) { |
3507 | return do_huge_pmd_wp_page(mm, vma, address, | 3510 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, |
3508 | pmd, orig_pmd); | 3511 | orig_pmd); |
3512 | /* | ||
3513 | * If COW results in an oom, the huge pmd will | ||
3514 | * have been split, so retry the fault on the | ||
3515 | * pte for a smaller charge. | ||
3516 | */ | ||
3517 | if (unlikely(ret & VM_FAULT_OOM)) | ||
3518 | goto retry; | ||
3519 | return ret; | ||
3520 | } | ||
3509 | return 0; | 3521 | return 0; |
3510 | } | 3522 | } |
3511 | } | 3523 | } |
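The handle_mm_fault() change adds a retry label: if copy-on-write on a transparent huge pmd comes back with VM_FAULT_OOM, the huge page has already been split, so the fault is retried against ordinary ptes for a smaller charge. The sketch below models only that fall-back-and-retry control flow; both fault handlers are hypothetical stubs, not the kernel paths.

#include <stdio.h>

#define FAULT_OK        0
#define VM_FAULT_OOM    0x0001  /* stand-in for the kernel flag */

/* Hypothetical: a huge-page COW attempt that fails once and splits. */
static int huge_cow_fault(int attempt)
{
        return attempt == 0 ? VM_FAULT_OOM : FAULT_OK;
}

/* Hypothetical: the base-page path the retry falls back to. */
static int pte_fault(void)
{
        return FAULT_OK;
}

static int handle_fault(void)
{
        int attempt = 0;
        int ret;

retry:
        if (attempt == 0) {             /* the mapping still looks huge */
                ret = huge_cow_fault(attempt);
                if (ret & VM_FAULT_OOM) {
                        /* huge pmd was split; retry on small pages */
                        attempt++;
                        goto retry;
                }
                return ret;
        }
        return pte_fault();
}

int main(void)
{
        printf("fault result: %d\n", handle_fault());
        return 0;
}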
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fc898cb4fe8f..0d7e3ec8e0f3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -74,8 +74,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) | |||
74 | res->end = start + size - 1; | 74 | res->end = start + size - 1; |
75 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; | 75 | res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
76 | if (request_resource(&iomem_resource, res) < 0) { | 76 | if (request_resource(&iomem_resource, res) < 0) { |
77 | printk("System RAM resource %llx - %llx cannot be added\n", | 77 | printk("System RAM resource %pR cannot be added\n", res); |
78 | (unsigned long long)res->start, (unsigned long long)res->end); | ||
79 | kfree(res); | 78 | kfree(res); |
80 | res = NULL; | 79 | res = NULL; |
81 | } | 80 | } |
@@ -502,8 +501,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
502 | online_pages_range); | 501 | online_pages_range); |
503 | if (ret) { | 502 | if (ret) { |
504 | mutex_unlock(&zonelists_mutex); | 503 | mutex_unlock(&zonelists_mutex); |
505 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", | 504 | printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", |
506 | nr_pages, pfn); | 505 | (unsigned long long) pfn << PAGE_SHIFT, |
506 | (((unsigned long long) pfn + nr_pages) | ||
507 | << PAGE_SHIFT) - 1); | ||
507 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 508 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
508 | unlock_memory_hotplug(); | 509 | unlock_memory_hotplug(); |
509 | return ret; | 510 | return ret; |
@@ -977,8 +978,9 @@ repeat: | |||
977 | return 0; | 978 | return 0; |
978 | 979 | ||
979 | failed_removal: | 980 | failed_removal: |
980 | printk(KERN_INFO "memory offlining %lx to %lx failed\n", | 981 | printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", |
981 | start_pfn, end_pfn); | 982 | (unsigned long long) start_pfn << PAGE_SHIFT, |
983 | ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); | ||
982 | memory_notify(MEM_CANCEL_OFFLINE, &arg); | 984 | memory_notify(MEM_CANCEL_OFFLINE, &arg); |
983 | /* pushback to free area */ | 985 | /* pushback to free area */ |
984 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | 986 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
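Both failure messages now print a physical byte range in the "[mem %#010llx-%#010llx]" style rather than raw PFNs: the start is pfn << PAGE_SHIFT and the inclusive end is one byte short of the first address past the range. A small check of that conversion, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12   /* assume 4 KiB pages */

static void print_range(unsigned long pfn, unsigned long nr_pages)
{
        unsigned long long start = (unsigned long long)pfn << PAGE_SHIFT;
        unsigned long long end =
                (((unsigned long long)pfn + nr_pages) << PAGE_SHIFT) - 1;

        printf("[mem %#010llx-%#010llx]\n", start, end);
}

int main(void)
{
        print_range(0x100, 256);        /* pfn 0x100, 256 pages -> 1 MiB range */
        return 0;
}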
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 88f9422b92e7..f15c1b24ca18 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -390,7 +390,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, | |||
390 | { | 390 | { |
391 | if (!pol) | 391 | if (!pol) |
392 | return; | 392 | return; |
393 | if (!mpol_store_user_nodemask(pol) && step == 0 && | 393 | if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE && |
394 | nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) | 394 | nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) |
395 | return; | 395 | return; |
396 | 396 | ||
@@ -950,8 +950,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
950 | * | 950 | * |
951 | * Returns the number of pages that could not be moved. | 951 | * Returns the number of pages that could not be moved. |
952 | */ | 952 | */ |
953 | int do_migrate_pages(struct mm_struct *mm, | 953 | int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, |
954 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | 954 | const nodemask_t *to, int flags) |
955 | { | 955 | { |
956 | int busy = 0; | 956 | int busy = 0; |
957 | int err; | 957 | int err; |
@@ -963,7 +963,7 @@ int do_migrate_pages(struct mm_struct *mm, | |||
963 | 963 | ||
964 | down_read(&mm->mmap_sem); | 964 | down_read(&mm->mmap_sem); |
965 | 965 | ||
966 | err = migrate_vmas(mm, from_nodes, to_nodes, flags); | 966 | err = migrate_vmas(mm, from, to, flags); |
967 | if (err) | 967 | if (err) |
968 | goto out; | 968 | goto out; |
969 | 969 | ||
@@ -998,14 +998,34 @@ int do_migrate_pages(struct mm_struct *mm, | |||
998 | * moved to an empty node, then there is nothing left worth migrating. | 998 | * moved to an empty node, then there is nothing left worth migrating. |
999 | */ | 999 | */ |
1000 | 1000 | ||
1001 | tmp = *from_nodes; | 1001 | tmp = *from; |
1002 | while (!nodes_empty(tmp)) { | 1002 | while (!nodes_empty(tmp)) { |
1003 | int s,d; | 1003 | int s,d; |
1004 | int source = -1; | 1004 | int source = -1; |
1005 | int dest = 0; | 1005 | int dest = 0; |
1006 | 1006 | ||
1007 | for_each_node_mask(s, tmp) { | 1007 | for_each_node_mask(s, tmp) { |
1008 | d = node_remap(s, *from_nodes, *to_nodes); | 1008 | |
1009 | /* | ||
1010 | * do_migrate_pages() tries to maintain the relative | ||
1011 | * node relationship of the pages established between | ||
1012 | * threads and memory areas. | ||
1013 | * | ||
1014 | * However, if the number of source nodes is not equal to | ||
1015 | * the number of destination nodes, we cannot preserve | ||
1016 | * this relative node relationship. In that case, skip | ||
1017 | * copying memory from a node that is in the destination | ||
1018 | * mask. | ||
1019 | * | ||
1020 | * Example: [2,3,4] -> [3,4,5] moves everything. | ||
1021 | * [0-7] - > [3,4,5] moves only 0,1,2,6,7. | ||
1022 | */ | ||
1023 | |||
1024 | if ((nodes_weight(*from) != nodes_weight(*to)) && | ||
1025 | (node_isset(s, *to))) | ||
1026 | continue; | ||
1027 | |||
1028 | d = node_remap(s, *from, *to); | ||
1009 | if (s == d) | 1029 | if (s == d) |
1010 | continue; | 1030 | continue; |
1011 | 1031 | ||
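The comment's example ("[0-7] -> [3,4,5] moves only 0,1,2,6,7") can be reproduced with a toy model of the skip rule: when the source and destination masks have different weights, a source node that is already in the destination mask is not migrated. Node masks are plain bitmasks here and node_remap() is left out; only the skip test is modelled.

#include <stdio.h>

/* population count of a small bitmask (stand-in for nodes_weight()) */
static int weight(unsigned int mask)
{
        int w = 0;

        for (; mask; mask &= mask - 1)
                w++;
        return w;
}

int main(void)
{
        unsigned int from = 0xff;                               /* nodes 0-7 */
        unsigned int to = (1u << 3) | (1u << 4) | (1u << 5);    /* nodes 3,4,5 */
        int s;

        for (s = 0; s < 8; s++) {
                if (!(from & (1u << s)))
                        continue;
                /* skip sources already in the destination when weights differ */
                if (weight(from) != weight(to) && (to & (1u << s)))
                        continue;
                printf("would migrate pages from node %d\n", s);
        }
        return 0;
}

Running this prints nodes 0, 1, 2, 6 and 7, matching the example in the new comment.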
@@ -1065,8 +1085,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
1065 | { | 1085 | { |
1066 | } | 1086 | } |
1067 | 1087 | ||
1068 | int do_migrate_pages(struct mm_struct *mm, | 1088 | int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, |
1069 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | 1089 | const nodemask_t *to, int flags) |
1070 | { | 1090 | { |
1071 | return -ENOSYS; | 1091 | return -ENOSYS; |
1072 | } | 1092 | } |
@@ -1639,33 +1639,34 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) | |||
1639 | { | 1639 | { |
1640 | struct vm_area_struct *vma = NULL; | 1640 | struct vm_area_struct *vma = NULL; |
1641 | 1641 | ||
1642 | if (mm) { | 1642 | if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */ |
1643 | /* Check the cache first. */ | 1643 | return NULL; |
1644 | /* (Cache hit rate is typically around 35%.) */ | 1644 | |
1645 | vma = mm->mmap_cache; | 1645 | /* Check the cache first. */ |
1646 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { | 1646 | /* (Cache hit rate is typically around 35%.) */ |
1647 | struct rb_node * rb_node; | 1647 | vma = mm->mmap_cache; |
1648 | 1648 | if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { | |
1649 | rb_node = mm->mm_rb.rb_node; | 1649 | struct rb_node *rb_node; |
1650 | vma = NULL; | 1650 | |
1651 | 1651 | rb_node = mm->mm_rb.rb_node; | |
1652 | while (rb_node) { | 1652 | vma = NULL; |
1653 | struct vm_area_struct * vma_tmp; | 1653 | |
1654 | 1654 | while (rb_node) { | |
1655 | vma_tmp = rb_entry(rb_node, | 1655 | struct vm_area_struct *vma_tmp; |
1656 | struct vm_area_struct, vm_rb); | 1656 | |
1657 | 1657 | vma_tmp = rb_entry(rb_node, | |
1658 | if (vma_tmp->vm_end > addr) { | 1658 | struct vm_area_struct, vm_rb); |
1659 | vma = vma_tmp; | 1659 | |
1660 | if (vma_tmp->vm_start <= addr) | 1660 | if (vma_tmp->vm_end > addr) { |
1661 | break; | 1661 | vma = vma_tmp; |
1662 | rb_node = rb_node->rb_left; | 1662 | if (vma_tmp->vm_start <= addr) |
1663 | } else | 1663 | break; |
1664 | rb_node = rb_node->rb_right; | 1664 | rb_node = rb_node->rb_left; |
1665 | } | 1665 | } else |
1666 | if (vma) | 1666 | rb_node = rb_node->rb_right; |
1667 | mm->mmap_cache = vma; | ||
1668 | } | 1667 | } |
1668 | if (vma) | ||
1669 | mm->mmap_cache = vma; | ||
1669 | } | 1670 | } |
1670 | return vma; | 1671 | return vma; |
1671 | } | 1672 | } |
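The reshaped find_vma() still performs the same lookup: consult the one-entry mmap_cache, otherwise descend for the lowest VMA whose vm_end lies above addr, and cache any hit. Below is a user-space sketch of that search over a sorted array, with a binary search standing in for the rbtree walk; the ranges are made up.

#include <stdio.h>

struct vma {
        unsigned long vm_start;
        unsigned long vm_end;   /* first address past the region */
};

/* sorted, non-overlapping regions standing in for an mm's VMA tree */
static struct vma vmas[] = {
        { 0x1000, 0x3000 },
        { 0x5000, 0x6000 },
        { 0x8000, 0xa000 },
};

/* return the first vma with vm_end > addr, or NULL (find_vma() semantics) */
static struct vma *find_vma(unsigned long addr)
{
        int lo = 0, hi = sizeof(vmas) / sizeof(vmas[0]) - 1;
        struct vma *found = NULL;

        while (lo <= hi) {
                int mid = lo + (hi - lo) / 2;

                if (vmas[mid].vm_end > addr) {
                        found = &vmas[mid];
                        if (vmas[mid].vm_start <= addr)
                                break;          /* addr lies inside this vma */
                        hi = mid - 1;           /* try a lower region */
                } else {
                        lo = mid + 1;           /* region ends at or below addr */
                }
        }
        return found;
}

int main(void)
{
        unsigned long addrs[] = { 0x2000, 0x4000, 0xb000 };

        for (int i = 0; i < 3; i++) {
                struct vma *vma = find_vma(addrs[i]);

                if (vma)
                        printf("%#lx -> [%#lx, %#lx)\n",
                               addrs[i], vma->vm_start, vma->vm_end);
                else
                        printf("%#lx -> no vma\n", addrs[i]);
        }
        return 0;
}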
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 1983fb1c7026..d23415c001bc 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -274,86 +274,85 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
274 | return ___alloc_bootmem(size, align, goal, limit); | 274 | return ___alloc_bootmem(size, align, goal, limit); |
275 | } | 275 | } |
276 | 276 | ||
277 | /** | 277 | static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, |
278 | * __alloc_bootmem_node - allocate boot memory from a specific node | 278 | unsigned long size, |
279 | * @pgdat: node to allocate from | 279 | unsigned long align, |
280 | * @size: size of the request in bytes | 280 | unsigned long goal, |
281 | * @align: alignment of the region | 281 | unsigned long limit) |
282 | * @goal: preferred starting address of the region | ||
283 | * | ||
284 | * The goal is dropped if it can not be satisfied and the allocation will | ||
285 | * fall back to memory below @goal. | ||
286 | * | ||
287 | * Allocation may fall back to any node in the system if the specified node | ||
288 | * can not hold the requested memory. | ||
289 | * | ||
290 | * The function panics if the request can not be satisfied. | ||
291 | */ | ||
292 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | ||
293 | unsigned long align, unsigned long goal) | ||
294 | { | 282 | { |
295 | void *ptr; | 283 | void *ptr; |
296 | 284 | ||
297 | if (WARN_ON_ONCE(slab_is_available())) | ||
298 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
299 | |||
300 | again: | 285 | again: |
301 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | 286 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, |
302 | goal, -1ULL); | 287 | goal, limit); |
303 | if (ptr) | 288 | if (ptr) |
304 | return ptr; | 289 | return ptr; |
305 | 290 | ||
306 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, | 291 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, |
307 | goal, -1ULL); | 292 | goal, limit); |
308 | if (!ptr && goal) { | 293 | if (ptr) |
294 | return ptr; | ||
295 | |||
296 | if (goal) { | ||
309 | goal = 0; | 297 | goal = 0; |
310 | goto again; | 298 | goto again; |
311 | } | 299 | } |
312 | return ptr; | 300 | |
301 | return NULL; | ||
313 | } | 302 | } |
314 | 303 | ||
315 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | 304 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, |
316 | unsigned long align, unsigned long goal) | 305 | unsigned long align, unsigned long goal) |
317 | { | 306 | { |
318 | return __alloc_bootmem_node(pgdat, size, align, goal); | 307 | if (WARN_ON_ONCE(slab_is_available())) |
308 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
309 | |||
310 | return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); | ||
319 | } | 311 | } |
320 | 312 | ||
321 | #ifdef CONFIG_SPARSEMEM | 313 | void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, |
322 | /** | 314 | unsigned long align, unsigned long goal, |
323 | * alloc_bootmem_section - allocate boot memory from a specific section | 315 | unsigned long limit) |
324 | * @size: size of the request in bytes | ||
325 | * @section_nr: sparse map section to allocate from | ||
326 | * | ||
327 | * Return NULL on failure. | ||
328 | */ | ||
329 | void * __init alloc_bootmem_section(unsigned long size, | ||
330 | unsigned long section_nr) | ||
331 | { | 316 | { |
332 | unsigned long pfn, goal, limit; | 317 | void *ptr; |
333 | 318 | ||
334 | pfn = section_nr_to_pfn(section_nr); | 319 | ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit); |
335 | goal = pfn << PAGE_SHIFT; | 320 | if (ptr) |
336 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; | 321 | return ptr; |
337 | 322 | ||
338 | return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, | 323 | printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); |
339 | SMP_CACHE_BYTES, goal, limit); | 324 | panic("Out of memory"); |
325 | return NULL; | ||
340 | } | 326 | } |
341 | #endif | ||
342 | 327 | ||
343 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | 328 | /** |
329 | * __alloc_bootmem_node - allocate boot memory from a specific node | ||
330 | * @pgdat: node to allocate from | ||
331 | * @size: size of the request in bytes | ||
332 | * @align: alignment of the region | ||
333 | * @goal: preferred starting address of the region | ||
334 | * | ||
335 | * The goal is dropped if it can not be satisfied and the allocation will | ||
336 | * fall back to memory below @goal. | ||
337 | * | ||
338 | * Allocation may fall back to any node in the system if the specified node | ||
339 | * can not hold the requested memory. | ||
340 | * | ||
341 | * The function panics if the request can not be satisfied. | ||
342 | */ | ||
343 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | ||
344 | unsigned long align, unsigned long goal) | 344 | unsigned long align, unsigned long goal) |
345 | { | 345 | { |
346 | void *ptr; | ||
347 | |||
348 | if (WARN_ON_ONCE(slab_is_available())) | 346 | if (WARN_ON_ONCE(slab_is_available())) |
349 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 347 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
350 | 348 | ||
351 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | 349 | return ___alloc_bootmem_node(pgdat, size, align, goal, 0); |
352 | goal, -1ULL); | 350 | } |
353 | if (ptr) | ||
354 | return ptr; | ||
355 | 351 | ||
356 | return __alloc_bootmem_nopanic(size, align, goal); | 352 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, |
353 | unsigned long align, unsigned long goal) | ||
354 | { | ||
355 | return __alloc_bootmem_node(pgdat, size, align, goal); | ||
357 | } | 356 | } |
358 | 357 | ||
359 | #ifndef ARCH_LOW_ADDRESS_LIMIT | 358 | #ifndef ARCH_LOW_ADDRESS_LIMIT |
@@ -397,16 +396,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, | |||
397 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | 396 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, |
398 | unsigned long align, unsigned long goal) | 397 | unsigned long align, unsigned long goal) |
399 | { | 398 | { |
400 | void *ptr; | ||
401 | |||
402 | if (WARN_ON_ONCE(slab_is_available())) | 399 | if (WARN_ON_ONCE(slab_is_available())) |
403 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 400 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
404 | 401 | ||
405 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | 402 | return ___alloc_bootmem_node(pgdat, size, align, goal, |
406 | goal, ARCH_LOW_ADDRESS_LIMIT); | 403 | ARCH_LOW_ADDRESS_LIMIT); |
407 | if (ptr) | ||
408 | return ptr; | ||
409 | |||
410 | return __alloc_memory_core_early(MAX_NUMNODES, size, align, | ||
411 | goal, ARCH_LOW_ADDRESS_LIMIT); | ||
412 | } | 404 | } |
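The nobootmem refactor concentrates the allocation strategy in ___alloc_bootmem_node_nopanic(): try the requested node, then any node, then drop the goal and retry, and leave panicking to the ___alloc_bootmem_node() wrapper. The sketch below mirrors that layering with an obviously hypothetical core_alloc() in place of __alloc_memory_core_early().

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for __alloc_memory_core_early(): pretend only
 * node 1 has memory, and only once the goal has been dropped to 0. */
static void *core_alloc(int nid, size_t size, unsigned long goal)
{
        if (nid != 1 || goal != 0)
                return NULL;
        return calloc(1, size);
}

static void *alloc_node_nopanic(int nid, size_t size, unsigned long goal)
{
        void *ptr;

again:
        ptr = core_alloc(nid, size, goal);      /* preferred node first */
        if (ptr)
                return ptr;

        ptr = core_alloc(1, size, goal);        /* then any node (here: node 1) */
        if (ptr)
                return ptr;

        if (goal) {                             /* finally, drop the goal */
                goal = 0;
                goto again;
        }
        return NULL;
}

static void *alloc_node(int nid, size_t size, unsigned long goal)
{
        void *ptr = alloc_node_nopanic(nid, size, goal);

        if (ptr)
                return ptr;
        fprintf(stderr, "alloc of %zu bytes failed!\n", size);
        abort();                                /* the panicking wrapper */
}

int main(void)
{
        void *p = alloc_node(0, 64, 0x1000);    /* succeeds via the fallbacks */

        printf("got %p\n", p);
        free(p);
        return 0;
}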
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9f09a1fde9f9..ed0e19677360 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -180,10 +180,10 @@ static bool oom_unkillable_task(struct task_struct *p, | |||
180 | * predictable as possible. The goal is to return the highest value for the | 180 | * predictable as possible. The goal is to return the highest value for the |
181 | * task consuming the most memory to avoid subsequent oom failures. | 181 | * task consuming the most memory to avoid subsequent oom failures. |
182 | */ | 182 | */ |
183 | unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | 183 | unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, |
184 | const nodemask_t *nodemask, unsigned long totalpages) | 184 | const nodemask_t *nodemask, unsigned long totalpages) |
185 | { | 185 | { |
186 | long points; | 186 | unsigned long points; |
187 | 187 | ||
188 | if (oom_unkillable_task(p, memcg, nodemask)) | 188 | if (oom_unkillable_task(p, memcg, nodemask)) |
189 | return 0; | 189 | return 0; |
@@ -198,21 +198,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
198 | } | 198 | } |
199 | 199 | ||
200 | /* | 200 | /* |
201 | * The memory controller may have a limit of 0 bytes, so avoid a divide | ||
202 | * by zero, if necessary. | ||
203 | */ | ||
204 | if (!totalpages) | ||
205 | totalpages = 1; | ||
206 | |||
207 | /* | ||
208 | * The baseline for the badness score is the proportion of RAM that each | 201 | * The baseline for the badness score is the proportion of RAM that each |
209 | * task's rss, pagetable and swap space use. | 202 | * task's rss, pagetable and swap space use. |
210 | */ | 203 | */ |
211 | points = get_mm_rss(p->mm) + p->mm->nr_ptes; | 204 | points = get_mm_rss(p->mm) + p->mm->nr_ptes + |
212 | points += get_mm_counter(p->mm, MM_SWAPENTS); | 205 | get_mm_counter(p->mm, MM_SWAPENTS); |
213 | |||
214 | points *= 1000; | ||
215 | points /= totalpages; | ||
216 | task_unlock(p); | 206 | task_unlock(p); |
217 | 207 | ||
218 | /* | 208 | /* |
@@ -220,23 +210,20 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
220 | * implementation used by LSMs. | 210 | * implementation used by LSMs. |
221 | */ | 211 | */ |
222 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) | 212 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) |
223 | points -= 30; | 213 | points -= 30 * totalpages / 1000; |
224 | 214 | ||
225 | /* | 215 | /* |
226 | * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may | 216 | * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may |
227 | * either completely disable oom killing or always prefer a certain | 217 | * either completely disable oom killing or always prefer a certain |
228 | * task. | 218 | * task. |
229 | */ | 219 | */ |
230 | points += p->signal->oom_score_adj; | 220 | points += p->signal->oom_score_adj * totalpages / 1000; |
231 | 221 | ||
232 | /* | 222 | /* |
233 | * Never return 0 for an eligible task that may be killed since it's | 223 | * Never return 0 for an eligible task regardless of the root bonus and |
234 | * possible that no single user task uses more than 0.1% of memory and | 224 | * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). |
235 | * no single admin tasks uses more than 3.0%. | ||
236 | */ | 225 | */ |
237 | if (points <= 0) | 226 | return points ? points : 1; |
238 | return 1; | ||
239 | return (points < 1000) ? points : 1000; | ||
240 | } | 227 | } |
241 | 228 | ||
242 | /* | 229 | /* |
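After this change oom_badness() returns raw points in units of pages (rss + page tables + swap entries); the 3% root bonus and oom_score_adj are scaled by totalpages / 1000 instead of being applied to a 0..1000 score, and select_bad_process() converts the winner back to the old scale at the end. A rough arithmetic sketch with invented numbers (signed math is used here purely for readability, so this is not the kernel's exact code):

#include <stdio.h>

/* badness in pages, roughly following the new scheme */
static unsigned long badness(unsigned long rss, unsigned long nr_ptes,
                             unsigned long swapents, int oom_score_adj,
                             int is_root, unsigned long totalpages)
{
        long points = (long)(rss + nr_ptes + swapents);

        if (is_root)
                points -= 30 * (long)totalpages / 1000; /* 3% root bonus */

        points += (long)oom_score_adj * (long)totalpages / 1000;

        return points > 0 ? (unsigned long)points : 1;
}

int main(void)
{
        unsigned long totalpages = 1 << 20;     /* pretend 4 GiB of 4 KiB pages */
        unsigned long p;

        p = badness(300000, 500, 20000, 0, 0, totalpages);
        printf("raw points: %lu, old-style score: %lu\n",
               p, p * 1000 / totalpages);

        p = badness(300000, 500, 20000, -500, 1, totalpages);
        printf("root, adj -500: raw %lu, old-style %lu\n",
               p, p * 1000 / totalpages);
        return 0;
}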
@@ -314,7 +301,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
314 | { | 301 | { |
315 | struct task_struct *g, *p; | 302 | struct task_struct *g, *p; |
316 | struct task_struct *chosen = NULL; | 303 | struct task_struct *chosen = NULL; |
317 | *ppoints = 0; | 304 | unsigned long chosen_points = 0; |
318 | 305 | ||
319 | do_each_thread(g, p) { | 306 | do_each_thread(g, p) { |
320 | unsigned int points; | 307 | unsigned int points; |
@@ -354,7 +341,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
354 | */ | 341 | */ |
355 | if (p == current) { | 342 | if (p == current) { |
356 | chosen = p; | 343 | chosen = p; |
357 | *ppoints = 1000; | 344 | chosen_points = ULONG_MAX; |
358 | } else if (!force_kill) { | 345 | } else if (!force_kill) { |
359 | /* | 346 | /* |
360 | * If this task is not being ptraced on exit, | 347 | * If this task is not being ptraced on exit, |
@@ -367,12 +354,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
367 | } | 354 | } |
368 | 355 | ||
369 | points = oom_badness(p, memcg, nodemask, totalpages); | 356 | points = oom_badness(p, memcg, nodemask, totalpages); |
370 | if (points > *ppoints) { | 357 | if (points > chosen_points) { |
371 | chosen = p; | 358 | chosen = p; |
372 | *ppoints = points; | 359 | chosen_points = points; |
373 | } | 360 | } |
374 | } while_each_thread(g, p); | 361 | } while_each_thread(g, p); |
375 | 362 | ||
363 | *ppoints = chosen_points * 1000 / totalpages; | ||
376 | return chosen; | 364 | return chosen; |
377 | } | 365 | } |
378 | 366 | ||
@@ -572,7 +560,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
572 | } | 560 | } |
573 | 561 | ||
574 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | 562 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); |
575 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; | 563 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; |
576 | read_lock(&tasklist_lock); | 564 | read_lock(&tasklist_lock); |
577 | p = select_bad_process(&points, limit, memcg, NULL, false); | 565 | p = select_bad_process(&points, limit, memcg, NULL, false); |
578 | if (p && PTR_ERR(p) != -1UL) | 566 | if (p && PTR_ERR(p) != -1UL) |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bab8e3bc4202..8cbfc38e68ac 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -219,7 +219,7 @@ EXPORT_SYMBOL(nr_online_nodes); | |||
219 | 219 | ||
220 | int page_group_by_mobility_disabled __read_mostly; | 220 | int page_group_by_mobility_disabled __read_mostly; |
221 | 221 | ||
222 | static void set_pageblock_migratetype(struct page *page, int migratetype) | 222 | void set_pageblock_migratetype(struct page *page, int migratetype) |
223 | { | 223 | { |
224 | 224 | ||
225 | if (unlikely(page_group_by_mobility_disabled)) | 225 | if (unlikely(page_group_by_mobility_disabled)) |
@@ -954,8 +954,8 @@ static int move_freepages(struct zone *zone, | |||
954 | return pages_moved; | 954 | return pages_moved; |
955 | } | 955 | } |
956 | 956 | ||
957 | static int move_freepages_block(struct zone *zone, struct page *page, | 957 | int move_freepages_block(struct zone *zone, struct page *page, |
958 | int migratetype) | 958 | int migratetype) |
959 | { | 959 | { |
960 | unsigned long start_pfn, end_pfn; | 960 | unsigned long start_pfn, end_pfn; |
961 | struct page *start_page, *end_page; | 961 | struct page *start_page, *end_page; |
@@ -4300,25 +4300,24 @@ static inline void setup_usemap(struct pglist_data *pgdat, | |||
4300 | 4300 | ||
4301 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 4301 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
4302 | 4302 | ||
4303 | /* Return a sensible default order for the pageblock size. */ | ||
4304 | static inline int pageblock_default_order(void) | ||
4305 | { | ||
4306 | if (HPAGE_SHIFT > PAGE_SHIFT) | ||
4307 | return HUGETLB_PAGE_ORDER; | ||
4308 | |||
4309 | return MAX_ORDER-1; | ||
4310 | } | ||
4311 | |||
4312 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | 4303 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ |
4313 | static inline void __init set_pageblock_order(unsigned int order) | 4304 | static inline void __init set_pageblock_order(void) |
4314 | { | 4305 | { |
4306 | unsigned int order; | ||
4307 | |||
4315 | /* Check that pageblock_nr_pages has not already been setup */ | 4308 | /* Check that pageblock_nr_pages has not already been setup */ |
4316 | if (pageblock_order) | 4309 | if (pageblock_order) |
4317 | return; | 4310 | return; |
4318 | 4311 | ||
4312 | if (HPAGE_SHIFT > PAGE_SHIFT) | ||
4313 | order = HUGETLB_PAGE_ORDER; | ||
4314 | else | ||
4315 | order = MAX_ORDER - 1; | ||
4316 | |||
4319 | /* | 4317 | /* |
4320 | * Assume the largest contiguous order of interest is a huge page. | 4318 | * Assume the largest contiguous order of interest is a huge page. |
4321 | * This value may be variable depending on boot parameters on IA64 | 4319 | * This value may be variable depending on boot parameters on IA64 and |
4320 | * powerpc. | ||
4322 | */ | 4321 | */ |
4323 | pageblock_order = order; | 4322 | pageblock_order = order; |
4324 | } | 4323 | } |
@@ -4326,15 +4325,13 @@ static inline void __init set_pageblock_order(unsigned int order) | |||
4326 | 4325 | ||
4327 | /* | 4326 | /* |
4328 | * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() | 4327 | * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() |
4329 | * and pageblock_default_order() are unused as pageblock_order is set | 4328 | * is unused as pageblock_order is set at compile-time. See |
4330 | * at compile-time. See include/linux/pageblock-flags.h for the values of | 4329 | * include/linux/pageblock-flags.h for the values of pageblock_order based on |
4331 | * pageblock_order based on the kernel config | 4330 | * the kernel config |
4332 | */ | 4331 | */ |
4333 | static inline int pageblock_default_order(unsigned int order) | 4332 | static inline void set_pageblock_order(void) |
4334 | { | 4333 | { |
4335 | return MAX_ORDER-1; | ||
4336 | } | 4334 | } |
4337 | #define set_pageblock_order(x) do {} while (0) | ||
4338 | 4335 | ||
4339 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 4336 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
4340 | 4337 | ||
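set_pageblock_order() now folds in the old pageblock_default_order(): pick HUGETLB_PAGE_ORDER when huge pages are larger than base pages, otherwise MAX_ORDER - 1, and only on the first call. A standalone sketch with assumed 4 KiB base pages and 2 MiB huge pages:

#include <stdio.h>

#define PAGE_SHIFT              12      /* assumed 4 KiB base pages */
#define HPAGE_SHIFT             21      /* assumed 2 MiB huge pages */
#define HUGETLB_PAGE_ORDER      (HPAGE_SHIFT - PAGE_SHIFT)
#define MAX_ORDER               11

static unsigned int pageblock_order;    /* 0 means "not set up yet" */

static void set_pageblock_order(void)
{
        unsigned int order;

        if (pageblock_order)            /* already chosen, keep it */
                return;

        if (HPAGE_SHIFT > PAGE_SHIFT)
                order = HUGETLB_PAGE_ORDER;
        else
                order = MAX_ORDER - 1;

        pageblock_order = order;
}

int main(void)
{
        set_pageblock_order();
        set_pageblock_order();          /* second call is a no-op */
        printf("pageblock_order = %u (%lu KiB blocks)\n", pageblock_order,
               (1UL << (pageblock_order + PAGE_SHIFT)) / 1024);
        return 0;
}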
@@ -4413,16 +4410,16 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4413 | zone_pcp_init(zone); | 4410 | zone_pcp_init(zone); |
4414 | for_each_lru(lru) | 4411 | for_each_lru(lru) |
4415 | INIT_LIST_HEAD(&zone->lruvec.lists[lru]); | 4412 | INIT_LIST_HEAD(&zone->lruvec.lists[lru]); |
4416 | zone->reclaim_stat.recent_rotated[0] = 0; | 4413 | zone->lruvec.reclaim_stat.recent_rotated[0] = 0; |
4417 | zone->reclaim_stat.recent_rotated[1] = 0; | 4414 | zone->lruvec.reclaim_stat.recent_rotated[1] = 0; |
4418 | zone->reclaim_stat.recent_scanned[0] = 0; | 4415 | zone->lruvec.reclaim_stat.recent_scanned[0] = 0; |
4419 | zone->reclaim_stat.recent_scanned[1] = 0; | 4416 | zone->lruvec.reclaim_stat.recent_scanned[1] = 0; |
4420 | zap_zone_vm_stats(zone); | 4417 | zap_zone_vm_stats(zone); |
4421 | zone->flags = 0; | 4418 | zone->flags = 0; |
4422 | if (!size) | 4419 | if (!size) |
4423 | continue; | 4420 | continue; |
4424 | 4421 | ||
4425 | set_pageblock_order(pageblock_default_order()); | 4422 | set_pageblock_order(); |
4426 | setup_usemap(pgdat, zone, size); | 4423 | setup_usemap(pgdat, zone, size); |
4427 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 4424 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
4428 | size, MEMMAP_EARLY); | 4425 | size, MEMMAP_EARLY); |
@@ -4815,7 +4812,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4815 | find_zone_movable_pfns_for_nodes(); | 4812 | find_zone_movable_pfns_for_nodes(); |
4816 | 4813 | ||
4817 | /* Print out the zone ranges */ | 4814 | /* Print out the zone ranges */ |
4818 | printk("Zone PFN ranges:\n"); | 4815 | printk("Zone ranges:\n"); |
4819 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4816 | for (i = 0; i < MAX_NR_ZONES; i++) { |
4820 | if (i == ZONE_MOVABLE) | 4817 | if (i == ZONE_MOVABLE) |
4821 | continue; | 4818 | continue; |
@@ -4824,22 +4821,25 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4824 | arch_zone_highest_possible_pfn[i]) | 4821 | arch_zone_highest_possible_pfn[i]) |
4825 | printk(KERN_CONT "empty\n"); | 4822 | printk(KERN_CONT "empty\n"); |
4826 | else | 4823 | else |
4827 | printk(KERN_CONT "%0#10lx -> %0#10lx\n", | 4824 | printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", |
4828 | arch_zone_lowest_possible_pfn[i], | 4825 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, |
4829 | arch_zone_highest_possible_pfn[i]); | 4826 | (arch_zone_highest_possible_pfn[i] |
4827 | << PAGE_SHIFT) - 1); | ||
4830 | } | 4828 | } |
4831 | 4829 | ||
4832 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ | 4830 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ |
4833 | printk("Movable zone start PFN for each node\n"); | 4831 | printk("Movable zone start for each node\n"); |
4834 | for (i = 0; i < MAX_NUMNODES; i++) { | 4832 | for (i = 0; i < MAX_NUMNODES; i++) { |
4835 | if (zone_movable_pfn[i]) | 4833 | if (zone_movable_pfn[i]) |
4836 | printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); | 4834 | printk(" Node %d: %#010lx\n", i, |
4835 | zone_movable_pfn[i] << PAGE_SHIFT); | ||
4837 | } | 4836 | } |
4838 | 4837 | ||
4839 | /* Print out the early_node_map[] */ | 4838 | /* Print out the early_node_map[] */ |
4840 | printk("Early memory PFN ranges\n"); | 4839 | printk("Early memory node ranges\n"); |
4841 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 4840 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
4842 | printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); | 4841 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, |
4842 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); | ||
4843 | 4843 | ||
4844 | /* Initialise every node */ | 4844 | /* Initialise every node */ |
4845 | mminit_verify_pageflags_layout(); | 4845 | mminit_verify_pageflags_layout(); |
@@ -5657,7 +5657,7 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) | |||
5657 | .nr_migratepages = 0, | 5657 | .nr_migratepages = 0, |
5658 | .order = -1, | 5658 | .order = -1, |
5659 | .zone = page_zone(pfn_to_page(start)), | 5659 | .zone = page_zone(pfn_to_page(start)), |
5660 | .sync = true, | 5660 | .mode = COMPACT_SYNC, |
5661 | }; | 5661 | }; |
5662 | INIT_LIST_HEAD(&cc.migratepages); | 5662 | INIT_LIST_HEAD(&cc.migratepages); |
5663 | 5663 | ||
@@ -5938,7 +5938,7 @@ bool is_free_buddy_page(struct page *page) | |||
5938 | } | 5938 | } |
5939 | #endif | 5939 | #endif |
5940 | 5940 | ||
5941 | static struct trace_print_flags pageflag_names[] = { | 5941 | static const struct trace_print_flags pageflag_names[] = { |
5942 | {1UL << PG_locked, "locked" }, | 5942 | {1UL << PG_locked, "locked" }, |
5943 | {1UL << PG_error, "error" }, | 5943 | {1UL << PG_error, "error" }, |
5944 | {1UL << PG_referenced, "referenced" }, | 5944 | {1UL << PG_referenced, "referenced" }, |
@@ -5973,7 +5973,9 @@ static struct trace_print_flags pageflag_names[] = { | |||
5973 | #ifdef CONFIG_MEMORY_FAILURE | 5973 | #ifdef CONFIG_MEMORY_FAILURE |
5974 | {1UL << PG_hwpoison, "hwpoison" }, | 5974 | {1UL << PG_hwpoison, "hwpoison" }, |
5975 | #endif | 5975 | #endif |
5976 | {-1UL, NULL }, | 5976 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
5977 | {1UL << PG_compound_lock, "compound_lock" }, | ||
5978 | #endif | ||
5977 | }; | 5979 | }; |
5978 | 5980 | ||
5979 | static void dump_page_flags(unsigned long flags) | 5981 | static void dump_page_flags(unsigned long flags) |
@@ -5982,12 +5984,14 @@ static void dump_page_flags(unsigned long flags) | |||
5982 | unsigned long mask; | 5984 | unsigned long mask; |
5983 | int i; | 5985 | int i; |
5984 | 5986 | ||
5987 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); | ||
5988 | |||
5985 | printk(KERN_ALERT "page flags: %#lx(", flags); | 5989 | printk(KERN_ALERT "page flags: %#lx(", flags); |
5986 | 5990 | ||
5987 | /* remove zone id */ | 5991 | /* remove zone id */ |
5988 | flags &= (1UL << NR_PAGEFLAGS) - 1; | 5992 | flags &= (1UL << NR_PAGEFLAGS) - 1; |
5989 | 5993 | ||
5990 | for (i = 0; pageflag_names[i].name && flags; i++) { | 5994 | for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { |
5991 | 5995 | ||
5992 | mask = pageflag_names[i].mask; | 5996 | mask = pageflag_names[i].mask; |
5993 | if ((flags & mask) != mask) | 5997 | if ((flags & mask) != mask) |
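With the {-1UL, NULL} terminator gone, the flag-dump loop must bound itself with ARRAY_SIZE(), and the new BUILD_BUG_ON() catches a table that drifts out of sync with __NR_PAGEFLAGS. The same pattern in user space, using _Static_assert as the compile-time check and a made-up three-entry flag table:

#include <stdio.h>

#define NR_FLAGS 3
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static const struct {
        unsigned long mask;
        const char *name;
} flag_names[] = {
        { 1UL << 0, "locked"     },
        { 1UL << 1, "error"      },
        { 1UL << 2, "referenced" },
};

/* fail the build if the table and the flag count drift apart */
_Static_assert(ARRAY_SIZE(flag_names) == NR_FLAGS, "flag table out of sync");

static void dump_flags(unsigned long flags)
{
        const char *sep = "";
        size_t i;

        printf("flags: %#lx(", flags);
        for (i = 0; i < ARRAY_SIZE(flag_names) && flags; i++) {
                unsigned long mask = flag_names[i].mask;

                if ((flags & mask) != mask)
                        continue;
                flags &= ~mask;
                printf("%s%s", sep, flag_names[i].name);
                sep = "|";
        }
        printf(")\n");
}

int main(void)
{
        dump_flags((1UL << 0) | (1UL << 2));    /* prints "locked|referenced" */
        return 0;
}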
diff --git a/mm/readahead.c b/mm/readahead.c index cbcbb02f3e28..ea8f8fa21649 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -17,6 +17,8 @@ | |||
17 | #include <linux/task_io_accounting_ops.h> | 17 | #include <linux/task_io_accounting_ops.h> |
18 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
19 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
20 | #include <linux/syscalls.h> | ||
21 | #include <linux/file.h> | ||
20 | 22 | ||
21 | /* | 23 | /* |
22 | * Initialise a struct file's readahead state. Assumes that the caller has | 24 | * Initialise a struct file's readahead state. Assumes that the caller has |
@@ -562,3 +564,41 @@ page_cache_async_readahead(struct address_space *mapping, | |||
562 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); | 564 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); |
563 | } | 565 | } |
564 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); | 566 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); |
567 | |||
568 | static ssize_t | ||
569 | do_readahead(struct address_space *mapping, struct file *filp, | ||
570 | pgoff_t index, unsigned long nr) | ||
571 | { | ||
572 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | ||
573 | return -EINVAL; | ||
574 | |||
575 | force_page_cache_readahead(mapping, filp, index, nr); | ||
576 | return 0; | ||
577 | } | ||
578 | |||
579 | SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) | ||
580 | { | ||
581 | ssize_t ret; | ||
582 | struct file *file; | ||
583 | |||
584 | ret = -EBADF; | ||
585 | file = fget(fd); | ||
586 | if (file) { | ||
587 | if (file->f_mode & FMODE_READ) { | ||
588 | struct address_space *mapping = file->f_mapping; | ||
589 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; | ||
590 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; | ||
591 | unsigned long len = end - start + 1; | ||
592 | ret = do_readahead(mapping, file, start, len); | ||
593 | } | ||
594 | fput(file); | ||
595 | } | ||
596 | return ret; | ||
597 | } | ||
598 | #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS | ||
599 | asmlinkage long SyS_readahead(long fd, loff_t offset, long count) | ||
600 | { | ||
601 | return SYSC_readahead((int) fd, offset, (size_t) count); | ||
602 | } | ||
603 | SYSCALL_ALIAS(sys_readahead, SyS_readahead); | ||
604 | #endif | ||
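The readahead(2) implementation that moved here converts the byte range (offset, count) into page-cache indices: start = offset >> PAGE_CACHE_SHIFT, end = (offset + count - 1) >> PAGE_CACHE_SHIFT, and len = end - start + 1 pages. A quick check of that index arithmetic, assuming 4 KiB pages and ignoring the count == 0 case just as the snippet above does:

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12     /* assume 4 KiB page cache pages */

int main(void)
{
        long long offset = 5000;        /* byte offset into the file */
        unsigned long count = 10000;    /* bytes requested */

        unsigned long start = offset >> PAGE_CACHE_SHIFT;
        unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
        unsigned long len = end - start + 1;

        printf("readahead of %lu bytes at %lld covers pages %lu..%lu (%lu pages)\n",
               count, offset, start, end, len);
        return 0;
}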
@@ -755,12 +755,6 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
755 | pte_unmap_unlock(pte, ptl); | 755 | pte_unmap_unlock(pte, ptl); |
756 | } | 756 | } |
757 | 757 | ||
758 | /* Pretend the page is referenced if the task has the | ||
759 | swap token and is in the middle of a page fault. */ | ||
760 | if (mm != current->mm && has_swap_token(mm) && | ||
761 | rwsem_is_locked(&mm->mmap_sem)) | ||
762 | referenced++; | ||
763 | |||
764 | (*mapcount)--; | 758 | (*mapcount)--; |
765 | 759 | ||
766 | if (referenced) | 760 | if (referenced) |
diff --git a/mm/shmem.c b/mm/shmem.c index be5af34a070d..d576b84d913c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt; | |||
53 | #include <linux/blkdev.h> | 53 | #include <linux/blkdev.h> |
54 | #include <linux/pagevec.h> | 54 | #include <linux/pagevec.h> |
55 | #include <linux/percpu_counter.h> | 55 | #include <linux/percpu_counter.h> |
56 | #include <linux/falloc.h> | ||
56 | #include <linux/splice.h> | 57 | #include <linux/splice.h> |
57 | #include <linux/security.h> | 58 | #include <linux/security.h> |
58 | #include <linux/swapops.h> | 59 | #include <linux/swapops.h> |
@@ -83,12 +84,25 @@ struct shmem_xattr { | |||
83 | char value[0]; | 84 | char value[0]; |
84 | }; | 85 | }; |
85 | 86 | ||
87 | /* | ||
88 | * shmem_fallocate and shmem_writepage communicate via inode->i_private | ||
89 | * (with i_mutex making sure that it has only one user at a time): | ||
90 | * we would prefer not to enlarge the shmem inode just for that. | ||
91 | */ | ||
92 | struct shmem_falloc { | ||
93 | pgoff_t start; /* start of range currently being fallocated */ | ||
94 | pgoff_t next; /* the next page offset to be fallocated */ | ||
95 | pgoff_t nr_falloced; /* how many new pages have been fallocated */ | ||
96 | pgoff_t nr_unswapped; /* how often writepage refused to swap out */ | ||
97 | }; | ||
98 | |||
86 | /* Flag allocation requirements to shmem_getpage */ | 99 | /* Flag allocation requirements to shmem_getpage */ |
87 | enum sgp_type { | 100 | enum sgp_type { |
88 | SGP_READ, /* don't exceed i_size, don't allocate page */ | 101 | SGP_READ, /* don't exceed i_size, don't allocate page */ |
89 | SGP_CACHE, /* don't exceed i_size, may allocate page */ | 102 | SGP_CACHE, /* don't exceed i_size, may allocate page */ |
90 | SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ | 103 | SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ |
91 | SGP_WRITE, /* may exceed i_size, may allocate page */ | 104 | SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ |
105 | SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ | ||
92 | }; | 106 | }; |
93 | 107 | ||
94 | #ifdef CONFIG_TMPFS | 108 | #ifdef CONFIG_TMPFS |
@@ -103,6 +117,9 @@ static unsigned long shmem_default_max_inodes(void) | |||
103 | } | 117 | } |
104 | #endif | 118 | #endif |
105 | 119 | ||
120 | static bool shmem_should_replace_page(struct page *page, gfp_t gfp); | ||
121 | static int shmem_replace_page(struct page **pagep, gfp_t gfp, | ||
122 | struct shmem_inode_info *info, pgoff_t index); | ||
106 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | 123 | static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, |
107 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); | 124 | struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); |
108 | 125 | ||
@@ -423,27 +440,31 @@ void shmem_unlock_mapping(struct address_space *mapping) | |||
423 | 440 | ||
424 | /* | 441 | /* |
425 | * Remove range of pages and swap entries from radix tree, and free them. | 442 | * Remove range of pages and swap entries from radix tree, and free them. |
443 | * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. | ||
426 | */ | 444 | */ |
427 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | 445 | static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, |
446 | bool unfalloc) | ||
428 | { | 447 | { |
429 | struct address_space *mapping = inode->i_mapping; | 448 | struct address_space *mapping = inode->i_mapping; |
430 | struct shmem_inode_info *info = SHMEM_I(inode); | 449 | struct shmem_inode_info *info = SHMEM_I(inode); |
431 | pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 450 | pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
432 | unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); | 451 | pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT; |
433 | pgoff_t end = (lend >> PAGE_CACHE_SHIFT); | 452 | unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1); |
453 | unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1); | ||
434 | struct pagevec pvec; | 454 | struct pagevec pvec; |
435 | pgoff_t indices[PAGEVEC_SIZE]; | 455 | pgoff_t indices[PAGEVEC_SIZE]; |
436 | long nr_swaps_freed = 0; | 456 | long nr_swaps_freed = 0; |
437 | pgoff_t index; | 457 | pgoff_t index; |
438 | int i; | 458 | int i; |
439 | 459 | ||
440 | BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); | 460 | if (lend == -1) |
461 | end = -1; /* unsigned, so actually very big */ | ||
441 | 462 | ||
442 | pagevec_init(&pvec, 0); | 463 | pagevec_init(&pvec, 0); |
443 | index = start; | 464 | index = start; |
444 | while (index <= end) { | 465 | while (index < end) { |
445 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | 466 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
446 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, | 467 | min(end - index, (pgoff_t)PAGEVEC_SIZE), |
447 | pvec.pages, indices); | 468 | pvec.pages, indices); |
448 | if (!pvec.nr) | 469 | if (!pvec.nr) |
449 | break; | 470 | break; |
@@ -452,10 +473,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
452 | struct page *page = pvec.pages[i]; | 473 | struct page *page = pvec.pages[i]; |
453 | 474 | ||
454 | index = indices[i]; | 475 | index = indices[i]; |
455 | if (index > end) | 476 | if (index >= end) |
456 | break; | 477 | break; |
457 | 478 | ||
458 | if (radix_tree_exceptional_entry(page)) { | 479 | if (radix_tree_exceptional_entry(page)) { |
480 | if (unfalloc) | ||
481 | continue; | ||
459 | nr_swaps_freed += !shmem_free_swap(mapping, | 482 | nr_swaps_freed += !shmem_free_swap(mapping, |
460 | index, page); | 483 | index, page); |
461 | continue; | 484 | continue; |
@@ -463,9 +486,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
463 | 486 | ||
464 | if (!trylock_page(page)) | 487 | if (!trylock_page(page)) |
465 | continue; | 488 | continue; |
466 | if (page->mapping == mapping) { | 489 | if (!unfalloc || !PageUptodate(page)) { |
467 | VM_BUG_ON(PageWriteback(page)); | 490 | if (page->mapping == mapping) { |
468 | truncate_inode_page(mapping, page); | 491 | VM_BUG_ON(PageWriteback(page)); |
492 | truncate_inode_page(mapping, page); | ||
493 | } | ||
469 | } | 494 | } |
470 | unlock_page(page); | 495 | unlock_page(page); |
471 | } | 496 | } |
@@ -476,30 +501,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
476 | index++; | 501 | index++; |
477 | } | 502 | } |
478 | 503 | ||
479 | if (partial) { | 504 | if (partial_start) { |
480 | struct page *page = NULL; | 505 | struct page *page = NULL; |
481 | shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); | 506 | shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); |
482 | if (page) { | 507 | if (page) { |
483 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); | 508 | unsigned int top = PAGE_CACHE_SIZE; |
509 | if (start > end) { | ||
510 | top = partial_end; | ||
511 | partial_end = 0; | ||
512 | } | ||
513 | zero_user_segment(page, partial_start, top); | ||
514 | set_page_dirty(page); | ||
515 | unlock_page(page); | ||
516 | page_cache_release(page); | ||
517 | } | ||
518 | } | ||
519 | if (partial_end) { | ||
520 | struct page *page = NULL; | ||
521 | shmem_getpage(inode, end, &page, SGP_READ, NULL); | ||
522 | if (page) { | ||
523 | zero_user_segment(page, 0, partial_end); | ||
484 | set_page_dirty(page); | 524 | set_page_dirty(page); |
485 | unlock_page(page); | 525 | unlock_page(page); |
486 | page_cache_release(page); | 526 | page_cache_release(page); |
487 | } | 527 | } |
488 | } | 528 | } |
529 | if (start >= end) | ||
530 | return; | ||
489 | 531 | ||
490 | index = start; | 532 | index = start; |
491 | for ( ; ; ) { | 533 | for ( ; ; ) { |
492 | cond_resched(); | 534 | cond_resched(); |
493 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | 535 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, |
494 | min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, | 536 | min(end - index, (pgoff_t)PAGEVEC_SIZE), |
495 | pvec.pages, indices); | 537 | pvec.pages, indices); |
496 | if (!pvec.nr) { | 538 | if (!pvec.nr) { |
497 | if (index == start) | 539 | if (index == start || unfalloc) |
498 | break; | 540 | break; |
499 | index = start; | 541 | index = start; |
500 | continue; | 542 | continue; |
501 | } | 543 | } |
502 | if (index == start && indices[0] > end) { | 544 | if ((index == start || unfalloc) && indices[0] >= end) { |
503 | shmem_deswap_pagevec(&pvec); | 545 | shmem_deswap_pagevec(&pvec); |
504 | pagevec_release(&pvec); | 546 | pagevec_release(&pvec); |
505 | break; | 547 | break; |
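shmem_undo_range() now takes lend as the last byte of the range: end becomes (lend + 1) >> PAGE_CACHE_SHIFT, the partly covered first and last pages are described by partial_start and partial_end, and when the hole fits inside one page (start > end after rounding) only the bytes between partial_start and partial_end are zeroed. The user-space sketch below reproduces just that index arithmetic for two sample holes, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define PAGE_SHIFT      12

static void describe_hole(unsigned long long lstart, unsigned long long lend)
{
        unsigned long start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned long end = (lend + 1) >> PAGE_SHIFT;
        unsigned long partial_start = lstart & (PAGE_SIZE - 1);
        unsigned long partial_end = (lend + 1) & (PAGE_SIZE - 1);

        if (start < end)
                printf("hole [%llu, %llu]: drop whole pages %lu..%lu",
                       lstart, lend, start, end - 1);
        else
                printf("hole [%llu, %llu]: no whole pages to drop",
                       lstart, lend);

        if (partial_start) {
                unsigned long top = PAGE_SIZE;

                if (start > end) {      /* hole lies within a single page */
                        top = partial_end;
                        partial_end = 0;
                }
                printf(", zero page %lu bytes [%lu, %lu)",
                       start - 1, partial_start, top);
        }
        if (partial_end)
                printf(", zero page %lu bytes [0, %lu)", end, partial_end);
        printf("\n");
}

int main(void)
{
        describe_hole(6000, 20000);     /* spans several pages */
        describe_hole(100, 200);        /* start and end in the same page */
        return 0;
}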
@@ -509,19 +551,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
509 | struct page *page = pvec.pages[i]; | 551 | struct page *page = pvec.pages[i]; |
510 | 552 | ||
511 | index = indices[i]; | 553 | index = indices[i]; |
512 | if (index > end) | 554 | if (index >= end) |
513 | break; | 555 | break; |
514 | 556 | ||
515 | if (radix_tree_exceptional_entry(page)) { | 557 | if (radix_tree_exceptional_entry(page)) { |
558 | if (unfalloc) | ||
559 | continue; | ||
516 | nr_swaps_freed += !shmem_free_swap(mapping, | 560 | nr_swaps_freed += !shmem_free_swap(mapping, |
517 | index, page); | 561 | index, page); |
518 | continue; | 562 | continue; |
519 | } | 563 | } |
520 | 564 | ||
521 | lock_page(page); | 565 | lock_page(page); |
522 | if (page->mapping == mapping) { | 566 | if (!unfalloc || !PageUptodate(page)) { |
523 | VM_BUG_ON(PageWriteback(page)); | 567 | if (page->mapping == mapping) { |
524 | truncate_inode_page(mapping, page); | 568 | VM_BUG_ON(PageWriteback(page)); |
569 | truncate_inode_page(mapping, page); | ||
570 | } | ||
525 | } | 571 | } |
526 | unlock_page(page); | 572 | unlock_page(page); |
527 | } | 573 | } |
@@ -535,7 +581,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | |||
535 | info->swapped -= nr_swaps_freed; | 581 | info->swapped -= nr_swaps_freed; |
536 | shmem_recalc_inode(inode); | 582 | shmem_recalc_inode(inode); |
537 | spin_unlock(&info->lock); | 583 | spin_unlock(&info->lock); |
584 | } | ||
538 | 585 | ||
586 | void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) | ||
587 | { | ||
588 | shmem_undo_range(inode, lstart, lend, false); | ||
539 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 589 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
540 | } | 590 | } |
541 | EXPORT_SYMBOL_GPL(shmem_truncate_range); | 591 | EXPORT_SYMBOL_GPL(shmem_truncate_range); |
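The hunks above split the old single `partial` offset into `partial_start` and `partial_end`, so a punched hole whose edges fall inside pages gets both edge pages zeroed in place rather than dropped, and shmem_truncate_range() itself becomes a thin wrapper around shmem_undo_range(..., false). A small standalone sketch of the index arithmetic this relies on; the rounding itself happens earlier in shmem_undo_range(), outside these hunks, and 4096/12 stand in for PAGE_CACHE_SIZE/PAGE_CACHE_SHIFT:

#include <stdio.h>

#define PAGE_CACHE_SIZE  4096UL   /* illustration only */
#define PAGE_CACHE_SHIFT 12

int main(void)
{
	/* hypothetical hole from byte 5000 through byte 12999 inclusive */
	unsigned long lstart = 5000, lend = 12999;

	unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	unsigned long end = (lend + 1) >> PAGE_CACHE_SHIFT;
	unsigned long partial_start = lstart & (PAGE_CACHE_SIZE - 1);
	unsigned long partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);

	/* whole pages in [start, end) are removed; the two partial edge pages
	 * are zeroed, which is what the partial_start/partial_end code above does */
	printf("remove pages [%lu, %lu), zero from byte %lu of page %lu, up to byte %lu of page %lu\n",
	       start, end, partial_start, lstart >> PAGE_CACHE_SHIFT,
	       partial_end, lend >> PAGE_CACHE_SHIFT);
	return 0;
}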
@@ -604,12 +654,13 @@ static void shmem_evict_inode(struct inode *inode) | |||
604 | * If swap found in inode, free it and move page from swapcache to filecache. | 654 | * If swap found in inode, free it and move page from swapcache to filecache. |
605 | */ | 655 | */ |
606 | static int shmem_unuse_inode(struct shmem_inode_info *info, | 656 | static int shmem_unuse_inode(struct shmem_inode_info *info, |
607 | swp_entry_t swap, struct page *page) | 657 | swp_entry_t swap, struct page **pagep) |
608 | { | 658 | { |
609 | struct address_space *mapping = info->vfs_inode.i_mapping; | 659 | struct address_space *mapping = info->vfs_inode.i_mapping; |
610 | void *radswap; | 660 | void *radswap; |
611 | pgoff_t index; | 661 | pgoff_t index; |
612 | int error; | 662 | gfp_t gfp; |
663 | int error = 0; | ||
613 | 664 | ||
614 | radswap = swp_to_radix_entry(swap); | 665 | radswap = swp_to_radix_entry(swap); |
615 | index = radix_tree_locate_item(&mapping->page_tree, radswap); | 666 | index = radix_tree_locate_item(&mapping->page_tree, radswap); |
@@ -625,22 +676,37 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, | |||
625 | if (shmem_swaplist.next != &info->swaplist) | 676 | if (shmem_swaplist.next != &info->swaplist) |
626 | list_move_tail(&shmem_swaplist, &info->swaplist); | 677 | list_move_tail(&shmem_swaplist, &info->swaplist); |
627 | 678 | ||
679 | gfp = mapping_gfp_mask(mapping); | ||
680 | if (shmem_should_replace_page(*pagep, gfp)) { | ||
681 | mutex_unlock(&shmem_swaplist_mutex); | ||
682 | error = shmem_replace_page(pagep, gfp, info, index); | ||
683 | mutex_lock(&shmem_swaplist_mutex); | ||
684 | /* | ||
685 | * We needed to drop mutex to make that restrictive page | ||
686 | * allocation; but the inode might already be freed by now, | ||
687 | * and we cannot refer to inode or mapping or info to check. | ||
688 | * However, we do hold page lock on the PageSwapCache page, | ||
689 | * so can check if that still has our reference remaining. | ||
690 | */ | ||
691 | if (!page_swapcount(*pagep)) | ||
692 | error = -ENOENT; | ||
693 | } | ||
694 | |||
628 | /* | 695 | /* |
629 | * We rely on shmem_swaplist_mutex, not only to protect the swaplist, | 696 | * We rely on shmem_swaplist_mutex, not only to protect the swaplist, |
630 | * but also to hold up shmem_evict_inode(): so inode cannot be freed | 697 | * but also to hold up shmem_evict_inode(): so inode cannot be freed |
631 | * beneath us (pagelock doesn't help until the page is in pagecache). | 698 | * beneath us (pagelock doesn't help until the page is in pagecache). |
632 | */ | 699 | */ |
633 | error = shmem_add_to_page_cache(page, mapping, index, | 700 | if (!error) |
701 | error = shmem_add_to_page_cache(*pagep, mapping, index, | ||
634 | GFP_NOWAIT, radswap); | 702 | GFP_NOWAIT, radswap); |
635 | /* which does mem_cgroup_uncharge_cache_page on error */ | ||
636 | |||
637 | if (error != -ENOMEM) { | 703 | if (error != -ENOMEM) { |
638 | /* | 704 | /* |
639 | * Truncation and eviction use free_swap_and_cache(), which | 705 | * Truncation and eviction use free_swap_and_cache(), which |
640 | * only does trylock page: if we raced, best clean up here. | 706 | * only does trylock page: if we raced, best clean up here. |
641 | */ | 707 | */ |
642 | delete_from_swap_cache(page); | 708 | delete_from_swap_cache(*pagep); |
643 | set_page_dirty(page); | 709 | set_page_dirty(*pagep); |
644 | if (!error) { | 710 | if (!error) { |
645 | spin_lock(&info->lock); | 711 | spin_lock(&info->lock); |
646 | info->swapped--; | 712 | info->swapped--; |
@@ -660,7 +726,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
660 | struct list_head *this, *next; | 726 | struct list_head *this, *next; |
661 | struct shmem_inode_info *info; | 727 | struct shmem_inode_info *info; |
662 | int found = 0; | 728 | int found = 0; |
663 | int error; | 729 | int error = 0; |
730 | |||
731 | /* | ||
732 | * There's a faint possibility that swap page was replaced before | ||
733 | * caller locked it: it will come back later with the right page. | ||
734 | */ | ||
735 | if (unlikely(!PageSwapCache(page))) | ||
736 | goto out; | ||
664 | 737 | ||
665 | /* | 738 | /* |
666 | * Charge page using GFP_KERNEL while we can wait, before taking | 739 | * Charge page using GFP_KERNEL while we can wait, before taking |
@@ -676,7 +749,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
676 | list_for_each_safe(this, next, &shmem_swaplist) { | 749 | list_for_each_safe(this, next, &shmem_swaplist) { |
677 | info = list_entry(this, struct shmem_inode_info, swaplist); | 750 | info = list_entry(this, struct shmem_inode_info, swaplist); |
678 | if (info->swapped) | 751 | if (info->swapped) |
679 | found = shmem_unuse_inode(info, swap, page); | 752 | found = shmem_unuse_inode(info, swap, &page); |
680 | else | 753 | else |
681 | list_del_init(&info->swaplist); | 754 | list_del_init(&info->swaplist); |
682 | cond_resched(); | 755 | cond_resched(); |
@@ -685,8 +758,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
685 | } | 758 | } |
686 | mutex_unlock(&shmem_swaplist_mutex); | 759 | mutex_unlock(&shmem_swaplist_mutex); |
687 | 760 | ||
688 | if (!found) | ||
689 | mem_cgroup_uncharge_cache_page(page); | ||
690 | if (found < 0) | 761 | if (found < 0) |
691 | error = found; | 762 | error = found; |
692 | out: | 763 | out: |
@@ -727,6 +798,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
727 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ | 798 | WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ |
728 | goto redirty; | 799 | goto redirty; |
729 | } | 800 | } |
801 | |||
802 | /* | ||
803 | * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC | ||
804 | * value into swapfile.c, the only way we can correctly account for a | ||
805 | * fallocated page arriving here is now to initialize it and write it. | ||
806 | * | ||
807 | * That's okay for a page already fallocated earlier, but if we have | ||
808 | * not yet completed the fallocation, then (a) we want to keep track | ||
809 | * of this page in case we have to undo it, and (b) it may not be a | ||
810 | * good idea to continue anyway, once we're pushing into swap. So | ||
811 | * reactivate the page, and let shmem_fallocate() quit when too many. | ||
812 | */ | ||
813 | if (!PageUptodate(page)) { | ||
814 | if (inode->i_private) { | ||
815 | struct shmem_falloc *shmem_falloc; | ||
816 | spin_lock(&inode->i_lock); | ||
817 | shmem_falloc = inode->i_private; | ||
818 | if (shmem_falloc && | ||
819 | index >= shmem_falloc->start && | ||
820 | index < shmem_falloc->next) | ||
821 | shmem_falloc->nr_unswapped++; | ||
822 | else | ||
823 | shmem_falloc = NULL; | ||
824 | spin_unlock(&inode->i_lock); | ||
825 | if (shmem_falloc) | ||
826 | goto redirty; | ||
827 | } | ||
828 | clear_highpage(page); | ||
829 | flush_dcache_page(page); | ||
830 | SetPageUptodate(page); | ||
831 | } | ||
832 | |||
730 | swap = get_swap_page(); | 833 | swap = get_swap_page(); |
731 | if (!swap.val) | 834 | if (!swap.val) |
732 | goto redirty; | 835 | goto redirty; |
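shmem_writepage() above peeks at inode->i_private to see whether a fallocation is still in flight and, if so, bumps nr_unswapped and redirties the page instead of pushing it to swap. A sketch of the bookkeeping this implies, with field names taken from the hunk; the real struct shmem_falloc is defined elsewhere in mm/shmem.c, so treat this as a reconstruction rather than the verbatim definition:

#include <errno.h>

typedef unsigned long pgoff_t;	/* stand-in for the kernel's page-index type */

struct shmem_falloc {
	pgoff_t start;		/* first index of the range being fallocated */
	pgoff_t next;		/* one past the last index instantiated so far */
	pgoff_t nr_falloced;	/* pages newly allocated (still !PageUptodate) */
	pgoff_t nr_unswapped;	/* pages writepage bounced back instead of swapping */
};

/* Mirror of the back-pressure test in shmem_fallocate() further down:
 * give up with ENOMEM once writepage has refused more pages than the
 * fallocation has managed to allocate. */
int shmem_falloc_should_stop(const struct shmem_falloc *f)
{
	return f->nr_unswapped > f->nr_falloced ? -ENOMEM : 0;
}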
@@ -856,6 +959,84 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) | |||
856 | #endif | 959 | #endif |
857 | 960 | ||
858 | /* | 961 | /* |
962 | * When a page is moved from swapcache to shmem filecache (either by the | ||
963 | * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of | ||
964 | * shmem_unuse_inode()), it may have been read in earlier from swap, in | ||
965 | * ignorance of the mapping it belongs to. If that mapping has special | ||
966 | * constraints (like the gma500 GEM driver, which requires RAM below 4GB), | ||
967 | * we may need to copy to a suitable page before moving to filecache. | ||
968 | * | ||
969 | * In a future release, this may well be extended to respect cpuset and | ||
970 | * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); | ||
971 | * but for now it is a simple matter of zone. | ||
972 | */ | ||
973 | static bool shmem_should_replace_page(struct page *page, gfp_t gfp) | ||
974 | { | ||
975 | return page_zonenum(page) > gfp_zone(gfp); | ||
976 | } | ||
977 | |||
978 | static int shmem_replace_page(struct page **pagep, gfp_t gfp, | ||
979 | struct shmem_inode_info *info, pgoff_t index) | ||
980 | { | ||
981 | struct page *oldpage, *newpage; | ||
982 | struct address_space *swap_mapping; | ||
983 | pgoff_t swap_index; | ||
984 | int error; | ||
985 | |||
986 | oldpage = *pagep; | ||
987 | swap_index = page_private(oldpage); | ||
988 | swap_mapping = page_mapping(oldpage); | ||
989 | |||
990 | /* | ||
991 | * We have arrived here because our zones are constrained, so don't | ||
992 | * limit chance of success by further cpuset and node constraints. | ||
993 | */ | ||
994 | gfp &= ~GFP_CONSTRAINT_MASK; | ||
995 | newpage = shmem_alloc_page(gfp, info, index); | ||
996 | if (!newpage) | ||
997 | return -ENOMEM; | ||
998 | VM_BUG_ON(shmem_should_replace_page(newpage, gfp)); | ||
999 | |||
1000 | *pagep = newpage; | ||
1001 | page_cache_get(newpage); | ||
1002 | copy_highpage(newpage, oldpage); | ||
1003 | |||
1004 | VM_BUG_ON(!PageLocked(oldpage)); | ||
1005 | __set_page_locked(newpage); | ||
1006 | VM_BUG_ON(!PageUptodate(oldpage)); | ||
1007 | SetPageUptodate(newpage); | ||
1008 | VM_BUG_ON(!PageSwapBacked(oldpage)); | ||
1009 | SetPageSwapBacked(newpage); | ||
1010 | VM_BUG_ON(!swap_index); | ||
1011 | set_page_private(newpage, swap_index); | ||
1012 | VM_BUG_ON(!PageSwapCache(oldpage)); | ||
1013 | SetPageSwapCache(newpage); | ||
1014 | |||
1015 | /* | ||
1016 | * Our caller will very soon move newpage out of swapcache, but it's | ||
1017 | * a nice clean interface for us to replace oldpage by newpage there. | ||
1018 | */ | ||
1019 | spin_lock_irq(&swap_mapping->tree_lock); | ||
1020 | error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, | ||
1021 | newpage); | ||
1022 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | ||
1023 | __dec_zone_page_state(oldpage, NR_FILE_PAGES); | ||
1024 | spin_unlock_irq(&swap_mapping->tree_lock); | ||
1025 | BUG_ON(error); | ||
1026 | |||
1027 | mem_cgroup_replace_page_cache(oldpage, newpage); | ||
1028 | lru_cache_add_anon(newpage); | ||
1029 | |||
1030 | ClearPageSwapCache(oldpage); | ||
1031 | set_page_private(oldpage, 0); | ||
1032 | |||
1033 | unlock_page(oldpage); | ||
1034 | page_cache_release(oldpage); | ||
1035 | page_cache_release(oldpage); | ||
1036 | return 0; | ||
1037 | } | ||
1038 | |||
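shmem_should_replace_page() fires when a swapped-in page sits in a higher zone than the mapping's gfp mask allows, which is what the gma500 case mentioned in the comment boils down to. As a rough illustration only: the driver function below is hypothetical, but mapping_set_gfp_mask() and __GFP_DMA32 are the real interfaces a driver would use to impose such a constraint:

#include <linux/pagemap.h>
#include <linux/gfp.h>

/* Hypothetical driver setup: keep all pages of this shmem-backed mapping
 * below 4GB.  A page swapped back in above that limit will then satisfy
 * page_zonenum(page) > gfp_zone(gfp) and be copied by shmem_replace_page(). */
static void constrain_gem_mapping(struct address_space *mapping)
{
	mapping_set_gfp_mask(mapping, GFP_KERNEL | __GFP_DMA32);
}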
1039 | /* | ||
859 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate | 1040 | * shmem_getpage_gfp - find page in cache, or get from swap, or allocate |
860 | * | 1041 | * |
861 | * If we allocate a new one we do not mark it dirty. That's up to the | 1042 | * If we allocate a new one we do not mark it dirty. That's up to the |
@@ -872,6 +1053,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, | |||
872 | swp_entry_t swap; | 1053 | swp_entry_t swap; |
873 | int error; | 1054 | int error; |
874 | int once = 0; | 1055 | int once = 0; |
1056 | int alloced = 0; | ||
875 | 1057 | ||
876 | if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) | 1058 | if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) |
877 | return -EFBIG; | 1059 | return -EFBIG; |
@@ -883,19 +1065,21 @@ repeat: | |||
883 | page = NULL; | 1065 | page = NULL; |
884 | } | 1066 | } |
885 | 1067 | ||
886 | if (sgp != SGP_WRITE && | 1068 | if (sgp != SGP_WRITE && sgp != SGP_FALLOC && |
887 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { | 1069 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { |
888 | error = -EINVAL; | 1070 | error = -EINVAL; |
889 | goto failed; | 1071 | goto failed; |
890 | } | 1072 | } |
891 | 1073 | ||
1074 | /* fallocated page? */ | ||
1075 | if (page && !PageUptodate(page)) { | ||
1076 | if (sgp != SGP_READ) | ||
1077 | goto clear; | ||
1078 | unlock_page(page); | ||
1079 | page_cache_release(page); | ||
1080 | page = NULL; | ||
1081 | } | ||
892 | if (page || (sgp == SGP_READ && !swap.val)) { | 1082 | if (page || (sgp == SGP_READ && !swap.val)) { |
893 | /* | ||
894 | * Once we can get the page lock, it must be uptodate: | ||
895 | * if there were an error in reading back from swap, | ||
896 | * the page would not be inserted into the filecache. | ||
897 | */ | ||
898 | BUG_ON(page && !PageUptodate(page)); | ||
899 | *pagep = page; | 1083 | *pagep = page; |
900 | return 0; | 1084 | return 0; |
901 | } | 1085 | } |
@@ -923,19 +1107,20 @@ repeat: | |||
923 | 1107 | ||
924 | /* We have to do this with page locked to prevent races */ | 1108 | /* We have to do this with page locked to prevent races */ |
925 | lock_page(page); | 1109 | lock_page(page); |
1110 | if (!PageSwapCache(page) || page->mapping) { | ||
1111 | error = -EEXIST; /* try again */ | ||
1112 | goto failed; | ||
1113 | } | ||
926 | if (!PageUptodate(page)) { | 1114 | if (!PageUptodate(page)) { |
927 | error = -EIO; | 1115 | error = -EIO; |
928 | goto failed; | 1116 | goto failed; |
929 | } | 1117 | } |
930 | wait_on_page_writeback(page); | 1118 | wait_on_page_writeback(page); |
931 | 1119 | ||
932 | /* Someone may have already done it for us */ | 1120 | if (shmem_should_replace_page(page, gfp)) { |
933 | if (page->mapping) { | 1121 | error = shmem_replace_page(&page, gfp, info, index); |
934 | if (page->mapping == mapping && | 1122 | if (error) |
935 | page->index == index) | 1123 | goto failed; |
936 | goto done; | ||
937 | error = -EEXIST; | ||
938 | goto failed; | ||
939 | } | 1124 | } |
940 | 1125 | ||
941 | error = mem_cgroup_cache_charge(page, current->mm, | 1126 | error = mem_cgroup_cache_charge(page, current->mm, |
@@ -991,19 +1176,36 @@ repeat: | |||
991 | inode->i_blocks += BLOCKS_PER_PAGE; | 1176 | inode->i_blocks += BLOCKS_PER_PAGE; |
992 | shmem_recalc_inode(inode); | 1177 | shmem_recalc_inode(inode); |
993 | spin_unlock(&info->lock); | 1178 | spin_unlock(&info->lock); |
1179 | alloced = true; | ||
994 | 1180 | ||
995 | clear_highpage(page); | 1181 | /* |
996 | flush_dcache_page(page); | 1182 | * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. |
997 | SetPageUptodate(page); | 1183 | */ |
1184 | if (sgp == SGP_FALLOC) | ||
1185 | sgp = SGP_WRITE; | ||
1186 | clear: | ||
1187 | /* | ||
1188 | * Let SGP_WRITE caller clear ends if write does not fill page; | ||
1189 | * but SGP_FALLOC on a page fallocated earlier must initialize | ||
1190 | * it now, lest undo on failure cancel our earlier guarantee. | ||
1191 | */ | ||
1192 | if (sgp != SGP_WRITE) { | ||
1193 | clear_highpage(page); | ||
1194 | flush_dcache_page(page); | ||
1195 | SetPageUptodate(page); | ||
1196 | } | ||
998 | if (sgp == SGP_DIRTY) | 1197 | if (sgp == SGP_DIRTY) |
999 | set_page_dirty(page); | 1198 | set_page_dirty(page); |
1000 | } | 1199 | } |
1001 | done: | 1200 | |
1002 | /* Perhaps the file has been truncated since we checked */ | 1201 | /* Perhaps the file has been truncated since we checked */ |
1003 | if (sgp != SGP_WRITE && | 1202 | if (sgp != SGP_WRITE && sgp != SGP_FALLOC && |
1004 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { | 1203 | ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { |
1005 | error = -EINVAL; | 1204 | error = -EINVAL; |
1006 | goto trunc; | 1205 | if (alloced) |
1206 | goto trunc; | ||
1207 | else | ||
1208 | goto failed; | ||
1007 | } | 1209 | } |
1008 | *pagep = page; | 1210 | *pagep = page; |
1009 | return 0; | 1211 | return 0; |
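The new SGP_FALLOC mode threads through shmem_getpage_gfp() in the hunks above: it may go beyond i_size like SGP_WRITE, but must clear a previously fallocated page before handing it back. For reference, the mode values come from enum sgp_type near the top of mm/shmem.c; the listing below reflects how that enum reads with this series applied and is reproduced from memory rather than from these hunks:

enum sgp_type {
	SGP_READ,	/* don't exceed i_size, don't allocate page */
	SGP_CACHE,	/* don't exceed i_size, may allocate page */
	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
	SGP_WRITE,	/* may exceed i_size, may allocate !Uptodate page */
	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
};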
@@ -1012,6 +1214,7 @@ done: | |||
1012 | * Error recovery. | 1214 | * Error recovery. |
1013 | */ | 1215 | */ |
1014 | trunc: | 1216 | trunc: |
1217 | info = SHMEM_I(inode); | ||
1015 | ClearPageDirty(page); | 1218 | ClearPageDirty(page); |
1016 | delete_from_page_cache(page); | 1219 | delete_from_page_cache(page); |
1017 | spin_lock(&info->lock); | 1220 | spin_lock(&info->lock); |
@@ -1019,6 +1222,7 @@ trunc: | |||
1019 | inode->i_blocks -= BLOCKS_PER_PAGE; | 1222 | inode->i_blocks -= BLOCKS_PER_PAGE; |
1020 | spin_unlock(&info->lock); | 1223 | spin_unlock(&info->lock); |
1021 | decused: | 1224 | decused: |
1225 | sbinfo = SHMEM_SB(inode->i_sb); | ||
1022 | if (sbinfo->max_blocks) | 1226 | if (sbinfo->max_blocks) |
1023 | percpu_counter_add(&sbinfo->used_blocks, -1); | 1227 | percpu_counter_add(&sbinfo->used_blocks, -1); |
1024 | unacct: | 1228 | unacct: |
@@ -1204,6 +1408,14 @@ shmem_write_end(struct file *file, struct address_space *mapping, | |||
1204 | if (pos + copied > inode->i_size) | 1408 | if (pos + copied > inode->i_size) |
1205 | i_size_write(inode, pos + copied); | 1409 | i_size_write(inode, pos + copied); |
1206 | 1410 | ||
1411 | if (!PageUptodate(page)) { | ||
1412 | if (copied < PAGE_CACHE_SIZE) { | ||
1413 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); | ||
1414 | zero_user_segments(page, 0, from, | ||
1415 | from + copied, PAGE_CACHE_SIZE); | ||
1416 | } | ||
1417 | SetPageUptodate(page); | ||
1418 | } | ||
1207 | set_page_dirty(page); | 1419 | set_page_dirty(page); |
1208 | unlock_page(page); | 1420 | unlock_page(page); |
1209 | page_cache_release(page); | 1421 | page_cache_release(page); |
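The shmem_write_end() hunk handles a short write into a page that fallocate left !Uptodate: the bytes the copy did not cover must be zeroed before the page is marked Uptodate, or stale data would become readable. A userspace model of that zero_user_segments() call, with PAGE_CACHE_SIZE approximated as 4096:

#include <string.h>

#define PAGE_CACHE_SIZE 4096	/* illustration only */

/* Equivalent of zero_user_segments(page, 0, from, from + copied, PAGE_CACHE_SIZE):
 * clear the head [0, from) and the tail [from + copied, PAGE_CACHE_SIZE). */
static void zero_outside_copy(char *page, size_t from, size_t copied)
{
	memset(page, 0, from);
	memset(page + from + copied, 0, PAGE_CACHE_SIZE - (from + copied));
}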
@@ -1462,6 +1674,199 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
1462 | return error; | 1674 | return error; |
1463 | } | 1675 | } |
1464 | 1676 | ||
1677 | /* | ||
1678 | * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. | ||
1679 | */ | ||
1680 | static pgoff_t shmem_seek_hole_data(struct address_space *mapping, | ||
1681 | pgoff_t index, pgoff_t end, int origin) | ||
1682 | { | ||
1683 | struct page *page; | ||
1684 | struct pagevec pvec; | ||
1685 | pgoff_t indices[PAGEVEC_SIZE]; | ||
1686 | bool done = false; | ||
1687 | int i; | ||
1688 | |||
1689 | pagevec_init(&pvec, 0); | ||
1690 | pvec.nr = 1; /* start small: we may be there already */ | ||
1691 | while (!done) { | ||
1692 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | ||
1693 | pvec.nr, pvec.pages, indices); | ||
1694 | if (!pvec.nr) { | ||
1695 | if (origin == SEEK_DATA) | ||
1696 | index = end; | ||
1697 | break; | ||
1698 | } | ||
1699 | for (i = 0; i < pvec.nr; i++, index++) { | ||
1700 | if (index < indices[i]) { | ||
1701 | if (origin == SEEK_HOLE) { | ||
1702 | done = true; | ||
1703 | break; | ||
1704 | } | ||
1705 | index = indices[i]; | ||
1706 | } | ||
1707 | page = pvec.pages[i]; | ||
1708 | if (page && !radix_tree_exceptional_entry(page)) { | ||
1709 | if (!PageUptodate(page)) | ||
1710 | page = NULL; | ||
1711 | } | ||
1712 | if (index >= end || | ||
1713 | (page && origin == SEEK_DATA) || | ||
1714 | (!page && origin == SEEK_HOLE)) { | ||
1715 | done = true; | ||
1716 | break; | ||
1717 | } | ||
1718 | } | ||
1719 | shmem_deswap_pagevec(&pvec); | ||
1720 | pagevec_release(&pvec); | ||
1721 | pvec.nr = PAGEVEC_SIZE; | ||
1722 | cond_resched(); | ||
1723 | } | ||
1724 | return index; | ||
1725 | } | ||
1726 | |||
1727 | static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) | ||
1728 | { | ||
1729 | struct address_space *mapping; | ||
1730 | struct inode *inode; | ||
1731 | pgoff_t start, end; | ||
1732 | loff_t new_offset; | ||
1733 | |||
1734 | if (origin != SEEK_DATA && origin != SEEK_HOLE) | ||
1735 | return generic_file_llseek_size(file, offset, origin, | ||
1736 | MAX_LFS_FILESIZE); | ||
1737 | mapping = file->f_mapping; | ||
1738 | inode = mapping->host; | ||
1739 | mutex_lock(&inode->i_mutex); | ||
1740 | /* We're holding i_mutex so we can access i_size directly */ | ||
1741 | |||
1742 | if (offset < 0) | ||
1743 | offset = -EINVAL; | ||
1744 | else if (offset >= inode->i_size) | ||
1745 | offset = -ENXIO; | ||
1746 | else { | ||
1747 | start = offset >> PAGE_CACHE_SHIFT; | ||
1748 | end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1749 | new_offset = shmem_seek_hole_data(mapping, start, end, origin); | ||
1750 | new_offset <<= PAGE_CACHE_SHIFT; | ||
1751 | if (new_offset > offset) { | ||
1752 | if (new_offset < inode->i_size) | ||
1753 | offset = new_offset; | ||
1754 | else if (origin == SEEK_DATA) | ||
1755 | offset = -ENXIO; | ||
1756 | else | ||
1757 | offset = inode->i_size; | ||
1758 | } | ||
1759 | } | ||
1760 | |||
1761 | if (offset >= 0 && offset != file->f_pos) { | ||
1762 | file->f_pos = offset; | ||
1763 | file->f_version = 0; | ||
1764 | } | ||
1765 | mutex_unlock(&inode->i_mutex); | ||
1766 | return offset; | ||
1767 | } | ||
1768 | |||
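With shmem_file_llseek() wired into shmem_file_operations later in this diff, tmpfs files answer SEEK_DATA/SEEK_HOLE queries by walking the radix tree as above. A minimal userspace probe; the path is only an example, and SEEK_DATA returns -1 with ENXIO when no data follows the offset:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/sparse-file", O_RDONLY);	/* example path */
	if (fd < 0)
		return 1;
	off_t data = lseek(fd, 0, SEEK_DATA);	/* first allocated extent at/after 0 */
	off_t hole = lseek(fd, data < 0 ? 0 : data, SEEK_HOLE);	/* first hole after it */
	printf("data at %lld, next hole at %lld\n", (long long)data, (long long)hole);
	close(fd);
	return 0;
}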
1769 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, | ||
1770 | loff_t len) | ||
1771 | { | ||
1772 | struct inode *inode = file->f_path.dentry->d_inode; | ||
1773 | struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); | ||
1774 | struct shmem_falloc shmem_falloc; | ||
1775 | pgoff_t start, index, end; | ||
1776 | int error; | ||
1777 | |||
1778 | mutex_lock(&inode->i_mutex); | ||
1779 | |||
1780 | if (mode & FALLOC_FL_PUNCH_HOLE) { | ||
1781 | struct address_space *mapping = file->f_mapping; | ||
1782 | loff_t unmap_start = round_up(offset, PAGE_SIZE); | ||
1783 | loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; | ||
1784 | |||
1785 | if ((u64)unmap_end > (u64)unmap_start) | ||
1786 | unmap_mapping_range(mapping, unmap_start, | ||
1787 | 1 + unmap_end - unmap_start, 0); | ||
1788 | shmem_truncate_range(inode, offset, offset + len - 1); | ||
1789 | /* No need to unmap again: hole-punching leaves COWed pages */ | ||
1790 | error = 0; | ||
1791 | goto out; | ||
1792 | } | ||
1793 | |||
1794 | /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ | ||
1795 | error = inode_newsize_ok(inode, offset + len); | ||
1796 | if (error) | ||
1797 | goto out; | ||
1798 | |||
1799 | start = offset >> PAGE_CACHE_SHIFT; | ||
1800 | end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1801 | /* Try to avoid a swapstorm if len is impossible to satisfy */ | ||
1802 | if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) { | ||
1803 | error = -ENOSPC; | ||
1804 | goto out; | ||
1805 | } | ||
1806 | |||
1807 | shmem_falloc.start = start; | ||
1808 | shmem_falloc.next = start; | ||
1809 | shmem_falloc.nr_falloced = 0; | ||
1810 | shmem_falloc.nr_unswapped = 0; | ||
1811 | spin_lock(&inode->i_lock); | ||
1812 | inode->i_private = &shmem_falloc; | ||
1813 | spin_unlock(&inode->i_lock); | ||
1814 | |||
1815 | for (index = start; index < end; index++) { | ||
1816 | struct page *page; | ||
1817 | |||
1818 | /* | ||
1819 | * Good, the fallocate(2) manpage permits EINTR: we may have | ||
1820 | * been interrupted because we are using up too much memory. | ||
1821 | */ | ||
1822 | if (signal_pending(current)) | ||
1823 | error = -EINTR; | ||
1824 | else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) | ||
1825 | error = -ENOMEM; | ||
1826 | else | ||
1827 | error = shmem_getpage(inode, index, &page, SGP_FALLOC, | ||
1828 | NULL); | ||
1829 | if (error) { | ||
1830 | /* Remove the !PageUptodate pages we added */ | ||
1831 | shmem_undo_range(inode, | ||
1832 | (loff_t)start << PAGE_CACHE_SHIFT, | ||
1833 | (loff_t)index << PAGE_CACHE_SHIFT, true); | ||
1834 | goto undone; | ||
1835 | } | ||
1836 | |||
1837 | /* | ||
1838 | * Inform shmem_writepage() how far we have reached. | ||
1839 | * No need for lock or barrier: we have the page lock. | ||
1840 | */ | ||
1841 | shmem_falloc.next++; | ||
1842 | if (!PageUptodate(page)) | ||
1843 | shmem_falloc.nr_falloced++; | ||
1844 | |||
1845 | /* | ||
1846 | * If !PageUptodate, leave it that way so that freeable pages | ||
1847 | * can be recognized if we need to rollback on error later. | ||
1848 | * But set_page_dirty so that memory pressure will swap rather | ||
1849 | * than free the pages we are allocating (and SGP_CACHE pages | ||
1850 | * might still be clean: we now need to mark those dirty too). | ||
1851 | */ | ||
1852 | set_page_dirty(page); | ||
1853 | unlock_page(page); | ||
1854 | page_cache_release(page); | ||
1855 | cond_resched(); | ||
1856 | } | ||
1857 | |||
1858 | if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) | ||
1859 | i_size_write(inode, offset + len); | ||
1860 | inode->i_ctime = CURRENT_TIME; | ||
1861 | undone: | ||
1862 | spin_lock(&inode->i_lock); | ||
1863 | inode->i_private = NULL; | ||
1864 | spin_unlock(&inode->i_lock); | ||
1865 | out: | ||
1866 | mutex_unlock(&inode->i_mutex); | ||
1867 | return error; | ||
1868 | } | ||
1869 | |||
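shmem_fallocate() above gives tmpfs both preallocation and hole punching. A small userspace exercise of each path, with an example path and sizes; note that the VFS requires FALLOC_FL_KEEP_SIZE alongside FALLOC_FL_PUNCH_HOLE:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/falloc-demo", O_RDWR | O_CREAT, 0600);
	if (fd < 0)
		return 1;
	/* Preallocate 1 MiB: every page is instantiated but left !Uptodate. */
	if (fallocate(fd, 0, 0, 1 << 20))
		return 1;
	/* Punch out the middle 256 KiB: shmem_truncate_range() frees those pages. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      256 << 10, 256 << 10))
		return 1;
	close(fd);
	return 0;
}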
1465 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | 1870 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) |
1466 | { | 1871 | { |
1467 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); | 1872 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); |
@@ -1665,6 +2070,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
1665 | kaddr = kmap_atomic(page); | 2070 | kaddr = kmap_atomic(page); |
1666 | memcpy(kaddr, symname, len); | 2071 | memcpy(kaddr, symname, len); |
1667 | kunmap_atomic(kaddr); | 2072 | kunmap_atomic(kaddr); |
2073 | SetPageUptodate(page); | ||
1668 | set_page_dirty(page); | 2074 | set_page_dirty(page); |
1669 | unlock_page(page); | 2075 | unlock_page(page); |
1670 | page_cache_release(page); | 2076 | page_cache_release(page); |
@@ -2270,6 +2676,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) | |||
2270 | } | 2676 | } |
2271 | } | 2677 | } |
2272 | sb->s_export_op = &shmem_export_ops; | 2678 | sb->s_export_op = &shmem_export_ops; |
2679 | sb->s_flags |= MS_NOSEC; | ||
2273 | #else | 2680 | #else |
2274 | sb->s_flags |= MS_NOUSER; | 2681 | sb->s_flags |= MS_NOUSER; |
2275 | #endif | 2682 | #endif |
@@ -2364,7 +2771,7 @@ static const struct address_space_operations shmem_aops = { | |||
2364 | static const struct file_operations shmem_file_operations = { | 2771 | static const struct file_operations shmem_file_operations = { |
2365 | .mmap = shmem_mmap, | 2772 | .mmap = shmem_mmap, |
2366 | #ifdef CONFIG_TMPFS | 2773 | #ifdef CONFIG_TMPFS |
2367 | .llseek = generic_file_llseek, | 2774 | .llseek = shmem_file_llseek, |
2368 | .read = do_sync_read, | 2775 | .read = do_sync_read, |
2369 | .write = do_sync_write, | 2776 | .write = do_sync_write, |
2370 | .aio_read = shmem_file_aio_read, | 2777 | .aio_read = shmem_file_aio_read, |
@@ -2372,12 +2779,12 @@ static const struct file_operations shmem_file_operations = { | |||
2372 | .fsync = noop_fsync, | 2779 | .fsync = noop_fsync, |
2373 | .splice_read = shmem_file_splice_read, | 2780 | .splice_read = shmem_file_splice_read, |
2374 | .splice_write = generic_file_splice_write, | 2781 | .splice_write = generic_file_splice_write, |
2782 | .fallocate = shmem_fallocate, | ||
2375 | #endif | 2783 | #endif |
2376 | }; | 2784 | }; |
2377 | 2785 | ||
2378 | static const struct inode_operations shmem_inode_operations = { | 2786 | static const struct inode_operations shmem_inode_operations = { |
2379 | .setattr = shmem_setattr, | 2787 | .setattr = shmem_setattr, |
2380 | .truncate_range = shmem_truncate_range, | ||
2381 | #ifdef CONFIG_TMPFS_XATTR | 2788 | #ifdef CONFIG_TMPFS_XATTR |
2382 | .setxattr = shmem_setxattr, | 2789 | .setxattr = shmem_setxattr, |
2383 | .getxattr = shmem_getxattr, | 2790 | .getxattr = shmem_getxattr, |
diff --git a/mm/sparse.c b/mm/sparse.c index a8bc7d364deb..6a4bf9160e85 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -273,10 +273,10 @@ static unsigned long *__kmalloc_section_usemap(void) | |||
273 | #ifdef CONFIG_MEMORY_HOTREMOVE | 273 | #ifdef CONFIG_MEMORY_HOTREMOVE |
274 | static unsigned long * __init | 274 | static unsigned long * __init |
275 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | 275 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
276 | unsigned long count) | 276 | unsigned long size) |
277 | { | 277 | { |
278 | unsigned long section_nr; | 278 | pg_data_t *host_pgdat; |
279 | 279 | unsigned long goal; | |
280 | /* | 280 | /* |
281 | * A page may contain usemaps for other sections preventing the | 281 | * A page may contain usemaps for other sections preventing the |
282 | * page being freed and making a section unremovable while | 282 | * page being freed and making a section unremovable while |
@@ -287,8 +287,10 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | |||
287 | * from the same section as the pgdat where possible to avoid | 287 | * from the same section as the pgdat where possible to avoid |
288 | * this problem. | 288 | * this problem. |
289 | */ | 289 | */ |
290 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); | 290 | goal = __pa(pgdat) & PAGE_SECTION_MASK; |
291 | return alloc_bootmem_section(usemap_size() * count, section_nr); | 291 | host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT)); |
292 | return __alloc_bootmem_node_nopanic(host_pgdat, size, | ||
293 | SMP_CACHE_BYTES, goal); | ||
292 | } | 294 | } |
293 | 295 | ||
294 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 296 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
@@ -332,9 +334,9 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | |||
332 | #else | 334 | #else |
333 | static unsigned long * __init | 335 | static unsigned long * __init |
334 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | 336 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
335 | unsigned long count) | 337 | unsigned long size) |
336 | { | 338 | { |
337 | return NULL; | 339 | return alloc_bootmem_node_nopanic(pgdat, size); |
338 | } | 340 | } |
339 | 341 | ||
340 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 342 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
@@ -352,13 +354,10 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, | |||
352 | int size = usemap_size(); | 354 | int size = usemap_size(); |
353 | 355 | ||
354 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), | 356 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), |
355 | usemap_count); | 357 | size * usemap_count); |
356 | if (!usemap) { | 358 | if (!usemap) { |
357 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); | 359 | printk(KERN_WARNING "%s: allocation failed\n", __func__); |
358 | if (!usemap) { | 360 | return; |
359 | printk(KERN_WARNING "%s: allocation failed\n", __func__); | ||
360 | return; | ||
361 | } | ||
362 | } | 361 | } |
363 | 362 | ||
364 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | 363 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -82,6 +82,25 @@ static void put_compound_page(struct page *page) | |||
82 | if (likely(page != page_head && | 82 | if (likely(page != page_head && |
83 | get_page_unless_zero(page_head))) { | 83 | get_page_unless_zero(page_head))) { |
84 | unsigned long flags; | 84 | unsigned long flags; |
85 | |||
86 | /* | ||
87 | * THP can not break up slab pages so avoid taking | ||
88 | * compound_lock(). Slab performs non-atomic bit ops | ||
89 | * on page->flags for better performance. In particular | ||
90 | * slab_unlock() in slub used to be a hot path. It is | ||
91 | * still hot on arches that do not support | ||
92 | * this_cpu_cmpxchg_double(). | ||
93 | */ | ||
94 | if (PageSlab(page_head)) { | ||
95 | if (PageTail(page)) { | ||
96 | if (put_page_testzero(page_head)) | ||
97 | VM_BUG_ON(1); | ||
98 | |||
99 | atomic_dec(&page->_mapcount); | ||
100 | goto skip_lock_tail; | ||
101 | } else | ||
102 | goto skip_lock; | ||
103 | } | ||
85 | /* | 104 | /* |
86 | * page_head wasn't a dangling pointer but it | 105 | * page_head wasn't a dangling pointer but it |
87 | * may not be a head page anymore by the time | 106 | * may not be a head page anymore by the time |
@@ -92,10 +111,10 @@ static void put_compound_page(struct page *page) | |||
92 | if (unlikely(!PageTail(page))) { | 111 | if (unlikely(!PageTail(page))) { |
93 | /* __split_huge_page_refcount run before us */ | 112 | /* __split_huge_page_refcount run before us */ |
94 | compound_unlock_irqrestore(page_head, flags); | 113 | compound_unlock_irqrestore(page_head, flags); |
95 | VM_BUG_ON(PageHead(page_head)); | 114 | skip_lock: |
96 | if (put_page_testzero(page_head)) | 115 | if (put_page_testzero(page_head)) |
97 | __put_single_page(page_head); | 116 | __put_single_page(page_head); |
98 | out_put_single: | 117 | out_put_single: |
99 | if (put_page_testzero(page)) | 118 | if (put_page_testzero(page)) |
100 | __put_single_page(page); | 119 | __put_single_page(page); |
101 | return; | 120 | return; |
@@ -115,6 +134,8 @@ static void put_compound_page(struct page *page) | |||
115 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); | 134 | VM_BUG_ON(atomic_read(&page_head->_count) <= 0); |
116 | VM_BUG_ON(atomic_read(&page->_count) != 0); | 135 | VM_BUG_ON(atomic_read(&page->_count) != 0); |
117 | compound_unlock_irqrestore(page_head, flags); | 136 | compound_unlock_irqrestore(page_head, flags); |
137 | |||
138 | skip_lock_tail: | ||
118 | if (put_page_testzero(page_head)) { | 139 | if (put_page_testzero(page_head)) { |
119 | if (PageHead(page_head)) | 140 | if (PageHead(page_head)) |
120 | __put_compound_page(page_head); | 141 | __put_compound_page(page_head); |
@@ -162,6 +183,18 @@ bool __get_page_tail(struct page *page) | |||
162 | struct page *page_head = compound_trans_head(page); | 183 | struct page *page_head = compound_trans_head(page); |
163 | 184 | ||
164 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | 185 | if (likely(page != page_head && get_page_unless_zero(page_head))) { |
186 | |||
187 | /* Ref to put_compound_page() comment. */ | ||
188 | if (PageSlab(page_head)) { | ||
189 | if (likely(PageTail(page))) { | ||
190 | __get_page_tail_foll(page, false); | ||
191 | return true; | ||
192 | } else { | ||
193 | put_page(page_head); | ||
194 | return false; | ||
195 | } | ||
196 | } | ||
197 | |||
165 | /* | 198 | /* |
166 | * page_head wasn't a dangling pointer but it | 199 | * page_head wasn't a dangling pointer but it |
167 | * may not be a head page anymore by the time | 200 | * may not be a head page anymore by the time |
@@ -279,21 +312,15 @@ void rotate_reclaimable_page(struct page *page) | |||
279 | static void update_page_reclaim_stat(struct zone *zone, struct page *page, | 312 | static void update_page_reclaim_stat(struct zone *zone, struct page *page, |
280 | int file, int rotated) | 313 | int file, int rotated) |
281 | { | 314 | { |
282 | struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; | 315 | struct zone_reclaim_stat *reclaim_stat; |
283 | struct zone_reclaim_stat *memcg_reclaim_stat; | ||
284 | 316 | ||
285 | memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page); | 317 | reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page); |
318 | if (!reclaim_stat) | ||
319 | reclaim_stat = &zone->lruvec.reclaim_stat; | ||
286 | 320 | ||
287 | reclaim_stat->recent_scanned[file]++; | 321 | reclaim_stat->recent_scanned[file]++; |
288 | if (rotated) | 322 | if (rotated) |
289 | reclaim_stat->recent_rotated[file]++; | 323 | reclaim_stat->recent_rotated[file]++; |
290 | |||
291 | if (!memcg_reclaim_stat) | ||
292 | return; | ||
293 | |||
294 | memcg_reclaim_stat->recent_scanned[file]++; | ||
295 | if (rotated) | ||
296 | memcg_reclaim_stat->recent_rotated[file]++; | ||
297 | } | 324 | } |
298 | 325 | ||
299 | static void __activate_page(struct page *page, void *arg) | 326 | static void __activate_page(struct page *page, void *arg) |
diff --git a/mm/swapfile.c b/mm/swapfile.c index fafc26d1b1dc..457b10baef59 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -601,7 +601,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) | |||
601 | * This does not give an exact answer when swap count is continued, | 601 | * This does not give an exact answer when swap count is continued, |
602 | * but does include the high COUNT_CONTINUED flag to allow for that. | 602 | * but does include the high COUNT_CONTINUED flag to allow for that. |
603 | */ | 603 | */ |
604 | static inline int page_swapcount(struct page *page) | 604 | int page_swapcount(struct page *page) |
605 | { | 605 | { |
606 | int count = 0; | 606 | int count = 0; |
607 | struct swap_info_struct *p; | 607 | struct swap_info_struct *p; |
@@ -717,37 +717,6 @@ int free_swap_and_cache(swp_entry_t entry) | |||
717 | return p != NULL; | 717 | return p != NULL; |
718 | } | 718 | } |
719 | 719 | ||
720 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
721 | /** | ||
722 | * mem_cgroup_count_swap_user - count the user of a swap entry | ||
723 | * @ent: the swap entry to be checked | ||
724 | * @pagep: the pointer for the swap cache page of the entry to be stored | ||
725 | * | ||
726 | * Returns the number of the user of the swap entry. The number is valid only | ||
727 | * for swaps of anonymous pages. | ||
728 | * If the entry is found on swap cache, the page is stored to pagep with | ||
729 | * refcount of it being incremented. | ||
730 | */ | ||
731 | int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) | ||
732 | { | ||
733 | struct page *page; | ||
734 | struct swap_info_struct *p; | ||
735 | int count = 0; | ||
736 | |||
737 | page = find_get_page(&swapper_space, ent.val); | ||
738 | if (page) | ||
739 | count += page_mapcount(page); | ||
740 | p = swap_info_get(ent); | ||
741 | if (p) { | ||
742 | count += swap_count(p->swap_map[swp_offset(ent)]); | ||
743 | spin_unlock(&swap_lock); | ||
744 | } | ||
745 | |||
746 | *pagep = page; | ||
747 | return count; | ||
748 | } | ||
749 | #endif | ||
750 | |||
751 | #ifdef CONFIG_HIBERNATION | 720 | #ifdef CONFIG_HIBERNATION |
752 | /* | 721 | /* |
753 | * Find the swap type that corresponds to given device (if any). | 722 | * Find the swap type that corresponds to given device (if any). |
diff --git a/mm/thrash.c b/mm/thrash.c deleted file mode 100644 index 57ad495dbd54..000000000000 --- a/mm/thrash.c +++ /dev/null | |||
@@ -1,155 +0,0 @@ | |||
1 | /* | ||
2 | * mm/thrash.c | ||
3 | * | ||
4 | * Copyright (C) 2004, Red Hat, Inc. | ||
5 | * Copyright (C) 2004, Rik van Riel <riel@redhat.com> | ||
6 | * Released under the GPL, see the file COPYING for details. | ||
7 | * | ||
8 | * Simple token based thrashing protection, using the algorithm | ||
9 | * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html | ||
10 | * | ||
11 | * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> | ||
12 | * Improved algorithm to pass token: | ||
13 | * Each task has a priority which is incremented if it contended | ||
14 | * for the token in an interval less than its previous attempt. | ||
15 | * If the token is acquired, that task's priority is boosted to prevent | ||
16 | * the token from bouncing around too often and to let the task make | ||
17 | * some progress in its execution. | ||
18 | */ | ||
19 | |||
20 | #include <linux/jiffies.h> | ||
21 | #include <linux/mm.h> | ||
22 | #include <linux/sched.h> | ||
23 | #include <linux/swap.h> | ||
24 | #include <linux/memcontrol.h> | ||
25 | |||
26 | #include <trace/events/vmscan.h> | ||
27 | |||
28 | #define TOKEN_AGING_INTERVAL (0xFF) | ||
29 | |||
30 | static DEFINE_SPINLOCK(swap_token_lock); | ||
31 | struct mm_struct *swap_token_mm; | ||
32 | static struct mem_cgroup *swap_token_memcg; | ||
33 | |||
34 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
35 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) | ||
36 | { | ||
37 | struct mem_cgroup *memcg; | ||
38 | |||
39 | memcg = try_get_mem_cgroup_from_mm(mm); | ||
40 | if (memcg) | ||
41 | css_put(mem_cgroup_css(memcg)); | ||
42 | |||
43 | return memcg; | ||
44 | } | ||
45 | #else | ||
46 | static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) | ||
47 | { | ||
48 | return NULL; | ||
49 | } | ||
50 | #endif | ||
51 | |||
52 | void grab_swap_token(struct mm_struct *mm) | ||
53 | { | ||
54 | int current_interval; | ||
55 | unsigned int old_prio = mm->token_priority; | ||
56 | static unsigned int global_faults; | ||
57 | static unsigned int last_aging; | ||
58 | |||
59 | global_faults++; | ||
60 | |||
61 | current_interval = global_faults - mm->faultstamp; | ||
62 | |||
63 | if (!spin_trylock(&swap_token_lock)) | ||
64 | return; | ||
65 | |||
66 | /* First come first served */ | ||
67 | if (!swap_token_mm) | ||
68 | goto replace_token; | ||
69 | |||
70 | /* | ||
71 | * Usually, we don't need priority aging because long interval faults | ||
72 | * makes priority decrease quickly. But there is one exception. If the | ||
73 | * token owner task is sleeping, it never make long interval faults. | ||
74 | * Thus, we need a priority aging mechanism instead. The requirements | ||
75 | * of priority aging are | ||
76 | * 1) An aging interval is reasonable enough long. Too short aging | ||
77 | * interval makes quick swap token lost and decrease performance. | ||
78 | * 2) The swap token owner task have to get priority aging even if | ||
79 | * it's under sleep. | ||
80 | */ | ||
81 | if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { | ||
82 | swap_token_mm->token_priority /= 2; | ||
83 | last_aging = global_faults; | ||
84 | } | ||
85 | |||
86 | if (mm == swap_token_mm) { | ||
87 | mm->token_priority += 2; | ||
88 | goto update_priority; | ||
89 | } | ||
90 | |||
91 | if (current_interval < mm->last_interval) | ||
92 | mm->token_priority++; | ||
93 | else { | ||
94 | if (likely(mm->token_priority > 0)) | ||
95 | mm->token_priority--; | ||
96 | } | ||
97 | |||
98 | /* Check if we deserve the token */ | ||
99 | if (mm->token_priority > swap_token_mm->token_priority) | ||
100 | goto replace_token; | ||
101 | |||
102 | update_priority: | ||
103 | trace_update_swap_token_priority(mm, old_prio, swap_token_mm); | ||
104 | |||
105 | out: | ||
106 | mm->faultstamp = global_faults; | ||
107 | mm->last_interval = current_interval; | ||
108 | spin_unlock(&swap_token_lock); | ||
109 | return; | ||
110 | |||
111 | replace_token: | ||
112 | mm->token_priority += 2; | ||
113 | trace_replace_swap_token(swap_token_mm, mm); | ||
114 | swap_token_mm = mm; | ||
115 | swap_token_memcg = swap_token_memcg_from_mm(mm); | ||
116 | last_aging = global_faults; | ||
117 | goto out; | ||
118 | } | ||
119 | |||
120 | /* Called on process exit. */ | ||
121 | void __put_swap_token(struct mm_struct *mm) | ||
122 | { | ||
123 | spin_lock(&swap_token_lock); | ||
124 | if (likely(mm == swap_token_mm)) { | ||
125 | trace_put_swap_token(swap_token_mm); | ||
126 | swap_token_mm = NULL; | ||
127 | swap_token_memcg = NULL; | ||
128 | } | ||
129 | spin_unlock(&swap_token_lock); | ||
130 | } | ||
131 | |||
132 | static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b) | ||
133 | { | ||
134 | if (!a) | ||
135 | return true; | ||
136 | if (!b) | ||
137 | return true; | ||
138 | if (a == b) | ||
139 | return true; | ||
140 | return false; | ||
141 | } | ||
142 | |||
143 | void disable_swap_token(struct mem_cgroup *memcg) | ||
144 | { | ||
145 | /* memcg reclaim don't disable unrelated mm token. */ | ||
146 | if (match_memcg(memcg, swap_token_memcg)) { | ||
147 | spin_lock(&swap_token_lock); | ||
148 | if (match_memcg(memcg, swap_token_memcg)) { | ||
149 | trace_disable_swap_token(swap_token_mm); | ||
150 | swap_token_mm = NULL; | ||
151 | swap_token_memcg = NULL; | ||
152 | } | ||
153 | spin_unlock(&swap_token_lock); | ||
154 | } | ||
155 | } | ||
diff --git a/mm/truncate.c b/mm/truncate.c index 61a183b89df6..75801acdaac7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -602,31 +602,6 @@ int vmtruncate(struct inode *inode, loff_t newsize) | |||
602 | } | 602 | } |
603 | EXPORT_SYMBOL(vmtruncate); | 603 | EXPORT_SYMBOL(vmtruncate); |
604 | 604 | ||
605 | int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) | ||
606 | { | ||
607 | struct address_space *mapping = inode->i_mapping; | ||
608 | loff_t holebegin = round_up(lstart, PAGE_SIZE); | ||
609 | loff_t holelen = 1 + lend - holebegin; | ||
610 | |||
611 | /* | ||
612 | * If the underlying filesystem is not going to provide | ||
613 | * a way to truncate a range of blocks (punch a hole) - | ||
614 | * we should return failure right now. | ||
615 | */ | ||
616 | if (!inode->i_op->truncate_range) | ||
617 | return -ENOSYS; | ||
618 | |||
619 | mutex_lock(&inode->i_mutex); | ||
620 | inode_dio_wait(inode); | ||
621 | unmap_mapping_range(mapping, holebegin, holelen, 1); | ||
622 | inode->i_op->truncate_range(inode, lstart, lend); | ||
623 | /* unmap again to remove racily COWed private pages */ | ||
624 | unmap_mapping_range(mapping, holebegin, holelen, 1); | ||
625 | mutex_unlock(&inode->i_mutex); | ||
626 | |||
627 | return 0; | ||
628 | } | ||
629 | |||
630 | /** | 605 | /** |
631 | * truncate_pagecache_range - unmap and remove pagecache that is hole-punched | 606 | * truncate_pagecache_range - unmap and remove pagecache that is hole-punched |
632 | * @inode: inode | 607 | * @inode: inode |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 94dff883b449..2aad49981b57 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -1185,9 +1185,10 @@ void __init vmalloc_init(void) | |||
1185 | /* Import existing vmlist entries. */ | 1185 | /* Import existing vmlist entries. */ |
1186 | for (tmp = vmlist; tmp; tmp = tmp->next) { | 1186 | for (tmp = vmlist; tmp; tmp = tmp->next) { |
1187 | va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); | 1187 | va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); |
1188 | va->flags = tmp->flags | VM_VM_AREA; | 1188 | va->flags = VM_VM_AREA; |
1189 | va->va_start = (unsigned long)tmp->addr; | 1189 | va->va_start = (unsigned long)tmp->addr; |
1190 | va->va_end = va->va_start + tmp->size; | 1190 | va->va_end = va->va_start + tmp->size; |
1191 | va->vm = tmp; | ||
1191 | __insert_vmap_area(va); | 1192 | __insert_vmap_area(va); |
1192 | } | 1193 | } |
1193 | 1194 | ||
@@ -2375,8 +2376,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | |||
2375 | return NULL; | 2376 | return NULL; |
2376 | } | 2377 | } |
2377 | 2378 | ||
2378 | vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); | 2379 | vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL); |
2379 | vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); | 2380 | vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL); |
2380 | if (!vas || !vms) | 2381 | if (!vas || !vms) |
2381 | goto err_free2; | 2382 | goto err_free2; |
2382 | 2383 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index 3d1365c17868..8deb5f4da4d9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -53,24 +53,6 @@ | |||
53 | #define CREATE_TRACE_POINTS | 53 | #define CREATE_TRACE_POINTS |
54 | #include <trace/events/vmscan.h> | 54 | #include <trace/events/vmscan.h> |
55 | 55 | ||
56 | /* | ||
57 | * reclaim_mode determines how the inactive list is shrunk | ||
58 | * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages | ||
59 | * RECLAIM_MODE_ASYNC: Do not block | ||
60 | * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback | ||
61 | * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference | ||
62 | * page from the LRU and reclaim all pages within a | ||
63 | * naturally aligned range | ||
64 | * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of | ||
65 | * order-0 pages and then compact the zone | ||
66 | */ | ||
67 | typedef unsigned __bitwise__ reclaim_mode_t; | ||
68 | #define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) | ||
69 | #define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) | ||
70 | #define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) | ||
71 | #define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) | ||
72 | #define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) | ||
73 | |||
74 | struct scan_control { | 56 | struct scan_control { |
75 | /* Incremented by the number of inactive pages that were scanned */ | 57 | /* Incremented by the number of inactive pages that were scanned */ |
76 | unsigned long nr_scanned; | 58 | unsigned long nr_scanned; |
@@ -97,12 +79,6 @@ struct scan_control { | |||
97 | int order; | 79 | int order; |
98 | 80 | ||
99 | /* | 81 | /* |
100 | * Intend to reclaim enough continuous memory rather than reclaim | ||
101 | * enough amount of memory. i.e, mode for high order allocation. | ||
102 | */ | ||
103 | reclaim_mode_t reclaim_mode; | ||
104 | |||
105 | /* | ||
106 | * The memory cgroup that hit its limit and as a result is the | 82 | * The memory cgroup that hit its limit and as a result is the |
107 | * primary target of this reclaim invocation. | 83 | * primary target of this reclaim invocation. |
108 | */ | 84 | */ |
@@ -164,35 +140,22 @@ static bool global_reclaim(struct scan_control *sc) | |||
164 | { | 140 | { |
165 | return !sc->target_mem_cgroup; | 141 | return !sc->target_mem_cgroup; |
166 | } | 142 | } |
167 | |||
168 | static bool scanning_global_lru(struct mem_cgroup_zone *mz) | ||
169 | { | ||
170 | return !mz->mem_cgroup; | ||
171 | } | ||
172 | #else | 143 | #else |
173 | static bool global_reclaim(struct scan_control *sc) | 144 | static bool global_reclaim(struct scan_control *sc) |
174 | { | 145 | { |
175 | return true; | 146 | return true; |
176 | } | 147 | } |
177 | |||
178 | static bool scanning_global_lru(struct mem_cgroup_zone *mz) | ||
179 | { | ||
180 | return true; | ||
181 | } | ||
182 | #endif | 148 | #endif |
183 | 149 | ||
184 | static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) | 150 | static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) |
185 | { | 151 | { |
186 | if (!scanning_global_lru(mz)) | 152 | return &mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup)->reclaim_stat; |
187 | return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); | ||
188 | |||
189 | return &mz->zone->reclaim_stat; | ||
190 | } | 153 | } |
191 | 154 | ||
192 | static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, | 155 | static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz, |
193 | enum lru_list lru) | 156 | enum lru_list lru) |
194 | { | 157 | { |
195 | if (!scanning_global_lru(mz)) | 158 | if (!mem_cgroup_disabled()) |
196 | return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, | 159 | return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup, |
197 | zone_to_nid(mz->zone), | 160 | zone_to_nid(mz->zone), |
198 | zone_idx(mz->zone), | 161 | zone_idx(mz->zone), |
@@ -364,39 +327,6 @@ out: | |||
364 | return ret; | 327 | return ret; |
365 | } | 328 | } |
366 | 329 | ||
367 | static void set_reclaim_mode(int priority, struct scan_control *sc, | ||
368 | bool sync) | ||
369 | { | ||
370 | reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; | ||
371 | |||
372 | /* | ||
373 | * Initially assume we are entering either lumpy reclaim or | ||
374 | * reclaim/compaction.Depending on the order, we will either set the | ||
375 | * sync mode or just reclaim order-0 pages later. | ||
376 | */ | ||
377 | if (COMPACTION_BUILD) | ||
378 | sc->reclaim_mode = RECLAIM_MODE_COMPACTION; | ||
379 | else | ||
380 | sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; | ||
381 | |||
382 | /* | ||
383 | * Avoid using lumpy reclaim or reclaim/compaction if possible by | ||
384 | * restricting when its set to either costly allocations or when | ||
385 | * under memory pressure | ||
386 | */ | ||
387 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
388 | sc->reclaim_mode |= syncmode; | ||
389 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
390 | sc->reclaim_mode |= syncmode; | ||
391 | else | ||
392 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; | ||
393 | } | ||
394 | |||
395 | static void reset_reclaim_mode(struct scan_control *sc) | ||
396 | { | ||
397 | sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; | ||
398 | } | ||
399 | |||
400 | static inline int is_page_cache_freeable(struct page *page) | 330 | static inline int is_page_cache_freeable(struct page *page) |
401 | { | 331 | { |
402 | /* | 332 | /* |
@@ -416,10 +346,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi, | |||
416 | return 1; | 346 | return 1; |
417 | if (bdi == current->backing_dev_info) | 347 | if (bdi == current->backing_dev_info) |
418 | return 1; | 348 | return 1; |
419 | |||
420 | /* lumpy reclaim for hugepage often need a lot of write */ | ||
421 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
422 | return 1; | ||
423 | return 0; | 349 | return 0; |
424 | } | 350 | } |
425 | 351 | ||
@@ -523,8 +449,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
523 | /* synchronous write or broken a_ops? */ | 449 | /* synchronous write or broken a_ops? */ |
524 | ClearPageReclaim(page); | 450 | ClearPageReclaim(page); |
525 | } | 451 | } |
526 | trace_mm_vmscan_writepage(page, | 452 | trace_mm_vmscan_writepage(page, trace_reclaim_flags(page)); |
527 | trace_reclaim_flags(page, sc->reclaim_mode)); | ||
528 | inc_zone_page_state(page, NR_VMSCAN_WRITE); | 453 | inc_zone_page_state(page, NR_VMSCAN_WRITE); |
529 | return PAGE_SUCCESS; | 454 | return PAGE_SUCCESS; |
530 | } | 455 | } |
@@ -707,13 +632,10 @@ static enum page_references page_check_references(struct page *page, | |||
707 | int referenced_ptes, referenced_page; | 632 | int referenced_ptes, referenced_page; |
708 | unsigned long vm_flags; | 633 | unsigned long vm_flags; |
709 | 634 | ||
710 | referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); | 635 | referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup, |
636 | &vm_flags); | ||
711 | referenced_page = TestClearPageReferenced(page); | 637 | referenced_page = TestClearPageReferenced(page); |
712 | 638 | ||
713 | /* Lumpy reclaim - ignore references */ | ||
714 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | ||
715 | return PAGEREF_RECLAIM; | ||
716 | |||
717 | /* | 639 | /* |
718 | * Mlock lost the isolation race with us. Let try_to_unmap() | 640 | * Mlock lost the isolation race with us. Let try_to_unmap() |
719 | * move the page to the unevictable list. | 641 | * move the page to the unevictable list. |
@@ -722,7 +644,7 @@ static enum page_references page_check_references(struct page *page, | |||
722 | return PAGEREF_RECLAIM; | 644 | return PAGEREF_RECLAIM; |
723 | 645 | ||
724 | if (referenced_ptes) { | 646 | if (referenced_ptes) { |
725 | if (PageAnon(page)) | 647 | if (PageSwapBacked(page)) |
726 | return PAGEREF_ACTIVATE; | 648 | return PAGEREF_ACTIVATE; |
727 | /* | 649 | /* |
728 | * All mapped pages start out with page table | 650 | * All mapped pages start out with page table |
@@ -813,19 +735,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
813 | 735 | ||
814 | if (PageWriteback(page)) { | 736 | if (PageWriteback(page)) { |
815 | nr_writeback++; | 737 | nr_writeback++; |
816 | /* | 738 | unlock_page(page); |
817 | * Synchronous reclaim cannot queue pages for | 739 | goto keep; |
818 | * writeback due to the possibility of stack overflow | ||
819 | * but if it encounters a page under writeback, wait | ||
820 | * for the IO to complete. | ||
821 | */ | ||
822 | if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && | ||
823 | may_enter_fs) | ||
824 | wait_on_page_writeback(page); | ||
825 | else { | ||
826 | unlock_page(page); | ||
827 | goto keep_lumpy; | ||
828 | } | ||
829 | } | 740 | } |
830 | 741 | ||
831 | references = page_check_references(page, mz, sc); | 742 | references = page_check_references(page, mz, sc); |
@@ -908,7 +819,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
908 | goto activate_locked; | 819 | goto activate_locked; |
909 | case PAGE_SUCCESS: | 820 | case PAGE_SUCCESS: |
910 | if (PageWriteback(page)) | 821 | if (PageWriteback(page)) |
911 | goto keep_lumpy; | 822 | goto keep; |
912 | if (PageDirty(page)) | 823 | if (PageDirty(page)) |
913 | goto keep; | 824 | goto keep; |
914 | 825 | ||
@@ -994,7 +905,6 @@ cull_mlocked: | |||
994 | try_to_free_swap(page); | 905 | try_to_free_swap(page); |
995 | unlock_page(page); | 906 | unlock_page(page); |
996 | putback_lru_page(page); | 907 | putback_lru_page(page); |
997 | reset_reclaim_mode(sc); | ||
998 | continue; | 908 | continue; |
999 | 909 | ||
1000 | activate_locked: | 910 | activate_locked: |
@@ -1007,8 +917,6 @@ activate_locked: | |||
1007 | keep_locked: | 917 | keep_locked: |
1008 | unlock_page(page); | 918 | unlock_page(page); |
1009 | keep: | 919 | keep: |
1010 | reset_reclaim_mode(sc); | ||
1011 | keep_lumpy: | ||
1012 | list_add(&page->lru, &ret_pages); | 920 | list_add(&page->lru, &ret_pages); |
1013 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); | 921 | VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); |
1014 | } | 922 | } |
@@ -1064,11 +972,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) | |||
1064 | if (!all_lru_mode && !!page_is_file_cache(page) != file) | 972 | if (!all_lru_mode && !!page_is_file_cache(page) != file) |
1065 | return ret; | 973 | return ret; |
1066 | 974 | ||
1067 | /* | 975 | /* Do not give back unevictable pages for compaction */ |
1068 | * When this function is being called for lumpy reclaim, we | ||
1069 | * initially look into all LRU pages, active, inactive and | ||
1070 | * unevictable; only give shrink_page_list evictable pages. | ||
1071 | */ | ||
1072 | if (PageUnevictable(page)) | 976 | if (PageUnevictable(page)) |
1073 | return ret; | 977 | return ret; |
1074 | 978 | ||
@@ -1153,9 +1057,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1153 | struct lruvec *lruvec; | 1057 | struct lruvec *lruvec; |
1154 | struct list_head *src; | 1058 | struct list_head *src; |
1155 | unsigned long nr_taken = 0; | 1059 | unsigned long nr_taken = 0; |
1156 | unsigned long nr_lumpy_taken = 0; | ||
1157 | unsigned long nr_lumpy_dirty = 0; | ||
1158 | unsigned long nr_lumpy_failed = 0; | ||
1159 | unsigned long scan; | 1060 | unsigned long scan; |
1160 | int lru = LRU_BASE; | 1061 | int lru = LRU_BASE; |
1161 | 1062 | ||
@@ -1168,10 +1069,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1168 | 1069 | ||
1169 | for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { | 1070 | for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { |
1170 | struct page *page; | 1071 | struct page *page; |
1171 | unsigned long pfn; | ||
1172 | unsigned long end_pfn; | ||
1173 | unsigned long page_pfn; | ||
1174 | int zone_id; | ||
1175 | 1072 | ||
1176 | page = lru_to_page(src); | 1073 | page = lru_to_page(src); |
1177 | prefetchw_prev_lru_page(page, src, flags); | 1074 | prefetchw_prev_lru_page(page, src, flags); |
@@ -1193,84 +1090,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1193 | default: | 1090 | default: |
1194 | BUG(); | 1091 | BUG(); |
1195 | } | 1092 | } |
1196 | |||
1197 | if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)) | ||
1198 | continue; | ||
1199 | |||
1200 | /* | ||
1201 | * Attempt to take all pages in the order aligned region | ||
1202 | * surrounding the tag page. Only take those pages of | ||
1203 | * the same active state as that tag page. We may safely | ||
1204 | * round the target page pfn down to the requested order | ||
1205 | * as the mem_map is guaranteed valid out to MAX_ORDER, | ||
1206 | * where that page is in a different zone we will detect | ||
1207 | * it from its zone id and abort this block scan. | ||
1208 | */ | ||
1209 | zone_id = page_zone_id(page); | ||
1210 | page_pfn = page_to_pfn(page); | ||
1211 | pfn = page_pfn & ~((1 << sc->order) - 1); | ||
1212 | end_pfn = pfn + (1 << sc->order); | ||
1213 | for (; pfn < end_pfn; pfn++) { | ||
1214 | struct page *cursor_page; | ||
1215 | |||
1216 | /* The target page is in the block, ignore it. */ | ||
1217 | if (unlikely(pfn == page_pfn)) | ||
1218 | continue; | ||
1219 | |||
1220 | /* Avoid holes within the zone. */ | ||
1221 | if (unlikely(!pfn_valid_within(pfn))) | ||
1222 | break; | ||
1223 | |||
1224 | cursor_page = pfn_to_page(pfn); | ||
1225 | |||
1226 | /* Check that we have not crossed a zone boundary. */ | ||
1227 | if (unlikely(page_zone_id(cursor_page) != zone_id)) | ||
1228 | break; | ||
1229 | |||
1230 | /* | ||
1231 | * If we don't have enough swap space, reclaiming of | ||
1232 | * anon pages which don't already have a swap slot is | ||
1233 | * pointless. | ||
1234 | */ | ||
1235 | if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) && | ||
1236 | !PageSwapCache(cursor_page)) | ||
1237 | break; | ||
1238 | |||
1239 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { | ||
1240 | unsigned int isolated_pages; | ||
1241 | |||
1242 | mem_cgroup_lru_del(cursor_page); | ||
1243 | list_move(&cursor_page->lru, dst); | ||
1244 | isolated_pages = hpage_nr_pages(cursor_page); | ||
1245 | nr_taken += isolated_pages; | ||
1246 | nr_lumpy_taken += isolated_pages; | ||
1247 | if (PageDirty(cursor_page)) | ||
1248 | nr_lumpy_dirty += isolated_pages; | ||
1249 | scan++; | ||
1250 | pfn += isolated_pages - 1; | ||
1251 | } else { | ||
1252 | /* | ||
1253 | * Check if the page is freed already. | ||
1254 | * | ||
1255 | * We can't use page_count() as that | ||
1256 | * requires compound_head and we don't | ||
1257 | * have a pin on the page here. If a | ||
1258 | * page is tail, we may or may not | ||
1259 | * have isolated the head, so assume | ||
1260 | * it's not free, it'd be tricky to | ||
1261 | * track the head status without a | ||
1262 | * page pin. | ||
1263 | */ | ||
1264 | if (!PageTail(cursor_page) && | ||
1265 | !atomic_read(&cursor_page->_count)) | ||
1266 | continue; | ||
1267 | break; | ||
1268 | } | ||
1269 | } | ||
1270 | |||
1271 | /* If we break out of the loop above, lumpy reclaim failed */ | ||
1272 | if (pfn < end_pfn) | ||
1273 | nr_lumpy_failed++; | ||
1274 | } | 1093 | } |
1275 | 1094 | ||
1276 | *nr_scanned = scan; | 1095 | *nr_scanned = scan; |
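For reference, the heart of the block scan deleted above was the pfn arithmetic that rounds the tag page down to the start of an order-aligned region and walks to its end. A tiny stand-alone C check of that alignment math (the pfn and order values are arbitrary examples):

#include <stdio.h>

int main(void)
{
    unsigned long page_pfn = 0x12345;   /* arbitrary tag-page pfn */
    int order = 4;                      /* 2^4 = 16-page block    */

    /* Same arithmetic as the removed lumpy scan: align down to the
     * block start; the scan window then covers one 1 << order block. */
    unsigned long pfn = page_pfn & ~((1UL << order) - 1);
    unsigned long end_pfn = pfn + (1UL << order);

    printf("tag pfn 0x%lx -> block [0x%lx, 0x%lx)\n", page_pfn, pfn, end_pfn);
    return 0;
}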
@@ -1278,7 +1097,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1278 | trace_mm_vmscan_lru_isolate(sc->order, | 1097 | trace_mm_vmscan_lru_isolate(sc->order, |
1279 | nr_to_scan, scan, | 1098 | nr_to_scan, scan, |
1280 | nr_taken, | 1099 | nr_taken, |
1281 | nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, | ||
1282 | mode, file); | 1100 | mode, file); |
1283 | return nr_taken; | 1101 | return nr_taken; |
1284 | } | 1102 | } |
@@ -1454,47 +1272,6 @@ update_isolated_counts(struct mem_cgroup_zone *mz, | |||
1454 | } | 1272 | } |
1455 | 1273 | ||
1456 | /* | 1274 | /* |
1457 | * Returns true if a direct reclaim should wait on pages under writeback. | ||
1458 | * | ||
1459 | * If we are direct reclaiming for contiguous pages and we do not reclaim | ||
1460 | * everything in the list, try again and wait for writeback IO to complete. | ||
1461 | * This will stall high-order allocations noticeably. Only do that when really | ||
1462 | * need to free the pages under high memory pressure. | ||
1463 | */ | ||
1464 | static inline bool should_reclaim_stall(unsigned long nr_taken, | ||
1465 | unsigned long nr_freed, | ||
1466 | int priority, | ||
1467 | struct scan_control *sc) | ||
1468 | { | ||
1469 | int lumpy_stall_priority; | ||
1470 | |||
1471 | /* kswapd should not stall on sync IO */ | ||
1472 | if (current_is_kswapd()) | ||
1473 | return false; | ||
1474 | |||
1475 | /* Only stall on lumpy reclaim */ | ||
1476 | if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) | ||
1477 | return false; | ||
1478 | |||
1479 | /* If we have reclaimed everything on the isolated list, no stall */ | ||
1480 | if (nr_freed == nr_taken) | ||
1481 | return false; | ||
1482 | |||
1483 | /* | ||
1484 | * For high-order allocations, there are two stall thresholds. | ||
1485 | * High-cost allocations stall immediately where as lower | ||
1486 | * order allocations such as stacks require the scanning | ||
1487 | * priority to be much higher before stalling. | ||
1488 | */ | ||
1489 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
1490 | lumpy_stall_priority = DEF_PRIORITY; | ||
1491 | else | ||
1492 | lumpy_stall_priority = DEF_PRIORITY / 3; | ||
1493 | |||
1494 | return priority <= lumpy_stall_priority; | ||
1495 | } | ||
1496 | |||
1497 | /* | ||
1498 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number | 1275 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number |
1499 | * of reclaimed pages | 1276 | * of reclaimed pages |
1500 | */ | 1277 | */ |
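The should_reclaim_stall() helper deleted just above encoded when direct reclaim would wait synchronously on writeback: never for kswapd, only in lumpy mode, only when the isolated list was not fully reclaimed, and with a laxer priority threshold for costly orders. A stand-alone C summary of that (now gone) heuristic follows; DEF_PRIORITY is 12 and PAGE_ALLOC_COSTLY_ORDER is 3 in kernels of this vintage, and the lumpy/kswapd state is passed in here as plain booleans.

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY            12
#define PAGE_ALLOC_COSTLY_ORDER  3

/* Toy restatement of the deleted heuristic, for the record. */
static bool should_reclaim_stall_model(unsigned long nr_taken,
                                       unsigned long nr_freed,
                                       int priority, int order,
                                       bool is_kswapd, bool lumpy)
{
    int lumpy_stall_priority;

    if (is_kswapd)                  /* kswapd never stalled on sync IO   */
        return false;
    if (!lumpy)                     /* only lumpy reclaim ever stalled   */
        return false;
    if (nr_freed == nr_taken)       /* everything reclaimed: no stall    */
        return false;

    /* Costly orders stalled immediately; small orders only once the
     * scanning priority had dropped well below the default. */
    if (order > PAGE_ALLOC_COSTLY_ORDER)
        lumpy_stall_priority = DEF_PRIORITY;
    else
        lumpy_stall_priority = DEF_PRIORITY / 3;

    return priority <= lumpy_stall_priority;
}

int main(void)
{
    /* order-9 (costly) request that freed only part of the isolated list */
    printf("%d\n", should_reclaim_stall_model(32, 10, 6, 9, false, true));
    return 0;
}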
@@ -1522,10 +1299,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1522 | return SWAP_CLUSTER_MAX; | 1299 | return SWAP_CLUSTER_MAX; |
1523 | } | 1300 | } |
1524 | 1301 | ||
1525 | set_reclaim_mode(priority, sc, false); | ||
1526 | if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) | ||
1527 | isolate_mode |= ISOLATE_ACTIVE; | ||
1528 | |||
1529 | lru_add_drain(); | 1302 | lru_add_drain(); |
1530 | 1303 | ||
1531 | if (!sc->may_unmap) | 1304 | if (!sc->may_unmap) |
@@ -1556,13 +1329,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1556 | nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, | 1329 | nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority, |
1557 | &nr_dirty, &nr_writeback); | 1330 | &nr_dirty, &nr_writeback); |
1558 | 1331 | ||
1559 | /* Check if we should synchronously wait for writeback */ | ||
1560 | if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { | ||
1561 | set_reclaim_mode(priority, sc, true); | ||
1562 | nr_reclaimed += shrink_page_list(&page_list, mz, sc, | ||
1563 | priority, &nr_dirty, &nr_writeback); | ||
1564 | } | ||
1565 | |||
1566 | spin_lock_irq(&zone->lru_lock); | 1332 | spin_lock_irq(&zone->lru_lock); |
1567 | 1333 | ||
1568 | reclaim_stat->recent_scanned[0] += nr_anon; | 1334 | reclaim_stat->recent_scanned[0] += nr_anon; |
@@ -1616,7 +1382,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, | |||
1616 | zone_idx(zone), | 1382 | zone_idx(zone), |
1617 | nr_scanned, nr_reclaimed, | 1383 | nr_scanned, nr_reclaimed, |
1618 | priority, | 1384 | priority, |
1619 | trace_shrink_flags(file, sc->reclaim_mode)); | 1385 | trace_shrink_flags(file)); |
1620 | return nr_reclaimed; | 1386 | return nr_reclaimed; |
1621 | } | 1387 | } |
1622 | 1388 | ||
@@ -1695,8 +1461,6 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1695 | 1461 | ||
1696 | lru_add_drain(); | 1462 | lru_add_drain(); |
1697 | 1463 | ||
1698 | reset_reclaim_mode(sc); | ||
1699 | |||
1700 | if (!sc->may_unmap) | 1464 | if (!sc->may_unmap) |
1701 | isolate_mode |= ISOLATE_UNMAPPED; | 1465 | isolate_mode |= ISOLATE_UNMAPPED; |
1702 | if (!sc->may_writepage) | 1466 | if (!sc->may_writepage) |
@@ -1737,7 +1501,8 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1737 | } | 1501 | } |
1738 | } | 1502 | } |
1739 | 1503 | ||
1740 | if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { | 1504 | if (page_referenced(page, 0, sc->target_mem_cgroup, |
1505 | &vm_flags)) { | ||
1741 | nr_rotated += hpage_nr_pages(page); | 1506 | nr_rotated += hpage_nr_pages(page); |
1742 | /* | 1507 | /* |
1743 | * Identify referenced, file-backed active pages and | 1508 | * Identify referenced, file-backed active pages and |
@@ -1811,7 +1576,7 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz) | |||
1811 | if (!total_swap_pages) | 1576 | if (!total_swap_pages) |
1812 | return 0; | 1577 | return 0; |
1813 | 1578 | ||
1814 | if (!scanning_global_lru(mz)) | 1579 | if (!mem_cgroup_disabled()) |
1815 | return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, | 1580 | return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, |
1816 | mz->zone); | 1581 | mz->zone); |
1817 | 1582 | ||
@@ -1850,7 +1615,7 @@ static int inactive_file_is_low_global(struct zone *zone) | |||
1850 | */ | 1615 | */ |
1851 | static int inactive_file_is_low(struct mem_cgroup_zone *mz) | 1616 | static int inactive_file_is_low(struct mem_cgroup_zone *mz) |
1852 | { | 1617 | { |
1853 | if (!scanning_global_lru(mz)) | 1618 | if (!mem_cgroup_disabled()) |
1854 | return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, | 1619 | return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, |
1855 | mz->zone); | 1620 | mz->zone); |
1856 | 1621 | ||
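Both inactive_*_is_low() hunks above swap the per-lruvec scanning_global_lru() test for a global !mem_cgroup_disabled() check: if the memory controller is built in and not switched off (e.g. via cgroup_disable=memory), the memcg-aware ratio is consulted, otherwise the zone-wide one. A compact stand-alone model of that dispatch (plain C; the two ratio helpers are placeholders, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

/* Placeholders standing in for the two real code paths. */
static int memcg_inactive_ratio_low(void)  { return 1; }
static int global_inactive_ratio_low(void) { return 0; }

/* ~ mem_cgroup_disabled(): true when memcg is compiled out or disabled. */
static bool memcg_disabled = false;

static int inactive_is_low_model(void)
{
    /* After this change the dispatch depends only on whether the memory
     * controller is active, not on which LRU happens to be scanned. */
    if (!memcg_disabled)
        return memcg_inactive_ratio_low();
    return global_inactive_ratio_low();
}

int main(void)
{
    printf("inactive_is_low = %d\n", inactive_is_low_model());
    return 0;
}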
@@ -1984,10 +1749,10 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, | |||
1984 | * proportional to the fraction of recently scanned pages on | 1749 | * proportional to the fraction of recently scanned pages on |
1985 | * each list that were recently referenced and in active use. | 1750 | * each list that were recently referenced and in active use. |
1986 | */ | 1751 | */ |
1987 | ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); | 1752 | ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1); |
1988 | ap /= reclaim_stat->recent_rotated[0] + 1; | 1753 | ap /= reclaim_stat->recent_rotated[0] + 1; |
1989 | 1754 | ||
1990 | fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); | 1755 | fp = file_prio * (reclaim_stat->recent_scanned[1] + 1); |
1991 | fp /= reclaim_stat->recent_rotated[1] + 1; | 1756 | fp /= reclaim_stat->recent_rotated[1] + 1; |
1992 | spin_unlock_irq(&mz->zone->lru_lock); | 1757 | spin_unlock_irq(&mz->zone->lru_lock); |
1993 | 1758 | ||
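The two arithmetic lines above drop the "+ 1" that used to be added to anon_prio and file_prio, so a swappiness of 0 (anon_prio == 0, file_prio == 200 in this code) now yields ap == 0 exactly instead of a small residual anon pressure. A quick stand-alone comparison of the old and new formulas, with made-up recent_scanned/recent_rotated figures:

#include <stdio.h>

int main(void)
{
    unsigned long anon_prio = 0;                /* vm.swappiness == 0 */
    unsigned long file_prio = 200 - anon_prio;
    unsigned long recent_scanned[2] = { 1000, 4000 };  /* anon, file (example) */
    unsigned long recent_rotated[2] = {  200,  500 };

    /* old: (prio + 1) * (scanned + 1) / (rotated + 1) */
    unsigned long ap_old = (anon_prio + 1) * (recent_scanned[0] + 1) /
                           (recent_rotated[0] + 1);
    /* new:  prio      * (scanned + 1) / (rotated + 1) */
    unsigned long ap_new = anon_prio * (recent_scanned[0] + 1) /
                           (recent_rotated[0] + 1);
    unsigned long fp_new = file_prio * (recent_scanned[1] + 1) /
                           (recent_rotated[1] + 1);

    printf("swappiness=0: ap old=%lu new=%lu, fp new=%lu\n",
           ap_old, ap_new, fp_new);   /* prints: ap old=4 new=0, fp new=1597 */
    return 0;
}

The next hunk is related: with vmscan_swappiness() returning zero, get_scan_count() now takes the same shift-by-priority path it already used under priority or noswap.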
@@ -2000,7 +1765,7 @@ out: | |||
2000 | unsigned long scan; | 1765 | unsigned long scan; |
2001 | 1766 | ||
2002 | scan = zone_nr_lru_pages(mz, lru); | 1767 | scan = zone_nr_lru_pages(mz, lru); |
2003 | if (priority || noswap) { | 1768 | if (priority || noswap || !vmscan_swappiness(mz, sc)) { |
2004 | scan >>= priority; | 1769 | scan >>= priority; |
2005 | if (!scan && force_scan) | 1770 | if (!scan && force_scan) |
2006 | scan = SWAP_CLUSTER_MAX; | 1771 | scan = SWAP_CLUSTER_MAX; |
@@ -2010,23 +1775,35 @@ out: | |||
2010 | } | 1775 | } |
2011 | } | 1776 | } |
2012 | 1777 | ||
1778 | /* Use reclaim/compaction for costly allocs or under memory pressure */ | ||
1779 | static bool in_reclaim_compaction(int priority, struct scan_control *sc) | ||
1780 | { | ||
1781 | if (COMPACTION_BUILD && sc->order && | ||
1782 | (sc->order > PAGE_ALLOC_COSTLY_ORDER || | ||
1783 | priority < DEF_PRIORITY - 2)) | ||
1784 | return true; | ||
1785 | |||
1786 | return false; | ||
1787 | } | ||
1788 | |||
2013 | /* | 1789 | /* |
2014 | * Reclaim/compaction depends on a number of pages being freed. To avoid | 1790 | * Reclaim/compaction is used for high-order allocation requests. It reclaims |
2015 | * disruption to the system, a small number of order-0 pages continue to be | 1791 | * order-0 pages before compacting the zone. should_continue_reclaim() returns |
2016 | * rotated and reclaimed in the normal fashion. However, by the time we get | 1792 | * true if more pages should be reclaimed such that when the page allocator |
2017 | * back to the allocator and call try_to_compact_zone(), we ensure that | 1793 | * calls try_to_compact_zone() it will have enough free pages to succeed. |
2018 | * there are enough free pages for it to be likely successful | 1794 | * It will give up earlier than that if there is difficulty reclaiming pages. |
2019 | */ | 1795 | */ |
2020 | static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, | 1796 | static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, |
2021 | unsigned long nr_reclaimed, | 1797 | unsigned long nr_reclaimed, |
2022 | unsigned long nr_scanned, | 1798 | unsigned long nr_scanned, |
1799 | int priority, | ||
2023 | struct scan_control *sc) | 1800 | struct scan_control *sc) |
2024 | { | 1801 | { |
2025 | unsigned long pages_for_compaction; | 1802 | unsigned long pages_for_compaction; |
2026 | unsigned long inactive_lru_pages; | 1803 | unsigned long inactive_lru_pages; |
2027 | 1804 | ||
2028 | /* If not in reclaim/compaction mode, stop */ | 1805 | /* If not in reclaim/compaction mode, stop */ |
2029 | if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) | 1806 | if (!in_reclaim_compaction(priority, sc)) |
2030 | return false; | 1807 | return false; |
2031 | 1808 | ||
2032 | /* Consider stopping depending on scan and reclaim activity */ | 1809 | /* Consider stopping depending on scan and reclaim activity */ |
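The new in_reclaim_compaction() helper above replaces the RECLAIM_MODE_COMPACTION flag that used to live in sc->reclaim_mode: reclaim keeps feeding compaction only for order > 0 requests that are either costly or already under sustained pressure (priority below DEF_PRIORITY - 2), which is also why should_continue_reclaim() now takes priority as an explicit argument. A stand-alone C model of the gate (DEF_PRIORITY 12 and PAGE_ALLOC_COSTLY_ORDER 3 as in this kernel; COMPACTION_BUILD folded into a plain bool):

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY            12
#define PAGE_ALLOC_COSTLY_ORDER  3

static const bool compaction_built_in = true;  /* ~ COMPACTION_BUILD */

static bool in_reclaim_compaction_model(int priority, int order)
{
    return compaction_built_in && order &&
           (order > PAGE_ALLOC_COSTLY_ORDER || priority < DEF_PRIORITY - 2);
}

int main(void)
{
    /* order-2 request: only once priority has fallen below 10 */
    printf("order 2, priority 12 -> %d\n", in_reclaim_compaction_model(12, 2));
    printf("order 2, priority  9 -> %d\n", in_reclaim_compaction_model(9, 2));
    /* costly order-4 request: in reclaim/compaction mode from the start */
    printf("order 4, priority 12 -> %d\n", in_reclaim_compaction_model(12, 4));
    return 0;
}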
@@ -2128,7 +1905,8 @@ restart: | |||
2128 | 1905 | ||
2129 | /* reclaim/compaction might need reclaim to continue */ | 1906 | /* reclaim/compaction might need reclaim to continue */ |
2130 | if (should_continue_reclaim(mz, nr_reclaimed, | 1907 | if (should_continue_reclaim(mz, nr_reclaimed, |
2131 | sc->nr_scanned - nr_scanned, sc)) | 1908 | sc->nr_scanned - nr_scanned, |
1909 | priority, sc)) | ||
2132 | goto restart; | 1910 | goto restart; |
2133 | 1911 | ||
2134 | throttle_vm_writeout(sc->gfp_mask); | 1912 | throttle_vm_writeout(sc->gfp_mask); |
@@ -2353,8 +2131,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2353 | 2131 | ||
2354 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2132 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
2355 | sc->nr_scanned = 0; | 2133 | sc->nr_scanned = 0; |
2356 | if (!priority) | ||
2357 | disable_swap_token(sc->target_mem_cgroup); | ||
2358 | aborted_reclaim = shrink_zones(priority, zonelist, sc); | 2134 | aborted_reclaim = shrink_zones(priority, zonelist, sc); |
2359 | 2135 | ||
2360 | /* | 2136 | /* |
@@ -2705,10 +2481,6 @@ loop_again: | |||
2705 | unsigned long lru_pages = 0; | 2481 | unsigned long lru_pages = 0; |
2706 | int has_under_min_watermark_zone = 0; | 2482 | int has_under_min_watermark_zone = 0; |
2707 | 2483 | ||
2708 | /* The swap token gets in the way of swapout... */ | ||
2709 | if (!priority) | ||
2710 | disable_swap_token(NULL); | ||
2711 | |||
2712 | all_zones_ok = 1; | 2484 | all_zones_ok = 1; |
2713 | balanced = 0; | 2485 | balanced = 0; |
2714 | 2486 | ||
@@ -3537,7 +3309,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) | |||
3537 | if (mapping_unevictable(page_mapping(page))) | 3309 | if (mapping_unevictable(page_mapping(page))) |
3538 | return 0; | 3310 | return 0; |
3539 | 3311 | ||
3540 | if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) | 3312 | if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page))) |
3541 | return 0; | 3313 | return 0; |
3542 | 3314 | ||
3543 | return 1; | 3315 | return 1; |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 0dad31dc1618..1bbbbd9776ad 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -1223,7 +1223,6 @@ module_init(setup_vmstat) | |||
1223 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) | 1223 | #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) |
1224 | #include <linux/debugfs.h> | 1224 | #include <linux/debugfs.h> |
1225 | 1225 | ||
1226 | static struct dentry *extfrag_debug_root; | ||
1227 | 1226 | ||
1228 | /* | 1227 | /* |
1229 | * Return an index indicating how much of the available free memory is | 1228 | * Return an index indicating how much of the available free memory is |
@@ -1361,19 +1360,24 @@ static const struct file_operations extfrag_file_ops = { | |||
1361 | 1360 | ||
1362 | static int __init extfrag_debug_init(void) | 1361 | static int __init extfrag_debug_init(void) |
1363 | { | 1362 | { |
1363 | struct dentry *extfrag_debug_root; | ||
1364 | |||
1364 | extfrag_debug_root = debugfs_create_dir("extfrag", NULL); | 1365 | extfrag_debug_root = debugfs_create_dir("extfrag", NULL); |
1365 | if (!extfrag_debug_root) | 1366 | if (!extfrag_debug_root) |
1366 | return -ENOMEM; | 1367 | return -ENOMEM; |
1367 | 1368 | ||
1368 | if (!debugfs_create_file("unusable_index", 0444, | 1369 | if (!debugfs_create_file("unusable_index", 0444, |
1369 | extfrag_debug_root, NULL, &unusable_file_ops)) | 1370 | extfrag_debug_root, NULL, &unusable_file_ops)) |
1370 | return -ENOMEM; | 1371 | goto fail; |
1371 | 1372 | ||
1372 | if (!debugfs_create_file("extfrag_index", 0444, | 1373 | if (!debugfs_create_file("extfrag_index", 0444, |
1373 | extfrag_debug_root, NULL, &extfrag_file_ops)) | 1374 | extfrag_debug_root, NULL, &extfrag_file_ops)) |
1374 | return -ENOMEM; | 1375 | goto fail; |
1375 | 1376 | ||
1376 | return 0; | 1377 | return 0; |
1378 | fail: | ||
1379 | debugfs_remove_recursive(extfrag_debug_root); | ||
1380 | return -ENOMEM; | ||
1377 | } | 1381 | } |
1378 | 1382 | ||
1379 | module_init(extfrag_debug_init); | 1383 | module_init(extfrag_debug_init); |
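The vmstat.c hunk narrows extfrag_debug_root to a local variable and, when a debugfs_create_file() call fails, tears the whole "extfrag" directory down with debugfs_remove_recursive() instead of leaking the partially built tree. The same create-then-unwind-on-error shape is shown below as a stand-alone C program using ordinary files so it can actually run; the file names are invented for illustration and nothing here is kernel API.

#include <stdio.h>

/* Toy analogue of extfrag_debug_init(): create a "root" resource and a
 * child; if the child fails, unwind everything through one fail label. */
static int toy_debug_init(void)
{
    FILE *root, *child;

    root = fopen("toy_extfrag_root.txt", "w");   /* ~ debugfs_create_dir()  */
    if (!root)
        return -1;

    /* ~ debugfs_create_file(); the bogus path forces a failure here. */
    child = fopen("/nonexistent/dir/unusable_index", "w");
    if (!child)
        goto fail;

    fclose(child);
    fclose(root);
    return 0;

fail:
    fclose(root);                        /* ~ debugfs_remove_recursive() */
    remove("toy_extfrag_root.txt");
    return -1;
}

int main(void)
{
    printf("toy_debug_init() = %d\n", toy_debug_init());
    return 0;
}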